From d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 Mon Sep 17 00:00:00 2001 From: Googler Date: Fri, 27 Apr 2018 10:37:02 -0700 Subject: Check in gVisor. PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463 --- pkg/sentry/BUILD | 12 + pkg/sentry/arch/BUILD | 66 + pkg/sentry/arch/aligned.go | 31 + pkg/sentry/arch/arch.go | 351 ++++ pkg/sentry/arch/arch_amd64.go | 302 +++ pkg/sentry/arch/arch_amd64.s | 135 ++ pkg/sentry/arch/arch_state_x86.go | 97 + pkg/sentry/arch/arch_x86.go | 613 ++++++ pkg/sentry/arch/auxv.go | 28 + pkg/sentry/arch/registers.proto | 55 + pkg/sentry/arch/signal_act.go | 79 + pkg/sentry/arch/signal_amd64.go | 476 +++++ pkg/sentry/arch/signal_info.go | 66 + pkg/sentry/arch/signal_stack.go | 58 + pkg/sentry/arch/stack.go | 246 +++ pkg/sentry/arch/syscalls_amd64.go | 52 + pkg/sentry/context/BUILD | 14 + pkg/sentry/context/context.go | 103 ++ pkg/sentry/context/contexttest/BUILD | 34 + pkg/sentry/context/contexttest/contexttest.go | 133 ++ pkg/sentry/control/BUILD | 39 + pkg/sentry/control/control.go | 17 + pkg/sentry/control/proc.go | 293 +++ pkg/sentry/control/proc_test.go | 164 ++ pkg/sentry/device/BUILD | 18 + pkg/sentry/device/device.go | 193 ++ pkg/sentry/device/device_test.go | 59 + pkg/sentry/fs/BUILD | 154 ++ pkg/sentry/fs/README.md | 217 +++ pkg/sentry/fs/anon/BUILD | 21 + pkg/sentry/fs/anon/anon.go | 46 + pkg/sentry/fs/anon/device.go | 22 + pkg/sentry/fs/ashmem/BUILD | 83 + pkg/sentry/fs/ashmem/area.go | 313 ++++ pkg/sentry/fs/ashmem/device.go | 169 ++ pkg/sentry/fs/ashmem/pin_board.go | 125 ++ pkg/sentry/fs/ashmem/pin_board_test.go | 130 ++ pkg/sentry/fs/attr.go | 382 ++++ pkg/sentry/fs/binder/BUILD | 38 + pkg/sentry/fs/binder/binder.go | 358 ++++ pkg/sentry/fs/context.go | 97 + pkg/sentry/fs/copy_up.go | 414 +++++ pkg/sentry/fs/copy_up_test.go | 182 ++ pkg/sentry/fs/dentry.go | 232 +++ pkg/sentry/fs/dev/BUILD | 53 + pkg/sentry/fs/dev/dev.go | 122 ++ pkg/sentry/fs/dev/device.go | 20 + pkg/sentry/fs/dev/fs.go | 90 + 
pkg/sentry/fs/dev/full.go | 53 + pkg/sentry/fs/dev/null.go | 96 + pkg/sentry/fs/dev/random.go | 55 + pkg/sentry/fs/dirent.go | 1605 ++++++++++++++++ pkg/sentry/fs/dirent_cache.go | 142 ++ pkg/sentry/fs/dirent_cache_test.go | 157 ++ pkg/sentry/fs/dirent_refs_test.go | 417 +++++ pkg/sentry/fs/dirent_state.go | 44 + pkg/sentry/fs/fdpipe/BUILD | 76 + pkg/sentry/fs/fdpipe/pipe.go | 167 ++ pkg/sentry/fs/fdpipe/pipe_opener.go | 193 ++ pkg/sentry/fs/fdpipe/pipe_opener_test.go | 522 ++++++ pkg/sentry/fs/fdpipe/pipe_state.go | 88 + pkg/sentry/fs/fdpipe/pipe_test.go | 489 +++++ pkg/sentry/fs/file.go | 404 ++++ pkg/sentry/fs/file_operations.go | 106 ++ pkg/sentry/fs/file_overlay.go | 345 ++++ pkg/sentry/fs/file_overlay_test.go | 137 ++ pkg/sentry/fs/file_state.go | 30 + pkg/sentry/fs/file_test.go | 24 + pkg/sentry/fs/filesystems.go | 162 ++ pkg/sentry/fs/filetest/BUILD | 35 + pkg/sentry/fs/filetest/filetest.go | 59 + pkg/sentry/fs/flags.go | 67 + pkg/sentry/fs/fs.go | 88 + pkg/sentry/fs/fsutil/BUILD | 149 ++ pkg/sentry/fs/fsutil/README.md | 207 +++ pkg/sentry/fs/fsutil/dirty_set.go | 213 +++ pkg/sentry/fs/fsutil/dirty_set_test.go | 38 + pkg/sentry/fs/fsutil/file.go | 267 +++ pkg/sentry/fs/fsutil/file_range_set.go | 208 +++ pkg/sentry/fs/fsutil/frame_ref_set.go | 50 + pkg/sentry/fs/fsutil/fsutil.go | 26 + pkg/sentry/fs/fsutil/handle.go | 126 ++ pkg/sentry/fs/fsutil/handle_test.go | 227 +++ pkg/sentry/fs/fsutil/host_file_mapper.go | 209 +++ pkg/sentry/fs/fsutil/host_file_mapper_state.go | 20 + pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 27 + pkg/sentry/fs/fsutil/inode.go | 380 ++++ pkg/sentry/fs/fsutil/inode_cached.go | 845 +++++++++ pkg/sentry/fs/fsutil/inode_cached_test.go | 403 ++++ pkg/sentry/fs/g3doc/inotify.md | 122 ++ pkg/sentry/fs/gofer/BUILD | 90 + pkg/sentry/fs/gofer/attr.go | 162 ++ pkg/sentry/fs/gofer/context_file.go | 190 ++ pkg/sentry/fs/gofer/device.go | 20 + pkg/sentry/fs/gofer/file.go | 255 +++ pkg/sentry/fs/gofer/file_state.go | 37 + 
pkg/sentry/fs/gofer/fs.go | 252 +++ pkg/sentry/fs/gofer/gofer_test.go | 776 ++++++++ pkg/sentry/fs/gofer/handles.go | 144 ++ pkg/sentry/fs/gofer/inode.go | 554 ++++++ pkg/sentry/fs/gofer/inode_state.go | 141 ++ pkg/sentry/fs/gofer/path.go | 331 ++++ pkg/sentry/fs/gofer/session.go | 251 +++ pkg/sentry/fs/gofer/session_state.go | 90 + pkg/sentry/fs/gofer/socket.go | 127 ++ pkg/sentry/fs/gofer/util.go | 60 + pkg/sentry/fs/host/BUILD | 104 ++ pkg/sentry/fs/host/control.go | 90 + pkg/sentry/fs/host/descriptor.go | 118 ++ pkg/sentry/fs/host/descriptor_state.go | 29 + pkg/sentry/fs/host/device.go | 25 + pkg/sentry/fs/host/file.go | 371 ++++ pkg/sentry/fs/host/fs.go | 327 ++++ pkg/sentry/fs/host/fs_test.go | 383 ++++ pkg/sentry/fs/host/inode.go | 506 +++++ pkg/sentry/fs/host/inode_state.go | 79 + pkg/sentry/fs/host/inode_test.go | 112 ++ pkg/sentry/fs/host/ioctl_unsafe.go | 39 + pkg/sentry/fs/host/socket.go | 471 +++++ pkg/sentry/fs/host/socket_state.go | 39 + pkg/sentry/fs/host/socket_test.go | 401 ++++ pkg/sentry/fs/host/socket_unsafe.go | 82 + pkg/sentry/fs/host/util.go | 197 ++ pkg/sentry/fs/host/util_unsafe.go | 137 ++ pkg/sentry/fs/host/wait_test.go | 70 + pkg/sentry/fs/inode.go | 455 +++++ pkg/sentry/fs/inode_inotify.go | 166 ++ pkg/sentry/fs/inode_operations.go | 385 ++++ pkg/sentry/fs/inode_overlay.go | 555 ++++++ pkg/sentry/fs/inode_overlay_test.go | 251 +++ pkg/sentry/fs/inotify.go | 329 ++++ pkg/sentry/fs/inotify_event.go | 138 ++ pkg/sentry/fs/inotify_watch.go | 129 ++ pkg/sentry/fs/lock/BUILD | 72 + pkg/sentry/fs/lock/lock.go | 457 +++++ pkg/sentry/fs/lock/lock_range_test.go | 136 ++ pkg/sentry/fs/lock/lock_set_functions.go | 69 + pkg/sentry/fs/lock/lock_test.go | 1059 +++++++++++ pkg/sentry/fs/mock.go | 177 ++ pkg/sentry/fs/mount.go | 298 +++ pkg/sentry/fs/mount_overlay.go | 95 + pkg/sentry/fs/mount_state.go | 25 + pkg/sentry/fs/mount_test.go | 216 +++ pkg/sentry/fs/mounts.go | 511 +++++ pkg/sentry/fs/mounts_test.go | 102 + pkg/sentry/fs/offset.go | 65 + 
pkg/sentry/fs/overlay.go | 268 +++ pkg/sentry/fs/path.go | 92 + pkg/sentry/fs/path_test.go | 211 +++ pkg/sentry/fs/proc/BUILD | 95 + pkg/sentry/fs/proc/README.md | 317 ++++ pkg/sentry/fs/proc/cpuinfo.go | 64 + pkg/sentry/fs/proc/device/BUILD | 11 + pkg/sentry/fs/proc/device/device.go | 23 + pkg/sentry/fs/proc/exec_args.go | 129 ++ pkg/sentry/fs/proc/fds.go | 258 +++ pkg/sentry/fs/proc/file.go | 56 + pkg/sentry/fs/proc/filesystems.go | 55 + pkg/sentry/fs/proc/fs.go | 69 + pkg/sentry/fs/proc/loadavg.go | 51 + pkg/sentry/fs/proc/meminfo.go | 82 + pkg/sentry/fs/proc/mounts.go | 176 ++ pkg/sentry/fs/proc/net.go | 151 ++ pkg/sentry/fs/proc/net_test.go | 74 + pkg/sentry/fs/proc/proc.go | 182 ++ pkg/sentry/fs/proc/seqfile/BUILD | 55 + pkg/sentry/fs/proc/seqfile/seqfile.go | 232 +++ pkg/sentry/fs/proc/seqfile/seqfile_test.go | 272 +++ pkg/sentry/fs/proc/stat.go | 139 ++ pkg/sentry/fs/proc/sys.go | 117 ++ pkg/sentry/fs/proc/sys_net.go | 188 ++ pkg/sentry/fs/proc/sys_net_test.go | 121 ++ pkg/sentry/fs/proc/task.go | 567 ++++++ pkg/sentry/fs/proc/uid_gid_map.go | 152 ++ pkg/sentry/fs/proc/uptime.go | 61 + pkg/sentry/fs/proc/version.go | 75 + pkg/sentry/fs/ramfs/BUILD | 62 + pkg/sentry/fs/ramfs/dir.go | 364 ++++ pkg/sentry/fs/ramfs/file.go | 148 ++ pkg/sentry/fs/ramfs/ramfs.go | 433 +++++ pkg/sentry/fs/ramfs/socket.go | 42 + pkg/sentry/fs/ramfs/symlink.go | 72 + pkg/sentry/fs/ramfs/test/BUILD | 31 + pkg/sentry/fs/ramfs/test/test.go | 46 + pkg/sentry/fs/ramfs/tree.go | 71 + pkg/sentry/fs/ramfs/tree_test.go | 79 + pkg/sentry/fs/restore.go | 75 + pkg/sentry/fs/save.go | 77 + pkg/sentry/fs/seek.go | 43 + pkg/sentry/fs/sync.go | 43 + pkg/sentry/fs/sys/BUILD | 34 + pkg/sentry/fs/sys/device.go | 20 + pkg/sentry/fs/sys/fs.go | 56 + pkg/sentry/fs/sys/sys.go | 57 + pkg/sentry/fs/timerfd/BUILD | 35 + pkg/sentry/fs/timerfd/timerfd.go | 144 ++ pkg/sentry/fs/tmpfs/BUILD | 64 + pkg/sentry/fs/tmpfs/device.go | 20 + pkg/sentry/fs/tmpfs/file_regular.go | 56 + pkg/sentry/fs/tmpfs/file_test.go | 
73 + pkg/sentry/fs/tmpfs/fs.go | 131 ++ pkg/sentry/fs/tmpfs/inode_file.go | 492 +++++ pkg/sentry/fs/tmpfs/tmpfs.go | 204 ++ pkg/sentry/fs/tty/BUILD | 63 + pkg/sentry/fs/tty/dir.go | 398 ++++ pkg/sentry/fs/tty/fs.go | 95 + pkg/sentry/fs/tty/inode.go | 143 ++ pkg/sentry/fs/tty/line_discipline.go | 342 ++++ pkg/sentry/fs/tty/master.go | 173 ++ pkg/sentry/fs/tty/slave.go | 151 ++ pkg/sentry/fs/tty/terminal.go | 44 + pkg/sentry/fs/tty/tty_test.go | 56 + pkg/sentry/hostcpu/BUILD | 20 + pkg/sentry/hostcpu/getcpu_amd64.s | 24 + pkg/sentry/hostcpu/hostcpu.go | 67 + pkg/sentry/hostcpu/hostcpu_test.go | 52 + pkg/sentry/inet/BUILD | 28 + pkg/sentry/inet/inet.go | 99 + pkg/sentry/inet/test_stack.go | 83 + pkg/sentry/kernel/BUILD | 234 +++ pkg/sentry/kernel/README.md | 106 ++ pkg/sentry/kernel/abstract_socket_namespace.go | 108 ++ pkg/sentry/kernel/auth/BUILD | 73 + pkg/sentry/kernel/auth/auth.go | 22 + pkg/sentry/kernel/auth/capability_set.go | 61 + pkg/sentry/kernel/auth/context.go | 36 + pkg/sentry/kernel/auth/credentials.go | 227 +++ pkg/sentry/kernel/auth/id.go | 121 ++ pkg/sentry/kernel/auth/id_map.go | 283 +++ pkg/sentry/kernel/auth/id_map_functions.go | 45 + pkg/sentry/kernel/auth/user_namespace.go | 130 ++ pkg/sentry/kernel/context.go | 135 ++ pkg/sentry/kernel/epoll/BUILD | 52 + pkg/sentry/kernel/epoll/epoll.go | 466 +++++ pkg/sentry/kernel/epoll/epoll_state.go | 51 + pkg/sentry/kernel/epoll/epoll_test.go | 54 + pkg/sentry/kernel/eventfd/BUILD | 46 + pkg/sentry/kernel/eventfd/eventfd.go | 172 ++ pkg/sentry/kernel/eventfd/eventfd_test.go | 78 + pkg/sentry/kernel/fd_map.go | 340 ++++ pkg/sentry/kernel/fd_map_test.go | 134 ++ pkg/sentry/kernel/fs_context.go | 172 ++ pkg/sentry/kernel/futex/BUILD | 48 + pkg/sentry/kernel/futex/futex.go | 405 ++++ pkg/sentry/kernel/futex/futex_test.go | 500 +++++ pkg/sentry/kernel/g3doc/run_states.dot | 99 + pkg/sentry/kernel/ipc_namespace.go | 43 + pkg/sentry/kernel/kdefs/BUILD | 10 + pkg/sentry/kernel/kdefs/kdefs.go | 20 + 
pkg/sentry/kernel/kernel.go | 957 ++++++++++ pkg/sentry/kernel/memevent/BUILD | 31 + pkg/sentry/kernel/memevent/memory_events.go | 98 + pkg/sentry/kernel/memevent/memory_events.proto | 25 + pkg/sentry/kernel/pending_signals.go | 126 ++ pkg/sentry/kernel/pipe/BUILD | 68 + pkg/sentry/kernel/pipe/buffers.go | 50 + pkg/sentry/kernel/pipe/device.go | 20 + pkg/sentry/kernel/pipe/node.go | 175 ++ pkg/sentry/kernel/pipe/node_test.go | 308 ++++ pkg/sentry/kernel/pipe/pipe.go | 335 ++++ pkg/sentry/kernel/pipe/pipe_test.go | 138 ++ pkg/sentry/kernel/pipe/reader.go | 37 + pkg/sentry/kernel/pipe/reader_writer.go | 91 + pkg/sentry/kernel/pipe/writer.go | 37 + pkg/sentry/kernel/ptrace.go | 1054 +++++++++++ pkg/sentry/kernel/rseq.go | 118 ++ pkg/sentry/kernel/sched/BUILD | 20 + pkg/sentry/kernel/sched/cpuset.go | 105 ++ pkg/sentry/kernel/sched/cpuset_test.go | 44 + pkg/sentry/kernel/sched/sched.go | 16 + pkg/sentry/kernel/seccomp.go | 205 +++ pkg/sentry/kernel/semaphore/BUILD | 62 + pkg/sentry/kernel/semaphore/semaphore.go | 473 +++++ pkg/sentry/kernel/semaphore/semaphore_test.go | 172 ++ pkg/sentry/kernel/sessions.go | 462 +++++ pkg/sentry/kernel/signal.go | 69 + pkg/sentry/kernel/signal_handlers.go | 79 + pkg/sentry/kernel/syscalls.go | 305 +++ pkg/sentry/kernel/syscalls_state.go | 29 + pkg/sentry/kernel/syslog.go | 100 + pkg/sentry/kernel/table_test.go | 108 ++ pkg/sentry/kernel/task.go | 606 ++++++ pkg/sentry/kernel/task_acct.go | 111 ++ pkg/sentry/kernel/task_block.go | 207 +++ pkg/sentry/kernel/task_clone.go | 475 +++++ pkg/sentry/kernel/task_context.go | 179 ++ pkg/sentry/kernel/task_exec.go | 240 +++ pkg/sentry/kernel/task_exit.go | 1139 ++++++++++++ pkg/sentry/kernel/task_identity.go | 557 ++++++ pkg/sentry/kernel/task_log.go | 137 ++ pkg/sentry/kernel/task_net.go | 35 + pkg/sentry/kernel/task_resources.go | 126 ++ pkg/sentry/kernel/task_run.go | 346 ++++ pkg/sentry/kernel/task_sched.go | 329 ++++ pkg/sentry/kernel/task_signals.go | 1056 +++++++++++ 
pkg/sentry/kernel/task_start.go | 252 +++ pkg/sentry/kernel/task_stop.go | 226 +++ pkg/sentry/kernel/task_syscall.go | 434 +++++ pkg/sentry/kernel/task_test.go | 69 + pkg/sentry/kernel/task_usermem.go | 298 +++ pkg/sentry/kernel/thread_group.go | 269 +++ pkg/sentry/kernel/threads.go | 443 +++++ pkg/sentry/kernel/time/BUILD | 32 + pkg/sentry/kernel/time/context.go | 44 + pkg/sentry/kernel/time/time.go | 649 +++++++ pkg/sentry/kernel/timekeeper.go | 270 +++ pkg/sentry/kernel/timekeeper_state.go | 41 + pkg/sentry/kernel/timekeeper_test.go | 156 ++ pkg/sentry/kernel/timer.go | 282 +++ pkg/sentry/kernel/uts_namespace.go | 100 + pkg/sentry/kernel/vdso.go | 145 ++ pkg/sentry/kernel/version.go | 33 + pkg/sentry/limits/BUILD | 39 + pkg/sentry/limits/context.go | 35 + pkg/sentry/limits/limits.go | 128 ++ pkg/sentry/limits/limits_test.go | 37 + pkg/sentry/limits/linux.go | 100 + pkg/sentry/loader/BUILD | 59 + pkg/sentry/loader/elf.go | 637 +++++++ pkg/sentry/loader/interpreter.go | 105 ++ pkg/sentry/loader/loader.go | 277 +++ pkg/sentry/loader/vdso.go | 382 ++++ pkg/sentry/loader/vdso_state.go | 47 + pkg/sentry/memmap/BUILD | 71 + pkg/sentry/memmap/mapping_set.go | 245 +++ pkg/sentry/memmap/mapping_set_test.go | 186 ++ pkg/sentry/memmap/memmap.go | 297 +++ pkg/sentry/memutil/BUILD | 14 + pkg/sentry/memutil/memutil.go | 16 + pkg/sentry/memutil/memutil_unsafe.go | 35 + pkg/sentry/mm/BUILD | 155 ++ pkg/sentry/mm/README.md | 279 +++ pkg/sentry/mm/address_space.go | 223 +++ pkg/sentry/mm/aio_context.go | 377 ++++ pkg/sentry/mm/aio_context_state.go | 20 + pkg/sentry/mm/debug.go | 98 + pkg/sentry/mm/io.go | 604 ++++++ pkg/sentry/mm/lifecycle.go | 218 +++ pkg/sentry/mm/metadata.go | 139 ++ pkg/sentry/mm/mm.go | 417 +++++ pkg/sentry/mm/mm_test.go | 174 ++ pkg/sentry/mm/pma.go | 928 ++++++++++ pkg/sentry/mm/proc_pid_maps.go | 105 ++ pkg/sentry/mm/save_restore.go | 57 + pkg/sentry/mm/special_mappable.go | 147 ++ pkg/sentry/mm/syscalls.go | 794 ++++++++ pkg/sentry/mm/vma.go | 476 +++++ 
pkg/sentry/platform/BUILD | 51 + pkg/sentry/platform/context.go | 36 + pkg/sentry/platform/filemem/BUILD | 69 + pkg/sentry/platform/filemem/filemem.go | 838 +++++++++ pkg/sentry/platform/filemem/filemem_state.go | 170 ++ pkg/sentry/platform/filemem/filemem_test.go | 122 ++ pkg/sentry/platform/filemem/filemem_unsafe.go | 40 + pkg/sentry/platform/interrupt/BUILD | 19 + pkg/sentry/platform/interrupt/interrupt.go | 96 + pkg/sentry/platform/interrupt/interrupt_test.go | 99 + pkg/sentry/platform/kvm/BUILD | 90 + pkg/sentry/platform/kvm/address_space.go | 207 +++ pkg/sentry/platform/kvm/bluepill.go | 41 + pkg/sentry/platform/kvm/bluepill_amd64.go | 143 ++ pkg/sentry/platform/kvm/bluepill_amd64.s | 87 + pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 28 + pkg/sentry/platform/kvm/bluepill_fault.go | 127 ++ pkg/sentry/platform/kvm/bluepill_unsafe.go | 175 ++ pkg/sentry/platform/kvm/context.go | 81 + pkg/sentry/platform/kvm/host_map.go | 168 ++ pkg/sentry/platform/kvm/kvm.go | 149 ++ pkg/sentry/platform/kvm/kvm_amd64.go | 213 +++ pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 93 + pkg/sentry/platform/kvm/kvm_const.go | 56 + pkg/sentry/platform/kvm/kvm_test.go | 415 +++++ pkg/sentry/platform/kvm/machine.go | 412 +++++ pkg/sentry/platform/kvm/machine_amd64.go | 168 ++ pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 156 ++ pkg/sentry/platform/kvm/machine_unsafe.go | 112 ++ pkg/sentry/platform/kvm/physical_map.go | 221 +++ pkg/sentry/platform/kvm/testutil/BUILD | 15 + pkg/sentry/platform/kvm/testutil/testutil.go | 75 + pkg/sentry/platform/kvm/testutil/testutil_amd64.go | 135 ++ pkg/sentry/platform/kvm/testutil/testutil_amd64.s | 98 + pkg/sentry/platform/kvm/virtual_map.go | 113 ++ pkg/sentry/platform/kvm/virtual_map_test.go | 78 + pkg/sentry/platform/mmap_min_addr.go | 60 + pkg/sentry/platform/platform.go | 428 +++++ pkg/sentry/platform/procid/BUILD | 32 + pkg/sentry/platform/procid/procid.go | 21 + pkg/sentry/platform/procid/procid_amd64.s | 30 + 
pkg/sentry/platform/procid/procid_net_test.go | 21 + pkg/sentry/platform/procid/procid_test.go | 85 + pkg/sentry/platform/ptrace/BUILD | 31 + pkg/sentry/platform/ptrace/ptrace.go | 242 +++ pkg/sentry/platform/ptrace/ptrace_unsafe.go | 166 ++ pkg/sentry/platform/ptrace/stub_amd64.s | 114 ++ pkg/sentry/platform/ptrace/stub_unsafe.go | 98 + pkg/sentry/platform/ptrace/subprocess.go | 559 ++++++ pkg/sentry/platform/ptrace/subprocess_amd64.go | 104 ++ pkg/sentry/platform/ptrace/subprocess_linux.go | 146 ++ .../ptrace/subprocess_linux_amd64_unsafe.go | 109 ++ pkg/sentry/platform/ptrace/subprocess_unsafe.go | 28 + pkg/sentry/platform/ring0/BUILD | 52 + pkg/sentry/platform/ring0/defs.go | 93 + pkg/sentry/platform/ring0/defs_amd64.go | 113 ++ pkg/sentry/platform/ring0/entry_amd64.go | 128 ++ pkg/sentry/platform/ring0/entry_amd64.s | 334 ++++ pkg/sentry/platform/ring0/gen_offsets/BUILD | 25 + pkg/sentry/platform/ring0/gen_offsets/main.go | 24 + pkg/sentry/platform/ring0/kernel.go | 71 + pkg/sentry/platform/ring0/kernel_amd64.go | 280 +++ pkg/sentry/platform/ring0/kernel_unsafe.go | 41 + pkg/sentry/platform/ring0/lib_amd64.go | 128 ++ pkg/sentry/platform/ring0/lib_amd64.s | 247 +++ pkg/sentry/platform/ring0/offsets_amd64.go | 93 + pkg/sentry/platform/ring0/pagetables/BUILD | 32 + pkg/sentry/platform/ring0/pagetables/pagetables.go | 193 ++ .../platform/ring0/pagetables/pagetables_amd64.go | 397 ++++ .../platform/ring0/pagetables/pagetables_test.go | 161 ++ .../platform/ring0/pagetables/pagetables_unsafe.go | 31 + .../platform/ring0/pagetables/pagetables_x86.go | 79 + .../ring0/pagetables/pagetables_x86_test.go | 79 + pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 74 + .../platform/ring0/pagetables/pcids_x86_test.go | 65 + pkg/sentry/platform/ring0/ring0.go | 16 + pkg/sentry/platform/ring0/x86.go | 242 +++ pkg/sentry/platform/safecopy/BUILD | 28 + pkg/sentry/platform/safecopy/atomic_amd64.s | 108 ++ pkg/sentry/platform/safecopy/memclr_amd64.s | 157 ++ 
pkg/sentry/platform/safecopy/memcpy_amd64.s | 242 +++ pkg/sentry/platform/safecopy/safecopy.go | 140 ++ pkg/sentry/platform/safecopy/safecopy_test.go | 617 +++++++ pkg/sentry/platform/safecopy/safecopy_unsafe.go | 315 ++++ pkg/sentry/platform/safecopy/sighandler_amd64.s | 124 ++ pkg/sentry/safemem/BUILD | 28 + pkg/sentry/safemem/block_unsafe.go | 269 +++ pkg/sentry/safemem/io.go | 339 ++++ pkg/sentry/safemem/io_test.go | 199 ++ pkg/sentry/safemem/safemem.go | 16 + pkg/sentry/safemem/seq_test.go | 196 ++ pkg/sentry/safemem/seq_unsafe.go | 299 +++ pkg/sentry/sighandling/BUILD | 18 + pkg/sentry/sighandling/sighandling.go | 116 ++ pkg/sentry/sighandling/sighandling_unsafe.go | 74 + pkg/sentry/socket/BUILD | 37 + pkg/sentry/socket/control/BUILD | 39 + pkg/sentry/socket/control/control.go | 370 ++++ pkg/sentry/socket/epsocket/BUILD | 61 + pkg/sentry/socket/epsocket/device.go | 20 + pkg/sentry/socket/epsocket/epsocket.go | 1230 +++++++++++++ pkg/sentry/socket/epsocket/provider.go | 113 ++ pkg/sentry/socket/epsocket/save_restore.go | 27 + pkg/sentry/socket/epsocket/stack.go | 132 ++ pkg/sentry/socket/hostinet/BUILD | 53 + pkg/sentry/socket/hostinet/device.go | 19 + pkg/sentry/socket/hostinet/hostinet.go | 17 + pkg/sentry/socket/hostinet/save_restore.go | 20 + pkg/sentry/socket/hostinet/socket.go | 562 ++++++ pkg/sentry/socket/hostinet/socket_unsafe.go | 138 ++ pkg/sentry/socket/hostinet/stack.go | 244 +++ pkg/sentry/socket/netlink/BUILD | 47 + pkg/sentry/socket/netlink/message.go | 159 ++ pkg/sentry/socket/netlink/port/BUILD | 28 + pkg/sentry/socket/netlink/port/port.go | 114 ++ pkg/sentry/socket/netlink/port/port_test.go | 82 + pkg/sentry/socket/netlink/provider.go | 104 ++ pkg/sentry/socket/netlink/route/BUILD | 33 + pkg/sentry/socket/netlink/route/protocol.go | 189 ++ pkg/sentry/socket/netlink/socket.go | 517 ++++++ pkg/sentry/socket/rpcinet/BUILD | 59 + pkg/sentry/socket/rpcinet/conn/BUILD | 17 + pkg/sentry/socket/rpcinet/conn/conn.go | 167 ++ 
pkg/sentry/socket/rpcinet/device.go | 19 + pkg/sentry/socket/rpcinet/notifier/BUILD | 15 + pkg/sentry/socket/rpcinet/notifier/notifier.go | 230 +++ pkg/sentry/socket/rpcinet/rpcinet.go | 16 + pkg/sentry/socket/rpcinet/socket.go | 567 ++++++ pkg/sentry/socket/rpcinet/stack.go | 175 ++ pkg/sentry/socket/rpcinet/stack_unsafe.go | 193 ++ pkg/sentry/socket/rpcinet/syscall_rpc.proto | 351 ++++ pkg/sentry/socket/socket.go | 205 +++ pkg/sentry/socket/unix/BUILD | 48 + pkg/sentry/socket/unix/device.go | 20 + pkg/sentry/socket/unix/io.go | 88 + pkg/sentry/socket/unix/unix.go | 571 ++++++ pkg/sentry/state/BUILD | 21 + pkg/sentry/state/state.go | 113 ++ pkg/sentry/state/state_metadata.go | 29 + pkg/sentry/strace/BUILD | 48 + pkg/sentry/strace/clone.go | 113 ++ pkg/sentry/strace/futex.go | 91 + pkg/sentry/strace/linux64.go | 338 ++++ pkg/sentry/strace/open.go | 105 ++ pkg/sentry/strace/ptrace.go | 178 ++ pkg/sentry/strace/socket.go | 674 +++++++ pkg/sentry/strace/strace.go | 666 +++++++ pkg/sentry/strace/strace.proto | 50 + pkg/sentry/strace/syscalls.go | 217 +++ pkg/sentry/syscalls/BUILD | 43 + pkg/sentry/syscalls/epoll.go | 174 ++ pkg/sentry/syscalls/linux/BUILD | 103 ++ pkg/sentry/syscalls/linux/error.go | 117 ++ pkg/sentry/syscalls/linux/flags.go | 95 + pkg/sentry/syscalls/linux/linux64.go | 376 ++++ pkg/sentry/syscalls/linux/sigset.go | 69 + pkg/sentry/syscalls/linux/sys_aio.go | 402 ++++ pkg/sentry/syscalls/linux/sys_capability.go | 149 ++ pkg/sentry/syscalls/linux/sys_epoll.go | 171 ++ pkg/sentry/syscalls/linux/sys_eventfd.go | 65 + pkg/sentry/syscalls/linux/sys_file.go | 1942 ++++++++++++++++++++ pkg/sentry/syscalls/linux/sys_futex.go | 319 ++++ pkg/sentry/syscalls/linux/sys_getdents.go | 269 +++ pkg/sentry/syscalls/linux/sys_identity.go | 180 ++ pkg/sentry/syscalls/linux/sys_inotify.go | 135 ++ pkg/sentry/syscalls/linux/sys_lseek.go | 55 + pkg/sentry/syscalls/linux/sys_mmap.go | 435 +++++ pkg/sentry/syscalls/linux/sys_mount.go | 140 ++ 
pkg/sentry/syscalls/linux/sys_pipe.go | 78 + pkg/sentry/syscalls/linux/sys_poll.go | 429 +++++ pkg/sentry/syscalls/linux/sys_prctl.go | 188 ++ pkg/sentry/syscalls/linux/sys_random.go | 92 + pkg/sentry/syscalls/linux/sys_read.go | 274 +++ pkg/sentry/syscalls/linux/sys_rlimit.go | 217 +++ pkg/sentry/syscalls/linux/sys_rusage.go | 112 ++ pkg/sentry/syscalls/linux/sys_sched.go | 100 + pkg/sentry/syscalls/linux/sys_sem.go | 166 ++ pkg/sentry/syscalls/linux/sys_signal.go | 553 ++++++ pkg/sentry/syscalls/linux/sys_socket.go | 1059 +++++++++++ pkg/sentry/syscalls/linux/sys_stat.go | 209 +++ pkg/sentry/syscalls/linux/sys_sync.go | 75 + pkg/sentry/syscalls/linux/sys_sysinfo.go | 42 + pkg/sentry/syscalls/linux/sys_syslog.go | 61 + pkg/sentry/syscalls/linux/sys_thread.go | 704 +++++++ pkg/sentry/syscalls/linux/sys_time.go | 338 ++++ pkg/sentry/syscalls/linux/sys_timer.go | 168 ++ pkg/sentry/syscalls/linux/sys_timerfd.go | 135 ++ pkg/sentry/syscalls/linux/sys_tls.go | 48 + pkg/sentry/syscalls/linux/sys_utsname.go | 89 + pkg/sentry/syscalls/linux/sys_write.go | 274 +++ pkg/sentry/syscalls/linux/timespec.go | 112 ++ pkg/sentry/syscalls/polling.go | 137 ++ pkg/sentry/syscalls/syscalls.go | 72 + pkg/sentry/syscalls/unimplemented_syscall.proto | 27 + pkg/sentry/time/BUILD | 48 + pkg/sentry/time/calibrated_clock.go | 269 +++ pkg/sentry/time/calibrated_clock_test.go | 186 ++ pkg/sentry/time/clock_id.go | 40 + pkg/sentry/time/clocks.go | 31 + pkg/sentry/time/muldiv_amd64.s | 44 + pkg/sentry/time/parameters.go | 239 +++ pkg/sentry/time/parameters_test.go | 486 +++++ pkg/sentry/time/sampler.go | 225 +++ pkg/sentry/time/sampler_test.go | 183 ++ pkg/sentry/time/sampler_unsafe.go | 56 + pkg/sentry/time/tsc_amd64.s | 27 + pkg/sentry/uniqueid/BUILD | 11 + pkg/sentry/uniqueid/context.go | 44 + pkg/sentry/usage/BUILD | 38 + pkg/sentry/usage/cpu.go | 44 + pkg/sentry/usage/io.go | 88 + pkg/sentry/usage/memory.go | 282 +++ pkg/sentry/usage/memory_unsafe.go | 27 + pkg/sentry/usage/usage.go | 16 + 
pkg/sentry/usermem/BUILD | 70 + pkg/sentry/usermem/README.md | 31 + pkg/sentry/usermem/access_type.go | 117 ++ pkg/sentry/usermem/addr.go | 106 ++ pkg/sentry/usermem/addr_range_seq_test.go | 197 ++ pkg/sentry/usermem/addr_range_seq_unsafe.go | 277 +++ pkg/sentry/usermem/bytes_io.go | 126 ++ pkg/sentry/usermem/bytes_io_unsafe.go | 39 + pkg/sentry/usermem/usermem.go | 572 ++++++ pkg/sentry/usermem/usermem_test.go | 411 +++++ pkg/sentry/usermem/usermem_x86.go | 38 + pkg/sentry/watchdog/BUILD | 17 + pkg/sentry/watchdog/watchdog.go | 279 +++ 574 files changed, 104532 insertions(+) create mode 100644 pkg/sentry/BUILD create mode 100644 pkg/sentry/arch/BUILD create mode 100644 pkg/sentry/arch/aligned.go create mode 100644 pkg/sentry/arch/arch.go create mode 100644 pkg/sentry/arch/arch_amd64.go create mode 100644 pkg/sentry/arch/arch_amd64.s create mode 100644 pkg/sentry/arch/arch_state_x86.go create mode 100644 pkg/sentry/arch/arch_x86.go create mode 100644 pkg/sentry/arch/auxv.go create mode 100644 pkg/sentry/arch/registers.proto create mode 100644 pkg/sentry/arch/signal_act.go create mode 100644 pkg/sentry/arch/signal_amd64.go create mode 100644 pkg/sentry/arch/signal_info.go create mode 100644 pkg/sentry/arch/signal_stack.go create mode 100644 pkg/sentry/arch/stack.go create mode 100644 pkg/sentry/arch/syscalls_amd64.go create mode 100644 pkg/sentry/context/BUILD create mode 100644 pkg/sentry/context/context.go create mode 100644 pkg/sentry/context/contexttest/BUILD create mode 100644 pkg/sentry/context/contexttest/contexttest.go create mode 100644 pkg/sentry/control/BUILD create mode 100644 pkg/sentry/control/control.go create mode 100644 pkg/sentry/control/proc.go create mode 100644 pkg/sentry/control/proc_test.go create mode 100644 pkg/sentry/device/BUILD create mode 100644 pkg/sentry/device/device.go create mode 100644 pkg/sentry/device/device_test.go create mode 100644 pkg/sentry/fs/BUILD create mode 100644 pkg/sentry/fs/README.md create mode 100644 
pkg/sentry/fs/anon/BUILD create mode 100644 pkg/sentry/fs/anon/anon.go create mode 100644 pkg/sentry/fs/anon/device.go create mode 100644 pkg/sentry/fs/ashmem/BUILD create mode 100644 pkg/sentry/fs/ashmem/area.go create mode 100644 pkg/sentry/fs/ashmem/device.go create mode 100644 pkg/sentry/fs/ashmem/pin_board.go create mode 100644 pkg/sentry/fs/ashmem/pin_board_test.go create mode 100644 pkg/sentry/fs/attr.go create mode 100644 pkg/sentry/fs/binder/BUILD create mode 100644 pkg/sentry/fs/binder/binder.go create mode 100644 pkg/sentry/fs/context.go create mode 100644 pkg/sentry/fs/copy_up.go create mode 100644 pkg/sentry/fs/copy_up_test.go create mode 100644 pkg/sentry/fs/dentry.go create mode 100644 pkg/sentry/fs/dev/BUILD create mode 100644 pkg/sentry/fs/dev/dev.go create mode 100644 pkg/sentry/fs/dev/device.go create mode 100644 pkg/sentry/fs/dev/fs.go create mode 100644 pkg/sentry/fs/dev/full.go create mode 100644 pkg/sentry/fs/dev/null.go create mode 100644 pkg/sentry/fs/dev/random.go create mode 100644 pkg/sentry/fs/dirent.go create mode 100644 pkg/sentry/fs/dirent_cache.go create mode 100644 pkg/sentry/fs/dirent_cache_test.go create mode 100644 pkg/sentry/fs/dirent_refs_test.go create mode 100644 pkg/sentry/fs/dirent_state.go create mode 100644 pkg/sentry/fs/fdpipe/BUILD create mode 100644 pkg/sentry/fs/fdpipe/pipe.go create mode 100644 pkg/sentry/fs/fdpipe/pipe_opener.go create mode 100644 pkg/sentry/fs/fdpipe/pipe_opener_test.go create mode 100644 pkg/sentry/fs/fdpipe/pipe_state.go create mode 100644 pkg/sentry/fs/fdpipe/pipe_test.go create mode 100644 pkg/sentry/fs/file.go create mode 100644 pkg/sentry/fs/file_operations.go create mode 100644 pkg/sentry/fs/file_overlay.go create mode 100644 pkg/sentry/fs/file_overlay_test.go create mode 100644 pkg/sentry/fs/file_state.go create mode 100644 pkg/sentry/fs/file_test.go create mode 100644 pkg/sentry/fs/filesystems.go create mode 100644 pkg/sentry/fs/filetest/BUILD create mode 100644 
pkg/sentry/fs/filetest/filetest.go create mode 100644 pkg/sentry/fs/flags.go create mode 100644 pkg/sentry/fs/fs.go create mode 100644 pkg/sentry/fs/fsutil/BUILD create mode 100644 pkg/sentry/fs/fsutil/README.md create mode 100644 pkg/sentry/fs/fsutil/dirty_set.go create mode 100644 pkg/sentry/fs/fsutil/dirty_set_test.go create mode 100644 pkg/sentry/fs/fsutil/file.go create mode 100644 pkg/sentry/fs/fsutil/file_range_set.go create mode 100644 pkg/sentry/fs/fsutil/frame_ref_set.go create mode 100644 pkg/sentry/fs/fsutil/fsutil.go create mode 100644 pkg/sentry/fs/fsutil/handle.go create mode 100644 pkg/sentry/fs/fsutil/handle_test.go create mode 100644 pkg/sentry/fs/fsutil/host_file_mapper.go create mode 100644 pkg/sentry/fs/fsutil/host_file_mapper_state.go create mode 100644 pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go create mode 100644 pkg/sentry/fs/fsutil/inode.go create mode 100644 pkg/sentry/fs/fsutil/inode_cached.go create mode 100644 pkg/sentry/fs/fsutil/inode_cached_test.go create mode 100644 pkg/sentry/fs/g3doc/inotify.md create mode 100644 pkg/sentry/fs/gofer/BUILD create mode 100644 pkg/sentry/fs/gofer/attr.go create mode 100644 pkg/sentry/fs/gofer/context_file.go create mode 100644 pkg/sentry/fs/gofer/device.go create mode 100644 pkg/sentry/fs/gofer/file.go create mode 100644 pkg/sentry/fs/gofer/file_state.go create mode 100644 pkg/sentry/fs/gofer/fs.go create mode 100644 pkg/sentry/fs/gofer/gofer_test.go create mode 100644 pkg/sentry/fs/gofer/handles.go create mode 100644 pkg/sentry/fs/gofer/inode.go create mode 100644 pkg/sentry/fs/gofer/inode_state.go create mode 100644 pkg/sentry/fs/gofer/path.go create mode 100644 pkg/sentry/fs/gofer/session.go create mode 100644 pkg/sentry/fs/gofer/session_state.go create mode 100644 pkg/sentry/fs/gofer/socket.go create mode 100644 pkg/sentry/fs/gofer/util.go create mode 100644 pkg/sentry/fs/host/BUILD create mode 100644 pkg/sentry/fs/host/control.go create mode 100644 pkg/sentry/fs/host/descriptor.go create 
mode 100644 pkg/sentry/fs/host/descriptor_state.go create mode 100644 pkg/sentry/fs/host/device.go create mode 100644 pkg/sentry/fs/host/file.go create mode 100644 pkg/sentry/fs/host/fs.go create mode 100644 pkg/sentry/fs/host/fs_test.go create mode 100644 pkg/sentry/fs/host/inode.go create mode 100644 pkg/sentry/fs/host/inode_state.go create mode 100644 pkg/sentry/fs/host/inode_test.go create mode 100644 pkg/sentry/fs/host/ioctl_unsafe.go create mode 100644 pkg/sentry/fs/host/socket.go create mode 100644 pkg/sentry/fs/host/socket_state.go create mode 100644 pkg/sentry/fs/host/socket_test.go create mode 100644 pkg/sentry/fs/host/socket_unsafe.go create mode 100644 pkg/sentry/fs/host/util.go create mode 100644 pkg/sentry/fs/host/util_unsafe.go create mode 100644 pkg/sentry/fs/host/wait_test.go create mode 100644 pkg/sentry/fs/inode.go create mode 100644 pkg/sentry/fs/inode_inotify.go create mode 100644 pkg/sentry/fs/inode_operations.go create mode 100644 pkg/sentry/fs/inode_overlay.go create mode 100644 pkg/sentry/fs/inode_overlay_test.go create mode 100644 pkg/sentry/fs/inotify.go create mode 100644 pkg/sentry/fs/inotify_event.go create mode 100644 pkg/sentry/fs/inotify_watch.go create mode 100644 pkg/sentry/fs/lock/BUILD create mode 100644 pkg/sentry/fs/lock/lock.go create mode 100644 pkg/sentry/fs/lock/lock_range_test.go create mode 100644 pkg/sentry/fs/lock/lock_set_functions.go create mode 100644 pkg/sentry/fs/lock/lock_test.go create mode 100644 pkg/sentry/fs/mock.go create mode 100644 pkg/sentry/fs/mount.go create mode 100644 pkg/sentry/fs/mount_overlay.go create mode 100644 pkg/sentry/fs/mount_state.go create mode 100644 pkg/sentry/fs/mount_test.go create mode 100644 pkg/sentry/fs/mounts.go create mode 100644 pkg/sentry/fs/mounts_test.go create mode 100644 pkg/sentry/fs/offset.go create mode 100644 pkg/sentry/fs/overlay.go create mode 100644 pkg/sentry/fs/path.go create mode 100644 pkg/sentry/fs/path_test.go create mode 100644 pkg/sentry/fs/proc/BUILD create 
mode 100644 pkg/sentry/fs/proc/README.md create mode 100644 pkg/sentry/fs/proc/cpuinfo.go create mode 100644 pkg/sentry/fs/proc/device/BUILD create mode 100644 pkg/sentry/fs/proc/device/device.go create mode 100644 pkg/sentry/fs/proc/exec_args.go create mode 100644 pkg/sentry/fs/proc/fds.go create mode 100644 pkg/sentry/fs/proc/file.go create mode 100644 pkg/sentry/fs/proc/filesystems.go create mode 100644 pkg/sentry/fs/proc/fs.go create mode 100644 pkg/sentry/fs/proc/loadavg.go create mode 100644 pkg/sentry/fs/proc/meminfo.go create mode 100644 pkg/sentry/fs/proc/mounts.go create mode 100644 pkg/sentry/fs/proc/net.go create mode 100644 pkg/sentry/fs/proc/net_test.go create mode 100644 pkg/sentry/fs/proc/proc.go create mode 100644 pkg/sentry/fs/proc/seqfile/BUILD create mode 100644 pkg/sentry/fs/proc/seqfile/seqfile.go create mode 100644 pkg/sentry/fs/proc/seqfile/seqfile_test.go create mode 100644 pkg/sentry/fs/proc/stat.go create mode 100644 pkg/sentry/fs/proc/sys.go create mode 100644 pkg/sentry/fs/proc/sys_net.go create mode 100644 pkg/sentry/fs/proc/sys_net_test.go create mode 100644 pkg/sentry/fs/proc/task.go create mode 100644 pkg/sentry/fs/proc/uid_gid_map.go create mode 100644 pkg/sentry/fs/proc/uptime.go create mode 100644 pkg/sentry/fs/proc/version.go create mode 100644 pkg/sentry/fs/ramfs/BUILD create mode 100644 pkg/sentry/fs/ramfs/dir.go create mode 100644 pkg/sentry/fs/ramfs/file.go create mode 100644 pkg/sentry/fs/ramfs/ramfs.go create mode 100644 pkg/sentry/fs/ramfs/socket.go create mode 100644 pkg/sentry/fs/ramfs/symlink.go create mode 100644 pkg/sentry/fs/ramfs/test/BUILD create mode 100644 pkg/sentry/fs/ramfs/test/test.go create mode 100644 pkg/sentry/fs/ramfs/tree.go create mode 100644 pkg/sentry/fs/ramfs/tree_test.go create mode 100644 pkg/sentry/fs/restore.go create mode 100644 pkg/sentry/fs/save.go create mode 100644 pkg/sentry/fs/seek.go create mode 100644 pkg/sentry/fs/sync.go create mode 100644 pkg/sentry/fs/sys/BUILD create mode 100644 
pkg/sentry/fs/sys/device.go create mode 100644 pkg/sentry/fs/sys/fs.go create mode 100644 pkg/sentry/fs/sys/sys.go create mode 100644 pkg/sentry/fs/timerfd/BUILD create mode 100644 pkg/sentry/fs/timerfd/timerfd.go create mode 100644 pkg/sentry/fs/tmpfs/BUILD create mode 100644 pkg/sentry/fs/tmpfs/device.go create mode 100644 pkg/sentry/fs/tmpfs/file_regular.go create mode 100644 pkg/sentry/fs/tmpfs/file_test.go create mode 100644 pkg/sentry/fs/tmpfs/fs.go create mode 100644 pkg/sentry/fs/tmpfs/inode_file.go create mode 100644 pkg/sentry/fs/tmpfs/tmpfs.go create mode 100644 pkg/sentry/fs/tty/BUILD create mode 100644 pkg/sentry/fs/tty/dir.go create mode 100644 pkg/sentry/fs/tty/fs.go create mode 100644 pkg/sentry/fs/tty/inode.go create mode 100644 pkg/sentry/fs/tty/line_discipline.go create mode 100644 pkg/sentry/fs/tty/master.go create mode 100644 pkg/sentry/fs/tty/slave.go create mode 100644 pkg/sentry/fs/tty/terminal.go create mode 100644 pkg/sentry/fs/tty/tty_test.go create mode 100644 pkg/sentry/hostcpu/BUILD create mode 100644 pkg/sentry/hostcpu/getcpu_amd64.s create mode 100644 pkg/sentry/hostcpu/hostcpu.go create mode 100644 pkg/sentry/hostcpu/hostcpu_test.go create mode 100644 pkg/sentry/inet/BUILD create mode 100644 pkg/sentry/inet/inet.go create mode 100644 pkg/sentry/inet/test_stack.go create mode 100644 pkg/sentry/kernel/BUILD create mode 100644 pkg/sentry/kernel/README.md create mode 100644 pkg/sentry/kernel/abstract_socket_namespace.go create mode 100644 pkg/sentry/kernel/auth/BUILD create mode 100644 pkg/sentry/kernel/auth/auth.go create mode 100644 pkg/sentry/kernel/auth/capability_set.go create mode 100644 pkg/sentry/kernel/auth/context.go create mode 100644 pkg/sentry/kernel/auth/credentials.go create mode 100644 pkg/sentry/kernel/auth/id.go create mode 100644 pkg/sentry/kernel/auth/id_map.go create mode 100644 pkg/sentry/kernel/auth/id_map_functions.go create mode 100644 pkg/sentry/kernel/auth/user_namespace.go create mode 100644 
pkg/sentry/kernel/context.go create mode 100644 pkg/sentry/kernel/epoll/BUILD create mode 100644 pkg/sentry/kernel/epoll/epoll.go create mode 100644 pkg/sentry/kernel/epoll/epoll_state.go create mode 100644 pkg/sentry/kernel/epoll/epoll_test.go create mode 100644 pkg/sentry/kernel/eventfd/BUILD create mode 100644 pkg/sentry/kernel/eventfd/eventfd.go create mode 100644 pkg/sentry/kernel/eventfd/eventfd_test.go create mode 100644 pkg/sentry/kernel/fd_map.go create mode 100644 pkg/sentry/kernel/fd_map_test.go create mode 100644 pkg/sentry/kernel/fs_context.go create mode 100644 pkg/sentry/kernel/futex/BUILD create mode 100644 pkg/sentry/kernel/futex/futex.go create mode 100644 pkg/sentry/kernel/futex/futex_test.go create mode 100644 pkg/sentry/kernel/g3doc/run_states.dot create mode 100644 pkg/sentry/kernel/ipc_namespace.go create mode 100644 pkg/sentry/kernel/kdefs/BUILD create mode 100644 pkg/sentry/kernel/kdefs/kdefs.go create mode 100644 pkg/sentry/kernel/kernel.go create mode 100644 pkg/sentry/kernel/memevent/BUILD create mode 100644 pkg/sentry/kernel/memevent/memory_events.go create mode 100644 pkg/sentry/kernel/memevent/memory_events.proto create mode 100644 pkg/sentry/kernel/pending_signals.go create mode 100644 pkg/sentry/kernel/pipe/BUILD create mode 100644 pkg/sentry/kernel/pipe/buffers.go create mode 100644 pkg/sentry/kernel/pipe/device.go create mode 100644 pkg/sentry/kernel/pipe/node.go create mode 100644 pkg/sentry/kernel/pipe/node_test.go create mode 100644 pkg/sentry/kernel/pipe/pipe.go create mode 100644 pkg/sentry/kernel/pipe/pipe_test.go create mode 100644 pkg/sentry/kernel/pipe/reader.go create mode 100644 pkg/sentry/kernel/pipe/reader_writer.go create mode 100644 pkg/sentry/kernel/pipe/writer.go create mode 100644 pkg/sentry/kernel/ptrace.go create mode 100644 pkg/sentry/kernel/rseq.go create mode 100644 pkg/sentry/kernel/sched/BUILD create mode 100644 pkg/sentry/kernel/sched/cpuset.go create mode 100644 pkg/sentry/kernel/sched/cpuset_test.go 
create mode 100644 pkg/sentry/kernel/sched/sched.go create mode 100644 pkg/sentry/kernel/seccomp.go create mode 100644 pkg/sentry/kernel/semaphore/BUILD create mode 100644 pkg/sentry/kernel/semaphore/semaphore.go create mode 100644 pkg/sentry/kernel/semaphore/semaphore_test.go create mode 100644 pkg/sentry/kernel/sessions.go create mode 100644 pkg/sentry/kernel/signal.go create mode 100644 pkg/sentry/kernel/signal_handlers.go create mode 100644 pkg/sentry/kernel/syscalls.go create mode 100644 pkg/sentry/kernel/syscalls_state.go create mode 100644 pkg/sentry/kernel/syslog.go create mode 100644 pkg/sentry/kernel/table_test.go create mode 100644 pkg/sentry/kernel/task.go create mode 100644 pkg/sentry/kernel/task_acct.go create mode 100644 pkg/sentry/kernel/task_block.go create mode 100644 pkg/sentry/kernel/task_clone.go create mode 100644 pkg/sentry/kernel/task_context.go create mode 100644 pkg/sentry/kernel/task_exec.go create mode 100644 pkg/sentry/kernel/task_exit.go create mode 100644 pkg/sentry/kernel/task_identity.go create mode 100644 pkg/sentry/kernel/task_log.go create mode 100644 pkg/sentry/kernel/task_net.go create mode 100644 pkg/sentry/kernel/task_resources.go create mode 100644 pkg/sentry/kernel/task_run.go create mode 100644 pkg/sentry/kernel/task_sched.go create mode 100644 pkg/sentry/kernel/task_signals.go create mode 100644 pkg/sentry/kernel/task_start.go create mode 100644 pkg/sentry/kernel/task_stop.go create mode 100644 pkg/sentry/kernel/task_syscall.go create mode 100644 pkg/sentry/kernel/task_test.go create mode 100644 pkg/sentry/kernel/task_usermem.go create mode 100644 pkg/sentry/kernel/thread_group.go create mode 100644 pkg/sentry/kernel/threads.go create mode 100644 pkg/sentry/kernel/time/BUILD create mode 100644 pkg/sentry/kernel/time/context.go create mode 100644 pkg/sentry/kernel/time/time.go create mode 100644 pkg/sentry/kernel/timekeeper.go create mode 100644 pkg/sentry/kernel/timekeeper_state.go create mode 100644 
pkg/sentry/kernel/timekeeper_test.go create mode 100644 pkg/sentry/kernel/timer.go create mode 100644 pkg/sentry/kernel/uts_namespace.go create mode 100644 pkg/sentry/kernel/vdso.go create mode 100644 pkg/sentry/kernel/version.go create mode 100644 pkg/sentry/limits/BUILD create mode 100644 pkg/sentry/limits/context.go create mode 100644 pkg/sentry/limits/limits.go create mode 100644 pkg/sentry/limits/limits_test.go create mode 100644 pkg/sentry/limits/linux.go create mode 100644 pkg/sentry/loader/BUILD create mode 100644 pkg/sentry/loader/elf.go create mode 100644 pkg/sentry/loader/interpreter.go create mode 100644 pkg/sentry/loader/loader.go create mode 100644 pkg/sentry/loader/vdso.go create mode 100644 pkg/sentry/loader/vdso_state.go create mode 100644 pkg/sentry/memmap/BUILD create mode 100644 pkg/sentry/memmap/mapping_set.go create mode 100644 pkg/sentry/memmap/mapping_set_test.go create mode 100644 pkg/sentry/memmap/memmap.go create mode 100644 pkg/sentry/memutil/BUILD create mode 100644 pkg/sentry/memutil/memutil.go create mode 100644 pkg/sentry/memutil/memutil_unsafe.go create mode 100644 pkg/sentry/mm/BUILD create mode 100644 pkg/sentry/mm/README.md create mode 100644 pkg/sentry/mm/address_space.go create mode 100644 pkg/sentry/mm/aio_context.go create mode 100644 pkg/sentry/mm/aio_context_state.go create mode 100644 pkg/sentry/mm/debug.go create mode 100644 pkg/sentry/mm/io.go create mode 100644 pkg/sentry/mm/lifecycle.go create mode 100644 pkg/sentry/mm/metadata.go create mode 100644 pkg/sentry/mm/mm.go create mode 100644 pkg/sentry/mm/mm_test.go create mode 100644 pkg/sentry/mm/pma.go create mode 100644 pkg/sentry/mm/proc_pid_maps.go create mode 100644 pkg/sentry/mm/save_restore.go create mode 100644 pkg/sentry/mm/special_mappable.go create mode 100644 pkg/sentry/mm/syscalls.go create mode 100644 pkg/sentry/mm/vma.go create mode 100644 pkg/sentry/platform/BUILD create mode 100644 pkg/sentry/platform/context.go create mode 100644 
pkg/sentry/platform/filemem/BUILD create mode 100644 pkg/sentry/platform/filemem/filemem.go create mode 100644 pkg/sentry/platform/filemem/filemem_state.go create mode 100644 pkg/sentry/platform/filemem/filemem_test.go create mode 100644 pkg/sentry/platform/filemem/filemem_unsafe.go create mode 100644 pkg/sentry/platform/interrupt/BUILD create mode 100644 pkg/sentry/platform/interrupt/interrupt.go create mode 100644 pkg/sentry/platform/interrupt/interrupt_test.go create mode 100644 pkg/sentry/platform/kvm/BUILD create mode 100644 pkg/sentry/platform/kvm/address_space.go create mode 100644 pkg/sentry/platform/kvm/bluepill.go create mode 100644 pkg/sentry/platform/kvm/bluepill_amd64.go create mode 100644 pkg/sentry/platform/kvm/bluepill_amd64.s create mode 100644 pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go create mode 100644 pkg/sentry/platform/kvm/bluepill_fault.go create mode 100644 pkg/sentry/platform/kvm/bluepill_unsafe.go create mode 100644 pkg/sentry/platform/kvm/context.go create mode 100644 pkg/sentry/platform/kvm/host_map.go create mode 100644 pkg/sentry/platform/kvm/kvm.go create mode 100644 pkg/sentry/platform/kvm/kvm_amd64.go create mode 100644 pkg/sentry/platform/kvm/kvm_amd64_unsafe.go create mode 100644 pkg/sentry/platform/kvm/kvm_const.go create mode 100644 pkg/sentry/platform/kvm/kvm_test.go create mode 100644 pkg/sentry/platform/kvm/machine.go create mode 100644 pkg/sentry/platform/kvm/machine_amd64.go create mode 100644 pkg/sentry/platform/kvm/machine_amd64_unsafe.go create mode 100644 pkg/sentry/platform/kvm/machine_unsafe.go create mode 100644 pkg/sentry/platform/kvm/physical_map.go create mode 100644 pkg/sentry/platform/kvm/testutil/BUILD create mode 100644 pkg/sentry/platform/kvm/testutil/testutil.go create mode 100644 pkg/sentry/platform/kvm/testutil/testutil_amd64.go create mode 100644 pkg/sentry/platform/kvm/testutil/testutil_amd64.s create mode 100644 pkg/sentry/platform/kvm/virtual_map.go create mode 100644 
pkg/sentry/platform/kvm/virtual_map_test.go create mode 100644 pkg/sentry/platform/mmap_min_addr.go create mode 100644 pkg/sentry/platform/platform.go create mode 100644 pkg/sentry/platform/procid/BUILD create mode 100644 pkg/sentry/platform/procid/procid.go create mode 100644 pkg/sentry/platform/procid/procid_amd64.s create mode 100644 pkg/sentry/platform/procid/procid_net_test.go create mode 100644 pkg/sentry/platform/procid/procid_test.go create mode 100644 pkg/sentry/platform/ptrace/BUILD create mode 100644 pkg/sentry/platform/ptrace/ptrace.go create mode 100644 pkg/sentry/platform/ptrace/ptrace_unsafe.go create mode 100644 pkg/sentry/platform/ptrace/stub_amd64.s create mode 100644 pkg/sentry/platform/ptrace/stub_unsafe.go create mode 100644 pkg/sentry/platform/ptrace/subprocess.go create mode 100644 pkg/sentry/platform/ptrace/subprocess_amd64.go create mode 100644 pkg/sentry/platform/ptrace/subprocess_linux.go create mode 100644 pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go create mode 100644 pkg/sentry/platform/ptrace/subprocess_unsafe.go create mode 100644 pkg/sentry/platform/ring0/BUILD create mode 100644 pkg/sentry/platform/ring0/defs.go create mode 100644 pkg/sentry/platform/ring0/defs_amd64.go create mode 100644 pkg/sentry/platform/ring0/entry_amd64.go create mode 100644 pkg/sentry/platform/ring0/entry_amd64.s create mode 100644 pkg/sentry/platform/ring0/gen_offsets/BUILD create mode 100644 pkg/sentry/platform/ring0/gen_offsets/main.go create mode 100644 pkg/sentry/platform/ring0/kernel.go create mode 100644 pkg/sentry/platform/ring0/kernel_amd64.go create mode 100644 pkg/sentry/platform/ring0/kernel_unsafe.go create mode 100644 pkg/sentry/platform/ring0/lib_amd64.go create mode 100644 pkg/sentry/platform/ring0/lib_amd64.s create mode 100644 pkg/sentry/platform/ring0/offsets_amd64.go create mode 100644 pkg/sentry/platform/ring0/pagetables/BUILD create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables.go create mode 100644 
pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_test.go create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_x86.go create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go create mode 100644 pkg/sentry/platform/ring0/pagetables/pcids_x86.go create mode 100644 pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go create mode 100644 pkg/sentry/platform/ring0/ring0.go create mode 100644 pkg/sentry/platform/ring0/x86.go create mode 100644 pkg/sentry/platform/safecopy/BUILD create mode 100644 pkg/sentry/platform/safecopy/atomic_amd64.s create mode 100644 pkg/sentry/platform/safecopy/memclr_amd64.s create mode 100644 pkg/sentry/platform/safecopy/memcpy_amd64.s create mode 100644 pkg/sentry/platform/safecopy/safecopy.go create mode 100644 pkg/sentry/platform/safecopy/safecopy_test.go create mode 100644 pkg/sentry/platform/safecopy/safecopy_unsafe.go create mode 100644 pkg/sentry/platform/safecopy/sighandler_amd64.s create mode 100644 pkg/sentry/safemem/BUILD create mode 100644 pkg/sentry/safemem/block_unsafe.go create mode 100644 pkg/sentry/safemem/io.go create mode 100644 pkg/sentry/safemem/io_test.go create mode 100644 pkg/sentry/safemem/safemem.go create mode 100644 pkg/sentry/safemem/seq_test.go create mode 100644 pkg/sentry/safemem/seq_unsafe.go create mode 100644 pkg/sentry/sighandling/BUILD create mode 100644 pkg/sentry/sighandling/sighandling.go create mode 100644 pkg/sentry/sighandling/sighandling_unsafe.go create mode 100644 pkg/sentry/socket/BUILD create mode 100644 pkg/sentry/socket/control/BUILD create mode 100644 pkg/sentry/socket/control/control.go create mode 100644 pkg/sentry/socket/epsocket/BUILD create mode 100644 pkg/sentry/socket/epsocket/device.go create mode 100644 pkg/sentry/socket/epsocket/epsocket.go create mode 100644 pkg/sentry/socket/epsocket/provider.go 
create mode 100644 pkg/sentry/socket/epsocket/save_restore.go create mode 100644 pkg/sentry/socket/epsocket/stack.go create mode 100644 pkg/sentry/socket/hostinet/BUILD create mode 100644 pkg/sentry/socket/hostinet/device.go create mode 100644 pkg/sentry/socket/hostinet/hostinet.go create mode 100644 pkg/sentry/socket/hostinet/save_restore.go create mode 100644 pkg/sentry/socket/hostinet/socket.go create mode 100644 pkg/sentry/socket/hostinet/socket_unsafe.go create mode 100644 pkg/sentry/socket/hostinet/stack.go create mode 100644 pkg/sentry/socket/netlink/BUILD create mode 100644 pkg/sentry/socket/netlink/message.go create mode 100644 pkg/sentry/socket/netlink/port/BUILD create mode 100644 pkg/sentry/socket/netlink/port/port.go create mode 100644 pkg/sentry/socket/netlink/port/port_test.go create mode 100644 pkg/sentry/socket/netlink/provider.go create mode 100644 pkg/sentry/socket/netlink/route/BUILD create mode 100644 pkg/sentry/socket/netlink/route/protocol.go create mode 100644 pkg/sentry/socket/netlink/socket.go create mode 100644 pkg/sentry/socket/rpcinet/BUILD create mode 100644 pkg/sentry/socket/rpcinet/conn/BUILD create mode 100644 pkg/sentry/socket/rpcinet/conn/conn.go create mode 100644 pkg/sentry/socket/rpcinet/device.go create mode 100644 pkg/sentry/socket/rpcinet/notifier/BUILD create mode 100644 pkg/sentry/socket/rpcinet/notifier/notifier.go create mode 100644 pkg/sentry/socket/rpcinet/rpcinet.go create mode 100644 pkg/sentry/socket/rpcinet/socket.go create mode 100644 pkg/sentry/socket/rpcinet/stack.go create mode 100644 pkg/sentry/socket/rpcinet/stack_unsafe.go create mode 100644 pkg/sentry/socket/rpcinet/syscall_rpc.proto create mode 100644 pkg/sentry/socket/socket.go create mode 100644 pkg/sentry/socket/unix/BUILD create mode 100644 pkg/sentry/socket/unix/device.go create mode 100644 pkg/sentry/socket/unix/io.go create mode 100644 pkg/sentry/socket/unix/unix.go create mode 100644 pkg/sentry/state/BUILD create mode 100644 
pkg/sentry/state/state.go create mode 100644 pkg/sentry/state/state_metadata.go create mode 100644 pkg/sentry/strace/BUILD create mode 100644 pkg/sentry/strace/clone.go create mode 100644 pkg/sentry/strace/futex.go create mode 100644 pkg/sentry/strace/linux64.go create mode 100644 pkg/sentry/strace/open.go create mode 100644 pkg/sentry/strace/ptrace.go create mode 100644 pkg/sentry/strace/socket.go create mode 100644 pkg/sentry/strace/strace.go create mode 100644 pkg/sentry/strace/strace.proto create mode 100644 pkg/sentry/strace/syscalls.go create mode 100644 pkg/sentry/syscalls/BUILD create mode 100644 pkg/sentry/syscalls/epoll.go create mode 100644 pkg/sentry/syscalls/linux/BUILD create mode 100644 pkg/sentry/syscalls/linux/error.go create mode 100644 pkg/sentry/syscalls/linux/flags.go create mode 100644 pkg/sentry/syscalls/linux/linux64.go create mode 100644 pkg/sentry/syscalls/linux/sigset.go create mode 100644 pkg/sentry/syscalls/linux/sys_aio.go create mode 100644 pkg/sentry/syscalls/linux/sys_capability.go create mode 100644 pkg/sentry/syscalls/linux/sys_epoll.go create mode 100644 pkg/sentry/syscalls/linux/sys_eventfd.go create mode 100644 pkg/sentry/syscalls/linux/sys_file.go create mode 100644 pkg/sentry/syscalls/linux/sys_futex.go create mode 100644 pkg/sentry/syscalls/linux/sys_getdents.go create mode 100644 pkg/sentry/syscalls/linux/sys_identity.go create mode 100644 pkg/sentry/syscalls/linux/sys_inotify.go create mode 100644 pkg/sentry/syscalls/linux/sys_lseek.go create mode 100644 pkg/sentry/syscalls/linux/sys_mmap.go create mode 100644 pkg/sentry/syscalls/linux/sys_mount.go create mode 100644 pkg/sentry/syscalls/linux/sys_pipe.go create mode 100644 pkg/sentry/syscalls/linux/sys_poll.go create mode 100644 pkg/sentry/syscalls/linux/sys_prctl.go create mode 100644 pkg/sentry/syscalls/linux/sys_random.go create mode 100644 pkg/sentry/syscalls/linux/sys_read.go create mode 100644 pkg/sentry/syscalls/linux/sys_rlimit.go create mode 100644 
pkg/sentry/syscalls/linux/sys_rusage.go create mode 100644 pkg/sentry/syscalls/linux/sys_sched.go create mode 100644 pkg/sentry/syscalls/linux/sys_sem.go create mode 100644 pkg/sentry/syscalls/linux/sys_signal.go create mode 100644 pkg/sentry/syscalls/linux/sys_socket.go create mode 100644 pkg/sentry/syscalls/linux/sys_stat.go create mode 100644 pkg/sentry/syscalls/linux/sys_sync.go create mode 100644 pkg/sentry/syscalls/linux/sys_sysinfo.go create mode 100644 pkg/sentry/syscalls/linux/sys_syslog.go create mode 100644 pkg/sentry/syscalls/linux/sys_thread.go create mode 100644 pkg/sentry/syscalls/linux/sys_time.go create mode 100644 pkg/sentry/syscalls/linux/sys_timer.go create mode 100644 pkg/sentry/syscalls/linux/sys_timerfd.go create mode 100644 pkg/sentry/syscalls/linux/sys_tls.go create mode 100644 pkg/sentry/syscalls/linux/sys_utsname.go create mode 100644 pkg/sentry/syscalls/linux/sys_write.go create mode 100644 pkg/sentry/syscalls/linux/timespec.go create mode 100644 pkg/sentry/syscalls/polling.go create mode 100644 pkg/sentry/syscalls/syscalls.go create mode 100644 pkg/sentry/syscalls/unimplemented_syscall.proto create mode 100644 pkg/sentry/time/BUILD create mode 100644 pkg/sentry/time/calibrated_clock.go create mode 100644 pkg/sentry/time/calibrated_clock_test.go create mode 100644 pkg/sentry/time/clock_id.go create mode 100644 pkg/sentry/time/clocks.go create mode 100644 pkg/sentry/time/muldiv_amd64.s create mode 100644 pkg/sentry/time/parameters.go create mode 100644 pkg/sentry/time/parameters_test.go create mode 100644 pkg/sentry/time/sampler.go create mode 100644 pkg/sentry/time/sampler_test.go create mode 100644 pkg/sentry/time/sampler_unsafe.go create mode 100644 pkg/sentry/time/tsc_amd64.s create mode 100644 pkg/sentry/uniqueid/BUILD create mode 100644 pkg/sentry/uniqueid/context.go create mode 100644 pkg/sentry/usage/BUILD create mode 100644 pkg/sentry/usage/cpu.go create mode 100644 pkg/sentry/usage/io.go create mode 100644 
pkg/sentry/usage/memory.go create mode 100644 pkg/sentry/usage/memory_unsafe.go create mode 100644 pkg/sentry/usage/usage.go create mode 100644 pkg/sentry/usermem/BUILD create mode 100644 pkg/sentry/usermem/README.md create mode 100644 pkg/sentry/usermem/access_type.go create mode 100644 pkg/sentry/usermem/addr.go create mode 100644 pkg/sentry/usermem/addr_range_seq_test.go create mode 100644 pkg/sentry/usermem/addr_range_seq_unsafe.go create mode 100644 pkg/sentry/usermem/bytes_io.go create mode 100644 pkg/sentry/usermem/bytes_io_unsafe.go create mode 100644 pkg/sentry/usermem/usermem.go create mode 100644 pkg/sentry/usermem/usermem_test.go create mode 100644 pkg/sentry/usermem/usermem_x86.go create mode 100644 pkg/sentry/watchdog/BUILD create mode 100644 pkg/sentry/watchdog/watchdog.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD new file mode 100644 index 000000000..d18cf3555 --- /dev/null +++ b/pkg/sentry/BUILD @@ -0,0 +1,12 @@ +# This BUILD file defines a package_group that allows for interdependencies for +# sentry-internal packages. 
+ +package(licenses = ["notice"]) # Apache 2.0 + +package_group( + name = "internal", + packages = [ + "//pkg/sentry/...", + "//runsc/...", + ], +) diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD new file mode 100644 index 000000000..a88f57ac7 --- /dev/null +++ b/pkg/sentry/arch/BUILD @@ -0,0 +1,66 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "arch_state", + srcs = [ + "arch.go", + "arch_amd64.go", + "arch_state_x86.go", + "arch_x86.go", + "auxv.go", + "signal_amd64.go", + ], + out = "arch_state.go", + package = "arch", +) + +go_library( + name = "arch", + srcs = [ + "aligned.go", + "arch.go", + "arch_amd64.go", + "arch_amd64.s", + "arch_state.go", + "arch_state_x86.go", + "arch_x86.go", + "auxv.go", + "signal_act.go", + "signal_amd64.go", + "signal_info.go", + "signal_stack.go", + "stack.go", + "syscalls_amd64.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/arch", + visibility = ["//:sandbox"], + deps = [ + ":registers_go_proto", + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/cpuid", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/limits", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) + +proto_library( + name = "registers_proto", + srcs = ["registers.proto"], + visibility = ["//visibility:public"], +) + +go_proto_library( + name = "registers_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto", + proto = ":registers_proto", + visibility = ["//visibility:public"], +) diff --git a/pkg/sentry/arch/aligned.go b/pkg/sentry/arch/aligned.go new file mode 100644 index 000000000..193232e27 --- /dev/null +++ b/pkg/sentry/arch/aligned.go @@ -0,0 +1,31 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "reflect" +) + +// alignedBytes returns a slice of size bytes, aligned in memory to the given +// alignment. This is used because we require certain structures to be aligned +// in a specific way (for example, the X86 floating point data). +func alignedBytes(size, alignment uint) []byte { + data := make([]byte, size+alignment-1) + offset := uint(reflect.ValueOf(data).Index(0).Addr().Pointer() % uintptr(alignment)) + if offset == 0 { + return data[:size:size] + } + return data[alignment-offset:][:size:size] +} diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go new file mode 100644 index 000000000..021789e4b --- /dev/null +++ b/pkg/sentry/arch/arch.go @@ -0,0 +1,351 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package arch provides abstractions around architecture-dependent details, +// such as syscall calling conventions, native types, etc. +package arch + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Arch describes an architecture. +type Arch int + +const ( + // AMD64 is the x86-64 architecture. + AMD64 Arch = iota +) + +// String implements fmt.Stringer. +func (a Arch) String() string { + switch a { + case AMD64: + return "amd64" + default: + return fmt.Sprintf("Arch(%d)", a) + } +} + +// FloatingPointData is a generic type, and will always be passed as a pointer. +// We rely on the individual arch implementations to meet all the necessary +// requirements. For example, on x86 the region must be 16-byte aligned and 512 +// bytes in size. +type FloatingPointData byte + +// Context provides architecture-dependent information for a specific thread. +// +// NOTE: Currently we use uintptr here to refer to a generic native +// register value. While this will work for the foreseeable future, it isn't +// strictly correct. We may want to create some abstraction that makes this +// more clear or enables us to store values of arbitrary widths. This is +// particularly true for RegisterMap(). +type Context interface { + // Arch returns the architecture for this Context. + Arch() Arch + + // Native converts a generic type to a native value. + // + // Because the architecture is not specified here, we may be dealing + // with return values of varying sizes (for example ARCH_GETFS). This + // is a simple utility function to convert to the native size in these + // cases, and then we can CopyOut. + Native(val uintptr) interface{} + + // Value converts a native type back to a generic value. 
+ // Once a value has been converted to native via the above call -- it + // can be converted back here. + Value(val interface{}) uintptr + + // Width returns the number of bytes for a native value. + Width() uint + + // Fork creates a clone of the context. + Fork() Context + + // SyscallNo returns the syscall number. + SyscallNo() uintptr + + // SyscallArgs returns the syscall arguments in an array. + SyscallArgs() SyscallArguments + + // Return returns the return value for a system call. + Return() uintptr + + // SetReturn sets the return value for a system call. + SetReturn(value uintptr) + + // RestartSyscall reverses over the current syscall instruction, such that + // when the application resumes execution the syscall will be re-attempted. + RestartSyscall() + + // RestartSyscallWithRestartBlock reverses over the current syscall + // instraction and overwrites the current syscall number with that of + // restart_syscall(2). This causes the application to restart the current + // syscall with a custom function when execution resumes. + RestartSyscallWithRestartBlock() + + // IP returns the current instruction pointer. + IP() uintptr + + // SetIP sets the current instruction pointer. + SetIP(value uintptr) + + // Stack returns the current stack pointer. + Stack() uintptr + + // SetStack sets the current stack pointer. + SetStack(value uintptr) + + // SetRSEQInterruptedIP sets the register that contains the old IP when a + // restartable sequence is interrupted. + SetRSEQInterruptedIP(value uintptr) + + // StateData returns a pointer to underlying architecture state. + StateData() *State + + // RegisterMap returns a map of all registers. + RegisterMap() (map[string]uintptr, error) + + // NewSignalAct returns a new object that is equivalent to struct sigaction + // in the guest architecture. + NewSignalAct() NativeSignalAct + + // NewSignalStack returns a new object that is equivalent to stack_t in the + // guest architecture. 
+ NewSignalStack() NativeSignalStack + + // SignalSetup modifies the context in preparation for handling the + // given signal. + // + // st is the stack where the signal handler frame should be + // constructed. + // + // act is the SignalAct that specifies how this signal is being + // handled. + // + // info is the SignalInfo of the signal being delivered. + // + // alt is the alternate signal stack (even if the alternate signal + // stack is not going to be used). + // + // sigset is the signal mask before entering the signal handler. + SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error + + // SignalRestore restores context after returning from a signal + // handler. + // + // st is the current thread stack. + // + // rt is true if SignalRestore is being entered from rt_sigreturn and + // false if SignalRestore is being entered from sigreturn. + // SignalRestore returns the thread's new signal mask. + SignalRestore(st *Stack, rt bool) (linux.SignalSet, error) + + // CPUIDEmulate emulates a CPUID instruction according to current register state. + CPUIDEmulate(l log.Logger) + + // SingleStep returns true if single stepping is enabled. + SingleStep() bool + + // SetSingleStep enables single stepping. + SetSingleStep() + + // ClearSingleStep disables single stepping. + ClearSingleStep() + + // FloatingPointData will be passed to underlying save routines. + FloatingPointData() *FloatingPointData + + // NewMmapLayout returns a layout for a new MM, where MinAddr for the + // returned layout must be no lower than min, and MaxAddr for the returned + // layout must be no higher than max. Repeated calls to NewMmapLayout may + // return different layouts. + NewMmapLayout(min, max usermem.Addr, limits *limits.LimitSet) (MmapLayout, error) + + // PIELoadAddress returns a preferred load address for a + // position-independent executable within l. 
+ PIELoadAddress(l MmapLayout) usermem.Addr + + // FeatureSet returns the FeatureSet in use in this context. + FeatureSet() *cpuid.FeatureSet + + // Hack around our package dependences being too broken to support the + // equivalent of arch_ptrace(): + + // PtracePeekUser implements ptrace(PTRACE_PEEKUSR). + PtracePeekUser(addr uintptr) (interface{}, error) + + // PtracePokeUser implements ptrace(PTRACE_POKEUSR). + PtracePokeUser(addr, data uintptr) error + + // PtraceGetRegs implements ptrace(PTRACE_GETREGS) by writing the + // general-purpose registers represented by this Context to dst and + // returning the number of bytes written. + PtraceGetRegs(dst io.Writer) (int, error) + + // PtraceSetRegs implements ptrace(PTRACE_SETREGS) by reading + // general-purpose registers from src into this Context and returning the + // number of bytes read. + PtraceSetRegs(src io.Reader) (int, error) + + // PtraceGetFPRegs implements ptrace(PTRACE_GETFPREGS) by writing the + // floating-point registers represented by this Context to addr in dst and + // returning the number of bytes written. + PtraceGetFPRegs(dst io.Writer) (int, error) + + // PtraceSetFPRegs implements ptrace(PTRACE_SETFPREGS) by reading + // floating-point registers from src into this Context and returning the + // number of bytes read. + PtraceSetFPRegs(src io.Reader) (int, error) + + // PtraceGetRegSet implements ptrace(PTRACE_GETREGSET) by writing the + // register set given by architecture-defined value regset from this + // Context to dst and returning the number of bytes written, which must be + // less than or equal to maxlen. + PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) + + // PtraceSetRegSet implements ptrace(PTRACE_SETREGSET) by reading the + // register set given by architecture-defined value regset from src and + // returning the number of bytes read, which must be less than or equal to + // maxlen. 
+ PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) + + // FullRestore returns 'true' if all CPU registers must be restored + // when switching to the untrusted application. Typically a task enters + // and leaves the kernel via a system call. Platform.Switch() may + // optimize for this by not saving/restoring all registers if allowed + // by the ABI. For e.g. the amd64 ABI specifies that syscall clobbers + // %rcx and %r11. If FullRestore returns true then these optimizations + // must be disabled and all registers restored. + FullRestore() bool +} + +// MmapDirection is a search direction for mmaps. +type MmapDirection int + +const ( + // MmapBottomUp instructs mmap to prefer lower addresses. + MmapBottomUp MmapDirection = iota + + // MmapTopDown instructs mmap to prefer higher addresses. + MmapTopDown +) + +// MmapLayout defines the layout of the user address space for a particular +// MemoryManager. +// +// Note that "highest address" below is always exclusive. +type MmapLayout struct { + // MinAddr is the lowest mappable address. + MinAddr usermem.Addr + + // MaxAddr is the highest mappable address. + MaxAddr usermem.Addr + + // BottomUpBase is the lowest address that may be returned for a + // MmapBottomUp mmap. + BottomUpBase usermem.Addr + + // TopDownBase is the highest address that may be returned for a + // MmapTopDown mmap. + TopDownBase usermem.Addr + + // DefaultDirection is the direction for most non-fixed mmaps in this + // layout. + DefaultDirection MmapDirection + + // MaxStackRand is the maximum randomization to apply to stack + // allocations to maintain a proper gap between the stack and + // TopDownBase. + MaxStackRand uint64 +} + +// Valid returns true if this layout is valid. 
+func (m *MmapLayout) Valid() bool { + if m.MinAddr > m.MaxAddr { + return false + } + if m.BottomUpBase < m.MinAddr { + return false + } + if m.BottomUpBase > m.MaxAddr { + return false + } + if m.TopDownBase < m.MinAddr { + return false + } + if m.TopDownBase > m.MaxAddr { + return false + } + return true +} + +// SyscallArgument is an argument supplied to a syscall implementation. The +// methods used to access the arguments are named after the ***C type name*** and +// they convert to the closest Go type available. For example, Int() refers to a +// 32-bit signed integer argument represented in Go as an int32. +// +// Using the accessor methods guarantees that the conversion between types is +// correct, taking into account size and signedness (i.e., zero-extension vs +// signed-extension). +type SyscallArgument struct { + // Prefer to use accessor methods instead of 'Value' directly. + Value uintptr +} + +// SyscallArguments represents the set of arguments passed to a syscall. +type SyscallArguments [6]SyscallArgument + +// Pointer returns the usermem.Addr representation of a pointer argument. +func (a SyscallArgument) Pointer() usermem.Addr { + return usermem.Addr(a.Value) +} + +// Int returns the int32 representation of a 32-bit signed integer argument. +func (a SyscallArgument) Int() int32 { + return int32(a.Value) +} + +// Uint returns the uint32 representation of a 32-bit unsigned integer argument. +func (a SyscallArgument) Uint() uint32 { + return uint32(a.Value) +} + +// Int64 returns the int64 representation of a 64-bit signed integer argument. +func (a SyscallArgument) Int64() int64 { + return int64(a.Value) +} + +// Uint64 returns the uint64 representation of a 64-bit unsigned integer argument. +func (a SyscallArgument) Uint64() uint64 { + return uint64(a.Value) +} + +// SizeT returns the uint representation of a size_t argument. 
+func (a SyscallArgument) SizeT() uint { + return uint(a.Value) +} + +// ModeT returns the int representation of a mode_t argument. +func (a SyscallArgument) ModeT() uint { + return uint(uint16(a.Value)) +} diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go new file mode 100644 index 000000000..23526fe8e --- /dev/null +++ b/pkg/sentry/arch/arch_amd64.go @@ -0,0 +1,302 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "bytes" + "fmt" + "math/rand" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// These constants come directly from Linux. +const ( + // maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux + // for a 64-bit process. + maxAddr64 usermem.Addr = (1 << 47) - usermem.PageSize + + // maxStackRand64 is the maximum randomization to apply to the stack. + // It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux. + maxStackRand64 = 16 << 30 // 16 GB + + // maxMmapRand64 is the maximum randomization to apply to the mmap + // layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux. + maxMmapRand64 = (1 << 28) * usermem.PageSize + + // minGap64 is the minimum gap to leave at the top of the address space + // for the stack. 
It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux.
+	minGap64 = (128 << 20) + maxStackRand64
+
+	// preferredPIELoadAddr is the standard Linux position-independent
+	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
+	//
+	// The Platform {Min,Max}UserAddress() may preclude loading at this
+	// address. See other preferredFoo comments below.
+	preferredPIELoadAddr usermem.Addr = maxAddr64 / 3 * 2
+)
+
+// These constants are selected as heuristics to help make the Platform's
+// potentially limited address space conform as closely to Linux as possible.
+const (
+	// Select a preferred minimum TopDownBase address.
+	//
+	// Some applications (TSAN and other *SANs) are very particular about
+	// the way the Linux mmap allocator lays out the address space.
+	//
+	// TSAN in particular expects top down allocations to be made in the
+	// range [0x7e8000000000, 0x800000000000).
+	//
+	// The minimum TopDownBase on Linux would be:
+	// 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000.
+	//
+	// (minGap64 because TSAN uses a small RLIMIT_STACK.)
+	//
+	// 0x7e8000000000 is selected arbitrarily by TSAN to leave room for
+	// allocations below TopDownBase.
+	//
+	// N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all
+	// the way down to 0x10007fff8000, and MSAN down to 0x700000000000.
+	//
+	// Of course, there is no hard minimum to allocation; an allocator can
+	// search all the way from TopDownBase to Min. However, TSAN declared
+	// their range "good enough".
+	//
+	// We would like to pick a TopDownBase such that it is unlikely that an
+	// allocator will select an address below TSAN's minimum. We achieve
+	// this by trying to leave a sizable gap below TopDownBase.
+	//
+	// This is all "preferred" because the layout min/max address may not
+	// allow us to select such a TopDownBase, in which case we have to fall
+	// back to a layout that TSAN may not be happy with.
+ preferredTopDownAllocMin usermem.Addr = 0x7e8000000000 + preferredAllocationGap = 128 << 30 // 128 GB + preferredTopDownBaseMin = preferredTopDownAllocMin + preferredAllocationGap + + // minMmapRand64 is the smallest we are willing to make the + // randomization to stay above preferredTopDownBaseMin. + minMmapRand64 = (1 << 26) * usermem.PageSize +) + +// context64 represents an AMD64 context. +type context64 struct { + State + sigFPState []x86FPState // fpstate to be restored on sigreturn. +} + +// Arch implements Context.Arch. +func (c *context64) Arch() Arch { + return AMD64 +} + +func (c *context64) copySigFPState() []x86FPState { + var sigfps []x86FPState + for _, s := range c.sigFPState { + sigfps = append(sigfps, s.fork()) + } + return sigfps +} + +// Fork returns an exact copy of this context. +func (c *context64) Fork() Context { + return &context64{ + State: c.State.Fork(), + sigFPState: c.copySigFPState(), + } +} + +// Return returns the current syscall return value. +func (c *context64) Return() uintptr { + return uintptr(c.Regs.Rax) +} + +// SetReturn sets the syscall return value. +func (c *context64) SetReturn(value uintptr) { + c.Regs.Rax = uint64(value) +} + +// IP returns the current instruction pointer. +func (c *context64) IP() uintptr { + return uintptr(c.Regs.Rip) +} + +// SetIP sets the current instruction pointer. +func (c *context64) SetIP(value uintptr) { + c.Regs.Rip = uint64(value) +} + +// Stack returns the current stack pointer. +func (c *context64) Stack() uintptr { + return uintptr(c.Regs.Rsp) +} + +// SetStack sets the current stack pointer. +func (c *context64) SetStack(value uintptr) { + c.Regs.Rsp = uint64(value) +} + +// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP. +func (c *context64) SetRSEQInterruptedIP(value uintptr) { + c.Regs.R10 = uint64(value) +} + +// Native returns the native type for the given val. 
+func (c *context64) Native(val uintptr) interface{} { + v := uint64(val) + return &v +} + +// Value returns the generic val for the given native type. +func (c *context64) Value(val interface{}) uintptr { + return uintptr(*val.(*uint64)) +} + +// Width returns the byte width of this architecture. +func (c *context64) Width() uint { + return 8 +} + +// FeatureSet returns the FeatureSet in use. +func (c *context64) FeatureSet() *cpuid.FeatureSet { + return c.State.FeatureSet +} + +// mmapRand returns a random adjustment for randomizing an mmap layout. +func mmapRand(max uint64) usermem.Addr { + return usermem.Addr(rand.Int63n(int64(max))).RoundDown() +} + +// NewMmapLayout implements Context.NewMmapLayout consistently with Linux. +func (c *context64) NewMmapLayout(min, max usermem.Addr, r *limits.LimitSet) (MmapLayout, error) { + min, ok := min.RoundUp() + if !ok { + return MmapLayout{}, syscall.EINVAL + } + if max > maxAddr64 { + max = maxAddr64 + } + max = max.RoundDown() + + if min > max { + return MmapLayout{}, syscall.EINVAL + } + + stackSize := r.Get(limits.Stack) + + // MAX_GAP in Linux. + maxGap := (max / 6) * 5 + gap := usermem.Addr(stackSize.Cur) + if gap < minGap64 { + gap = minGap64 + } + if gap > maxGap { + gap = maxGap + } + defaultDir := MmapTopDown + if stackSize.Cur == limits.Infinity { + defaultDir = MmapBottomUp + } + + topDownMin := max - gap - maxMmapRand64 + maxRand := usermem.Addr(maxMmapRand64) + if topDownMin < preferredTopDownBaseMin { + // Try to keep TopDownBase above preferredTopDownBaseMin by + // shrinking maxRand. + maxAdjust := maxRand - minMmapRand64 + needAdjust := preferredTopDownBaseMin - topDownMin + if needAdjust <= maxAdjust { + maxRand -= needAdjust + } + } + + rnd := mmapRand(uint64(maxRand)) + l := MmapLayout{ + MinAddr: min, + MaxAddr: max, + // TASK_UNMAPPED_BASE in Linux. 
+ BottomUpBase: (max/3 + rnd).RoundDown(), + TopDownBase: (max - gap - rnd).RoundDown(), + DefaultDirection: defaultDir, + // We may have reduced the maximum randomization to keep + // TopDownBase above preferredTopDownBaseMin while maintaining + // our stack gap. Stack allocations must use that max + // randomization to avoiding eating into the gap. + MaxStackRand: uint64(maxRand), + } + + // Final sanity check on the layout. + if !l.Valid() { + panic(fmt.Sprintf("Invalid MmapLayout: %+v", l)) + } + + return l, nil +} + +// PIELoadAddress implements Context.PIELoadAddress. +func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr { + base := preferredPIELoadAddr + max, ok := base.AddLength(maxMmapRand64) + if !ok { + panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base)) + } + + if max > l.MaxAddr { + // preferredPIELoadAddr won't fit; fall back to the standard + // Linux behavior of 2/3 of TopDownBase. TSAN won't like this. + // + // Don't bother trying to shrink the randomization for now. + base = l.TopDownBase / 3 * 2 + } + + return base + mmapRand(maxMmapRand64) +} + +// userStructSize is the size in bytes of Linux's struct user on amd64. +const userStructSize = 928 + +// PtracePeekUser implements Context.PtracePeekUser. +func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { + if addr&7 != 0 || addr >= userStructSize { + return nil, syscall.EIO + } + // PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and + // u_debugreg, returning 0 or silently no-oping for other fields + // respectively. + if addr < uintptr(ptraceRegsSize) { + buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs()) + return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil + } + // TODO: debug registers + return c.Native(0), nil +} + +// PtracePokeUser implements Context.PtracePokeUser. 
+func (c *context64) PtracePokeUser(addr, data uintptr) error { + if addr&7 != 0 || addr >= userStructSize { + return syscall.EIO + } + if addr < uintptr(ptraceRegsSize) { + buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs()) + usermem.ByteOrder.PutUint64(buf[addr:], uint64(data)) + _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) + return err + } + // TODO: debug registers + return nil +} diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s new file mode 100644 index 000000000..10d621b6d --- /dev/null +++ b/pkg/sentry/arch/arch_amd64.s @@ -0,0 +1,135 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// MXCSR_DEFAULT is the reset value of MXCSR (Intel SDM Vol. 2, Ch. 3.2 +// "LDMXCSR") +#define MXCSR_DEFAULT 0x1f80 + +// MXCSR_OFFSET is the offset in bytes of the MXCSR field from the start of the +// FXSAVE/XSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE Area") +#define MXCSR_OFFSET 24 + +// initX86FPState initializes floating point state. +// +// func initX86FPState(data *FloatingPointData, useXsave bool) +// +// We need to clear out and initialize an empty fp state area since the sentry +// may have left sensitive information in the floating point registers. 
+// +// Preconditions: data is zeroed +TEXT ·initX86FPState(SB), $24-16 + // Save MXCSR (callee-save) + STMXCSR mxcsr-8(SP) + + // Save x87 CW (callee-save) + FSTCW cw-16(SP) + + MOVQ fpState+0(FP), DI + + // Do we use xsave? + MOVBQZX useXsave+8(FP), AX + TESTQ AX, AX + JZ no_xsave + + // Use XRSTOR to clear all FP state to an initial state. + // + // The fpState XSAVE area is zeroed on function entry, meaning + // XSTATE_BV is zero. + // + // "If RFBM[i] = 1 and bit i is clear in the XSTATE_BV field in the + // XSAVE header, XRSTOR initializes state component i." + // + // Initialization is defined in SDM Vol 1, Chapter 13.3. It puts all + // the registers in a reasonable initial state, except MXCSR: + // + // "The MXCSR register is part of state component 1, SSE state (see + // Section 13.5.2). However, the standard form of XRSTOR loads the + // MXCSR register from memory whenever the RFBM[1] (SSE) or RFBM[2] + // (AVX) is set, regardless of the values of XSTATE_BV[1] and + // XSTATE_BV[2]." + + // Set MXCSR to the default value. + MOVL $MXCSR_DEFAULT, MXCSR_OFFSET(DI) + + // Initialize registers with XRSTOR. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI) + + // Now that all the state has been reset, write it back out to the + // XSAVE area. + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27 // XSAVE64 0(DI) + + JMP out + +no_xsave: + // Clear out existing X values. + PXOR X0, X0 + MOVO X0, X1 + MOVO X0, X2 + MOVO X0, X3 + MOVO X0, X4 + MOVO X0, X5 + MOVO X0, X6 + MOVO X0, X7 + MOVO X0, X8 + MOVO X0, X9 + MOVO X0, X10 + MOVO X0, X11 + MOVO X0, X12 + MOVO X0, X13 + MOVO X0, X14 + MOVO X0, X15 + + // Zero out %rax and store into MMX registers. MMX registers are + // an alias of 8x64 bits of the 8x80 bits used for the original + // x87 registers. Storing zero into them will reset the FPU registers + // to bits [63:0] = 0, [79:64] = 1. 
But the contents aren't too + // important, just the fact that we have reset them to a known value. + XORQ AX, AX + MOVQ AX, M0 + MOVQ AX, M1 + MOVQ AX, M2 + MOVQ AX, M3 + MOVQ AX, M4 + MOVQ AX, M5 + MOVQ AX, M6 + MOVQ AX, M7 + + // The Go assembler doesn't support FNINIT, so we use BYTE. + // This will: + // - Reset FPU control word to 0x037f + // - Clear FPU status word + // - Reset FPU tag word to 0xffff + // - Clear FPU data pointer + // - Clear FPU instruction pointer + BYTE $0xDB; BYTE $0xE3; // FNINIT + + // Reset MXCSR. + MOVL $MXCSR_DEFAULT, tmpmxcsr-24(SP) + LDMXCSR tmpmxcsr-24(SP) + + // Save the floating point state with fxsave. + FXSAVE64 0(DI) + +out: + // Restore MXCSR. + LDMXCSR mxcsr-8(SP) + + // Restore x87 CW. + FLDCW cw-16(SP) + + RET diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go new file mode 100644 index 000000000..cb38d098a --- /dev/null +++ b/pkg/sentry/arch/arch_state_x86.go @@ -0,0 +1,97 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// warnOnce is used to warn about truncated state only once. +var warnOnce sync.Once + +// afterLoad is invoked by stateify. +func (s *State) afterLoad() { + old := s.x86FPState + + // Recreate the slice. 
This is done to ensure that it is aligned + // appropriately in memory, and large enough to accommodate any new + // state that may be saved by the new CPU. Even if extraneous new state + // is saved, the state we care about is guaranteed to be a subset of + // new state. Later optimizations can use less space when using a + // smaller state component bitmap. Intel SDM section 13 has more info. + s.x86FPState = newX86FPState() + + // x86FPState always contains all the FP state supported by the host. + // We may have come from a newer machine that supports additional state + // which we cannot restore. + // + // The x86 FP state areas are backwards compatible, so we can simply + // truncate the additional floating point state. Applications should + // not depend on the truncated state because it should relate only to + // features that were not exposed in the app FeatureSet. + if len(s.x86FPState) < len(old) { + warnOnce.Do(func() { + // This will occur on every instance of state, don't + // bother warning more than once. + log.Infof("dropping %d bytes of floating point state; the application should not depend on this state", len(old)-len(s.x86FPState)) + }) + } + + // Copy to the new, aligned location. + copy(s.x86FPState, old) +} + +type syscallPtraceRegs struct { + R15 uint64 + R14 uint64 + R13 uint64 + R12 uint64 + Rbp uint64 + Rbx uint64 + R11 uint64 + R10 uint64 + R9 uint64 + R8 uint64 + Rax uint64 + Rcx uint64 + Rdx uint64 + Rsi uint64 + Rdi uint64 + Orig_rax uint64 + Rip uint64 + Cs uint64 + Eflags uint64 + Rsp uint64 + Ss uint64 + Fs_base uint64 + Gs_base uint64 + Ds uint64 + Es uint64 + Fs uint64 + Gs uint64 +} + +// saveRegs is invoked by stateify. +func (s *State) saveRegs() syscallPtraceRegs { + return syscallPtraceRegs(s.Regs) +} + +// loadRegs is invoked by stateify. 
+func (s *State) loadRegs(r syscallPtraceRegs) {
+	s.Regs = syscall.PtraceRegs(r)
+}
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
new file mode 100644
index 000000000..5cc4f8377
--- /dev/null
+++ b/pkg/sentry/arch/arch_x86.go
@@ -0,0 +1,613 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64 i386
+
+package arch
+
+import (
+	"fmt"
+	"io"
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/binary"
+	"gvisor.googlesource.com/gvisor/pkg/cpuid"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// System-related constants for x86.
+const (
+	// SyscallWidth is the width of syscall, sysenter, and int 80 instructions.
+	SyscallWidth = 2
+)
+
+// EFLAGS register bits.
+const (
+	// eflagsCF is the mask for the carry flag.
+	eflagsCF = uint64(1) << 0
+	// eflagsPF is the mask for the parity flag.
+	eflagsPF = uint64(1) << 2
+	// eflagsAF is the mask for the auxiliary carry flag.
+	eflagsAF = uint64(1) << 4
+	// eflagsZF is the mask for the zero flag.
+	eflagsZF = uint64(1) << 6
+	// eflagsSF is the mask for the sign flag.
+	eflagsSF = uint64(1) << 7
+	// eflagsTF is the mask for the trap flag.
+	eflagsTF = uint64(1) << 8
+	// eflagsIF is the mask for the interrupt flag.
+ eflagsIF = uint64(1) << 9 + // eflagsDF is the mask for the direction flag. + eflagsDF = uint64(1) << 10 + // eflagsOF is the mask for the overflow flag. + eflagsOF = uint64(1) << 11 + // eflagsIOPL is the mask for the I/O privilege level. + eflagsIOPL = uint64(3) << 12 + // eflagsNT is the mask for the nested task bit. + eflagsNT = uint64(1) << 14 + // eflagsRF is the mask for the resume flag. + eflagsRF = uint64(1) << 16 + // eflagsVM is the mask for the virtual mode bit. + eflagsVM = uint64(1) << 17 + // eflagsAC is the mask for the alignment check / access control bit. + eflagsAC = uint64(1) << 18 + // eflagsVIF is the mask for the virtual interrupt flag. + eflagsVIF = uint64(1) << 19 + // eflagsVIP is the mask for the virtual interrupt pending bit. + eflagsVIP = uint64(1) << 20 + // eflagsID is the mask for the CPUID detection bit. + eflagsID = uint64(1) << 21 + + // eflagsPtraceMutable is the mask for the set of EFLAGS that may be + // changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous to + // Linux's FLAG_MASK. + eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT + + // eflagsRestorable is the mask for the set of EFLAGS that may be changed by + // SignalReturn. eflagsRestorable is analogous to Linux's FIX_EFLAGS. + eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF +) + +// Segment selectors. See arch/x86/include/asm/segment.h. +const ( + userCS = 0x33 // guest ring 3 code selector + user32CS = 0x23 // guest ring 3 32 bit code selector + userDS = 0x2b // guest ring 3 data selector + + _FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector + _GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector +) + +var ( + // TrapInstruction is the x86 trap instruction. + TrapInstruction = [1]byte{0xcc} + + // CPUIDInstruction is the x86 CPUID instruction. 
+ CPUIDInstruction = [2]byte{0xf, 0xa2} + + // X86TrapFlag is an exported const for use by other packages. + X86TrapFlag uint64 = (1 << 8) +) + +// x86FPState is x86 floating point state. +type x86FPState []byte + +// initX86FPState (defined in asm files) sets up initial state. +func initX86FPState(data *FloatingPointData, useXsave bool) + +func newX86FPStateSlice() []byte { + size, align := cpuid.HostFeatureSet().ExtendedStateSize() + capacity := size + // Always use at least 4096 bytes. + if capacity < 4096 { + capacity = 4096 + } + return alignedBytes(capacity, align)[:size] +} + +// newX86FPState returns an initialized floating point state. +// +// The returned state is large enough to store all floating point state +// supported by host, even if the app won't use much of it due to a restricted +// FeatureSet. Since they may still be able to see state not advertised by +// CPUID we must ensure it does not contain any sentry state. +func newX86FPState() x86FPState { + f := x86FPState(newX86FPStateSlice()) + initX86FPState(f.FloatingPointData(), cpuid.HostFeatureSet().UseXsave()) + return f +} + +// fork creates and returns an identical copy of the x86 floating point state. +func (f x86FPState) fork() x86FPState { + n := x86FPState(newX86FPStateSlice()) + copy(n, f) + return n +} + +// FloatingPointData returns the raw data pointer. +func (f x86FPState) FloatingPointData() *FloatingPointData { + return (*FloatingPointData)(&f[0]) +} + +// NewFloatingPointData returns a new floating point data blob. +// +// This is primarily for use in tests. +func NewFloatingPointData() *FloatingPointData { + return (*FloatingPointData)(&(newX86FPState()[0])) +} + +// State contains the common architecture bits for X86 (the build tag of this +// file ensures it's only built on x86). +type State struct { + // The system registers. + Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` + + // Our floating point state. 
+	x86FPState `state:"wait"`
+
+	// FeatureSet is a pointer to the currently active feature set.
+	FeatureSet *cpuid.FeatureSet
+}
+
+// Proto returns a protobuf representation of the system registers in State.
+func (s State) Proto() *rpb.Registers {
+	regs := &rpb.AMD64Registers{
+		Rax:     s.Regs.Rax,
+		Rbx:     s.Regs.Rbx,
+		Rcx:     s.Regs.Rcx,
+		Rdx:     s.Regs.Rdx,
+		Rsi:     s.Regs.Rsi,
+		Rdi:     s.Regs.Rdi,
+		Rsp:     s.Regs.Rsp,
+		Rbp:     s.Regs.Rbp,
+		R8:      s.Regs.R8,
+		R9:      s.Regs.R9,
+		R10:     s.Regs.R10,
+		R11:     s.Regs.R11,
+		R12:     s.Regs.R12,
+		R13:     s.Regs.R13,
+		R14:     s.Regs.R14,
+		R15:     s.Regs.R15,
+		Rip:     s.Regs.Rip,
+		Rflags:  s.Regs.Eflags,
+		OrigRax: s.Regs.Orig_rax,
+		Cs:      s.Regs.Cs,
+		Ds:      s.Regs.Ds,
+		Es:      s.Regs.Es,
+		Fs:      s.Regs.Fs,
+		Gs:      s.Regs.Gs,
+		Ss:      s.Regs.Ss,
+		FsBase:  s.Regs.Fs_base,
+		GsBase:  s.Regs.Gs_base,
+	}
+	return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}}
+}
+
+// Fork creates and returns an identical copy of the state.
+func (s *State) Fork() State {
+	return State{
+		Regs:       s.Regs,
+		x86FPState: s.x86FPState.fork(),
+		FeatureSet: s.FeatureSet,
+	}
+}
+
+// StateData implements Context.StateData.
+func (s *State) StateData() *State {
+	return s
+}
+
+// CPUIDEmulate emulates a cpuid instruction.
+func (s *State) CPUIDEmulate(l log.Logger) {
+	argax := uint32(s.Regs.Rax)
+	argcx := uint32(s.Regs.Rcx)
+	ax, bx, cx, dx := s.FeatureSet.EmulateID(argax, argcx)
+	s.Regs.Rax = uint64(ax)
+	s.Regs.Rbx = uint64(bx)
+	s.Regs.Rcx = uint64(cx)
+	s.Regs.Rdx = uint64(dx)
+	l.Debugf("CPUID(%x,%x): %x %x %x %x", argax, argcx, ax, bx, cx, dx)
+}
+
+// SingleStep implements Context.SingleStep.
+func (s *State) SingleStep() bool {
+	return s.Regs.Eflags&X86TrapFlag != 0
+}
+
+// SetSingleStep enables single stepping.
+func (s *State) SetSingleStep() {
+	// Set the trap flag.
+	s.Regs.Eflags |= X86TrapFlag
+}
+
+// ClearSingleStep disables single stepping.
+func (s *State) ClearSingleStep() {
+	// Clear the trap flag.
+ s.Regs.Eflags &= ^X86TrapFlag +} + +// RegisterMap returns a map of all registers. +func (s *State) RegisterMap() (map[string]uintptr, error) { + return map[string]uintptr{ + "R15": uintptr(s.Regs.R15), + "R14": uintptr(s.Regs.R14), + "R13": uintptr(s.Regs.R13), + "R12": uintptr(s.Regs.R12), + "Rbp": uintptr(s.Regs.Rbp), + "Rbx": uintptr(s.Regs.Rbx), + "R11": uintptr(s.Regs.R11), + "R10": uintptr(s.Regs.R10), + "R9": uintptr(s.Regs.R9), + "R8": uintptr(s.Regs.R8), + "Rax": uintptr(s.Regs.Rax), + "Rcx": uintptr(s.Regs.Rcx), + "Rdx": uintptr(s.Regs.Rdx), + "Rsi": uintptr(s.Regs.Rsi), + "Rdi": uintptr(s.Regs.Rdi), + "Orig_rax": uintptr(s.Regs.Orig_rax), + "Rip": uintptr(s.Regs.Rip), + "Cs": uintptr(s.Regs.Cs), + "Eflags": uintptr(s.Regs.Eflags), + "Rsp": uintptr(s.Regs.Rsp), + "Ss": uintptr(s.Regs.Ss), + "Fs_base": uintptr(s.Regs.Fs_base), + "Gs_base": uintptr(s.Regs.Gs_base), + "Ds": uintptr(s.Regs.Ds), + "Es": uintptr(s.Regs.Es), + "Fs": uintptr(s.Regs.Fs), + "Gs": uintptr(s.Regs.Gs), + }, nil +} + +// PtraceGetRegs implements Context.PtraceGetRegs. +func (s *State) PtraceGetRegs(dst io.Writer) (int, error) { + return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs())) +} + +func (s *State) ptraceGetRegs() syscall.PtraceRegs { + regs := s.Regs + // These may not be initialized. + if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 { + regs.Eflags = eflagsIF + regs.Cs = userCS + regs.Ss = userDS + } + // As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base + // addresses using reserved descriptors in the GDT instead of the MSRs, + // with selector values FS_TLS_SEL and GS_TLS_SEL respectively. These + // values are actually visible in struct user_regs_struct::fs/gs; + // arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize struct + // thread_struct::fsindex/gsindex. + // + // We always use fs == gs == 0 when fs_base/gs_base is in use, for + // simplicity. 
+ // + // Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via + // arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is a + // 32-bit value and fsindex/gsindex indicates that this optimization is + // in use, as well as the reverse case of setting fs/gs to + // FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the + // same in PtraceSetRegs.) + // + // TODO: Remove this fixup since newer Linux doesn't have + // this behavior anymore. + if regs.Fs == 0 && regs.Fs_base <= 0xffffffff { + regs.Fs = _FS_TLS_SEL + } + if regs.Gs == 0 && regs.Gs_base <= 0xffffffff { + regs.Gs = _GS_TLS_SEL + } + return regs +} + +var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{})) + +// PtraceSetRegs implements Context.PtraceSetRegs. +func (s *State) PtraceSetRegs(src io.Reader) (int, error) { + var regs syscall.PtraceRegs + buf := make([]byte, ptraceRegsSize) + if _, err := io.ReadFull(src, buf); err != nil { + return 0, err + } + binary.Unmarshal(buf, usermem.ByteOrder, ®s) + // Truncate segment registers to 16 bits. + regs.Cs = uint64(uint16(regs.Cs)) + regs.Ds = uint64(uint16(regs.Ds)) + regs.Es = uint64(uint16(regs.Es)) + regs.Fs = uint64(uint16(regs.Fs)) + regs.Gs = uint64(uint16(regs.Gs)) + regs.Ss = uint64(uint16(regs.Ss)) + // In Linux this validation is via arch/x86/kernel/ptrace.c:putreg(). 
+ if !isUserSegmentSelector(regs.Cs) { + return 0, syscall.EIO + } + if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) { + return 0, syscall.EIO + } + if regs.Es != 0 && !isUserSegmentSelector(regs.Es) { + return 0, syscall.EIO + } + if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) { + return 0, syscall.EIO + } + if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) { + return 0, syscall.EIO + } + if !isUserSegmentSelector(regs.Ss) { + return 0, syscall.EIO + } + if regs.Fs_base >= uint64(maxAddr64) { + return 0, syscall.EIO + } + if regs.Gs_base >= uint64(maxAddr64) { + return 0, syscall.EIO + } + // CS and SS are validated, but changes to them are otherwise silently + // ignored on amd64. + regs.Cs = s.Regs.Cs + regs.Ss = s.Regs.Ss + // fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux. + if regs.Fs_base != s.Regs.Fs_base { + regs.Fs = 0 + } + if regs.Gs_base != s.Regs.Gs_base { + regs.Gs = 0 + } + // Ignore "stale" TLS segment selectors for FS and GS. See comment in + // ptraceGetRegs. + if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 { + regs.Fs = 0 + } + if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 { + regs.Gs = 0 + } + regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable) + s.Regs = regs + return ptraceRegsSize, nil +} + +// isUserSegmentSelector returns true if the given segment selector specifies a +// privilege level of 3 (USER_RPL). +func isUserSegmentSelector(reg uint64) bool { + return reg&3 == 3 +} + +// ptraceFPRegsSize is the size in bytes of Linux's user_i387_struct, the type +// manipulated by PTRACE_GETFPREGS and PTRACE_SETFPREGS on x86. Equivalently, +// ptraceFPRegsSize is the size in bytes of the x86 FXSAVE area. +const ptraceFPRegsSize = 512 + +// PtraceGetFPRegs implements Context.PtraceGetFPRegs. +func (s *State) PtraceGetFPRegs(dst io.Writer) (int, error) { + return dst.Write(s.x86FPState[:ptraceFPRegsSize]) +} + +// PtraceSetFPRegs implements Context.PtraceSetFPRegs. 
+func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) { + var f [ptraceFPRegsSize]byte + n, err := io.ReadFull(src, f[:]) + if err != nil { + return 0, err + } + // Force reserved bits in MXCSR to 0. This is consistent with Linux. + sanitizeMXCSR(x86FPState(f[:])) + // N.B. this only copies the beginning of the FP state, which + // corresponds to the FXSAVE area. + copy(s.x86FPState, f[:]) + return n, nil +} + +const ( + // mxcsrOffset is the offset in bytes of the MXCSR field from the start of + // the FXSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE + // Area") + mxcsrOffset = 24 + + // mxcsrMaskOffset is the offset in bytes of the MXCSR_MASK field from the + // start of the FXSAVE area. + mxcsrMaskOffset = 28 +) + +var ( + mxcsrMask uint32 + initMXCSRMask sync.Once +) + +// sanitizeMXCSR coerces reserved bits in the MXCSR field of f to 0. ("FXRSTOR +// generates a general-protection fault (#GP) in response to an attempt to set +// any of the reserved bits of the MXCSR register." - Intel SDM Vol. 1, Section +// 10.5.1.2 "SSE State") +func sanitizeMXCSR(f x86FPState) { + mxcsr := usermem.ByteOrder.Uint32(f[mxcsrOffset:]) + initMXCSRMask.Do(func() { + temp := x86FPState(alignedBytes(uint(ptraceFPRegsSize), 16)) + initX86FPState(temp.FloatingPointData(), false /* useXsave */) + mxcsrMask = usermem.ByteOrder.Uint32(temp[mxcsrMaskOffset:]) + if mxcsrMask == 0 { + // "If the value of the MXCSR_MASK field is 00000000H, then the + // MXCSR_MASK value is the default value of 0000FFBFH." - Intel SDM + // Vol. 1, Section 11.6.6 "Guidelines for Writing to the MXCSR + // Register" + mxcsrMask = 0xffbf + } + }) + mxcsr &= mxcsrMask + usermem.ByteOrder.PutUint32(f[mxcsrOffset:], mxcsr) +} + +const ( + // minXstateBytes is the minimum size in bytes of an x86 XSAVE area, equal + // to the size of the XSAVE legacy area (512 bytes) plus the size of the + // XSAVE header (64 bytes). Equivalently, minXstateBytes is GDB's + // X86_XSTATE_SSE_SIZE. 
+ minXstateBytes = 512 + 64 + + // userXstateXCR0Offset is the offset in bytes of the USER_XSTATE_XCR0_WORD + // field in Linux's struct user_xstateregs, which is the type manipulated + // by ptrace(PTRACE_GET/SETREGSET, NT_X86_XSTATE). Equivalently, + // userXstateXCR0Offset is GDB's I386_LINUX_XSAVE_XCR0_OFFSET. + userXstateXCR0Offset = 464 + + // xstateBVOffset is the offset in bytes of the XSTATE_BV field in an x86 + // XSAVE area. + xstateBVOffset = 512 + + // xsaveHeaderZeroedOffset and xsaveHeaderZeroedBytes indicate parts of the + // XSAVE header that we coerce to zero: "Bytes 15:8 of the XSAVE header is + // a state-component bitmap called XCOMP_BV. ... Bytes 63:16 of the XSAVE + // header are reserved." - Intel SDM Vol. 1, Section 13.4.2 "XSAVE Header". + // Linux ignores XCOMP_BV, but it's able to recover from XRSTOR #GP + // exceptions resulting from invalid values; we aren't. Linux also never + // uses the compacted format when doing XSAVE and doesn't even define the + // compaction extensions to XSAVE as a CPU feature, so for simplicity we + // assume no one is using them. + xsaveHeaderZeroedOffset = 512 + 8 + xsaveHeaderZeroedBytes = 64 - 8 +) + +func (s *State) ptraceGetXstateRegs(dst io.Writer, maxlen int) (int, error) { + // N.B. s.x86FPState may contain more state than the application + // expects. We only copy the subset that would be in their XSAVE area. + ess, _ := s.FeatureSet.ExtendedStateSize() + f := make([]byte, ess) + copy(f, s.x86FPState) + // "The XSAVE feature set does not use bytes 511:416; bytes 463:416 are + // reserved." - Intel SDM Vol 1., Section 13.4.1 "Legacy Region of an XSAVE + // Area". Linux uses the first 8 bytes of this area to store the OS XSTATE + // mask. GDB relies on this: see + // gdb/x86-linux-nat.c:x86_linux_read_description(). 
+ usermem.ByteOrder.PutUint64(f[userXstateXCR0Offset:], s.FeatureSet.ValidXCR0Mask()) + if len(f) > maxlen { + f = f[:maxlen] + } + return dst.Write(f) +} + +func (s *State) ptraceSetXstateRegs(src io.Reader, maxlen int) (int, error) { + // Allow users to pass an xstate register set smaller than ours (they can + // mask bits out of XSTATE_BV), as long as it's at least minXstateBytes. + // Also allow users to pass a register set larger than ours; anything after + // their ExtendedStateSize will be ignored. (I think Linux technically + // permits setting a register set smaller than minXstateBytes, but it has + // the same silent truncation behavior in kernel/ptrace.c:ptrace_regset().) + if maxlen < minXstateBytes { + return 0, syscall.EFAULT + } + ess, _ := s.FeatureSet.ExtendedStateSize() + if maxlen > int(ess) { + maxlen = int(ess) + } + f := make([]byte, maxlen) + if _, err := io.ReadFull(src, f); err != nil { + return 0, err + } + // Force reserved bits in MXCSR to 0. This is consistent with Linux. + sanitizeMXCSR(x86FPState(f)) + // Users can't enable *more* XCR0 bits than what we, and the CPU, support. + xstateBV := usermem.ByteOrder.Uint64(f[xstateBVOffset:]) + xstateBV &= s.FeatureSet.ValidXCR0Mask() + usermem.ByteOrder.PutUint64(f[xstateBVOffset:], xstateBV) + // Force XCOMP_BV and reserved bytes in the XSAVE header to 0. + reserved := f[xsaveHeaderZeroedOffset : xsaveHeaderZeroedOffset+xsaveHeaderZeroedBytes] + for i := range reserved { + reserved[i] = 0 + } + return copy(s.x86FPState, f), nil +} + +// Register sets defined in include/uapi/linux/elf.h. +const ( + _NT_PRSTATUS = 1 + _NT_PRFPREG = 2 + _NT_X86_XSTATE = 0x202 +) + +// PtraceGetRegSet implements Context.PtraceGetRegSet. 
+func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { + switch regset { + case _NT_PRSTATUS: + if maxlen < ptraceRegsSize { + return 0, syserror.EFAULT + } + return s.PtraceGetRegs(dst) + case _NT_PRFPREG: + if maxlen < ptraceFPRegsSize { + return 0, syserror.EFAULT + } + return s.PtraceGetFPRegs(dst) + case _NT_X86_XSTATE: + return s.ptraceGetXstateRegs(dst, maxlen) + default: + return 0, syserror.EINVAL + } +} + +// PtraceSetRegSet implements Context.PtraceSetRegSet. +func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { + switch regset { + case _NT_PRSTATUS: + if maxlen < ptraceRegsSize { + return 0, syserror.EFAULT + } + return s.PtraceSetRegs(src) + case _NT_PRFPREG: + if maxlen < ptraceFPRegsSize { + return 0, syserror.EFAULT + } + return s.PtraceSetFPRegs(src) + case _NT_X86_XSTATE: + return s.ptraceSetXstateRegs(src, maxlen) + default: + return 0, syserror.EINVAL + } +} + +// FullRestore indicates whether a full restore is required. +func (s *State) FullRestore() bool { + // A fast system call return is possible only if + // + // * RCX matches the instruction pointer. + // * R11 matches our flags value. + // * Usermode does not expect to set either the resume flag or the + // virtual mode flags (unlikely.) + // * CS and SS are set to the standard selectors. + // + // That is, SYSRET results in the correct final state. + fastRestore := s.Regs.Rcx == s.Regs.Rip && + s.Regs.Eflags == s.Regs.R11 && + (s.Regs.Eflags&eflagsRF == 0) && + (s.Regs.Eflags&eflagsVM == 0) && + s.Regs.Cs == userCS && + s.Regs.Ss == userDS + return !fastRestore +} + +// New returns a new architecture context. 
+func New(arch Arch, fs *cpuid.FeatureSet) Context { + switch arch { + case AMD64: + return &context64{ + State{ + x86FPState: newX86FPState(), + FeatureSet: fs, + }, + []x86FPState(nil), + } + } + panic(fmt.Sprintf("unknown architecture %v", arch)) +} diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go new file mode 100644 index 000000000..70e0e35b7 --- /dev/null +++ b/pkg/sentry/arch/auxv.go @@ -0,0 +1,28 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// An AuxEntry represents an entry in an ELF auxiliary vector. +type AuxEntry struct { + Key uint64 + Value usermem.Addr +} + +// An Auxv represents an ELF auxiliary vector. +type Auxv []AuxEntry diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto new file mode 100644 index 000000000..437ff44ca --- /dev/null +++ b/pkg/sentry/arch/registers.proto @@ -0,0 +1,55 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +message AMD64Registers { + uint64 rax = 1; + uint64 rbx = 2; + uint64 rcx = 3; + uint64 rdx = 4; + uint64 rsi = 5; + uint64 rdi = 6; + uint64 rsp = 7; + uint64 rbp = 8; + + uint64 r8 = 9; + uint64 r9 = 10; + uint64 r10 = 11; + uint64 r11 = 12; + uint64 r12 = 13; + uint64 r13 = 14; + uint64 r14 = 15; + uint64 r15 = 16; + + uint64 rip = 17; + uint64 rflags = 18; + uint64 orig_rax = 19; + uint64 cs = 20; + uint64 ds = 21; + uint64 es = 22; + uint64 fs = 23; + uint64 gs = 24; + uint64 ss = 25; + uint64 fs_base = 26; + uint64 gs_base = 27; +} + +message Registers { + oneof arch { + AMD64Registers amd64 = 1; + } +} diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go new file mode 100644 index 000000000..36437b965 --- /dev/null +++ b/pkg/sentry/arch/signal_act.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +// Special values for SignalAct.Handler. +const ( + // SignalActDefault is SIG_DFL and specifies that the default behavior for + // a signal should be taken. + SignalActDefault = 0 + + // SignalActIgnore is SIG_IGN and specifies that a signal should be + // ignored. + SignalActIgnore = 1 +) + +// Available signal flags. 
+const ( + SignalFlagNoCldStop = 0x00000001 + SignalFlagNoCldWait = 0x00000002 + SignalFlagSigInfo = 0x00000004 + SignalFlagRestorer = 0x04000000 + SignalFlagOnStack = 0x08000000 + SignalFlagRestart = 0x10000000 + SignalFlagInterrupt = 0x20000000 + SignalFlagNoDefer = 0x40000000 + SignalFlagResetHandler = 0x80000000 +) + +// IsSigInfo returns true iff this handle expects siginfo. +func (s SignalAct) IsSigInfo() bool { + return s.Flags&SignalFlagSigInfo != 0 +} + +// IsNoDefer returns true iff this SignalAct has the NoDefer flag set. +func (s SignalAct) IsNoDefer() bool { + return s.Flags&SignalFlagNoDefer != 0 +} + +// IsRestart returns true iff this SignalAct has the Restart flag set. +func (s SignalAct) IsRestart() bool { + return s.Flags&SignalFlagRestart != 0 +} + +// IsResetHandler returns true iff this SignalAct has the ResetHandler flag set. +func (s SignalAct) IsResetHandler() bool { + return s.Flags&SignalFlagResetHandler != 0 +} + +// IsOnStack returns true iff this SignalAct has the OnStack flag set. +func (s SignalAct) IsOnStack() bool { + return s.Flags&SignalFlagOnStack != 0 +} + +// HasRestorer returns true iff this SignalAct has the Restorer flag set. +func (s SignalAct) HasRestorer() bool { + return s.Flags&SignalFlagRestorer != 0 +} + +// NativeSignalAct is a type that is equivalent to struct sigaction in the +// guest architecture. +type NativeSignalAct interface { + // SerializeFrom copies the data in the host SignalAct s into this object. + SerializeFrom(s *SignalAct) + + // DeserializeTo copies the data in this object into the host SignalAct s. + DeserializeTo(s *SignalAct) +} diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go new file mode 100644 index 000000000..4040b530f --- /dev/null +++ b/pkg/sentry/arch/signal_amd64.go @@ -0,0 +1,476 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package arch + +import ( + "encoding/binary" + "math" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// SignalAct represents the action that should be taken when a signal is +// delivered, and is equivalent to struct sigaction on 64-bit x86. +type SignalAct struct { + Handler uint64 + Flags uint64 + Restorer uint64 + Mask linux.SignalSet +} + +// SerializeFrom implements NativeSignalAct.SerializeFrom. +func (s *SignalAct) SerializeFrom(other *SignalAct) { + *s = *other +} + +// DeserializeTo implements NativeSignalAct.DeserializeTo. +func (s *SignalAct) DeserializeTo(other *SignalAct) { + *other = *s +} + +// SignalStack represents information about a user stack, and is equivalent to +// stack_t on 64-bit x86. +type SignalStack struct { + Addr uint64 + Flags uint32 + _ uint32 + Size uint64 +} + +// SerializeFrom implements NativeSignalStack.SerializeFrom. +func (s *SignalStack) SerializeFrom(other *SignalStack) { + *s = *other +} + +// DeserializeTo implements NativeSignalStack.DeserializeTo. +func (s *SignalStack) DeserializeTo(other *SignalStack) { + *other = *s +} + +// SignalInfo represents information about a signal being delivered, and is +// equivalent to struct siginfo on 64-bit x86. +type SignalInfo struct { + Signo int32 // Signal number + Errno int32 // Errno value + Code int32 // Signal code + _ uint32 + + // struct siginfo::_sifields is a union. 
In SignalInfo, fields in the union + // are accessed through methods. + // + // For reference, here is the definition of _sifields: (_sigfault._trapno, + // which does not exist on x86, omitted for clarity) + // + // union { + // int _pad[SI_PAD_SIZE]; + // + // /* kill() */ + // struct { + // __kernel_pid_t _pid; /* sender's pid */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // } _kill; + // + // /* POSIX.1b timers */ + // struct { + // __kernel_timer_t _tid; /* timer id */ + // int _overrun; /* overrun count */ + // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; + // sigval_t _sigval; /* same as below */ + // int _sys_private; /* not to be passed to user */ + // } _timer; + // + // /* POSIX.1b signals */ + // struct { + // __kernel_pid_t _pid; /* sender's pid */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // sigval_t _sigval; + // } _rt; + // + // /* SIGCHLD */ + // struct { + // __kernel_pid_t _pid; /* which child */ + // __ARCH_SI_UID_T _uid; /* sender's uid */ + // int _status; /* exit code */ + // __ARCH_SI_CLOCK_T _utime; + // __ARCH_SI_CLOCK_T _stime; + // } _sigchld; + // + // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ + // struct { + // void *_addr; /* faulting insn/memory ref. */ + // short _addr_lsb; /* LSB of the reported address */ + // } _sigfault; + // + // /* SIGPOLL */ + // struct { + // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ + // int _fd; + // } _sigpoll; + // + // /* SIGSYS */ + // struct { + // void *_call_addr; /* calling user insn */ + // int _syscall; /* triggering system call number */ + // unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + // } _sigsys; + // } _sifields; + // + // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128 + // bytes. + Fields [128 - 16]byte +} + +// FixSignalCodeForUser fixes up si_code. +// +// The si_code we get from Linux may contain the kernel-specific code in the +// top 16 bits if it's positive (e.g., from ptrace). 
Linux's +// copy_siginfo_to_user does +// err |= __put_user((short)from->si_code, &to->si_code); +// to mask out those bits and we need to do the same. +func (s *SignalInfo) FixSignalCodeForUser() { + if s.Code > 0 { + s.Code &= 0x0000ffff + } +} + +// Pid returns the si_pid field. +func (s *SignalInfo) Pid() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[0:4])) +} + +// SetPid mutates the si_pid field. +func (s *SignalInfo) SetPid(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) +} + +// Uid returns the si_uid field. +func (s *SignalInfo) Uid() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[4:8])) +} + +// SetUid mutates the si_uid field. +func (s *SignalInfo) SetUid(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) +} + +// Addr returns the si_addr field. +func (s *SignalInfo) Addr() uint64 { + return usermem.ByteOrder.Uint64(s.Fields[0:8]) +} + +// SetAddr sets the si_addr field. +func (s *SignalInfo) SetAddr(val uint64) { + usermem.ByteOrder.PutUint64(s.Fields[0:8], val) +} + +// Status returns the si_status field. +func (s *SignalInfo) Status() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[8:12])) +} + +// SetStatus mutates the si_status field. +func (s *SignalInfo) SetStatus(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) +} + +// CallAddr returns the si_call_addr field. +func (s *SignalInfo) CallAddr() uint64 { + return usermem.ByteOrder.Uint64(s.Fields[0:8]) +} + +// SetCallAddr mutates the si_call_addr field. +func (s *SignalInfo) SetCallAddr(val uint64) { + usermem.ByteOrder.PutUint64(s.Fields[0:8], val) +} + +// Syscall returns the si_syscall field. +func (s *SignalInfo) Syscall() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[8:12])) +} + +// SetSyscall mutates the si_syscall field. +func (s *SignalInfo) SetSyscall(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) +} + +// Arch returns the si_arch field. 
+func (s *SignalInfo) Arch() uint32 { + return usermem.ByteOrder.Uint32(s.Fields[12:16]) +} + +// SetArch mutates the si_arch field. +func (s *SignalInfo) SetArch(val uint32) { + usermem.ByteOrder.PutUint32(s.Fields[12:16], val) +} + +// SignalContext64 is equivalent to struct sigcontext, the type passed as the +// second argument to signal handlers set by signal(2). +type SignalContext64 struct { + R8 uint64 + R9 uint64 + R10 uint64 + R11 uint64 + R12 uint64 + R13 uint64 + R14 uint64 + R15 uint64 + Rdi uint64 + Rsi uint64 + Rbp uint64 + Rbx uint64 + Rdx uint64 + Rax uint64 + Rcx uint64 + Rsp uint64 + Rip uint64 + Eflags uint64 + Cs uint16 + Gs uint16 // always 0 on amd64. + Fs uint16 // always 0 on amd64. + Ss uint16 // only restored if _UC_STRICT_RESTORE_SS (unsupported). + Err uint64 + Trapno uint64 + Oldmask linux.SignalSet + Cr2 uint64 + // Pointer to a struct _fpstate. + Fpstate uint64 + Reserved [8]uint64 +} + +// Flags for UContext64.Flags. +const ( + _UC_FP_XSTATE = 1 + _UC_SIGCONTEXT_SS = 2 + _UC_STRICT_RESTORE_SS = 4 +) + +// UContext64 is equivalent to ucontext_t on 64-bit x86. +type UContext64 struct { + Flags uint64 + Link uint64 + Stack SignalStack + MContext SignalContext64 + Sigset linux.SignalSet +} + +// NewSignalAct implements Context.NewSignalAct. +func (c *context64) NewSignalAct() NativeSignalAct { + return &SignalAct{} +} + +// NewSignalStack implements Context.NewSignalStack. +func (c *context64) NewSignalStack() NativeSignalStack { + return &SignalStack{} +} + +// From Linux 'arch/x86/include/uapi/asm/sigcontext.h' the following is the +// size of the magic cookie at the end of the xsave frame. +// +// NOTE: Currently we don't actually populate the fpstate +// on the signal stack. +const _FP_XSTATE_MAGIC2_SIZE = 4 + +func (c *context64) fpuFrameSize() (size int, useXsave bool) { + size = len(c.x86FPState) + if size > 512 { + // Make room for the magic cookie at the end of the xsave frame. 
+ size += _FP_XSTATE_MAGIC2_SIZE + useXsave = true + } + return size, useXsave +} + +// SignalSetup implements Context.SignalSetup. (Compare to Linux's +// arch/x86/kernel/signal.c:__setup_rt_frame().) +func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error { + sp := st.Bottom + + // "The 128-byte area beyond the location pointed to by %rsp is considered + // to be reserved and shall not be modified by signal or interrupt + // handlers. ... leaf functions may use this area for their entire stack + // frame, rather than adjusting the stack pointer in the prologue and + // epilogue." - AMD64 ABI + // + // (But this doesn't apply if we're starting at the top of the signal + // stack, in which case there is no following stack frame.) + if !(alt.IsEnabled() && sp == alt.Top()) { + sp -= 128 + } + + // Allocate space for floating point state on the stack. + // + // This isn't strictly necessary because we don't actually populate + // the fpstate. However we do store the floating point state of the + // interrupted thread inside the sentry. Simply accounting for this + // space on the user stack naturally caps the amount of memory the + // sentry will allocate for this purpose. + fpSize, _ := c.fpuFrameSize() + sp = (sp - usermem.Addr(fpSize)) & ^usermem.Addr(63) + + // Construct the UContext64 now since we need its size. + uc := &UContext64{ + // No _UC_FP_XSTATE: see Fpstate above. + // No _UC_STRICT_RESTORE_SS: we don't allow SS changes. 
+ Flags: _UC_SIGCONTEXT_SS, + Stack: *alt, + MContext: SignalContext64{ + R8: c.Regs.R8, + R9: c.Regs.R9, + R10: c.Regs.R10, + R11: c.Regs.R11, + R12: c.Regs.R12, + R13: c.Regs.R13, + R14: c.Regs.R14, + R15: c.Regs.R15, + Rdi: c.Regs.Rdi, + Rsi: c.Regs.Rsi, + Rbp: c.Regs.Rbp, + Rbx: c.Regs.Rbx, + Rdx: c.Regs.Rdx, + Rax: c.Regs.Rax, + Rcx: c.Regs.Rcx, + Rsp: c.Regs.Rsp, + Rip: c.Regs.Rip, + Eflags: c.Regs.Eflags, + Cs: uint16(c.Regs.Cs), + Ss: uint16(c.Regs.Ss), + Oldmask: sigset, + }, + Sigset: sigset, + } + + // TODO: Set SignalContext64.Err, Trapno, and Cr2 based on + // the fault that caused the signal. For now, leave Err and Trapno + // unset and assume CR2 == info.Addr() for SIGSEGVs and SIGBUSes. + if linux.Signal(info.Signo) == linux.SIGSEGV || linux.Signal(info.Signo) == linux.SIGBUS { + uc.MContext.Cr2 = info.Addr() + } + + // "... the value (%rsp+8) is always a multiple of 16 (...) when control is + // transferred to the function entry point." - AMD64 ABI + ucSize := binary.Size(uc) + if ucSize < 0 { + // This can only happen if we've screwed up the definition of + // UContext64. + panic("can't get size of UContext64") + } + // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128. + frameSize := int(st.Arch.Width()) + ucSize + 128 + frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8 + sp = frameBottom + usermem.Addr(frameSize) + st.Bottom = sp + + info.FixSignalCodeForUser() + + // Set up the stack frame. + infoAddr, err := st.Push(info) + if err != nil { + return err + } + ucAddr, err := st.Push(uc) + if err != nil { + return err + } + if act.HasRestorer() { + // Push the restorer return address. + // Note that this doesn't need to be popped. + if _, err := st.Push(usermem.Addr(act.Restorer)); err != nil { + return err + } + } else { + // amd64 requires a restorer. + return syscall.EFAULT + } + + // Set up registers. 
+ c.Regs.Rip = act.Handler + c.Regs.Rsp = uint64(st.Bottom) + c.Regs.Rdi = uint64(info.Signo) + c.Regs.Rsi = uint64(infoAddr) + c.Regs.Rdx = uint64(ucAddr) + c.Regs.Rax = 0 + c.Regs.Ds = userDS + c.Regs.Es = userDS + c.Regs.Cs = userCS + c.Regs.Ss = userDS + + // Save the thread's floating point state. + c.sigFPState = append(c.sigFPState, c.x86FPState) + + // Signal handler gets a clean floating point state. + c.x86FPState = newX86FPState() + + return nil +} + +// SignalRestore implements Context.SignalRestore. (Compare to Linux's +// arch/x86/kernel/signal.c:sys_rt_sigreturn().) +func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, error) { + // Copy out the stack frame. + var uc UContext64 + if _, err := st.Pop(&uc); err != nil { + return 0, err + } + var info SignalInfo + if _, err := st.Pop(&info); err != nil { + return 0, err + } + + // Restore registers. + c.Regs.R8 = uc.MContext.R8 + c.Regs.R9 = uc.MContext.R9 + c.Regs.R10 = uc.MContext.R10 + c.Regs.R11 = uc.MContext.R11 + c.Regs.R12 = uc.MContext.R12 + c.Regs.R13 = uc.MContext.R13 + c.Regs.R14 = uc.MContext.R14 + c.Regs.R15 = uc.MContext.R15 + c.Regs.Rdi = uc.MContext.Rdi + c.Regs.Rsi = uc.MContext.Rsi + c.Regs.Rbp = uc.MContext.Rbp + c.Regs.Rbx = uc.MContext.Rbx + c.Regs.Rdx = uc.MContext.Rdx + c.Regs.Rax = uc.MContext.Rax + c.Regs.Rcx = uc.MContext.Rcx + c.Regs.Rsp = uc.MContext.Rsp + c.Regs.Rip = uc.MContext.Rip + c.Regs.Eflags = (c.Regs.Eflags & ^eflagsRestorable) | (uc.MContext.Eflags & eflagsRestorable) + c.Regs.Cs = uint64(uc.MContext.Cs) | 3 + // N.B. _UC_STRICT_RESTORE_SS not supported. + c.Regs.Orig_rax = math.MaxUint64 + + // Restore floating point state. + l := len(c.sigFPState) + if l > 0 { + c.x86FPState = c.sigFPState[l-1] + // NOTE: State save requires that any slice + // elements from '[len:cap]' to be zero value. 
+ c.sigFPState[l-1] = nil + c.sigFPState = c.sigFPState[0 : l-1] + } else { + // This might happen if sigreturn(2) calls are unbalanced with + // respect to signal handler entries. This is not expected so + // don't bother to do anything fancy with the floating point + // state. + log.Infof("sigreturn unable to restore application fpstate") + } + + return uc.Sigset, nil +} diff --git a/pkg/sentry/arch/signal_info.go b/pkg/sentry/arch/signal_info.go new file mode 100644 index 000000000..ec004ae75 --- /dev/null +++ b/pkg/sentry/arch/signal_info.go @@ -0,0 +1,66 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +// Possible values for SignalInfo.Code. These values originate from the Linux +// kernel's include/uapi/asm-generic/siginfo.h. +const ( + // SignalInfoUser (properly SI_USER) indicates that a signal was sent from + // a kill() or raise() syscall. + SignalInfoUser = 0 + + // SignalInfoKernel (properly SI_KERNEL) indicates that the signal was sent + // by the kernel. + SignalInfoKernel = 0x80 + + // SignalInfoTimer (properly SI_TIMER) indicates that the signal was sent + // by an expired timer. + SignalInfoTimer = -2 + + // SignalInfoTkill (properly SI_TKILL) indicates that the signal was sent + // from a tkill() or tgkill() syscall. + SignalInfoTkill = -6 + + // CLD_* codes are only meaningful for SIGCHLD. + + // CLD_EXITED indicates that a task exited. 
+ CLD_EXITED = 1 + + // CLD_KILLED indicates that a task was killed by a signal. + CLD_KILLED = 2 + + // CLD_DUMPED indicates that a task was killed by a signal and then dumped + // core. + CLD_DUMPED = 3 + + // CLD_TRAPPED indicates that a task was stopped by ptrace. + CLD_TRAPPED = 4 + + // CLD_STOPPED indicates that a thread group completed a group stop. + CLD_STOPPED = 5 + + // CLD_CONTINUED indicates that a group-stopped thread group was continued. + CLD_CONTINUED = 6 + + // SYS_* codes are only meaningful for SIGSYS. + + // SYS_SECCOMP indicates that a signal originates from seccomp. + SYS_SECCOMP = 1 + + // TRAP_* codes are only meaningful for SIGTRAP. + + // TRAP_BRKPT indicates a breakpoint trap. + TRAP_BRKPT = 1 +) diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go new file mode 100644 index 000000000..7c6531d79 --- /dev/null +++ b/pkg/sentry/arch/signal_stack.go @@ -0,0 +1,58 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package arch + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // SignalStackFlagOnStack is possible set on return from getaltstack, + // in order to indicate that the thread is currently on the alt stack. + SignalStackFlagOnStack = 1 + + // SignalStackFlagDisable is a flag to indicate the stack is disabled. 
+ SignalStackFlagDisable = 2 +) + +// IsEnabled returns true iff this signal stack is marked as enabled. +func (s SignalStack) IsEnabled() bool { + return s.Flags&SignalStackFlagDisable == 0 +} + +// Top returns the stack's top address. +func (s SignalStack) Top() usermem.Addr { + return usermem.Addr(s.Addr + s.Size) +} + +// SetOnStack marks this signal stack as in use. (This is only called on copies +// sent to user applications, so there's no corresponding ClearOnStack.) +func (s *SignalStack) SetOnStack() { + s.Flags |= SignalStackFlagOnStack +} + +// NativeSignalStack is a type that is equivalent to stack_t in the guest +// architecture. +type NativeSignalStack interface { + // SerializeFrom copies the data in the host SignalStack s into this + // object. + SerializeFrom(s *SignalStack) + + // DeserializeTo copies the data in this object into the host SignalStack + // s. + DeserializeTo(s *SignalStack) +} diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go new file mode 100644 index 000000000..6c1b9be82 --- /dev/null +++ b/pkg/sentry/arch/stack.go @@ -0,0 +1,246 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "encoding/binary" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Stack is a simple wrapper around a usermem.IO and an address. +type Stack struct { + // Our arch info. 
+ // We use this for automatic Native conversion of usermem.Addrs during + // Push() and Pop(). + Arch Context + + // The interface used to actually copy user memory. + IO usermem.IO + + // Our current stack bottom. + Bottom usermem.Addr +} + +// Push pushes the given values on to the stack. +// +// (This method supports Addrs and treats them as native types.) +func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) { + for _, v := range vals { + + // We convert some types to well-known serializable quantities. + var norm interface{} + + // For array types, we will automatically add an appropriate + // terminal value. This is done simply to make the interface + // easier to use. + var term interface{} + + switch v.(type) { + case string: + norm = []byte(v.(string)) + term = byte(0) + case []int8, []uint8: + norm = v + term = byte(0) + case []int16, []uint16: + norm = v + term = uint16(0) + case []int32, []uint32: + norm = v + term = uint32(0) + case []int64, []uint64: + norm = v + term = uint64(0) + case []usermem.Addr: + // Special case: simply push recursively. + _, err := s.Push(s.Arch.Native(uintptr(0))) + if err != nil { + return 0, err + } + varr := v.([]usermem.Addr) + for i := len(varr) - 1; i >= 0; i-- { + _, err := s.Push(varr[i]) + if err != nil { + return 0, err + } + } + continue + case usermem.Addr: + norm = s.Arch.Native(uintptr(v.(usermem.Addr))) + default: + norm = v + } + + if term != nil { + _, err := s.Push(term) + if err != nil { + return 0, err + } + } + + c := binary.Size(norm) + if c < 0 { + return 0, fmt.Errorf("bad binary.Size for %T", v) + } + // TODO: Use a real context.Context. + n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{}) + if err != nil || c != n { + return 0, err + } + + s.Bottom -= usermem.Addr(n) + } + + return s.Bottom, nil +} + +// Pop pops the given values off the stack. +// +// (This method supports Addrs and treats them as native types.)
+func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) { + for _, v := range vals { + + vaddr, isVaddr := v.(*usermem.Addr) + + var n int + var err error + if isVaddr { + value := s.Arch.Native(uintptr(0)) + // TODO: Use a real context.Context. + n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{}) + *vaddr = usermem.Addr(s.Arch.Value(value)) + } else { + // TODO: Use a real context.Context. + n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{}) + } + if err != nil { + return 0, err + } + + s.Bottom += usermem.Addr(n) + } + + return s.Bottom, nil +} + +// Align aligns the stack to the given offset. +func (s *Stack) Align(offset int) { + if s.Bottom%usermem.Addr(offset) != 0 { + s.Bottom -= (s.Bottom % usermem.Addr(offset)) + } +} + +// StackLayout describes the location of the arguments and environment on the +// stack. +type StackLayout struct { + // ArgvStart is the beginning of the argument vector. + ArgvStart usermem.Addr + + // ArgvEnd is the end of the argument vector. + ArgvEnd usermem.Addr + + // EnvvStart is the beginning of the environment vector. + EnvvStart usermem.Addr + + // EnvvEnd is the end of the environment vector. + EnvvEnd usermem.Addr +} + +// Load pushes the given args, env and aux vector to the stack using the +// well-known format for a new executable. It returns the start and end +// of the argument and environment vectors. +func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) { + l := StackLayout{} + + // Make sure we start with a 16-byte alignment. + s.Align(16) + + // Push our strings. + l.ArgvEnd = s.Bottom + argAddrs := make([]usermem.Addr, len(args)) + for i := len(args) - 1; i >= 0; i-- { + addr, err := s.Push(args[i]) + if err != nil { + return StackLayout{}, err + } + argAddrs[i] = addr + } + l.ArgvStart = s.Bottom + + // Push our environment. 
+ l.EnvvEnd = s.Bottom + envAddrs := make([]usermem.Addr, len(env)) + for i := len(env) - 1; i >= 0; i-- { + addr, err := s.Push(env[i]) + if err != nil { + return StackLayout{}, err + } + envAddrs[i] = addr + } + l.EnvvStart = s.Bottom + + // We need to align the arguments appropriately. + // + // We must finish on a 16-byte alignment, but we'll play it + // conservatively and finish at 32-bytes. It would be nice to be able + // to call Align here, but unfortunately we need to align the stack + // with all the variable sized arrays pushed. So we just need to do + // some calculations. + argvSize := s.Arch.Width() * uint(len(args)+1) + envvSize := s.Arch.Width() * uint(len(env)+1) + auxvSize := s.Arch.Width() * 2 * uint(len(aux)+1) + total := usermem.Addr(argvSize) + usermem.Addr(envvSize) + usermem.Addr(auxvSize) + usermem.Addr(s.Arch.Width()) + expectedBottom := s.Bottom - total + if expectedBottom%32 != 0 { + s.Bottom -= expectedBottom % 32 + } + + // Push our auxvec. + // NOTE: We need an extra zero here per spec. + // The Push function will automatically terminate + // strings and arrays with a single null value. + auxv := make([]usermem.Addr, 0, len(aux)) + for _, a := range aux { + auxv = append(auxv, usermem.Addr(a.Key), a.Value) + } + auxv = append(auxv, usermem.Addr(0)) + _, err := s.Push(auxv) + if err != nil { + return StackLayout{}, err + } + + // Push environment. + _, err = s.Push(envAddrs) + if err != nil { + return StackLayout{}, err + } + + // Push args. + _, err = s.Push(argAddrs) + if err != nil { + return StackLayout{}, err + } + + // Push arg count. + _, err = s.Push(usermem.Addr(len(args))) + if err != nil { + return StackLayout{}, err + } + + return l, nil +} diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go new file mode 100644 index 000000000..41d8ba0d1 --- /dev/null +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -0,0 +1,52 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package arch + +const restartSyscallNr = uintptr(219) + +// SyscallNo returns the syscall number according to the 64-bit convention. +func (c *context64) SyscallNo() uintptr { + return uintptr(c.Regs.Orig_rax) +} + +// SyscallArgs provides syscall arguments according to the 64-bit convention. +// +// Due to the way addresses are mapped for the sentry this binary *must* be +// built in 64-bit mode. So we can just assume the syscall numbers that come +// back match the expected host system call numbers. +func (c *context64) SyscallArgs() SyscallArguments { + return SyscallArguments{ + SyscallArgument{Value: uintptr(c.Regs.Rdi)}, + SyscallArgument{Value: uintptr(c.Regs.Rsi)}, + SyscallArgument{Value: uintptr(c.Regs.Rdx)}, + SyscallArgument{Value: uintptr(c.Regs.R10)}, + SyscallArgument{Value: uintptr(c.Regs.R8)}, + SyscallArgument{Value: uintptr(c.Regs.R9)}, + } +} + +// RestartSyscall implements Context.RestartSyscall. +func (c *context64) RestartSyscall() { + c.Regs.Rip -= SyscallWidth + c.Regs.Rax = c.Regs.Orig_rax +} + +// RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock. 
+func (c *context64) RestartSyscallWithRestartBlock() { + c.Regs.Rip -= SyscallWidth + c.Regs.Rax = uint64(restartSyscallNr) +} diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD new file mode 100644 index 000000000..ff39f94ba --- /dev/null +++ b/pkg/sentry/context/BUILD @@ -0,0 +1,14 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "context", + srcs = ["context.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/amutex", + "//pkg/log", + ], +) diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go new file mode 100644 index 000000000..e0dffafba --- /dev/null +++ b/pkg/sentry/context/context.go @@ -0,0 +1,103 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package context defines the sentry's Context type. +package context + +import ( + "gvisor.googlesource.com/gvisor/pkg/amutex" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// A Context represents a thread of execution (hereafter "goroutine" to reflect +// Go idiosyncrasy). It carries state associated with the goroutine across API +// boundaries. +// +// While Context exists for essentially the same reasons as Go's standard +// context.Context, the standard type represents the state of an operation +// rather than that of a goroutine. 
This is a critical distinction: +// +// - Unlike context.Context, which "may be passed to functions running in +// different goroutines", it is *not safe* to use the same Context in multiple +// concurrent goroutines. +// +// - It is *not safe* to retain a Context passed to a function beyond the scope +// of that function call. +// +// In both cases, values extracted from the Context should be used instead. +type Context interface { + log.Logger + amutex.Sleeper + + // UninterruptibleSleepStart indicates the beginning of an uninterruptible + // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate + // is true and the Context represents a Task, the Task's AddressSpace is + // deactivated. + UninterruptibleSleepStart(deactivate bool) + + // UninterruptibleSleepFinish indicates the end of an uninterruptible sleep + // state that was begun by a previous call to UninterruptibleSleepStart. If + // activate is true and the Context represents a Task, the Task's + // AddressSpace is activated. Normally activate is the same value as the + // deactivate parameter passed to UninterruptibleSleepStart. + UninterruptibleSleepFinish(activate bool) + + // Value returns the value associated with this Context for key, or nil if + // no value is associated with key. Successive calls to Value with the same + // key returns the same result. + // + // A key identifies a specific value in a Context. Functions that wish to + // retrieve values from Context typically allocate a key in a global + // variable then use that key as the argument to Context.Value. A key can + // be any type that supports equality; packages should define keys as an + // unexported type to avoid collisions. + Value(key interface{}) interface{} +} + +type logContext struct { + log.Logger + NoopSleeper +} + +// Value implements Context.Value. 
+func (logContext) Value(key interface{}) interface{} { + return nil +} + +// NoopSleeper is a noop implementation of amutex.Sleeper and +// Context.UninterruptibleSleep* methods for anonymous embedding in other types +// that do not want to notify kernel.Task about sleeps. +type NoopSleeper struct { + amutex.NoopSleeper +} + +// UninterruptibleSleepStart does nothing. +func (NoopSleeper) UninterruptibleSleepStart(bool) {} + +// UninterruptibleSleepFinish does nothing. +func (NoopSleeper) UninterruptibleSleepFinish(bool) {} + +// Background returns an empty context using the default logger. +// +// Users should be wary of using a Background context. Please tag any use with +// FIXME and a note to remove this use. +// +// Generally, one should use the Task as their context when available, or avoid +// having to use a context in places where a Task is unavailable. +// +// Using a Background context for tests is fine, as long as no values are +// needed from the context in the tested code paths. 
+func Background() Context { + return logContext{Logger: log.Log()} +} diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD new file mode 100644 index 000000000..5977344de --- /dev/null +++ b/pkg/sentry/context/contexttest/BUILD @@ -0,0 +1,34 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "contexttest_state", + srcs = [ + "contexttest.go", + ], + out = "contexttest_state.go", + package = "contexttest", +) + +go_library( + name = "contexttest", + testonly = 1, + srcs = [ + "contexttest.go", + "contexttest_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/platform", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/uniqueid", + "//pkg/state", + ], +) diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go new file mode 100644 index 000000000..193ce3440 --- /dev/null +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -0,0 +1,133 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package contexttest builds a test context.Context. 
+package contexttest + +import ( + "sync/atomic" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" +) + +// Context returns a Context that may be used in tests. Uses ptrace as the +// platform.Platform. +func Context(tb testing.TB) context.Context { + p, err := ptrace.New() + if err != nil { + tb.Fatal(err) + } + // Test usage of context.Background is fine. + return &testContext{ + Context: context.Background(), + l: limits.NewLimitSet(), + platform: p, + } +} + +type testContext struct { + context.Context + l *limits.LimitSet + platform platform.Platform +} + +// globalUniqueID tracks incremental unique identifiers for tests. +var globalUniqueID uint64 + +// lastInotifyCookie is a monotonically increasing counter for generating unique +// inotify cookies. Must be accessed using atomic ops. +var lastInotifyCookie uint32 + +// hostClock implements ktime.Clock. +type hostClock struct { + ktime.WallRateClock + ktime.NoClockEvents +} + +// Now implements ktime.Clock.Now. +func (hostClock) Now() ktime.Time { + return ktime.FromNanoseconds(time.Now().UnixNano()) +} + +// Value implements context.Context. 
+func (t *testContext) Value(key interface{}) interface{} { + switch key { + case limits.CtxLimits: + return t.l + case platform.CtxPlatform: + return t.platform + case uniqueid.CtxGlobalUniqueID: + return atomic.AddUint64(&globalUniqueID, 1) + case uniqueid.CtxInotifyCookie: + return atomic.AddUint32(&lastInotifyCookie, 1) + case ktime.CtxRealtimeClock: + return hostClock{} + default: + return t.Context.Value(key) + } +} + +// RootContext returns a Context that may be used in tests that need root +// credentials. Uses ptrace as the platform.Platform. +func RootContext(tb testing.TB) context.Context { + return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) +} + +// WithCreds returns a copy of ctx carrying creds. +func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context { + return &authContext{ctx, creds} +} + +type authContext struct { + context.Context + creds *auth.Credentials +} + +// Value implements context.Context. +func (ac *authContext) Value(key interface{}) interface{} { + switch key { + case auth.CtxCredentials: + return ac.creds + default: + return ac.Context.Value(key) + } +} + +// WithLimitSet returns a copy of ctx carrying l. +func WithLimitSet(ctx context.Context, l *limits.LimitSet) context.Context { + return limitContext{ctx, l} +} + +type limitContext struct { + context.Context + l *limits.LimitSet +} + +// Value implements context.Context. 
+func (lc limitContext) Value(key interface{}) interface{} { + switch key { + case limits.CtxLimits: + return lc.l + default: + return lc.Context.Value(key) + } +} diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD new file mode 100644 index 000000000..4d1d0d019 --- /dev/null +++ b/pkg/sentry/control/BUILD @@ -0,0 +1,39 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "control", + srcs = [ + "control.go", + "proc.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/control", + visibility = [ + "//pkg/sentry:internal", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/fs", + "//pkg/sentry/fs/host", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/usage", + "//pkg/urpc", + ], +) + +go_test( + name = "control_test", + size = "small", + srcs = ["proc_test.go"], + embed = [":control"], + deps = [ + "//pkg/log", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usage", + ], +) diff --git a/pkg/sentry/control/control.go b/pkg/sentry/control/control.go new file mode 100644 index 000000000..a6ee6e649 --- /dev/null +++ b/pkg/sentry/control/control.go @@ -0,0 +1,17 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package control contains types that expose control server methods, and can +// be used to configure and interact with a running sandbox process. +package control diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go new file mode 100644 index 000000000..7d06a1d04 --- /dev/null +++ b/pkg/sentry/control/proc.go @@ -0,0 +1,293 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "bytes" + "encoding/json" + "fmt" + "syscall" + "text/tabwriter" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +// Proc includes task-related functions. +// +// At the moment, this is limited to exec support. +type Proc struct { + Kernel *kernel.Kernel +} + +// ExecArgs is the set of arguments to exec. +type ExecArgs struct { + // Filename is the filename to load. + // + // If this is provided as "", then the file will be guessed via Argv[0]. 
+ Filename string `json:"filename"` + + // Argv is a list of arguments. + Argv []string `json:"argv"` + + // Envv is a list of environment variables. + Envv []string `json:"envv"` + + // WorkingDirectory defines the working directory for the new process. + WorkingDirectory string `json:"wd"` + + // KUID is the UID to run with in the root user namespace. Defaults to + // root if not set explicitly. + KUID auth.KUID + + // KGID is the GID to run with in the root user namespace. Defaults to + // the root group if not set explicitly. + KGID auth.KGID + + // ExtraKGIDs is the list of additional groups to which the user + // belongs. + ExtraKGIDs []auth.KGID + + // Capabilities is the list of capabilities to give to the process. + Capabilities *auth.TaskCapabilities + + // Detach indicates whether Exec should detach once the process starts. + Detach bool + + // FilePayload determines the files to give to the new process. + urpc.FilePayload +} + +// Exec runs a new task. +func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { + // Import file descriptors. + l := limits.NewLimitSet() + fdm := proc.Kernel.NewFDMap() + defer fdm.DecRef() + + creds := auth.NewUserCredentials( + args.KUID, + args.KGID, + args.ExtraKGIDs, + args.Capabilities, + proc.Kernel.RootUserNamespace()) + + initArgs := kernel.CreateProcessArgs{ + Filename: args.Filename, + Argv: args.Argv, + Envv: args.Envv, + WorkingDirectory: args.WorkingDirectory, + Credentials: creds, + FDMap: fdm, + Umask: 0022, + Limits: l, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: proc.Kernel.RootUTSNamespace(), + IPCNamespace: proc.Kernel.RootIPCNamespace(), + } + ctx := initArgs.NewContext(proc.Kernel) + mounter := fs.FileOwnerFromContext(ctx) + + for appFD, f := range args.FilePayload.Files { + // Copy the underlying FD. + newFD, err := syscall.Dup(int(f.Fd())) + if err != nil { + return err + } + f.Close() + + // Install the given file as an FD. 
+ file, err := host.NewFile(ctx, newFD, mounter) + if err != nil { + syscall.Close(newFD) + return err + } + defer file.DecRef() + if err := fdm.NewFDAt(kdefs.FD(appFD), file, kernel.FDFlags{}, l); err != nil { + return err + } + } + + // Start the new task. + newTG, err := proc.Kernel.CreateProcess(initArgs) + if err != nil { + return err + } + + // If we're supposed to detach, don't wait for the process to exit. + if args.Detach { + *waitStatus = 0 + return nil + } + + // Wait for completion. + newTG.WaitExited() + *waitStatus = newTG.ExitStatus().Status() + return nil +} + +// PsArgs is the set of arguments to ps. +type PsArgs struct { + // JSON will force calls to Ps to return the result as a JSON payload. + JSON bool +} + +// Ps provides a process listing for the running kernel. +func (proc *Proc) Ps(args *PsArgs, out *string) error { + var p []*Process + if e := Processes(proc.Kernel, &p); e != nil { + return e + } + if !args.JSON { + *out = ProcessListToTable(p) + } else { + s, e := ProcessListToJSON(p) + if e != nil { + return e + } + *out = s + } + return nil +} + +// Process contains information about a single process in a Sandbox. +// TODO: Implement TTY field. +type Process struct { + UID auth.KUID `json:"uid"` + PID kernel.ThreadID `json:"pid"` + // Parent PID + PPID kernel.ThreadID `json:"ppid"` + // Processor utilization + C int32 `json:"c"` + // Start time + STime string `json:"stime"` + // CPU time + Time string `json:"time"` + // Executable shortname (e.g. 
"sh" for /bin/sh) + Cmd string `json:"cmd"` +} + +// ProcessListToTable prints a table with the following format: +// UID PID PPID C STIME TIME CMD +// 0 1 0 0 14:04 505262ns tail +func ProcessListToTable(pl []*Process) string { + var buf bytes.Buffer + tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) + fmt.Fprint(tw, "UID\tPID\tPPID\tC\tSTIME\tTIME\tCMD") + for _, d := range pl { + fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s", + d.UID, + d.PID, + d.PPID, + d.C, + d.STime, + d.Time, + d.Cmd) + } + tw.Flush() + return buf.String() +} + +// ProcessListToJSON will return the JSON representation of ps. +func ProcessListToJSON(pl []*Process) (string, error) { + b, err := json.Marshal(pl) + if err != nil { + return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) + } + return string(b), nil +} + +// PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This +// behavior is the same as runc's. +func PrintPIDsJSON(pl []*Process) (string, error) { + pids := make([]kernel.ThreadID, 0, len(pl)) + for _, d := range pl { + pids = append(pids, d.PID) + } + b, err := json.Marshal(pids) + if err != nil { + return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err) + } + return string(b), nil +} + +// Processes retrieves information about processes running in the sandbox. +func Processes(k *kernel.Kernel, out *[]*Process) error { + ts := k.TaskSet() + now := k.RealtimeClock().Now() + for _, tg := range ts.Root.ThreadGroups() { + pid := ts.Root.IDOfThreadGroup(tg) + // If tg has already been reaped ignore it. + if pid == 0 { + continue + } + + *out = append(*out, &Process{ + UID: tg.Leader().Credentials().EffectiveKUID, + PID: pid, + // If Parent is null (i.e. tg is the init process), PPID will be 0. 
+ PPID: ts.Root.IDOfTask(tg.Leader().Parent()), + STime: formatStartTime(now, tg.Leader().StartTime()), + C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), + Time: tg.CPUStats().SysTime.String(), + Cmd: tg.Leader().Name(), + }) + } + return nil +} + +// formatStartTime formats startTime depending on the current time: +// - If startTime was today, HH:MM is used. +// - If startTime was not today but was this year, MonDD is used (e.g. Jan02) +// - If startTime was not this year, the year is used. +func formatStartTime(now, startTime ktime.Time) string { + nowS, nowNs := now.Unix() + n := time.Unix(nowS, nowNs) + startTimeS, startTimeNs := startTime.Unix() + st := time.Unix(startTimeS, startTimeNs) + format := "15:04" + if st.YearDay() != n.YearDay() { + format = "Jan02" + } + if st.Year() != n.Year() { + format = "2006" + } + return st.Format(format) +} + +func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { + // Note: In procps, there is an option to include child CPU stats. As + // it is disabled by default, we do not include them. + total := stats.UserTime + stats.SysTime + lifetime := now.Sub(startTime) + if lifetime <= 0 { + return 0 + } + percentCPU := total * 100 / lifetime + // Cap at 99% since procps does the same. + if percentCPU > 99 { + percentCPU = 99 + } + return int32(percentCPU) +} diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go new file mode 100644 index 000000000..18286496f --- /dev/null +++ b/pkg/sentry/control/proc_test.go @@ -0,0 +1,164 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/log" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +func init() { + log.SetLevel(log.Debug) +} + +// Tests that ProcessData.Table() prints with the correct format. +func TestProcessListTable(t *testing.T) { + testCases := []struct { + pl []*Process + expected string + }{ + { + pl: []*Process{}, + expected: "UID PID PPID C STIME TIME CMD", + }, + { + pl: []*Process{ + &Process{ + UID: 0, + PID: 0, + PPID: 0, + C: 0, + STime: "0", + Time: "0", + Cmd: "zero", + }, + &Process{ + UID: 1, + PID: 1, + PPID: 1, + C: 1, + STime: "1", + Time: "1", + Cmd: "one", + }, + }, + expected: `UID PID PPID C STIME TIME CMD +0 0 0 0 0 0 zero +1 1 1 1 1 1 one`, + }, + } + + for _, tc := range testCases { + output := ProcessListToTable(tc.pl) + + if tc.expected != output { + t.Errorf("PrintTable(%v): got:\n%s\nwant:\n%s", tc.pl, output, tc.expected) + } + } +} + +func TestProcessListJSON(t *testing.T) { + testCases := []struct { + pl []*Process + expected string + }{ + { + pl: []*Process{}, + expected: "[]", + }, + { + pl: []*Process{ + &Process{ + UID: 0, + PID: 0, + PPID: 0, + C: 0, + STime: "0", + Time: "0", + Cmd: "zero", + }, + &Process{ + UID: 1, + PID: 1, + PPID: 1, + C: 1, + STime: "1", + Time: "1", + Cmd: "one", + }, + }, + expected: "[0,1]", + }, + } + + for _, tc := range testCases { + output, err := PrintPIDsJSON(tc.pl) + if err != nil { + t.Errorf("failed to generate JSON: %v", err) + } + + if 
tc.expected != output { + t.Errorf("PrintJSON(%v): got:\n%s\nwant:\n%s", tc.pl, output, tc.expected) + } + } +} + +func TestPercentCPU(t *testing.T) { + testCases := []struct { + stats usage.CPUStats + startTime ktime.Time + now ktime.Time + expected int32 + }{ + { + // Verify that 100% use is capped at 99. + stats: usage.CPUStats{UserTime: 1e9, SysTime: 1e9}, + startTime: ktime.FromNanoseconds(7e9), + now: ktime.FromNanoseconds(9e9), + expected: 99, + }, + { + // Verify that if usage > lifetime, we get at most 99% + // usage. + stats: usage.CPUStats{UserTime: 2e9, SysTime: 2e9}, + startTime: ktime.FromNanoseconds(7e9), + now: ktime.FromNanoseconds(9e9), + expected: 99, + }, + { + // Verify that 50% usage is reported correctly. + stats: usage.CPUStats{UserTime: 1e9, SysTime: 1e9}, + startTime: ktime.FromNanoseconds(12e9), + now: ktime.FromNanoseconds(16e9), + expected: 50, + }, + { + // Verify that 0% usage is reported correctly. + stats: usage.CPUStats{UserTime: 0, SysTime: 0}, + startTime: ktime.FromNanoseconds(12e9), + now: ktime.FromNanoseconds(14e9), + expected: 0, + }, + } + + for _, tc := range testCases { + if pcpu := percentCPU(tc.stats, tc.startTime, tc.now); pcpu != tc.expected { + t.Errorf("percentCPU(%v, %v, %v): got %d, want %d", tc.stats, tc.startTime, tc.now, pcpu, tc.expected) + } + } +} diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD new file mode 100644 index 000000000..1a8b461ba --- /dev/null +++ b/pkg/sentry/device/BUILD @@ -0,0 +1,18 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "device", + srcs = ["device.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/device", + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/abi/linux"], +) + +go_test( + name = "device_test", + size = "small", + srcs = ["device_test.go"], + embed = [":device"], +) diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go new file 
mode 100644 index 000000000..a5514c72f --- /dev/null +++ b/pkg/sentry/device/device.go @@ -0,0 +1,193 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package device defines reserved virtual kernel devices and structures +// for managing them. +// +// Saving and restoring devices is not necessary if the devices are initialized +// as package global variables. Package initialization happens in a single goroutine +// and in a deterministic order, so minor device numbers will be assigned in the +// same order as packages are loaded. +package device + +import ( + "bytes" + "fmt" + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// ID identifies a device. +type ID struct { + Major uint64 + Minor uint64 +} + +// DeviceID formats a major and minor device number into a standard device number. +func (i *ID) DeviceID() uint64 { + return uint64(linux.MakeDeviceID(uint16(i.Major), uint32(i.Minor))) +} + +// nextAnonDeviceMinor is the next minor number for a new anonymous device. +// Must be accessed atomically. +var nextAnonDeviceMinor uint64 + +// NewAnonDevice creates a new anonymous device. 
Packages that require an anonymous +// device should initialize the device in a global variable in a file called device.go: +// +// var myDevice = device.NewAnonDevice() +func NewAnonDevice() *Device { + return &Device{ + ID: newAnonID(), + } +} + +// NewAnonMultiDevice creates a new multi-keyed anonymous device. Packages that require +// a multi-key anonymous device should initialize the device in a global variable in a +// file called device.go: +// +// var myDevice = device.NewAnonMultiDevice() +func NewAnonMultiDevice() *MultiDevice { + return &MultiDevice{ + ID: newAnonID(), + } +} + +// newAnonID assigns a major and minor number to an anonymous device ID. +func newAnonID() ID { + return ID{ + // Anon devices always have a major number of 0. + Major: 0, + // Use the next minor number. + Minor: atomic.AddUint64(&nextAnonDeviceMinor, 1), + } +} + +// Device is a simple virtual kernel device. +type Device struct { + ID + + // last is the last generated inode. + last uint64 +} + +// NextIno generates a new inode number +func (d *Device) NextIno() uint64 { + return atomic.AddUint64(&d.last, 1) +} + +// MultiDeviceKey provides a hashable key for a MultiDevice. The key consists +// of a raw device and inode for a resource, which must consistently identify +// the unique resource. It may optionally include a secondary device if +// appropriate. +// +// Note that using the path is not enough, because filesystems may rename a file +// to a different backing resource, at which point the path points to a different +// entity. Using only the inode is also not enough because the inode is assumed +// to be unique only within the device on which the resource exists. +type MultiDeviceKey struct { + Device uint64 + SecondaryDevice string + Inode uint64 +} + +// String stringifies the key. 
+func (m MultiDeviceKey) String() string { + return fmt.Sprintf("key{device: %d, sdevice: %s, inode: %d}", m.Device, m.SecondaryDevice, m.Inode) +} + +// MultiDevice allows for remapping resources that come from a variety of raw +// devices into a single device. The device ID should be one of the static +// Device IDs above and cannot be reused. +type MultiDevice struct { + ID + + mu sync.Mutex + last uint64 + cache map[MultiDeviceKey]uint64 + rcache map[uint64]MultiDeviceKey +} + +// String stringifies MultiDevice. +func (m *MultiDevice) String() string { + buf := bytes.NewBuffer(nil) + buf.WriteString("cache{") + for k, v := range m.cache { + buf.WriteString(fmt.Sprintf("%s -> %d, ", k, v)) + } + buf.WriteString("}") + return buf.String() +} + +// Map maps a raw device and inode into the inode space of MultiDevice, +// returning a virtualized inode. Raw devices and inodes can be reused; +// in this case, the same virtual inode will be returned. +func (m *MultiDevice) Map(key MultiDeviceKey) uint64 { + m.mu.Lock() + defer m.mu.Unlock() + + if m.cache == nil { + m.cache = make(map[MultiDeviceKey]uint64) + m.rcache = make(map[uint64]MultiDeviceKey) + } + + id, ok := m.cache[key] + if ok { + return id + } + // Step over reserved entries that may have been loaded. + idx := m.last + 1 + for { + if _, ok := m.rcache[idx]; !ok { + break + } + idx++ + } + // We found a non-reserved entry, use it. + m.last = idx + m.cache[key] = m.last + m.rcache[m.last] = key + return m.last +} + +// Load loads a raw device and inode into MultiDevice inode mappings +// with value as the virtual inode. +// +// By design, inodes start from 1 and continue until max uint64. This means +// that the zero value, which is often the uninitialized value, can be rejected +// as invalid. +func (m *MultiDevice) Load(key MultiDeviceKey, value uint64) bool { + // Reject the uninitialized value; see comment above. 
+ if value == 0 { + return false + } + + m.mu.Lock() + defer m.mu.Unlock() + + if m.cache == nil { + m.cache = make(map[MultiDeviceKey]uint64) + m.rcache = make(map[uint64]MultiDeviceKey) + } + + // Cache value at key. + m.cache[key] = value + + // Prevent value from being used by new inode mappings. + m.rcache[value] = key + + return true +} diff --git a/pkg/sentry/device/device_test.go b/pkg/sentry/device/device_test.go new file mode 100644 index 000000000..dfec45046 --- /dev/null +++ b/pkg/sentry/device/device_test.go @@ -0,0 +1,59 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package device + +import ( + "testing" +) + +func TestMultiDevice(t *testing.T) { + device := &MultiDevice{} + + // Check that Load fails to install virtual inodes that are + // uninitialized. + if device.Load(MultiDeviceKey{}, 0) { + t.Fatalf("got load of invalid virtual inode 0, want unsuccessful") + } + + inode := device.Map(MultiDeviceKey{}) + + // Assert that the same raw device and inode map to + // a consistent virtual inode. + if i := device.Map(MultiDeviceKey{}); i != inode { + t.Fatalf("got inode %d, want %d in %s", i, inode, device) + } + + // Assert that a new inode or new device does not conflict. 
+ if i := device.Map(MultiDeviceKey{Device: 0, Inode: 1}); i == inode { + t.Fatalf("got reused inode %d, want new distinct inode in %s", i, device) + } + last := device.Map(MultiDeviceKey{Device: 1, Inode: 0}) + if last == inode { + t.Fatalf("got reused inode %d, want new distinct inode in %s", last, device) + } + + // Virtual is the virtual inode we want to load. + virtual := last + 1 + + // Assert that we can load a virtual inode at a new place. + if !device.Load(MultiDeviceKey{Device: 0, Inode: 2}, virtual) { + t.Fatalf("got load of virtual inode %d failed, want success in %s", virtual, device) + } + + // Assert that the next inode skips over the loaded one. + if i := device.Map(MultiDeviceKey{Device: 0, Inode: 3}); i != virtual+1 { + t.Fatalf("got inode %d, want %d in %s", i, virtual+1, device) + } +} diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD new file mode 100644 index 000000000..9b7264753 --- /dev/null +++ b/pkg/sentry/fs/BUILD @@ -0,0 +1,154 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "fs_state", + srcs = [ + "attr.go", + "dentry.go", + "dirent.go", + "dirent_cache.go", + "dirent_list.go", + "dirent_state.go", + "file.go", + "file_overlay.go", + "file_state.go", + "filesystems.go", + "flags.go", + "inode.go", + "inode_inotify.go", + "inode_operations.go", + "inode_overlay.go", + "inotify.go", + "inotify_event.go", + "inotify_watch.go", + "mock.go", + "mount.go", + "mount_overlay.go", + "mount_state.go", + "mounts.go", + "overlay.go", + "path.go", + ], + out = "fs_state.go", + package = "fs", +) + +go_library( + name = "fs", + srcs = [ + "attr.go", + "context.go", + "copy_up.go", + "dentry.go", + "dirent.go", + "dirent_cache.go", + "dirent_list.go", + "dirent_state.go", + "file.go", + "file_operations.go", + "file_overlay.go", + 
"file_state.go", + "filesystems.go", + "flags.go", + "fs.go", + "fs_state.go", + "inode.go", + "inode_inotify.go", + "inode_operations.go", + "inode_overlay.go", + "inotify.go", + "inotify_event.go", + "inotify_watch.go", + "mock.go", + "mount.go", + "mount_overlay.go", + "mount_state.go", + "mounts.go", + "offset.go", + "overlay.go", + "path.go", + "restore.go", + "save.go", + "seek.go", + "sync.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/ilist", + "//pkg/log", + "//pkg/p9", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_template_instance( + name = "dirent_list", + out = "dirent_list.go", + package = "fs", + prefix = "dirent", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Dirent", + }, +) + +go_test( + name = "fs_x_test", + size = "small", + srcs = [ + "copy_up_test.go", + "file_overlay_test.go", + "inode_overlay_test.go", + "mounts_test.go", + ], + deps = [ + ":fs", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/usermem", + "//pkg/syserror", + ], +) + +go_test( + name = "fs_test", + size = "small", + srcs = [ + "dirent_cache_test.go", + "dirent_refs_test.go", + "file_test.go", + "mount_test.go", + "path_test.go", + ], + embed = [":fs"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + ], +) diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md new file mode 100644 index 000000000..898271ee8 --- 
/dev/null +++ b/pkg/sentry/fs/README.md @@ -0,0 +1,217 @@ +This package provides an implementation of the Linux virtual filesystem. + +[TOC] + +## Overview + +- An `fs.Dirent` caches an `fs.Inode` in memory at a path in the VFS, giving + the `fs.Inode` a relative position with respect to other `fs.Inode`s. + +- If an `fs.Dirent` is referenced by two file descriptors, then those file + descriptors are coherent with each other: they depend on the same + `fs.Inode`. + +- A mount point is an `fs.Dirent` for which `fs.Dirent.mounted` is true. It + exposes the root of a mounted filesystem. + +- The `fs.Inode` produced by a registered filesystem on mount(2) owns an + `fs.MountedFilesystem` from which other `fs.Inode`s will be looked up. For a + remote filesystem, the `fs.MountedFilesystem` owns the connection to that + remote filesystem. + +- In general: + +``` +fs.Inode <------------------------------ +| | +| | +produced by | +exactly one | +| responsible for the +| virtual identity of +v | +fs.MountedFilesystem ------------------- +``` + +Glossary: + +- VFS: virtual filesystem. + +- inode: a virtual file object holding a cached view of a file on a backing + filesystem (includes metadata and page caches). + +- superblock: the virtual state of a mounted filesystem (e.g. the virtual + inode number set). + +- mount namespace: a view of the mounts under a root (during path traversal, + the VFS makes visible/follows the mount point that is in the current task's + mount namespace). + +## Save and restore + +An application's hard dependencies on filesystem state can be broken down into +two categories: + +- The state necessary to execute a traversal on or view the *virtual* + filesystem hierarchy, regardless of what files an application has open. + +- The state necessary to represent open files. + +The first is always necessary to save and restore. 
An application may never have +any open file descriptors, but across save and restore it should see a coherent +view of any mount namespace. NOTE: Currently only one "initial" +mount namespace is supported. + +The second is so that system calls across save and restore are coherent with +each other (e.g. so that unintended re-reads or overwrites do not occur). + +Specifically this state is: + +- An `fs.MountManager` containing mount points. + +- A `kernel.FDMap` containing pointers to open files. + +Anything else managed by the VFS that can be easily loaded into memory from a +filesystem is synced back to those filesystems and is no saved. Examples are +pages in page caches used for optimizations (i.e. readahead and writeback), and +directory entries used to accelerate path lookups. + +### Mount points + +Saving and restoring a mount point means saving and restoring: + +- The root of the mounted filesystem. + +- Mount flags, which control how the VFS interacts with the mounted + filesystem. + +- Any relevant metadata about the mounted filesystem. + +- All `fs.Inode`s referenced by the application that reside under the mount + point. + +`fs.MountedFilesystem` is metadata about a filesystem that is mounted. It is +referenced by every `fs.Inode` loaded into memory under the mount point +including the `fs.Inode` of the mount point itself. The `fs.MountedFilesystem` +maps file objects on the filesystem to a virtualized `fs.Inode` number and vice +versa. + +To restore all `fs.Inode`s under a given mount point, each `fs.Inode` leverages +its dependency on an `fs.MountedFilesystem`. Since the `fs.MountedFilesystem` +knows how an `fs.Inode` maps to a file object on a backing filesystem, this +mapping can be trivially consulted by each `fs.Inode` when the `fs.Inode` is +restored. 
+ +In detail, a mount point is saved in two steps: + +- First, after the kernel is paused but before state.Save, we walk all mount + namespaces and install a mapping from `fs.Inode` numbers to file paths + relative to the root of the mounted filesystem in each + `fs.MountedFilesystem`. This is subsequently called the set of `fs.Inode` + mappings. + +- Second, during state.Save, each `fs.MountedFilesystem` decides whether to + save the set of `fs.Inode` mappings. In-memory filesystems, like tmpfs, have + no need to save a set of `fs.Inode` mappings, since the `fs.Inode`s can be + entirely encoded in state file. Each `fs.MountedFilesystem` also optionally + saves the device name from when the filesystem was originally mounted. Each + `fs.Inode` saves its virtual identifier and a reference to a + `fs.MountedFilesystem`. + +A mount point is restored in two steps: + +- First, before state.Load, all mount configurations are stored in a global + `fs.RestoreEnvironment`. This tells us what mount points the user wants to + restore and how to re-establish pointers to backing filesystems. + +- Second, during state.Load, each `fs.MountedFilesystem` optionally searches + for a mount in the `fs.RestoreEnvironment` that matches its saved device + name. The `fs.MountedFilesystem` then restablishes a pointer to the root of + the mounted filesystem. For example, the mount specification provides the + network connection for a mounted remote filesystem client to communicate + with its remote file server. The `fs.MountedFilesystem` also trivially loads + its set of `fs.Inode` mappings. When an `fs.Inode` is encountered, the + `fs.Inode` loads its virtual identifier and its reference a + `fs.MountedFilesystem`. It uses the `fs.MountedFilesystem` to obtain the + root of the mounted filesystem and the `fs.Inode` mappings to obtain the + relative file path to its data. With these, the `fs.Inode` re-establishes a + pointer to its file object. 
+ +A mount point can trivially restore its `fs.Inode`s in parallel since +`fs.Inode`s have a restore dependency on their `fs.MountedFilesystem` and not on +each other. + +### Open files + +An `fs.File` references the following filesystem objects: + +```go +fs.File -> fs.Dirent -> fs.Inode -> fs.MountedFilesystem +``` + +The `fs.Inode` is restored using its `fs.MountedFilesystem`. The [Mount +points](#mount-points) section above describes how this happens in detail. The +`fs.Dirent` restores its pointer to an `fs.Inode`, pointers to parent and +children `fs.Dirents`, and the basename of the file. + +Otherwise an `fs.File` restores flags, an offset, and a unique identifier (only +used internally). + +It may use the `fs.Inode`, which it indirectly holds a reference on through the +`fs.Dirent`, to restablish an open file handle on the backing filesystem (e.g. +to continue reading and writing). + +## Overlay + +The overlay implementation in the fs package takes Linux overlayfs as a frame of +reference but corrects for several POSIX consistency errors. + +In Linux overlayfs, the `struct inode` used for reading and writing to the same +file may be different. This is because the `struct inode` is dissociated with +the process of copying up the file from the upper to the lower directory. Since +flock(2) and fcntl(2) locks, inotify(7) watches, page caches, and a file's +identity are all stored directly or indirectly off the `struct inode`, these +properties of the `struct inode` may be stale after the first modification. This +can lead to file locking bugs, missed inotify events, and inconsistent data in +shared memory mappings of files, to name a few problems. + +The fs package maintains a single `fs.Inode` to represent a directory entry in +an overlay and defines operations on this `fs.Inode` which synchronize with the +copy up process. This achieves several things: + ++ File locks, inotify watches, and the identity of the file need not be copied + at all. 
+ ++ Memory mappings of files coordinate with the copy up process so that if a + file in the lower directory is memory mapped, all references to it are + invalidated, forcing the application to re-fault on memory mappings of the + file under the upper directory. + +The `fs.Inode` holds metadata about files in the upper and/or lower directories +via an `fs.overlayEntry`. The `fs.overlayEntry` implements the `fs.Mappable` +interface. It multiplexes between upper and lower directory memory mappings and +stores a copy of memory references so they can be transferred to the upper +directory `fs.Mappable` when the file is copied up. + +The `fs.Inode` also holds a reference to a `fs.MountedFilesystem` that +normalizes across the mounted filesystem state of the upper and lower +directories. + +When a file is copied from the lower to the upper directory, attempts to +interact with the file block until the copy completes. All copying synchronizes +with rename(2). + +## Future Work + +### Overlay + +When a file is copied from a lower directory to an upper directory, several +locks are taken: the global renamuMu and the copyMu of the `fs.Inode` being +copied. This blocks operations on the file, including fault handling of memory +mappings. Performance could be improved by copying files into a temporary +directory that resides on the same filesystem as the upper directory and doing +an atomic rename, holding locks only during the rename operation. + +Additionally files are copied up synchronously. For large files, this causes a +noticeable latency. Performance could be improved by pipelining copies at +non-overlapping file offsets. 
diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD new file mode 100644 index 000000000..6b18aee47 --- /dev/null +++ b/pkg/sentry/fs/anon/BUILD @@ -0,0 +1,21 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "anon", + srcs = [ + "anon.go", + "device.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go new file mode 100644 index 000000000..ddc2c0985 --- /dev/null +++ b/pkg/sentry/fs/anon/anon.go @@ -0,0 +1,46 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package anon implements an anonymous inode, useful for implementing +// inodes for pseudo filesystems. +package anon + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NewInode constructs an anonymous Inode that is not associated +// with any real filesystem. Some types depend on completely pseudo +// "anon" inodes (eventfds, epollfds, etc). 
+func NewInode(ctx context.Context) *fs.Inode { + return fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ + FSType: linux.ANON_INODE_FS_MAGIC, + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.Anonymous, + DeviceID: PseudoDevice.DeviceID(), + InodeID: PseudoDevice.NextIno(), + BlockSize: usermem.PageSize, + }) +} diff --git a/pkg/sentry/fs/anon/device.go b/pkg/sentry/fs/anon/device.go new file mode 100644 index 000000000..1c666729c --- /dev/null +++ b/pkg/sentry/fs/anon/device.go @@ -0,0 +1,22 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package anon + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// PseudoDevice is the device on which all anonymous inodes reside. 
+var PseudoDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD new file mode 100644 index 000000000..e20e22a0f --- /dev/null +++ b/pkg/sentry/fs/ashmem/BUILD @@ -0,0 +1,83 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_stateify( + name = "ashmem_state", + srcs = [ + "area.go", + "device.go", + "pin_board.go", + "uint64_range.go", + "uint64_set.go", + ], + out = "ashmem_state.go", + package = "ashmem", +) + +go_library( + name = "ashmem", + srcs = [ + "area.go", + "ashmem_state.go", + "device.go", + "pin_board.go", + "uint64_range.go", + "uint64_set.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ashmem", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + ], +) + +go_test( + name = "ashmem_test", + size = "small", + srcs = ["pin_board_test.go"], + embed = [":ashmem"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/usermem", + ], +) + +go_template_instance( + name = "uint64_range", + out = "uint64_range.go", + package = "ashmem", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "uint64_set", + out = "uint64_set.go", + package = "ashmem", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "Range", + "Value": "noValue", + "Functions": "setFunctions", + }, +) diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go new file mode 
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ashmem

import (
	"sync"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

const (
	// namePrefix is the name prefix assumed and forced by the Linux implementation.
	namePrefix = "dev/ashmem"

	// nameLen is the maximum name length.
	nameLen = 256
)

// Area implements fs.FileOperations for an open ashmem region. The region is
// lazily backed by an in-memory tmpfs file, created on first mmap.
type Area struct {
	fsutil.NoFsync
	fsutil.DeprecatedFileOperations
	fsutil.NotDirReaddir

	// ad is the device this area was opened from.
	ad *Device

	// mu protects fields below.
	mu sync.Mutex `state:"nosave"`
	// tmpfsFile is the backing file; nil until the area is first mapped.
	tmpfsFile *fs.File
	// name is the name set via ASHMEM_SET_NAME (empty until set).
	name string
	// size is the size set via ASHMEM_SET_SIZE; 0 means not yet sized.
	size uint64
	// perms is the current (narrowable) protection mask.
	perms usermem.AccessType
	// pb tracks pinned/unpinned ranges; created together with tmpfsFile.
	pb *PinBoard
}

// Release implements fs.FileOperations.Release.
+func (a *Area) Release() { + a.mu.Lock() + defer a.mu.Unlock() + if a.tmpfsFile != nil { + a.tmpfsFile.DecRef() + a.tmpfsFile = nil + } +} + +// Seek implements fs.FileOperations.Seek. +func (a *Area) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + a.mu.Lock() + defer a.mu.Unlock() + if a.size == 0 { + return 0, syserror.EINVAL + } + if a.tmpfsFile == nil { + return 0, syserror.EBADF + } + return a.tmpfsFile.FileOperations.Seek(ctx, file, whence, offset) +} + +// Read implements fs.FileOperations.Read. +func (a *Area) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + a.mu.Lock() + defer a.mu.Unlock() + if a.size == 0 { + return 0, nil + } + if a.tmpfsFile == nil { + return 0, syserror.EBADF + } + return a.tmpfsFile.FileOperations.Read(ctx, file, dst, offset) +} + +// Write implements fs.FileOperations.Write. +func (a *Area) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + return 0, syserror.ENOSYS +} + +// Flush implements fs.FileOperations.Flush. +func (a *Area) Flush(ctx context.Context, file *fs.File) error { + return nil +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + a.mu.Lock() + defer a.mu.Unlock() + if a.size == 0 { + return syserror.EINVAL + } + + if !a.perms.SupersetOf(opts.Perms) { + return syserror.EPERM + } + opts.MaxPerms = opts.MaxPerms.Intersect(a.perms) + + if a.tmpfsFile == nil { + p := platform.FromContext(ctx) + if p == nil { + return syserror.ENOMEM + } + tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}, p) + // This is not backed by a real filesystem, so we pass in nil. 
+ tmpfsInode := fs.NewInode(tmpfsInodeOps, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{}) + dirent := fs.NewDirent(tmpfsInode, namePrefix+"/"+a.name) + tmpfsFile, err := tmpfsInode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}) + // Drop the extra reference on the Dirent. + dirent.DecRef() + + if err != nil { + return err + } + + // Truncate to the size set by ASHMEM_SET_SIZE ioctl. + err = tmpfsInodeOps.Truncate(ctx, tmpfsInode, int64(a.size)) + if err != nil { + return err + } + a.tmpfsFile = tmpfsFile + a.pb = NewPinBoard() + } + + return a.tmpfsFile.ConfigureMMap(ctx, opts) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (a *Area) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch args[1].Uint() { + case linux.AshmemSetNameIoctl: + name, err := usermem.CopyStringIn(ctx, io, args[2].Pointer(), nameLen-1, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + a.mu.Lock() + defer a.mu.Unlock() + + // Cannot set name for already mapped ashmem. + if a.tmpfsFile != nil { + return 0, syserror.EINVAL + } + a.name = name + return 0, nil + + case linux.AshmemGetNameIoctl: + a.mu.Lock() + var local []byte + if a.name != "" { + nameLen := len([]byte(a.name)) + local = make([]byte, nameLen, nameLen+1) + copy(local, []byte(a.name)) + local = append(local, 0) + } else { + nameLen := len([]byte(namePrefix)) + local = make([]byte, nameLen, nameLen+1) + copy(local, []byte(namePrefix)) + local = append(local, 0) + } + a.mu.Unlock() + + if _, err := io.CopyOut(ctx, args[2].Pointer(), local, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, syserror.EFAULT + } + return 0, nil + + case linux.AshmemSetSizeIoctl: + a.mu.Lock() + defer a.mu.Unlock() + + // Cannot set size for already mapped ashmem. 
+ if a.tmpfsFile != nil { + return 0, syserror.EINVAL + } + a.size = uint64(args[2].SizeT()) + return 0, nil + + case linux.AshmemGetSizeIoctl: + return uintptr(a.size), nil + + case linux.AshmemPinIoctl, linux.AshmemUnpinIoctl, linux.AshmemGetPinStatusIoctl: + // Locking and unlocking is ok since once tmpfsFile is set, it won't be nil again + // even after unmapping! Unlocking is needed in order to avoid a deadlock on + // usermem.CopyObjectIn. + + // Cannot execute pin-related ioctls before mapping. + a.mu.Lock() + if a.tmpfsFile == nil { + a.mu.Unlock() + return 0, syserror.EINVAL + } + a.mu.Unlock() + + var pin linux.AshmemPin + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pin, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, syserror.EFAULT + } + + a.mu.Lock() + defer a.mu.Unlock() + return a.pinOperation(pin, args[1].Uint()) + + case linux.AshmemPurgeAllCachesIoctl: + return 0, nil + + case linux.AshmemSetProtMaskIoctl: + prot := uint64(args[2].ModeT()) + perms := usermem.AccessType{ + Read: prot&linux.PROT_READ != 0, + Write: prot&linux.PROT_WRITE != 0, + Execute: prot&linux.PROT_EXEC != 0, + } + + a.mu.Lock() + defer a.mu.Unlock() + + // Can only narrow prot mask. + if !a.perms.SupersetOf(perms) { + return 0, syserror.EINVAL + } + + // TODO: If personality flag + // READ_IMPLIES_EXEC is set, set PROT_EXEC if PORT_READ is set. + + a.perms = perms + return 0, nil + + case linux.AshmemGetProtMaskIoctl: + return uintptr(a.perms.Prot()), nil + default: + // Ioctls irrelevant to Ashmem. + return 0, syserror.EINVAL + } +} + +// pinOperation should only be called while holding a.mu. +func (a *Area) pinOperation(pin linux.AshmemPin, op uint32) (uintptr, error) { + // Page-align a.size for checks. + pageAlignedSize, ok := usermem.Addr(a.size).RoundUp() + if !ok { + return 0, syserror.EINVAL + } + // Len 0 means everything onward. 
+ if pin.Len == 0 { + pin.Len = uint32(pageAlignedSize) - pin.Offset + } + // Both Offset and Len have to be page-aligned. + if pin.Offset%uint32(usermem.PageSize) != 0 { + return 0, syserror.EINVAL + } + if pin.Len%uint32(usermem.PageSize) != 0 { + return 0, syserror.EINVAL + } + // Adding Offset and Len must not cause an uint32 overflow. + if end := pin.Offset + pin.Len; end < pin.Offset { + return 0, syserror.EINVAL + } + // Pin range must not exceed a's size. + if uint32(pageAlignedSize) < pin.Offset+pin.Len { + return 0, syserror.EINVAL + } + // Handle each operation. + r := RangeFromAshmemPin(pin) + switch op { + case linux.AshmemPinIoctl: + if a.pb.PinRange(r) { + return linux.AshmemWasPurged, nil + } + return linux.AshmemNotPurged, nil + + case linux.AshmemUnpinIoctl: + // TODO: Implement purge on unpin. + a.pb.UnpinRange(r) + return 0, nil + + case linux.AshmemGetPinStatusIoctl: + if a.pb.RangePinnedStatus(r) { + return linux.AshmemIsPinned, nil + } + return linux.AshmemIsUnpinned, nil + + default: + panic("unreachable") + } + +} diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go new file mode 100644 index 000000000..c5b51d4a7 --- /dev/null +++ b/pkg/sentry/fs/ashmem/device.go @@ -0,0 +1,169 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ashmem implements Android ashmem module (Anonymus Shared Memory). 
// Device implements fs.InodeOperations for the ashmem device inode. Each
// GetFile call produces an independent Area (one mapping/size/prot state
// per open file).
type Device struct {
	fsutil.DeprecatedFileOperations
	fsutil.InodeNoExtendedAttributes
	fsutil.InodeNotDirectory
	fsutil.InodeNotRenameable
	fsutil.InodeNotSocket
	fsutil.InodeNotSymlink
	fsutil.NoFsync
	fsutil.NoMappable
	fsutil.NoopWriteOut
	fsutil.NotDirReaddir

	// mu protects unstable.
	mu       sync.Mutex `state:"nosave"`
	unstable fs.UnstableAttr
}

// NewDevice creates and initializes a Device structure with the given
// owner and permissions, and all timestamps set to the current time.
func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device {
	return &Device{
		unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{
			Owner: owner,
			Perms: fp,
			Links: 1,
		}),
	}
}

// Release implements fs.InodeOperations.Release. The device itself holds
// no releasable resources.
func (ad *Device) Release(context.Context) {}

// GetFile implements fs.InodeOperations.GetFile. The returned Area starts
// out unmapped (tmpfsFile == nil) with an unrestricted protection mask.
func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	return fs.NewFile(ctx, d, flags, &Area{
		ad:        ad,
		tmpfsFile: nil,
		perms:     usermem.AnyAccess,
	}), nil
}

// UnstableAttr implements fs.InodeOperations.UnstableAttr. It returns a
// copy of the attributes taken under the lock.
func (ad *Device) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	ad.mu.Lock()
	defer ad.mu.Unlock()
	return ad.unstable, nil
}

// Check implements fs.InodeOperations.Check; permission checking defers to
// the generic credential-based check.
func (ad *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
	return fs.ContextCanAccessFile(ctx, inode, p)
}
// SetPermissions implements fs.InodeOperations.SetPermissions and updates
// the status-change time.
func (ad *Device) SetPermissions(ctx context.Context, inode *fs.Inode, fp fs.FilePermissions) bool {
	ad.mu.Lock()
	defer ad.mu.Unlock()
	ad.unstable.Perms = fp
	ad.unstable.StatusChangeTime = time.NowFromContext(ctx)
	return true
}

// SetOwner implements fs.InodeOperations.SetOwner. Only the UID/GID fields
// that are present (Ok) are applied.
func (ad *Device) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
	ad.mu.Lock()
	defer ad.mu.Unlock()
	if owner.UID.Ok() {
		ad.unstable.Owner.UID = owner.UID
	}
	if owner.GID.Ok() {
		ad.unstable.Owner.GID = owner.GID
	}
	return nil
}

// SetTimestamps implements fs.InodeOperations.SetTimestamps. Each of
// atime/mtime is either skipped (Omit), set to "now" (SetSystemTime) or
// set to the provided value; any change also updates ctime.
func (ad *Device) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
	// Nothing to do if both timestamps are omitted.
	if ts.ATimeOmit && ts.MTimeOmit {
		return nil
	}

	ad.mu.Lock()
	defer ad.mu.Unlock()

	now := time.NowFromContext(ctx)
	if !ts.ATimeOmit {
		if ts.ATimeSetSystemTime {
			ad.unstable.AccessTime = now
		} else {
			ad.unstable.AccessTime = ts.ATime
		}
	}
	if !ts.MTimeOmit {
		if ts.MTimeSetSystemTime {
			ad.unstable.ModificationTime = now
		} else {
			ad.unstable.ModificationTime = ts.MTime
		}
	}
	ad.unstable.StatusChangeTime = now
	return nil
}

// Truncate implements fs.InodeOperations.Truncate.
//
// Ignored by ashmem.
func (ad *Device) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
	return nil
}

// AddLink implements fs.InodeOperations.AddLink.
//
// Ashmem doesn't support links, no-op.
func (ad *Device) AddLink() {}

// DropLink implements fs.InodeOperations.DropLink.
//
// Ashmem doesn't support links, no-op.
func (ad *Device) DropLink() {}

// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange;
// it refreshes mtime and ctime.
func (ad *Device) NotifyStatusChange(ctx context.Context) {
	ad.mu.Lock()
	defer ad.mu.Unlock()
	now := time.NowFromContext(ctx)
	ad.unstable.ModificationTime = now
	ad.unstable.StatusChangeTime = now
}
const maxUint64 = ^uint64(0)

// noValue is the (empty) value type attached to ranges in the set; the
// pin board uses the set purely for its keys.
type noValue struct{}

// setFunctions supplies the segment.Functions callbacks for a set keyed
// by uint64 that carries no values. See the build file and the segment
// set implementation at pkg/segment/set.go.
type setFunctions struct{}

// MinKey implements segment.Functions.MinKey.
func (setFunctions) MinKey() uint64 { return 0 }

// MaxKey implements segment.Functions.MaxKey.
func (setFunctions) MaxKey() uint64 { return maxUint64 }

// ClearValue implements segment.Functions.ClearValue. There is no value
// state to clear.
func (setFunctions) ClearValue(*noValue) {}
// Merge implements segment.Functions.Merge. Adjacent ranges always merge
// since there is no per-range value to reconcile.
func (setFunctions) Merge(Range, noValue, Range, noValue) (noValue, bool) {
	return noValue{}, true
}

// Split implements segment.Functions.Split. Both halves get the empty value.
func (setFunctions) Split(Range, noValue, uint64) (noValue, noValue) {
	return noValue{}, noValue{}
}

// PinBoard represents a set of pinned ranges in ashmem.
//
// segment.Set is used for implementation where segments represent
// ranges of pinned bytes, while gaps represent ranges of unpinned
// bytes. All ranges are page-aligned.
type PinBoard struct {
	Set
}

// NewPinBoard creates a new pin board with all pages pinned.
func NewPinBoard() *PinBoard {
	var pb PinBoard
	// Pin the entire key space so the board starts fully pinned.
	pb.PinRange(Range{0, maxUint64})
	return &pb
}

// PinRange pins all pages in the specified range and returns true
// if there are any newly pinned pages.
func (pb *PinBoard) PinRange(r Range) bool {
	pinnedPages := false
	// Walk the gaps (unpinned ranges) overlapping r and fill each
	// overlapping portion with a new pinned segment.
	for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; {
		common := gap.Range().Intersect(r)
		if common.Length() == 0 {
			gap = gap.NextGap()
			continue
		}
		pinnedPages = true
		// Insert invalidates gap; continue from the gap after the
		// newly inserted segment.
		gap = pb.Insert(gap, common, noValue{}).NextGap()
	}
	return pinnedPages
}

// UnpinRange unpins all pages in the specified range.
func (pb *PinBoard) UnpinRange(r Range) {
	// Walk the segments (pinned ranges) overlapping r and remove the
	// overlapping portions.
	for seg := pb.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; {
		common := seg.Range().Intersect(r)
		if common.Length() == 0 {
			seg = seg.NextSegment()
			continue
		}
		// RemoveRange invalidates seg; continue after the removed span.
		seg = pb.RemoveRange(common).NextSegment()
	}
}

// RangePinnedStatus returns false if there's at least one unpinned page in the
// specified range.
func (pb *PinBoard) RangePinnedStatus(r Range) bool {
	// Any gap overlapping r means at least one page in r is unpinned.
	for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; {
		common := gap.Range().Intersect(r)
		if common.Length() == 0 {
			gap = gap.NextGap()
			continue
		}
		return false
	}
	return true
}
+func RangeFromAshmemPin(ap linux.AshmemPin) Range { + if ap.Len == 0 { + return Range{ + uint64(ap.Offset), + maxUint64, + } + } + return Range{ + uint64(ap.Offset), + uint64(ap.Offset) + uint64(ap.Len), + } +} diff --git a/pkg/sentry/fs/ashmem/pin_board_test.go b/pkg/sentry/fs/ashmem/pin_board_test.go new file mode 100644 index 000000000..f4ea5de6d --- /dev/null +++ b/pkg/sentry/fs/ashmem/pin_board_test.go @@ -0,0 +1,130 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ashmem + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestPinBoard(t *testing.T) { + pb := NewPinBoard() + + // Confirm that all pages are pinned. + if !pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { + t.Errorf("RangePinnedStatus(all pages) returned false (unpinned) at start.") + } + + // Unpin pages [1, 11) (counting from 0) + pb.UnpinRange(RangeFromAshmemPin(linux.AshmemPin{ + usermem.PageSize, + usermem.PageSize * 10, + })) + + // Confirm that pages [1, 11) are unpinned and that page 0 and pages + // larger than 10 are pinned. 
+ pinned := []linux.AshmemPin{ + { + 0, + usermem.PageSize, + }, { + usermem.PageSize * 11, + 0, + }, + } + + for _, pin := range pinned { + if !pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { + t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned false (unpinned).", + pin.Offset, pin.Len) + } + } + + unpinned := []linux.AshmemPin{ + { + usermem.PageSize, + usermem.PageSize * 10, + }, + } + + for _, pin := range unpinned { + if pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { + t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned true (pinned).", + pin.Offset, pin.Len) + } + } + + // Pin pages [2, 6). + pb.PinRange(RangeFromAshmemPin(linux.AshmemPin{ + usermem.PageSize * 2, + usermem.PageSize * 4, + })) + + // Confirm that pages 0, [2, 6) and pages larger than 10 are pinned + // while others remain unpinned. + pinned = []linux.AshmemPin{ + { + 0, + usermem.PageSize, + }, + { + usermem.PageSize * 2, + usermem.PageSize * 4, + }, + { + usermem.PageSize * 11, + 0, + }, + } + + for _, pin := range pinned { + if !pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { + t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned false (unpinned).", + pin.Offset, pin.Len) + } + } + + unpinned = []linux.AshmemPin{ + { + usermem.PageSize, + usermem.PageSize, + }, { + usermem.PageSize * 6, + usermem.PageSize * 5, + }, + } + + for _, pin := range unpinned { + if pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { + t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned true (pinned).", + pin.Offset, pin.Len) + } + } + + // Status of a partially pinned range is unpinned. + if pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { + t.Errorf("RangePinnedStatus(all pages) returned true (pinned).") + } + + // Pin the whole range again. + pb.PinRange(RangeFromAshmemPin(linux.AshmemPin{0, 0})) + + // Confirm that all pages are pinned. 
// InodeType enumerates types of Inodes.
type InodeType int

const (
	// RegularFile is a regular file.
	RegularFile InodeType = iota

	// SpecialFile is a file that doesn't support SeekEnd. It is used for
	// things like proc files.
	SpecialFile

	// Directory is a directory.
	Directory

	// SpecialDirectory is a directory that *does* support SeekEnd. It's
	// the opposite of the SpecialFile scenario above. It similarly
	// supports proc files.
	SpecialDirectory

	// Symlink is a symbolic link.
	Symlink

	// Pipe is a pipe (named or regular).
	Pipe

	// Socket is a socket.
	Socket

	// CharacterDevice is a character device.
	CharacterDevice

	// BlockDevice is a block device.
	BlockDevice

	// Anonymous is an anonymous type when none of the above apply.
	// Epoll fds and event-driven fds fit this category.
	Anonymous
)

// inodeTypeNames maps each InodeType to its human-readable name. Both file
// variants render as "file" and both directory variants as "directory".
var inodeTypeNames = map[InodeType]string{
	RegularFile:      "file",
	SpecialFile:      "file",
	Directory:        "directory",
	SpecialDirectory: "directory",
	Symlink:          "symlink",
	Pipe:             "pipe",
	Socket:           "socket",
	CharacterDevice:  "character-device",
	BlockDevice:      "block-device",
	Anonymous:        "anonymous",
}

// String returns a human-readable representation of the InodeType.
func (n InodeType) String() string {
	if name, ok := inodeTypeNames[n]; ok {
		return name
	}
	return "unknown"
}

// StableAttr contains Inode attributes that will be stable throughout the
// lifetime of the Inode.
type StableAttr struct {
	// Type is the InodeType of a InodeOperations.
	Type InodeType

	// DeviceID is the device on which a InodeOperations resides.
	DeviceID uint64

	// InodeID uniquely identifies InodeOperations on its device.
	InodeID uint64

	// BlockSize is the block size of data backing this InodeOperations.
	BlockSize int64

	// DeviceFileMajor is the major device number of this Node, if it is a
	// device file.
	DeviceFileMajor uint16

	// DeviceFileMinor is the minor device number of this Node, if it is a
	// device file.
	DeviceFileMinor uint32
}

// IsRegular returns true if StableAttr.Type matches a regular file.
func IsRegular(s StableAttr) bool { return s.Type == RegularFile }

// IsFile returns true if StableAttr.Type matches any type of file.
func IsFile(s StableAttr) bool { return s.Type == RegularFile || s.Type == SpecialFile }

// IsDir returns true if StableAttr.Type matches any type of directory.
func IsDir(s StableAttr) bool { return s.Type == Directory || s.Type == SpecialDirectory }
// IsSymlink returns true if StableAttr.Type matches a symlink.
func IsSymlink(s StableAttr) bool {
	return s.Type == Symlink
}

// IsPipe returns true if StableAttr.Type matches any type of pipe.
func IsPipe(s StableAttr) bool {
	return s.Type == Pipe
}

// IsSocket returns true if StableAttr.Type matches any type of socket.
func IsSocket(s StableAttr) bool {
	return s.Type == Socket
}

// IsCharDevice returns true if StableAttr.Type matches a character device.
func IsCharDevice(s StableAttr) bool {
	return s.Type == CharacterDevice
}

// UnstableAttr contains Inode attributes that may change over the lifetime
// of the Inode.
type UnstableAttr struct {
	// Size is the file size in bytes.
	Size int64

	// Usage is the actual data usage in bytes.
	Usage int64

	// Perms is the protection (read/write/execute for user/group/other).
	Perms FilePermissions

	// Owner describes the ownership of this file.
	Owner FileOwner

	// AccessTime is the time of last access.
	AccessTime ktime.Time

	// ModificationTime is the time of last modification.
	ModificationTime ktime.Time

	// StatusChangeTime is the time of last attribute modification.
	StatusChangeTime ktime.Time

	// Links is the number of hard links.
	Links uint64
}

// WithCurrentTime returns u with AccessTime, ModificationTime and
// StatusChangeTime all set to the current time.
func WithCurrentTime(ctx context.Context, u UnstableAttr) UnstableAttr {
	t := ktime.NowFromContext(ctx)
	u.AccessTime = t
	u.ModificationTime = t
	u.StatusChangeTime = t
	return u
}

// AttrMask contains fields to mask StableAttr and UnstableAttr.
type AttrMask struct {
	Type             bool
	DeviceID         bool
	InodeID          bool
	BlockSize        bool
	Size             bool
	Usage            bool
	Perms            bool
	UID              bool
	GID              bool
	AccessTime       bool
	ModificationTime bool
	StatusChangeTime bool
	Links            bool
}

// Empty returns true if all fields in AttrMask are false.
func (a AttrMask) Empty() bool {
	return a == AttrMask{}
}
// PermMask are file access permissions.
type PermMask struct {
	// Read indicates reading is permitted.
	Read bool

	// Write indicates writing is permitted.
	Write bool

	// Execute indicates execution is permitted.
	Execute bool
}

// OnlyRead returns true when the mask is exactly read-only.
func (p PermMask) OnlyRead() bool {
	return p.Read && !p.Write && !p.Execute
}

// String implements the fmt.Stringer interface for PermMask.
func (p PermMask) String() string {
	return fmt.Sprintf("PermMask{Read: %v, Write: %v, Execute: %v}", p.Read, p.Write, p.Execute)
}

// Mode returns the system mode (syscall.S_IXOTH, etc.) for these permissions
// in the "other" bits.
func (p PermMask) Mode() os.FileMode {
	var mode os.FileMode
	if p.Read {
		mode |= syscall.S_IROTH
	}
	if p.Write {
		mode |= syscall.S_IWOTH
	}
	if p.Execute {
		mode |= syscall.S_IXOTH
	}
	return mode
}

// SupersetOf returns true iff the permissions in p are a superset of the
// permissions in other: every permission granted by other must also be
// granted by p.
func (p PermMask) SupersetOf(other PermMask) bool {
	return (p.Read || !other.Read) &&
		(p.Write || !other.Write) &&
		(p.Execute || !other.Execute)
}
// FilePermissions represents the permissions of a file, with
// Read/Write/Execute bits for user, group, and other.
type FilePermissions struct {
	User  PermMask
	Group PermMask
	Other PermMask

	// Sticky, if set on directories, restricts renaming and deletion of
	// files in those directories to the directory owner, file owner, or
	// CAP_FOWNER. The sticky bit is ignored when set on other files.
	Sticky bool

	// SetUID executables can call UID-setting syscalls without CAP_SETUID.
	SetUID bool

	// SetGID executables can call GID-setting syscalls without CAP_SETGID.
	SetGID bool
}

// PermsFromMode takes the Other permissions (last 3 bits) of a FileMode and
// returns a set of PermMask.
func PermsFromMode(mode linux.FileMode) (perms PermMask) {
	perms.Read = mode&linux.ModeOtherRead != 0
	perms.Write = mode&linux.ModeOtherWrite != 0
	perms.Execute = mode&linux.ModeOtherExec != 0
	return
}

// FilePermsFromP9 converts a p9.FileMode to a FilePermissions struct.
func FilePermsFromP9(mode p9.FileMode) FilePermissions {
	return FilePermsFromMode(linux.FileMode(mode))
}

// FilePermsFromMode converts a system file mode to a FilePermissions struct.
// Each 3-bit group (other, group, user) is shifted down into the "other"
// position and decoded by PermsFromMode.
func FilePermsFromMode(mode linux.FileMode) (fp FilePermissions) {
	perm := mode.Permissions()
	fp.Other = PermsFromMode(perm)
	fp.Group = PermsFromMode(perm >> 3)
	fp.User = PermsFromMode(perm >> 6)
	fp.Sticky = mode&linux.ModeSticky == linux.ModeSticky
	fp.SetUID = mode&linux.ModeSetUID == linux.ModeSetUID
	fp.SetGID = mode&linux.ModeSetGID == linux.ModeSetGID
	return
}

// LinuxMode returns the linux mode_t representation of these permissions,
// reassembling the user/group/other bit groups and the suid/sgid/sticky
// flags.
func (f FilePermissions) LinuxMode() linux.FileMode {
	m := linux.FileMode(f.User.Mode()<<6 | f.Group.Mode()<<3 | f.Other.Mode())
	if f.SetUID {
		m |= linux.ModeSetUID
	}
	if f.SetGID {
		m |= linux.ModeSetGID
	}
	if f.Sticky {
		m |= linux.ModeSticky
	}
	return m
}
// OSMode returns the Go runtime's OS independent os.FileMode representation of
// these permissions.
func (f FilePermissions) OSMode() os.FileMode {
	m := os.FileMode(f.User.Mode()<<6 | f.Group.Mode()<<3 | f.Other.Mode())
	if f.SetUID {
		m |= os.ModeSetuid
	}
	if f.SetGID {
		m |= os.ModeSetgid
	}
	if f.Sticky {
		m |= os.ModeSticky
	}
	return m
}

// AnyExecute returns true if any of U/G/O have the execute bit set.
func (f FilePermissions) AnyExecute() bool {
	return f.User.Execute || f.Group.Execute || f.Other.Execute
}

// AnyWrite returns true if any of U/G/O have the write bit set.
func (f FilePermissions) AnyWrite() bool {
	return f.User.Write || f.Group.Write || f.Other.Write
}

// AnyRead returns true if any of U/G/O have the read bit set.
func (f FilePermissions) AnyRead() bool {
	return f.User.Read || f.Group.Read || f.Other.Read
}

// FileOwner represents ownership of a file.
type FileOwner struct {
	UID auth.KUID
	GID auth.KGID
}

// RootOwner corresponds to KUID/KGID 0/0 (root).
var RootOwner = FileOwner{
	UID: auth.RootKUID,
	GID: auth.RootKGID,
}
"//pkg/tcpip/transport/unix", + ], +) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go new file mode 100644 index 000000000..3f87b6b08 --- /dev/null +++ b/pkg/sentry/fs/binder/binder.go @@ -0,0 +1,358 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package binder implements Android Binder IPC module. +package binder + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + currentProtocolVersion = 8 + + // mmapSizeLimit is the upper limit for mapped memory size in Binder. + mmapSizeLimit = 4 * 1024 * 1024 // 4MB +) + +// Device implements fs.InodeOperations. 
// Device implements fs.InodeOperations for the binder device inode.
type Device struct {
	fsutil.InodeNoExtendedAttributes
	fsutil.InodeNotDirectory
	fsutil.InodeNotRenameable
	fsutil.InodeNotSocket
	fsutil.InodeNotSymlink
	fsutil.NoMappable
	fsutil.NoopWriteOut
	fsutil.DeprecatedFileOperations

	// mu protects unstable.
	mu       sync.Mutex `state:"nosave"`
	unstable fs.UnstableAttr
}

// NewDevice creates and initializes a Device structure with the given
// owner and permissions, and all timestamps set to the current time.
func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device {
	return &Device{
		unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{
			Owner: owner,
			Perms: fp,
			Links: 1,
		}),
	}
}

// Release implements fs.InodeOperations.Release. The device itself holds
// no releasable resources.
func (bd *Device) Release(context.Context) {}

// GetFile implements fs.InodeOperations.GetFile. Each open produces a new
// Proc bound to the opening task and the current platform.
//
// TODO: Add functionality to GetFile: Additional fields will be
// needed in the Device structure, initialize them here. Also, Device will need
// to keep track of the created Procs in order to implement BINDER_READ_WRITE
// ioctl.
func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	return fs.NewFile(ctx, d, flags, &Proc{
		bd:       bd,
		task:     kernel.TaskFromContext(ctx),
		platform: platform.FromContext(ctx),
	}), nil
}

// UnstableAttr implements fs.InodeOperations.UnstableAttr. It returns a
// copy of the attributes taken under the lock.
func (bd *Device) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	bd.mu.Lock()
	defer bd.mu.Unlock()
	return bd.unstable, nil
}

// Check implements fs.InodeOperations.Check; permission checking defers to
// the generic credential-based check.
func (bd *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
	return fs.ContextCanAccessFile(ctx, inode, p)
}
// SetPermissions implements fs.InodeOperations.SetPermissions and updates
// the status-change time.
func (bd *Device) SetPermissions(ctx context.Context, inode *fs.Inode, fp fs.FilePermissions) bool {
	bd.mu.Lock()
	defer bd.mu.Unlock()
	bd.unstable.Perms = fp
	bd.unstable.StatusChangeTime = time.NowFromContext(ctx)
	return true
}

// SetOwner implements fs.InodeOperations.SetOwner. Only the UID/GID fields
// that are present (Ok) are applied.
func (bd *Device) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
	bd.mu.Lock()
	defer bd.mu.Unlock()
	if owner.UID.Ok() {
		bd.unstable.Owner.UID = owner.UID
	}
	if owner.GID.Ok() {
		bd.unstable.Owner.GID = owner.GID
	}
	return nil
}

// SetTimestamps implements fs.InodeOperations.SetTimestamps. Each of
// atime/mtime is either skipped (Omit), set to "now" (SetSystemTime) or
// set to the provided value; any change also updates ctime.
func (bd *Device) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
	// Nothing to do if both timestamps are omitted.
	if ts.ATimeOmit && ts.MTimeOmit {
		return nil
	}

	bd.mu.Lock()
	defer bd.mu.Unlock()

	now := time.NowFromContext(ctx)
	if !ts.ATimeOmit {
		if ts.ATimeSetSystemTime {
			bd.unstable.AccessTime = now
		} else {
			bd.unstable.AccessTime = ts.ATime
		}
	}
	if !ts.MTimeOmit {
		if ts.MTimeSetSystemTime {
			bd.unstable.ModificationTime = now
		} else {
			bd.unstable.ModificationTime = ts.MTime
		}
	}
	bd.unstable.StatusChangeTime = now
	return nil
}

// Truncate implements fs.InodeOperations.Truncate.
//
// Ignored for a character device, such as Binder.
func (bd *Device) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
	return nil
}

// AddLink implements fs.InodeOperations.AddLink.
//
// Binder doesn't support links, no-op.
func (bd *Device) AddLink() {}

// DropLink implements fs.InodeOperations.DropLink.
//
// Binder doesn't support links, no-op.
func (bd *Device) DropLink() {}
// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange;
// it refreshes mtime and ctime.
func (bd *Device) NotifyStatusChange(ctx context.Context) {
	bd.mu.Lock()
	defer bd.mu.Unlock()
	now := time.NowFromContext(ctx)
	bd.unstable.ModificationTime = now
	bd.unstable.StatusChangeTime = now
}

// IsVirtual implements fs.InodeOperations.IsVirtual.
//
// Binder is virtual.
func (bd *Device) IsVirtual() bool {
	return true
}

// StatFS implements fs.InodeOperations.StatFS.
//
// Binder doesn't support querying for filesystem info.
func (bd *Device) StatFS(context.Context) (fs.Info, error) {
	return fs.Info{}, syserror.ENOSYS
}

// Proc implements fs.FileOperations and fs.IoctlGetter. It represents one
// open of the binder device by a single task.
type Proc struct {
	fsutil.NoFsync
	fsutil.DeprecatedFileOperations
	fsutil.NotDirReaddir

	// bd is the device this Proc was opened from.
	bd *Device
	// task is the task that opened the device; only it may mmap the Proc.
	task *kernel.Task
	// platform provides the memory that backs the mapping.
	platform platform.Platform

	// mu protects mapped.
	mu sync.Mutex `state:"nosave"`

	// mapped is memory allocated from platform.Memory() by AddMapping.
	mapped platform.FileRange
}

// Release implements fs.FileOperations.Release; it drops the reference on
// the memory allocated by AddMapping, if any.
func (bp *Proc) Release() {
	bp.mu.Lock()
	defer bp.mu.Unlock()
	if bp.mapped.Length() != 0 {
		bp.platform.Memory().DecRef(bp.mapped)
	}
}

// Seek implements fs.FileOperations.Seek.
//
// Binder doesn't support seek operation (unless in debug mode).
func (bp *Proc) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
	return offset, syserror.EOPNOTSUPP
}

// Read implements fs.FileOperations.Read.
//
// Binder doesn't support read operation (unless in debug mode).
func (bp *Proc) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	return 0, syserror.EOPNOTSUPP
}

// Write implements fs.FileOperations.Write.
//
// Binder doesn't support write operation.
func (bp *Proc) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
	return 0, syserror.EOPNOTSUPP
}
// Flush implements fs.FileOperations.Flush.
//
// TODO: Implement.
func (bp *Proc) Flush(ctx context.Context, file *fs.File) error {
	return nil
}

// ConfigureMMap implements fs.FileOperations.ConfigureMMap. Only the task
// that opened the device may map it, the mapping is capped at
// mmapSizeLimit bytes, and write access is never granted.
func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
	// Compare drivers/android/binder.c:binder_mmap().
	if caller := kernel.TaskFromContext(ctx); caller != bp.task {
		return syserror.EINVAL
	}
	if opts.Length > mmapSizeLimit {
		opts.Length = mmapSizeLimit
	}
	opts.MaxPerms.Write = false

	// TODO: Binder sets VM_DONTCOPY, preventing the created vma
	// from being copied across fork(), but we don't support this yet. As
	// a result, MMs containing a Binder mapping cannot be forked (MM.Fork will
	// fail when AddMapping returns EBUSY).

	return fsutil.GenericConfigureMMap(file, bp, opts)
}

// Ioctl implements fs.FileOperations.Ioctl. Only BINDER_VERSION is
// currently handled; the remaining binder ioctls return ENOSYS, and
// unrecognized requests return EINVAL.
//
// TODO: Implement.
func (bp *Proc) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	// Switch on ioctl request.
	switch uint32(args[1].Int()) {
	case linux.BinderVersionIoctl:
		ver := &linux.BinderVersion{
			ProtocolVersion: currentProtocolVersion,
		}
		// Copy result to user-space.
		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ver, usermem.IOOpts{
			AddressSpaceActive: true,
		})
		return 0, err
	case linux.BinderWriteReadIoctl:
		// TODO: Implement.
		fallthrough
	case linux.BinderSetIdleTimeoutIoctl:
		// TODO: Implement.
		fallthrough
	case linux.BinderSetMaxThreadsIoctl:
		// TODO: Implement.
		fallthrough
	case linux.BinderSetIdlePriorityIoctl:
		// TODO: Implement.
		fallthrough
	case linux.BinderSetContextMgrIoctl:
		// TODO: Implement.
		fallthrough
	case linux.BinderThreadExitIoctl:
		// TODO: Implement.
		return 0, syserror.ENOSYS
	default:
		// Ioctls irrelevant to Binder.
		return 0, syserror.EINVAL
	}
}
+func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + bp.mu.Lock() + defer bp.mu.Unlock() + if bp.mapped.Length() != 0 { + // mmap has been called before, which binder_mmap() doesn't like. + return syserror.EBUSY + } + // Binder only allocates and maps a single page up-front + // (drivers/android/binder.c:binder_mmap() => binder_update_page_range()). + fr, err := bp.platform.Memory().Allocate(usermem.PageSize, usage.Anonymous) + if err != nil { + return err + } + bp.mapped = fr + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (bp *Proc) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + // Nothing to do. Notably, we don't free bp.mapped to allow another mmap. +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + // Nothing to do. Notably, this is one case where CopyMapping isn't + // equivalent to AddMapping, as AddMapping would return EBUSY. + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (bp *Proc) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + // TODO: In addition to the page initially allocated and mapped + // in AddMapping (Linux: binder_mmap), Binder allocates and maps pages for + // each transaction (Linux: binder_ioctl => binder_ioctl_write_read => + // binder_thread_write => binder_transaction => binder_alloc_buf => + // binder_update_page_range). Since we don't actually implement + // BinderWriteReadIoctl (Linux: BINDER_WRITE_READ), we only ever have the + // first page. 
+ var err error + if required.End > usermem.PageSize { + err = &memmap.BusError{syserror.EFAULT} + } + if required.Start == 0 { + return []memmap.Translation{ + { + Source: memmap.MappableRange{0, usermem.PageSize}, + File: bp.platform.Memory(), + Offset: bp.mapped.Start, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (bp *Proc) InvalidateUnsavable(ctx context.Context) error { + return nil +} diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go new file mode 100644 index 000000000..b521bce75 --- /dev/null +++ b/pkg/sentry/fs/context.go @@ -0,0 +1,97 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" +) + +// contextID is the kernel package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxRoot is a Context.Value key for a Dirent. + CtxRoot contextID = iota +) + +// ContextCanAccessFile determines whether `file` can be accessed in the requested way +// (for reading, writing, or execution) using the caller's credentials and user +// namespace, as does Linux's fs/namei.c:generic_permission. 
+func ContextCanAccessFile(ctx context.Context, inode *Inode, reqPerms PermMask) bool { + creds := auth.CredentialsFromContext(ctx) + uattr, err := inode.UnstableAttr(ctx) + if err != nil { + return false + } + + p := uattr.Perms.Other + // Are we owner or in group? + if uattr.Owner.UID == creds.EffectiveKUID { + p = uattr.Perms.User + } else if creds.InGroup(uattr.Owner.GID) { + p = uattr.Perms.Group + } + + // Are permissions satisfied without capability checks? + if p.SupersetOf(reqPerms) { + return true + } + + if IsDir(inode.StableAttr) { + // CAP_DAC_OVERRIDE can override any perms on directories. + if inode.CheckCapability(ctx, linux.CAP_DAC_OVERRIDE) { + return true + } + + // CAP_DAC_READ_SEARCH can normally only override Read perms, + // but for directories it can also override execution. + if !reqPerms.Write && inode.CheckCapability(ctx, linux.CAP_DAC_READ_SEARCH) { + return true + } + } + + // CAP_DAC_OVERRIDE can always override Read/Write. + // Can override executable only when at least one execute bit is set. + if !reqPerms.Execute || uattr.Perms.AnyExecute() { + if inode.CheckCapability(ctx, linux.CAP_DAC_OVERRIDE) { + return true + } + } + + // Read perms can be overridden by CAP_DAC_READ_SEARCH. + if reqPerms.OnlyRead() && inode.CheckCapability(ctx, linux.CAP_DAC_READ_SEARCH) { + return true + } + return false +} + +// FileOwnerFromContext returns a FileOwner using the effective user and group +// IDs used by ctx. +func FileOwnerFromContext(ctx context.Context) FileOwner { + creds := auth.CredentialsFromContext(ctx) + return FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} +} + +// RootFromContext returns the root of the virtual filesystem observed by ctx, +// or nil if ctx is not associated with a virtual filesystem. If +// RootFromContext returns a non-nil fs.Dirent, a reference is taken on it. 
+func RootFromContext(ctx context.Context) *Dirent { + if v := ctx.Value(CtxRoot); v != nil { + return v.(*Dirent) + } + return nil +} diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go new file mode 100644 index 000000000..ea74d0efd --- /dev/null +++ b/pkg/sentry/fs/copy_up.go @@ -0,0 +1,414 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// copyUp copies a file in an overlay from a lower filesystem to an +// upper filesystem so that the file can be modified in the upper +// filesystem. Copying a file involves several steps: +// +// - All parent directories of the file are created in the upper +// filesystem if they don't exist there. For instance: +// +// upper /dir0 +// lower /dir0/dir1/file +// +// copyUp of /dir0/dir1/file creates /dir0/dir1 in order to create +// /dir0/dir1/file. +// +// - The file content is copied from the lower file to the upper +// file. For symlinks this is the symlink target. For directories, +// upper directory entries are merged with lower directory entries +// so there is no need to copy any entries. 
+// +// - A subset of file attributes of the lower file are set on the +// upper file. These are the file owner, the file timestamps, +// and all non-overlay extended attributes. copyUp will fail if +// the upper filesystem does not support the setting of these +// attributes. +// +// The file's permissions are set when the file is created and its +// size will be brought up to date when its contents are copied. +// Notably no attempt is made to bring link count up to date because +// hard links are currently not preserved across overlay filesystems. +// +// - Memory mappings of the lower file are invalidated and memory +// references are transferred to the upper file. From this point on, +// memory mappings of the file will be backed by content in the upper +// filesystem. +// +// Synchronization: +// +// copyUp synchronizes with rename(2) using renameMu to ensure that +// parentage does not change while a file is being copied. In the context +// of rename(2), copyUpLockedForRename should be used to avoid deadlock on +// renameMu. +// +// The following operations synchronize with copyUp using copyMu: +// +// - InodeOperations, i.e. to ensure that looking up a directory takes +// into account new upper filesystem directories created by copy up, +// which subsequently can be modified. +// +// - FileOperations, i.e. to ensure that reading from a file does not +// continue using a stale, lower filesystem handle when the file is +// written to. +// +// Lock ordering: Dirent.mu -> Inode.overlay.copyMu -> Inode.mu. +// +// Caveats: +// +// If any step in copying up a file fails, copyUp cleans the upper +// filesystem of any partially up-to-date file. If this cleanup fails, +// the overlay may be in an unacceptable, inconsistent state, so copyUp +// panics. If copyUp fails because any step (above) fails, a generic +// error is returned. +// +// copyUp currently makes no attempt to optimize copying up file content. 
+// For large files, this means that copyUp blocks until the entire file +// is copied synchronously. +func copyUp(ctx context.Context, d *Dirent) error { + renameMu.RLock() + defer renameMu.RUnlock() + return copyUpLockedForRename(ctx, d) +} + +// copyUpLockedForRename is the same as copyUp except that it does not lock +// renameMu. +// +// It copies each component of d that does not yet exist in the upper +// filesystem. If d already exists in the upper filesystem, it is a no-op. +// +// Any error returned indicates a failure to copy all of d. This may +// leave the upper filesystem filled with any number of parent directories +// but the upper filesystem will never be in an inconsistent state. +// +// Preconditions: +// - d.Inode.overlay is non-nil. +func copyUpLockedForRename(ctx context.Context, d *Dirent) error { + for { + // Did we race with another copy up or does there + // already exist something in the upper filesystem + // for d? + d.Inode.overlay.copyMu.Lock() + if d.Inode.overlay.upper != nil { + d.Inode.overlay.copyMu.Unlock() + // Done, d is in the upper filesystem. + return nil + } + d.Inode.overlay.copyMu.Unlock() + + // Find the next component to copy up. We will work our way + // down to the last component of d and finally copy it. + next := findNextCopyUp(ctx, d) + + // Attempt to copy. + if err := doCopyUp(ctx, next); err != nil { + return err + } + } +} + +// findNextCopyUp finds the next component of d from root that does not +// yet exist in the upper filesystem. The parent of this component is +// also returned, which is the root of the overlay in the worst case. +func findNextCopyUp(ctx context.Context, d *Dirent) *Dirent { + next := d + for parent := next.parent; ; /* checked in-loop */ /* updated in-loop */ { + // Does this parent have a non-nil upper Inode? + parent.Inode.overlay.copyMu.RLock() + if parent.Inode.overlay.upper != nil { + parent.Inode.overlay.copyMu.RUnlock() + // Note that since we found an upper, it is stable. 
+ return next + } + parent.Inode.overlay.copyMu.RUnlock() + + // Continue searching for a parent with a non-nil + // upper Inode. + next = parent + parent = next.parent + } +} + +func doCopyUp(ctx context.Context, d *Dirent) error { + // Wait to get exclusive access to the upper Inode. + d.Inode.overlay.copyMu.Lock() + defer d.Inode.overlay.copyMu.Unlock() + if d.Inode.overlay.upper != nil { + // We raced with another doCopyUp, no problem. + return nil + } + + // Perform the copy. + return copyUpLocked(ctx, d.parent, d) +} + +// copyUpLocked creates a copy of next in the upper filesystem of parent. +// +// copyUpLocked must be called with d.Inode.overlay.copyMu locked. +// +// Returns a generic error on failure. +// +// Preconditions: +// - parent.Inode.overlay.upper must be non-nil. +// - next.Inode.overlay.copyMu must be locked writable. +// - next.Inode.overlay.lower must be non-nil. +// - upper filesystem must support setting file ownership and timestamps. +func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { + // Extract the attributes of the file we wish to copy. + attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx) + if err != nil { + log.Warningf("copy up failed to get lower attributes: %v", err) + return syserror.EIO + } + + var childUpperInode *Inode + parentUpper := parent.Inode.overlay.upper + + // Create the file in the upper filesystem and get an Inode for it. 
+ switch next.Inode.StableAttr.Type { + case RegularFile: + childFile, err := parentUpper.Create(ctx, RootFromContext(ctx), next.name, FileFlags{Read: true, Write: true}, attrs.Perms) + if err != nil { + log.Warningf("copy up failed to create file: %v", err) + return syserror.EIO + } + defer childFile.DecRef() + childUpperInode = childFile.Dirent.Inode + + case Directory: + if err := parentUpper.CreateDirectory(ctx, RootFromContext(ctx), next.name, attrs.Perms); err != nil { + log.Warningf("copy up failed to create directory: %v", err) + return syserror.EIO + } + childUpper, err := parentUpper.Lookup(ctx, next.name) + if err != nil { + log.Warningf("copy up failed to lookup directory: %v", err) + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + defer childUpper.DecRef() + childUpperInode = childUpper.Inode + + case Symlink: + childLower := next.Inode.overlay.lower + link, err := childLower.Readlink(ctx) + if err != nil { + log.Warningf("copy up failed to read symlink value: %v", err) + return syserror.EIO + } + if err := parentUpper.CreateLink(ctx, RootFromContext(ctx), link, next.name); err != nil { + log.Warningf("copy up failed to create symlink: %v", err) + return syserror.EIO + } + childUpper, err := parentUpper.Lookup(ctx, next.name) + if err != nil { + log.Warningf("copy up failed to lookup symlink: %v", err) + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + defer childUpper.DecRef() + childUpperInode = childUpper.Inode + + default: + return syserror.EINVAL + } + + // Bring file attributes up to date. This does not include size, which will be + // brought up to date with copyContentsLocked. + if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil { + log.Warningf("copy up failed to copy up attributes: %v", err) + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + + // Copy the entire file. 
+ if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil { + log.Warningf("copy up failed to copy up contents: %v", err) + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + + lowerMappable := next.Inode.overlay.lower.Mappable() + upperMappable := childUpperInode.Mappable() + if lowerMappable != nil && upperMappable == nil { + log.Warningf("copy up failed: cannot ensure memory mapping coherence") + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + + // Propagate memory mappings to the upper Inode. + next.Inode.overlay.mapsMu.Lock() + defer next.Inode.overlay.mapsMu.Unlock() + if upperMappable != nil { + // Remember which mappings we added so we can remove them on failure. + allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) + for seg := next.Inode.overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + added := make(memmap.MappingsOfRange) + for m := range seg.Value() { + if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()); err != nil { + for m := range added { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()) + } + for mr, mappings := range allAdded { + for m := range mappings { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start) + } + } + return err + } + added[m] = struct{}{} + } + allAdded[seg.Range()] = added + } + } + + // Take a reference on the upper Inode (transferred to + // next.Inode.overlay.upper) and make new translations use it. + next.Inode.overlay.dataMu.Lock() + childUpperInode.IncRef() + next.Inode.overlay.upper = childUpperInode + next.Inode.overlay.dataMu.Unlock() + + // Invalidate existing translations through the lower Inode. + next.Inode.overlay.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Remove existing memory mappings from the lower Inode. 
+ if lowerMappable != nil { + for seg := next.Inode.overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for m := range seg.Value() { + lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()) + } + } + } + + return nil +} + +// cleanupUpper removes name from parent, and panics if it is unsuccessful. +func cleanupUpper(ctx context.Context, parent *Inode, name string) { + if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil { + // Unfortunately we don't have much choice. We shouldn't + // willingly give the caller access to a nonsense filesystem. + panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: failed to remove %q from upper filesystem: %v", name, err)) + } +} + +// copyUpBuffers is a buffer pool for copying file content. The buffer +// size is the same used by io.Copy. +var copyUpBuffers = sync.Pool{New: func() interface{} { return make([]byte, 8*usermem.PageSize) }} + +// copyContentsLocked copies the contents of lower to upper. It panics if +// less than size bytes can be copied. +func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size int64) error { + // We don't support copying up for anything other than regular files. + if lower.StableAttr.Type != RegularFile { + return nil + } + + // Get a handle to the upper filesystem, which we will write to. + upperFile, err := overlayFile(ctx, upper, FileFlags{Write: true}) + if err != nil { + return err + } + defer upperFile.DecRef() + + // Get a handle to the lower filesystem, which we will read from. + lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true}) + if err != nil { + return err + } + defer lowerFile.DecRef() + + // Use a buffer pool to minimize allocations. + buf := copyUpBuffers.Get().([]byte) + defer copyUpBuffers.Put(buf) + + // Transfer the contents. + // + // One might be able to optimize this by doing parallel reads, parallel writes and reads, larger + // buffers, etc. 
But we really don't know anything about the underlying implementation, so these + // optimizations could be self-defeating. So we leave this as simple as possible. + var offset int64 + for { + nr, err := lowerFile.FileOperations.Read(ctx, lowerFile, usermem.BytesIOSequence(buf), offset) + if err != nil && err != io.EOF { + return err + } + if nr == 0 { + if offset != size { + // Same as in cleanupUpper, we cannot live + // with ourselves if we do anything less. + panic(fmt.Sprintf("filesystem is in an inconsistent state: wrote only %d bytes of %d sized file", offset, size)) + } + return nil + } + nw, err := upperFile.FileOperations.Write(ctx, upperFile, usermem.BytesIOSequence(buf[:nr]), offset) + if err != nil { + return err + } + offset += nw + } +} + +// copyAttributesLocked copies a subset of lower's attributes to upper, +// specifically owner, timestamps (except of status change time), and +// extended attributes. Notably no attempt is made to copy link count. +// Size and permissions are set on upper when the file content is copied +// and when the file is created respectively. +func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error { + // Extract attributes from the lower filesystem. + lowerAttr, err := lower.UnstableAttr(ctx) + if err != nil { + return err + } + lowerXattr, err := lower.Listxattr() + if err != nil && err != syserror.EOPNOTSUPP { + return err + } + + // Set the attributes on the upper filesystem. 
+ if err := upper.InodeOperations.SetOwner(ctx, upper, lowerAttr.Owner); err != nil { + return err + } + if err := upper.InodeOperations.SetTimestamps(ctx, upper, TimeSpec{ + ATime: lowerAttr.AccessTime, + MTime: lowerAttr.ModificationTime, + }); err != nil { + return err + } + for name := range lowerXattr { + value, err := lower.Getxattr(name) + if err != nil { + return err + } + if err := upper.InodeOperations.Setxattr(upper, name, value); err != nil { + return err + } + } + return nil +} diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go new file mode 100644 index 000000000..c3c9d963d --- /dev/null +++ b/pkg/sentry/fs/copy_up_test.go @@ -0,0 +1,182 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs_test + +import ( + "bytes" + "crypto/rand" + "fmt" + "io" + "sync" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // origFileSize is the original file size. This many bytes should be + // copied up before the test file is modified. + origFileSize = 4096 + + // truncateFileSize is the size to truncate all test files. + truncateFileSize = 10 +) + +// TestConcurrentCopyUp is a copy up stress test for an overlay. 
+// + // It creates a 64-level deep directory tree in the lower filesystem and + // populates the last subdirectory with 64 files containing random content: + // + // /lower + // /subdir0/.../subdir63/ + // /file0 + // ... + // /file63 + // + // The files are truncated concurrently by 4 goroutines per file. + // These goroutines contend with copying up all parent 64 subdirectories + // as well as the final file content. + // + // At the end of the test, we assert that the files respect the new truncated + // size and contain the content we expect. +func TestConcurrentCopyUp(t *testing.T) { + ctx := contexttest.Context(t) + files := makeOverlayTestFiles(t) + + var wg sync.WaitGroup + for _, file := range files { + for i := 0; i < 4; i++ { + wg.Add(1) + go func(o *overlayTestFile) { + if err := o.File.Dirent.Inode.Truncate(ctx, o.File.Dirent, truncateFileSize); err != nil { + t.Fatalf("failed to copy up: %v", err) + } + wg.Done() + }(file) + } + } + wg.Wait() + + for _, file := range files { + got := make([]byte, origFileSize) + n, err := file.File.Readv(ctx, usermem.BytesIOSequence(got)) + if int(n) != truncateFileSize { + t.Fatalf("read %d bytes from file, want %d", n, truncateFileSize) + } + if err != nil && err != io.EOF { + t.Fatalf("read got error %v, want nil", err) + } + if !bytes.Equal(got[:n], file.content[:truncateFileSize]) { + t.Fatalf("file content is %v, want %v", got[:n], file.content[:truncateFileSize]) + } + } +} + +type overlayTestFile struct { + File *fs.File + name string + content []byte +} + +func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { + ctx := contexttest.Context(t) + + // Create a lower tmpfs mount. + fsys, _ := fs.FindFilesystem("tmpfs") + lower, err := fsys.Mount(contexttest.Context(t), "", fs.MountSourceFlags{}, "") + if err != nil { + t.Fatalf("failed to mount tmpfs: %v", err) + } + lowerRoot := fs.NewDirent(lower, "") + + // Make a deep set of subdirectories that everyone shares. 
+ next := lowerRoot + for i := 0; i < 64; i++ { + name := fmt.Sprintf("subdir%d", i) + err := next.CreateDirectory(ctx, lowerRoot, name, fs.FilePermsFromMode(0777)) + if err != nil { + t.Fatalf("failed to create dir %q: %v", name, err) + } + next, err = next.Walk(ctx, lowerRoot, name) + if err != nil { + t.Fatalf("failed to walk to %q: %v", name, err) + } + } + + // Make a bunch of files in the last directory. + var files []*overlayTestFile + for i := 0; i < 64; i++ { + name := fmt.Sprintf("file%d", i) + f, err := next.Create(ctx, next, name, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + t.Fatalf("failed to create file %q: %v", name, err) + } + defer f.DecRef() + + relname, _ := f.Dirent.FullName(lowerRoot) + + o := &overlayTestFile{ + name: relname, + content: make([]byte, origFileSize), + } + + if _, err := rand.Read(o.content); err != nil { + t.Fatalf("failed to read from /dev/urandom: %v", err) + } + + if _, err := f.Writev(ctx, usermem.BytesIOSequence(o.content)); err != nil { + t.Fatalf("failed to write content to file %q: %v", name, err) + } + + files = append(files, o) + } + + // Create an empty upper tmpfs mount which we will copy up into. + upper, err := fsys.Mount(ctx, "", fs.MountSourceFlags{}, "") + if err != nil { + t.Fatalf("failed to mount tmpfs: %v", err) + } + + // Construct an overlay root. + overlay, err := fs.NewOverlayRoot(ctx, upper, lower, fs.MountSourceFlags{}) + if err != nil { + t.Fatalf("failed to construct overlay root: %v", err) + } + + // Create a MountNamespace to traverse the file system. + mns, err := fs.NewMountNamespace(ctx, overlay) + if err != nil { + t.Fatalf("failed to construct mount manager: %v", err) + } + + // Walk to all of the files in the overlay, open them readable. 
+ for _, f := range files { + d, err := mns.FindInode(ctx, mns.Root(), mns.Root(), f.name, 0) + if err != nil { + t.Fatalf("failed to find %q: %v", f.name, err) + } + defer d.DecRef() + + f.File, err = d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("failed to open file %q readable: %v", f.name, err) + } + } + + return files +} diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go new file mode 100644 index 000000000..d42e8da81 --- /dev/null +++ b/pkg/sentry/fs/dentry.go @@ -0,0 +1,232 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "sort" + + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// DentAttr is the metadata of a directory entry. It is a subset of StableAttr. +type DentAttr struct { + // Type is the InodeType of an Inode. + Type InodeType + + // InodeID uniquely identifies an Inode on a device. + InodeID uint64 +} + +// GenericDentAttr returns a generic DentAttr where: +// +// Type == nt +// InodeID == the inode id of a new inode on device. +func GenericDentAttr(nt InodeType, device *device.Device) DentAttr { + return DentAttr{ + Type: nt, + InodeID: device.NextIno(), + } +} + +// DentrySerializer serializes a directory entry. +type DentrySerializer interface { + // CopyOut serializes a directory entry based on its name and attributes. + CopyOut(name string, attributes DentAttr) error + + // Written returns the number of bytes written. 
+ Written() int +} + +// CollectEntriesSerializer copies DentAttrs to Entries. The order in +// which entries are encountered is preserved in Order. +type CollectEntriesSerializer struct { + Entries map[string]DentAttr + Order []string +} + +// CopyOut implements DentrySerializer.CopyOut. +func (c *CollectEntriesSerializer) CopyOut(name string, attr DentAttr) error { + if c.Entries == nil { + c.Entries = make(map[string]DentAttr) + } + c.Entries[name] = attr + c.Order = append(c.Order, name) + return nil +} + +// Written implements DentrySerializer.Written. +func (c *CollectEntriesSerializer) Written() int { + return len(c.Entries) +} + +// DirCtx is used by node.Readdir to emit directory entries. It is not +// thread-safe. +type DirCtx struct { + // Serializer is used to serialize the node attributes. + Serializer DentrySerializer + + // attrs are DentAttrs + attrs map[string]DentAttr + + // DirCursor is the directory cursor. + // TODO: Once Handles are removed this can just live in the + // respective FileOperations implementations and not need to get + // plumbed everywhere. + DirCursor *string +} + +// DirEmit is called for each directory entry. +func (c *DirCtx) DirEmit(name string, attr DentAttr) error { + if c.Serializer != nil { + if err := c.Serializer.CopyOut(name, attr); err != nil { + return err + } + } + if c.attrs == nil { + c.attrs = make(map[string]DentAttr) + } + c.attrs[name] = attr + return nil +} + +// DentAttrs returns a map of DentAttrs corresponding to the emitted directory +// entries. +func (c *DirCtx) DentAttrs() map[string]DentAttr { + if c.attrs == nil { + c.attrs = make(map[string]DentAttr) + } + return c.attrs +} + +// GenericReaddir serializes DentAttrs based on a SortedDentryMap that must +// contain _all_ up-to-date DentAttrs under a directory. If ctx.DirCursor is +// not nil, it is updated to the name of the last DentAttr that was +// successfully serialized. +// +// Returns the number of entries serialized. 
+func GenericReaddir(ctx *DirCtx, s *SortedDentryMap) (int, error) { + // Retrieve the next directory entries. + var names []string + var entries map[string]DentAttr + if ctx.DirCursor != nil { + names, entries = s.GetNext(*ctx.DirCursor) + } else { + names, entries = s.GetAll() + } + + // Try to serialize each entry. + var serialized int + for _, name := range names { + // Skip "" per POSIX. Skip "." and ".." which will be added by Dirent.Readdir. + if name == "" || name == "." || name == ".." { + continue + } + + // Emit the directory entry. + if err := ctx.DirEmit(name, entries[name]); err != nil { + // Return potentially a partial serialized count. + return serialized, err + } + + // We successfully serialized this entry. + serialized++ + + // Update the cursor with the name of the entry last serialized. + if ctx.DirCursor != nil { + *ctx.DirCursor = name + } + } + + // Everything was serialized. + return serialized, nil +} + +// SortedDentryMap is a sorted map of names and fs.DentAttr entries. +type SortedDentryMap struct { + // names is always kept in sorted-order. + names []string + + // entries maps names to fs.DentAttrs. + entries map[string]DentAttr +} + +// NewSortedDentryMap maintains entries in name sorted order. +func NewSortedDentryMap(entries map[string]DentAttr) *SortedDentryMap { + s := &SortedDentryMap{ + names: make([]string, 0, len(entries)), + entries: entries, + } + // Don't allow s.entries to be nil, because nil maps aren't Saveable. + if s.entries == nil { + s.entries = make(map[string]DentAttr) + } + + // Collect names from entries and sort them. + for name := range s.entries { + s.names = append(s.names, name) + } + sort.Strings(s.names) + return s +} + +// GetAll returns all names and entries in s. +func (s *SortedDentryMap) GetAll() ([]string, map[string]DentAttr) { + return s.names, s.entries +} + +// GetNext returns names after cursor in s and all entries. 
+func (s *SortedDentryMap) GetNext(cursor string) ([]string, map[string]DentAttr) { + i := sort.SearchStrings(s.names, cursor) + if i == len(s.names) { + return nil, s.entries + } + + // Return everything strictly after the cursor. + if s.names[i] == cursor { + i++ + } + return s.names[i:], s.entries +} + +// Add adds an entry with the given name to the map, preserving sort order. If +// name already exists in the map, its entry will be overwritten. +func (s *SortedDentryMap) Add(name string, entry DentAttr) { + if _, ok := s.entries[name]; !ok { + // Map does not yet contain an entry with this name. We must + // insert it in s.names at the appropriate spot. + i := sort.SearchStrings(s.names, name) + s.names = append(s.names, "") + copy(s.names[i+1:], s.names[i:]) + s.names[i] = name + } + s.entries[name] = entry +} + +// Remove removes an entry with the given name from the map, preserving sort order. +func (s *SortedDentryMap) Remove(name string) { + if _, ok := s.entries[name]; !ok { + return + } + i := sort.SearchStrings(s.names, name) + copy(s.names[i:], s.names[i+1:]) + s.names = s.names[:len(s.names)-1] + delete(s.entries, name) +} + +// Contains reports whether the map contains an entry with the given name. 
+func (s *SortedDentryMap) Contains(name string) bool { + _, ok := s.entries[name] + return ok +} diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD new file mode 100644 index 000000000..42049ecb5 --- /dev/null +++ b/pkg/sentry/fs/dev/BUILD @@ -0,0 +1,53 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "dev_state", + srcs = [ + "dev.go", + "fs.go", + "full.go", + "null.go", + "random.go", + ], + out = "dev_state.go", + package = "dev", +) + +go_library( + name = "dev", + srcs = [ + "dev.go", + "dev_state.go", + "device.go", + "fs.go", + "full.go", + "null.go", + "random.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ashmem", + "//pkg/sentry/fs/binder", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go new file mode 100644 index 000000000..36c61bfc2 --- /dev/null +++ b/pkg/sentry/fs/dev/dev.go @@ -0,0 +1,122 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package dev provides a filesystem with simple devices. +package dev + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ashmem" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/binder" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Dev is the root node. +type Dev struct { + ramfs.Dir +} + +func newCharacterDevice(iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.CharacterDevice, + }) +} + +func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + iops := &ramfs.Dir{} + iops.InitDir(ctx, map[string]*fs.Inode{}, fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Directory, + }) +} + +func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.Inode { + iops := &ramfs.Symlink{} + iops.InitSymlink(ctx, fs.RootOwner, target) + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Symlink, + }) +} + +// New returns the root node of a device filesystem. 
+func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEnabled bool) *fs.Inode { + d := &Dev{} + + contents := map[string]*fs.Inode{ + "fd": newSymlink(ctx, "/proc/self/fd", msrc), + "stdin": newSymlink(ctx, "/proc/self/fd/0", msrc), + "stdout": newSymlink(ctx, "/proc/self/fd/1", msrc), + "stderr": newSymlink(ctx, "/proc/self/fd/2", msrc), + + "null": newCharacterDevice(newNullDevice(ctx, fs.RootOwner, 0666), msrc), + "zero": newCharacterDevice(newZeroDevice(ctx, fs.RootOwner, 0666), msrc), + "full": newCharacterDevice(newFullDevice(ctx, fs.RootOwner, 0666), msrc), + + // This is not as good as /dev/random in linux because go + // runtime uses sys_random and /dev/urandom internally. + // According to 'man 4 random', this will be sufficient unless + // application uses this to generate long-lived GPG/SSL/SSH + // keys. + "random": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), + "urandom": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), + + "shm": tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc, platform.FromContext(ctx)), + + // A devpts is typically mounted at /dev/pts to provide + // pseudoterminal support. Place an empty directory there for + // the devpts to be mounted over. + "pts": newDirectory(ctx, msrc), + // Similarly, applications expect a ptmx device at /dev/ptmx + // connected to the terminals provided by /dev/pts/. Rather + // than creating a device directly (which requires a hairy + // lookup on open to determine if a devpts exists), just create + // a symlink to the ptmx provided by devpts. (The Linux devpts + // documentation recommends this). + // + // If no devpts is mounted, this will simply be a dangling + // symlink, which is fine. 
+ "ptmx": newSymlink(ctx, "pts/ptmx", msrc), + } + + if binderEnabled { + binder := binder.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666)) + contents["binder"] = newCharacterDevice(binder, msrc) + } + + if ashmemEnabled { + ashmem := ashmem.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666)) + contents["ashmem"] = newCharacterDevice(ashmem, msrc) + } + + d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(d, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Directory, + }) +} diff --git a/pkg/sentry/fs/dev/device.go b/pkg/sentry/fs/dev/device.go new file mode 100644 index 000000000..9d935e008 --- /dev/null +++ b/pkg/sentry/fs/dev/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dev + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// devDevice is the pseudo-filesystem device. +var devDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go new file mode 100644 index 000000000..4945ac962 --- /dev/null +++ b/pkg/sentry/fs/dev/fs.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dev
+
+import (
+	"strconv"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Optional key containing boolean flag which specifies if Android Binder IPC should be enabled.
+const binderEnabledKey = "binder_enabled"
+
+// Optional key containing boolean flag which specifies if Android ashmem should be enabled.
+const ashmemEnabledKey = "ashmem_enabled"
+
+// filesystem is a devtmpfs.
+type filesystem struct{}
+
+func init() {
+	fs.RegisterFilesystem(&filesystem{})
+}
+
+// FilesystemName is the name under which the filesystem is registered.
+// Name matches drivers/base/devtmpfs.c:dev_fs_type.name.
+const FilesystemName = "devtmpfs"
+
+// Name is the name of the file system.
+func (*filesystem) Name() string {
+	return FilesystemName
+}
+
+// AllowUserMount allows users to mount(2) this file system.
+func (*filesystem) AllowUserMount() bool {
+	return true
+}
+
+// Flags returns that there is nothing special about this file system.
+//
+// In Linux, devtmpfs does the same thing.
+func (*filesystem) Flags() fs.FilesystemFlags {
+	return 0
+}
+
+// Mount returns a devtmpfs root that can be positioned in the vfs.
+func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) {
+	// device is always ignored.
+	// devtmpfs backed by ramfs ignores bad options. See fs/ramfs/inode.c:ramfs_parse_options.
+	// -> we should consider parsing the mode and backing devtmpfs by this.
+
+	// Parse generic comma-separated key=value options.
+	options := fs.GenericMountSourceOptions(data)
+
+	// binderEnabledKey is optional and binder is disabled by default.
+	binderEnabled := false
+	if beStr, exists := options[binderEnabledKey]; exists {
+		var err error
+		binderEnabled, err = strconv.ParseBool(beStr)
+		if err != nil {
+			return nil, syserror.EINVAL
+		}
+	}
+
+	// ashmemEnabledKey is optional and ashmem is disabled by default.
+	ashmemEnabled := false
+	if aeStr, exists := options[ashmemEnabledKey]; exists {
+		var err error
+		ashmemEnabled, err = strconv.ParseBool(aeStr)
+		if err != nil {
+			return nil, syserror.EINVAL
+		}
+	}
+
+	// Construct the devtmpfs root.
+	return New(ctx, fs.NewNonCachingMountSource(f, flags), binderEnabled, ashmemEnabled), nil
+}
diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go
new file mode 100644
index 000000000..e13eb6c03
--- /dev/null
+++ b/pkg/sentry/fs/dev/full.go
@@ -0,0 +1,53 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dev
+
+import (
+	"math"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// fullDevice is used to implement /dev/full.
+type fullDevice struct {
+	ramfs.Entry
+}
+
+func newFullDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *fullDevice {
+	f := &fullDevice{}
+	f.InitEntry(ctx, owner, fs.FilePermsFromMode(mode))
+	return f
+}
+
+// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev by
+// returning ENOSPC.
+func (f *fullDevice) DeprecatedPwritev(_ context.Context, _ usermem.IOSequence, _ int64) (int64, error) {
+	return 0, syserror.ENOSPC
+}
+
+// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv.
+func (f *fullDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, _ int64) (int64, error) {
+	return dst.ZeroOut(ctx, math.MaxInt64)
+}
+
+// Truncate should be simply ignored for character devices on linux.
+func (f *fullDevice) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go
new file mode 100644
index 000000000..66b8ba967
--- /dev/null
+++ b/pkg/sentry/fs/dev/null.go
@@ -0,0 +1,96 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package dev + +import ( + "io" + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type nullDevice struct { + ramfs.Entry +} + +func newNullDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *nullDevice { + n := &nullDevice{} + n.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + return n +} + +// DeprecatedPreadv reads data from the device. +func (n *nullDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + return 0, io.EOF +} + +// DeprecatedPwritev discards writes. +func (n *nullDevice) DeprecatedPwritev(_ context.Context, src usermem.IOSequence, offset int64) (int64, error) { + return src.NumBytes(), nil +} + +// Truncate should be simply ignored for character devices on linux. +func (n *nullDevice) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +type zeroDevice struct { + nullDevice +} + +func newZeroDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *zeroDevice { + zd := &zeroDevice{} + zd.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + return zd +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (zd *zeroDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + return dst.ZeroOut(ctx, math.MaxInt64) +} + +// GetFile overrides ramfs.Entry.GetFile and returns a zeroFile instead. +func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + // Allow pread(2) and pwrite(2) on this file. 
+ flags.Pread = true + flags.Pwrite = true + + return fs.NewFile(ctx, dirent, flags, &zeroFileOperations{ + FileOperations: &fsutil.Handle{HandleOperations: dirent.Inode.HandleOps()}, + }), nil +} + +type zeroFileOperations struct { + fs.FileOperations +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (*zeroFileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + m, err := mm.NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) + if err != nil { + return err + } + opts.MappingIdentity = m + opts.Mappable = m + return nil +} diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go new file mode 100644 index 000000000..0402f9355 --- /dev/null +++ b/pkg/sentry/fs/dev/random.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package dev + +import ( + "crypto/rand" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type randomDevice struct { + ramfs.Entry +} + +func newRandomDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *randomDevice { + r := &randomDevice{} + r.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + return r +} + +// DeprecatedPreadv reads random data. +func (*randomDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) +} + +// DeprecatedPwritev implements fs.HandleOperations.DeprecatedPwritev. +func (*randomDevice) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // On Linux, "Writing to /dev/random or /dev/urandom will update the + // entropy pool with the data written, but this will not result in a higher + // entropy count" - random(4). We don't need to support this, but we do + // need to support the write, so just make it a no-op a la /dev/null. + return src.NumBytes(), nil +} + +// Truncate should be simply ignored for character devices on linux. +func (r *randomDevice) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go new file mode 100644 index 000000000..a75c7ea7e --- /dev/null +++ b/pkg/sentry/fs/dirent.go @@ -0,0 +1,1605 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "path" + "sort" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +type globalDirentMap struct { + mu sync.Mutex + dirents map[*Dirent]struct{} +} + +func (g *globalDirentMap) add(d *Dirent) { + g.mu.Lock() + g.dirents[d] = struct{}{} + g.mu.Unlock() +} + +func (g *globalDirentMap) remove(d *Dirent) { + g.mu.Lock() + delete(g.dirents, d) + g.mu.Unlock() +} + +// allDirents keeps track of all Dirents that need to be considered in +// Save/Restore for inode mappings. +// +// Because inodes do not hold paths, but inodes for external file systems map +// to an external path, every user-visible Dirent is stored in this map and +// iterated through upon save to keep inode ID -> restore path mappings. +var allDirents = globalDirentMap{ + dirents: map[*Dirent]struct{}{}, +} + +// renameMu protects the parent of *all* Dirents. (See explanation in +// lockForRename.) +// +// See fs.go for lock ordering. +var renameMu sync.RWMutex + +// Dirent holds an Inode in memory. +// +// A Dirent may be negative or positive: +// +// A negative Dirent contains a nil Inode and indicates that a path does not exist. 
This +// is a convention taken from the Linux dcache, see fs/dcache.c. A negative Dirent remains +// cached until a create operation replaces it with a positive Dirent. A negative Dirent +// always has one reference owned by its parent and takes _no_ reference on its parent. This +// ensures that its parent can be unhashed regardless of negative children. +// +// A positive Dirent contains a non-nil Inode. It remains cached for as long as there remain +// references to it. A positive Dirent always takes a reference on its parent. +// +// A Dirent may be a root Dirent (parent is nil) or be parented (non-nil parent). +// +// Dirents currently do not attempt to free entries that lack application references under +// memory pressure. +type Dirent struct { + // AtomicRefCount is our reference count. + refs.AtomicRefCount + + // userVisible indicates whether the Dirent is visible to the user or + // not. Only user-visible Dirents should save inode mappings in + // save/restore, as only they hold the real path to the underlying + // inode. + // + // See newDirent and Dirent.afterLoad. + userVisible bool + + // Inode is the underlying file object. + // + // Inode is exported currently to assist in implementing overlay Inodes (where a + // Inode.InodeOperations.Lookup may need to merge the Inode contained in a positive Dirent with + // another Inode). This is normally done before the Dirent is parented (there are + // no external references to it). + // + // Other objects in the VFS may take a reference to this Inode but only while holding + // a reference to this Dirent. + Inode *Inode + + // name is the name (i.e. basename) of this entry. + // + // N.B. name is protected by parent.mu, not this node's mu! + name string + + // parent is the parent directory. + // + // We hold a hard reference to the parent. + // + // parent is protected by renameMu. + parent *Dirent + + // deleted may be set atomically when removed. 
+ deleted int32 `state:"nosave"` + + // frozen indicates this entry can't walk to unknown nodes. + frozen bool + + // mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED. + mounted bool + + // direntEntry identifies this Dirent as an element in a DirentCache. DirentCaches + // and their contents are not saved. + direntEntry `state:"nosave"` + + // dirMu is a read-write mutex that protects caching decisions made by directory operations. + // Lock ordering: dirMu must be taken before mu (see below). Details: + // + // dirMu does not participate in Rename; instead mu and renameMu are used, see lockForRename. + // + // Creation and Removal operations must be synchronized with Walk to prevent stale negative + // caching. Note that this requirement is not specific to a _Dirent_ doing negative caching. + // The following race exists at any level of the VFS: + // + // For an object D that represents a directory, containing a cache of non-existent paths, + // protected by D.cacheMu: + // + // T1: T2: + // D.lookup(name) + // --> ENOENT + // D.create(name) + // --> success + // D.cacheMu.Lock + // delete(D.cache, name) + // D.cacheMu.Unlock + // D.cacheMu.Lock + // D.cache[name] = true + // D.cacheMu.Unlock + // + // D.lookup(name) + // D.cacheMu.Lock + // if D.cache[name] { + // --> ENOENT (wrong) + // } + // D.cacheMu.Lock + // + // Correct: + // + // T1: T2: + // D.cacheMu.Lock + // D.lookup(name) + // --> ENOENT + // D.cache[name] = true + // D.cacheMu.Unlock + // D.cacheMu.Lock + // D.create(name) + // --> success + // delete(D.cache, name) + // D.cacheMu.Unlock + // + // D.cacheMu.Lock + // D.lookup(name) + // --> EXISTS (right) + // D.cacheMu.Unlock + // + // Note that the above "correct" solution causes too much lock contention: all lookups are + // synchronized with each other. This is a problem because lookups are involved in any VFS + // path operation. 
+	//
+	// A Dirent diverges from the single D.cacheMu and instead uses two locks: dirMu to protect
+	// concurrent creation/removal/lookup caching, and mu to protect the Dirent's children map
+	// in general.
+	//
+	// This allows for concurrent Walks to be executed in order to pipeline lookups. For instance
+	// for a hot directory /a/b, threads T1, T2, T3 will only block on each other to update the
+	// children map of /a/b when their individual lookups complete.
+	//
+	// T1:           T2:           T3:
+	// stat(/a/b/c)  stat(/a/b/d)  stat(/a/b/e)
+	dirMu sync.RWMutex `state:"nosave"`
+
+	// mu protects the below fields. Lock ordering: mu must be taken after dirMu.
+	mu sync.Mutex `state:"nosave"`
+
+	// children are cached via weak references.
+	children map[string]*refs.WeakRef
+}
+
+// NewDirent returns a new root Dirent, taking the caller's reference on inode. The caller
+// holds the only reference to the Dirent. Parents may call hashChild to parent this Dirent.
+func NewDirent(inode *Inode, name string) *Dirent {
+	d := newDirent(inode, name)
+	allDirents.add(d)
+	d.userVisible = true
+	return d
+}
+
+// NewTransientDirent creates a transient Dirent that shouldn't actually be
+// visible to users.
+func NewTransientDirent(inode *Inode) *Dirent {
+	return newDirent(inode, "transient")
+}
+
+func newDirent(inode *Inode, name string) *Dirent {
+	// The Dirent needs to maintain one reference to MountSource.
+	if inode != nil {
+		inode.MountSource.IncDirentRefs()
+	}
+	return &Dirent{
+		Inode:    inode,
+		name:     name,
+		children: make(map[string]*refs.WeakRef),
+	}
+}
+
+// NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent.
+func NewNegativeDirent(name string) *Dirent {
+	return newDirent(nil, name)
+}
+
+// IsRoot returns true if d is a root Dirent.
+func (d *Dirent) IsRoot() bool {
+	return d.parent == nil
+}
+
+// IsNegative returns true if d represents a path that does not exist.
+func (d *Dirent) IsNegative() bool { + return d.Inode == nil +} + +// hashChild will hash child into the children list of its new parent d, carrying over +// any "frozen" state from d. +// +// Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must +// validate the returned unhashed weak reference. Common cases: +// +// * Remove: hashing a negative Dirent unhashes a positive Dirent (unimplemented). +// * Create: hashing a positive Dirent unhashes a negative Dirent. +// * Lookup: hashing any Dirent should not unhash any other Dirent. +// +// Preconditions: +// * d.mu must be held. +// * child must be a root Dirent. +func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) { + if !child.IsRoot() { + panic("hashChild must be a root Dirent") + } + + // Assign parentage. + child.parent = d + + // Avoid letting negative Dirents take a reference on their parent; these Dirents + // don't have a role outside of the Dirent cache and should not keep their parent + // indefinitely pinned. + if !child.IsNegative() { + // Positive dirents must take a reference on their parent. + d.IncRef() + } + + // Carry over parent's frozen state. + child.frozen = d.frozen + + return d.hashChildParentSet(child) +} + +// hashChildParentSet will rehash child into the children list of its parent d. +// +// Assumes that child.parent = d already. +func (d *Dirent) hashChildParentSet(child *Dirent) (*refs.WeakRef, bool) { + if child.parent != d { + panic("hashChildParentSet assumes the child already belongs to the parent") + } + + // Save any replaced child so our caller can validate it. + old, ok := d.children[child.name] + + // Hash the child. + d.children[child.name] = refs.NewWeakRef(child, nil) + + // Return any replaced child. + return old, ok +} + +// SyncAll iterates through mount points under d and writes back their buffered +// modifications to filesystems. 
+func (d *Dirent) SyncAll(ctx context.Context) { + d.mu.Lock() + defer d.mu.Unlock() + + // For negative Dirents there is nothing to sync. By definition these are + // leaves (there is nothing left to traverse). + if d.IsNegative() { + return + } + + // There is nothing to sync for a read-only filesystem. + if !d.Inode.MountSource.Flags.ReadOnly { + // FIXME: This should be a mount traversal, not a + // Dirent traversal, because some Inodes that need to be synced + // may no longer be reachable by name (after sys_unlink). + // + // Write out metadata, dirty page cached pages, and sync disk/remote + // caches. + d.Inode.WriteOut(ctx) + } + + // Continue iterating through other mounted filesystems. + for _, w := range d.children { + if child := w.Get(); child != nil { + child.(*Dirent).SyncAll(ctx) + child.DecRef() + } + } +} + +// FullName returns the fully-qualified name and a boolean value representing +// whether this Dirent was a descendant of root. +// If the root argument is nil it is assumed to be the root of the Dirent tree. +func (d *Dirent) FullName(root *Dirent) (string, bool) { + renameMu.RLock() + defer renameMu.RUnlock() + return d.fullName(root) +} + +// fullName returns the fully-qualified name and a boolean value representing +// if the root node was reachable from this Dirent. +func (d *Dirent) fullName(root *Dirent) (string, bool) { + if d == root { + return "/", true + } + + if d.IsRoot() { + if root != nil { + // We reached the top of the Dirent tree but did not encounter + // the given root. Return false for reachable so the caller + // can handle this situation accordingly. + return d.name, false + } + return d.name, true + } + + // Traverse up to parent. 
+	d.parent.mu.Lock()
+	name := d.name
+	d.parent.mu.Unlock()
+	parentName, reachable := d.parent.fullName(root)
+	s := path.Join(parentName, name)
+	if atomic.LoadInt32(&d.deleted) != 0 {
+		return s + " (deleted)", reachable
+	}
+	return s, reachable
+}
+
+func (d *Dirent) freeze() {
+	if d.frozen {
+		// Already frozen.
+		return
+	}
+	d.frozen = true
+
+	// Take a reference when freezing.
+	for _, w := range d.children {
+		if child := w.Get(); child != nil {
+			// NOTE: We would normally drop the reference here. But
+			// instead we're hanging on to it.
+			ch := child.(*Dirent)
+			ch.Freeze()
+		}
+	}
+
+	// Drop all expired weak references.
+	d.flush()
+}
+
+// Freeze prevents this dirent from walking to more nodes. Freeze is applied
+// recursively to all children.
+//
+// If this particular Dirent represents a Virtual node, then Walks and Creates
+// may proceed as before.
+//
+// Freeze can only be called before the application starts running, otherwise
+// the root might be out of sync with the application root if modified by
+// sys_chroot.
+func (d *Dirent) Freeze() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	d.freeze()
+}
+
+// descendantOf returns true if the receiver dirent is equal to, or a
+// descendant of, the argument dirent.
+//
+// d.mu must be held.
+func (d *Dirent) descendantOf(p *Dirent) bool {
+	if d == p {
+		return true
+	}
+	if d.IsRoot() {
+		return false
+	}
+	return d.parent.descendantOf(p)
+}
+
+// walk walks to path name starting at the dirent, and will not traverse above
+// root Dirent.
+//
+// If walkMayUnlock is true then walk can unlock d.mu to execute a slow
+// Inode.Lookup, otherwise walk will keep d.mu locked.
+//
+// Preconditions:
+// - d.mu must be held.
+// - name must not contain "/"s.
+func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
+	if !IsDir(d.Inode.StableAttr) {
+		return nil, syscall.ENOTDIR
+	}
+	if name == "" || name == "."
{ + d.IncRef() + return d, nil + } else if name == ".." { + renameMu.RLock() + // Respect the chroot. Note that in Linux there is no check to enforce + // that d is a descendant of root. + if d == root { + d.IncRef() + renameMu.RUnlock() + return d, nil + } + // Are we already at the root? Then ".." is ".". + if d.IsRoot() { + d.IncRef() + renameMu.RUnlock() + return d, nil + } + d.parent.IncRef() + renameMu.RUnlock() + return d.parent, nil + } + + if w, ok := d.children[name]; ok { + // Try to resolve the weak reference to a hard reference. + if child := w.Get(); child != nil { + cd := child.(*Dirent) + + // Is this a negative Dirent? + if cd.IsNegative() { + // Don't leak a reference; this doesn't matter as much for negative Dirents, + // which don't hold a hard reference on their parent (their parent holds a + // hard reference on them, and they contain virtually no state). But this is + // good house-keeping. + child.DecRef() + return nil, syscall.ENOENT + } + + // Do we need to revalidate this child? + // + // We never allow the file system to revalidate mounts, that could cause them + // to unexpectedly drop out before umount. + if cd.mounted || !cd.Inode.MountSource.Revalidate(cd) { + // Good to go. This is the fast-path. + return cd, nil + } + + // If we're revalidating a child, we must ensure all inotify watches release + // their pins on the child. Inotify doesn't properly support filesystems that + // revalidate dirents (since watches are lost on revalidation), but if we fail + // to unpin the watches child will never be GCed. + cd.Inode.Watches.Unpin(cd) + + // This child needs to be revalidated, fallthrough to unhash it. Make sure + // to not leak a reference from Get(). + // + // Note that previous lookups may still have a reference to this stale child; + // this can't be helped, but we can ensure that *new* lookups are up-to-date. + child.DecRef() + } + + // Either our weak reference expired or we need to revalidate it. 
Unhash child first, we're + // about to replace it. + delete(d.children, name) + w.Drop() + } + + // Are we allowed to do the lookup? + if d.frozen && !d.Inode.IsVirtual() { + return nil, syscall.ENOENT + } + + // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be expensive, + // if possible release the lock and re-acquire it. + if walkMayUnlock { + d.mu.Unlock() + } + c, err := d.Inode.Lookup(ctx, name) + if walkMayUnlock { + d.mu.Lock() + } + // No dice. + if err != nil { + return nil, err + } + + // Sanity check c, its name must be consistent. + if c.name != name { + panic(fmt.Sprintf("lookup from %q to %q returned unexpected name %q", d.name, name, c.name)) + } + + // Now that we have the lock again, check if we raced. + if w, ok := d.children[name]; ok { + // Someone else looked up or created a child at name before us. + if child := w.Get(); child != nil { + cd := child.(*Dirent) + + // There are active references to the existing child, prefer it to the one we + // retrieved from Lookup. Likely the Lookup happened very close to the insertion + // of child, so considering one stale over the other is fairly arbitrary. + c.DecRef() + + // The child that was installed could be negative. + if cd.IsNegative() { + // If so, don't leak a reference and short circuit. + child.DecRef() + return nil, syscall.ENOENT + } + + // We make the judgement call that if c raced with cd they are close enough to have + // the same staleness, so we don't attempt to revalidate cd. In Linux revalidations + // can continue indefinitely (see fs/namei.c, retry_estale); we try to avoid this. + return cd, nil + } + + // Weak reference expired. We went through a full cycle of create/destroy in the time + // we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child + // we looked up. + delete(d.children, name) + w.Drop() + } + + // Give the looked up child a parent. 
We cannot kick out entries, since we just checked above + // that there is nothing at name in d's children list. + if _, kicked := d.hashChild(c); kicked { + // Yell loudly. + panic(fmt.Sprintf("hashed child %q over existing child", c.name)) + } + + // Is this a negative Dirent? + if c.IsNegative() { + // Don't drop a reference on the negative Dirent, it was just installed and this is the + // only reference we'll ever get. d owns the reference. + return nil, syscall.ENOENT + } + + // Return the positive Dirent. + return c, nil +} + +// Walk walks to a new dirent, and will not walk higher than the given root +// Dirent, which must not be nil. +func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, error) { + if root == nil { + panic("Dirent.Walk: root must not be nil") + } + + d.dirMu.RLock() + d.mu.Lock() + child, err := d.walk(ctx, root, name, true /* may unlock */) + d.mu.Unlock() + d.dirMu.RUnlock() + + return child, err +} + +// exists returns true if name exists in relation to d. +// +// Preconditions: d.mu must be held. +func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { + child, err := d.walk(ctx, root, name, true /* may unlock */) + if err != nil { + // Child may not exist. + return false + } + // Child exists. + child.DecRef() + return true +} + +// lockDirectory should be called for any operation that changes this `d`s +// children (creating or removing them). +func (d *Dirent) lockDirectory() func() { + if d.Inode.overlay != nil { + // overlay copyUp may need to look at Dirent parents, and hence + // may need renameMu. + renameMu.RLock() + d.dirMu.Lock() + d.mu.Lock() + return func() { + d.mu.Unlock() + d.dirMu.Unlock() + renameMu.RUnlock() + } + } + + d.dirMu.Lock() + d.mu.Lock() + return func() { + d.mu.Unlock() + d.dirMu.Unlock() + } +} + +// Create creates a new regular file in this directory. 
+func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) { + unlock := d.lockDirectory() + defer unlock() + + // Does something already exist? + if d.exists(ctx, root, name) { + return nil, syscall.EEXIST + } + + // Are we frozen? + if d.frozen && !d.Inode.IsVirtual() { + return nil, syscall.ENOENT + } + + // Try the create. We need to trust the file system to return EEXIST (or something + // that will translate to EEXIST) if name already exists. + file, err := d.Inode.Create(ctx, d, name, flags, perms) + if err != nil { + return nil, err + } + child := file.Dirent + + // Sanity check c, its name must be consistent. + if child.name != name { + panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) + } + + // File systems cannot return a negative Dirent on Create, that makes no sense. + if child.IsNegative() { + panic(fmt.Sprintf("create from %q to %q returned negative Dirent", d.name, name)) + } + + // Hash the child into its parent. We can only kick out a Dirent if it is negative + // (we are replacing something that does not exist with something that now does). + if w, kicked := d.hashChild(child); kicked { + if old := w.Get(); old != nil { + if !old.(*Dirent).IsNegative() { + panic(fmt.Sprintf("hashed child %q over a positive child", child.name)) + } + // Don't leak a reference. + old.DecRef() + + // Drop d's reference. + old.DecRef() + } + + // Finally drop the useless weak reference on the floor. + w.Drop() + } + + d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + + // Allow the file system to take extra references on c. + child.maybeExtendReference() + + // Return the reference and the new file. When the last reference to + // the file is dropped, file.Dirent may no longer be cached. + return file, nil +} + +// genericCreate executes create if name does not exist. Removes a negative Dirent at name if +// create succeeds. 
+// +// Preconditions: d.mu must be held. +func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error { + // Does something already exist? + if d.exists(ctx, root, name) { + return syscall.EEXIST + } + + // Are we frozen? + if d.frozen && !d.Inode.IsVirtual() { + return syscall.ENOENT + } + + // Execute the create operation. + if err := create(); err != nil { + return err + } + + // Remove any negative Dirent. We've already asserted above with d.exists + // that the only thing remaining here can be a negative Dirent. + if w, ok := d.children[name]; ok { + // Same as Create. + if old := w.Get(); old != nil { + if !old.(*Dirent).IsNegative() { + panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name)) + } + // Don't leak a reference. + old.DecRef() + + // Drop d's reference. + old.DecRef() + } + + // Unhash the negative Dirent, name needs to exist now. + delete(d.children, name) + + // Finally drop the useless weak reference on the floor. + w.Drop() + } + + return nil +} + +// CreateLink creates a new link in this directory. +func (d *Dirent) CreateLink(ctx context.Context, root *Dirent, oldname, newname string) error { + unlock := d.lockDirectory() + defer unlock() + + return d.genericCreate(ctx, root, newname, func() error { + if err := d.Inode.CreateLink(ctx, d, oldname, newname); err != nil { + return err + } + d.Inode.Watches.Notify(newname, linux.IN_CREATE, 0) + return nil + }) +} + +// CreateHardLink creates a new hard link in this directory. +func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Dirent, name string) error { + unlock := d.lockDirectory() + defer unlock() + + // Make sure that target does not span filesystems. 
+ if d.Inode.MountSource != target.Inode.MountSource { + return syscall.EXDEV + } + + return d.genericCreate(ctx, root, name, func() error { + if err := d.Inode.CreateHardLink(ctx, d, target, name); err != nil { + return err + } + target.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) // Link count change. + d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + return nil + }) +} + +// CreateDirectory creates a new directory under this dirent. +func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, perms FilePermissions) error { + unlock := d.lockDirectory() + defer unlock() + + return d.genericCreate(ctx, root, name, func() error { + if err := d.Inode.CreateDirectory(ctx, d, name, perms); err != nil { + return err + } + d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_CREATE, 0) + return nil + }) +} + +// Bind satisfies the InodeOperations interface; otherwise same as GetFile. +func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, socket unix.BoundEndpoint, perms FilePermissions) error { + d.dirMu.Lock() + defer d.dirMu.Unlock() + d.mu.Lock() + defer d.mu.Unlock() + + err := d.genericCreate(ctx, root, name, func() error { + if err := d.Inode.Bind(ctx, name, socket, perms); err != nil { + return err + } + d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + return nil + }) + if err == syscall.EEXIST { + return syscall.EADDRINUSE + } + return err +} + +// CreateFifo creates a new named pipe under this dirent. +func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perms FilePermissions) error { + unlock := d.lockDirectory() + defer unlock() + + return d.genericCreate(ctx, root, name, func() error { + if err := d.Inode.CreateFifo(ctx, d, name, perms); err != nil { + return err + } + d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + return nil + }) +} + +// getDotAttrs returns the DentAttrs corresponding to "." and ".." directories. 
+func (d *Dirent) getDotAttrs(root *Dirent) (DentAttr, DentAttr) {
+	// Get '.'.
+	sattr := d.Inode.StableAttr
+	dot := DentAttr{
+		Type:    sattr.Type,
+		InodeID: sattr.InodeID,
+	}
+
+	// Get '..'.
+	if !d.IsRoot() && d.descendantOf(root) {
+		// Dirent is a descendant of the root. Get its parent's attrs.
+		psattr := d.parent.Inode.StableAttr
+		dotdot := DentAttr{
+			Type:    psattr.Type,
+			InodeID: psattr.InodeID,
+		}
+		return dot, dotdot
+	}
+	// Dirent is either root or not a descendant of the root. ".." is the
+	// same as ".".
+	return dot, dot
+}
+
+// readdirFrozen returns readdir results based solely on the frozen children.
+func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int64, error) {
+	// Collect attrs for "." and "..".
+	attrs := make(map[string]DentAttr)
+	names := []string{".", ".."}
+	attrs["."], attrs[".."] = d.getDotAttrs(root)
+
+	// Get info from all children.
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	for name, w := range d.children {
+		if child := w.Get(); child != nil {
+			defer child.DecRef()
+
+			// Skip negative children.
+			if child.(*Dirent).IsNegative() {
+				continue
+			}
+
+			sattr := child.(*Dirent).Inode.StableAttr
+			attrs[name] = DentAttr{
+				Type:    sattr.Type,
+				InodeID: sattr.InodeID,
+			}
+			names = append(names, name)
+		}
+	}
+
+	sort.Strings(names)
+
+	if int(offset) >= len(names) {
+		return offset, nil
+	}
+	names = names[int(offset):]
+	for _, name := range names {
+		if err := dirCtx.DirEmit(name, attrs[name]); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return offset, nil
+}
+
+// DirIterator is an open directory containing directory entries that can be read.
+type DirIterator interface {
+	// IterateDir emits directory entries by calling dirCtx.DirEmit, beginning
+	// with the entry at offset and returning the next directory offset.
+	//
+	// Entries for "." and ".." must *not* be included.
+	//
+	// If the offset returned is the same as the argument offset, then
+	// nothing has been serialized.
This is equivalent to reaching EOF. + // In this case serializer.Written() should return 0. + // + // The order of entries to emit must be consistent between Readdir + // calls, and must start with the given offset. + // + // The caller must ensure that this operation is permitted. + IterateDir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) +} + +// DirentReaddir serializes the directory entries of d including "." and "..". +// +// Arguments: +// +// * d: the Dirent of the directory being read; required to provide "." and "..". +// * it: the directory iterator; which represents an open directory handle. +// * root: fs root; if d is equal to the root, then '..' will refer to d. +// * ctx: context provided to file systems in order to select and serialize entries. +// * offset: the current directory offset. +// +// Returns the offset of the *next* element which was not serialized. +func DirentReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) { + offset, err := direntReaddir(ctx, d, it, root, dirCtx, offset) + // Serializing any directory entries at all means success. + if dirCtx.Serializer.Written() > 0 { + return offset, nil + } + return offset, err +} + +func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) { + if root == nil { + panic("Dirent.Readdir: root must not be nil") + } + if dirCtx.Serializer == nil { + panic("Dirent.Readdir: serializer must not be nil") + } + if d.frozen { + return d.readdirFrozen(root, offset, dirCtx) + } + + // Check that this is actually a directory before emitting anything. + // Once we have written entries for "." and "..", future errors from + // IterateDir will be hidden. + if !IsDir(d.Inode.StableAttr) { + return 0, syserror.ENOTDIR + } + + // Collect attrs for "." and "..". + dot, dotdot := d.getDotAttrs(root) + + // Emit "." and ".." if the offset is low enough. 
+	if offset == 0 {
+		// Serialize ".".
+		if err := dirCtx.DirEmit(".", dot); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	if offset == 1 {
+		// Serialize "..".
+		if err := dirCtx.DirEmit("..", dotdot); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+
+	// it.IterateDir should be passed an offset that does not include the
+	// initial dot elements. We will add them back later.
+	offset -= 2
+	newOffset, err := it.IterateDir(ctx, dirCtx, int(offset))
+	if int64(newOffset) < offset {
+		// The message names the returned offset first, then the input
+		// offset, so pass newOffset before offset (the original had
+		// the arguments swapped).
+		panic(fmt.Sprintf("node.Readdir returned offset %v less than input offset %v", newOffset, offset))
+	}
+	// Add the initial nodes back to the offset count.
+	newOffset += 2
+	return int64(newOffset), err
+}
+
+// flush flushes all weak references recursively, and removes any cached
+// references to children.
+//
+// Preconditions: d.mu must be held.
+func (d *Dirent) flush() {
+	expired := make(map[string]*refs.WeakRef)
+	for n, w := range d.children {
+		// Call flush recursively on each child before removing our
+		// reference on it, and removing the cache's reference.
+		if child := w.Get(); child != nil {
+			cd := child.(*Dirent)
+
+			if !cd.IsNegative() {
+				// Flush the child.
+				cd.mu.Lock()
+				cd.flush()
+				cd.mu.Unlock()
+
+				// Allow the file system to drop extra references on child.
+				cd.dropExtendedReference()
+			}
+
+			// Don't leak a reference.
+			child.DecRef()
+		}
+		// Check if the child dirent is closed, and mark it as expired if it is.
+		// We must call w.Get() again here, since the child could have been closed
+		// by the calls to flush() and dropExtendedReference() in the above if-block.
+		if child := w.Get(); child != nil {
+			child.DecRef()
+		} else {
+			expired[n] = w
+		}
+	}
+
+	// Remove expired entries.
+	for n, w := range expired {
+		delete(d.children, n)
+		w.Drop()
+	}
+}
+
+// Busy indicates whether this Dirent is a mount point or root dirent, or has
+// active positive children.
+//
+// This is expensive, since it flushes the children cache.
</doc_update>
+// +// TODO: Fix this busy-ness check. +func (d *Dirent) Busy() bool { + d.mu.Lock() + defer d.mu.Unlock() + + if d.mounted || d.parent == nil { + return true + } + + // Flush any cached references to children that are doomed. + d.flush() + + // Count positive children. + var nonNegative int + for _, w := range d.children { + if child := w.Get(); child != nil { + if !child.(*Dirent).IsNegative() { + nonNegative++ + } + child.DecRef() + } + } + return nonNegative > 0 +} + +// mount mounts a new dirent with the given inode over d. +// +// Precondition: must be called with mm.withMountLocked held on `d`. +func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err error) { + // Did we race with deletion? + if atomic.LoadInt32(&d.deleted) != 0 { + return nil, syserror.ENOENT + } + + // Refuse to mount a symlink. + // + // See Linux equivalent in fs/namespace.c:do_add_mount. + if IsSymlink(inode.StableAttr) { + return nil, syserror.EINVAL + } + + // Are we frozen? + if d.parent.frozen && !d.parent.Inode.IsVirtual() { + return nil, syserror.ENOENT + } + + // Dirent that'll replace d. + // + // Note that NewDirent returns with one reference taken; the reference + // is donated to the caller as the mount reference. + replacement := NewDirent(inode, d.name) + replacement.mounted = true + + weakRef, ok := d.parent.hashChild(replacement) + if !ok { + panic("mount must mount over an existing dirent") + } + weakRef.Drop() + + // Note that even though `d` is now hidden, it still holds a reference + // to its parent. + return replacement, nil +} + +// unmount unmounts `d` and replaces it with the last Dirent that was in its +// place, supplied by the MountNamespace as `replacement`. +// +// Precondition: must be called with mm.withMountLocked held on `d`. +func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { + // Did we race with deletion? + if atomic.LoadInt32(&d.deleted) != 0 { + return syserror.ENOENT + } + + // Are we frozen? 
+ if d.parent.frozen && !d.parent.Inode.IsVirtual() { + return syserror.ENOENT + } + + // Remount our former child in its place. + // + // As replacement used to be our child, it must already have the right + // parent. + weakRef, ok := d.parent.hashChildParentSet(replacement) + if !ok { + panic("mount must mount over an existing dirent") + } + weakRef.Drop() + + // d is not reachable anymore, and hence not mounted anymore. + d.mounted = false + + // Drop mount reference. + d.DecRef() + return nil +} + +// Remove removes the given file or symlink. The root dirent is used to +// resolve name, and must not be nil. +func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string) error { + // Check the root. + if root == nil { + panic("Dirent.Remove: root must not be nil") + } + + unlock := d.lockDirectory() + defer unlock() + + // Are we frozen? + if d.frozen && !d.Inode.IsVirtual() { + return syscall.ENOENT + } + + // Try to walk to the node. + child, err := d.walk(ctx, root, name, false /* may unlock */) + if err != nil { + // Child does not exist. + return err + } + defer child.DecRef() + + // Remove cannot remove directories. + if IsDir(child.Inode.StableAttr) { + return syscall.EISDIR + } + + // Remove cannot remove a mount point. + if child.Busy() { + return syscall.EBUSY + } + + // Try to remove name on the file system. + if err := d.Inode.Remove(ctx, d, child); err != nil { + return err + } + + // Link count changed, this only applies to non-directory nodes. + child.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) + + // Mark name as deleted and remove from children. + atomic.StoreInt32(&child.deleted, 1) + if w, ok := d.children[name]; ok { + delete(d.children, name) + w.Drop() + } + + // Allow the file system to drop extra references on child. + child.dropExtendedReference() + + // Finally, let inotify know the child is being unlinked. Drop any extra + // refs from inotify to this child dirent. 
This doesn't necessarily mean the + // watches on the underlying inode will be destroyed, since the underlying + // inode may have other links. If this was the last link, the events for the + // watch removal will be queued by the inode destructor. + child.Inode.Watches.MarkUnlinked() + child.Inode.Watches.Unpin(child) + d.Inode.Watches.Notify(name, linux.IN_DELETE, 0) + + return nil +} + +// RemoveDirectory removes the given directory. The root dirent is used to +// resolve name, and must not be nil. +func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) error { + // Check the root. + if root == nil { + panic("Dirent.Remove: root must not be nil") + } + + unlock := d.lockDirectory() + defer unlock() + + // Are we frozen? + if d.frozen && !d.Inode.IsVirtual() { + return syscall.ENOENT + } + + // Check for dots. + if name == "." { + // Rejected as the last component by rmdir(2). + return syscall.EINVAL + } + if name == ".." { + // If d was found, then its parent is not empty. + return syscall.ENOTEMPTY + } + + // Try to walk to the node. + child, err := d.walk(ctx, root, name, false /* may unlock */) + if err != nil { + // Child does not exist. + return err + } + defer child.DecRef() + + // RemoveDirectory can only remove directories. + if !IsDir(child.Inode.StableAttr) { + return syscall.ENOTDIR + } + + // Remove cannot remove a mount point. + if child.Busy() { + return syscall.EBUSY + } + + // Try to remove name on the file system. + if err := d.Inode.Remove(ctx, d, child); err != nil { + return err + } + + // Mark name as deleted and remove from children. + atomic.StoreInt32(&child.deleted, 1) + if w, ok := d.children[name]; ok { + delete(d.children, name) + w.Drop() + } + + // Allow the file system to drop extra references on child. + child.dropExtendedReference() + + // Finally, let inotify know the child is being unlinked. Drop any extra + // refs from inotify to this child dirent. 
+ child.Inode.Watches.MarkUnlinked() + child.Inode.Watches.Unpin(child) + d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0) + + return nil +} + +// destroy closes this node and all children. +func (d *Dirent) destroy() { + if d.IsNegative() { + // Nothing to tear-down and no parent references to drop, since a negative + // Dirent does not take a references on its parent, has no Inode and no children. + return + } + + var wg sync.WaitGroup + defer wg.Wait() + d.mu.Lock() + defer d.mu.Unlock() + + // Drop all weak references. + for _, w := range d.children { + w.Drop() + } + d.children = nil + + allDirents.remove(d) + + // Drop our reference to the Inode. + d.Inode.DecRef() + + // Allow the Dirent to be GC'ed after this point, since the Inode may still + // be referenced after the Dirent is destroyed (for instance by filesystem + // internal caches or hard links). + d.Inode = nil + + // Drop the reference we have on our parent if we took one. renameMu doesn't need to be + // held because d can't be reparented without any references to it left. + if d.parent != nil { + d.parent.DecRef() + } +} + +// IncRef increases the Dirent's refcount as well as its mount's refcount. +// +// IncRef implements RefCounter.IncRef. +func (d *Dirent) IncRef() { + if d.Inode != nil { + d.Inode.MountSource.IncDirentRefs() + } + d.AtomicRefCount.IncRef() +} + +// TryIncRef implements RefCounter.TryIncRef. +func (d *Dirent) TryIncRef() bool { + ok := d.AtomicRefCount.TryIncRef() + if ok && d.Inode != nil { + d.Inode.MountSource.IncDirentRefs() + } + return ok +} + +// DecRef decreases the Dirent's refcount and drops its reference on its mount. +// +// DecRef implements RefCounter.DecRef with destructor d.destroy. +func (d *Dirent) DecRef() { + if d.Inode != nil { + // Keep mount around, since DecRef may destroy d.Inode. 
+ msrc := d.Inode.MountSource + d.DecRefWithDestructor(d.destroy) + msrc.DecDirentRefs() + } else { + d.DecRefWithDestructor(d.destroy) + } +} + +// InotifyEvent notifies all watches on the inode for this dirent and its parent +// of potential events. The events may not actually propagate up to the user, +// depending on the event masks. InotifyEvent automatically provides the name of +// the current dirent as the subject of the event as required, and adds the +// IN_ISDIR flag for dirents that refer to directories. +func (d *Dirent) InotifyEvent(events, cookie uint32) { + // N.B. We don't defer the unlocks because InotifyEvent is in the hot + // path of all IO operations, and the defers cost too much for small IO + // operations. + renameMu.RLock() + + if IsDir(d.Inode.StableAttr) { + events |= linux.IN_ISDIR + } + + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + d.parent.Inode.Watches.Notify(d.name, events, cookie) + } + d.Inode.Watches.Notify("", events, cookie) + + renameMu.RUnlock() +} + +// maybeExtendReference caches a reference on this Dirent if +// MountSourceOperations.Keep returns true. +func (d *Dirent) maybeExtendReference() { + if msrc := d.Inode.MountSource; msrc.Keep(d) { + msrc.fscache.Add(d) + } +} + +// dropExtendedReference drops any cached reference held by the +// MountSource on the dirent. +func (d *Dirent) dropExtendedReference() { + d.Inode.MountSource.fscache.Remove(d) +} + +// lockForRename takes locks on oldParent and newParent as required by Rename +// and returns a function that will unlock the locks taken. The returned +// function must be called even if a non-nil error is returned. 
+func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) { + if oldParent == newParent { + oldParent.mu.Lock() + return oldParent.mu.Unlock, nil + } + + // Renaming between directories is a bit subtle: + // + // - A concurrent cross-directory Rename may try to lock in the opposite + // order; take renameMu to prevent this from happening. + // + // - If either directory is an ancestor of the other, then a concurrent + // Remove may lock the descendant (in DecRef -> closeAll) while holding a + // lock on the ancestor; to avoid this, ensure we take locks in the same + // ancestor-to-descendant order. (Holding renameMu prevents this + // relationship from changing.) + renameMu.Lock() + + // First check if newParent is a descendant of oldParent. + child := newParent + for p := newParent.parent; p != nil; p = p.parent { + if p == oldParent { + oldParent.mu.Lock() + newParent.mu.Lock() + var err error + if child.name == oldName { + // newParent is not just a descendant of oldParent, but + // more specifically of oldParent/oldName. That is, we're + // trying to rename something into a subdirectory of + // itself. 
+ err = syscall.EINVAL + } + return func() { + newParent.mu.Unlock() + oldParent.mu.Unlock() + renameMu.Unlock() + }, err + } + child = p + } + + // Otherwise, either oldParent is a descendant of newParent or the two + // have no relationship; in either case we can do this: + newParent.mu.Lock() + oldParent.mu.Lock() + return func() { + oldParent.mu.Unlock() + newParent.mu.Unlock() + renameMu.Unlock() + }, nil +} + +func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error { + uattr, err := dir.Inode.UnstableAttr(ctx) + if err != nil { + return syserror.EPERM + } + if !uattr.Perms.Sticky { + return nil + } + + creds := auth.CredentialsFromContext(ctx) + if uattr.Owner.UID == creds.EffectiveKUID { + return nil + } + + vuattr, err := victim.Inode.UnstableAttr(ctx) + if err != nil { + return syserror.EPERM + } + if vuattr.Owner.UID == creds.EffectiveKUID { + return nil + } + if victim.Inode.CheckCapability(ctx, linux.CAP_FOWNER) { + return nil + } + return syserror.EPERM +} + +// MayDelete determines whether `name`, a child of `dir`, can be deleted or +// renamed by `ctx`. +// +// Compare Linux kernel fs/namei.c:may_delete. +func MayDelete(ctx context.Context, root, dir *Dirent, name string) error { + victim, err := dir.Walk(ctx, root, name) + if err != nil { + return err + } + defer victim.DecRef() + + return mayDelete(ctx, dir, victim) +} + +func mayDelete(ctx context.Context, dir *Dirent, victim *Dirent) error { + if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { + return err + } + + return checkSticky(ctx, dir, victim) +} + +// Rename atomically converts the child of oldParent named oldName to a +// child of newParent named newName. 
+func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string, newParent *Dirent, newName string) error { + if root == nil { + panic("Rename: root must not be nil") + } + if oldParent == newParent && oldName == newName { + return nil + } + + // Acquire global renameMu lock, and mu locks on oldParent/newParent. + unlock, err := lockForRename(oldParent, oldName, newParent, newName) + defer unlock() + if err != nil { + return err + } + + // Are we frozen? + // TODO: Is this the right errno? + if oldParent.frozen && !oldParent.Inode.IsVirtual() { + return syscall.ENOENT + } + if newParent.frozen && !newParent.Inode.IsVirtual() { + return syscall.ENOENT + } + + // Check constraints on the object being renamed. + renamed, err := oldParent.walk(ctx, root, oldName, false /* may unlock */) + if err != nil { + return err + } + defer renamed.DecRef() + + // Make sure we have write permissions on old and new parent. + if err := mayDelete(ctx, oldParent, renamed); err != nil { + return err + } + if newParent != oldParent { + if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { + return err + } + } + + // Source should not be an ancestor of the target. + if renamed == newParent { + return syscall.EINVAL + } + + // Is the thing we're trying to rename busy? + if renamed.Busy() { + return syscall.EBUSY + } + + // Per rename(2): "... EACCES: ... or oldpath is a directory and does not + // allow write permission (needed to update the .. entry)." + if IsDir(renamed.Inode.StableAttr) { + if err := renamed.Inode.CheckPermission(ctx, PermMask{Write: true}); err != nil { + return err + } + } + + // Check constraints on the object being replaced, if any. + replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */) + if err == nil { + defer replaced.DecRef() + + // Target should not be an ancestor of source. + if replaced == oldParent { + // Why is this not EINVAL? See fs/namei.c. 
+ return syscall.ENOTEMPTY + } + + // Is the thing we're trying to replace busy? + if replaced.Busy() { + return syscall.EBUSY + } + + // Require that a directory is replaced by a directory. + oldIsDir := IsDir(renamed.Inode.StableAttr) + newIsDir := IsDir(replaced.Inode.StableAttr) + if !newIsDir && oldIsDir { + return syscall.ENOTDIR + } + if !oldIsDir && newIsDir { + return syscall.EISDIR + } + + // Allow the file system to drop extra references on replaced. + replaced.dropExtendedReference() + + // NOTE: Keeping a dirent + // open across renames is currently broken for multiple + // reasons, so we flush all references on the replaced node and + // its children. + replaced.Inode.Watches.Unpin(replaced) + replaced.flush() + } + + if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName); err != nil { + return err + } + + renamed.name = newName + renamed.parent = newParent + if oldParent != newParent { + // Reparent the reference held by renamed.parent. oldParent.DecRef + // can't destroy oldParent (and try to retake its lock) because + // Rename's caller must be holding a reference. + newParent.IncRef() + oldParent.DecRef() + } + if w, ok := newParent.children[newName]; ok { + w.Drop() + delete(newParent.children, newName) + } + if w, ok := oldParent.children[oldName]; ok { + w.Drop() + delete(oldParent.children, oldName) + } + + // Add a weak reference from the new parent. This ensures that the child + // can still be found from the new parent if a prior hard reference is + // held on renamed. + // + // This is required for file lock correctness because file locks are per-Dirent + // and without maintaining the a cached child (via a weak reference) for renamed, + // multiple Dirents can correspond to the same resource (by virtue of the renamed + // Dirent being unreachable by its parent and it being looked up). + newParent.children[newName] = refs.NewWeakRef(renamed, nil) + + // Queue inotify events for the rename. 
+ var ev uint32 + if IsDir(renamed.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + + cookie := uniqueid.InotifyCookie(ctx) + oldParent.Inode.Watches.Notify(oldName, ev|linux.IN_MOVED_FROM, cookie) + newParent.Inode.Watches.Notify(newName, ev|linux.IN_MOVED_TO, cookie) + // Somewhat surprisingly, self move events do not have a cookie. + renamed.Inode.Watches.Notify("", linux.IN_MOVE_SELF, 0) + + // Allow the file system to drop extra references on renamed. + renamed.dropExtendedReference() + + // Same as replaced.flush above. + renamed.flush() + + return nil +} diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go new file mode 100644 index 000000000..e786e4f65 --- /dev/null +++ b/pkg/sentry/fs/dirent_cache.go @@ -0,0 +1,142 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" +) + +// DirentCache is an LRU cache of Dirents. The Dirent's refCount is +// incremented when it is added to the cache, and decremented when it is +// removed. +// +// A nil DirentCache corresponds to a cache with size 0. All methods can be +// called, but nothing is actually cached. +type DirentCache struct { + // Maximum size of the cache. This must be saved manually, to handle the case + // when cache is nil. + maxSize uint64 + + // mu protects currentSize and direntList. + mu sync.Mutex `state:"nosave"` + + // currentSize is the number of elements in the cache. It must be zero (i.e. 
+ // the cache must be empty) on Save. + currentSize uint64 `state:"zerovalue"` + + // list is a direntList, an ilist of Dirents. New Dirents are added + // to the front of the list. Old Dirents are removed from the back of + // the list. It must be zerovalue (i.e. the cache must be empty) on Save. + list direntList `state:"zerovalue"` +} + +// NewDirentCache returns a new DirentCache with the given maxSize. If maxSize +// is 0, nil is returned. +func NewDirentCache(maxSize uint64) *DirentCache { + return &DirentCache{ + maxSize: maxSize, + } +} + +// Add adds the element to the cache and increments the refCount. If the +// argument is already in the cache, it is moved to the front. An element is +// removed from the back if the cache is over capacity. +func (c *DirentCache) Add(d *Dirent) { + if c == nil || c.maxSize == 0 { + return + } + + c.mu.Lock() + if c.contains(d) { + // d is already in cache. Bump it to the front. + // currentSize and refCount are unaffected. + c.list.Remove(d) + c.list.PushFront(d) + c.mu.Unlock() + return + } + + // d is not in cache. Add it and take a reference. + c.list.PushFront(d) + d.IncRef() + c.currentSize++ + + // Remove the oldest until we are under the size limit. + for c.maxSize > 0 && c.currentSize > c.maxSize { + c.remove(c.list.Back()) + } + c.mu.Unlock() +} + +func (c *DirentCache) remove(d *Dirent) { + if !c.contains(d) { + panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) + } + c.list.Remove(d) + d.SetPrev(nil) + d.SetNext(nil) + d.DecRef() + c.currentSize-- +} + +// Remove removes the element from the cache and decrements its refCount. It +// also sets the previous and next elements to nil, which allows us to +// determine if a given element is in the cache. 
+func (c *DirentCache) Remove(d *Dirent) { + if c == nil || c.maxSize == 0 { + return + } + c.mu.Lock() + if !c.contains(d) { + c.mu.Unlock() + return + } + c.remove(d) + c.mu.Unlock() +} + +// Size returns the number of elements in the cache. +func (c *DirentCache) Size() uint64 { + if c == nil { + return 0 + } + c.mu.Lock() + size := c.currentSize + c.mu.Unlock() + return size +} + +func (c *DirentCache) contains(d *Dirent) bool { + // If d has a Prev or Next element, then it is in the cache. + if d.Prev() != nil || d.Next() != nil { + return true + } + // Otherwise, d is in the cache if it is the only element (and thus the + // first element). + return c.list.Front() == d +} + +// Invalidate removes all Dirents from the cache, calling DecRef on each. +func (c *DirentCache) Invalidate() { + if c == nil { + return + } + c.mu.Lock() + for c.list.Front() != nil { + c.remove(c.list.Front()) + } + c.mu.Unlock() +} diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go new file mode 100644 index 000000000..82b7f6bd5 --- /dev/null +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -0,0 +1,157 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "testing" +) + +func TestDirentCache(t *testing.T) { + const maxSize = 5 + + c := NewDirentCache(maxSize) + + // Size starts at 0. 
+ if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // Create a Dirent d. + d := NewNegativeDirent("") + + // c does not contain d. + if got, want := c.contains(d), false; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Add d to the cache. + c.Add(d) + + // Size is now 1. + if got, want := c.Size(), uint64(1); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c contains d. + if got, want := c.contains(d), true; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Add maxSize-1 more elements. d should be oldest element. + for i := 0; i < maxSize-1; i++ { + c.Add(NewNegativeDirent("")) + } + + // Size is maxSize. + if got, want := c.Size(), uint64(maxSize); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c contains d. + if got, want := c.contains(d), true; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // "Bump" d to the front by re-adding it. + c.Add(d) + + // Size is maxSize. + if got, want := c.Size(), uint64(maxSize); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c contains d. + if got, want := c.contains(d), true; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Add maxSize-1 more elements. d should again be oldest element. + for i := 0; i < maxSize-1; i++ { + c.Add(NewNegativeDirent("")) + } + + // Size is maxSize. + if got, want := c.Size(), uint64(maxSize); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c contains d. + if got, want := c.contains(d), true; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Add one more element, which will bump d from the cache. + c.Add(NewNegativeDirent("")) + + // Size is maxSize. + if got, want := c.Size(), uint64(maxSize); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c does not contain d. 
+ if got, want := c.contains(d), false; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Invalidating causes size to be 0 and list to be empty. + c.Invalidate() + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + if got, want := c.list.Empty(), true; got != want { + t.Errorf("c.list.Empty() got %v, want %v", got, want) + } + + // Fill cache with maxSize dirents. + for i := 0; i < maxSize; i++ { + c.Add(NewNegativeDirent("")) + } +} + +// TestNilDirentCache tests that a nil cache supports all cache operations, but +// treats them as noop. +func TestNilDirentCache(t *testing.T) { + // Create a nil cache. + var c *DirentCache + + // Size is zero. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // Call Add. + c.Add(NewNegativeDirent("")) + + // Size is zero. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // Call Remove. + c.Remove(NewNegativeDirent("")) + + // Size is zero. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // Call Invalidate. + c.Invalidate() + + // Size is zero. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } +} diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go new file mode 100644 index 000000000..8ce9ba02d --- /dev/null +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -0,0 +1,417 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" +) + +func newMockDirInode(ctx context.Context, cache *DirentCache) *Inode { + return NewMockInode(ctx, NewMockMountSource(cache), StableAttr{Type: Directory}) +} + +func TestWalkPositive(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + root := NewDirent(newMockDirInode(ctx, nil), "root") + + if got := root.TestReadRefs(); got != 0 { + t.Fatalf("root has a ref count of %d, want %d", got, 0) + } + + name := "d" + d, err := root.walk(ctx, root, name, false) + if err != nil { + t.Fatalf("root.walk(root, %q) got %v, want nil", name, err) + } + + if got := root.TestReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) + } + + if got := d.TestReadRefs(); got != 0 { + t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) + } + + d.DecRef() + + if got := root.TestReadRefs(); got != 0 { + t.Fatalf("root has a ref count of %d, want %d", got, 0) + } + + if got := d.TestReadRefs(); got != -1 { + t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, -1) + } + + root.flush() + + if got := len(root.children); got != 0 { + t.Fatalf("root has %d children, want %d", got, 0) + } +} + +func TestWalkNegative(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. 
+ + ctx := contexttest.Context(t) + root := NewDirent(NewEmptyDir(ctx, nil), "root") + mn := root.Inode.InodeOperations.(*mockInodeOperationsLookupNegative) + + if got := root.TestReadRefs(); got != 0 { + t.Fatalf("root has a ref count of %d, want %d", got, 0) + } + + name := "d" + for i := 0; i < 100; i++ { + _, err := root.walk(ctx, root, name, false) + if err != syscall.ENOENT { + t.Fatalf("root.walk(root, %q) got %v, want %v", name, err, syscall.ENOENT) + } + } + + if got := root.TestReadRefs(); got != 0 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) + } + + if got := len(root.children); got != 1 { + t.Fatalf("root has %d children, want %d", got, 1) + } + + w, ok := root.children[name] + if !ok { + t.Fatalf("root wants child at %q", name) + } + + child := w.Get() + if child == nil { + t.Fatalf("root wants to resolve weak reference") + } + + if !child.(*Dirent).IsNegative() { + t.Fatalf("root found positive child at %q, want negative", name) + } + + if got := child.(*Dirent).TestReadRefs(); got != 1 { + t.Fatalf("child has a ref count of %d, want %d", got, 1) + } + + child.DecRef() + + if got := child.(*Dirent).TestReadRefs(); got != 0 { + t.Fatalf("child has a ref count of %d, want %d", got, 0) + } + + if got := len(root.children); got != 1 { + t.Fatalf("root has %d children, want %d", got, 1) + } + + root.DecRef() + + if got := root.TestReadRefs(); got != -1 { + t.Fatalf("root has a ref count of %d, want %d", got, 0) + } + + AsyncBarrier() + + if got := mn.releaseCalled; got != true { + t.Fatalf("root.Close was called %v, want true", got) + } +} + +type mockInodeOperationsLookupNegative struct { + *MockInodeOperations + releaseCalled bool +} + +func NewEmptyDir(ctx context.Context, cache *DirentCache) *Inode { + m := NewMockMountSource(cache) + return NewInode(&mockInodeOperationsLookupNegative{ + MockInodeOperations: NewMockInodeOperations(ctx), + }, m, StableAttr{Type: Directory}) +} + +func (m *mockInodeOperationsLookupNegative) Lookup(ctx 
context.Context, dir *Inode, p string) (*Dirent, error) { + return NewNegativeDirent(p), nil +} + +func (m *mockInodeOperationsLookupNegative) Release(context.Context) { + m.releaseCalled = true +} + +func TestHashNegativeToPositive(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + root := NewDirent(NewEmptyDir(ctx, nil), "root") + + name := "d" + _, err := root.walk(ctx, root, name, false) + if err != syscall.ENOENT { + t.Fatalf("root.walk(root, %q) got %v, want %v", name, err, syscall.ENOENT) + } + + if got := root.exists(ctx, root, name); got != false { + t.Fatalf("got %q exists, want does not exist", name) + } + + f, err := root.Create(ctx, root, name, FileFlags{}, FilePermissions{}) + if err != nil { + t.Fatalf("root.Create(%q, _), got error %v, want nil", name, err) + } + d := f.Dirent + + if d.IsNegative() { + t.Fatalf("got negative Dirent, want positive") + } + + if got := d.TestReadRefs(); got != 0 { + t.Fatalf("child %q has a ref count of %d, want %d", name, got, 0) + } + + if got := root.TestReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) + } + + if got := len(root.children); got != 1 { + t.Fatalf("got %d children, want %d", got, 1) + } + + w, ok := root.children[name] + if !ok { + t.Fatalf("failed to find weak reference to %q", name) + } + + child := w.Get() + if child == nil { + t.Fatalf("want to resolve weak reference") + } + + if child.(*Dirent) != d { + t.Fatalf("got foreign child") + } +} + +func TestRevalidate(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + for _, test := range []struct { + // desc is the test's description. + desc string + + // Whether to make negative Dirents. 
+ makeNegative bool + }{ + { + desc: "Revalidate negative Dirent", + makeNegative: true, + }, + { + desc: "Revalidate positive Dirent", + makeNegative: false, + }, + } { + t.Run(test.desc, func(t *testing.T) { + root := NewDirent(NewMockInodeRevalidate(ctx, test.makeNegative), "root") + + name := "d" + d1, err := root.walk(ctx, root, name, false) + if !test.makeNegative && err != nil { + t.Fatalf("root.walk(root, %q) got %v, want nil", name, err) + } + d2, err := root.walk(ctx, root, name, false) + if !test.makeNegative && err != nil { + t.Fatalf("root.walk(root, %q) got %v, want nil", name, err) + } + if !test.makeNegative && d1 == d2 { + t.Fatalf("revalidating walk got same *Dirent, want different") + } + if got := len(root.children); got != 1 { + t.Errorf("revalidating walk got %d children, want %d", got, 1) + } + }) + } +} + +type MockInodeOperationsRevalidate struct { + *MockInodeOperations + makeNegative bool +} + +func NewMockInodeRevalidate(ctx context.Context, makeNegative bool) *Inode { + mn := NewMockInodeOperations(ctx) + m := NewMockMountSource(nil) + m.MountSourceOperations.(*MockMountSourceOps).revalidate = true + return NewInode(&MockInodeOperationsRevalidate{MockInodeOperations: mn, makeNegative: makeNegative}, m, StableAttr{Type: Directory}) +} + +func (m *MockInodeOperationsRevalidate) Lookup(ctx context.Context, dir *Inode, p string) (*Dirent, error) { + if !m.makeNegative { + return m.MockInodeOperations.Lookup(ctx, dir, p) + } + return NewNegativeDirent(p), nil +} + +func TestCreateExtraRefs(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + for _, test := range []struct { + // desc is the test's description. + desc string + + // root is the Dirent to create from. + root *Dirent + + // expected references on walked Dirent. 
+ refs int64 + }{ + { + desc: "Create caching", + root: NewDirent(NewEmptyDir(ctx, NewDirentCache(1)), "root"), + refs: 1, + }, + { + desc: "Create not caching", + root: NewDirent(NewEmptyDir(ctx, nil), "root"), + refs: 0, + }, + } { + t.Run(test.desc, func(t *testing.T) { + name := "d" + f, err := test.root.Create(ctx, test.root, name, FileFlags{}, FilePermissions{}) + if err != nil { + t.Fatalf("root.Create(root, %q) failed: %v", name, err) + } + d := f.Dirent + + if got := d.TestReadRefs(); got != test.refs { + t.Errorf("dirent has a ref count of %d, want %d", got, test.refs) + } + }) + } +} + +func TestRemoveExtraRefs(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + for _, test := range []struct { + // desc is the test's description. + desc string + + // root is the Dirent to make and remove from. + root *Dirent + }{ + { + desc: "Remove caching", + root: NewDirent(NewEmptyDir(ctx, NewDirentCache(1)), "root"), + }, + { + desc: "Remove not caching", + root: NewDirent(NewEmptyDir(ctx, nil), "root"), + }, + } { + t.Run(test.desc, func(t *testing.T) { + name := "d" + f, err := test.root.Create(ctx, test.root, name, FileFlags{}, FilePermissions{}) + if err != nil { + t.Fatalf("root.Create(%q, _) failed: %v", name, err) + } + d := f.Dirent + + if err := test.root.Remove(contexttest.Context(t), test.root, name); err != nil { + t.Fatalf("root.Remove(root, %q) failed: %v", name, err) + } + + if got := d.TestReadRefs(); got != 0 { + t.Fatalf("dirent has a ref count of %d, want %d", got, 0) + } + + d.DecRef() + + test.root.flush() + + if got := len(test.root.children); got != 0 { + t.Errorf("root has %d children, want %d", got, 0) + } + }) + } +} + +func TestRenameExtraRefs(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + for _, test := range []struct { + // desc is the test's description. 
+ desc string + + // cache of extra Dirent references, may be nil. + cache *DirentCache + }{ + { + desc: "Rename no caching", + cache: nil, + }, + { + desc: "Rename caching", + cache: NewDirentCache(5), + }, + } { + t.Run(test.desc, func(t *testing.T) { + dirAttr := StableAttr{Type: Directory} + + oldParent := NewDirent(NewMockInode(ctx, NewMockMountSource(test.cache), dirAttr), "old_parent") + newParent := NewDirent(NewMockInode(ctx, NewMockMountSource(test.cache), dirAttr), "new_parent") + + renamed, err := oldParent.Walk(ctx, oldParent, "old_child") + if err != nil { + t.Fatalf("Walk(oldParent, %q) got error %v, want nil", "old_child", err) + } + replaced, err := newParent.Walk(ctx, oldParent, "new_child") + if err != nil { + t.Fatalf("Walk(newParent, %q) got error %v, want nil", "new_child", err) + } + + if err := Rename(contexttest.RootContext(t), oldParent /*root */, oldParent, "old_child", newParent, "new_child"); err != nil { + t.Fatalf("Rename got error %v, want nil", err) + } + + oldParent.flush() + newParent.flush() + + // Expect to have only active references. + if got := renamed.TestReadRefs(); got != 0 { + t.Errorf("renamed has ref count %d, want only active references %d", got, 0) + } + if got := replaced.TestReadRefs(); got != 0 { + t.Errorf("replaced has ref count %d, want only active references %d", got, 0) + } + }) + } +} diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go new file mode 100644 index 000000000..c6a1b5e38 --- /dev/null +++ b/pkg/sentry/fs/dirent_state.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync/atomic" +) + +// beforeSave is invoked by stateify. +func (d *Dirent) beforeSave() { + // Refuse to save if the file has already been deleted (but still has + // open fds, which is why the Dirent is still accessible). We know that + // the restore opening of the file will always fail. This condition will + // last until all the open fds and this Dirent are closed and released. + // + // Note that this is rejection rather than failure---it would be + // perfectly OK to save---we are simply disallowing it here to prevent + // generating non-restorable state dumps. As the program continues its + // execution, it may become allowed to save again. + if atomic.LoadInt32(&d.deleted) != 0 { + n, _ := d.FullName(nil /* root */) + panic(ErrSaveRejection{fmt.Errorf("deleted file %q still has open fds", n)}) + } +} + +// afterLoad is invoked by stateify. 
+func (d *Dirent) afterLoad() { + if d.userVisible { + allDirents.add(d) + } +} diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD new file mode 100644 index 000000000..9e1f65d3e --- /dev/null +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -0,0 +1,76 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "pipe_state", + srcs = [ + "pipe.go", + "pipe_state.go", + ], + out = "pipe_autogen_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/fs"], + package = "fdpipe", +) + +go_library( + name = "fdpipe", + srcs = [ + "pipe.go", + "pipe_autogen_state.go", + "pipe_opener.go", + "pipe_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fd", + "//pkg/log", + "//pkg/metric", + "//pkg/p9", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/unet", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + ], +) + +go_test( + name = "fdpipe_test", + size = "small", + srcs = [ + "pipe_opener_test.go", + "pipe_test.go", + ], + embed = [":fdpipe"], + deps = [ + "//pkg/fd", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter/fdnotifier", + "@com_github_google_uuid//:go_default_library", + ], +) diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go new file mode 100644 index 000000000..f7bbd4aff --- 
/dev/null +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -0,0 +1,167 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fdpipe implements common namedpipe opening and accessing logic. +package fdpipe + +import ( + "os" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// pipeOperations are the fs.FileOperations of a host pipe. +type pipeOperations struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + waiter.Queue `state:"nosave"` + + // flags are the flags used to open the pipe. + flags fs.FileFlags `state:".(fs.FileFlags)"` + + // opener is how the pipe was opened. + opener NonBlockingOpener `state:"wait"` + + // file represents the host pipe. + file *fd.FD `state:"nosave"` + + // mu protects readAheadBuffer access below. 
+ mu sync.Mutex `state:"nosave"` + + // readAheadBuffer contains read bytes that have not yet been read + // by the application but need to be buffered for save-restore for correct + // opening semantics. The readAheadBuffer will only be non-empty when the pipe + // is first opened and will be drained by subsequent reads on the pipe. + readAheadBuffer []byte +} + +// newPipeOperations returns an implementation of fs.FileOperations for a pipe. +func newPipeOperations(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags, file *fd.FD, readAheadBuffer []byte) (*pipeOperations, error) { + pipeOps := &pipeOperations{ + flags: flags, + opener: opener, + file: file, + readAheadBuffer: readAheadBuffer, + } + if err := pipeOps.init(); err != nil { + return nil, err + } + return pipeOps, nil +} + +// init initializes p.file. +func (p *pipeOperations) init() error { + var s syscall.Stat_t + if err := syscall.Fstat(p.file.FD(), &s); err != nil { + log.Warningf("pipe: cannot stat fd %d: %v", p.file.FD(), err) + return syscall.EINVAL + } + if s.Mode&syscall.S_IFIFO != syscall.S_IFIFO { + log.Warningf("pipe: cannot load fd %d as pipe, file type: %o", p.file.FD(), s.Mode) + return syscall.EINVAL + } + if err := syscall.SetNonblock(p.file.FD(), true); err != nil { + return err + } + if err := fdnotifier.AddFD(int32(p.file.FD()), &p.Queue); err != nil { + return err + } + return nil +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (p *pipeOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + p.Queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(p.file.FD())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (p *pipeOperations) EventUnregister(e *waiter.Entry) { + p.Queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(p.file.FD())) +} + +// Readiness returns a mask of ready events for stream. 
+func (p *pipeOperations) Readiness(mask waiter.EventMask) (eventMask waiter.EventMask) { + return fdnotifier.NonBlockingPoll(int32(p.file.FD()), mask) +} + +// Release implements fs.FileOperations.Release. +func (p *pipeOperations) Release() { + fdnotifier.RemoveFD(int32(p.file.FD())) + p.file.Close() + p.file = nil +} + +// Read implements fs.FileOperations.Read. +func (p *pipeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + // Drain the read ahead buffer, if it contains anything first. + var bufN int + var bufErr error + p.mu.Lock() + if len(p.readAheadBuffer) > 0 { + bufN, bufErr = dst.CopyOut(ctx, p.readAheadBuffer) + p.readAheadBuffer = p.readAheadBuffer[bufN:] + dst = dst.DropFirst(bufN) + } + p.mu.Unlock() + if dst.NumBytes() == 0 || bufErr != nil { + return int64(bufN), bufErr + } + + // Pipes expect full reads. + n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{secio.FullReader{p.file}}) + total := int64(bufN) + n + if err != nil && isBlockError(err) { + return total, syserror.ErrWouldBlock + } + return total, err +} + +// Write implements fs.FileOperations.Write. +func (p *pipeOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + n, err := src.CopyInTo(ctx, safemem.FromIOWriter{p.file}) + if err != nil && isBlockError(err) { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// isBlockError unwraps os errors and checks if they are caused by EAGAIN or +// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. 
+func isBlockError(err error) bool { + if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK { + return true + } + if pe, ok := err.(*os.PathError); ok { + return isBlockError(pe.Err) + } + return false +} diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go new file mode 100644 index 000000000..a0d59575f --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -0,0 +1,193 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdpipe + +import ( + "io" + "os" + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// NonBlockingOpener is a generic host file opener used to retry opening host +// pipes if necessary. +type NonBlockingOpener interface { + // NonBlockingOpen tries to open a host pipe in a non-blocking way, + // and otherwise returns an error. Implementations should be idempotent. + NonBlockingOpen(context.Context, fs.PermMask) (*fd.FD, error) +} + +// Open blocks until a host pipe can be opened or the action was cancelled. +// On success, returns fs.FileOperations wrapping the opened host pipe. 
+func Open(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (fs.FileOperations, error) { + p := &pipeOpenState{} + canceled := false + for { + if file, err := p.TryOpen(ctx, opener, flags); err != syserror.ErrWouldBlock { + return file, err + } + + // Honor the cancellation request if open still blocks. + if canceled { + // If we were canceled but we have a handle to a host + // file, we need to close it. + if p.hostFile != nil { + p.hostFile.Close() + } + return nil, syserror.ErrInterrupted + } + + cancel := ctx.SleepStart() + select { + case <-cancel: + // The cancellation request received here really says + // "cancel from now on (or ASAP)". Any environmental + // changes happened before receiving it, that might have + // caused open to not block anymore, should still be + // respected. So we cannot just return here. We have to + // give open another try below first. + canceled = true + ctx.SleepFinish(false) + case <-time.After(100 * time.Millisecond): + // If we would block, then delay retrying for a bit, since there + // is no way to know when the pipe would be ready to be + // re-opened. This is identical to sending an event notification + // to stop blocking in Task.Block, given that this routine will + // stop retrying if a cancelation is received. + ctx.SleepFinish(true) + } + } +} + +// pipeOpenState holds state needed to open a blocking named pipe read only, for instance the +// file that has been opened but doesn't yet have a corresponding writer. +type pipeOpenState struct { + // hostFile is the read only named pipe which lacks a corresponding writer. + hostFile *fd.FD +} + +// unwrapError is needed to match against ENXIO primarily. +func unwrapError(err error) error { + if pe, ok := err.(*os.PathError); ok { + return pe.Err + } + return err +} + +// TryOpen uses a NonBlockingOpener to try to open a host pipe, respecting the fs.FileFlags. 
+func (p *pipeOpenState) TryOpen(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (*pipeOperations, error) { + switch { + // Reject invalid configurations so they don't accidentally succeed below. + case !flags.Read && !flags.Write: + return nil, syscall.EINVAL + + // Handle opening RDWR or with O_NONBLOCK: will never block, so try only once. + case (flags.Read && flags.Write) || flags.NonBlocking: + f, err := opener.NonBlockingOpen(ctx, fs.PermMask{Read: flags.Read, Write: flags.Write}) + if err != nil { + return nil, err + } + return newPipeOperations(ctx, opener, flags, f, nil) + + // Handle opening O_WRONLY blocking: convert ENXIO to syserror.ErrWouldBlock. + // See TryOpenWriteOnly for more details. + case flags.Write: + return p.TryOpenWriteOnly(ctx, opener) + + default: + // Handle opening O_RDONLY blocking: convert EOF from read to syserror.ErrWouldBlock. + // See TryOpenReadOnly for more details. + return p.TryOpenReadOnly(ctx, opener) + } +} + +// TryOpenReadOnly tries to open a host pipe read only but only returns a fs.File when +// there is a coordinating writer. Call TryOpenReadOnly repeatedly on the same pipeOpenState +// until syserror.ErrWouldBlock is no longer returned. +// +// How it works: +// +// Opening a pipe read only will return no error, but each non zero Read will return EOF +// until a writer becomes available, then EWOULDBLOCK. This is the only state change +// available to us. We keep a read ahead buffer in case we read bytes instead of getting +// EWOULDBLOCK, to be read from on the first read request to this fs.File. +func (p *pipeOpenState) TryOpenReadOnly(ctx context.Context, opener NonBlockingOpener) (*pipeOperations, error) { + // Waiting for a blocking read only open involves reading from the host pipe until + // bytes or other writers are available, so instead of retrying opening the pipe, + // it's necessary to retry reading from the pipe. 
To do this we need to keep around + // the read only pipe we opened, until success or an irrecoverable read error (at + // which point it must be closed). + if p.hostFile == nil { + var err error + p.hostFile, err = opener.NonBlockingOpen(ctx, fs.PermMask{Read: true}) + if err != nil { + return nil, err + } + } + + // Try to read from the pipe to see if writers are around. + tryReadBuffer := make([]byte, 1) + n, rerr := p.hostFile.Read(tryReadBuffer) + + // No bytes were read. + if n == 0 { + // EOF means that we're not ready yet. + if rerr == nil || rerr == io.EOF { + return nil, syserror.ErrWouldBlock + } + // Any error that is not EWOULDBLOCK also means we're not + // ready yet, and probably never will be ready. In this + // case we need to close the host pipe we opened. + if unwrapError(rerr) != syscall.EWOULDBLOCK { + p.hostFile.Close() + return nil, rerr + } + } + + // If any bytes were read, no matter the corresponding error, we need + // to keep them around so they can be read by the application. + var readAheadBuffer []byte + if n > 0 { + readAheadBuffer = tryReadBuffer + } + + // Successfully opened read only blocking pipe with either bytes available + // to read and/or a writer available. + return newPipeOperations(ctx, opener, fs.FileFlags{Read: true}, p.hostFile, readAheadBuffer) +} + +// TryOpenWriteOnly tries to open a host pipe write only but only returns a fs.File when +// there is a coordinating reader. Call TryOpenWriteOnly repeatedly on the same pipeOpenState +// until syserror.ErrWouldBlock is no longer returned. +// +// How it works: +// +// Opening a pipe write only will return ENXIO until readers are available. Converts the ENXIO +// to an syserror.ErrWouldBlock, to tell callers to retry. 
+func (*pipeOpenState) TryOpenWriteOnly(ctx context.Context, opener NonBlockingOpener) (*pipeOperations, error) { + hostFile, err := opener.NonBlockingOpen(ctx, fs.PermMask{Write: true}) + if unwrapError(err) == syscall.ENXIO { + return nil, syserror.ErrWouldBlock + } + if err != nil { + return nil, err + } + return newPipeOperations(ctx, opener, fs.FileFlags{Write: true}, hostFile, nil) +} diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go new file mode 100644 index 000000000..83f6c1986 --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -0,0 +1,522 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fdpipe + +import ( + "bytes" + "fmt" + "io" + "os" + "path" + "syscall" + "testing" + "time" + + "github.com/google/uuid" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type hostOpener struct { + name string +} + +func (h *hostOpener) NonBlockingOpen(_ context.Context, p fs.PermMask) (*fd.FD, error) { + var flags int + switch { + case p.Read && p.Write: + flags = syscall.O_RDWR + case p.Write: + flags = syscall.O_WRONLY + case p.Read: + flags = syscall.O_RDONLY + default: + return nil, syscall.EINVAL + } + f, err := syscall.Open(h.name, flags|syscall.O_NONBLOCK, 0666) + if err != nil { + return nil, err + } + return fd.New(f), nil +} + +func pipename() string { + return fmt.Sprintf(path.Join(os.TempDir(), "test-named-pipe-%s"), uuid.New()) +} + +func mkpipe(name string) error { + return syscall.Mknod(name, syscall.S_IFIFO|0666, 0) +} + +func TestTryOpen(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // makePipe is true if the test case should create the pipe. + makePipe bool + + // flags are the fs.FileFlags used to open the pipe. + flags fs.FileFlags + + // expectFile is true if a fs.File is expected. 
+ expectFile bool + + // err is the expected error + err error + }{ + { + desc: "FileFlags lacking Read and Write are invalid", + makePipe: false, + flags: fs.FileFlags{}, /* bogus */ + expectFile: false, + err: syscall.EINVAL, + }, + { + desc: "NonBlocking Read only error returns immediately", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Read: true, NonBlocking: true}, + expectFile: false, + err: syscall.ENOENT, + }, + { + desc: "NonBlocking Read only success returns immediately", + makePipe: true, + flags: fs.FileFlags{Read: true, NonBlocking: true}, + expectFile: true, + err: nil, + }, + { + desc: "NonBlocking Write only error returns immediately", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Write: true, NonBlocking: true}, + expectFile: false, + err: syscall.ENOENT, + }, + { + desc: "NonBlocking Write only no reader error returns immediately", + makePipe: true, + flags: fs.FileFlags{Write: true, NonBlocking: true}, + expectFile: false, + err: syscall.ENXIO, + }, + { + desc: "ReadWrite error returns immediately", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Read: true, Write: true}, + expectFile: false, + err: syscall.ENOENT, + }, + { + desc: "ReadWrite returns immediately", + makePipe: true, + flags: fs.FileFlags{Read: true, Write: true}, + expectFile: true, + err: nil, + }, + { + desc: "Blocking Write only returns open error", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Write: true}, + expectFile: false, + err: syscall.ENOENT, /* from bogus perms */ + }, + { + desc: "Blocking Read only returns open error", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Read: true}, + expectFile: false, + err: syscall.ENOENT, + }, + { + desc: "Blocking Write only returns with syserror.ErrWouldBlock", + makePipe: true, + flags: fs.FileFlags{Write: true}, + expectFile: false, + err: syserror.ErrWouldBlock, + }, + { + desc: "Blocking Read only returns with syserror.ErrWouldBlock", + 
makePipe: true, + flags: fs.FileFlags{Read: true}, + expectFile: false, + err: syserror.ErrWouldBlock, + }, + } { + name := pipename() + if test.makePipe { + // Create the pipe. We do this per-test case to keep tests independent. + if err := mkpipe(name); err != nil { + t.Errorf("%s: failed to make host pipe: %v", test.desc, err) + continue + } + defer syscall.Unlink(name) + } + + // Use a host opener to keep things simple. + opener := &hostOpener{name: name} + + pipeOpenState := &pipeOpenState{} + ctx := contexttest.Context(t) + pipeOps, err := pipeOpenState.TryOpen(ctx, opener, test.flags) + if unwrapError(err) != test.err { + t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) + if pipeOps != nil { + // Cleanup the state of the pipe, and remove the fd from the + // fdnotifier. Sadly this needed to maintain the correctness + // of other tests because the fdnotifier is global. + pipeOps.Release() + } + continue + } + if (pipeOps != nil) != test.expectFile { + t.Errorf("%s: got non-nil file %v, want %v", test.desc, pipeOps != nil, test.expectFile) + } + if pipeOps != nil { + // Same as above. + pipeOps.Release() + } + } +} + +func TestPipeOpenUnblocksEventually(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // partnerIsReader is true if the goroutine opening the same pipe as the test case + // should open the pipe read only. Otherwise write only. This also means that the + // test case will open the pipe in the opposite way. + partnerIsReader bool + + // partnerIsBlocking is true if the goroutine opening the same pipe as the test case + // should do so without the O_NONBLOCK flag, otherwise opens the pipe with O_NONBLOCK + // until ENXIO is not returned. 
+ partnerIsBlocking bool + }{ + { + desc: "Blocking Read with blocking writer partner opens eventually", + partnerIsReader: false, + partnerIsBlocking: true, + }, + { + desc: "Blocking Write with blocking reader partner opens eventually", + partnerIsReader: true, + partnerIsBlocking: true, + }, + { + desc: "Blocking Read with non-blocking writer partner opens eventually", + partnerIsReader: false, + partnerIsBlocking: false, + }, + { + desc: "Blocking Write with non-blocking reader partner opens eventually", + partnerIsReader: true, + partnerIsBlocking: false, + }, + } { + // Create the pipe. We do this per-test case to keep tests independent. + name := pipename() + if err := mkpipe(name); err != nil { + t.Errorf("%s: failed to make host pipe: %v", test.desc, err) + continue + } + defer syscall.Unlink(name) + + // Spawn the partner. + type fderr struct { + fd int + err error + } + errch := make(chan fderr, 1) + go func() { + var flags int + if test.partnerIsReader { + flags = syscall.O_RDONLY + } else { + flags = syscall.O_WRONLY + } + if test.partnerIsBlocking { + fd, err := syscall.Open(name, flags, 0666) + errch <- fderr{fd: fd, err: err} + } else { + var fd int + err := error(syscall.ENXIO) + for err == syscall.ENXIO { + fd, err = syscall.Open(name, flags|syscall.O_NONBLOCK, 0666) + time.Sleep(1 * time.Second) + } + errch <- fderr{fd: fd, err: err} + } + }() + + // Setup file flags for either a read only or write only open. + flags := fs.FileFlags{ + Read: !test.partnerIsReader, + Write: test.partnerIsReader, + } + + // Open the pipe in a blocking way, which should succeed eventually. + opener := &hostOpener{name: name} + ctx := contexttest.Context(t) + pipeOps, err := Open(ctx, opener, flags) + if pipeOps != nil { + // Same as TestTryOpen. + pipeOps.Release() + } + + // Check that the partner opened the file successfully. 
+ e := <-errch + if e.err != nil { + t.Errorf("%s: partner got error %v, wanted nil", test.desc, e.err) + continue + } + // If so, then close the partner fd to avoid leaking an fd. + syscall.Close(e.fd) + + // Check that our blocking open was successful. + if err != nil { + t.Errorf("%s: blocking open got error %v, wanted nil", test.desc, err) + continue + } + if pipeOps == nil { + t.Errorf("%s: blocking open got nil file, wanted non-nil", test.desc) + continue + } + } +} + +func TestCopiedReadAheadBuffer(t *testing.T) { + // Create the pipe. + name := pipename() + if err := mkpipe(name); err != nil { + t.Fatalf("failed to make host pipe: %v", err) + } + defer syscall.Unlink(name) + + // We're taking advantage of the fact that pipes opened read only always return + // success, but internally they are not deemed "opened" until we're sure that + // another writer comes along. This means we can open the same pipe write only + // with no problems + write to it, given that opener.Open already tried to open + // the pipe RDONLY and succeeded, which we know happened if TryOpen returns + // syserror.ErrwouldBlock. + // + // This simulates the open(RDONLY) <-> open(WRONLY)+write race we care about, but + // does not cause our test to be racy (which would be terrible). + opener := &hostOpener{name: name} + pipeOpenState := &pipeOpenState{} + ctx := contexttest.Context(t) + pipeOps, err := pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true}) + if pipeOps != nil { + pipeOps.Release() + t.Fatalf("open(%s, %o) got file, want nil", name, syscall.O_RDONLY) + } + if err != syserror.ErrWouldBlock { + t.Fatalf("open(%s, %o) got error %v, want %v", name, syscall.O_RDONLY, err, syserror.ErrWouldBlock) + } + + // Then open the same pipe write only and write some bytes to it. The next + // time we try to open the pipe read only again via the pipeOpenState, we should + // succeed and buffer some of the bytes written. 
+ fd, err := syscall.Open(name, syscall.O_WRONLY, 0666) + if err != nil { + t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_WRONLY, err) + } + defer syscall.Close(fd) + + data := []byte("hello") + if n, err := syscall.Write(fd, data); n != len(data) || err != nil { + t.Fatalf("write(%v) got (%d, %v), want (%d, nil)", data, n, err, len(data)) + } + + // Try the read again, knowing that it should succeed this time. + pipeOps, err = pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true}) + if pipeOps == nil { + t.Fatalf("open(%s, %o) got nil file, want not nil", name, syscall.O_RDONLY) + } + defer pipeOps.Release() + + if err != nil { + t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_RDONLY, err) + } + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + }) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, pipeOps) + + // Check that the file we opened points to a pipe with a non-empty read ahead buffer. + bufsize := len(pipeOps.readAheadBuffer) + if bufsize != 1 { + t.Fatalf("read ahead buffer got %d bytes, want %d", bufsize, 1) + } + + // Now for the final test, try to read everything in, expecting to get back all of + // the bytes that were written at once. Note that in the wild there is no atomic + // read size so expecting to get all bytes from a single writer when there are + // multiple readers is a bad expectation. + buf := make([]byte, len(data)) + ioseq := usermem.BytesIOSequence(buf) + n, err := pipeOps.Read(ctx, file, ioseq, 0) + if err != nil { + t.Fatalf("read request got error %v, want nil", err) + } + if n != int64(len(data)) { + t.Fatalf("read request got %d bytes, want %d", n, len(data)) + } + if !bytes.Equal(buf, data) { + t.Errorf("read request got bytes [%v], want [%v]", buf, data) + } +} + +func TestPipeHangup(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. 
+ desc string + + // flags control how we open our end of the pipe and must be read + // only or write only. They also dicate how a coordinating partner + // fd is opened, which is their inverse (read only -> write only, etc). + flags fs.FileFlags + + // hangupSelf if true causes the test case to close our end of the pipe + // and causes hangup errors to be asserted on our coordinating partner's + // fd. If hangupSelf is false, then our partner's fd is closed and the + // hangup errors are expected on our end of the pipe. + hangupSelf bool + }{ + { + desc: "Read only gets hangup error", + flags: fs.FileFlags{Read: true}, + }, + { + desc: "Write only gets hangup error", + flags: fs.FileFlags{Write: true}, + }, + { + desc: "Read only generates hangup error", + flags: fs.FileFlags{Read: true}, + hangupSelf: true, + }, + { + desc: "Write only generates hangup error", + flags: fs.FileFlags{Write: true}, + hangupSelf: true, + }, + } { + if test.flags.Read == test.flags.Write { + t.Errorf("%s: test requires a single reader or writer", test.desc) + continue + } + + // Create the pipe. We do this per-test case to keep tests independent. + name := pipename() + if err := mkpipe(name); err != nil { + t.Errorf("%s: failed to make host pipe: %v", test.desc, err) + continue + } + defer syscall.Unlink(name) + + // Fire off a partner routine which tries to open the same pipe blocking, + // which will synchronize with us. The channel allows us to get back the + // fd once we expect this partner routine to succeed, so we can manifest + // hangup events more directly. + fdchan := make(chan int, 1) + go func() { + // Be explicit about the flags to protect the test from + // misconfiguration. 
+ var flags int + if test.flags.Read { + flags = syscall.O_WRONLY + } else { + flags = syscall.O_RDONLY + } + fd, err := syscall.Open(name, flags, 0666) + if err != nil { + t.Logf("Open(%q, %o, 0666) partner failed: %v", name, flags, err) + } + fdchan <- fd + }() + + // Open our end in a blocking way to ensure that we coordinate. + opener := &hostOpener{name: name} + ctx := contexttest.Context(t) + pipeOps, err := Open(ctx, opener, test.flags) + if err != nil { + t.Errorf("%s: Open got error %v, want nil", test.desc, err) + continue + } + // Don't defer file.DecRef here because that causes the hangup we're + // trying to test for. + + // Expect the partner routine to have coordinated with us and get back + // its open fd. + f := <-fdchan + if f < 0 { + t.Errorf("%s: partner routine got fd %d, want > 0", test.desc, f) + pipeOps.Release() + continue + } + + if test.hangupSelf { + // Hangup self and assert that our partner got the expected hangup + // error. + pipeOps.Release() + + if test.flags.Read { + // Partner is writer. + assertWriterHungup(t, test.desc, fd.NewReadWriter(f)) + } else { + // Partner is reader. + assertReaderHungup(t, test.desc, fd.NewReadWriter(f)) + } + } else { + // Hangup our partner and expect us to get the hangup error. + syscall.Close(f) + defer pipeOps.Release() + + if test.flags.Read { + assertReaderHungup(t, test.desc, pipeOps.(*pipeOperations).file) + } else { + assertWriterHungup(t, test.desc, pipeOps.(*pipeOperations).file) + } + } + } +} + +func assertReaderHungup(t *testing.T, desc string, reader io.Reader) bool { + // Drain the pipe completely, it might have crap in it, but expect EOF eventually. 
+ var err error + for err == nil { + _, err = reader.Read(make([]byte, 10)) + } + if err != io.EOF { + t.Errorf("%s: read from self after hangup got error %v, want %v", desc, err, io.EOF) + return false + } + return true +} + +func assertWriterHungup(t *testing.T, desc string, writer io.Writer) bool { + if _, err := writer.Write([]byte("hello")); unwrapError(err) != syscall.EPIPE { + t.Errorf("%s: write to self after hangup got error %v, want %v", desc, err, syscall.EPIPE) + return false + } + return true +} diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go new file mode 100644 index 000000000..8996a2178 --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -0,0 +1,88 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdpipe + +import ( + "fmt" + "io/ioutil" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// beforeSave is invoked by stateify. +func (p *pipeOperations) beforeSave() { + if p.flags.Read { + data, err := ioutil.ReadAll(p.file) + if err != nil && !isBlockError(err) { + panic(fmt.Sprintf("failed to read from pipe: %v", err)) + } + p.readAheadBuffer = append(p.readAheadBuffer, data...) 
+ } else if p.flags.Write { + file, err := p.opener.NonBlockingOpen(context.Background(), fs.PermMask{Write: true}) + if err != nil { + panic(fs.ErrSaveRejection{fmt.Errorf("write-only pipe end cannot be re-opened as %v: %v", p, err)}) + } + file.Close() + } +} + +// saveFlags is invoked by stateify. +func (p *pipeOperations) saveFlags() fs.FileFlags { + return p.flags +} + +// readPipeOperationsLoading is used to ensure that write-only pipe fds are +// opened after read/write and read-only pipe fds, to avoid ENXIO when +// multiple pipe fds refer to different ends of the same pipe. +var readPipeOperationsLoading sync.WaitGroup + +// loadFlags is invoked by stateify. +func (p *pipeOperations) loadFlags(flags fs.FileFlags) { + // This is a hack to ensure that readPipeOperationsLoading includes all + // readable pipe fds before any asynchronous calls to + // readPipeOperationsLoading.Wait(). + if flags.Read { + readPipeOperationsLoading.Add(1) + } + p.flags = flags +} + +// afterLoad is invoked by stateify. +func (p *pipeOperations) afterLoad() { + load := func() { + if !p.flags.Read { + readPipeOperationsLoading.Wait() + } else { + defer readPipeOperationsLoading.Done() + } + var err error + p.file, err = p.opener.NonBlockingOpen(context.Background(), fs.PermMask{ + Read: p.flags.Read, + Write: p.flags.Write, + }) + if err != nil { + panic(fmt.Sprintf("unable to open pipe %v: %v", p, err)) + } + if err := p.init(); err != nil { + panic(fmt.Sprintf("unable to initialize pipe %v: %v", p, err)) + } + } + + // Do background opening of pipe ends. Note for write-only pipe ends we + // have to do it asynchronously to avoid blocking the restore. + fs.Async(load) +} diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go new file mode 100644 index 000000000..6cd314f5b --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -0,0 +1,489 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdpipe + +import ( + "bytes" + "io" + "os" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +func singlePipeFD() (int, error) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + return -1, err + } + syscall.Close(fds[1]) + return fds[0], nil +} + +func singleDirFD() (int, error) { + return syscall.Open(os.TempDir(), syscall.O_RDONLY, 0666) +} + +func mockPipeDirent(t *testing.T) *fs.Dirent { + ctx := contexttest.Context(t) + node := fs.NewMockInodeOperations(ctx) + node.UAttr = fs.UnstableAttr{ + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + } + inode := fs.NewInode(node, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + BlockSize: usermem.PageSize, + }) + return fs.NewDirent(inode, "") +} + +func TestNewPipe(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // getfd generates the fd to pass to newPipeOperations. + getfd func() (int, error) + + // flags are the fs.FileFlags passed to newPipeOperations. + flags fs.FileFlags + + // readAheadBuffer is the buffer passed to newPipeOperations. 
+ readAheadBuffer []byte + + // err is the expected error. + err error + }{ + { + desc: "Cannot make new pipe from bad fd", + getfd: func() (int, error) { return -1, nil }, + err: syscall.EINVAL, + }, + { + desc: "Cannot make new pipe from non-pipe fd", + getfd: singleDirFD, + err: syscall.EINVAL, + }, + { + desc: "Can make new pipe from pipe fd", + getfd: singlePipeFD, + flags: fs.FileFlags{Read: true}, + readAheadBuffer: []byte("hello"), + }, + } { + gfd, err := test.getfd() + if err != nil { + t.Errorf("%s: getfd got (%d, %v), want (fd, nil)", test.desc, gfd, err) + continue + } + f := fd.New(gfd) + + p, err := newPipeOperations(contexttest.Context(t), nil, test.flags, f, test.readAheadBuffer) + if p != nil { + // This is necessary to remove the fd from the global fd notifier. + defer p.Release() + } else { + // If there is no p to DecRef on, because newPipeOperations failed, then the + // file still needs to be closed. + defer f.Close() + } + + if err != test.err { + t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) + continue + } + // Check the state of the pipe given that it was successfully opened. 
+ if err == nil { + if p == nil { + t.Errorf("%s: got nil pipe and nil error, want (pipe, nil)", test.desc) + continue + } + if flags := p.flags; test.flags != flags { + t.Errorf("%s: got file flags %s, want %s", test.desc, flags, test.flags) + continue + } + if len(test.readAheadBuffer) != len(p.readAheadBuffer) { + t.Errorf("%s: got read ahead buffer length %d, want %d", test.desc, len(p.readAheadBuffer), len(test.readAheadBuffer)) + continue + } + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(p.file.FD()), syscall.F_GETFL, 0) + if errno != 0 { + t.Errorf("%s: failed to get file flags for fd %d, got %v, want 0", test.desc, p.file.FD(), errno) + continue + } + if fileFlags&syscall.O_NONBLOCK == 0 { + t.Errorf("%s: pipe is blocking, expected non-blocking", test.desc) + continue + } + if !fdnotifier.HasFD(int32(f.FD())) { + t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD) + } + } + } +} + +func TestPipeDestruction(t *testing.T) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Fatalf("failed to create pipes: got %v, want nil", err) + } + f := fd.New(fds[0]) + + // We don't care about the other end, just use the read end. + syscall.Close(fds[1]) + + // Test the read end, but it doesn't really matter which. + p, err := newPipeOperations(contexttest.Context(t), nil, fs.FileFlags{Read: true}, f, nil) + if err != nil { + f.Close() + t.Fatalf("newPipeOperations got error %v, want nil", err) + } + // Drop our only reference, which should trigger the destructor. 
+ p.Release() + + if fdnotifier.HasFD(int32(fds[0])) { + t.Fatalf("after DecRef fdnotifier has fd %d, want no longer registered", fds[0]) + } + if p.file != nil { + t.Errorf("after DecRef got file, want nil") + } +} + +type Seek struct{} + +type ReadDir struct{} + +type Writev struct { + Src usermem.IOSequence +} + +type Readv struct { + Dst usermem.IOSequence +} + +type Fsync struct{} + +func TestPipeRequest(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // request to execute. + context interface{} + + // flags determines whether to use the read or write end + // of the pipe, for this test it can only be Read or Write. + flags fs.FileFlags + + // keepOpenPartner if false closes the other end of the pipe, + // otherwise this is delayed until the end of the test. + keepOpenPartner bool + + // expected error + err error + }{ + { + desc: "ReadDir on pipe returns ENOTDIR", + context: &ReadDir{}, + err: syscall.ENOTDIR, + }, + { + desc: "Fsync on pipe returns EINVAL", + context: &Fsync{}, + err: syscall.EINVAL, + }, + { + desc: "Seek on pipe returns ESPIPE", + context: &Seek{}, + err: syscall.ESPIPE, + }, + { + desc: "Readv on pipe from empty buffer returns nil", + context: &Readv{Dst: usermem.BytesIOSequence(nil)}, + flags: fs.FileFlags{Read: true}, + }, + { + desc: "Readv on pipe from non-empty buffer and closed partner returns EOF", + context: &Readv{Dst: usermem.BytesIOSequence(make([]byte, 10))}, + flags: fs.FileFlags{Read: true}, + err: io.EOF, + }, + { + desc: "Readv on pipe from non-empty buffer and open partner returns EWOULDBLOCK", + context: &Readv{Dst: usermem.BytesIOSequence(make([]byte, 10))}, + flags: fs.FileFlags{Read: true}, + keepOpenPartner: true, + err: syserror.ErrWouldBlock, + }, + { + desc: "Writev on pipe from empty buffer returns nil", + context: &Writev{Src: usermem.BytesIOSequence(nil)}, + flags: fs.FileFlags{Write: true}, + }, + { + desc: "Writev on pipe from non-empty buffer and 
closed partner returns EPIPE", + context: &Writev{Src: usermem.BytesIOSequence([]byte("hello"))}, + flags: fs.FileFlags{Write: true}, + err: syscall.EPIPE, + }, + { + desc: "Writev on pipe from non-empty buffer and open partner succeeds", + context: &Writev{Src: usermem.BytesIOSequence([]byte("hello"))}, + flags: fs.FileFlags{Write: true}, + keepOpenPartner: true, + }, + } { + if test.flags.Read && test.flags.Write { + panic("both read and write not supported for this test") + } + + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Errorf("%s: failed to create pipes: got %v, want nil", test.desc, err) + continue + } + + // Configure the fd and partner fd based on the file flags. + testFd, partnerFd := fds[0], fds[1] + if test.flags.Write { + testFd, partnerFd = fds[1], fds[0] + } + + // Configure closing the fds. + if test.keepOpenPartner { + defer syscall.Close(partnerFd) + } else { + syscall.Close(partnerFd) + } + + // Create the pipe. + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, test.flags, fd.New(testFd), nil) + if err != nil { + t.Fatalf("%s: newPipeOperations got error %v, want nil", test.desc, err) + } + defer p.Release() + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, p) + + // Issue request via the appropriate function. 
+ switch c := test.context.(type) { + case *Seek: + _, err = p.Seek(ctx, file, 0, 0) + case *ReadDir: + _, err = p.Readdir(ctx, file, nil) + case *Readv: + _, err = p.Read(ctx, file, c.Dst, 0) + case *Writev: + _, err = p.Write(ctx, file, c.Src, 0) + case *Fsync: + err = p.Fsync(ctx, file, 0, fs.FileMaxOffset, fs.SyncAll) + default: + t.Errorf("%s: unknown request type %T", test.desc, test.context) + } + + if unwrapError(err) != test.err { + t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) + } + } +} + +func TestPipeReadAheadBuffer(t *testing.T) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Fatalf("failed to create pipes: got %v, want nil", err) + } + rfile := fd.New(fds[0]) + + // Eventually close the write end, which is not wrapped in a pipe object. + defer syscall.Close(fds[1]) + + // Write some bytes to this end. + data := []byte("world") + if n, err := syscall.Write(fds[1], data); n != len(data) || err != nil { + rfile.Close() + t.Fatalf("write to pipe got (%d, %v), want (%d, nil)", n, err, len(data)) + } + // Close the write end immediately, we don't care about it. + + buffered := []byte("hello ") + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, rfile, buffered) + if err != nil { + rfile.Close() + t.Fatalf("newPipeOperations got error %v, want nil", err) + } + defer p.Release() + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + }) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, p) + + // In total we expect to read data + buffered. + total := append(buffered, data...) 
+ + buf := make([]byte, len(total)) + iov := usermem.BytesIOSequence(buf) + n, err := p.Read(contexttest.Context(t), file, iov, 0) + if err != nil { + t.Fatalf("read request got error %v, want nil", err) + } + if n != int64(len(total)) { + t.Fatalf("read request got %d bytes, want %d", n, len(total)) + } + if !bytes.Equal(buf, total) { + t.Errorf("read request got bytes [%v], want [%v]", buf, total) + } +} + +// This is very important for pipes in general because they can return EWOULDBLOCK and for +// those that block they must continue until they have read all of the data (and report it +// as such. +func TestPipeReadsAccumulate(t *testing.T) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Fatalf("failed to create pipes: got %v, want nil", err) + } + rfile := fd.New(fds[0]) + + // Eventually close the write end, it doesn't depend on a pipe object. + defer syscall.Close(fds[1]) + + // Get a new read only pipe reference. + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, rfile, nil) + if err != nil { + rfile.Close() + t.Fatalf("newPipeOperations got error %v, want nil", err) + } + // Don't forget to remove the fd from the fd notifier. Otherwise other tests will + // likely be borked, because it's global :( + defer p.Release() + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + }) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, p) + + // Write some some bytes to the pipe. + data := []byte("some message") + if n, err := syscall.Write(fds[1], data); n != len(data) || err != nil { + t.Fatalf("write to pipe got (%d, %v), want (%d, nil)", n, err, len(data)) + } + + // Construct a segment vec that is a bit more than we have written so we trigger + // an EWOULDBLOCK. 
+ wantBytes := len(data) + 1 + readBuffer := make([]byte, wantBytes) + iov := usermem.BytesIOSequence(readBuffer) + n, err := p.Read(ctx, file, iov, 0) + total := n + iov = iov.DropFirst64(n) + if err != syserror.ErrWouldBlock { + t.Fatalf("Readv got error %v, want %v", err, syserror.ErrWouldBlock) + } + + // Write a few more bytes to allow us to read more/accumulate. + extra := []byte("extra") + if n, err := syscall.Write(fds[1], extra); n != len(extra) || err != nil { + t.Fatalf("write to pipe got (%d, %v), want (%d, nil)", n, err, len(extra)) + } + + // This time, using the same request, we should not block. + n, err = p.Read(ctx, file, iov, 0) + total += n + if err != nil { + t.Fatalf("Readv got error %v, want nil", err) + } + + // Assert that the result we got back is cumulative. + if total != int64(wantBytes) { + t.Fatalf("Readv sequence got %d bytes, want %d", total, wantBytes) + } + + if want := append(data, extra[0]); !bytes.Equal(readBuffer, want) { + t.Errorf("Readv sequence got %v, want %v", readBuffer, want) + } +} + +// Same as TestReadsAccumulate. +func TestPipeWritesAccumulate(t *testing.T) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Fatalf("failed to create pipes: got %v, want nil", err) + } + wfile := fd.New(fds[1]) + + // Eventually close the read end, it doesn't depend on a pipe object. + defer syscall.Close(fds[0]) + + // Get a new write only pipe reference. + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Write: true}, wfile, nil) + if err != nil { + wfile.Close() + t.Fatalf("newPipeOperations got error %v, want nil", err) + } + // Don't forget to remove the fd from the fd notifier. 
Otherwise other tests will + // likely be borked, because it's global :( + defer p.Release() + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + }) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, p) + + // Construct a segment vec that is larger than the pipe size to trigger an EWOULDBLOCK. + wantBytes := 65536 * 2 + writeBuffer := make([]byte, wantBytes) + for i := 0; i < wantBytes; i++ { + writeBuffer[i] = 'a' + } + iov := usermem.BytesIOSequence(writeBuffer) + n, err := p.Write(ctx, file, iov, 0) + total := n + iov = iov.DropFirst64(n) + if err != syserror.ErrWouldBlock { + t.Fatalf("Writev got error %v, want %v", err, syserror.ErrWouldBlock) + } + + // Read the entire pipe buf size to make space for the second half. + throwAway := make([]byte, 65536) + if n, err := syscall.Read(fds[0], throwAway); n != len(throwAway) || err != nil { + t.Fatalf("write to pipe got (%d, %v), want (%d, nil)", n, err, len(throwAway)) + } + + // This time we should not block. + n, err = p.Write(ctx, file, iov, 0) + total += n + if err != nil { + t.Fatalf("Writev got error %v, want nil", err) + } + + // Assert that the result we got back is cumulative. + if total != int64(wantBytes) { + t.Fatalf("Writev sequence got %d bytes, want %d", total, wantBytes) + } +} diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go new file mode 100644 index 000000000..de2e80bf0 --- /dev/null +++ b/pkg/sentry/fs/file.go @@ -0,0 +1,404 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "math" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/amutex" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// FileMaxOffset is the maximum possible file offset. +const FileMaxOffset = math.MaxInt64 + +// File is an open file handle. It is thread-safe. +// +// File provides stronger synchronization guarantees than Linux. Linux +// synchronizes lseek(2), read(2), and write(2) with respect to the file +// offset for regular files and only for those interfaces. See +// fs/read_write.c:fdget_pos, fs.read_write.c:fdput_pos and FMODE_ATOMIC_POS. +// +// In contrast, File synchronizes any operation that could take a long time +// under a single abortable mutex which also synchronizes lseek(2), read(2), +// and write(2). +// +// FIXME: Split synchronization from cancellation. +type File struct { + refs.AtomicRefCount + + // UniqueID is the globally unique identifier of the File. + UniqueID uint64 + + // Dirent is the Dirent backing this File. This encodes the name + // of the File via Dirent.FullName() as well as its identity via the + // Dirent's Inode. The Dirent is non-nil. + // + // A File holds a reference to this Dirent. Using the returned Dirent is + // only safe as long as a reference on the File is held. The association + // between a File and a Dirent is immutable. 
+ // + // Files that are not parented in a filesystem return a root Dirent + // that holds a reference to their Inode. + // + // The name of the Dirent may reflect parentage if the Dirent is not a + // root Dirent or the identity of the File on a pseudo filesystem (pipefs, + // sockfs, etc). + // + // Multiple Files may hold a reference to the same Dirent. This is the + // common case for Files that are parented and maintain consistency with + // other files via the Dirent cache. + Dirent *Dirent + + // flags are the File's flags. Setting or getting flags is fully atomic + // and is not protected by mu (below). + flags atomic.Value `state:".(FileFlags)"` + + // mu is dual-purpose: first, to make read(2) and write(2) thread-safe + // in conformity with POSIX, and second, to cancel operations before they + // begin in response to interruptions (i.e. signals). + mu amutex.AbortableMutex `state:"nosave"` + + // FileOperations implements file system specific behavior for this File. + FileOperations FileOperations + + // offset is the File's offset. Updating offset is protected by mu but + // can be read atomically via File.Offset() outside of mu. + offset int64 +} + +// NewFile returns a File. It takes a reference on the Dirent and owns the +// lifetime of the FileOperations. Files that do not support reading and +// writing at an arbitrary offset should set flags.Pread and flags.Pwrite +// to false respectively. +func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOperations) *File { + dirent.IncRef() + f := &File{ + UniqueID: uniqueid.GlobalFromContext(ctx), + Dirent: dirent, + FileOperations: fops, + } + f.flags.Store(flags) + f.mu.Init() + return f +} + +// DecRef destroys the File when it is no longer referenced. +func (f *File) DecRef() { + f.DecRefWithDestructor(func() { + // Drop BSD style locks. 
+ lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} + f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng) + + // Release resources held by the FileOperations. + f.FileOperations.Release() + + // Release a reference on the Dirent. + f.Dirent.DecRef() + }) +} + +// Flags atomically loads the File's flags. +func (f *File) Flags() FileFlags { + return f.flags.Load().(FileFlags) +} + +// SetFlags atomically changes the File's flags to the values contained +// in newFlags. See SettableFileFlags for values that can be set. +func (f *File) SetFlags(newFlags SettableFileFlags) { + flags := f.flags.Load().(FileFlags) + flags.Direct = newFlags.Direct + flags.NonBlocking = newFlags.NonBlocking + flags.Append = newFlags.Append + f.flags.Store(flags) +} + +// Offset atomically loads the File's offset. +func (f *File) Offset() int64 { + return atomic.LoadInt64(&f.offset) +} + +// Readiness implements waiter.Waitable.Readiness. +func (f *File) Readiness(mask waiter.EventMask) waiter.EventMask { + return f.FileOperations.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *File) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.FileOperations.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *File) EventUnregister(e *waiter.Entry) { + f.FileOperations.EventUnregister(e) +} + +// Seek calls f.FileOperations.Seek with f as the File, updating the file +// offset to the value returned by f.FileOperations.Seek if the operation +// is successful. +// +// Returns syserror.ErrInterrupted if seeking was interrupted. 
+func (f *File) Seek(ctx context.Context, whence SeekWhence, offset int64) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer f.mu.Unlock() + + newOffset, err := f.FileOperations.Seek(ctx, f, whence, offset) + if err == nil { + atomic.StoreInt64(&f.offset, newOffset) + } + return newOffset, err +} + +// Readdir reads the directory entries of this File and writes them out +// to the DentrySerializer until entries can no longer be written. If even +// a single directory entry is written then Readdir returns a nil error +// and the directory offset is advanced. +// +// Readdir unconditionally updates the access time on the File's Inode, +// see fs/readdir.c:iterate_dir. +// +// Returns syserror.ErrInterrupted if reading was interrupted. +func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + offset, err := f.FileOperations.Readdir(ctx, f, serializer) + atomic.StoreInt64(&f.offset, offset) + return err +} + +// Readv calls f.FileOperations.Read with f as the File, advancing the file +// offset if f.FileOperations.Read returns bytes read > 0. +// +// Returns syserror.ErrInterrupted if reading was interrupted. +func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + + n, err := f.FileOperations.Read(ctx, f, dst, f.offset) + if n > 0 { + atomic.AddInt64(&f.offset, n) + } + f.mu.Unlock() + return n, err +} + +// Preadv calls f.FileOperations.Read with f as the File. It does not +// advance the file offset. If !f.Flags().Pread, Preadv should not be +// called. +// +// Otherwise same as Readv. 
+func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+	if !f.mu.Lock(ctx) {
+		return 0, syserror.ErrInterrupted
+	}
+
+	n, err := f.FileOperations.Read(ctx, f, dst, offset)
+	f.mu.Unlock()
+	return n, err
+}
+
+// Writev calls f.FileOperations.Write with f as the File, advancing the
+// file offset if f.FileOperations.Write returns bytes written > 0.
+//
+// Writev positions the write offset at EOF if f.Flags().Append. This is
+// unavoidably racy for network file systems. Writev also truncates src
+// to avoid overrunning the current file size limit if necessary.
+//
+// Returns syserror.ErrInterrupted if writing was interrupted.
+func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error) {
+	if !f.mu.Lock(ctx) {
+		return 0, syserror.ErrInterrupted
+	}
+
+	offset, err := f.checkWriteLocked(ctx, &src, f.offset)
+	if err != nil {
+		f.mu.Unlock()
+		return 0, err
+	}
+	n, err := f.FileOperations.Write(ctx, f, src, offset)
+	if n >= 0 {
+		atomic.StoreInt64(&f.offset, offset+n)
+	}
+	f.mu.Unlock()
+	return n, err
+}
+
+// Pwritev calls f.FileOperations.Write with f as the File. It does not
+// advance the file offset. If !f.Flags().Pwrite, Pwritev should not be
+// called.
+//
+// Otherwise same as Writev.
+func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if !f.mu.Lock(ctx) {
+		return 0, syserror.ErrInterrupted
+	}
+
+	offset, err := f.checkWriteLocked(ctx, &src, offset)
+	if err != nil {
+		f.mu.Unlock()
+		return 0, err
+	}
+	n, err := f.FileOperations.Write(ctx, f, src, offset)
+	f.mu.Unlock()
+	return n, err
+}
+
+// checkWriteLocked returns the offset to write at or an error if the write
+// would not succeed. May update src to fit a write operation into a file
+// size limit.
+func (f *File) checkWriteLocked(ctx context.Context, src *usermem.IOSequence, offset int64) (int64, error) {
+	// Handle append only files.
Note that this is still racy for network + // filesystems. + if f.Flags().Append { + uattr, err := f.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + // This is an odd error, most likely it is evidence + // that something is terribly wrong with the filesystem. + // Return a generic EIO error. + log.Warningf("Failed to check write of inode %#v: %v", f.Dirent.Inode.StableAttr, err) + return offset, syserror.EIO + } + offset = uattr.Size + } + + // Is this a regular file? + if IsRegular(f.Dirent.Inode.StableAttr) { + // Enforce size limits. + fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur + if fileSizeLimit <= math.MaxInt64 { + if offset >= int64(fileSizeLimit) { + return offset, syserror.ErrExceedsFileSizeLimit + } + *src = src.TakeFirst64(int64(fileSizeLimit) - offset) + } + } + + return offset, nil +} + +// Fsync calls f.FileOperations.Fsync with f as the File. +// +// Returns syserror.ErrInterrupted if syncing was interrupted. +func (f *File) Fsync(ctx context.Context, start int64, end int64, syncType SyncType) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + return f.FileOperations.Fsync(ctx, f, start, end, syncType) +} + +// Flush calls f.FileOperations.Flush with f as the File. +// +// Returns syserror.ErrInterrupted if syncing was interrupted. +func (f *File) Flush(ctx context.Context) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + return f.FileOperations.Flush(ctx, f) +} + +// ConfigureMMap calls f.FileOperations.ConfigureMMap with f as the File. +// +// Returns syserror.ErrInterrupted if interrupted. +func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + return f.FileOperations.ConfigureMMap(ctx, f, opts) +} + +// MappedName implements memmap.MappingIdentity.MappedName. 
+func (f *File) MappedName(ctx context.Context) string {
+	name, _ := f.Dirent.FullName(RootFromContext(ctx))
+	return name
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (f *File) DeviceID() uint64 {
+	return f.Dirent.Inode.StableAttr.DeviceID
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (f *File) InodeID() uint64 {
+	return f.Dirent.Inode.StableAttr.InodeID
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (f *File) Msync(ctx context.Context, mr memmap.MappableRange) error {
+	return f.Fsync(ctx, int64(mr.Start), int64(mr.End-1), SyncData)
+}
+
+// FileReader implements io.Reader and io.ReaderAt.
+type FileReader struct {
+	// Ctx is the context for the file reader.
+	Ctx context.Context
+
+	// File is the file to read from.
+	File *File
+}
+
+// Read implements io.Reader.Read.
+func (r *FileReader) Read(buf []byte) (int, error) {
+	n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf))
+	return int(n), err
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (r *FileReader) ReadAt(buf []byte, offset int64) (int, error) {
+	n, err := r.File.Preadv(r.Ctx, usermem.BytesIOSequence(buf), offset)
+	return int(n), err
+}
+
+// FileWriter implements io.Writer and io.WriterAt.
+type FileWriter struct {
+	// Ctx is the context for the file writer.
+	Ctx context.Context
+
+	// File is the file to write to.
+	File *File
+}
+
+// Write implements io.Writer.Write.
+func (w *FileWriter) Write(buf []byte) (int, error) {
+	n, err := w.File.Writev(w.Ctx, usermem.BytesIOSequence(buf))
+	return int(n), err
+}
+
+// WriteAt implements io.WriterAt.WriteAt.
+func (w *FileWriter) WriteAt(buf []byte, offset int64) (int, error) { + n, err := w.File.Pwritev(w.Ctx, usermem.BytesIOSequence(buf), offset) + return int(n), err +} diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go new file mode 100644 index 000000000..d223bb5c7 --- /dev/null +++ b/pkg/sentry/fs/file_operations.go @@ -0,0 +1,106 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// FileOperations are operations on a File that diverge per file system. +// +// Operations that take a *File may use only the following interfaces: +// +// - File.UniqueID: Operations may only read this value. +// - File.Dirent: Operations must not take or drop a reference. +// - File.Offset(): This value is guaranteed to not change for the +// duration of the operation. +// - File.Flags(): This value may change during the operation. +type FileOperations interface { + // Release release resources held by FileOperations. + Release() + + // Waitable defines how this File can be waited on for read and + // write readiness. + waiter.Waitable + + // Seek seeks to offset based on SeekWhence. 
Returns the new + // offset or no change in the offset and an error. + Seek(ctx context.Context, file *File, whence SeekWhence, offset int64) (int64, error) + + // Readdir reads the directory entries of file and serializes them + // using serializer. + // + // Returns the new directory offset or no change in the offset and + // an error. The offset returned must not be less than file.Offset(). + // + // Serialization of directory entries must not happen asynchronously. + Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) + + // Read reads from file into dst at offset and returns the number + // of bytes read which must be greater than or equal to 0. File + // systems that do not support reading at an offset, (i.e. pipefs, + // sockfs) may ignore the offset. These file systems are expected + // to construct Files with !FileFlags.Pread. + // + // Read may return a nil error and only partially fill dst (at or + // before EOF). If the file represents a symlink, Read reads the target + // value of the symlink. + // + // Read does not check permissions nor flags. + // + // Read must not be called if !FileFlags.Read. + Read(ctx context.Context, file *File, dst usermem.IOSequence, offset int64) (int64, error) + + // Write writes src to file at offset and returns the number of bytes + // written which must be greater than or equal to 0. Like Read, file + // systems that do not support writing at an offset (i.e. pipefs, sockfs) + // may ignore the offset. These file systems are expected to construct + // Files with !FileFlags.Pwrite. + // + // If only part of src could be written, Write must return an error + // indicating why (e.g. syserror.ErrWouldBlock). + // + // Write does not check permissions nor flags. + // + // Write must not be called if !FileFlags.Write. 
+ Write(ctx context.Context, file *File, src usermem.IOSequence, offset int64) (int64, error) + + // Fsync writes buffered modifications of file and/or flushes in-flight + // operations to backing storage based on syncType. The range to sync is + // [start, end]. The end is inclusive so that the last byte of a maximally + // sized file can be synced. + Fsync(ctx context.Context, file *File, start, end int64, syncType SyncType) error + + // Flush this file's buffers/state (on close(2)). + Flush(ctx context.Context, file *File) error + + // ConfigureMMap mutates opts to implement mmap(2) for the file. Most + // implementations can either embed fsutil.NoMMap (if they don't support + // memory mapping) or call fsutil.GenericConfigureMMap with the appropriate + // memmap.Mappable. + ConfigureMMap(ctx context.Context, file *File, opts *memmap.MMapOpts) error + + // Ioctl implements the ioctl(2) linux syscall. + // + // io provides access to the virtual memory space to which pointers in args + // refer. + // + // Preconditions: The AddressSpace (if any) that io refers to is activated. + Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) +} diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go new file mode 100644 index 000000000..0c6e622b9 --- /dev/null +++ b/pkg/sentry/fs/file_overlay.go @@ -0,0 +1,345 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fs + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// overlayFile gets a handle to a file from the upper or lower filesystem +// in an overlay. The caller is responsible for calling File.DecRef on +// the returned file. +func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, error) { + // Do a song and dance to eventually get to: + // + // File -> single reference + // Dirent -> single reference + // Inode -> multiple references + // + // So that File.DecRef() -> File.destroy -> Dirent.DecRef -> Dirent.destroy, + // and both the transitory File and Dirent can be GC'ed but the Inode + // remains. + + // Take another reference on the Inode. + inode.IncRef() + + // Start with a single reference on the Dirent. It inherits the reference + // we just took on the Inode above. + dirent := NewTransientDirent(inode) + + // Get a File. This will take another reference on the Dirent. + f, err := inode.GetFile(ctx, dirent, flags) + + // Drop the extra reference on the Dirent. Now there's only one reference + // on the dirent, either owned by f (if non-nil), or the Dirent is about + // to be destroyed (if GetFile failed). + dirent.DecRef() + + return f, err +} + +// overlayFileOperations implements FileOperations for a file in an overlay. +type overlayFileOperations struct { + // upperMu protects upper below. In contrast lower is stable. + upperMu sync.Mutex `state:"nosave"` + + // We can't share Files in upper and lower filesystems between all Files + // in an overlay because some file systems expect to get distinct handles + // that are not consistent with each other on open(2). 
+ // + // So we lazily acquire an upper File when the overlayEntry acquires an + // upper Inode (it might have one from the start). This synchronizes with + // copy up. + // + // If upper is non-nil and this is not a directory, then lower is ignored. + // + // For directories, upper and lower are ignored because it is always + // necessary to acquire new directory handles so that the directory cursors + // of the upper and lower Files are not exhausted. + upper *File + lower *File + + // dirCursor is a directory cursor for a directory in an overlay. + dirCursor string + + // dirCache is cache of DentAttrs from upper and lower Inodes. + dirCache *SortedDentryMap +} + +// Release implements FileOperations.Release. +func (f *overlayFileOperations) Release() { + if f.upper != nil { + f.upper.DecRef() + } + if f.lower != nil { + f.lower.DecRef() + } +} + +// EventRegister implements FileOperations.EventRegister. +func (f *overlayFileOperations) EventRegister(we *waiter.Entry, mask waiter.EventMask) { + f.upperMu.Lock() + defer f.upperMu.Unlock() + if f.upper != nil { + f.upper.EventRegister(we, mask) + return + } + f.lower.EventRegister(we, mask) +} + +// EventUnregister implements FileOperations.Unregister. +func (f *overlayFileOperations) EventUnregister(we *waiter.Entry) { + f.upperMu.Lock() + defer f.upperMu.Unlock() + if f.upper != nil { + f.upper.EventUnregister(we) + return + } + f.lower.EventUnregister(we) +} + +// Readiness implements FileOperations.Readiness. +func (f *overlayFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + f.upperMu.Lock() + defer f.upperMu.Unlock() + if f.upper != nil { + return f.upper.Readiness(mask) + } + return f.lower.Readiness(mask) +} + +// Seek implements FileOperations.Seek. 
+func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence SeekWhence, offset int64) (int64, error) {
+	f.upperMu.Lock()
+	defer f.upperMu.Unlock()
+
+	var seekDir bool
+	var n int64
+	if f.upper != nil {
+		var err error
+		if n, err = f.upper.FileOperations.Seek(ctx, file, whence, offset); err != nil {
+			return n, err
+		}
+		seekDir = IsDir(f.upper.Dirent.Inode.StableAttr)
+	} else {
+		var err error
+		if n, err = f.lower.FileOperations.Seek(ctx, file, whence, offset); err != nil {
+			return n, err
+		}
+		seekDir = IsDir(f.lower.Dirent.Inode.StableAttr)
+	}
+
+	// If this was a seek on a directory, we must update the cursor.
+	if seekDir && whence == SeekSet && offset == 0 {
+		// Currently only seeking to 0 on a directory is supported.
+		// FIXME: Lift directory seeking limitations.
+		f.dirCursor = ""
+	}
+	return n, nil
+}
+
+// Readdir implements FileOperations.Readdir.
+func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) {
+	o := file.Dirent.Inode.overlay
+
+	o.copyMu.RLock()
+	defer o.copyMu.RUnlock()
+
+	var err error
+	f.dirCache, err = readdirEntries(ctx, o)
+	if err != nil {
+		return file.Offset(), err
+	}
+
+	root := RootFromContext(ctx)
+	defer root.DecRef()
+
+	dirCtx := &DirCtx{
+		Serializer: serializer,
+		DirCursor:  &f.dirCursor,
+	}
+	return DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset())
+}
+
+// IterateDir implements DirIterator.IterateDir.
+func (f *overlayFileOperations) IterateDir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) {
+	n, err := GenericReaddir(dirCtx, f.dirCache)
+	return offset + n, err
+}
+
+// Read implements FileOperations.Read.
+func (f *overlayFileOperations) Read(ctx context.Context, file *File, dst usermem.IOSequence, offset int64) (int64, error) { + o := file.Dirent.Inode.overlay + + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if o.upper != nil { + // We may need to acquire an open file handle to read from if + // copy up has occurred. Otherwise we risk reading from the + // wrong source. + f.upperMu.Lock() + if f.upper == nil { + var err error + f.upper, err = overlayFile(ctx, o.upper, file.Flags()) + if err != nil { + f.upperMu.Unlock() + log.Warningf("failed to acquire handle with flags %v: %v", file.Flags(), err) + return 0, syserror.EIO + } + } + f.upperMu.Unlock() + return f.upper.FileOperations.Read(ctx, f.upper, dst, offset) + } + return f.lower.FileOperations.Read(ctx, f.lower, dst, offset) +} + +// Write implements FileOperations.Write. +func (f *overlayFileOperations) Write(ctx context.Context, file *File, src usermem.IOSequence, offset int64) (int64, error) { + // f.upper must be non-nil. See inode_overlay.go:overlayGetFile, where the + // file is copied up and opened in the upper filesystem if FileFlags.Write. + // Write cannot be called if !FileFlags.Write, see FileOperations.Write. + return f.upper.FileOperations.Write(ctx, f.upper, src, offset) +} + +// Fsync implements FileOperations.Fsync. +func (f *overlayFileOperations) Fsync(ctx context.Context, file *File, start, end int64, syncType SyncType) error { + var err error + f.upperMu.Lock() + if f.upper != nil { + err = f.upper.FileOperations.Fsync(ctx, f.upper, start, end, syncType) + } + f.upperMu.Unlock() + if f.lower != nil { + // N.B. Fsync on the lower filesystem can cause writes of file + // attributes (i.e. access time) despite the fact that we must + // treat the lower filesystem as read-only. + // + // This matches the semantics of fsync(2) in Linux overlayfs. + err = f.lower.FileOperations.Fsync(ctx, f.lower, start, end, syncType) + } + return err +} + +// Flush implements FileOperations.Flush. 
+func (f *overlayFileOperations) Flush(ctx context.Context, file *File) error { + // Flush whatever handles we have. + var err error + f.upperMu.Lock() + if f.upper != nil { + err = f.upper.FileOperations.Flush(ctx, f.upper) + } + f.upperMu.Unlock() + if f.lower != nil { + err = f.lower.FileOperations.Flush(ctx, f.lower) + } + return err +} + +// ConfigureMMap implements FileOperations.ConfigureMMap. +func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opts *memmap.MMapOpts) error { + o := file.Dirent.Inode.overlay + + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if !o.isMappableLocked() { + return syserror.ENODEV + } + // FIXME: This is a copy/paste of fsutil.GenericConfigureMMap, + // which we can't use because the overlay implementation is in package fs, + // so depending on fs/fsutil would create a circular dependency. Move + // overlay to fs/overlay. + opts.Mappable = o + opts.MappingIdentity = file + file.IncRef() + return nil +} + +// Ioctl implements fs.FileOperations.Ioctl and always returns ENOTTY. +func (*overlayFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +// readdirEntries returns a sorted map of directory entries from the +// upper and/or lower filesystem. +func readdirEntries(ctx context.Context, o *overlayEntry) (*SortedDentryMap, error) { + // Assert that there is at least one upper or lower entry. + if o.upper == nil && o.lower == nil { + panic("invalid overlayEntry, needs at least one Inode") + } + entries := make(map[string]DentAttr) + + // Try the upper filesystem first. + if o.upper != nil { + var err error + entries, err = readdirOne(ctx, NewTransientDirent(o.upper)) + if err != nil { + return nil, err + } + } + + // Try the lower filesystem next. 
+ if o.lower != nil { + lowerEntries, err := readdirOne(ctx, NewTransientDirent(o.lower)) + if err != nil { + return nil, err + } + for name, entry := range lowerEntries { + // Skip this name if it is a negative entry in the + // upper or there exists a whiteout for it. + if o.upper != nil { + if overlayHasWhiteout(o.upper, name) { + continue + } + } + // Prefer the entries from the upper filesystem + // when names overlap. + if _, ok := entries[name]; !ok { + entries[name] = entry + } + } + } + + // Sort and return the entries. + return NewSortedDentryMap(entries), nil +} + +// readdirOne reads all of the directory entries from d. +func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { + dir, err := d.Inode.GetFile(ctx, d, FileFlags{Read: true}) + if err != nil { + return nil, err + } + defer dir.DecRef() + + // Use a stub serializer to read the entries into memory. + stubSerializer := &CollectEntriesSerializer{} + if err := dir.Readdir(ctx, stubSerializer); err != nil { + return nil, err + } + // The "." and ".." entries are from the overlay Inode's Dirent, not the stub. + delete(stubSerializer.Entries, ".") + delete(stubSerializer.Entries, "..") + return stubSerializer.Entries, nil +} diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go new file mode 100644 index 000000000..407ba8562 --- /dev/null +++ b/pkg/sentry/fs/file_overlay_test.go @@ -0,0 +1,137 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fs_test
+
+import (
+	"reflect"
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+func TestReaddir(t *testing.T) {
+	ctx := contexttest.Context(t)
+	ctx = &rootContext{
+		Context: ctx,
+		root:    fs.NewDirent(newTestRamfsDir(ctx, nil, nil), "root"),
+	}
+	for _, test := range []struct {
+		// Test description.
+		desc string
+
+		// Lookup parameters.
+		dir *fs.Inode
+
+		// Want from lookup.
+		err   error
+		names []string
+	}{
+		{
+			desc: "no upper, lower has entries",
+			dir: fs.NewTestOverlayDir(ctx,
+				nil, /* upper */
+				newTestRamfsDir(ctx, []dirContent{
+					{name: "a"},
+					{name: "b"},
+				}, nil), /* lower */
+			),
+			names: []string{".", "..", "a", "b"},
+		},
+		{
+			desc: "upper has entries, no lower",
+			dir: fs.NewTestOverlayDir(ctx,
+				newTestRamfsDir(ctx, []dirContent{
+					{name: "a"},
+					{name: "b"},
+				}, nil), /* upper */
+				nil, /* lower */
+			),
+			names: []string{".", "..", "a", "b"},
+		},
+		{
+			desc: "upper and lower, entries combine",
+			dir: fs.NewTestOverlayDir(ctx,
+				newTestRamfsDir(ctx, []dirContent{
+					{name: "a"},
+				}, nil), /* upper */
+				newTestRamfsDir(ctx, []dirContent{
+					{name: "b"},
+				}, nil), /* lower */
+			),
+			names: []string{".", "..", "a", "b"},
+		},
+		{
+			desc: "upper and lower, entries combine, none are masked",
+			dir: fs.NewTestOverlayDir(ctx,
+				newTestRamfsDir(ctx, []dirContent{
+					{name: "a"},
+				}, []string{"b"}), /* upper */
+				newTestRamfsDir(ctx, []dirContent{
+					{name: "c"},
+				}, nil), /* lower */
+			),
+			names: []string{".", "..", "a", "c"},
+		},
+		{
+			desc: "upper and lower, entries combine, upper masks some of lower",
+			dir: fs.NewTestOverlayDir(ctx,
+				newTestRamfsDir(ctx, []dirContent{
+					{name: "a"},
+				}, []string{"b"}), /* upper */
+				newTestRamfsDir(ctx, []dirContent{
+					{name: "b"}, /* will be masked */
+					{name: "c"},
+				}, nil), /* lower */
+			),
+			names: []string{".", "..", "a", "c"},
+		},
+	} {
+		t.Run(test.desc, func(t *testing.T) {
+			openDir, err := test.dir.GetFile(ctx, fs.NewDirent(test.dir, "stub"), fs.FileFlags{Read: true})
+			if err != nil {
+				t.Fatalf("GetFile got error %v, want nil", err)
+			}
+			stubSerializer := &fs.CollectEntriesSerializer{}
+			err = openDir.Readdir(ctx, stubSerializer)
+			if err != test.err {
+				t.Fatalf("Readdir got error %v, want nil", err)
+			}
+			if err != nil {
+				return
+			}
+			if !reflect.DeepEqual(stubSerializer.Order, test.names) {
+				t.Errorf("Readdir got names %v, want %v", stubSerializer.Order, test.names)
+			}
+		})
+	}
+}
+
+type rootContext struct {
+	context.Context
+	root *fs.Dirent
+}
+
+// Value implements context.Context.
+func (r *rootContext) Value(key interface{}) interface{} {
+	switch key {
+	case fs.CtxRoot:
+		r.root.IncRef()
+		return r.root
+	default:
+		return r.Context.Value(key)
+	}
+}
diff --git a/pkg/sentry/fs/file_state.go b/pkg/sentry/fs/file_state.go
new file mode 100644
index 000000000..341cbda0b
--- /dev/null
+++ b/pkg/sentry/fs/file_state.go
@@ -0,0 +1,30 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fs
+
+// afterLoad is invoked by stateify.
+func (f *File) afterLoad() {
+	f.mu.Init()
+}
+
+// saveFlags is invoked by stateify.
+func (f *File) saveFlags() FileFlags {
+	return f.flags.Load().(FileFlags)
+}
+
+// loadFlags is invoked by stateify.
+func (f *File) loadFlags(flags FileFlags) { + f.flags.Store(flags) +} diff --git a/pkg/sentry/fs/file_test.go b/pkg/sentry/fs/file_test.go new file mode 100644 index 000000000..18aee7101 --- /dev/null +++ b/pkg/sentry/fs/file_test.go @@ -0,0 +1,24 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import "io" + +var ( + _ = io.Reader(&FileReader{}) + _ = io.ReaderAt(&FileReader{}) + _ = io.Writer(&FileWriter{}) + _ = io.WriterAt(&FileWriter{}) +) diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go new file mode 100644 index 000000000..7cd76dfe9 --- /dev/null +++ b/pkg/sentry/fs/filesystems.go @@ -0,0 +1,162 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fs + +import ( + "fmt" + "sort" + "strings" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// FilesystemFlags matches include/linux/fs.h:file_system_type.fs_flags. +type FilesystemFlags int + +const ( + // FilesystemRequiresDev indicates that the file system requires a device name + // on mount. It is used to construct the output of /proc/filesystems. + FilesystemRequiresDev FilesystemFlags = 1 + + // Currently other flags are not used, but can be pulled in from + // include/linux/fs.h:file_system_type as needed. +) + +// Filesystem is a mountable file system. +type Filesystem interface { + // Name is the unique identifier of the file system. It corresponds to the + // filesystemtype argument of sys_mount and will appear in the output of + // /proc/filesystems. + Name() string + + // Flags indicate common properties of the file system. + Flags() FilesystemFlags + + // Mount generates a mountable Inode backed by device and configured + // using file system independent flags and file system dependent + // data options. + Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) + + // AllowUserMount determines whether mount(2) is allowed to mount a + // file system of this type. + AllowUserMount() bool +} + +// filesystems is the global set of registered file systems. It does not need +// to be saved. Packages registering and unregistering file systems must do so +// before calling save/restore methods. +var filesystems = struct { + // mu protects registered below. + mu sync.Mutex + + // registered is a set of registered Filesystems. + registered map[string]Filesystem +}{ + registered: make(map[string]Filesystem), +} + +// RegisterFilesystem registers a new file system that is visible to mount and +// the /proc/filesystems list. Packages implementing Filesystem should call +// RegisterFilesystem in init(). 
+func RegisterFilesystem(f Filesystem) { + filesystems.mu.Lock() + defer filesystems.mu.Unlock() + + if _, ok := filesystems.registered[f.Name()]; ok { + panic(fmt.Sprintf("filesystem already registered at %q", f.Name())) + } + filesystems.registered[f.Name()] = f +} + +// UnregisterFilesystem removes a file system from the global set. To keep the +// file system set compatible with save/restore, UnregisterFilesystem must be +// called before save/restore methods. +// +// For instance, packages may unregister their file system after it is mounted. +// This makes sense for pseudo file systems that should not be visible or +// mountable. See whitelistfs in fs/host/fs.go for one example. +func UnregisterFilesystem(name string) { + filesystems.mu.Lock() + defer filesystems.mu.Unlock() + + delete(filesystems.registered, name) +} + +// FindFilesystem returns a Filesystem registered at name or (nil, false) if name +// is not a file system type that can be found in /proc/filesystems. +func FindFilesystem(name string) (Filesystem, bool) { + filesystems.mu.Lock() + defer filesystems.mu.Unlock() + + f, ok := filesystems.registered[name] + return f, ok +} + +// GetFilesystems returns the set of registered filesystems in a consistent order. +func GetFilesystems() []Filesystem { + filesystems.mu.Lock() + defer filesystems.mu.Unlock() + + var ss []Filesystem + for _, s := range filesystems.registered { + ss = append(ss, s) + } + sort.Slice(ss, func(i, j int) bool { return ss[i].Name() < ss[j].Name() }) + return ss +} + +// MountSourceFlags represents all mount option flags as a struct. +type MountSourceFlags struct { + // ReadOnly corresponds to mount(2)'s "MS_RDONLY" and indicates that + // the filesystem should be mounted read-only. + ReadOnly bool + + // NoAtime corresponds to mount(2)'s "MS_NOATIME" and indicates that + // the filesystem should not update access time in-place. 
+ NoAtime bool + + // ForcePageCache causes all filesystem I/O operations to use the page + // cache, even when the platform supports direct mapped I/O. This + // doesn't correspond to any Linux mount options. + ForcePageCache bool +} + +// GenericMountSourceOptions splits a string containing comma separated tokens of the +// format 'key=value' or 'key' into a map of keys and values. For example: +// +// data = "key0=value0,key1,key2=value2" -> map{'key0':'value0','key1':'','key2':'value2'} +// +// If data contains duplicate keys, then the last token wins. +func GenericMountSourceOptions(data string) map[string]string { + options := make(map[string]string) + if len(data) == 0 { + // Don't return a nil map, callers might not be expecting that. + return options + } + + // Parse options and skip empty ones. + for _, opt := range strings.Split(data, ",") { + if len(opt) > 0 { + res := strings.SplitN(opt, "=", 2) + if len(res) == 2 { + options[res[0]] = res[1] + } else { + options[opt] = "" + } + } + } + return options +} diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD new file mode 100644 index 000000000..51a390d77 --- /dev/null +++ b/pkg/sentry/fs/filetest/BUILD @@ -0,0 +1,35 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "filetest_state", + srcs = [ + "filetest.go", + ], + out = "filetest_state.go", + package = "filetest", +) + +go_library( + name = "filetest", + testonly = 1, + srcs = [ + "filetest.go", + "filetest_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/waiter", + ], +) diff --git 
a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go new file mode 100644 index 000000000..1831aa82f --- /dev/null +++ b/pkg/sentry/fs/filetest/filetest.go @@ -0,0 +1,59 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filetest provides a test implementation of an fs.File. +package filetest + +import ( + "fmt" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// TestFileOperations is an implementation of the File interface. It provides all +// required methods. +type TestFileOperations struct { + fsutil.NoopRelease `state:"nosave"` + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` +} + +// NewTestFile creates and initializes a new test file. 
+func NewTestFile(tb testing.TB) *fs.File { + ctx := contexttest.Context(tb) + dirent := fs.NewDirent(anon.NewInode(ctx), "test") + return fs.NewFile(ctx, dirent, fs.FileFlags{}, &TestFileOperations{}) +} + +// Read just fails the request. +func (*TestFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, fmt.Errorf("Readv not implemented") +} + +// Write just fails the request. +func (*TestFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, fmt.Errorf("Writev not implemented") +} diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go new file mode 100644 index 000000000..dfa6a3d62 --- /dev/null +++ b/pkg/sentry/fs/flags.go @@ -0,0 +1,67 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// FileFlags encodes file flags. +type FileFlags struct { + // Direct indicates that I/O should be done directly. + Direct bool + + // NonBlocking indicates that I/O should not block. + NonBlocking bool + + // Sync indicates that any writes should be synchronous. + Sync bool + + // Append indicates this file is append only. + Append bool + + // Read indicates this file is readable. + Read bool + + // Write indicates this file is writeable. + Write bool + + // Pread indicates this file is readable at an arbitrary offset. + Pread bool + + // Pwrite indicates this file is writable at an arbitrary offset. 
+ Pwrite bool + + // Directory indicates that this file must be a directory. + Directory bool +} + +// SettableFileFlags is a subset of FileFlags above that can be changed +// via fcntl(2) using the F_SETFL command. +type SettableFileFlags struct { + // Direct indicates that I/O should be done directly. + Direct bool + + // NonBlocking indicates that I/O should not block. + NonBlocking bool + + // Append indicates this file is append only. + Append bool +} + +// Settable returns the subset of f that are settable. +func (f FileFlags) Settable() SettableFileFlags { + return SettableFileFlags{ + Direct: f.Direct, + NonBlocking: f.NonBlocking, + Append: f.Append, + } +} diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go new file mode 100644 index 000000000..f54f767d3 --- /dev/null +++ b/pkg/sentry/fs/fs.go @@ -0,0 +1,88 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fs implements a virtual filesystem layer. +// +// Specific filesystem implementations must implement the InodeOperations +// interface (inode.go). +// +// The MountNamespace (mounts.go) is used to create a collection of mounts in +// a filesystem rooted at a given Inode. +// +// MountSources (mount.go) form a tree, with each mount holding pointers to its +// parent and children. +// +// Dirents (dirents.go) wrap Inodes in a caching layer. 
+// +// When multiple locks are to be held at the same time, they should be acquired +// in the following order. +// +// Either: +// File.mu +// Locks in FileOperations implementations +// goto Dirent-Locks +// +// Or: +// MountNamespace.mu +// goto Dirent-Locks +// +// Dirent-Locks: +// renameMu +// Dirent.dirMu +// Dirent.mu +// DirentCache.mu +// Locks in InodeOperations implementations or overlayEntry +// Inode.Watches.mu (see `Inotify` for other lock ordering) +// MountSource.mu +// +// If multiple Dirent or MountSource locks must be taken, locks in the parent must be +// taken before locks in their children. +// +// If locks must be taken on multiple unrelated Dirents, renameMu must be taken +// first. See lockForRename. +package fs + +import ( + "sync" +) + +// work is a sync.WaitGroup that can be used to queue asynchronous operations +// via Do. Callers can use Barrier to ensure no operations are outstanding. +var work sync.WaitGroup + +// AsyncBarrier waits for all outstanding asynchronous work to complete. +func AsyncBarrier() { + work.Wait() +} + +// Async executes a function asynchronously. +func Async(f func()) { + work.Add(1) + go func() { // S/R-SAFE: Barrier must be called. + defer work.Done() // Ensure Done in case of panic. + f() + }() +} + +// ErrSaveRejection indicates a failed save due to unsupported file system state +// such as dangling open fd, etc. +type ErrSaveRejection struct { + // Err is the wrapped error. + Err error +} + +// Error returns a sensible description of the save rejection error. 
+func (e ErrSaveRejection) Error() string { + return "save rejected due to unsupported file system state: " + e.Err.Error() +} diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD new file mode 100644 index 000000000..4fa6395f7 --- /dev/null +++ b/pkg/sentry/fs/fsutil/BUILD @@ -0,0 +1,149 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "fsutil_state", + srcs = [ + "dirty_set_impl.go", + "file.go", + "file_range_set_impl.go", + "frame_ref_set_impl.go", + "handle.go", + "host_file_mapper.go", + "host_file_mapper_state.go", + "inode.go", + "inode_cached.go", + ], + out = "fsutil_state.go", + package = "fsutil", +) + +go_template_instance( + name = "dirty_set_impl", + out = "dirty_set_impl.go", + imports = { + "memmap": "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "Dirty", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "memmap.MappableRange", + "Value": "DirtyInfo", + "Functions": "dirtySetFunctions", + }, +) + +go_template_instance( + name = "frame_ref_set_impl", + out = "frame_ref_set_impl.go", + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "frameRef", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "uint64", + "Functions": "frameRefSetFunctions", + }, +) + +go_template_instance( + name = "file_range_set_impl", + out = "file_range_set_impl.go", + imports = { + "memmap": "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "FileRange", + template = 
"//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "memmap.MappableRange", + "Value": "uint64", + "Functions": "fileRangeSetFunctions", + }, +) + +go_library( + name = "fsutil", + srcs = [ + "dirty_set.go", + "dirty_set_impl.go", + "file.go", + "file_range_set.go", + "file_range_set_impl.go", + "frame_ref_set.go", + "frame_ref_set_impl.go", + "fsutil.go", + "fsutil_state.go", + "handle.go", + "host_file_mapper.go", + "host_file_mapper_state.go", + "host_file_mapper_unsafe.go", + "inode.go", + "inode_cached.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "fsutil_x_test", + size = "small", + srcs = ["handle_test.go"], + deps = [ + ":fsutil", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/usermem", + ], +) + +go_test( + name = "fsutil_test", + size = "small", + srcs = [ + "dirty_set_test.go", + "inode_cached_test.go", + ], + embed = [":fsutil"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/safemem", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/fsutil/README.md b/pkg/sentry/fs/fsutil/README.md new file mode 100644 index 000000000..d3780e9fa --- /dev/null +++ b/pkg/sentry/fs/fsutil/README.md @@ -0,0 +1,207 @@ +This package provides utilities for implementing virtual filesystem objects. 
+ +[TOC] + +## Page cache + +`CachingInodeOperations` implements a page cache for files that cannot use the +host page cache. Normally these are files that store their data in a remote +filesystem. This also applies to files that are accessed on a platform that does +not support directly memory mapping host file descriptors (e.g. the ptrace +platform). + +An `CachingInodeOperations` buffers regions of a single file into memory. It is +owned by an `fs.Inode`, the in-memory representation of a file (all open file +descriptors are backed by an `fs.Inode`). The `fs.Inode` provides operations for +reading memory into an `CachingInodeOperations`, to represent the contents of +the file in-memory, and for writing memory out, to relieve memory pressure on +the kernel and to synchronize in-memory changes to filesystems. + +An `CachingInodeOperations` enables readable and/or writable memory access to +file content. Files can be mapped shared or private, see mmap(2). When a file is +mapped shared, changes to the file via write(2) and truncate(2) are reflected in +the shared memory region. Conversely, when the shared memory region is modified, +changes to the file are visible via read(2). Multiple shared mappings of the +same file are coherent with each other. This is consistent with Linux. + +When a file is mapped private, updates to the mapped memory are not visible to +other memory mappings. Updates to the mapped memory are also not reflected in +the file content as seen by read(2). If the file is changed after a private +mapping is created, for instance by write(2), the change to the file may or may +not be reflected in the private mapping. This is consistent with Linux. + +An `CachingInodeOperations` keeps track of ranges of memory that were modified +(or "dirtied"). When the file is explicitly synced via fsync(2), only the dirty +ranges are written out to the filesystem. 
Any error returned indicates a failure +to write all dirty memory of an `CachingInodeOperations` to the filesystem. In +this case the filesystem may be in an inconsistent state. The same operation can +be performed on the shared memory itself using msync(2). If neither fsync(2) nor +msync(2) is performed, then the dirty memory is written out in accordance with +the `CachingInodeOperations` eviction strategy (see below) and there is no +guarantee that memory will be written out successfully in full. + +### Memory allocation and eviction + +An `CachingInodeOperations` implements the following allocation and eviction +strategy: + +- Memory is allocated and brought up to date with the contents of a file when + a region of mapped memory is accessed (or "faulted on"). + +- Dirty memory is written out to filesystems when an fsync(2) or msync(2) + operation is performed on a memory mapped file, for all memory mapped files + when saved, and/or when there are no longer any memory mappings of a range + of a file, see munmap(2). As the latter implies, in the absence of a panic + or SIGKILL, dirty memory is written out for all memory mapped files when an + application exits. + +- Memory is freed when there are no longer any memory mappings of a range of a + file (e.g. when an application exits). This behavior is consistent with + Linux for shared memory that has been locked via mlock(2). + +Notably, memory is not allocated for read(2) or write(2) operations. This means +that reads and writes to the file are only accelerated by an +`CachingInodeOperations` if the file being read or written has been memory +mapped *and* if the shared memory has been accessed at the region being read or +written. This diverges from Linux which buffers memory into a page cache on +read(2) proactively (i.e. readahead) and delays writing it out to filesystems on +write(2) (i.e. writeback). 
The absence of these optimizations is not visible to +applications beyond less than optimal performance when repeatedly reading and/or +writing to same region of a file. See [Future Work](#future-work) for plans to +implement these optimizations. + +Additionally, memory held by `CachingInodeOperationss` is currently unbounded in +size. An `CachingInodeOperations` does not write out dirty memory and free it +under system memory pressure. This can cause pathological memory usage. + +When memory is written back, an `CachingInodeOperations` may write regions of +shared memory that were never modified. This is due to the strategy of +minimizing page faults (see below) and handling only a subset of memory write +faults. In the absence of an application or sentry crash, it is guaranteed that +if a region of shared memory was written to, it is written back to a filesystem. + +### Life of a shared memory mapping + +A file is memory mapped via mmap(2). For example, if `A` is an address, an +application may execute: + +``` +mmap(A, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +``` + +This creates a shared mapping of fd that reflects 4k of the contents of fd +starting at offset 0, accessible at address `A`. This in turn creates a virtual +memory area region ("vma") which indicates that [`A`, `A`+0x1000) is now a valid +address range for this application to access. + +At this point, memory has not been allocated in the file's +`CachingInodeOperations`. It is also the case that the address range [`A`, +`A`+0x1000) has not been mapped on the host on behalf of the application. If the +application then tries to modify 8 bytes of the shared memory: + +``` +char buffer[] = "aaaaaaaa"; +memcpy(A, buffer, 8); +``` + +The host then sends a `SIGSEGV` to the sentry because the address range [`A`, +`A`+8) is not mapped on the host. The `SIGSEGV` indicates that the memory was +accessed writable. 
The sentry looks up the vma associated with [`A`, `A`+8), +finds the file that was mapped and its `CachingInodeOperations`. It then calls +`CachingInodeOperations.MapInto` which allocates memory to back [`A`, `A`+8). It +may choose to allocate more memory (i.e. do "readahead") to minimize subsequent +faults. + +Memory that is allocated comes from a host tmpfs file (see `filemem.FileMem`). +The host tmpfs file memory is brought up to date with the contents of the mapped +file on its filesystem. The region of the host tmpfs file that reflects the +mapped file is then mapped into the host address space of the application so +that subsequent memory accesses do not repeatedly generate a `SIGSEGV`. + +The range that was allocated, including any extra memory allocation to minimize +faults, is marked dirty due to the write fault. This overcounts dirty memory if +the extra memory allocated is never modified. + +To make the scenario more interesting, imagine that this application spawns +another process and maps the same file in the exact same way: + +``` +mmap(A, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +``` + +Imagine that this process then tries to modify the file again but with only 4 +bytes: + +``` +char buffer[] = "bbbb"; +memcpy(A, buffer, 4); +``` + +Since the first process has already mapped and accessed the same region of the +file writable, `CachingInodeOperations.MapInto` is called but re-maps the memory +that has already been allocated (because the host mapping can be invalidated at +any time) rather than allocating new memory. The address range [`A`, `A`+0x1000) +reflects the same cached view of the file as the first process sees. For +example, reading 8 bytes from the file from either process via read(2) starting +at offset 0 returns a consistent "bbbbaaaa". 
+ +When this process no longer needs the shared memory, it may do: + +``` +munmap(A, 0x1000); +``` + +At this point, the modified memory cached by the `CachingInodeOperations` is not +written back to the file because it is still in use by the first process that +mapped it. When the first process also does: + +``` +munmap(A, 0x1000); +``` + +Then the last memory mapping of the file at the range [0, 0x1000) is gone. The +file's `CachingInodeOperations` then starts writing back memory marked dirty to +the file on its filesystem. Once writing completes, regardless of whether it was +successful, the `CachingInodeOperations` frees the memory cached at the range +[0, 0x1000). + +Subsequent read(2) or write(2) operations on the file go directly to the +filesystem since there no longer exists memory for it in its +`CachingInodeOperations`. + +## Future Work + +### Page cache + +The sentry does not yet implement the readahead and writeback optimizations for +read(2) and write(2) respectively. To do so, on read(2) and/or write(2) the +sentry must ensure that memory is allocated in a page cache to read or write +into. However, the sentry cannot boundlessly allocate memory. If it did, the +host would eventually OOM-kill the sentry+application process. This means that +the sentry must implement a page cache memory allocation strategy that is +bounded by a global user or container imposed limit. When this limit is +approached, the sentry must decide from which page cache memory should be freed +so that it can allocate more memory. If it makes a poor decision, the sentry may +end up freeing and re-allocating memory to back regions of files that are +frequently used, nullifying the optimization (and in some cases causing worse +performance due to the overhead of memory allocation and general management). +This is a form of "cache thrashing". + +In Linux, much research has been done to select and implement a lightweight but +optimal page cache eviction algorithm. 
Linux makes use of hardware page bits to +keep track of whether memory has been accessed. The sentry does not have direct +access to hardware. Implementing a similarly lightweight and optimal page cache +eviction algorithm will need to either introduce a kernel interface to obtain +these page bits or find a suitable alternative proxy for access events. + +In Linux, readahead happens by default but is not always ideal. For instance, +for files that are not read sequentially, it would be more ideal to simply read +from only those regions of the file rather than to optimistically cache some +number of bytes ahead of the read (up to 2MB in Linux) if the bytes cached won't +be accessed. Linux implements the fadvise64(2) system call for applications to +specify that a range of a file will not be accessed sequentially. The advice bit +FADV_RANDOM turns off the readahead optimization for the given range in the +given file. However fadvise64 is rarely used by applications so Linux implements +a readahead backoff strategy if reads are not sequential. To ensure that +application performance is not degraded, the sentry must implement a similar +backoff strategy. diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go new file mode 100644 index 000000000..9c6c98542 --- /dev/null +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -0,0 +1,213 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package fsutil

import (
	"math"

	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to
// implement Mappables that cache data from another source.
//
// type DirtySet

// DirtyInfo is the value type of DirtySet, and represents information about a
// Mappable offset that is dirty (the cached data for that offset is newer than
// its source).
type DirtyInfo struct {
	// Keep is true if the represented offset is concurrently writable, such
	// that writing the data for that offset back to the source does not
	// guarantee that the offset is clean (since it may be concurrently
	// rewritten after the writeback).
	Keep bool
}

// dirtySetFunctions implements segment.Functions for DirtySet.
type dirtySetFunctions struct{}

// MinKey implements segment.Functions.MinKey.
//
// Mappable offsets are unsigned, so the minimum key is 0.
func (dirtySetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey implements segment.Functions.MaxKey.
func (dirtySetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

// ClearValue implements segment.Functions.ClearValue.
//
// DirtyInfo holds no resources, so nothing needs to be released.
func (dirtySetFunctions) ClearValue(val *DirtyInfo) {
}

// Merge implements segment.Functions.Merge.
//
// Adjacent segments may merge only if they carry identical DirtyInfo (i.e.
// the same Keep flag).
func (dirtySetFunctions) Merge(_ memmap.MappableRange, val1 DirtyInfo, _ memmap.MappableRange, val2 DirtyInfo) (DirtyInfo, bool) {
	if val1 != val2 {
		return DirtyInfo{}, false
	}
	return val1, true
}

// Split implements segment.Functions.Split.
//
// Both halves of a split segment retain the original DirtyInfo.
func (dirtySetFunctions) Split(_ memmap.MappableRange, val DirtyInfo, _ uint64) (DirtyInfo, DirtyInfo) {
	return val, val
}

// MarkClean marks all offsets in mr as not dirty, except for those to which
// KeepDirty has been applied.
func (ds *DirtySet) MarkClean(mr memmap.MappableRange) {
	seg := ds.LowerBoundSegment(mr.Start)
	for seg.Ok() && seg.Start() < mr.End {
		if seg.Value().Keep {
			// Offsets marked via KeepDirty stay dirty; skip them.
			seg = seg.NextSegment()
			continue
		}
		// Restrict the segment to mr before removing it so that dirty
		// offsets outside mr are preserved.
		seg = ds.Isolate(seg, mr)
		seg = ds.Remove(seg).NextSegment()
	}
}

// KeepClean marks all offsets in mr as not dirty, even those that were
// previously kept dirty by KeepDirty.
func (ds *DirtySet) KeepClean(mr memmap.MappableRange) {
	ds.RemoveRange(mr)
}

// MarkDirty marks all offsets in mr as dirty.
func (ds *DirtySet) MarkDirty(mr memmap.MappableRange) {
	ds.setDirty(mr, false)
}

// KeepDirty marks all offsets in mr as dirty and prevents them from being
// marked as clean by MarkClean.
func (ds *DirtySet) KeepDirty(mr memmap.MappableRange) {
	ds.setDirty(mr, true)
}

// setDirty is the shared implementation of MarkDirty (keep=false) and
// KeepDirty (keep=true): it ensures every offset in mr is covered by a
// segment, upgrading existing segments to Keep when requested.
func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) {
	var changedAny bool
	defer func() {
		if changedAny {
			// Re-merge only if something changed; insertion and
			// isolation may have fragmented segments within mr.
			ds.MergeRange(mr)
		}
	}()
	seg, gap := ds.Find(mr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < mr.End:
			// Existing dirty segment: only needs modification if we
			// are upgrading it to Keep.
			if keep && !seg.Value().Keep {
				changedAny = true
				seg = ds.Isolate(seg, mr)
				seg.ValuePtr().Keep = true
			}
			seg, gap = seg.NextNonEmpty()

		case gap.Ok() && gap.Start() < mr.End:
			// Gap within mr: insert a new dirty segment covering the
			// intersection of the gap and mr.
			changedAny = true
			seg = ds.Insert(gap, gap.Range().Intersect(mr), DirtyInfo{keep})
			seg, gap = seg.NextNonEmpty()

		default:
			return
		}
	}
}

// SyncDirty passes pages in the range mr that are stored in cache and
// identified as dirty to writeAt, updating dirty to reflect successful writes.
// If writeAt returns a successful partial write, SyncDirty will call it
// repeatedly until all bytes have been written. max is the true size of the
// cached object; offsets beyond max will not be passed to writeAt, even if
// they are marked dirty.
func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
	var changedDirty bool
	defer func() {
		if changedDirty {
			// Isolation may have fragmented segments within mr;
			// re-merge before returning.
			dirty.MergeRange(mr)
		}
	}()
	dseg := dirty.LowerBoundSegment(mr.Start)
	for dseg.Ok() && dseg.Start() < mr.End {
		var dr memmap.MappableRange
		if dseg.Value().Keep {
			// Keep-dirty segments are written back but must remain in
			// the set, so do not isolate (and later remove) them.
			dr = dseg.Range().Intersect(mr)
		} else {
			changedDirty = true
			dseg = dirty.Isolate(dseg, mr)
			dr = dseg.Range()
		}
		if err := syncDirtyRange(ctx, dr, cache, max, mem, writeAt); err != nil {
			return err
		}
		if dseg.Value().Keep {
			dseg = dseg.NextSegment()
		} else {
			// The writeback succeeded, so this range is now clean.
			dseg = dirty.Remove(dseg).NextSegment()
		}
	}
	return nil
}

// SyncDirtyAll passes all pages stored in cache identified as dirty to
// writeAt, updating dirty to reflect successful writes. If writeAt returns a
// successful partial write, SyncDirtyAll will call it repeatedly until all
// bytes have been written. max is the true size of the cached object; offsets
// beyond max will not be passed to writeAt, even if they are marked dirty.
func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
	dseg := dirty.FirstSegment()
	for dseg.Ok() {
		if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil {
			return err
		}
		if dseg.Value().Keep {
			// Keep-dirty segments stay in the set after writeback.
			dseg = dseg.NextSegment()
		} else {
			dseg = dirty.Remove(dseg).NextSegment()
		}
	}
	return nil
}

// Preconditions: mr must be page-aligned.
// syncDirtyRange writes back the cached pages covering mr by passing their
// contents to writeAt, clamping the writeback to max (the true size of the
// cached object).
func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
	for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() {
		wbr := cseg.Range().Intersect(mr)
		if max < wbr.Start {
			// Everything from here on lies entirely beyond the end of
			// the object; nothing left to write.
			break
		}
		ims, err := mem.MapInternal(cseg.FileRangeOf(wbr), usermem.Read)
		if err != nil {
			return err
		}
		if max < wbr.End {
			// Truncate the writeback to the true object size.
			ims = ims.TakeFirst64(max - wbr.Start)
		}
		offset := wbr.Start
		for !ims.IsEmpty() {
			// writeAt may return short writes; loop until all bytes
			// have been consumed.
			n, err := writeAt(ctx, ims, offset)
			if err != nil {
				return err
			}
			offset += n
			ims = ims.DropFirst64(n)
		}
	}
	return nil
}
diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go
new file mode 100644
index 000000000..f7693cb19
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/dirty_set_test.go
@@ -0,0 +1,38 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fsutil

import (
	"reflect"
	"testing"

	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// TestDirtySet verifies that KeepDirty shields a range from MarkClean: after
// dirtying two pages, keeping the second dirty, and cleaning both, only the
// second page should remain in the set (with Keep set).
func TestDirtySet(t *testing.T) {
	var set DirtySet
	set.MarkDirty(memmap.MappableRange{0, 2 * usermem.PageSize})
	set.KeepDirty(memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize})
	set.MarkClean(memmap.MappableRange{0, 2 * usermem.PageSize})
	want := &DirtySegmentDataSlices{
		Start:  []uint64{usermem.PageSize},
		End:    []uint64{2 * usermem.PageSize},
		Values: []DirtyInfo{{Keep: true}},
	}
	if got := set.ExportSortedSlices(); !reflect.DeepEqual(got, want) {
		t.Errorf("set:\n\tgot %v,\n\twant %v", got, want)
	}
}
diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go
new file mode 100644
index 000000000..a7329f1c9
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/file.go
@@ -0,0 +1,267 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

// NoopRelease implements FileOperations.Release for files that have no
// resources to release.
type NoopRelease struct{}

// Release is a no-op.
func (NoopRelease) Release() {}

// SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor
// is not nil and the seek was on a directory, the cursor will be updated.
//
// Currently only seeking to 0 on a directory is supported.
//
// FIXME: Lift directory seeking limitations.
func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) {
	inode := file.Dirent.Inode
	current := file.Offset()

	// Does the Inode represent a non-seekable type?
	if fs.IsPipe(inode.StableAttr) || fs.IsSocket(inode.StableAttr) {
		return current, syserror.ESPIPE
	}

	// Does the Inode represent a character device?
	if fs.IsCharDevice(inode.StableAttr) {
		// Ignore seek requests.
		//
		// FIXME: This preserves existing
		// behavior but is not universally correct.
		return 0, nil
	}

	// Otherwise compute the new offset.
	switch whence {
	case fs.SeekSet:
		switch inode.StableAttr.Type {
		case fs.RegularFile, fs.SpecialFile, fs.BlockDevice:
			if offset < 0 {
				return current, syserror.EINVAL
			}
			return offset, nil
		case fs.Directory, fs.SpecialDirectory:
			if offset != 0 {
				return current, syserror.EINVAL
			}
			// SEEK_SET to 0 moves the directory "cursor" to the beginning.
			if dirCursor != nil {
				*dirCursor = ""
			}
			return 0, nil
		default:
			return current, syserror.EINVAL
		}
	case fs.SeekCurrent:
		switch inode.StableAttr.Type {
		case fs.RegularFile, fs.SpecialFile, fs.BlockDevice:
			if current+offset < 0 {
				return current, syserror.EINVAL
			}
			return current + offset, nil
		case fs.Directory, fs.SpecialDirectory:
			if offset != 0 {
				return current, syserror.EINVAL
			}
			return current, nil
		default:
			return current, syserror.EINVAL
		}
	case fs.SeekEnd:
		switch inode.StableAttr.Type {
		case fs.RegularFile, fs.BlockDevice:
			// Allow the file to determine the end.
			uattr, err := inode.UnstableAttr(ctx)
			if err != nil {
				return current, err
			}
			sz := uattr.Size
			if sz+offset < 0 {
				return current, syserror.EINVAL
			}
			return sz + offset, nil
		// FIXME: This is not universally correct.
		// Remove SpecialDirectory.
		case fs.SpecialDirectory:
			if offset != 0 {
				return current, syserror.EINVAL
			}
			// SEEK_END to 0 moves the directory "cursor" to the end.
			//
			// FIXME: This ensures that after the seek,
			// reading on the directory will get EOF. But it is not
			// correct in general because the directory can grow in
			// size; attempting to read those new entries will be
			// futile (EOF will always be the result).
			return fs.FileMaxOffset, nil
		default:
			return current, syserror.EINVAL
		}
	}

	// Not a valid seek request.
	return current, syserror.EINVAL
}

// GenericSeek implements FileOperations.Seek for files that use a generic
// seek implementation.
type GenericSeek struct{}

// Seek implements fs.FileOperations.Seek.
func (GenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
	return SeekWithDirCursor(ctx, file, whence, offset, nil)
}

// ZeroSeek implements FileOperations.Seek for files that maintain a constant
// zero-value offset and require a no-op Seek.
type ZeroSeek struct{}

// Seek implements FileOperations.Seek.
func (ZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) {
	return 0, nil
}

// PipeSeek implements FileOperations.Seek and can be used for files that behave
// like pipes (seeking is not supported).
type PipeSeek struct{}

// Seek implements FileOperations.Seek.
func (PipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) {
	return 0, syserror.ESPIPE
}

// NotDirReaddir implements FileOperations.Readdir for non-directories.
type NotDirReaddir struct{}

// Readdir implements FileOperations.Readdir.
func (NotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) {
	return 0, syserror.ENOTDIR
}

// NoFsync implements FileOperations.Fsync for files that don't support syncing.
type NoFsync struct{}

// Fsync implements FileOperations.Fsync.
func (NoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error {
	return syserror.EINVAL
}

// NoopFsync implements FileOperations.Fsync for files that don't need to be
// synced.
type NoopFsync struct{}

// Fsync implements FileOperations.Fsync.
func (NoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error {
	return nil
}

// NoopFlush implements FileOperations.Flush as a no-op.
type NoopFlush struct{}

// Flush implements FileOperations.Flush.
func (NoopFlush) Flush(context.Context, *fs.File) error {
	return nil
}

// NoMMap implements fs.FileOperations.Mappable for files that cannot
// be memory mapped.
type NoMMap struct{}

// ConfigureMMap implements fs.FileOperations.ConfigureMMap.
func (NoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error {
	return syserror.ENODEV
}

// GenericConfigureMMap implements fs.FileOperations.ConfigureMMap for most
// filesystems that support memory mapping.
func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpts) error {
	opts.Mappable = m
	opts.MappingIdentity = file
	// The mapping identity holds a reference on the file for the lifetime
	// of the mapping.
	file.IncRef()
	return nil
}

// NoIoctl implements fs.FileOperations.Ioctl for files that don't implement
// the ioctl syscall.
type NoIoctl struct{}

// Ioctl implements fs.FileOperations.Ioctl.
func (NoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	return 0, syserror.ENOTTY
}

// DirFileOperations implements FileOperations for directories.
type DirFileOperations struct {
	waiter.AlwaysReady `state:"nosave"`
	NoopRelease        `state:"nosave"`
	GenericSeek        `state:"nosave"`
	NoFsync            `state:"nosave"`
	NoopFlush          `state:"nosave"`
	NoMMap             `state:"nosave"`
	NoIoctl            `state:"nosave"`

	// dentryMap is a SortedDentryMap used to implement Readdir.
	dentryMap *fs.SortedDentryMap

	// dirCursor contains the name of the last directory entry that was
	// serialized.
	dirCursor string
}

// NewDirFileOperations returns a new DirFileOperations that will iterate the
// given dentry map.
func NewDirFileOperations(dentries *fs.SortedDentryMap) *DirFileOperations {
	return &DirFileOperations{
		dentryMap: dentries,
	}
}

// IterateDir implements DirIterator.IterateDir.
func (dfo *DirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
	n, err := fs.GenericReaddir(dirCtx, dfo.dentryMap)
	return offset + n, err
}

// Readdir implements FileOperations.Readdir.
func (dfo *DirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
	root := fs.RootFromContext(ctx)
	defer root.DecRef()
	dirCtx := &fs.DirCtx{
		Serializer: serializer,
		DirCursor:  &dfo.dirCursor,
	}
	return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset())
}

// Read implements FileOperations.Read.
func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
	return 0, syserror.EISDIR
}

// Write implements FileOperations.Write.
func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
	return 0, syserror.EISDIR
}
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
new file mode 100644
index 000000000..da6949ccb
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -0,0 +1,208 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"fmt"
	"io"
	"math"

	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// FileRangeSet maps offsets into a memmap.Mappable to offsets into a
// platform.File. It is used to implement Mappables that store data in
// sparsely-allocated memory.
//
// type FileRangeSet

// fileRangeSetFunctions implements segment.Functions for FileRangeSet.
type fileRangeSetFunctions struct{}

// MinKey implements segment.Functions.MinKey.
//
// Mappable offsets are unsigned, so the minimum key is 0.
func (fileRangeSetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey implements segment.Functions.MaxKey.
func (fileRangeSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

// ClearValue implements segment.Functions.ClearValue.
//
// Values are raw file offsets; there is nothing to release.
func (fileRangeSetFunctions) ClearValue(_ *uint64) {
}

// Merge implements segment.Functions.Merge.
//
// Adjacent segments may merge only if they map contiguous ranges of the
// backing platform.File.
func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
	if frstart1+mr1.Length() != frstart2 {
		return 0, false
	}
	return frstart1, true
}

// Split implements segment.Functions.Split.
func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
	// The second half maps the file offset advanced by the length of the
	// first half.
	return frstart, frstart + (split - mr.Start)
}

// FileRange returns the FileRange mapped by seg.
func (seg FileRangeIterator) FileRange() platform.FileRange {
	return seg.FileRangeOf(seg.Range())
}

// FileRangeOf returns the FileRange mapped by mr.
//
// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0.
func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange {
	frstart := seg.Value() + (mr.Start - seg.Start())
	return platform.FileRange{frstart, frstart + mr.Length()}
}

// Fill attempts to ensure that all memmap.Mappable offsets in required are
// mapped to a platform.File offset, by allocating from mem with the given
// memory usage kind and invoking readAt to store data into memory. (If readAt
// returns a successful partial read, Fill will call it repeatedly until all
// bytes have been read.) EOF is handled consistently with the requirements of
// mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are
// invalid.
//
// Fill may read offsets outside of required, but will never read offsets
// outside of optional. It returns a non-nil error if any error occurs, even
// if the error only affects offsets in optional, but not in required.
//
// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
// required and optional must be page-aligned.
func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mem platform.Memory, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
	gap := frs.LowerBoundGap(required.Start)
	for gap.Ok() && gap.Start() < required.End {
		if gap.Range().Length() == 0 {
			gap = gap.NextGap()
			continue
		}
		gr := gap.Range().Intersect(optional)

		// Read data into the gap.
		fr, err := platform.AllocateAndFill(mem, gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
			var done uint64
			for !dsts.IsEmpty() {
				n, err := readAt(ctx, dsts, gr.Start+done)
				done += n
				dsts = dsts.DropFirst64(n)
				if err != nil {
					if err == io.EOF {
						// platform.AllocateAndFill truncates down to a page
						// boundary, but FileRangeSet.Fill is supposed to
						// zero-fill to the end of the page in this case.
						donepgaddr, ok := usermem.Addr(done).RoundUp()
						if donepg := uint64(donepgaddr); ok && donepg != done {
							// NOTE(review): the BlockSeq returned by
							// DropFirst64 is discarded here, so the
							// IsEmpty check below still sees the
							// un-advanced dsts — confirm this is
							// intentional.
							dsts.DropFirst64(donepg - done)
							done = donepg
							if dsts.IsEmpty() {
								return done, nil
							}
						}
					}
					return done, err
				}
			}
			return done, nil
		}))

		// Store anything we managed to read into the cache.
		if done := fr.Length(); done != 0 {
			gr.End = gr.Start + done
			gap = frs.Insert(gap, gr, fr.Start).NextGap()
		}

		if err != nil {
			return err
		}
	}
	return nil
}

// Drop removes segments for memmap.Mappable offsets in mr, freeing the
// corresponding platform.FileRanges.
//
// Preconditions: mr must be page-aligned.
func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mem platform.Memory) {
	seg := frs.LowerBoundSegment(mr.Start)
	for seg.Ok() && seg.Start() < mr.End {
		// Restrict the segment to mr before removal so that offsets
		// outside mr keep their file ranges.
		seg = frs.Isolate(seg, mr)
		mem.DecRef(seg.FileRange())
		seg = frs.Remove(seg).NextSegment()
	}
}

// DropAll removes all segments in mr, freeing the corresponding
// platform.FileRanges.
func (frs *FileRangeSet) DropAll(mem platform.Memory) {
	for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		mem.DecRef(seg.FileRange())
	}
	frs.RemoveAll()
}

// Truncate updates frs to reflect Mappable truncation to the given length:
// bytes after the new EOF on the same page are zeroed, and pages after the new
// EOF are freed.
func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) {
	pgendaddr, ok := usermem.Addr(end).RoundUp()
	if ok {
		pgend := uint64(pgendaddr)

		// Free truncated pages.
		frs.SplitAt(pgend)
		seg := frs.LowerBoundSegment(pgend)
		for seg.Ok() {
			mem.DecRef(seg.FileRange())
			seg = frs.Remove(seg).NextSegment()
		}

		if end == pgend {
			// end was already page-aligned; no partial page to zero.
			return
		}
	}

	// Here we know end < end.RoundUp(). If the new EOF lands in the
	// middle of a page that we have, zero out its contents beyond the new
	// length.
	seg := frs.FindSegment(end)
	if seg.Ok() {
		fr := seg.FileRange()
		fr.Start += end - seg.Start()
		ims, err := mem.MapInternal(fr, usermem.Write)
		if err != nil {
			// There's no good recourse from here. This means
			// that we can't keep cached memory consistent with
			// the new end of file. The caller may have already
			// updated the file size on their backing file system.
			//
			// We don't want to risk blindly continuing onward,
			// so in the extremely rare cases this does happen,
			// we abandon ship.
			panic(fmt.Sprintf("Failed to map %v: %v", fr, err))
		}
		if _, err := safemem.ZeroSeq(ims); err != nil {
			panic(fmt.Sprintf("Zeroing %v failed: %v", fr, err))
		}
	}
}
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
new file mode 100644
index 000000000..14dece315
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -0,0 +1,50 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"math"

	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
)

// frameRefSetFunctions implements segment.Functions for a set mapping
// platform.FileRange keys to reference-count values.
type frameRefSetFunctions struct{}

// MinKey implements segment.Functions.MinKey.
func (frameRefSetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey implements segment.Functions.MaxKey.
func (frameRefSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

// ClearValue implements segment.Functions.ClearValue.
//
// Values are plain reference counts; there is nothing to release.
func (frameRefSetFunctions) ClearValue(val *uint64) {
}

// Merge implements segment.Functions.Merge.
//
// Adjacent segments may merge only if they have equal reference counts.
func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
	if val1 != val2 {
		return 0, false
	}
	return val1, true
}

// Split implements segment.Functions.Split.
//
// Both halves of a split segment retain the original reference count.
func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
	return val, val
}
diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go
new file mode 100644
index 000000000..6fe4ef13d
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/fsutil.go
@@ -0,0 +1,26 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fsutil provides utilities for implementing fs.InodeOperations
// and fs.FileOperations:
//
// - For embeddable utilities, see inode.go and file.go.
//
// - For fs.Inodes that require a page cache to be memory mapped, see
// inode_cache.go.
//
// - For fs.Files that implement fs.HandleOps, see handle.go.
//
// - For anon fs.Inodes, see anon.go.
package fsutil
diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go
new file mode 100644
index 000000000..149c0f84a
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/handle.go
@@ -0,0 +1,126 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

// Handle implements FileOperations.
//
// FIXME: Remove Handle entirely in favor of individual fs.File
// implementations using simple generic utilities.
type Handle struct {
	NoopRelease `state:"nosave"`
	NoIoctl     `state:"nosave"`
	// HandleOperations is the backing implementation to which all file
	// operations are delegated.
	HandleOperations fs.HandleOperations

	// dirCursor is the directory cursor.
	dirCursor string
}

// NewHandle returns a File backed by the Dirent and FileFlags.
func NewHandle(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, hops fs.HandleOperations) *fs.File {
	if !fs.IsPipe(dirent.Inode.StableAttr) && !fs.IsSocket(dirent.Inode.StableAttr) {
		// Allow reading/writing at an arbitrary offset for non-pipes
		// and non-sockets.
		flags.Pread = true
		flags.Pwrite = true
	}

	return fs.NewFile(ctx, dirent, flags, &Handle{HandleOperations: hops})
}

// Readiness implements waiter.Waitable.Readiness.
func (h *Handle) Readiness(mask waiter.EventMask) waiter.EventMask {
	return h.HandleOperations.Readiness(mask)
}

// EventRegister implements waiter.Waitable.EventRegister.
func (h *Handle) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	h.HandleOperations.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (h *Handle) EventUnregister(e *waiter.Entry) {
	h.HandleOperations.EventUnregister(e)
}

// Readdir implements FileOperations.Readdir.
func (h *Handle) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
	root := fs.RootFromContext(ctx)
	defer root.DecRef()
	dirCtx := &fs.DirCtx{
		Serializer: serializer,
		DirCursor:  &h.dirCursor,
	}
	n, err := fs.DirentReaddir(ctx, file.Dirent, h, root, dirCtx, file.Offset())
	return n, err
}

// Seek implements FileOperations.Seek.
func (h *Handle) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
	return SeekWithDirCursor(ctx, file, whence, offset, &h.dirCursor)
}

// IterateDir implements DirIterator.IterateDir.
func (h *Handle) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
	return h.HandleOperations.DeprecatedReaddir(ctx, dirCtx, offset)
}

// Read implements FileOperations.Read.
func (h *Handle) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	return h.HandleOperations.DeprecatedPreadv(ctx, dst, offset)
}

// Write implements FileOperations.Write.
func (h *Handle) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
	return h.HandleOperations.DeprecatedPwritev(ctx, src, offset)
}

// Fsync implements FileOperations.Fsync.
func (h *Handle) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error {
	switch syncType {
	case fs.SyncAll, fs.SyncData:
		// Write out metadata.
		if err := file.Dirent.Inode.WriteOut(ctx); err != nil {
			return err
		}
		fallthrough
	case fs.SyncBackingStorage:
		// Use DeprecatedFsync to sync disks.
		return h.HandleOperations.DeprecatedFsync()
	}
	// Unreachable for any valid fs.SyncType.
	panic("invalid sync type")
}

// Flush implements FileOperations.Flush.
func (h *Handle) Flush(context.Context, *fs.File) error {
	return h.HandleOperations.DeprecatedFlush()
}

// ConfigureMMap implements FileOperations.ConfigureMMap.
func (h *Handle) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
	mappable := file.Dirent.Inode.Mappable()
	if mappable == nil {
		return syserror.ENODEV
	}
	return GenericConfigureMMap(file, mappable, opts)
}
diff --git a/pkg/sentry/fs/fsutil/handle_test.go b/pkg/sentry/fs/fsutil/handle_test.go
new file mode 100644
index 000000000..d94c3eb0d
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/handle_test.go
@@ -0,0 +1,227 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil_test + +import ( + "io" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type testInodeOperations struct { + fs.InodeOperations + fs.InodeType + FileSize int64 + writes uint + reads uint +} + +func (t *testInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + return fs.UnstableAttr{Size: t.FileSize}, nil +} + +// Check implements InodeOperations.Check. +func (t *testInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +func (t *testInodeOperations) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + t.reads++ + return t.InodeOperations.DeprecatedPreadv(ctx, dst, offset) +} + +func (t *testInodeOperations) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + t.writes++ + return t.InodeOperations.DeprecatedPwritev(ctx, src, offset) +} + +// testHandle returns a handle for a test node. +// +// The size of the node is fixed at 20 bytes. 
+func testHandle(t *testing.T, flags fs.FileFlags, nt fs.InodeType) (*fs.File, *testInodeOperations) { + ctx := contexttest.Context(t) + m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + n := &testInodeOperations{ + InodeOperations: ramfstest.NewFile(ctx, fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}), + FileSize: 20, + } + d := fs.NewDirent(fs.NewInode(n, m, fs.StableAttr{Type: nt}), "test") + return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), n +} + +func TestHandleOps(t *testing.T) { + h, n := testHandle(t, fs.FileFlags{Read: true, Write: true}, fs.RegularFile) + defer h.DecRef() + + // Make sure a write request works. + if n, err := h.Writev(contexttest.Context(t), usermem.BytesIOSequence([]byte("a"))); n != 1 || err != nil { + t.Fatalf("Writev: got (%d, %v), wanted (1, nil)", n, err) + } + if n.writes != 1 { + t.Errorf("found %d writes, expected 1", n.writes) + } + + // Make sure a read request works. + dst := make([]byte, 1) + if n, err := h.Preadv(contexttest.Context(t), usermem.BytesIOSequence(dst), 0); n != 1 || (err != nil && err != io.EOF) { + t.Errorf("Preadv: got (%d, %v), wanted (1, nil or EOF)", n, err) + } + if dst[0] != 'a' { + t.Errorf("Preadv: read %q, wanted 'a'", dst[0]) + } + if n.reads != 1 { + t.Errorf("found %d reads, expected 1", n.reads) + } +} + +type seekTest struct { + whence fs.SeekWhence + offset int64 + result int64 + err error +} + +type seekSuite struct { + nodeType fs.InodeType + cases []seekTest +} + +// FIXME: This is currently missing fs.SeekEnd tests due to the +// fact that NullInodeOperations returns an error on stat. 
+func TestHandleSeek(t *testing.T) { + ts := []seekSuite{ + { + nodeType: fs.RegularFile, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 10, nil}, + {fs.SeekSet, -5, 10, syscall.EINVAL}, + {fs.SeekCurrent, -1, 9, nil}, + {fs.SeekCurrent, 2, 11, nil}, + {fs.SeekCurrent, -12, 11, syscall.EINVAL}, + {fs.SeekEnd, -1, 19, nil}, + {fs.SeekEnd, 0, 20, nil}, + {fs.SeekEnd, 2, 22, nil}, + }, + }, + { + nodeType: fs.Directory, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 0, syscall.EINVAL}, + {fs.SeekSet, -5, 0, syscall.EINVAL}, + {fs.SeekCurrent, 0, 0, nil}, + {fs.SeekCurrent, 11, 0, syscall.EINVAL}, + {fs.SeekCurrent, -6, 0, syscall.EINVAL}, + {fs.SeekEnd, 0, 0, syscall.EINVAL}, + {fs.SeekEnd, -1, 0, syscall.EINVAL}, + {fs.SeekEnd, 2, 0, syscall.EINVAL}, + }, + }, + { + nodeType: fs.Symlink, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.EINVAL}, + {fs.SeekSet, -5, 0, syscall.EINVAL}, + {fs.SeekSet, 0, 0, syscall.EINVAL}, + {fs.SeekCurrent, 5, 0, syscall.EINVAL}, + {fs.SeekCurrent, -5, 0, syscall.EINVAL}, + {fs.SeekCurrent, 0, 0, syscall.EINVAL}, + {fs.SeekEnd, 5, 0, syscall.EINVAL}, + {fs.SeekEnd, -5, 0, syscall.EINVAL}, + {fs.SeekEnd, 0, 0, syscall.EINVAL}, + }, + }, + { + nodeType: fs.Pipe, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.ESPIPE}, + {fs.SeekSet, -5, 0, syscall.ESPIPE}, + {fs.SeekSet, 0, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, + {fs.SeekEnd, 5, 0, syscall.ESPIPE}, + {fs.SeekEnd, -5, 0, syscall.ESPIPE}, + {fs.SeekEnd, 0, 0, syscall.ESPIPE}, + }, + }, + { + nodeType: fs.Socket, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.ESPIPE}, + {fs.SeekSet, -5, 0, syscall.ESPIPE}, + {fs.SeekSet, 0, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, + {fs.SeekEnd, 5, 0, syscall.ESPIPE}, + {fs.SeekEnd, -5, 0, 
syscall.ESPIPE}, + {fs.SeekEnd, 0, 0, syscall.ESPIPE}, + }, + }, + { + nodeType: fs.CharacterDevice, + cases: []seekTest{ + {fs.SeekSet, 5, 0, nil}, + {fs.SeekSet, -5, 0, nil}, + {fs.SeekSet, 0, 0, nil}, + {fs.SeekCurrent, 5, 0, nil}, + {fs.SeekCurrent, -5, 0, nil}, + {fs.SeekCurrent, 0, 0, nil}, + {fs.SeekEnd, 5, 0, nil}, + {fs.SeekEnd, -5, 0, nil}, + {fs.SeekEnd, 0, 0, nil}, + }, + }, + { + nodeType: fs.BlockDevice, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 10, nil}, + {fs.SeekSet, -5, 10, syscall.EINVAL}, + {fs.SeekCurrent, -1, 9, nil}, + {fs.SeekCurrent, 2, 11, nil}, + {fs.SeekCurrent, -12, 11, syscall.EINVAL}, + {fs.SeekEnd, -1, 19, nil}, + {fs.SeekEnd, 0, 20, nil}, + {fs.SeekEnd, 2, 22, nil}, + }, + }, + } + + for _, s := range ts { + h, _ := testHandle(t, fs.FileFlags{Read: true, Write: true}, s.nodeType) + defer h.DecRef() + + for _, c := range s.cases { + // Try the given seek. + offset, err := h.Seek(contexttest.Context(t), c.whence, c.offset) + if err != c.err { + t.Errorf("seek(%s, %d) on %s had unexpected error: expected %v, got %v", c.whence, c.offset, s.nodeType, c.err, err) + } + if err == nil && offset != c.result { + t.Errorf("seek(%s, %d) on %s had bad result: expected %v, got %v", c.whence, c.offset, s.nodeType, c.result, offset) + } + } + } +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go new file mode 100644 index 000000000..d0a27fc1c --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -0,0 +1,209 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// HostFileMapper caches mappings of an arbitrary host file descriptor. It is +// used by implementations of memmap.Mappable that represent a host file +// descriptor. +type HostFileMapper struct { + // HostFile conceptually breaks the file into pieces called chunks, of + // size and alignment chunkSize, and caches mappings of the file on a chunk + // granularity. + + refsMu sync.Mutex `state:"nosave"` + + // refs maps chunk start offsets to the sum of reference counts for all + // pages in that chunk. refs is protected by refsMu. + refs map[uint64]int32 + + mapsMu sync.Mutex `state:"nosave"` + + // mappings maps chunk start offsets to mappings of those chunks, + // obtained by calling syscall.Mmap. mappings is protected by + // mapsMu. + mappings map[uint64]mapping `state:"nosave"` +} + +const ( + chunkShift = usermem.HugePageShift + chunkSize = 1 << chunkShift + chunkMask = chunkSize - 1 +) + +func pagesInChunk(mr memmap.MappableRange, chunkStart uint64) int32 { + return int32(mr.Intersect(memmap.MappableRange{chunkStart, chunkStart + chunkSize}).Length() / usermem.PageSize) +} + +type mapping struct { + addr uintptr + writable bool +} + +// NewHostFileMapper returns a HostFileMapper with no references or cached +// mappings. 
+func NewHostFileMapper() *HostFileMapper { + return &HostFileMapper{ + refs: make(map[uint64]int32), + mappings: make(map[uint64]mapping), + } +} + +// IncRefOn increments the reference count on all offsets in mr. +// +// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { + f.refsMu.Lock() + defer f.refsMu.Unlock() + for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + refs := f.refs[chunkStart] + pgs := pagesInChunk(mr, chunkStart) + if refs+pgs < refs { + // Would overflow. + panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) + } + f.refs[chunkStart] = refs + pgs + } +} + +// DecRefOn decrements the reference count on all offsets in mr. +// +// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { + f.refsMu.Lock() + defer f.refsMu.Unlock() + for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + refs := f.refs[chunkStart] + pgs := pagesInChunk(mr, chunkStart) + switch { + case refs > pgs: + f.refs[chunkStart] = refs - pgs + case refs == pgs: + f.mapsMu.Lock() + delete(f.refs, chunkStart) + if m, ok := f.mappings[chunkStart]; ok { + f.unmapAndRemoveLocked(chunkStart, m) + } + f.mapsMu.Unlock() + case refs < pgs: + panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) + } + } +} + +// MapInternal returns a mapping of offsets in fr from fd. The returned +// safemem.BlockSeq is valid as long as at least one reference is held on all +// offsets in fr or until the next call to UnmapAll. +// +// Preconditions: The caller must hold a reference on all offsets in fr. 
+func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) { + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + if chunks == 1 { + // Avoid an unnecessary slice allocation. + var seq safemem.BlockSeq + err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { + seq = safemem.BlockSeqOf(b) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { + blocks = append(blocks, b) + }) + return safemem.BlockSeqFromSlice(blocks), err +} + +// Preconditions: f.mapsMu must be locked. +func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error { + prot := syscall.PROT_READ + if write { + prot |= syscall.PROT_WRITE + } + for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + m, ok := f.mappings[chunkStart] + if !ok { + addr, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + m = mapping{addr, write} + f.mappings[chunkStart] = m + } else if write && !m.writable { + addr, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + m.addr, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED|syscall.MAP_FIXED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + m = mapping{addr, write} + f.mappings[chunkStart] = m + } + var startOff uint64 + if chunkStart < fr.Start { + startOff = fr.Start - chunkStart + } + endOff := uint64(chunkSize) + if chunkStart+chunkSize > fr.End { + endOff = fr.End - chunkStart + } + fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff)) + } + return nil +} + +// UnmapAll unmaps all cached mappings. 
Callers are responsible for +// synchronization with mappings returned by previous calls to MapInternal. +func (f *HostFileMapper) UnmapAll() { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + for chunkStart, m := range f.mappings { + f.unmapAndRemoveLocked(chunkStart, m) + } +} + +// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m. +func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { + if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 { + // This leaks address space and is unexpected, but is otherwise + // harmless, so complain but don't panic. + log.Warningf("HostFileMapper: failed to unmap mapping %#x for chunk %#x: %v", m.addr, chunkStart, errno) + } + delete(f.mappings, chunkStart) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go new file mode 100644 index 000000000..57705decd --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +// afterLoad is invoked by stateify. 
+func (f *HostFileMapper) afterLoad() { + f.mappings = make(map[uint64]mapping) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go new file mode 100644 index 000000000..790f3a5a6 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" +) + +func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block { + // We don't control the host file's length, so touching its mappings may + // raise SIGBUS. Thus accesses to it must use safecopy. + return safemem.BlockFromUnsafePointer((unsafe.Pointer)(addr), chunkSize) +} diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go new file mode 100644 index 000000000..e1ad07df2 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode.go @@ -0,0 +1,380 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// NewSimpleInodeOperations constructs fs.InodeOperations from InodeSimpleAttributes. +func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations { + return &simpleInodeOperations{InodeSimpleAttributes: i} +} + +// simpleInodeOperations is a simple implementation of Inode. +type simpleInodeOperations struct { + DeprecatedFileOperations `state:"nosave"` + InodeNotDirectory `state:"nosave"` + InodeNotSocket `state:"nosave"` + InodeNotRenameable `state:"nosave"` + InodeNotOpenable `state:"nosave"` + InodeNotVirtual `state:"nosave"` + InodeNotSymlink `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + NoMappable `state:"nosave"` + NoopWriteOut `state:"nosave"` + + InodeSimpleAttributes +} + +// InodeSimpleAttributes implements a subset of the Inode interface. It provides +// read-only access to attributes. +type InodeSimpleAttributes struct { + // FSType is the filesystem type reported by StatFS. + FSType uint64 + + // UAttr are the unstable attributes of the Inode. + UAttr fs.UnstableAttr +} + +// Release implements fs.InodeOperations.Release. +func (i *InodeSimpleAttributes) Release(context.Context) {} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { + return fs.Info{Type: i.FSType}, nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. 
+func (i *InodeSimpleAttributes) UnstableAttr(context.Context, *fs.Inode) (fs.UnstableAttr, error) {
+	return i.UAttr, nil
+}
+
+// Check implements fs.InodeOperations.Check.
+func (i *InodeSimpleAttributes) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+	return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
+// AddLink implements fs.InodeOperations.AddLink.
+func (*InodeSimpleAttributes) AddLink() {}
+
+// DropLink implements fs.InodeOperations.DropLink.
+func (*InodeSimpleAttributes) DropLink() {}
+
+// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
+func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) {
+	i.UAttr.StatusChangeTime = ktime.NowFromContext(ctx)
+}
+
+// SetPermissions implements fs.InodeOperations.SetPermissions.
+func (*InodeSimpleAttributes) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool {
+	return false
+}
+
+// SetOwner implements fs.InodeOperations.SetOwner.
+func (*InodeSimpleAttributes) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error {
+	return syserror.EINVAL
+}
+
+// SetTimestamps implements fs.InodeOperations.SetTimestamps.
+func (*InodeSimpleAttributes) SetTimestamps(context.Context, *fs.Inode, fs.TimeSpec) error {
+	return syserror.EINVAL
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error {
+	return syserror.EINVAL
+}
+
+// InMemoryAttributes implements utilities for updating in-memory unstable
+// attributes and extended attributes. It is not thread-safe.
+//
+// Users need not initialize Xattrs to non-nil (it will be initialized
+// when the first extended attribute is set).
+type InMemoryAttributes struct {
+	Unstable fs.UnstableAttr
+	Xattrs   map[string][]byte
+}
+
+// SetPermissions updates the permissions to p.
+func (i *InMemoryAttributes) SetPermissions(ctx context.Context, p fs.FilePermissions) bool { + i.Unstable.Perms = p + i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) + return true +} + +// SetOwner updates the file owner to owner. +func (i *InMemoryAttributes) SetOwner(ctx context.Context, owner fs.FileOwner) error { + if owner.UID.Ok() { + i.Unstable.Owner.UID = owner.UID + } + if owner.GID.Ok() { + i.Unstable.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps sets the timestamps to ts. +func (i *InMemoryAttributes) SetTimestamps(ctx context.Context, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + i.Unstable.AccessTime = now + } else { + i.Unstable.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + i.Unstable.ModificationTime = now + } else { + i.Unstable.ModificationTime = ts.MTime + } + } + i.Unstable.StatusChangeTime = now + return nil +} + +// TouchAccessTime updates access time to the current time. +func (i *InMemoryAttributes) TouchAccessTime(ctx context.Context) { + i.Unstable.AccessTime = ktime.NowFromContext(ctx) +} + +// TouchModificationTime updates modification and status change +// time to the current time. +func (i *InMemoryAttributes) TouchModificationTime(ctx context.Context) { + now := ktime.NowFromContext(ctx) + i.Unstable.ModificationTime = now + i.Unstable.StatusChangeTime = now +} + +// TouchStatusChangeTime updates status change time to the current time. +func (i *InMemoryAttributes) TouchStatusChangeTime(ctx context.Context) { + i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// Getxattr returns the extended attribute at name or ENOATTR if +// it isn't set. 
+func (i *InMemoryAttributes) Getxattr(name string) ([]byte, error) { + if value, ok := i.Xattrs[name]; ok { + return value, nil + } + return nil, syserror.ENOATTR +} + +// Setxattr sets the extended attribute at name to value. +func (i *InMemoryAttributes) Setxattr(name string, value []byte) error { + if i.Xattrs == nil { + i.Xattrs = make(map[string][]byte) + } + i.Xattrs[name] = value + return nil +} + +// Listxattr returns the set of all currently set extended attributes. +func (i *InMemoryAttributes) Listxattr() (map[string]struct{}, error) { + names := make(map[string]struct{}, len(i.Xattrs)) + for name := range i.Xattrs { + names[name] = struct{}{} + } + return names, nil +} + +// NoMappable returns a nil memmap.Mappable. +type NoMappable struct{} + +// Mappable implements fs.InodeOperations.Mappable. +func (NoMappable) Mappable(*fs.Inode) memmap.Mappable { + return nil +} + +// NoopWriteOut is a no-op implementation of Inode.WriteOut. +type NoopWriteOut struct{} + +// WriteOut is a no-op. +func (NoopWriteOut) WriteOut(context.Context, *fs.Inode) error { + return nil +} + +// InodeNotDirectory can be used by Inodes that are not directories. +type InodeNotDirectory struct{} + +// Lookup implements fs.InodeOperations.Lookup. +func (InodeNotDirectory) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { + return nil, syserror.ENOTDIR +} + +// Create implements fs.InodeOperations.Create. +func (InodeNotDirectory) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { + return nil, syserror.ENOTDIR +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (InodeNotDirectory) CreateLink(context.Context, *fs.Inode, string, string) error { + return syserror.ENOTDIR +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. 
+func (InodeNotDirectory) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// Bind implements fs.InodeOperations.Bind. +func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (InodeNotDirectory) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// Remove implements fs.InodeOperations.Remove. +func (InodeNotDirectory) Remove(context.Context, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// InodeNotSocket can be used by Inodes that are not sockets. +type InodeNotSocket struct{} + +// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. +func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) unix.BoundEndpoint { + return nil +} + +// InodeNotRenameable can be used by Inodes that cannot be renamed. +type InodeNotRenameable struct{} + +// Rename implements fs.InodeOperations.Rename. +func (InodeNotRenameable) Rename(context.Context, *fs.Inode, string, *fs.Inode, string) error { + return syserror.EINVAL +} + +// InodeNotOpenable can be used by Inodes that cannot be opened. +type InodeNotOpenable struct{} + +// GetFile implements fs.InodeOperations.GetFile. +func (InodeNotOpenable) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) { + return nil, syserror.EIO +} + +// InodeNotVirtual can be used by Inodes that are not virtual. 
+type InodeNotVirtual struct{} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (InodeNotVirtual) IsVirtual() bool { + return false +} + +// InodeNotSymlink can be used by Inodes that are not symlinks. +type InodeNotSymlink struct{} + +// Readlink implements fs.InodeOperations.Readlink. +func (InodeNotSymlink) Readlink(context.Context, *fs.Inode) (string, error) { + return "", syserror.ENOLINK +} + +// Getlink implements fs.InodeOperations.Getlink. +func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + return nil, syserror.ENOLINK +} + +// InodeNoExtendedAttributes can be used by Inodes that do not support +// extended attributes. +type InodeNoExtendedAttributes struct{} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) ([]byte, error) { + return nil, syserror.EOPNOTSUPP +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, []byte) error { + return syserror.EOPNOTSUPP +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, error) { + return nil, syserror.EOPNOTSUPP +} + +// DeprecatedFileOperations panics if any deprecated Inode method is called. +type DeprecatedFileOperations struct{} + +// Readiness implements fs.InodeOperations.Waitable.Readiness. +func (DeprecatedFileOperations) Readiness(waiter.EventMask) waiter.EventMask { + panic("not implemented") +} + +// EventRegister implements fs.InodeOperations.Waitable.EventRegister. +func (DeprecatedFileOperations) EventRegister(*waiter.Entry, waiter.EventMask) { + panic("not implemented") +} + +// EventUnregister implements fs.InodeOperations.Waitable.EventUnregister. +func (DeprecatedFileOperations) EventUnregister(*waiter.Entry) { + panic("not implemented") +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. 
+func (DeprecatedFileOperations) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { + panic("not implemented") +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (DeprecatedFileOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + panic("not implemented") +} + +// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. +func (DeprecatedFileOperations) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) { + panic("not implemented") +} + +// DeprecatedFsync implements fs.InodeOperations.DeprecatedFsync. +func (DeprecatedFileOperations) DeprecatedFsync() error { + panic("not implemented") +} + +// DeprecatedFlush implements fs.InodeOperations.DeprecatedFlush. +func (DeprecatedFileOperations) DeprecatedFlush() error { + panic("not implemented") +} + +// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. +func (DeprecatedFileOperations) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { + panic("not implemented") +} diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go new file mode 100644 index 000000000..484668735 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -0,0 +1,845 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fsutil + +import ( + "fmt" + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Lock order (compare the lock order model in mm/mm.go): +// +// CachingInodeOperations.attrMu ("fs locks") +// CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate") +// CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate") +// CachedFileObject locks + +// CachingInodeOperations caches the metadata and content of a CachedFileObject. +// It implements a subset of InodeOperations. As a utility it can be used to +// implement the full set of InodeOperations. Generally it should not be +// embedded to avoid unexpected inherited behavior. +// +// CachingInodeOperations implements Mappable for the CachedFileObject: +// +// - If CachedFileObject.FD returns a value >= 0 and the current platform shares +// a host fd table with the sentry, then the value of CachedFileObject.FD +// will be memory mapped on the host. +// +// - Otherwise, the contents of CachedFileObject are buffered into memory +// managed by the CachingInodeOperations. +// +// Implementations of FileOperations for a CachedFileObject must read and +// write through CachingInodeOperations using Read and Write respectively. +// +// Implementations of InodeOperations.WriteOut must call Sync to write out +// in-memory modifications of data and metadata to the CachedFileObject. +type CachingInodeOperations struct { + // backingFile is a handle to a cached file object. 
+ backingFile CachedFileObject + + // platform is used to allocate memory that caches backingFile's contents. + platform platform.Platform + + // forcePageCache indicates the sentry page cache should be used regardless + // of whether the platform supports host mapped I/O or not. This must not be + // modified after inode creation. + forcePageCache bool + + attrMu sync.Mutex `state:"nosave"` + + // attr is unstable cached metadata. + // + // attr is protected by attrMu. attr.Size is protected by both attrMu and + // dataMu; reading it requires locking either mutex, while mutating it + // requires locking both. + attr fs.UnstableAttr + + // dirtyAttr is metadata that was updated in-place but hasn't yet + // been successfully written out. + // + // dirtyAttr is protected by attrMu. + dirtyAttr fs.AttrMask + + mapsMu sync.Mutex `state:"nosave"` + + // mappings tracks mappings of the cached file object into + // memmap.MappingSpaces. + // + // mappings is protected by mapsMu. + mappings memmap.MappingSet + + dataMu sync.RWMutex `state:"nosave"` + + // cache maps offsets into the cached file to offsets into + // platform.Memory() that store the file's data. + // + // cache is protected by dataMu. + cache FileRangeSet + + // dirty tracks dirty segments in cache. + // + // dirty is protected by dataMu. + dirty DirtySet + + // hostFileMapper caches internal mappings of backingFile.FD(). + hostFileMapper *HostFileMapper + + // refs tracks active references to data in the cache. + // + // refs is protected by dataMu. + refs frameRefSet +} + +// CachedFileObject is a file that may require caching. +type CachedFileObject interface { + // ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts, + // starting at offset, and returns the number of bytes read. ReadToBlocksAt + // may return a partial read without an error. 
+ ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) + + // WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the + // file, starting at offset, and returns the number of bytes written. + // WriteFromBlocksAt may return a partial write without an error. + WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) + + // SetMaskedAttributes sets the attributes in attr that are true in mask + // on the backing file. + // + // SetMaskedAttributes may be called at any point, regardless of whether + // the file was opened. + SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error + + // Sync instructs the remote filesystem to sync the file to stable storage. + Sync(ctx context.Context) error + + // FD returns a host file descriptor. Return value must be -1 or not -1 + // for the lifetime of the CachedFileObject. + // + // FD is called iff the file has been memory mapped. This implies that + // the file was opened (see fs.InodeOperations.GetFile). + // + // FIXME: This interface seems to be + // fundamentally broken. We should clarify CachingInodeOperation's + // behavior with metadata. + FD() int +} + +// NewCachingInodeOperations returns a new CachingInodeOperations backed by +// a CachedFileObject and its initial unstable attributes. +func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations { + p := platform.FromContext(ctx) + if p == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + } + return &CachingInodeOperations{ + backingFile: backingFile, + platform: p, + forcePageCache: forcePageCache, + attr: uattr, + hostFileMapper: NewHostFileMapper(), + } +} + +// Release implements fs.InodeOperations.Release. 
+func (c *CachingInodeOperations) Release() { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + // The cache should be empty (something has gone terribly wrong if we're + // releasing an inode that is still memory-mapped). + if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() { + panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty)) + } +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + c.attrMu.Lock() + defer c.attrMu.Unlock() + return c.attr, nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool { + c.attrMu.Lock() + defer c.attrMu.Unlock() + + masked := fs.AttrMask{Perms: true} + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil { + return false + } + c.attr.Perms = perms + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.Perms = true + c.touchStatusChangeTimeLocked(ctx) + return true + +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + if !owner.UID.Ok() && !owner.GID.Ok() { + return nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + + masked := fs.AttrMask{ + UID: owner.UID.Ok(), + GID: owner.GID.Ok(), + } + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}); err != nil { + return err + } + if owner.UID.Ok() { + c.attr.Owner.UID = owner.UID + // FIXME: Clarify CachingInodeOperations behavior with metadata. 
+ c.dirtyAttr.UID = true + } + if owner.GID.Ok() { + c.attr.Owner.GID = owner.GID + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.GID = true + } + c.touchStatusChangeTimeLocked(ctx) + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + + // Replace requests to use the "system time" with the current time to + // ensure that cached timestamps remain consistent with the remote + // filesystem. + now := ktime.NowFromContext(ctx) + if ts.ATimeSetSystemTime { + ts.ATime = now + } + if ts.MTimeSetSystemTime { + ts.MTime = now + } + masked := fs.AttrMask{ + AccessTime: !ts.ATimeOmit, + ModificationTime: !ts.MTimeOmit, + } + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}); err != nil { + return err + } + if !ts.ATimeOmit { + c.attr.AccessTime = ts.ATime + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.AccessTime = true + } + if !ts.MTimeOmit { + c.attr.ModificationTime = ts.MTime + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.ModificationTime = true + } + c.touchStatusChangeTimeLocked(ctx) + return nil +} + +// Truncate implements fs.InodeOperations.Truncate. +func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + c.attrMu.Lock() + defer c.attrMu.Unlock() + + // c.attr.Size is protected by both c.attrMu and c.dataMu. 
+ c.dataMu.Lock() + if err := c.backingFile.SetMaskedAttributes(ctx, fs.AttrMask{ + Size: true, + }, fs.UnstableAttr{ + Size: size, + }); err != nil { + c.dataMu.Unlock() + return err + } + oldSize := c.attr.Size + if oldSize != size { + c.attr.Size = size + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.Size = true + c.touchModificationTimeLocked(ctx) + } + // We drop c.dataMu here so that we can lock c.mapsMu and invalidate + // mappings below. This allows concurrent calls to Read/Translate/etc. + // These functions synchronize with an in-progress Truncate by refusing to + // use cache contents beyond the new c.attr.Size. (We are still holding + // c.attrMu, so we can't race with Truncate/Write.) + c.dataMu.Unlock() + + // Nothing left to do unless shrinking the file. + if size >= oldSize { + return nil + } + + oldpgend := fs.OffsetPageEnd(oldSize) + newpgend := fs.OffsetPageEnd(size) + + // Invalidate past translations of truncated pages. + if newpgend != oldpgend { + c.mapsMu.Lock() + c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + c.mapsMu.Unlock() + } + + // We are now guaranteed that there are no translations of truncated pages, + // and can remove them from the cache. Since truncated pages have been + // removed from the backing file, they should be dropped without being + // written back. + c.dataMu.Lock() + defer c.dataMu.Unlock() + c.cache.Truncate(uint64(size), c.platform.Memory()) + c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend}) + + return nil +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + c.attrMu.Lock() + + // Write dirty pages back. 
+ c.dataMu.RLock()
+ err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt)
+ c.dataMu.RUnlock()
+ if err != nil {
+ c.attrMu.Unlock()
+ return err
+ }
+
+ // Write out cached attributes.
+ if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil {
+ c.attrMu.Unlock()
+ return err
+ }
+ c.dirtyAttr = fs.AttrMask{}
+
+ c.attrMu.Unlock()
+
+ // Fsync the remote file.
+ return c.backingFile.Sync(ctx)
+}
+
+// IncLinks increases the link count and updates cached modification and status change times.
+func (c *CachingInodeOperations) IncLinks(ctx context.Context) {
+ c.attrMu.Lock()
+ c.attr.Links++
+ c.touchModificationTimeLocked(ctx)
+ c.attrMu.Unlock()
+}
+
+// DecLinks decreases the link count and updates cached modification and status change times.
+func (c *CachingInodeOperations) DecLinks(ctx context.Context) {
+ c.attrMu.Lock()
+ c.attr.Links--
+ c.touchModificationTimeLocked(ctx)
+ c.attrMu.Unlock()
+}
+
+// TouchAccessTime updates the cached access time in-place to the
+// current time. It does not update status change time in-place. See
+// mm/filemap.c:do_generic_file_read -> include/linux/fs.h:file_accessed.
+func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) {
+ if inode.MountSource.Flags.NoAtime {
+ return
+ }
+
+ c.attrMu.Lock()
+ c.touchAccessTimeLocked(ctx)
+ c.attrMu.Unlock()
+}
+
+// touchAccessTimeLocked updates the cached access time in-place to the current
+// time.
+//
+// Preconditions: c.attrMu is locked for writing.
+func (c *CachingInodeOperations) touchAccessTimeLocked(ctx context.Context) {
+ c.attr.AccessTime = ktime.NowFromContext(ctx)
+ c.dirtyAttr.AccessTime = true
+}
+
+// TouchModificationTime updates the cached modification and status change time
+// in-place to the current time.
+func (c *CachingInodeOperations) TouchModificationTime(ctx context.Context) { + c.attrMu.Lock() + c.touchModificationTimeLocked(ctx) + c.attrMu.Unlock() +} + +// touchModificationTimeLocked updates the cached modification and status +// change time in-place to the current time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchModificationTimeLocked(ctx context.Context) { + now := ktime.NowFromContext(ctx) + c.attr.ModificationTime = now + c.dirtyAttr.ModificationTime = true + c.attr.StatusChangeTime = now + c.dirtyAttr.StatusChangeTime = true +} + +// touchStatusChangeTimeLocked updates the cached status change time +// in-place to the current time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchStatusChangeTimeLocked(ctx context.Context) { + now := ktime.NowFromContext(ctx) + c.attr.StatusChangeTime = now + c.dirtyAttr.StatusChangeTime = true +} + +// Read reads from frames and otherwise directly from the backing file +// into dst starting at offset until dst is full, EOF is reached, or an +// error is encountered. +// +// Read may partially fill dst and return a nil error. +func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + + // Have we reached EOF? We check for this again in + // inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would + // serialize reads) or c.dataMu (which would violate lock ordering), but + // check here first (before calling into MM) since reading at EOF is + // common: getting a return value of 0 from a read syscall is the only way + // to detect EOF. + // + // TODO: Separate out c.attr.Size and use atomics instead of + // c.dataMu. 
+ c.dataMu.RLock() + size := c.attr.Size + c.dataMu.RUnlock() + if offset >= size { + return 0, io.EOF + } + + n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset}) + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + c.TouchAccessTime(ctx, file.Dirent.Inode) + return n, err +} + +// Write writes to frames and otherwise directly to the backing file +// from src starting at offset and until src is empty or an error is +// encountered. +// +// If Write partially fills src, a non-nil error is returned. +func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). + c.touchModificationTimeLocked(ctx) + return src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) +} + +type inodeReadWriter struct { + ctx context.Context + c *CachingInodeOperations + offset int64 +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + rw.c.dataMu.RLock() + defer rw.c.dataMu.RUnlock() + + // Compute the range to read. + if rw.offset >= rw.c.attr.Size { + return 0, io.EOF + } + end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size) + if end == rw.offset { // dsts.NumBytes() == 0? + return 0, nil + } + + mem := rw.c.platform.Memory() + var done uint64 + seg, gap := rw.c.cache.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + return done, err + } + + // Copy from internal mappings. 
+ n, err := safemem.CopySeq(dsts, ims) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Read directly from the backing file. + gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapmr.Start) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + // Partial reads are fine. But we must stop reading. + if n != dst.NumBytes() || err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: rw.c.attrMu must be locked. +func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + rw.c.dataMu.Lock() + defer rw.c.dataMu.Unlock() + + // Compute the range to write. + end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) + if end == rw.offset { // srcs.NumBytes() == 0? + return 0, nil + } + + defer func() { + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.offset > rw.c.attr.Size { + rw.c.attr.Size = rw.offset + rw.c.dirtyAttr.Size = true + } + if rw.offset > rw.c.attr.Usage { + // This is incorrect if CachingInodeOperations is caching a sparse + // file. (In Linux, keeping inode::i_blocks up to date is the + // filesystem's responsibility.) + rw.c.attr.Usage = rw.offset + rw.c.dirtyAttr.Usage = true + } + }() + + mem := rw.c.platform.Memory() + var done uint64 + seg, gap := rw.c.cache.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok() && seg.Start() < mr.End: + // Get internal mappings from the cache. 
+ segMR := seg.Range().Intersect(mr) + ims, err := mem.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + if err != nil { + return done, err + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + rw.c.dirty.MarkDirty(segMR) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok() && gap.Start() < mr.End: + // Write directly to the backing file. + gapmr := gap.Range().Intersect(mr) + src := srcs.TakeFirst64(gapmr.Length()) + n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + // Partial writes are fine. But we must stop writing. + if n != src.NumBytes() || err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + mapped := c.mappings.AddMapping(ms, ar, offset) + // Do this unconditionally since whether we have c.backingFile.FD() >= 0 + // can change across save/restore. + for _, r := range mapped { + c.hostFileMapper.IncRefOn(r) + } + if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 { + for _, r := range mapped { + usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) + } + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. 
+func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + unmapped := c.mappings.RemoveMapping(ms, ar, offset) + for _, r := range unmapped { + c.hostFileMapper.DecRefOn(r) + } + if !c.forcePageCache && c.backingFile.FD() >= 0 { + if !usage.IncrementalMappedAccounting { + for _, r := range unmapped { + usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) + } + } + return + } + + // Writeback dirty mapped memory now that there are no longer any + // mappings that reference it. This is our naive memory eviction + // strategy. + mem := c.platform.Memory() + c.dataMu.Lock() + defer c.dataMu.Unlock() + for _, r := range unmapped { + if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", r, err) + } + c.cache.Drop(r, mem) + c.dirty.KeepClean(r) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + return c.AddMapping(ctx, ms, dstAR, offset) +} + +// Translate implements memmap.Mappable.Translate. +func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + if !c.forcePageCache && c.backingFile.FD() >= 0 { + return []memmap.Translation{ + { + Source: optional, + File: c, + Offset: optional.Start, + }, + }, nil + } + + c.dataMu.Lock() + defer c.dataMu.Unlock() + + // Constrain translations to c.attr.Size (rounded up) to prevent + // translation to pages that may be concurrently truncated. 
+ pgend := fs.OffsetPageEnd(c.attr.Size) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mem := c.platform.Memory() + cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mem, usage.PageCache, c.backingFile.ReadToBlocksAt) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mem, + Offset: seg.FileRangeOf(segMR).Start, + }) + if at.Write { + // From this point forward, this memory can be dirtied through the + // mapping at any time. + c.dirty.KeepDirty(segMR) + } + translatedEnd = segMR.End + } + + // Don't return the error returned by c.cache.Fill if it occurred outside + // of required. + if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { + const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily + if required.Length() >= maxReadahead { + return required + } + if optional.Length() <= maxReadahead { + return optional + } + optional.Start = required.Start + if optional.Length() <= maxReadahead { + return optional + } + optional.End = optional.Start + maxReadahead + return optional +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error { + // Whether we have a host fd (and consequently what platform.File is + // mapped) can change across save/restore, so invalidate all translations + // unconditionally. 
+ c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Sync the cache's contents so that if we have a host fd after restore, + // the remote file's contents are coherent. + c.dataMu.Lock() + defer c.dataMu.Unlock() + if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + return err + } + + // Discard the cache so that it's not stored in saved state. This is safe + // because per InvalidateUnsavable invariants, no new translations can have + // been returned after we invalidated all existing translations above. + c.cache.DropAll(c.platform.Memory()) + c.dirty.RemoveAll() + + return nil +} + +// MapInto implements platform.File.MapInto. This is used when we directly map +// an underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + return as.MapFile(addr, c.backingFile.FD(), fr, at, precommit) +} + +// MapInternal implements platform.File.MapInternal. This is used when we +// directly map an underlying host fd and CachingInodeOperations is used as the +// platform.File during translation. +func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) +} + +// IncRef implements platform.File.IncRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. 
+func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { + c.dataMu.Lock() + defer c.dataMu.Unlock() + + seg, gap := c.refs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = c.refs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + if usage.IncrementalMappedAccounting { + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + } + seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + c.refs.MergeAdjacent(fr) + return + } + } +} + +// DecRef implements platform.File.DecRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { + c.dataMu.Lock() + defer c.dataMu.Unlock() + + seg := c.refs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = c.refs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + if usage.IncrementalMappedAccounting { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + } + seg = c.refs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + c.refs.MergeAdjacent(fr) +} diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go new file mode 100644 index 000000000..996c91849 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -0,0 +1,403 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "bytes" + "io" + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type noopBackingFile struct{} + +func (noopBackingFile) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + return dsts.NumBytes(), nil +} + +func (noopBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + return srcs.NumBytes(), nil +} + +func (noopBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error { + return nil +} + +func (noopBackingFile) Sync(context.Context) error { + return nil +} + +func (noopBackingFile) FD() int { + return -1 +} + +func TestSetPermissions(t *testing.T) { + ctx := contexttest.Context(t) + + uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Perms: fs.FilePermsFromMode(0444), + }) + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + perms := fs.FilePermsFromMode(0777) + if !iops.SetPermissions(ctx, nil, perms) { + t.Fatalf("SetPermissions failed, want success") + } + + // Did permissions change? 
+ if !iops.dirtyAttr.Perms { + t.Fatalf("got perms not dirty, want dirty") + } + if iops.attr.Perms != perms { + t.Fatalf("got perms +%v, want +%v", iops.attr.Perms, perms) + } + + // Did status change time change? + if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("got status change time not dirty, want dirty") + } + if iops.attr.StatusChangeTime.Equal(uattr.StatusChangeTime) { + t.Fatalf("got status change time unchanged") + } +} + +func TestSetTimestamps(t *testing.T) { + ctx := contexttest.Context(t) + for _, test := range []struct { + desc string + ts fs.TimeSpec + wantDirty fs.AttrMask + }{ + { + desc: "noop", + ts: fs.TimeSpec{ + ATimeOmit: true, + MTimeOmit: true, + }, + wantDirty: fs.AttrMask{}, + }, + { + desc: "access time only", + ts: fs.TimeSpec{ + ATime: ktime.NowFromContext(ctx), + MTimeOmit: true, + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "modification time only", + ts: fs.TimeSpec{ + ATimeOmit: true, + MTime: ktime.NowFromContext(ctx), + }, + wantDirty: fs.AttrMask{ + ModificationTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "access and modification time", + ts: fs.TimeSpec{ + ATime: ktime.NowFromContext(ctx), + MTime: ktime.NowFromContext(ctx), + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + ModificationTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "system time access and modification time", + ts: fs.TimeSpec{ + ATimeSetSystemTime: true, + MTimeSetSystemTime: true, + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + ModificationTime: true, + StatusChangeTime: true, + }, + }, + } { + t.Run(test.desc, func(t *testing.T) { + ctx := contexttest.Context(t) + + epoch := ktime.ZeroTime + uattr := fs.UnstableAttr{ + AccessTime: epoch, + ModificationTime: epoch, + StatusChangeTime: epoch, + } + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + if err := iops.SetTimestamps(ctx, nil, test.ts); err != 
nil { + t.Fatalf("SetTimestamps got error %v, want nil", err) + } + if !reflect.DeepEqual(iops.dirtyAttr, test.wantDirty) { + t.Fatalf("dirty got %+v, want %+v", iops.dirtyAttr, test.wantDirty) + } + if iops.dirtyAttr.AccessTime { + if !iops.attr.AccessTime.After(uattr.AccessTime) { + t.Fatalf("diritied access time did not advance, want %v > %v", iops.attr.AccessTime, uattr.AccessTime) + } + if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("dirty access time requires dirty status change time") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not advance") + } + } + if iops.dirtyAttr.ModificationTime { + if !iops.attr.ModificationTime.After(uattr.ModificationTime) { + t.Fatalf("diritied modification time did not advance") + } + if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("dirty modification time requires dirty status change time") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not advance") + } + } + }) + } +} + +func TestTruncate(t *testing.T) { + ctx := contexttest.Context(t) + + uattr := fs.UnstableAttr{ + Size: 0, + } + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + if err := iops.Truncate(ctx, nil, uattr.Size); err != nil { + t.Fatalf("Truncate got error %v, want nil", err) + } + if iops.dirtyAttr.Size { + t.Fatalf("Truncate caused size to be dirtied") + } + var size int64 = 4096 + if err := iops.Truncate(ctx, nil, size); err != nil { + t.Fatalf("Truncate got error %v, want nil", err) + } + if !iops.dirtyAttr.Size { + t.Fatalf("Truncate caused size to not be dirtied") + } + if iops.attr.Size != size { + t.Fatalf("Truncate got %d, want %d", iops.attr.Size, size) + } + if !iops.dirtyAttr.ModificationTime || !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("Truncate did not dirty modification and status change time") + } + if 
!iops.attr.ModificationTime.After(uattr.ModificationTime) { + t.Fatalf("dirtied modification time did not change") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not change") + } +} + +type sliceBackingFile struct { + data []byte +} + +func newSliceBackingFile(data []byte) *sliceBackingFile { + return &sliceBackingFile{data} +} + +func (f *sliceBackingFile) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + r := safemem.BlockSeqReader{safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data)).DropFirst64(offset)} + return r.ReadToBlocks(dsts) +} + +func (f *sliceBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + w := safemem.BlockSeqWriter{safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data)).DropFirst64(offset)} + return w.WriteFromBlocks(srcs) +} + +func (*sliceBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error { + return nil +} + +func (*sliceBackingFile) Sync(context.Context) error { + return nil +} + +func (*sliceBackingFile) FD() int { + return -1 +} + +type noopMappingSpace struct{} + +// Invalidate implements memmap.MappingSpace.Invalidate. +func (noopMappingSpace) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) { +} + +func anonInode(ctx context.Context) *fs.Inode { + return fs.NewInode(NewSimpleInodeOperations(InodeSimpleAttributes{ + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.Anonymous, + BlockSize: usermem.PageSize, + }) +} + +func pagesOf(bs ...byte) []byte { + buf := make([]byte, 0, len(bs)*usermem.PageSize) + for _, b := range bs { + buf = append(buf, bytes.Repeat([]byte{b}, usermem.PageSize)...) 
+ } + return buf +} + +func TestRead(t *testing.T) { + ctx := contexttest.Context(t) + + // Construct a 3-page file. + buf := pagesOf('a', 'b', 'c') + file := fs.NewFile(ctx, fs.NewDirent(anonInode(ctx), "anon"), fs.FileFlags{}, nil) + uattr := fs.UnstableAttr{ + Size: int64(len(buf)), + } + iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/) + defer iops.Release() + + // Expect the cache to be initially empty. + if cached := iops.cache.Span(); cached != 0 { + t.Errorf("Span got %d, want 0", cached) + } + + // Create a memory mapping of the second page (as CachingInodeOperations + // expects to only cache mapped pages), then call Translate to force it to + // be cached. + var ms noopMappingSpace + ar := usermem.AddrRange{usermem.PageSize, 2 * usermem.PageSize} + if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil { + t.Fatalf("AddMapping got %v, want nil", err) + } + mr := memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize} + if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil { + t.Fatalf("Translate got %v, want nil", err) + } + if cached := iops.cache.Span(); cached != usermem.PageSize { + t.Errorf("SpanRange got %d, want %d", cached, usermem.PageSize) + } + + // Try to read 4 pages. The first and third pages should be read directly + // from the "file", the second page should be read from the cache, and only + // 3 pages (the size of the file) should be readable. + rbuf := make([]byte, 4*usermem.PageSize) + dst := usermem.BytesIOSequence(rbuf) + n, err := iops.Read(ctx, file, dst, 0) + if n != 3*usermem.PageSize || (err != nil && err != io.EOF) { + t.Fatalf("Read got (%d, %v), want (%d, nil or EOF)", n, err, 3*usermem.PageSize) + } + rbuf = rbuf[:3*usermem.PageSize] + + // Did we get the bytes we expect? 
+ if !bytes.Equal(rbuf, buf) { + t.Errorf("Read back bytes %v, want %v", rbuf, buf) + } + + // Delete the memory mapping and expect it to cause the cached page to be + // uncached. + iops.RemoveMapping(ctx, ms, ar, usermem.PageSize) + if cached := iops.cache.Span(); cached != 0 { + t.Fatalf("Span got %d, want 0", cached) + } +} + +func TestWrite(t *testing.T) { + ctx := contexttest.Context(t) + + // Construct a 4-page file. + buf := pagesOf('a', 'b', 'c', 'd') + orig := append([]byte(nil), buf...) + inode := anonInode(ctx) + uattr := fs.UnstableAttr{ + Size: int64(len(buf)), + } + iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/) + defer iops.Release() + + // Expect the cache to be initially empty. + if cached := iops.cache.Span(); cached != 0 { + t.Errorf("Span got %d, want 0", cached) + } + + // Create a memory mapping of the second and third pages (as + // CachingInodeOperations expects to only cache mapped pages), then call + // Translate to force them to be cached. + var ms noopMappingSpace + ar := usermem.AddrRange{usermem.PageSize, 3 * usermem.PageSize} + if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil { + t.Fatalf("AddMapping got %v, want nil", err) + } + defer iops.RemoveMapping(ctx, ms, ar, usermem.PageSize) + mr := memmap.MappableRange{usermem.PageSize, 3 * usermem.PageSize} + if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil { + t.Fatalf("Translate got %v, want nil", err) + } + if cached := iops.cache.Span(); cached != 2*usermem.PageSize { + t.Errorf("SpanRange got %d, want %d", cached, 2*usermem.PageSize) + } + + // Write to the first 2 pages. + wbuf := pagesOf('e', 'f') + src := usermem.BytesIOSequence(wbuf) + n, err := iops.Write(ctx, src, 0) + if n != 2*usermem.PageSize || err != nil { + t.Fatalf("Write got (%d, %v), want (%d, nil)", n, err, 2*usermem.PageSize) + } + + // The first page should have been written directly, since it was not cached. 
+	want := append([]byte(nil), orig...)
+	copy(want, pagesOf('e'))
+	if !bytes.Equal(buf, want) {
+		t.Errorf("File contents are %v, want %v", buf, want)
+	}
+
+	// Sync back to the "backing file".
+	if err := iops.WriteOut(ctx, inode); err != nil {
+		t.Errorf("Sync got %v, want nil", err)
+	}
+
+	// Now the second page should have been written as well.
+	copy(want[usermem.PageSize:], pagesOf('f'))
+	if !bytes.Equal(buf, want) {
+		t.Errorf("File contents are %v, want %v", buf, want)
+	}
+}
diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md
new file mode 100644
index 000000000..1e99a3357
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/inotify.md
@@ -0,0 +1,122 @@
+# Inotify
+
+Inotify implements the like-named filesystem event notification system for the
+sentry, see `inotify(7)`.
+
+## Architecture
+
+For the most part, the sentry implementation of inotify mirrors the Linux
+architecture. Inotify instances (i.e. the fd returned by inotify_init(2)) are
+backed by a pseudo-filesystem. Events are generated from various places in the
+sentry, including the [syscall layer][syscall_dir], the [vfs layer][dirent] and
+the [process fd table][fd_map]. Watches are stored in inodes and generated
+events are queued to the inotify instance owning the watches for delivery to the
+user.
+
+## Objects
+
+Here is a brief description of the existing and new objects involved in the
+sentry inotify mechanism, and how they interact:
+
+### [`fs.Inotify`][inotify]
+
+- An inotify instance, created by inotify_init(2)/inotify_init1(2).
+- The inotify fd has a `fs.Dirent`, supports filesystem syscalls to read
+  events.
+- Has multiple `fs.Watch`es, with at most one watch per target inode, per
+  inotify instance.
+- Has an instance `id` which is globally unique. This is *not* the fd number
+  for this instance, since the fd can be duped. This `id` is not externally
+  visible.
+
+### [`fs.Watch`][watch]
+
+- An inotify watch, created/deleted by
+  inotify_add_watch(2)/inotify_rm_watch(2).
+- Owned by an `fs.Inotify` instance, each watch keeps a pointer to the
+  `owner`.
+- Associated with a single `fs.Inode`, which is the watch `target`. While the
+  watch is active, it indirectly pins `target` to memory. See the "Reference
+  Model" section for a detailed explanation.
+- Filesystem operations on `target` generate `fs.Event`s.
+
+### [`fs.Event`][event]
+
+- A simple struct encapsulating all the fields for an inotify event.
+- Generated by `fs.Watch`es and forwarded to the watches' `owner`s.
+- Serialized to the user during read(2) syscalls on the associated
+  `fs.Inotify`'s fd.
+
+### [`fs.Dirent`][dirent]
+
+- Many inotify events are generated inside dirent methods. Events are
+  generated in the dirent methods rather than `fs.Inode` methods because some
+  events carry the name of the subject node, and node names are generally
+  unavailable in an `fs.Inode`.
+- Dirents do not directly contain state for any watches. Instead, they forward
+  notifications to the underlying `fs.Inode`.
+
+### [`fs.Inode`][inode]
+
+- Interacts with inotify through `fs.Watch`es.
+- Inodes contain a map of all active `fs.Watch`es on them.
+- An `fs.Inotify` instance can have at most one `fs.Watch` per inode.
+  `fs.Watch`es on an inode are indexed by their `owner`'s `id`.
+- All inotify logic is encapsulated in the [`Watches`][inode_watches] struct
+  in an inode. Logically, `Watches` is the set of inotify watches on the
+  inode.
+
+## Reference Model
+
+The sentry inotify implementation has a complex reference model. An inotify
+watch observes a single inode. For efficient lookup, the state for a watch is
+stored directly on the target inode. This state needs to be persistent for the
+lifetime of the watch. Unlike usual filesystem metadata, the watch state has no
+"on-disk" representation, so they cannot be reconstructed by the filesystem if
+the inode is flushed from memory. This effectively means we need to keep any
+inodes with active watches pinned to memory.
+
+We can't just hold an extra ref on the inode to pin it to memory because some
+filesystems (such as gofer-based filesystems) don't have persistent inodes. In
+such a filesystem, if we just pin the inode, nothing prevents the enclosing
+dirent from being GCed. Once the dirent is GCed, the pinned inode is
+unreachable -- these filesystems generate a new inode by re-reading the node
+state on the next walk. Incidentally, hardlinks also don't work on these
+filesystems for this reason.
+
+To prevent the above scenario, when a new watch is added on an inode, we *pin*
+the dirent we used to reach the inode. Note that due to hardlinks, this dirent
+may not be the only dirent pointing to the inode. Attempting to set an inotify
+watch via multiple hardlinks to the same file results in the same watch being
+returned for both links. However, for each new dirent we use to reach the same
+inode, we add a new pin. We need a new pin for each new dirent used to reach the
+inode because we have no guarantees about the deletion order of the different
+links to the inode.
+
+## Lock Ordering
+
+There are 4 locks related to the inotify implementation:
+
+- `Inotify.mu`: the inotify instance lock.
+- `Inotify.evMu`: the inotify event queue lock.
+- `Watch.mu`: the watch lock, used to protect pins.
+- `fs.Watches.mu`: the inode watch set mu, used to protect the collection of
+  watches on the inode.
+
+The correct lock ordering for inotify code is:
+
+`Inotify.mu` -> `fs.Watches.mu` -> `Watch.mu` -> `Inotify.evMu`.
+
+We need a distinct lock for the event queue because by the time a goroutine
+attempts to queue a new event, it is already holding `fs.Watches.mu`. 
If we used +`Inotify.mu` to also protect the event queue, this would violate the above lock +ordering. + +[dirent]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/dirent.go +[event]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inotify_event.go +[fd_map]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/kernel/fd_map.go +[inode]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inode.go +[inode_watches]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inode_inotify.go +[inotify]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inotify.go +[syscall_dir]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/syscalls/linux/ +[watch]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inotify_watch.go diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD new file mode 100644 index 000000000..ca42b0a54 --- /dev/null +++ b/pkg/sentry/fs/gofer/BUILD @@ -0,0 +1,90 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "gofer_state", + srcs = [ + "file.go", + "file_state.go", + "fs.go", + "inode.go", + "inode_state.go", + "session.go", + "session_state.go", + ], + out = "gofer_state.go", + package = "gofer", +) + +go_library( + name = "gofer", + srcs = [ + "attr.go", + "context_file.go", + "device.go", + "file.go", + "file_state.go", + "fs.go", + "gofer_state.go", + "handles.go", + "inode.go", + "inode_state.go", + "path.go", + "session.go", + "session_state.go", + "socket.go", + "util.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fd", + "//pkg/log", + "//pkg/metric", + "//pkg/p9", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + 
"//pkg/sentry/fs/fdpipe", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/host", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/unet", + "//pkg/waiter", + ], +) + +go_test( + name = "gofer_test", + size = "small", + srcs = ["gofer_test.go"], + embed = [":gofer"], + deps = [ + "//pkg/log", + "//pkg/p9", + "//pkg/p9/p9test", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/unet", + ], +) diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go new file mode 100644 index 000000000..5e24767f9 --- /dev/null +++ b/pkg/sentry/fs/gofer/attr.go @@ -0,0 +1,162 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// getattr returns the 9p attributes of the p9.File. 
On success, Mode, Size, and RDev +// are guaranteed to be masked as valid. +func getattr(ctx context.Context, file contextFile) (p9.QID, p9.AttrMask, p9.Attr, error) { + // Retrieve attributes over the wire. + qid, valid, attr, err := file.getAttr(ctx, p9.AttrMaskAll()) + if err != nil { + return qid, valid, attr, err + } + + // Require mode, size, and raw device id. + if !valid.Mode || !valid.Size || !valid.RDev { + return qid, valid, attr, syscall.EIO + } + + return qid, valid, attr, nil +} + +func unstable(ctx context.Context, valid p9.AttrMask, pattr p9.Attr, mounter fs.FileOwner, client *p9.Client) fs.UnstableAttr { + return fs.UnstableAttr{ + Size: int64(pattr.Size), + Usage: int64(pattr.Size), + Perms: perms(valid, pattr, client), + Owner: owner(mounter, valid, pattr), + AccessTime: atime(ctx, valid, pattr), + ModificationTime: mtime(ctx, valid, pattr), + StatusChangeTime: ctime(ctx, valid, pattr), + Links: links(valid, pattr), + } +} + +func perms(valid p9.AttrMask, pattr p9.Attr, client *p9.Client) fs.FilePermissions { + if pattr.Mode.IsDir() && !p9.VersionSupportsMultiUser(client.Version()) { + // If user and group permissions bits are not supplied, use + // "other" bits to supplement them. + // + // Older Gofer's fake directories only have "other" permission, + // but will often be accessed via user or group permissions. + if pattr.Mode&0770 == 0 { + other := pattr.Mode & 07 + pattr.Mode = pattr.Mode | other<<3 | other<<6 + } + } + return fs.FilePermsFromP9(pattr.Mode) +} + +func owner(mounter fs.FileOwner, valid p9.AttrMask, pattr p9.Attr) fs.FileOwner { + // Unless the file returned its UID and GID, it belongs to the mounting + // task's EUID/EGID. + owner := mounter + if valid.UID { + owner.UID = auth.KUID(pattr.UID) + } + if valid.GID { + owner.GID = auth.KGID(pattr.GID) + } + return owner +} + +// bsize returns a block size from 9p attributes. 
+func bsize(pattr p9.Attr) int64 { + if pattr.BlockSize > 0 { + return int64(pattr.BlockSize) + } + // Some files may have no clue of their block size. Better not to report + // something misleading or buggy and have a safe default. + return usermem.PageSize +} + +// ntype returns an fs.InodeType from 9p attributes. +func ntype(pattr p9.Attr) fs.InodeType { + switch { + case pattr.Mode.IsNamedPipe(): + return fs.Pipe + case pattr.Mode.IsDir(): + return fs.Directory + case pattr.Mode.IsSymlink(): + return fs.Symlink + case pattr.Mode.IsCharacterDevice(): + return fs.CharacterDevice + case pattr.Mode.IsBlockDevice(): + return fs.BlockDevice + case pattr.Mode.IsSocket(): + return fs.Socket + case pattr.Mode.IsRegular(): + fallthrough + default: + return fs.RegularFile + } +} + +// ctime returns a change time from 9p attributes. +func ctime(ctx context.Context, valid p9.AttrMask, pattr p9.Attr) ktime.Time { + if valid.CTime { + return ktime.FromUnix(int64(pattr.CTimeSeconds), int64(pattr.CTimeNanoSeconds)) + } + // Approximate ctime with mtime if ctime isn't available. + return mtime(ctx, valid, pattr) +} + +// atime returns an access time from 9p attributes. +func atime(ctx context.Context, valid p9.AttrMask, pattr p9.Attr) ktime.Time { + if valid.ATime { + return ktime.FromUnix(int64(pattr.ATimeSeconds), int64(pattr.ATimeNanoSeconds)) + } + return ktime.NowFromContext(ctx) +} + +// mtime returns a modification time from 9p attributes. +func mtime(ctx context.Context, valid p9.AttrMask, pattr p9.Attr) ktime.Time { + if valid.MTime { + return ktime.FromUnix(int64(pattr.MTimeSeconds), int64(pattr.MTimeNanoSeconds)) + } + return ktime.NowFromContext(ctx) +} + +// links returns a hard link count from 9p attributes. +func links(valid p9.AttrMask, pattr p9.Attr) uint64 { + // For gofer file systems that support link count (such as a local file gofer), + // we return the link count reported by the underlying file system. 
+ if valid.NLink { + return pattr.NLink + } + + // This node is likely backed by a file system that doesn't support links. + // We could readdir() and count children directories to provide an accurate + // link count. However this may be expensive since the gofer may be backed by remote + // storage. Instead, simply return 2 links for directories and 1 for everything else + // since no one relies on an accurate link count for gofer-based file systems. + switch ntype(pattr) { + case fs.Directory: + return 2 + default: + return 1 + } +} diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go new file mode 100644 index 000000000..d4b6f6eb7 --- /dev/null +++ b/pkg/sentry/fs/gofer/context_file.go @@ -0,0 +1,190 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextFile is a wrapper around p9.File that notifies the context that +// it's about to sleep before calling the Gofer over P9. 
+type contextFile struct { + file p9.File +} + +func (c *contextFile) walk(ctx context.Context, names []string) ([]p9.QID, contextFile, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + q, f, err := c.file.Walk(names) + if err != nil { + return nil, contextFile{}, err + } + return q, contextFile{file: f}, nil +} + +func (c *contextFile) statFS(ctx context.Context) (p9.FSStat, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.StatFS() +} + +func (c *contextFile) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.GetAttr(req) +} + +func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.SetAttr(valid, attr) +} + +func (c *contextFile) remove(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Remove() +} + +func (c *contextFile) rename(ctx context.Context, directory contextFile, name string) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Rename(directory.file, name) +} + +func (c *contextFile) close(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Close() +} + +func (c *contextFile) open(ctx context.Context, mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Open(mode) +} + +func (c *contextFile) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + 
return c.file.ReadAt(p, offset) +} + +func (c *contextFile) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.WriteAt(p, offset) +} + +func (c *contextFile) fsync(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.FSync() +} + +func (c *contextFile) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + fd, _, _, _, err := c.file.Create(name, flags, permissions, uid, gid) + return fd, err +} + +func (c *contextFile) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Mkdir(name, permissions, uid, gid) +} + +func (c *contextFile) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Symlink(oldName, newName, uid, gid) +} + +func (c *contextFile) link(ctx context.Context, target *contextFile, newName string) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Link(target.file, newName) +} + +func (c *contextFile) mknod(ctx context.Context, name string, permissions p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Mknod(name, permissions, major, minor, uid, gid) +} + +func (c *contextFile) unlinkAt(ctx context.Context, name string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + defer 
ctx.UninterruptibleSleepFinish(false) + + return c.file.UnlinkAt(name, flags) +} + +func (c *contextFile) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Readdir(offset, count) +} + +func (c *contextFile) readlink(ctx context.Context) (string, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Readlink() +} + +func (c *contextFile) flush(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Flush() +} + +func (c *contextFile) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, contextFile, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + q, f, m, a, err := c.file.WalkGetAttr(names) + if err != nil { + return nil, contextFile{}, p9.AttrMask{}, p9.Attr{}, err + } + return q, contextFile{file: f}, m, a, nil +} + +func (c *contextFile) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Connect(flags) +} diff --git a/pkg/sentry/fs/gofer/device.go b/pkg/sentry/fs/gofer/device.go new file mode 100644 index 000000000..fac7306d4 --- /dev/null +++ b/pkg/sentry/fs/gofer/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// goferDevice is the gofer virtual device. +var goferDevice = device.NewAnonMultiDevice() diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go new file mode 100644 index 000000000..07c9bf01d --- /dev/null +++ b/pkg/sentry/fs/gofer/file.go @@ -0,0 +1,255 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +var openedWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") + +// fileOperations implements fs.FileOperations for a remote file system. 
+type fileOperations struct { + fsutil.NoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + // inodeOperations is the inodeOperations backing the file. It is protected + // by a reference held by File.Dirent.Inode which is stable until + // FileOperations.Release is called. + inodeOperations *inodeOperations `state:"wait"` + + // dirCursor is the directory cursor. + dirCursor string + + // handles are the opened remote file system handles, which may + // be shared with other files. + handles *handles `state:"nosave"` + + // flags are the flags used to open handles. + flags fs.FileFlags `state:"wait"` +} + +// fileOperations implements fs.FileOperations. +var _ fs.FileOperations = (*fileOperations)(nil) + +// NewFile returns a file. NewFile is not appropriate with host pipes and sockets. +func NewFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, i *inodeOperations, handles *handles) *fs.File { + // Remote file systems enforce readability/writability at an offset, + // see fs/9p/vfs_inode.c:v9fs_vfs_atomic_open -> fs/open.c:finish_open. + flags.Pread = true + flags.Pwrite = true + + f := &fileOperations{ + inodeOperations: i, + handles: handles, + flags: flags, + } + if flags.Write { + if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Execute: true}); err == nil { + name, _ := dirent.FullName(fs.RootFromContext(ctx)) + openedWX.Increment() + log.Warningf("Opened a writable executable: %q", name) + } + } + return fs.NewFile(ctx, dirent, flags, f) +} + +// Release implements fs.FileOpeations.Release. +func (f *fileOperations) Release() { + f.handles.DecRef() +} + +// Readdir implements fs.FileOperations.Readdir. 
+func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &f.dirCursor, + } + n, err := fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) + if f.inodeOperations.session().cachePolicy != cacheNone { + f.inodeOperations.cachingInodeOps.TouchAccessTime(ctx, file.Dirent.Inode) + } + return n, err +} + +// IterateDir implements fs.DirIterator.IterateDir. +func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + f.inodeOperations.readdirMu.Lock() + defer f.inodeOperations.readdirMu.Unlock() + + // Fetch directory entries if needed. + if f.inodeOperations.readdirCache == nil || f.inodeOperations.session().cachePolicy == cacheNone { + entries, err := f.readdirAll(ctx) + if err != nil { + return offset, err + } + + // Cache the readdir result. + f.inodeOperations.readdirCache = fs.NewSortedDentryMap(entries) + } + + // Serialize the entries. + n, err := fs.GenericReaddir(dirCtx, f.inodeOperations.readdirCache) + return offset + n, err +} + +// readdirAll fetches fs.DentAttrs for f, using the attributes of g. +func (f *fileOperations) readdirAll(ctx context.Context) (map[string]fs.DentAttr, error) { + entries := make(map[string]fs.DentAttr) + var readOffset uint64 + for { + // We choose some arbitrary high number of directory entries (64k) and call + // Readdir until we've exhausted them all. + dirents, err := f.handles.File.readdir(ctx, readOffset, 64*1024) + if err != nil { + return nil, err + } + if len(dirents) == 0 { + // We're done, we reached EOF. + break + } + + // The last dirent contains the offset into the next set of dirents. The gofer + // returns the offset as an index into directories, not as a byte offset, because + // converting a byte offset to an index into directories entries is a huge pain. 
+ // But everything is fine if we're consistent. + readOffset = dirents[len(dirents)-1].Offset + + for _, dirent := range dirents { + if dirent.Name == "." || dirent.Name == ".." { + // These must not be included in Readdir results. + continue + } + + // Find a best approximation of the type. + var nt fs.InodeType + switch dirent.Type { + case p9.TypeDir: + nt = fs.Directory + case p9.TypeSymlink: + nt = fs.Symlink + default: + nt = fs.RegularFile + } + + // Install the DentAttr. + entries[dirent.Name] = fs.DentAttr{ + Type: nt, + // Construct the key to find the virtual inode. + // Directory entries reside on the same Device + // and SecondaryDevice as their parent. + InodeID: goferDevice.Map(device.MultiDeviceKey{ + Device: f.inodeOperations.fileState.key.Device, + SecondaryDevice: f.inodeOperations.fileState.key.SecondaryDevice, + Inode: dirent.QID.Path, + }), + } + } + } + + return entries, nil +} + +// Write implements fs.FileOperations.Write. +func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if fs.IsDir(file.Dirent.Inode.StableAttr) { + // Not all remote file systems enforce this so this client does. + return 0, syserror.EISDIR + } + + // Do cached IO for regular files only. Some character devices expect no caching. + isFile := fs.IsFile(file.Dirent.Inode.StableAttr) + if f.inodeOperations.session().cachePolicy == cacheNone || !isFile { + return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) + } + return f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) +} + +// Read implements fs.FileOperations.Read. +func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if fs.IsDir(file.Dirent.Inode.StableAttr) { + // Not all remote file systems enforce this so this client does. + return 0, syserror.EISDIR + } + + // Do cached IO for regular files only. Some character devices expect no caching. 
+ isFile := fs.IsFile(file.Dirent.Inode.StableAttr) + if f.inodeOperations.session().cachePolicy == cacheNone || !isFile { + return dst.CopyOutFrom(ctx, f.handles.readWriterAt(ctx, offset)) + } + return f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) +} + +// Fsync implements fs.FileOperations.Fsync. +func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { + switch syncType { + case fs.SyncAll, fs.SyncData: + if err := file.Dirent.Inode.WriteOut(ctx); err != nil { + return err + } + fallthrough + case fs.SyncBackingStorage: + // Sync remote caches. + if f.handles.Host != nil { + // Sync the host fd directly. + return syscall.Fsync(f.handles.Host.FD()) + } + // Otherwise sync on the p9.File handle. + return f.handles.File.fsync(ctx) + } + panic("invalid sync type") +} + +// Flush implements fs.FileOperations.Flush. +func (f *fileOperations) Flush(ctx context.Context, file *fs.File) error { + // If this file is not opened writable then there is nothing to flush. + // We do this because some p9 server implementations of Flush are + // over-zealous. + // + // FIXME: weaken these implementations and remove this check. + if !file.Flags().Write { + return nil + } + // Execute the flush. + return f.handles.File.flush(ctx) +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + if !isFileCachable(f.inodeOperations.session(), file.Dirent.Inode) { + return syserror.ENODEV + } + return fsutil.GenericConfigureMMap(file, f.inodeOperations.cachingInodeOps, opts) +} + +// Seek implements fs.FileOperations.Seek. 
+func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) +} diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go new file mode 100644 index 000000000..1d63e33ec --- /dev/null +++ b/pkg/sentry/fs/gofer/file_state.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// afterLoad is invoked by stateify. +func (f *fileOperations) afterLoad() { + load := func() { + f.inodeOperations.fileState.waitForLoad() + + // Manually load the open handles. + var err error + // TODO: Context is not plumbed to save/restore. + f.handles, err = newHandles(context.Background(), f.inodeOperations.fileState.file, f.flags) + if err != nil { + panic("failed to re-open handle: " + err.Error()) + } + f.inodeOperations.fileState.setHandlesForCachedIO(f.flags, f.handles) + } + fs.Async(load) +} diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go new file mode 100644 index 000000000..0a1a49bbd --- /dev/null +++ b/pkg/sentry/fs/gofer/fs.go @@ -0,0 +1,252 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package gofer implements a remote 9p filesystem.
package gofer

import (
	"errors"
	"fmt"
	"strconv"

	"gvisor.googlesource.com/gvisor/pkg/p9"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
)

// The following are options defined by the Linux 9p client that we support,
// see Documentation/filesystems/9p.txt.
const (
	// The transport method.
	transportKey = "trans"

	// The file tree to access when the file server
	// is exporting several file systems. Stands for "attach name".
	anameKey = "aname"

	// The caching policy.
	cacheKey = "cache"

	// The file descriptor for reading with trans=fd.
	readFDKey = "rfdno"

	// The file descriptor for writing with trans=fd.
	writeFDKey = "wfdno"

	// The number of bytes to use for a 9p packet payload.
	msizeKey = "msize"

	// The 9p protocol version.
	versionKey = "version"

	// If set to true allows the creation of unix domain sockets inside the
	// sandbox using files backed by the gofer. If set to false, unix sockets
	// cannot be bound to gofer files without an overlay on top.
	privateUnixSocketKey = "privateunixsocket"
)

// cachePolicy is a 9p cache policy.
type cachePolicy string

const (
	// Use virtual file system cache.
	cacheAll cachePolicy = "fscache"

	// TODO: fully support cache=none.
	cacheNone cachePolicy = "none"

	// defaultCache is cacheAll. Note this diverges from the 9p Linux
	// client whose default is "none". See TODO above.
	defaultCache = cacheAll
)

// defaultAname is the default attach name.
const defaultAname = "/"

// defaultMSize is the message size used for chunking large read and write requests.
// This has been tested to give good enough performance up to 64M.
const defaultMSize = 1024 * 1024 // 1M

// defaultVersion is the default 9p protocol version. Will negotiate downwards with
// file server if needed.
var defaultVersion = p9.HighestVersionString()

// Number of names of non-children to cache, preventing unneeded walks. 64 is
// plenty for nodejs, which seems to stat about 4 children on every require().
const nonChildrenCacheSize = 64

var (
	// ErrNoTransport is returned when there is no 'trans' option.
	ErrNoTransport = errors.New("missing required option: 'trans='")

	// ErrNoReadFD is returned when there is no 'rfdno' option.
	ErrNoReadFD = errors.New("missing required option: 'rfdno='")

	// ErrNoWriteFD is returned when there is no 'wfdno' option.
	ErrNoWriteFD = errors.New("missing required option: 'wfdno='")
)

// filesystem is a 9p client.
type filesystem struct{}

func init() {
	// Register under "9p" so mounts can resolve this filesystem by name.
	fs.RegisterFilesystem(&filesystem{})
}

// FilesystemName is the name under which the filesystem is registered.
// The name matches fs/9p/vfs_super.c:v9fs_fs_type.name.
const FilesystemName = "9p"

// Name is the name of the filesystem.
func (*filesystem) Name() string {
	return FilesystemName
}

// AllowUserMount prohibits users from using mount(2) with this file system.
func (*filesystem) AllowUserMount() bool {
	return false
}

// Flags returns that there is nothing special about this file system.
//
// The 9p Linux client returns FS_RENAME_DOES_D_MOVE, see fs/9p/vfs_super.c.
func (*filesystem) Flags() fs.FilesystemFlags {
	return 0
}

// Mount returns an attached 9p client that can be positioned in the vfs.
func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) {
	// Parse and validate the mount options.
	o, err := options(data)
	if err != nil {
		return nil, err
	}

	// Construct the 9p root to mount. We intentionally diverge from Linux in that
	// the first Tversion and Tattach requests are done lazily.
	return Root(ctx, device, f, flags, o)
}

// opts are parsed 9p mount options.
type opts struct {
	// fd is the host file descriptor carrying the 9p transport
	// (from 'rfdno='/'wfdno', which must match).
	fd int
	// aname is the attach name ('aname='), defaulting to "/".
	aname string
	// policy is the cache policy ('cache='), cacheAll or cacheNone.
	policy cachePolicy
	// msize is the 9p payload size in bytes ('msize=').
	msize uint32
	// version is the requested 9p protocol version ('version=').
	version string
	// privateunixsocket reflects 'privateunixsocket='.
	privateunixsocket bool
}

// options parses mount(2) data into structured options.
//
// It requires 'trans=fd' with matching 'rfdno='/'wfdno=' values, applies
// documented defaults for the optional keys, and rejects any option it does
// not recognize so a misspelled option fails the mount instead of being
// silently ignored.
func options(data string) (opts, error) {
	var o opts

	// Parse generic comma-separated key=value options, this file system expects them.
	options := fs.GenericMountSourceOptions(data)

	// Check for the required 'trans=fd' option.
	trans, ok := options[transportKey]
	if !ok {
		return o, ErrNoTransport
	}
	if trans != "fd" {
		return o, fmt.Errorf("unsupported transport: 'trans=%s'", trans)
	}
	delete(options, transportKey)

	// Check for the required 'rfdno=' option.
	srfd, ok := options[readFDKey]
	if !ok {
		return o, ErrNoReadFD
	}
	delete(options, readFDKey)

	// Check for the required 'wfdno=' option.
	swfd, ok := options[writeFDKey]
	if !ok {
		return o, ErrNoWriteFD
	}
	delete(options, writeFDKey)

	// Parse the read fd.
	rfd, err := strconv.Atoi(srfd)
	if err != nil {
		return o, fmt.Errorf("invalid fd for 'rfdno=%s': %v", srfd, err)
	}

	// Parse the write fd.
	wfd, err := strconv.Atoi(swfd)
	if err != nil {
		return o, fmt.Errorf("invalid fd for 'wfdno=%s': %v", swfd, err)
	}

	// Require that the read and write fd are the same.
	if rfd != wfd {
		return o, fmt.Errorf("fd in 'rfdno=%d' and 'wfdno=%d' must match", rfd, wfd)
	}
	o.fd = rfd

	// Parse the attach name.
	o.aname = defaultAname
	if an, ok := options[anameKey]; ok {
		o.aname = an
		delete(options, anameKey)
	}

	// Parse the cache policy. Reject unsupported policies.
	o.policy = cacheAll
	if cp, ok := options[cacheKey]; ok {
		if cachePolicy(cp) != cacheAll && cachePolicy(cp) != cacheNone {
			return o, fmt.Errorf("unsupported cache mode: 'cache=%s'", cp)
		}
		o.policy = cachePolicy(cp)
		delete(options, cacheKey)
	}

	// Parse the message size. Reject malformed options.
	o.msize = uint32(defaultMSize)
	if m, ok := options[msizeKey]; ok {
		i, err := strconv.ParseUint(m, 10, 32)
		if err != nil {
			return o, fmt.Errorf("invalid message size for 'msize=%s': %v", m, err)
		}
		o.msize = uint32(i)
		delete(options, msizeKey)
	}

	// Parse the protocol version.
	o.version = defaultVersion
	if v, ok := options[versionKey]; ok {
		o.version = v
		delete(options, versionKey)
	}

	// Parse the unix socket policy. Reject non-booleans.
	if v, ok := options[privateUnixSocketKey]; ok {
		b, err := strconv.ParseBool(v)
		if err != nil {
			return o, fmt.Errorf("invalid boolean value for '%s=%s': %v", privateUnixSocketKey, v, err)
		}
		o.privateunixsocket = b
		delete(options, privateUnixSocketKey)
	}

	// Fail to attach if the caller wanted us to do something that we
	// don't support.
	if len(options) > 0 {
		return o, fmt.Errorf("unsupported mount options: %v", options)
	}

	return o, nil
}
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go
new file mode 100644
index 000000000..58a2e2ef5
--- /dev/null
+++ b/pkg/sentry/fs/gofer/gofer_test.go
@@ -0,0 +1,776 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "errors" + "fmt" + "io" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/p9/p9test" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +// A errMock is an error that comes from bad usage of the mock. +var errMock = errors.New("mock error") + +// goodMockFile returns a file that can be Walk'ed to and created. +func goodMockFile(mode p9.FileMode, size uint64) *p9test.FileMock { + return &p9test.FileMock{ + GetAttrMock: p9test.GetAttrMock{ + Valid: p9.AttrMask{Mode: true, Size: true, RDev: true}, + Attr: p9.Attr{Mode: mode, Size: size, RDev: 0}, + }, + } +} + +func newClosedSocket() (*unet.Socket, error) { + fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + return nil, err + } + + s, err := unet.NewSocket(fd) + if err != nil { + syscall.Close(fd) + return nil, err + } + + return s, s.Close() +} + +// root returns a p9 file mock and an fs.InodeOperations created from that file. Any +// functions performed on fs.InodeOperations will use the p9 file mock. 
+func root(ctx context.Context, mode p9.FileMode, size uint64) (*p9test.FileMock, *fs.Inode, error) { + sock, err := newClosedSocket() + if err != nil { + return nil, nil, err + } + + // Construct a dummy session that we can destruct. + s := &session{ + conn: sock, + mounter: fs.RootOwner, + cachePolicy: cacheNone, + } + + rootFile := goodMockFile(mode, size) + sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{file: rootFile}, p9.QID{}, rootFile.GetAttrMock.Valid, rootFile.GetAttrMock.Attr) + m := fs.NewMountSource(s, &filesystem{}, fs.MountSourceFlags{}) + return rootFile, fs.NewInode(rootInodeOperations, m, sattr), nil +} + +func TestLookup(t *testing.T) { + // Test parameters. + type lookupTest struct { + // Name of the test. + name string + + // Function input parameters. + fileName string + + // Expected return value. + want error + } + + tests := []lookupTest{ + { + name: "mock Walk passes (function succeeds)", + fileName: "ppp", + want: nil, + }, + { + name: "mock Walk fails (function fails)", + fileName: "ppp", + want: syscall.ENOENT, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + if err != nil { + t.Errorf("TestWalk %s failed: root error got %v, want nil", test.name, err) + } + + rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} + rootFile.WalkGetAttrMock.Err = test.want + rootFile.WalkGetAttrMock.File = goodMockFile(p9.PermissionsMask, 0) + + // Call function. + dirent, err := rootInode.Lookup(ctx, test.fileName) + + // Unwrap the InodeOperations. + var newInodeOperations fs.InodeOperations + if dirent != nil { + if dirent.IsNegative() { + err = syscall.ENOENT + } else { + newInodeOperations = dirent.Inode.InodeOperations + } + } + + // Check return values. 
+ if err != test.want { + t.Errorf("TestWalk %s failed: got %v, want %v", test.name, err, test.want) + } + if err == nil && newInodeOperations == nil { + t.Errorf("TestWalk %s failed: expected either non-nil err or non-nil node, but both are nil", test.name) + } + + // Check mock parameters. + if !rootFile.WalkGetAttrMock.Called { + t.Errorf("TestWalk %s failed: GetAttr not called; error: %v", test.name, err) + } else if rootFile.WalkGetAttrMock.Names[0] != test.fileName { + t.Errorf("TestWalk %s failed: file name not set", test.name) + } + } +} + +func TestSetTimestamps(t *testing.T) { + // Test parameters. + type setTimestampsTest struct { + // Name of the test. + name string + + // Function input parameters. + ts fs.TimeSpec + } + + ctx := contexttest.Context(t) + now := ktime.NowFromContext(ctx) + tests := []setTimestampsTest{ + { + name: "mock SetAttr passes (function succeeds)", + ts: fs.TimeSpec{ + ATime: now, + MTime: now, + }, + }, + { + name: "mock SetAttr passes, times are 0 (function succeeds)", + ts: fs.TimeSpec{}, + }, + { + name: "mock SetAttr passes, times are 0 and not system time (function succeeds)", + ts: fs.TimeSpec{ + ATimeSetSystemTime: false, + MTimeSetSystemTime: false, + }, + }, + { + name: "mock SetAttr passes, times are set to system time (function succeeds)", + ts: fs.TimeSpec{ + ATimeSetSystemTime: true, + MTimeSetSystemTime: true, + }, + }, + { + name: "mock SetAttr passes, times are omitted (function succeeds)", + ts: fs.TimeSpec{ + ATimeOmit: true, + MTimeOmit: true, + }, + }, + } + + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + if err != nil { + t.Errorf("TestSetTimestamps %s failed: root error got %v, want nil", test.name, err) + } + + // Call function. + err = rootInode.SetTimestamps(ctx, nil /* Dirent */, test.ts) + + // Check return values. 
+ if err != nil { + t.Errorf("TestSetTimestamps %s failed: got %v, want nil", test.name, err) + } + + // Check mock parameters. + if !(test.ts.ATimeOmit && test.ts.MTimeOmit) && !rootFile.SetAttrMock.Called { + t.Errorf("TestSetTimestamps %s failed: SetAttr not called", test.name) + continue + } + + // Check what was passed to the mock function. + attr := rootFile.SetAttrMock.Attr + atimeGiven := ktime.FromUnix(int64(attr.ATimeSeconds), int64(attr.ATimeNanoSeconds)) + if test.ts.ATimeOmit { + if rootFile.SetAttrMock.Valid.ATime { + t.Errorf("TestSetTimestamps %s failed: ATime got set true in mask, wanted false", test.name) + } + } else { + if got, want := rootFile.SetAttrMock.Valid.ATimeNotSystemTime, !test.ts.ATimeSetSystemTime; got != want { + t.Errorf("TestSetTimestamps %s failed: got ATimeNotSystemTime %v, want %v", test.name, got, want) + } + if !test.ts.ATimeSetSystemTime && !test.ts.ATime.Equal(atimeGiven) { + t.Errorf("TestSetTimestamps %s failed: ATime got %v, want %v", test.name, atimeGiven, test.ts.ATime) + } + } + + mtimeGiven := ktime.FromUnix(int64(attr.MTimeSeconds), int64(attr.MTimeNanoSeconds)) + if test.ts.MTimeOmit { + if rootFile.SetAttrMock.Valid.MTime { + t.Errorf("TestSetTimestamps %s failed: MTime got set true in mask, wanted false", test.name) + } + } else { + if got, want := rootFile.SetAttrMock.Valid.MTimeNotSystemTime, !test.ts.MTimeSetSystemTime; got != want { + t.Errorf("TestSetTimestamps %s failed: got MTimeNotSystemTime %v, want %v", test.name, got, want) + } + if !test.ts.MTimeSetSystemTime && !test.ts.MTime.Equal(mtimeGiven) { + t.Errorf("TestSetTimestamps %s failed: MTime got %v, want %v", test.name, mtimeGiven, test.ts.MTime) + } + } + + } +} + +func TestSetPermissions(t *testing.T) { + // Test parameters. + type setPermissionsTest struct { + // Name of the test. + name string + + // SetPermissions input parameters. + perms fs.FilePermissions + + // Error that SetAttr mock should return. 
+ setAttrErr error + + // Expected return value. + want bool + } + + tests := []setPermissionsTest{ + { + name: "SetAttr mock succeeds (function succeeds)", + perms: fs.FilePermissions{User: fs.PermMask{Read: true, Write: true, Execute: true}}, + want: true, + setAttrErr: nil, + }, + { + name: "SetAttr mock fails (function fails)", + perms: fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}, + want: false, + setAttrErr: syscall.ENOENT, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, 0, 0) + if err != nil { + t.Errorf("TestSetPermissions %s failed: root error got %v, want nil", test.name, err) + } + rootFile.SetAttrMock.Err = test.setAttrErr + + ok := rootInode.SetPermissions(ctx, nil /* Dirent */, test.perms) + + // Check return value. + if ok != test.want { + t.Errorf("TestSetPermissions %s failed: got %v, want %v", test.name, ok, test.want) + } + + // Check mock parameters. + pattr := rootFile.SetAttrMock.Attr + if !rootFile.SetAttrMock.Called { + t.Errorf("TestSetPermissions %s failed: SetAttr not called", test.name) + continue + } + if !rootFile.SetAttrMock.Valid.Permissions { + t.Errorf("TestSetPermissions %s failed: SetAttr did not get right request (got false, expected SetAttrMask.Permissions true)", + test.name) + } + if got := fs.FilePermsFromP9(pattr.Permissions); got != test.perms { + t.Errorf("TestSetPermissions %s failed: SetAttr did not get right permissions -- got %v, want %v", + test.name, got, test.perms) + } + } +} + +func TestClose(t *testing.T) { + ctx := contexttest.Context(t) + // Set up mock. + rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + if err != nil { + t.Errorf("TestClose failed: root error got %v, want nil", err) + } + + // Call function. + rootInode.InodeOperations.Release(ctx) + + // Check mock parameters. 
+ if !rootFile.CloseMock.Called { + t.Errorf("TestClose failed: Close not called") + } +} + +func TestRename(t *testing.T) { + // Test parameters. + type renameTest struct { + // Name of the test. + name string + + // Input parameters. + newParent *fs.Inode + newName string + + // Rename mock parameters. + renameErr error + renameCalled bool + + // Error want to return given the parameters. (Same as what + // we expect and tell rename to return.) + want error + } + ctx := contexttest.Context(t) + rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + if err != nil { + t.Errorf("TestRename failed: root error got %v, want nil", err) + } + + tests := []renameTest{ + { + name: "mock Rename succeeds (function succeeds)", + newParent: rootInode, + newName: "foo2", + want: nil, + renameErr: nil, + renameCalled: true, + }, + { + name: "mock Rename fails (function fails)", + newParent: rootInode, + newName: "foo2", + want: syscall.ENOENT, + renameErr: syscall.ENOENT, + renameCalled: true, + }, + { + name: "newParent is not inodeOperations but should be (function fails)", + newParent: fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}), + newName: "foo2", + want: syscall.EXDEV, + renameErr: nil, + renameCalled: false, + }, + } + + for _, test := range tests { + mockFile := goodMockFile(p9.PermissionsMask, 0) + rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} + rootFile.WalkGetAttrMock.File = mockFile + + dirent, err := rootInode.Lookup(ctx, "foo") + if err != nil { + t.Fatalf("root.Walk failed: %v", err) + } + mockFile.RenameMock.Err = test.renameErr + mockFile.RenameMock.Called = false + + // Use a dummy oldParent to acquire write access to that directory. + oldParent := &inodeOperations{ + readdirCache: fs.NewSortedDentryMap(nil), + } + oldInode := fs.NewInode(oldParent, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}) + + // Call function. 
+ err = dirent.Inode.InodeOperations.Rename(ctx, oldInode, "", test.newParent, test.newName) + + // Check return value. + if err != test.want { + t.Errorf("TestRename %s failed: got %v, want %v", test.name, err, test.want) + } + + // Check mock parameters. + if got, want := mockFile.RenameMock.Called, test.renameCalled; got != want { + t.Errorf("TestRename %s failed: renameCalled got %v want %v", test.name, got, want) + } + } +} + +// This file is read from in TestPreadv. +type readAtFileFake struct { + p9test.FileMock + + // Parameters for faking ReadAt. + FileLength int + Err error + ChunkSize int + Called bool + LengthRead int +} + +func (r *readAtFileFake) ReadAt(p []byte, offset uint64) (int, error) { + r.Called = true + log.Warningf("ReadAt fake: length read so far = %d, len(p) = %d, offset = %d", r.LengthRead, len(p), offset) + if int(offset) != r.LengthRead { + return 0, fmt.Errorf("offset got %d; expected %d", offset, r.LengthRead) + } + + if r.Err != nil { + return 0, r.Err + } + + if r.LengthRead >= r.FileLength { + return 0, io.EOF + } + + // Read at most ChunkSize and read at most what's left in the file. + toBeRead := len(p) + if r.LengthRead+toBeRead >= r.FileLength { + toBeRead = r.FileLength - int(offset) + } + if toBeRead > r.ChunkSize { + toBeRead = r.ChunkSize + } + + r.LengthRead += toBeRead + if r.LengthRead == r.FileLength { + return toBeRead, io.EOF + } + return toBeRead, nil +} + +func TestPreadv(t *testing.T) { + // Test parameters. + type preadvTest struct { + // Name of the test. + name string + + // Mock parameters + mode p9.FileMode + + // Buffer to read into. + buffer [512]byte + sliceSize int + + // How much readAt returns at a time. + chunkSize int + + // Whether or not we expect ReadAt to be called. + readAtCalled bool + readAtErr error + + // Expected return values. 
+ want error + } + + tests := []preadvTest{ + { + name: "fake ReadAt succeeds, 512 bytes requested, 512 byte chunks (function succeeds)", + want: nil, + readAtErr: nil, + mode: p9.PermissionsMask, + readAtCalled: true, + sliceSize: 512, + chunkSize: 512, + }, + { + name: "fake ReadAt succeeds, 512 bytes requested, 200 byte chunks (function succeeds)", + want: nil, + readAtErr: nil, + mode: p9.PermissionsMask, + readAtCalled: true, + sliceSize: 512, + chunkSize: 200, + }, + { + name: "fake ReadAt succeeds, 0 bytes requested (function succeeds)", + want: nil, + readAtErr: nil, + mode: p9.PermissionsMask, + readAtCalled: false, + sliceSize: 0, + chunkSize: 100, + }, + { + name: "fake ReadAt returns 0 bytes and EOF (function fails)", + want: io.EOF, + readAtErr: io.EOF, + mode: p9.PermissionsMask, + readAtCalled: true, + sliceSize: 512, + chunkSize: 512, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, test.mode, 1024) + if err != nil { + t.Errorf("TestPreadv %s failed: root error got %v, want nil", test.name, err) + } + + // Set up the read buffer. + dst := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) + + // This file will be read from. + openFile := &readAtFileFake{ + Err: test.readAtErr, + FileLength: test.sliceSize, + ChunkSize: test.chunkSize, + } + rootFile.WalkGetAttrMock.File = openFile + rootFile.WalkGetAttrMock.Attr.Mode = test.mode + rootFile.WalkGetAttrMock.Valid.Mode = true + + f := NewFile( + ctx, + fs.NewDirent(rootInode, ""), + fs.FileFlags{Read: true}, + rootInode.InodeOperations.(*inodeOperations), + &handles{File: contextFile{file: openFile}}, + ) + + // Call function. + _, err = f.Preadv(ctx, dst, 0) + + // Check return value. + if err != test.want { + t.Errorf("TestPreadv %s failed: got %v, want %v", test.name, err, test.want) + } + + // Check mock parameters. 
+ if test.readAtCalled != openFile.Called { + t.Errorf("TestPreadv %s failed: ReadAt called: %v, but expected opposite", test.name, openFile.Called) + } + } +} + +func TestReadlink(t *testing.T) { + // Test parameters. + type readlinkTest struct { + // Name of the test. + name string + + // Mock parameters + mode p9.FileMode + + // Whether or not we expect ReadAt to be called and what error + // it shall return. + readlinkCalled bool + readlinkErr error + + // Expected return values. + want error + } + + tests := []readlinkTest{ + { + name: "file is not symlink (function fails)", + want: syscall.ENOLINK, + mode: p9.PermissionsMask, + readlinkCalled: false, + readlinkErr: nil, + }, + { + name: "mock Readlink succeeds (function succeeds)", + want: nil, + mode: p9.PermissionsMask | p9.ModeSymlink, + readlinkCalled: true, + readlinkErr: nil, + }, + { + name: "mock Readlink fails (function fails)", + want: syscall.ENOENT, + mode: p9.PermissionsMask | p9.ModeSymlink, + readlinkCalled: true, + readlinkErr: syscall.ENOENT, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, test.mode, 0) + if err != nil { + t.Errorf("TestReadlink %s failed: root error got %v, want nil", test.name, err) + } + + openFile := goodMockFile(test.mode, 0) + rootFile.WalkMock.File = openFile + rootFile.ReadlinkMock.Err = test.readlinkErr + + // Call function. + _, err = rootInode.Readlink(ctx) + + // Check return value. + if err != test.want { + t.Errorf("TestReadlink %s failed: got %v, want %v", test.name, err, test.want) + } + + // Check mock parameters. + if test.readlinkCalled && !rootFile.ReadlinkMock.Called { + t.Errorf("TestReadlink %s failed: Readlink not called", test.name) + } + } +} + +// This file is write from in TestPwritev. +type writeAtFileFake struct { + p9test.FileMock + + // Parameters for faking WriteAt. 
+ Err error + ChunkSize int + Called bool + LengthWritten int +} + +func (r *writeAtFileFake) WriteAt(p []byte, offset uint64) (int, error) { + r.Called = true + log.Warningf("WriteAt fake: length written so far = %d, len(p) = %d, offset = %d", r.LengthWritten, len(p), offset) + if int(offset) != r.LengthWritten { + return 0, fmt.Errorf("offset got %d; want %d", offset, r.LengthWritten) + } + + if r.Err != nil { + return 0, r.Err + } + + // Write at most ChunkSize. + toBeWritten := len(p) + if toBeWritten > r.ChunkSize { + toBeWritten = r.ChunkSize + } + r.LengthWritten += toBeWritten + return toBeWritten, nil +} + +func TestPwritev(t *testing.T) { + // Test parameters. + type pwritevTest struct { + // Name of the test. + name string + + // Mock parameters + mode p9.FileMode + + allowWrite bool + + // Buffer to write into. + buffer [512]byte + sliceSize int + chunkSize int + + // Whether or not we expect writeAt to be called. + writeAtCalled bool + writeAtErr error + + // Expected return values. 
+ want error + } + + tests := []pwritevTest{ + { + name: "fake writeAt succeeds, one chunk (function succeeds)", + want: nil, + writeAtErr: nil, + mode: p9.PermissionsMask, + allowWrite: true, + writeAtCalled: true, + sliceSize: 512, + chunkSize: 512, + }, + { + name: "fake writeAt fails, short write (function fails)", + want: io.ErrShortWrite, + writeAtErr: nil, + mode: p9.PermissionsMask, + allowWrite: true, + writeAtCalled: true, + sliceSize: 512, + chunkSize: 200, + }, + { + name: "fake writeAt succeeds, len 0 (function succeeds)", + want: nil, + writeAtErr: nil, + mode: p9.PermissionsMask, + allowWrite: true, + writeAtCalled: false, + sliceSize: 0, + chunkSize: 0, + }, + { + name: "writeAt can still write despite file permissions read only (function succeeds)", + want: nil, + writeAtErr: nil, + mode: p9.PermissionsMask, + allowWrite: false, + writeAtCalled: true, + sliceSize: 512, + chunkSize: 512, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + _, rootInode, err := root(ctx, test.mode, 0) + if err != nil { + t.Errorf("TestPwritev %s failed: root error got %v, want nil", test.name, err) + } + + src := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) + + // This is the file that will be used for writing. + openFile := &writeAtFileFake{ + Err: test.writeAtErr, + ChunkSize: test.chunkSize, + } + + f := NewFile( + ctx, + fs.NewDirent(rootInode, ""), + fs.FileFlags{Write: true}, + rootInode.InodeOperations.(*inodeOperations), + &handles{File: contextFile{file: openFile}}, + ) + + // Call function. + _, err = f.Pwritev(ctx, src, 0) + + // Check return value. + if err != test.want { + t.Errorf("TestPwritev %s failed: got %v, want %v", test.name, err, test.want) + } + + // Check mock parameters. 
+ if test.writeAtCalled != openFile.Called { + t.Errorf("TestPwritev %s failed: WriteAt called: %v, but expected opposite", test.name, openFile.Called) + continue + } + if openFile.Called && test.writeAtErr != nil && openFile.LengthWritten != test.sliceSize { + t.Errorf("TestPwritev %s failed: wrote %d bytes, expected %d bytes written", test.name, openFile.LengthWritten, test.sliceSize) + } + } +} diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go new file mode 100644 index 000000000..a660c9230 --- /dev/null +++ b/pkg/sentry/fs/gofer/handles.go @@ -0,0 +1,144 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "io" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" +) + +// handles are the open handles of a gofer file. They are reference counted to +// support open handle sharing between files for read only filesystems. +// +// If Host != nil then it will be used exclusively over File. +type handles struct { + refs.AtomicRefCount + + // File is a p9.File handle. Must not be nil. + File contextFile + + // Host is an *fd.FD handle. May be nil. 
+ Host *fd.FD +} + +// DecRef drops a reference on handles. +func (h *handles) DecRef() { + h.DecRefWithDestructor(func() { + if h.Host != nil { + if err := h.Host.Close(); err != nil { + log.Warningf("error closing host file: %v", err) + } + } + // FIXME: Context is not plumbed here. + if err := h.File.close(context.Background()); err != nil { + log.Warningf("error closing p9 file: %v", err) + } + }) +} + +func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*handles, error) { + _, newFile, err := file.walk(ctx, nil) + if err != nil { + return nil, err + } + + switch { + case flags.Read && flags.Write: + hostFile, _, _, err := newFile.open(ctx, p9.ReadWrite) + if err != nil { + newFile.close(ctx) + return nil, err + } + h := &handles{ + File: newFile, + Host: hostFile, + } + return h, nil + case flags.Read && !flags.Write: + hostFile, _, _, err := newFile.open(ctx, p9.ReadOnly) + if err != nil { + newFile.close(ctx) + return nil, err + } + h := &handles{ + File: newFile, + Host: hostFile, + } + return h, nil + case !flags.Read && flags.Write: + hostFile, _, _, err := newFile.open(ctx, p9.WriteOnly) + if err != nil { + newFile.close(ctx) + return nil, err + } + h := &handles{ + File: newFile, + Host: hostFile, + } + return h, nil + default: + panic("impossible fs.FileFlags") + } +} + +type handleReadWriter struct { + ctx context.Context + h *handles + off int64 +} + +func (h *handles) readWriterAt(ctx context.Context, offset int64) *handleReadWriter { + return &handleReadWriter{ctx, h, offset} +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. 
+func (rw *handleReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + var r io.Reader + if rw.h.Host != nil { + r = secio.NewOffsetReader(rw.h.Host, rw.off) + } else { + r = &p9.ReadWriterFile{File: rw.h.File.file, Offset: uint64(rw.off)} + } + + rw.ctx.UninterruptibleSleepStart(false) + defer rw.ctx.UninterruptibleSleepFinish(false) + n, err := safemem.FromIOReader{r}.ReadToBlocks(dsts) + rw.off += int64(n) + return n, err +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *handleReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + var w io.Writer + if rw.h.Host != nil { + w = secio.NewOffsetWriter(rw.h.Host, rw.off) + } else { + w = &p9.ReadWriterFile{File: rw.h.File.file, Offset: uint64(rw.off)} + } + + rw.ctx.UninterruptibleSleepStart(false) + defer rw.ctx.UninterruptibleSleepFinish(false) + n, err := safemem.FromIOWriter{w}.WriteFromBlocks(srcs) + rw.off += int64(n) + return n, err +} diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go new file mode 100644 index 000000000..454242923 --- /dev/null +++ b/pkg/sentry/fs/gofer/inode.go @@ -0,0 +1,554 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gofer + +import ( + "errors" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations implements fs.InodeOperations. +type inodeOperations struct { + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + + // fileState implements fs.CachedFileObject. It exists + // to break a circular load dependency between inodeOperations + // and cachingInodeOps (below). + fileState *inodeFileState `state:"wait"` + + // cachingInodeOps implement memmap.Mappable for inodeOperations. + cachingInodeOps *fsutil.CachingInodeOperations + + // readdirMu protects readdirCache and concurrent Readdirs. + readdirMu sync.Mutex `state:"nosave"` + + // readdirCache is a cache of readdir results in the form of + // a fs.SortedDentryMap. + // + // Starts out as nil, and is initialized under readdirMu lazily; + // invalidating the cache means setting it to nil. + readdirCache *fs.SortedDentryMap `state:"nosave"` +} + +// inodeFileState implements fs.CachedFileObject and otherwise fully +// encapsulates state that needs to be manually loaded on restore for +// this file object. 
+// +// This unfortunate structure exists because fs.CachingInodeOperations +// defines afterLoad and therefore cannot be lazily loaded (to break a +// circular load dependency between it and inodeOperations). Even with +// lazy loading, this approach defines the dependencies between objects +// and the expected load behavior more concretely. +type inodeFileState struct { + // s is common file system state for Gofers. + s *session `state:"wait"` + + // MultiDeviceKey consists of: + // + // * Device: file system device from a specific gofer. + // * SecondaryDevice: unique identifier of the attach point. + // * Inode: the inode of this resource, unique per Device. + // + // These fields combined enable consistent hashing of virtual inodes + // on goferDevice. + key device.MultiDeviceKey `state:"nosave"` + + // file is the p9 file that contains a single unopened fid. + file contextFile `state:"nosave"` + + // sattr caches the stable attributes. + sattr fs.StableAttr `state:"wait"` + + // handlesMu protects the below fields. + handlesMu sync.RWMutex `state:"nosave"` + + // Do minimal open handle caching: only for read only filesystems. + readonly *handles `state:"nosave"` + + // Maintain readthrough handles for populating page caches. + readthrough *handles `state:"nosave"` + + // Maintain writeback handles for syncing from page caches. + writeback *handles `state:"nosave"` + + // writebackRW indicates whether writeback is opened read-write. If + // it is not and a read-write handle could replace writeback (above), + // then writeback is replaced with the read-write handle. This + // ensures that files that were first opened write-only and then + // later are opened read-write to be mapped can in fact be mapped. + writebackRW bool + + // loading is acquired when the inodeFileState begins an asynchronous + // load. It releases when the load is complete. Callers that require all + // state to be available should call waitForLoad() to ensure that.
+ loading sync.Mutex `state:".(struct{})"` + + // savedUAttr is only allocated during S/R. It points to the save-time + // unstable attributes and is used to validate restore-time ones. + // + // Note that these unstable attributes are only used to detect cross-S/R + // external file system metadata changes. They may differ from the + // cached unstable attributes in cachingInodeOps, as that might differ + // from the external file system attributes if there had been WriteOut + // failures. S/R is transparent to Sentry and the latter will continue + // using its cached values after restore. + savedUAttr *fs.UnstableAttr +} + +// Release releases file handles. +func (i *inodeFileState) Release(ctx context.Context) { + i.file.close(ctx) + if i.readonly != nil { + i.readonly.DecRef() + } + if i.readthrough != nil { + i.readthrough.DecRef() + } + if i.writeback != nil { + i.writeback.DecRef() + } +} + +// setHandlesForCachedIO installs file handles for reading and writing +// through fs.CachingInodeOperations. +func (i *inodeFileState) setHandlesForCachedIO(flags fs.FileFlags, h *handles) { + i.handlesMu.Lock() + defer i.handlesMu.Unlock() + + if flags.Read { + if i.readthrough == nil { + h.IncRef() + i.readthrough = h + } + } + if flags.Write { + if i.writeback == nil { + h.IncRef() + i.writeback = h + } else if !i.writebackRW && flags.Read { + i.writeback.DecRef() + h.IncRef() + i.writeback = h + } + if flags.Read { + i.writebackRW = true + } + } +} + +// getCachedHandles returns any cached handles which would accelerate +// performance generally. These handles should only be used if the mount +// supports caching. This is distinct from fs.CachingInodeOperations +// which is used for a limited set of file types (those that can be mapped). 
+func (i *inodeFileState) getCachedHandles(ctx context.Context, flags fs.FileFlags, msrc *fs.MountSource) (*handles, bool) { + i.handlesMu.Lock() + defer i.handlesMu.Unlock() + + if flags.Read && !flags.Write && msrc.Flags.ReadOnly { + if i.readonly != nil { + i.readonly.IncRef() + return i.readonly, true + } + h, err := newHandles(ctx, i.file, flags) + if err != nil { + return nil, false + } + i.readonly = h + i.readonly.IncRef() + return i.readonly, true + } + + return nil, false +} + +// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. +func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return i.readthrough.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts) +} + +// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. +func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return i.writeback.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs) +} + +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { + if mask.Empty() { + return nil + } + as, ans := attr.AccessTime.Unix() + ms, mns := attr.ModificationTime.Unix() + // An update of status change time is implied by mask.AccessTime + // or mask.ModificationTime. Updating status change time to a + // time earlier than the system time is not possible. 
+ return i.file.setAttr( + ctx, + p9.SetAttrMask{ + Permissions: mask.Perms, + Size: mask.Size, + UID: mask.UID, + GID: mask.GID, + ATime: mask.AccessTime, + ATimeNotSystemTime: true, + MTime: mask.ModificationTime, + MTimeNotSystemTime: true, + }, p9.SetAttr{ + Permissions: p9.FileMode(attr.Perms.LinuxMode()), + UID: p9.UID(attr.Owner.UID), + GID: p9.GID(attr.Owner.GID), + Size: uint64(attr.Size), + ATimeSeconds: uint64(as), + ATimeNanoSeconds: uint64(ans), + MTimeSeconds: uint64(ms), + MTimeNanoSeconds: uint64(mns), + }) +} + +// Sync implements fsutil.CachedFileObject.Sync. +func (i *inodeFileState) Sync(ctx context.Context) error { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + if i.writeback == nil { + return nil + } + return i.writeback.File.fsync(ctx) +} + +// FD implements fsutil.CachedFileObject.FD. +// +// FD meets the requirements of fsutil.CachedFileObject.FD because p9.File.Open +// returns a host file descriptor to back _both_ readthrough and writeback or +// not at all (e.g. both are nil). +func (i *inodeFileState) FD() int { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + + // Assert that the file was actually opened. + if i.writeback == nil && i.readthrough == nil { + panic("cannot get host FD for a file that was never opened") + } + // If this file is mapped, then it must have been opened + // read-write and i.writeback was upgraded to a read-write + // handle. Prefer that to map. + if i.writeback != nil { + if i.writeback.Host == nil { + return -1 + } + return int(i.writeback.Host.FD()) + } + // Otherwise the file may only have been opened readable + // so far. That's the only way it can be accessed. + if i.readthrough.Host == nil { + return -1 + } + return int(i.readthrough.Host.FD()) +} + +// waitForLoad makes sure any restore-issued loading is done. +func (i *inodeFileState) waitForLoad() { + // This is not a no-op. The loading mutex is held upon restore until + // all loading actions are done.
+ i.loading.Lock() + i.loading.Unlock() +} + +func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) { + _, valid, pattr, err := getattr(ctx, i.file) + if err != nil { + return fs.UnstableAttr{}, err + } + return unstable(ctx, valid, pattr, i.s.mounter, i.s.client), nil +} + +// session extracts the gofer's session from the MountSource. +func (i *inodeOperations) session() *session { + return i.fileState.s +} + +// Release implements fs.InodeOperations.Release. +func (i *inodeOperations) Release(ctx context.Context) { + i.fileState.Release(ctx) + i.cachingInodeOps.Release() +} + +// Mappable implements fs.InodeOperations.Mappable. +func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { + if i.session().cachePolicy == cacheNone || !fs.IsFile(inode.StableAttr) { + return nil + } + return i.cachingInodeOps +} + +func isCachable(session *session, inode *fs.Inode) bool { + return session.cachePolicy != cacheNone && (fs.IsFile(inode.StableAttr) || fs.IsDir(inode.StableAttr)) +} + +func isFileCachable(session *session, inode *fs.Inode) bool { + return session.cachePolicy != cacheNone && fs.IsFile(inode.StableAttr) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + if isCachable(i.session(), inode) { + return i.cachingInodeOps.UnstableAttr(ctx, inode) + } + return i.fileState.unstableAttr(ctx) +} + +// Check implements fs.InodeOperations.Check. +func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// GetFile implements fs.InodeOperations.GetFile. 
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + switch d.Inode.StableAttr.Type { + case fs.Socket: + return i.getFileSocket(ctx, d, flags) + case fs.Pipe: + return i.getFilePipe(ctx, d, flags) + default: + return i.getFileDefault(ctx, d, flags) + } +} + +func (i *inodeOperations) getFileSocket(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + f, err := i.fileState.file.connect(ctx, p9.AnonymousSocket) + if err != nil { + return nil, syscall.EIO + } + fsf, err := host.NewSocketWithDirent(ctx, d, f, flags) + if err != nil { + f.Close() + return nil, err + } + return fsf, nil +} + +func (i *inodeOperations) getFilePipe(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + // Try to open as a host pipe. + if pipeOps, err := fdpipe.Open(ctx, i, flags); err != errNotHostFile { + return fs.NewFile(ctx, d, flags, pipeOps), err + } + + // If the error is due to the fact that this was never a host pipe, then back + // this file with its dirent. + h, err := newHandles(ctx, i.fileState.file, flags) + if err != nil { + return nil, err + } + return NewFile(ctx, d, flags, i, h), nil +} + +// errNotHostFile indicates that the file is not a host file. +var errNotHostFile = errors.New("not a host file") + +// NonBlockingOpen implements fdpipe.NonBlockingOpener for opening host named pipes. +func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (*fd.FD, error) { + i.fileState.waitForLoad() + + // Get a cloned fid which we will open. 
+ _, newFile, err := i.fileState.file.walk(ctx, nil) + if err != nil { + log.Warningf("Open Walk failed: %v", err) + return nil, err + } + defer newFile.close(ctx) + + flags, err := openFlagsFromPerms(p) + if err != nil { + log.Warningf("Open flags %s parsing failed: %v", p, err) + return nil, err + } + hostFile, _, _, err := newFile.open(ctx, flags) + // If the host file returned is nil and the error is nil, + // then this was never a host file to begin with, and should + // be treated like a remote file. + if hostFile == nil && err == nil { + return nil, errNotHostFile + } + return hostFile, err +} + +func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + if !isFileCachable(i.session(), d.Inode) { + h, err := newHandles(ctx, i.fileState.file, flags) + if err != nil { + return nil, err + } + return NewFile(ctx, d, flags, i, h), nil + } + + h, ok := i.fileState.getCachedHandles(ctx, flags, d.Inode.MountSource) + if !ok { + var err error + h, err = newHandles(ctx, i.fileState.file, flags) + if err != nil { + return nil, err + } + } + i.fileState.setHandlesForCachedIO(flags, h) + + return NewFile(ctx, d, flags, i, h), nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + if isCachable(i.session(), inode) { + return i.cachingInodeOps.SetPermissions(ctx, inode, p) + } + + mask := p9.SetAttrMask{Permissions: true} + pattr := p9.SetAttr{Permissions: p9.FileMode(p.LinuxMode())} + // Execute the chmod. + return i.fileState.file.setAttr(ctx, mask, pattr) == nil +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + // Save the roundtrip. 
+ if !owner.UID.Ok() && !owner.GID.Ok() { + return nil + } + + if isCachable(i.session(), inode) { + return i.cachingInodeOps.SetOwner(ctx, inode, owner) + } + + var mask p9.SetAttrMask + var attr p9.SetAttr + if owner.UID.Ok() { + mask.UID = true + attr.UID = p9.UID(owner.UID) + } + if owner.GID.Ok() { + mask.GID = true + attr.GID = p9.GID(owner.GID) + } + return i.fileState.file.setAttr(ctx, mask, attr) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if isCachable(i.session(), inode) { + return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) + } + + return utimes(ctx, i.fileState.file, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length int64) error { + // This can only be called for files anyway. + if isFileCachable(i.session(), inode) { + return i.cachingInodeOps.Truncate(ctx, inode, length) + } + + return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)}) +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + if !isCachable(i.session(), inode) { + return nil + } + + return i.cachingInodeOps.WriteOut(ctx, inode) +} + +// Readlink implements fs.InodeOperations.Readlink. +func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if !fs.IsSymlink(inode.StableAttr) { + return "", syscall.ENOLINK + } + return i.fileState.file.readlink(ctx) +} + +// Getlink implements fs.InodeOperations.Getlink. +func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + if !fs.IsSymlink(i.fileState.sattr) { + return nil, syserror.ENOLINK + } + return nil, fs.ErrResolveViaReadlink +} + +// StatFS makes a StatFS request.
+func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { + fsstat, err := i.fileState.file.statFS(ctx) + if err != nil { + return fs.Info{}, err + } + + info := fs.Info{ + // This is primarily for distinguishing a gofer file system in + // tests. Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + TotalBlocks: fsstat.Blocks, + FreeBlocks: fsstat.BlocksFree, + TotalFiles: fsstat.Files, + FreeFiles: fsstat.FilesFree, + } + + // If blocks available is non-zero, prefer that. + if fsstat.BlocksAvailable != 0 { + info.FreeBlocks = fsstat.BlocksAvailable + } + + return info, nil +} + +func init() { + syserror.AddErrorUnwrapper(func(err error) (syscall.Errno, bool) { + if _, ok := err.(p9.ErrSocket); ok { + // Treat as an I/O error. + return syscall.EIO, true + } + return 0, false + }) +} + +// AddLink implements InodeOperations.AddLink, but is currently a noop. +// FIXME: Remove this from InodeOperations altogether. +func (*inodeOperations) AddLink() {} + +// DropLink implements InodeOperations.DropLink, but is currently a noop. +// FIXME: Remove this from InodeOperations altogether. +func (*inodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go new file mode 100644 index 000000000..997a7d1c1 --- /dev/null +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -0,0 +1,141 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" +) + +// Some fs implementations may not support atime, ctime, or mtime in getattr. +// The unstable() logic would try to use clock time for them. However, we do not +// want to use such time during S/R as that would cause restore timestamp +// checking failure. Hence a dummy stable-time clock is needed. +// +// Note that application-visible UnstableAttrs either come from CachingInodeOps +// (in which case they are saved), or they are requested from the gofer on each +// stat (for non-caching), so the dummy time only affects the modification +// timestamp check. +type dummyClock struct { + time.Clock +} + +// Now returns a stable dummy time. +func (d *dummyClock) Now() time.Time { + return time.Time{} +} + +type dummyClockContext struct { + context.Context +} + +// Value implements context.Context +func (d *dummyClockContext) Value(key interface{}) interface{} { + switch key { + case time.CtxRealtimeClock: + return &dummyClock{} + default: + return d.Context.Value(key) + } +} + +// beforeSave is invoked by stateify. +func (i *inodeFileState) beforeSave() { + if _, ok := i.s.inodeMappings[i.sattr.InodeID]; !ok { + panic(fmt.Sprintf("failed to find path for inode number %d. 
Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings))) + } + if i.sattr.Type == fs.RegularFile { + uattr, err := i.unstableAttr(&dummyClockContext{context.Background()}) + if err != nil { + panic(fmt.Sprintf("failed to get unstable attribute of %s: %v", i.s.inodeMappings[i.sattr.InodeID], err)) + } + i.savedUAttr = &uattr + } +} + +// saveLoading is invoked by stateify. +func (i *inodeFileState) saveLoading() struct{} { + return struct{}{} +} + +// loadLoading is invoked by stateify. +func (i *inodeFileState) loadLoading(_ struct{}) { + i.loading.Lock() +} + +// afterLoad is invoked by stateify. +func (i *inodeFileState) afterLoad() { + load := func() { + // See comment on i.loading(). + defer i.loading.Unlock() + + // Manually restore the p9.File. + name, ok := i.s.inodeMappings[i.sattr.InodeID] + if !ok { + // This should be impossible, see assertion in + // beforeSave. + panic(fmt.Sprintf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings))) + } + // TODO: Context is not plumbed to save/restore. + ctx := &dummyClockContext{context.Background()} + var err error + _, i.file, err = i.s.attach.walk(ctx, strings.Split(name, "/")) + if err != nil { + panic(fmt.Sprintf("failed to walk to %q: %v", name, err)) + } + + // Remap the saved inode number into the gofer device using the + // actual device and actual inode that exists in our new + // environment.
+ qid, mask, attrs, err := i.file.getAttr(ctx, p9.AttrMaskAll()) + if err != nil { + panic(fmt.Sprintf("failed to get file attributes of %s: %v", name, err)) + } + if !mask.RDev { + panic(fmt.Sprintf("file %s lacks device", name)) + } + i.key = device.MultiDeviceKey{ + Device: attrs.RDev, + SecondaryDevice: i.s.connID, + Inode: qid.Path, + } + if !goferDevice.Load(i.key, i.sattr.InodeID) { + panic(fmt.Sprintf("gofer device %s -> %d conflict in gofer device mappings: %s", i.key, i.sattr.InodeID, goferDevice)) + } + + if i.sattr.Type == fs.RegularFile { + env, ok := fs.CurrentRestoreEnvironment() + if !ok { + panic("missing restore environment") + } + uattr := unstable(ctx, mask, attrs, i.s.mounter, i.s.client) + if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { + panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.s.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)) + } + if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { + panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.s.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)) + } + i.savedUAttr = nil + } + } + + fs.Async(load) +} diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go new file mode 100644 index 000000000..d696f1561 --- /dev/null +++ b/pkg/sentry/fs/gofer/path.go @@ -0,0 +1,331 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// Lookup loads an Inode at name into a Dirent based on the session's cache +// policy. +func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + if i.session().cachePolicy != cacheNone { + // Check to see if we have readdirCache that indicates the + // child does not exist. Avoid holding readdirMu longer than + // we need to. + i.readdirMu.Lock() + if i.readdirCache != nil && !i.readdirCache.Contains(name) { + // No such child. Return a negative dirent. + i.readdirMu.Unlock() + return fs.NewNegativeDirent(name), nil + } + i.readdirMu.Unlock() + } + + // Get a p9.File for name. + qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) + if err != nil { + if err == syscall.ENOENT { + if i.session().cachePolicy != cacheNone { + // Return a negative Dirent. It will stay cached until something + // is created over it. + return fs.NewNegativeDirent(name), nil + } + return nil, syserror.ENOENT + } + return nil, err + } + + // Construct the Inode operations. + sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr) + + // Construct a positive Dirent. + return fs.NewDirent(fs.NewInode(node, dir.MountSource, sattr), name), nil +} + +// Creates a new Inode at name and returns its File based on the session's cache policy. +// +// Ownership is currently ignored. 
+func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { + // Create replaces the directory fid with the newly created/opened + // file, so clone this directory so it doesn't change out from under + // this node. + _, newFile, err := i.fileState.file.walk(ctx, nil) + if err != nil { + return nil, err + } + + // Map the FileFlags to p9 OpenFlags. + var openFlags p9.OpenFlags + switch { + case flags.Read && flags.Write: + openFlags = p9.ReadWrite + case flags.Read: + openFlags = p9.ReadOnly + case flags.Write: + openFlags = p9.WriteOnly + default: + panic(fmt.Sprintf("Create called with unknown or unset open flags: %v", flags)) + } + + owner := fs.FileOwnerFromContext(ctx) + hostFile, err := newFile.create(ctx, name, openFlags, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) + if err != nil { + // Could not create the file. + return nil, err + } + + i.touchModificationTime(ctx) + + // Get the attributes of the file. + qid, mask, p9attr, err := getattr(ctx, newFile) + if err != nil { + newFile.close(ctx) + return nil, err + } + + // Get an unopened p9.File for the file we created so that it can be + // cloned and re-opened multiple times after creation. + _, unopened, err := i.fileState.file.walk(ctx, []string{name}) + if err != nil { + newFile.close(ctx) + return nil, err + } + + // Construct the InodeOperations. + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr) + + // Construct the positive Dirent. + d := fs.NewDirent(fs.NewInode(iops, dir.MountSource, sattr), name) + defer d.DecRef() + + // Construct the new file, caching the handles if allowed. 
+ h := &handles{ + File: newFile, + Host: hostFile, + } + if isFileCachable(iops.session(), d.Inode) { + iops.fileState.setHandlesForCachedIO(flags, h) + } + return NewFile(ctx, d, flags, iops, h), nil +} + +// CreateLink uses Create to create a symlink between oldname and newname. +func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { + owner := fs.FileOwnerFromContext(ctx) + if _, err := i.fileState.file.symlink(ctx, oldname, newname, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + return err + } + i.touchModificationTime(ctx) + return nil +} + +// CreateHardLink implements InodeOperations.CreateHardLink. +func (i *inodeOperations) CreateHardLink(ctx context.Context, _ *fs.Inode, target *fs.Inode, newName string) error { + targetOpts, ok := target.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + + if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { + return err + } + // TODO: Don't increase link count because we can't properly account for links + // with gofers. + i.touchModificationTime(ctx) + return nil +} + +// CreateDirectory uses Create to create a directory named s under inodeOperations. +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s string, perm fs.FilePermissions) error { + owner := fs.FileOwnerFromContext(ctx) + if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + return err + } + if i.session().cachePolicy == cacheAll { + // Increase link count. + i.cachingInodeOps.IncLinks(ctx) + + // Invalidate readdir cache. + i.markDirectoryDirty() + } + return nil +} + +// Bind implements InodeOperations.
+func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perm fs.FilePermissions) error { + if i.session().endpoints == nil { + return syscall.EOPNOTSUPP + } + + // Create replaces the directory fid with the newly created/opened + // file, so clone this directory so it doesn't change out from under + // this node. + _, newFile, err := i.fileState.file.walk(ctx, nil) + if err != nil { + return err + } + + // Stabilize the endpoint map while creation is in progress. + unlock := i.session().endpoints.lock() + defer unlock() + + // Create a regular file in the gofer and then mark it as a socket by + // adding this inode key in the 'endpoints' map. + owner := fs.FileOwnerFromContext(ctx) + hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) + if err != nil { + return err + } + // We're not going to use this file. + hostFile.Close() + + i.touchModificationTime(ctx) + + // Get the attributes of the file to create inode key. + qid, _, attr, err := getattr(ctx, newFile) + if err != nil { + newFile.close(ctx) + return err + } + + key := device.MultiDeviceKey{ + Device: attr.RDev, + SecondaryDevice: i.session().connID, + Inode: qid.Path, + } + i.session().endpoints.add(key, ep) + + return nil +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. Gofer nodes do not support the +// creation of fifos and always returns EOPNOTSUPP. +func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syscall.EOPNOTSUPP +} + +// Remove implements InodeOperations.Remove. +func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { + var key device.MultiDeviceKey + removeSocket := false + if i.session().endpoints != nil { + // Find out if file being deleted is a socket that needs to be + // removed from endpoint map. 
+ if d, err := i.Lookup(ctx, dir, name); err == nil { + defer d.DecRef() + if fs.IsSocket(d.Inode.StableAttr) { + child := d.Inode.InodeOperations.(*inodeOperations) + key = child.fileState.key + removeSocket = true + + // Stabilize the endpoint map while deletion is in progress. + unlock := i.session().endpoints.lock() + defer unlock() + } + } + } + + if err := i.fileState.file.unlinkAt(ctx, name, 0); err != nil { + return err + } + if removeSocket { + i.session().endpoints.remove(key) + } + i.touchModificationTime(ctx) + + return nil +} + +// Remove implements InodeOperations.RemoveDirectory. +func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + // 0x200 = AT_REMOVEDIR. + if err := i.fileState.file.unlinkAt(ctx, name, 0x200); err != nil { + return err + } + if i.session().cachePolicy == cacheAll { + // Decrease link count and updates atime. + i.cachingInodeOps.DecLinks(ctx) + + // Invalidate readdir cache. + i.markDirectoryDirty() + } + return nil +} + +// Rename renames this node. +func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + // Unwrap the new parent to a *inodeOperations. + newParentInodeOperations, ok := newParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + + // Unwrap the old parent to a *inodeOperations. + oldParentInodeOperations, ok := oldParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + + // Do the rename. + if err := i.fileState.file.rename(ctx, newParentInodeOperations.fileState.file, newName); err != nil { + return err + } + + // Update cached state. + if i.session().cachePolicy == cacheAll { + // Is the renamed entity a directory? Fix link counts. + if fs.IsDir(i.fileState.sattr) { + oldParentInodeOperations.cachingInodeOps.DecLinks(ctx) + newParentInodeOperations.cachingInodeOps.IncLinks(ctx) + } + + // Mark old directory dirty. 
+ oldParentInodeOperations.markDirectoryDirty() + if oldParent != newParent { + // Mark new directory dirty. + newParentInodeOperations.markDirectoryDirty() + } + } + return nil +} + +func (i *inodeOperations) touchModificationTime(ctx context.Context) { + if i.session().cachePolicy == cacheAll { + i.cachingInodeOps.TouchModificationTime(ctx) + + // Invalidate readdir cache. + i.markDirectoryDirty() + } +} + +// markDirectoryDirty marks any cached data dirty for this directory. This is necessary in order +// to ensure that this node does not retain stale state throughout its lifetime across multiple +// open directory handles. +// +// Currently this means invalidating any readdir caches. +func (i *inodeOperations) markDirectoryDirty() { + i.readdirMu.Lock() + defer i.readdirMu.Unlock() + i.readdirCache = nil +} diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go new file mode 100644 index 000000000..ab3b964e0 --- /dev/null +++ b/pkg/sentry/fs/gofer/session.go @@ -0,0 +1,251 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gofer + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +type endpointMap struct { + mu sync.RWMutex + m map[device.MultiDeviceKey]unix.BoundEndpoint +} + +// add adds the endpoint to the map. +// +// Precondition: map must have been locked with 'lock'. +func (e *endpointMap) add(key device.MultiDeviceKey, ep unix.BoundEndpoint) { + e.m[key] = ep +} + +// remove deletes the key from the map. +// +// Precondition: map must have been locked with 'lock'. +func (e *endpointMap) remove(key device.MultiDeviceKey) { + delete(e.m, key) +} + +// lock blocks other addition and removal operations from happening while +// the backing file is being created or deleted. Returns a function that unlocks +// the endpoint map. +func (e *endpointMap) lock() func() { + e.mu.Lock() + return func() { e.mu.Unlock() } +} + +func (e *endpointMap) get(key device.MultiDeviceKey) unix.BoundEndpoint { + e.mu.RLock() + ep := e.m[key] + e.mu.RUnlock() + return ep +} + +// session holds state for each 9p session established during sys_mount. +type session struct { + refs.AtomicRefCount + + // conn is a unet.Socket that wraps the readFD/writeFD mount option, + // see fs/gofer/fs.go. + conn *unet.Socket `state:"nosave"` + + // msize is the value of the msize mount option, see fs/gofer/fs.go. + msize uint32 `state:"wait"` + + // version is the value of the version mount option, see fs/gofer/fs.go. + version string `state:"wait"` + + // cachePolicy is the cache policy. It may be either cacheAll or cacheNone. 
+ cachePolicy cachePolicy `state:"wait"` + + // aname is the value of the aname mount option, see fs/gofer/fs.go. + aname string `state:"wait"` + + // The client associated with this session. This will be initialized lazily. + client *p9.Client `state:"nosave"` + + // The p9.File pointing to attachName via the client. This will be initialized + // lazily. + attach contextFile `state:"nosave"` + + // Flags provided to the mount. + superBlockFlags fs.MountSourceFlags `state:"wait"` + + // connID is a unique identifier for the session connection. + connID string `state:"wait"` + + // inodeMappings contains mappings of fs.Inodes associated with this session + // to paths relative to the attach point, where inodeMappings is keyed by + // Inode.StableAttr.InodeID. + inodeMappings map[uint64]string `state:"wait"` + + // mounter is the EUID/EGID that mounted this file system. + mounter fs.FileOwner `state:"wait"` + + // endpoints is used to map inodes that represent socket files to their + // corresponding endpoint. Socket files are created as regular files in the + // gofer and their presence in this map indicate that they should indeed be + // socket files. This allows unix domain sockets to be used with paths that + // belong to a gofer. + // + // TODO: there are few possible races with someone stat'ing the + // file and another deleting it concurrently, where the file will not be + // reported as socket file. + endpoints *endpointMap `state:"wait"` +} + +// Destroy tears down the session. +func (s *session) Destroy() { + s.conn.Close() +} + +// Revalidate returns true if the cache policy is does not allow for VFS caching. +func (s *session) Revalidate(*fs.Dirent) bool { + return s.cachePolicy == cacheNone +} + +// TakeRefs takes an extra reference on dirent if possible. +func (s *session) Keep(dirent *fs.Dirent) bool { + // NOTE: Only cache files and directories. 
+ sattr := dirent.Inode.StableAttr + return s.cachePolicy != cacheNone && (fs.IsFile(sattr) || fs.IsDir(sattr)) +} + +// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. +func (s *session) ResetInodeMappings() { + s.inodeMappings = make(map[uint64]string) +} + +// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping. +func (s *session) SaveInodeMapping(inode *fs.Inode, path string) { + // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs, + // because overlay copyUp may have changed them out from under us. + // So much for "immutable". + sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr + s.inodeMappings[sattr.InodeID] = path +} + +// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes +// (p9.QID, p9.AttrMask, p9.Attr). +func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) { + deviceKey := device.MultiDeviceKey{ + Device: attr.RDev, + SecondaryDevice: s.connID, + Inode: qid.Path, + } + + sattr := fs.StableAttr{ + Type: ntype(attr), + DeviceID: goferDevice.DeviceID(), + InodeID: goferDevice.Map(deviceKey), + BlockSize: bsize(attr), + } + + if s.endpoints != nil { + // If unix sockets are allowed on this filesystem, check if this file is + // supposed to be a socket file. + if s.endpoints.get(deviceKey) != nil { + sattr.Type = fs.Socket + } + } + + fileState := &inodeFileState{ + s: s, + file: file, + sattr: sattr, + key: deviceKey, + } + + uattr := unstable(ctx, valid, attr, s.mounter, s.client) + return sattr, &inodeOperations{ + fileState: fileState, + cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, s.superBlockFlags.ForcePageCache), + } +} + +// Root returns the root of a 9p mount. This mount is bound to a 9p server +// based on conn. 
Otherwise configuration parameters are: +// +// * dev: connection id +// * filesystem: the filesystem backing the mount +// * superBlockFlags: the mount flags describing general mount options +// * opts: parsed 9p mount options +func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockFlags fs.MountSourceFlags, o opts) (*fs.Inode, error) { + // The mounting EUID/EGID will be cached by this file system. This will + // be used to assign ownership to files that the Gofer owns. + mounter := fs.FileOwnerFromContext(ctx) + + conn, err := unet.NewSocket(o.fd) + if err != nil { + return nil, err + } + + // Construct the session. + s := &session{ + connID: dev, + conn: conn, + msize: o.msize, + version: o.version, + cachePolicy: o.policy, + aname: o.aname, + superBlockFlags: superBlockFlags, + mounter: mounter, + } + + if o.privateunixsocket { + s.endpoints = &endpointMap{m: make(map[device.MultiDeviceKey]unix.BoundEndpoint)} + } + + // Construct the MountSource with the session and superBlockFlags. + m := fs.NewMountSource(s, filesystem, superBlockFlags) + + // Send the Tversion request. + s.client, err = p9.NewClient(s.conn, s.msize, s.version) + if err != nil { + // Drop our reference on the session, it needs to be torn down. + s.DecRef() + return nil, err + } + + // Notify that we're about to call the Gofer and block. + ctx.UninterruptibleSleepStart(false) + // Send the Tattach request. + s.attach.file, err = s.client.Attach(s.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + // Same as above. + s.DecRef() + return nil, err + } + + qid, valid, attr, err := s.attach.getAttr(ctx, p9.AttrMaskAll()) + if err != nil { + s.attach.close(ctx) + // Same as above, but after we execute the Close request. 
+ s.DecRef() + return nil, err + } + + sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr) + return fs.NewInode(iops, m, sattr), nil +} diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go new file mode 100644 index 000000000..4d993a219 --- /dev/null +++ b/pkg/sentry/fs/gofer/session_state.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +// afterLoad is invoked by stateify. +func (s *session) afterLoad() { + // The restore environment contains the 9p connection of this mount. + fsys := filesystem{} + env, ok := fs.CurrentRestoreEnvironment() + if !ok { + panic("failed to find restore environment") + } + mounts, ok := env.MountSources[fsys.Name()] + if !ok { + panic("failed to find mounts for filesystem type " + fsys.Name()) + } + var args fs.MountArgs + var found bool + for _, mount := range mounts { + if mount.Dev == s.connID { + args = mount + found = true + } + } + if !found { + panic(fmt.Sprintf("no connection for connection id %q", s.connID)) + } + + // Validate the mount flags and options. 
+ opts, err := options(args.Data) + if err != nil { + panic("failed to parse mount options: " + err.Error()) + } + if opts.msize != s.msize { + panic(fmt.Sprintf("new message size %v, want %v", opts.msize, s.msize)) + } + if opts.version != s.version { + panic(fmt.Sprintf("new version %v, want %v", opts.version, s.version)) + } + if opts.policy != s.cachePolicy { + panic(fmt.Sprintf("new cache policy %v, want %v", opts.policy, s.cachePolicy)) + } + if opts.aname != s.aname { + panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname)) + } + if opts.privateunixsocket != (s.endpoints != nil) { + panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil)) + } + if args.Flags != s.superBlockFlags { + panic(fmt.Sprintf("new mount flags %v, want %v", args.Flags, s.superBlockFlags)) + } + + // Manually restore the connection. + s.conn, err = unet.NewSocket(opts.fd) + if err != nil { + panic(fmt.Sprintf("failed to create Socket for FD %d: %v", opts.fd, err)) + } + + // Manually restore the client. + s.client, err = p9.NewClient(s.conn, s.msize, s.version) + if err != nil { + panic(fmt.Sprintf("failed to connect client to server: %v", err)) + } + + // Manually restore the attach point. + s.attach.file, err = s.client.Attach(s.aname) + if err != nil { + panic(fmt.Sprintf("failed to attach to aname: %v", err)) + } +} diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go new file mode 100644 index 000000000..954000ef0 --- /dev/null +++ b/pkg/sentry/fs/gofer/socket.go @@ -0,0 +1,127 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// BoundEndpoint returns a gofer-backed unix.BoundEndpoint. +func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint { + if !fs.IsSocket(i.fileState.sattr) { + return nil + } + + if i.session().endpoints != nil { + ep := i.session().endpoints.get(i.fileState.key) + if ep != nil { + return ep + } + + // Not found in endpoints map, it may be a gofer backed unix socket... + } + + inode.IncRef() + return &endpoint{inode, i.fileState.file.file, path} +} + +// endpoint is a Gofer-backed unix.BoundEndpoint. +// +// An endpoint's lifetime is the time between when InodeOperations.BoundEndpoint() +// is called and either BoundEndpoint.BidirectionalConnect or +// BoundEndpoint.UnidirectionalConnect is called. +type endpoint struct { + // inode is the filesystem inode which produced this endpoint. + inode *fs.Inode + + // file is the p9 file that contains a single unopened fid. + file p9.File + + // path is the sentry path where this endpoint is bound. 
+ path string +} + +func unixSockToP9(t unix.SockType) (p9.ConnectFlags, bool) { + switch t { + case unix.SockStream: + return p9.StreamSocket, true + case unix.SockSeqpacket: + return p9.SeqpacketSocket, true + case unix.SockDgram: + return p9.DgramSocket, true + } + return 0, false +} + +// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. +func (e *endpoint) BidirectionalConnect(ce unix.ConnectingEndpoint, returnConnect func(unix.Receiver, unix.ConnectedEndpoint)) *tcpip.Error { + cf, ok := unixSockToP9(ce.Type()) + if !ok { + return tcpip.ErrConnectionRefused + } + + // No lock ordering required as only the ConnectingEndpoint has a mutex. + ce.Lock() + defer ce.Unlock() + + // Check connecting state. + if ce.Connected() { + return tcpip.ErrAlreadyConnected + } + if ce.Listening() { + return tcpip.ErrInvalidEndpointState + } + + hostFile, err := e.file.Connect(cf) + if err != nil { + return tcpip.ErrConnectionRefused + } + + r, c, terr := host.NewConnectedEndpoint(hostFile, ce.WaiterQueue(), e.path) + if terr != nil { + return terr + } + returnConnect(r, c) + return nil +} + +// UnidirectionalConnect implements unix.BoundEndpoint.UnidirectionalConnect. +func (e *endpoint) UnidirectionalConnect() (unix.ConnectedEndpoint, *tcpip.Error) { + hostFile, err := e.file.Connect(p9.DgramSocket) + if err != nil { + return nil, tcpip.ErrConnectionRefused + } + + r, c, terr := host.NewConnectedEndpoint(hostFile, &waiter.Queue{}, e.path) + if terr != nil { + return nil, terr + } + + // We don't need the receiver. + r.CloseRecv() + r.Release() + + return c, nil +} + +// Release implements unix.BoundEndpoint.Release. +func (e *endpoint) Release() { + e.inode.DecRef() +} diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go new file mode 100644 index 000000000..d9ed8c81e --- /dev/null +++ b/pkg/sentry/fs/gofer/util.go @@ -0,0 +1,60 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +func utimes(ctx context.Context, file contextFile, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + mask := p9.SetAttrMask{ + ATime: !ts.ATimeOmit, + ATimeNotSystemTime: !ts.ATimeSetSystemTime, + MTime: !ts.MTimeOmit, + MTimeNotSystemTime: !ts.MTimeSetSystemTime, + } + as, ans := ts.ATime.Unix() + ms, mns := ts.MTime.Unix() + attr := p9.SetAttr{ + ATimeSeconds: uint64(as), + ATimeNanoSeconds: uint64(ans), + MTimeSeconds: uint64(ms), + MTimeNanoSeconds: uint64(mns), + } + // 9p2000.L SetAttr: "If a time bit is set without the corresponding SET bit, + // the current system time on the server is used instead of the value sent + // in the request." 
+ return file.setAttr(ctx, mask, attr) +} + +func openFlagsFromPerms(p fs.PermMask) (p9.OpenFlags, error) { + switch { + case p.Read && p.Write: + return p9.ReadWrite, nil + case p.Write: + return p9.WriteOnly, nil + case p.Read: + return p9.ReadOnly, nil + default: + return 0, syscall.EINVAL + } +} diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD new file mode 100644 index 000000000..97b64daed --- /dev/null +++ b/pkg/sentry/fs/host/BUILD @@ -0,0 +1,104 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "host_state", + srcs = [ + "control.go", + "descriptor.go", + "descriptor_state.go", + "file.go", + "fs.go", + "inode.go", + "inode_state.go", + "socket.go", + "socket_state.go", + ], + out = "host_state.go", + package = "host", +) + +go_library( + name = "host", + srcs = [ + "control.go", + "descriptor.go", + "descriptor_state.go", + "device.go", + "file.go", + "fs.go", + "host_state.go", + "inode.go", + "inode_state.go", + "ioctl_unsafe.go", + "socket.go", + "socket_state.go", + "socket_unsafe.go", + "util.go", + "util_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fd", + "//pkg/log", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/link/rawfile", + 
"//pkg/tcpip/transport/unix", + "//pkg/unet", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "host_test", + size = "small", + srcs = [ + "fs_test.go", + "inode_test.go", + "socket_test.go", + "wait_test.go", + ], + embed = [":host"], + deps = [ + "//pkg/fd", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/socket", + "//pkg/sentry/usermem", + "//pkg/syserr", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + ], +) diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go new file mode 100644 index 000000000..d2b007ab2 --- /dev/null +++ b/pkg/sentry/fs/host/control.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +type scmRights struct { + fds []int +} + +func newSCMRights(fds []int) control.SCMRights { + return &scmRights{fds} +} + +// Files implements control.SCMRights.Files. 
+func (c *scmRights) Files(ctx context.Context, max int) control.RightsFiles { + n := max + if l := len(c.fds); n > l { + n = l + } + + rf := control.RightsFiles(fdsToFiles(ctx, c.fds[:n])) + + // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). + c.fds = c.fds[len(rf):] + return rf +} + +// Clone implements unix.RightsControlMessage.Clone. +func (c *scmRights) Clone() unix.RightsControlMessage { + // Host rights never need to be cloned. + return nil +} + +// Release implements unix.RightsControlMessage.Release. +func (c *scmRights) Release() { + for _, fd := range c.fds { + syscall.Close(fd) + } + c.fds = nil +} + +// If an error is encountered, only files created before the error will be +// returned. This is what Linux does. +func fdsToFiles(ctx context.Context, fds []int) []*fs.File { + files := make([]*fs.File, 0, len(fds)) + for _, fd := range fds { + // Get flags. We do it here because they may be modified + // by subsequent functions. + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) + break + } + + // Create the file backed by hostFD. + file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx)) + if err != nil { + ctx.Warningf("Error creating file from host FD: %v", err) + break + } + + // Set known flags. + file.SetFlags(fs.SettableFileFlags{ + NonBlocking: fileFlags&syscall.O_NONBLOCK != 0, + }) + + files = append(files, file) + } + return files +} diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go new file mode 100644 index 000000000..613bd06e8 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor.go @@ -0,0 +1,118 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// descriptor wraps a host fd. +type descriptor struct { + // donated is true if the host fd was donated by another process. + donated bool + + // If origFD >= 0, it is the host fd that this file was + // originally created from, which must be available at time + // of restore. Only valid if donated is true. + origFD int + + // wouldBlock is true if value (below) points to a file that can + // return EWOULDBLOCK for operations that would block. + wouldBlock bool + + // value is the wrapped host fd. It is never saved or restored + // directly. How it is restored depends on whether it was + // donated and the fs.MountSource it was originally + // opened/created from. + value int `state:"nosave"` +} + +// newDescriptor returns a wrapped host file descriptor. On success, +// the descriptor is registered for event notifications with queue. 
+func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) { + ownedFD := fd + origFD := -1 + if saveable { + var err error + ownedFD, err = syscall.Dup(fd) + if err != nil { + return nil, err + } + origFD = fd + } + if wouldBlock { + if err := syscall.SetNonblock(ownedFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(ownedFD), queue); err != nil { + return nil, err + } + } + return &descriptor{ + donated: donated, + origFD: origFD, + wouldBlock: wouldBlock, + value: ownedFD, + }, nil +} + +// initAfterLoad initializes the value of the descriptor after Load. +func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error { + if d.donated { + var err error + d.value, err = syscall.Dup(d.origFD) + if err != nil { + return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err) + } + } else { + name, ok := mo.inodeMappings[id] + if !ok { + return fmt.Errorf("failed to find path for inode number %d", id) + } + fullpath := path.Join(mo.root, name) + + var err error + d.value, err = open(nil, fullpath) + if err != nil { + return fmt.Errorf("failed to open %q: %v", fullpath, err) + } + } + if d.wouldBlock { + if err := syscall.SetNonblock(d.value, true); err != nil { + return err + } + if err := fdnotifier.AddFD(int32(d.value), queue); err != nil { + return err + } + } + return nil +} + +// Release releases all resources held by descriptor. +func (d *descriptor) Release() { + if d.wouldBlock { + fdnotifier.RemoveFD(int32(d.value)) + } + if err := syscall.Close(d.value); err != nil { + log.Warningf("error closing fd %d: %v", d.value, err) + } + d.value = -1 +} diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go new file mode 100644 index 000000000..7fb274451 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -0,0 +1,29 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +// beforeSave is invoked by stateify. +func (d *descriptor) beforeSave() { + if d.donated && d.origFD < 0 { + panic("donated file descriptor cannot be saved") + } +} + +// afterLoad is invoked by stateify. +func (d *descriptor) afterLoad() { + // value must be manually restored by the descriptor's parent using + // initAfterLoad. + d.value = -1 +} diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go new file mode 100644 index 000000000..f2a0b6b15 --- /dev/null +++ b/pkg/sentry/fs/host/device.go @@ -0,0 +1,25 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// hostFileDevice is the host file virtual device. +var hostFileDevice = device.NewAnonMultiDevice() + +// hostPipeDevice is the host pipe virtual device. 
+var hostPipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go new file mode 100644 index 000000000..bdf844337 --- /dev/null +++ b/pkg/sentry/fs/host/file.go @@ -0,0 +1,371 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// fileOperations implements fs.FileOperations for a host file descriptor. +type fileOperations struct { + fsutil.NoopRelease `state:"nosave"` + + // iops are the Inode operations for this file. + iops *inodeOperations `state:"wait"` + + // a scratch buffer for reading directory entries. 
+ dirinfo *dirInfo `state:"nosave"` + + // dirCursor is the directory cursor. + dirCursor string + + // allowIoctl determines whether ioctls should be passed through to the + // host. + allowIoctl bool +} + +// fileOperations implements fs.FileOperations. +var _ fs.FileOperations = (*fileOperations)(nil) + +// NewFile creates a new File backed by the provided host file descriptor. If +// NewFile succeeds, ownership of the fd is transferred to the returned File. +// +// The returned File cannot be saved, since there is no guarantee that the same +// fd will exist or represent the same file at time of restore. If such a +// guarantee does exist, use ImportFile instead. +func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, false, false) +} + +// ImportFile creates a new File backed by the provided host file descriptor. +// Unlike NewFile, the file descriptor used by the File is duped from fd to +// ensure that later changes to fd are not reflected by the fs.File. +// +// If the returned file is saved, it will be restored by re-importing the fd +// originally passed to ImportFile. It is the restorer's responsibility to +// ensure that the fd represents the same file. +func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, allowIoctl bool) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, true, allowIoctl) +} + +// newFileFromDonatedFD returns an fs.File from a donated fd. If the fd is +// saveable, then saveable is true. 
+func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, allowIoctl bool) (*fs.File, error) { + var s syscall.Stat_t + if err := syscall.Fstat(donated, &s); err != nil { + return nil, err + } + switch s.Mode & syscall.S_IFMT { + case syscall.S_IFSOCK: + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } + s, err := newSocket(ctx, donated, saveable) + if err != nil { + return nil, err + } + s.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags.NonBlocking, + }) + return s, nil + default: + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } + msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */) + inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */) + if err != nil { + return nil, err + } + iops := inode.InodeOperations.(*inodeOperations) + + name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID) + dirent := fs.NewDirent(inode, name) + defer dirent.DecRef() + + return newFile(ctx, dirent, flags, iops, allowIoctl), nil + } +} + +func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) { + flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0) + if errno != 0 { + log.Warningf("Failed to get file flags for donated fd %d (errno=%d)", donated, errno) + return fs.FileFlags{}, syscall.EIO + } + accmode := flags & syscall.O_ACCMODE + return fs.FileFlags{ + Direct: flags&syscall.O_DIRECT != 0, + NonBlocking: flags&syscall.O_NONBLOCK != 0, + Sync: flags&syscall.O_SYNC != 0, + Append: flags&syscall.O_APPEND != 0, + Read: accmode == syscall.O_RDONLY || accmode == syscall.O_RDWR, + Write: accmode == syscall.O_WRONLY || accmode == syscall.O_RDWR, + }, nil +} + +// newFile returns a new fs.File. 
+func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations, allowIoctl bool) *fs.File { + if !iops.ReturnsWouldBlock() { + // Allow reading/writing at an arbitrary offset for files + // that support it. + flags.Pread = true + flags.Pwrite = true + } + return fs.NewFile(ctx, dirent, flags, &fileOperations{ + iops: iops, + allowIoctl: allowIoctl, + }) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *fileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.iops.fileState.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *fileOperations) EventUnregister(e *waiter.Entry) { + f.iops.fileState.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) +} + +// Readiness uses the poll() syscall to check the status of the underlying FD. +func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(f.iops.fileState.FD()), mask) +} + +// Readdir implements fs.FileOperations.Readdir. +func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &f.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) +} + +// IterateDir implements fs.DirIterator.IterateDir. +func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + if f.dirinfo == nil { + f.dirinfo = new(dirInfo) + f.dirinfo.buf = make([]byte, usermem.PageSize) + } + entries, err := f.iops.readdirAll(f.dirinfo) + if err != nil { + return offset, err + } + count, err := fs.GenericReaddir(dirCtx, fs.NewSortedDentryMap(entries)) + return offset + count, err +} + +// Write implements fs.FileOperations.Write. 
+func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + // Would this file block? + if f.iops.ReturnsWouldBlock() { + // These files can't be memory mapped, assert this. This also + // means that writes do not need to synchronize with memory + // mappings nor metadata cached by this file's fs.Inode. + if canMap(file.Dirent.Inode) { + panic("files that can return EWOULDBLOCK cannot be memory mapped") + } + // Ignore the offset, these files don't support writing at + // an arbitrary offset. + writer := fd.NewReadWriter(f.iops.fileState.FD()) + n, err := src.CopyInTo(ctx, safemem.FromIOWriter{writer}) + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + if !file.Dirent.Inode.MountSource.Flags.ForcePageCache { + writer := secio.NewOffsetWriter(fd.NewReadWriter(f.iops.fileState.FD()), offset) + return src.CopyInTo(ctx, safemem.FromIOWriter{writer}) + } + return f.iops.cachingInodeOps.Write(ctx, src, offset) +} + +// Read implements fs.FileOperations.Read. +func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + // Would this file block? + if f.iops.ReturnsWouldBlock() { + // These files can't be memory mapped, assert this. This also + // means that reads do not need to synchronize with memory + // mappings nor metadata cached by this file's fs.Inode. + if canMap(file.Dirent.Inode) { + panic("files that can return EWOULDBLOCK cannot be memory mapped") + } + // Ignore the offset, these files don't support reading at + // an arbitrary offset. + reader := fd.NewReadWriter(f.iops.fileState.FD()) + n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{reader}) + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. 
+ if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + if !file.Dirent.Inode.MountSource.Flags.ForcePageCache { + reader := secio.NewOffsetReader(fd.NewReadWriter(f.iops.fileState.FD()), offset) + return dst.CopyOutFrom(ctx, safemem.FromIOReader{reader}) + } + return f.iops.cachingInodeOps.Read(ctx, file, dst, offset) +} + +// Fsync implements fs.FileOperations.Fsync. +func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { + switch syncType { + case fs.SyncAll, fs.SyncData: + if err := file.Dirent.Inode.WriteOut(ctx); err != nil { + return err + } + fallthrough + case fs.SyncBackingStorage: + return syscall.Fsync(f.iops.fileState.FD()) + } + panic("invalid sync type") +} + +// Flush implements fs.FileOperations.Flush. +func (f *fileOperations) Flush(context.Context, *fs.File) error { + // This is a no-op because flushing the resource backing this + // file would mean closing it. We can't do that because other + // open files may depend on the backing host fd. + return nil +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + if !canMap(file.Dirent.Inode) { + return syserror.ENODEV + } + return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts) +} + +// Seek implements fs.FileOperations.Seek. +func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + if !f.allowIoctl { + return 0, syserror.ENOTTY + } + // Ignore arg[0]. 
This is the real FD: + fd := f.iops.fileState.FD() + ioctl := args[1].Uint64() + switch ioctl { + case unix.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TCSETS, unix.TCSETSW: + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + return 0, err + + case unix.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on + // this terminal. + + t := kernel.TaskFromContext(ctx) + if t == nil { + panic(fmt.Sprintf("cannot get thread group from context %v", ctx)) + } + tid := t.ThreadID() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tid, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + // Not much we can do with this one at the moment, so we just + // lie and pretend everything is great. Bash and Sh seem fine + // with this. + log.Warningf("Ignoring application ioctl(TIOCSPGRP) call") + return 0, nil + + case unix.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := unix.IoctlGetWinsize(fd, unix.TIOCGWINSZ) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. 
+ var winsize unix.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := unix.IoctlSetWinsize(fd, unix.TIOCSWINSZ, &winsize) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go new file mode 100644 index 000000000..ffd55a5ab --- /dev/null +++ b/pkg/sentry/fs/host/fs.go @@ -0,0 +1,327 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package host implements an fs.Filesystem for files backed by host +// file descriptors. +package host + +import ( + "fmt" + "path" + "path/filepath" + "strconv" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// FilesystemName is the name under which Filesystem is registered. +const FilesystemName = "whitelistfs" + +const ( + // whitelistKey is the mount option containing a comma-separated list + // of host paths to whitelist. + whitelistKey = "whitelist" + + // rootPathKey is the mount option containing the root path of the + // mount. + rootPathKey = "root" + + // dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership. + dontTranslateOwnershipKey = "dont_translate_ownership" +) + +// maxTraversals determines link traversals in building the whitelist. 
+const maxTraversals = 10 + +// Filesystem is a pseudo file system that is only available during the setup +// to lock down the configurations. This filesystem should only be mounted at root. +// +// Think twice before exposing this to applications. +type Filesystem struct { + // whitelist is a set of host paths to whitelist. + paths []string +} + +// Name is the identifier of this file system. +func (*Filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount prohibits users from using mount(2) with this file system. +func (*Filesystem) AllowUserMount() bool { + return false +} + +// Flags returns that there is nothing special about this file system. +func (*Filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns an fs.Inode exposing the host file system. It is intended to be locked +// down in PreExec below. +func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // Parse generic comma-separated key=value options. + options := fs.GenericMountSourceOptions(data) + + // Grab the whitelist if one was specified. + // TODO: require another option "testonly" in order to allow + // no whitelist. + if wl, ok := options[whitelistKey]; ok { + f.paths = strings.Split(wl, "|") + delete(options, whitelistKey) + } + + // If the rootPath was set, use it. Otherwise default to the root of the + // host fs. + rootPath := "/" + if rp, ok := options[rootPathKey]; ok { + rootPath = rp + delete(options, rootPathKey) + + // We must relativize the whitelisted paths to the new root. 
+ for i, p := range f.paths { + rel, err := filepath.Rel(rootPath, p) + if err != nil { + return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath) + } + f.paths[i] = path.Join("/", rel) + } + } + fd, err := open(nil, rootPath) + if err != nil { + return nil, fmt.Errorf("failed to find root: %v", err) + } + + var dontTranslateOwnership bool + if v, ok := options[dontTranslateOwnershipKey]; ok { + b, err := strconv.ParseBool(v) + if err != nil { + return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err) + } + dontTranslateOwnership = b + delete(options, dontTranslateOwnershipKey) + } + + // Fail if the caller passed us more options than we know about. + if len(options) > 0 { + return nil, fmt.Errorf("unsupported mount options: %v", options) + } + + // The mounting EUID/EGID will be cached by this file system. This will + // be used to assign ownership to files that we own. + owner := fs.FileOwnerFromContext(ctx) + + // Construct the host file system mount and inode. + msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership) + return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */) +} + +// InstallWhitelist locks down the MountNamespace to only the currently installed +// Dirents and the given paths. +func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error { + return installWhitelist(ctx, m, f.paths) +} + +func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error { + if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") { + // Warning will be logged during filter installation if the empty + // whitelist matters (allows for host file access). + return nil + } + + // Done tracks entries already added. + done := make(map[string]bool) + root := m.Root() + defer root.DecRef() + + for i := 0; i < len(paths); i++ { + // Make sure the path is absolute. This is a sanity check. 
+ if !path.IsAbs(paths[i]) { + return fmt.Errorf("path %q is not absolute", paths[i]) + } + + // We need to add all the intermediate paths, in case one of + // them is a symlink that needs to be resolved. + for j := 1; j <= len(paths[i]); j++ { + if j < len(paths[i]) && paths[i][j] != '/' { + continue + } + current := paths[i][:j] + + // Lookup the given component in the tree. + d, err := m.FindLink(ctx, root, nil, current, maxTraversals) + if err != nil { + log.Warningf("populate failed for %q: %v", current, err) + continue + } + + // It's critical that this DecRef happens after the + // freeze below. This ensures that the dentry is in + // place to be frozen. Otherwise, we freeze without + // these entries. + defer d.DecRef() + + // Expand the last component if necessary. + if current == paths[i] { + // Is it a directory or symlink? + sattr := d.Inode.StableAttr + if fs.IsDir(sattr) { + for name := range childDentAttrs(ctx, d) { + paths = append(paths, path.Join(current, name)) + } + } + if fs.IsSymlink(sattr) { + // Only expand symlinks once. The + // folder structure may contain + // recursive symlinks and we don't want + // to end up infinitely expanding this + // symlink. This is safe because this + // is the last component. If a later + // path wants to symlink something + // beneath this symlink that will still + // be handled by the FindLink above. + if done[current] { + continue + } + + s, err := d.Inode.Readlink(ctx) + if err != nil { + log.Warningf("readlink failed for %q: %v", current, err) + continue + } + if path.IsAbs(s) { + paths = append(paths, s) + } else { + target := path.Join(path.Dir(current), s) + paths = append(paths, target) + } + } + } + + // Only report this one once even though we may look + // it up more than once. If we whitelist /a/b,/a then + // /a will be "done" when it is looked up for /a/b, + // however we still need to expand all of its contents + // when whitelisting /a. 
+ if !done[current] { + log.Debugf("whitelisted: %s", current) + } + done[current] = true + } + } + + // Freeze the mount tree in place. This prevents any new paths from + // being opened and any old ones from being removed. If we do provide + // tmpfs mounts, we'll want to freeze/thaw those separately. + m.Freeze() + return nil +} + +func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr { + dirname, _ := d.FullName(nil /* root */) + dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + log.Warningf("failed to open directory %q: %v", dirname, err) + return nil + } + dir.DecRef() + var stubSerializer fs.CollectEntriesSerializer + if err := dir.Readdir(ctx, &stubSerializer); err != nil { + log.Warningf("failed to iterate on host directory %q: %v", dirname, err) + return nil + } + delete(stubSerializer.Entries, ".") + delete(stubSerializer.Entries, "..") + return stubSerializer.Entries +} + +// newMountSource constructs a new host fs.MountSource +// relative to a root path. The root should match the mount point. +func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource { + return fs.NewMountSource(&superOperations{ + root: root, + inodeMappings: make(map[uint64]string), + mounter: mounter, + dontTranslateOwnership: dontTranslateOwnership, + }, filesystem, flags) +} + +// superOperations implements fs.MountSourceOperations. +type superOperations struct { + fs.SimpleMountSourceOperations `state:"nosave"` + + // root is the path of the mount point. All inode mappings + // are relative to this root. + root string + + // inodeMappings contains mappings of fs.Inodes associated + // with this MountSource to paths under root. + inodeMappings map[uint64]string + + // mounter is the cached EUID/EGID that mounted this file system. 
+ mounter fs.FileOwner + + // dontTranslateOwnership indicates whether to not translate file + // ownership. + // + // By default, files/directories owned by the sandbox uses UID/GID + // of the mounter. For files/directories that are not owned by the + // sandbox, file UID/GID is translated to a UID/GID which cannot + // be mapped in the sandboxed application's user namespace. The + // UID/GID will look like the nobody UID/GID (65534) but is not + // strictly owned by the user "nobody". + // + // If whitelistfs is a lower filesystem in an overlay, set + // dont_translate_ownership=true in mount options. + dontTranslateOwnership bool +} + +var _ fs.MountSourceOperations = (*superOperations)(nil) + +// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. +func (m *superOperations) ResetInodeMappings() { + m.inodeMappings = make(map[uint64]string) +} + +// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping. +func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) { + // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs, + // because overlay copyUp may have changed them out from under us. + // So much for "immutable". + sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr + m.inodeMappings[sattr.InodeID] = path +} + +// Keep implements fs.MountSourceOperations.Keep. +// +// TODO: It is possible to change the permissions on a +// host file while it is in the dirent cache (say from RO to RW), but it is not +// possible to re-open the file with more relaxed permissions, since the host +// FD is already open and stored in the inode. +// +// Using the dirent LRU cache increases the odds that this bug is encountered. +// Since host file access is relatively fast anyways, we disable the LRU cache +// for host fs files. Once we can properly deal with permissions changes and +// re-opening host files, we should revisit whether or not to make use of the +// LRU cache. 
+func (*superOperations) Keep(*fs.Dirent) bool { + return false +} + +func init() { + fs.RegisterFilesystem(&Filesystem{}) +} diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go new file mode 100644 index 000000000..c000afc49 --- /dev/null +++ b/pkg/sentry/fs/host/fs_test.go @@ -0,0 +1,383 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "reflect" + "sort" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// newTestMountNamespace creates a MountNamespace with a ramfs root. +// It returns the host folder created, which should be removed when done. 
+func newTestMountNamespace(t *testing.T) (*fs.MountNamespace, string, error) { + p, err := ioutil.TempDir("", "root") + if err != nil { + return nil, "", err + } + + fd, err := open(nil, p) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + ctx := contexttest.Context(t) + root, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + mm, err := fs.NewMountNamespace(ctx, root) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + return mm, p, nil +} + +// createTestDirs populates the root with some test files and directories. +// /a/a1.txt +// /a/a2.txt +// /b/b1.txt +// /b/c/c1.txt +// /symlinks/normal.txt +// /symlinks/to_normal.txt -> /symlinks/normal.txt +// /symlinks/recursive -> /symlinks +func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) error { + r := m.Root() + defer r.DecRef() + + if err := r.CreateDirectory(ctx, r, "a", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + a, err := r.Walk(ctx, r, "a") + if err != nil { + return err + } + defer a.DecRef() + + a1, err := a.Create(ctx, r, "a1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + a1.DecRef() + + a2, err := a.Create(ctx, r, "a2.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + a2.DecRef() + + if err := r.CreateDirectory(ctx, r, "b", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + b, err := r.Walk(ctx, r, "b") + if err != nil { + return err + } + defer b.DecRef() + + b1, err := b.Create(ctx, r, "b1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + b1.DecRef() + + if err := b.CreateDirectory(ctx, r, "c", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + c, err := b.Walk(ctx, r, "c") + if err != nil { + 
return err + } + defer c.DecRef() + + c1, err := c.Create(ctx, r, "c1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + c1.DecRef() + + if err := r.CreateDirectory(ctx, r, "symlinks", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + symlinks, err := r.Walk(ctx, r, "symlinks") + if err != nil { + return err + } + defer symlinks.DecRef() + + normal, err := symlinks.Create(ctx, r, "normal.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + normal.DecRef() + + if err := symlinks.CreateLink(ctx, r, "/symlinks/normal.txt", "to_normal.txt"); err != nil { + return err + } + + if err := symlinks.CreateLink(ctx, r, "/symlinks", "recursive"); err != nil { + return err + } + + return nil +} + +// allPaths returns a slice of all paths of entries visible in the rootfs. +func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base string) ([]string, error) { + var paths []string + root := m.Root() + defer root.DecRef() + + d, err := m.FindLink(ctx, root, nil, base, 1) + if err != nil { + t.Logf("FindLink failed for %q", base) + return paths, err + } + defer d.DecRef() + + if fs.IsDir(d.Inode.StableAttr) { + dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + return nil, fmt.Errorf("failed to open directory %q: %v", base, err) + } + iter, ok := dir.FileOperations.(fs.DirIterator) + if !ok { + return nil, fmt.Errorf("cannot directly iterate on host directory %q", base) + } + dirCtx := &fs.DirCtx{ + Serializer: noopDentrySerializer{}, + } + if _, err := fs.DirentReaddir(ctx, d, iter, root, dirCtx, 0); err != nil { + return nil, err + } + for name := range dirCtx.DentAttrs() { + if name == "." || name == ".." { + continue + } + + fullName := path.Join(base, name) + paths = append(paths, fullName) + + // Recurse. 
+ subpaths, err := allPaths(ctx, t, m, fullName) + if err != nil { + return paths, err + } + paths = append(paths, subpaths...) + } + } + + return paths, nil +} + +type noopDentrySerializer struct{} + +func (noopDentrySerializer) CopyOut(string, fs.DentAttr) error { + return nil +} +func (noopDentrySerializer) Written() int { + return 4096 +} + +// pathsEqual returns true if the two string slices contain the same entries. +func pathsEqual(got, want []string) bool { + sort.Strings(got) + sort.Strings(want) + + if len(got) != len(want) { + return false + } + + for i := range got { + if got[i] != want[i] { + return false + } + } + + return true +} + +func TestWhitelist(t *testing.T) { + for _, test := range []struct { + // description of the test. + desc string + // paths are the paths to whitelist + paths []string + // want are all of the directory entries that should be + // visible (nothing beyond this set should be visible). + want []string + }{ + { + desc: "root", + paths: []string{"/"}, + want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt", "/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt", "/symlinks/recursive"}, + }, + { + desc: "top-level directories", + paths: []string{"/a", "/b"}, + want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "nested directories (1/2)", + paths: []string{"/b", "/b/c"}, + want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "nested directories (2/2)", + paths: []string{"/b/c", "/b"}, + want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "single file", + paths: []string{"/b/c/c1.txt"}, + want: []string{"/b", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "single file and directory", + paths: []string{"/a/a1.txt", "/b/c"}, + want: []string{"/a", "/a/a1.txt", "/b", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "symlink", + paths: []string{"/symlinks/to_normal.txt"}, + want: []string{"/symlinks", 
"/symlinks/normal.txt", "/symlinks/to_normal.txt"}, + }, + { + desc: "recursive symlink", + paths: []string{"/symlinks/recursive/normal.txt"}, + want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/recursive"}, + }, + } { + t.Run(test.desc, func(t *testing.T) { + m, p, err := newTestMountNamespace(t) + if err != nil { + t.Errorf("Failed to create MountNamespace: %v", err) + } + defer os.RemoveAll(p) + + ctx := withRoot(contexttest.RootContext(t), m.Root()) + if err := createTestDirs(ctx, t, m); err != nil { + t.Errorf("Failed to create test dirs: %v", err) + } + + if err := installWhitelist(ctx, m, test.paths); err != nil { + t.Errorf("installWhitelist(%v) err got %v want nil", test.paths, err) + } + + got, err := allPaths(ctx, t, m, "/") + if err != nil { + t.Fatalf("Failed to lookup paths (whitelisted: %v): %v", test.paths, err) + } + + if !pathsEqual(got, test.want) { + t.Errorf("For paths %v got %v want %v", test.paths, got, test.want) + } + }) + } +} + +func TestRootPath(t *testing.T) { + // Create a temp dir, which will be the root of our mounted fs. + rootPath, err := ioutil.TempDir(os.TempDir(), "root") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + defer os.RemoveAll(rootPath) + + // Create two files inside the new root, one which will be whitelisted + // and one not. + whitelisted, err := ioutil.TempFile(rootPath, "white") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + if _, err := ioutil.TempFile(rootPath, "black"); err != nil { + t.Fatalf("TempFile failed: %v", err) + } + + // Create a mount with a root path and single whitelisted file. 
+ hostFS := &Filesystem{} + ctx := contexttest.Context(t) + data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name()) + inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data) + if err != nil { + t.Fatalf("Mount failed: %v", err) + } + mm, err := fs.NewMountNamespace(ctx, inode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) + } + if err := hostFS.InstallWhitelist(ctx, mm); err != nil { + t.Fatalf("InstallWhitelist failed: %v", err) + } + + // Get the contents of the root directory. + rootDir := mm.Root() + rctx := withRoot(ctx, rootDir) + f, err := rootDir.Inode.GetFile(rctx, rootDir, fs.FileFlags{}) + if err != nil { + t.Fatalf("GetFile failed: %v", err) + } + c := &fs.CollectEntriesSerializer{} + if err := f.Readdir(rctx, c); err != nil { + t.Fatalf("Readdir failed: %v", err) + } + + // We should have only our whitelisted file, plus the dots. + want := []string{path.Base(whitelisted.Name()), ".", ".."} + got := c.Order + sort.Strings(want) + sort.Strings(got) + if !reflect.DeepEqual(got, want) { + t.Errorf("Readdir got %v, wanted %v", got, want) + } +} + +type rootContext struct { + context.Context + root *fs.Dirent +} + +// withRoot returns a copy of ctx with the given root. +func withRoot(ctx context.Context, root *fs.Dirent) context.Context { + return &rootContext{ + Context: ctx, + root: root, + } +} + +// Value implements Context.Value. +func (rc rootContext) Value(key interface{}) interface{} { + switch key { + case fs.CtxRoot: + rc.root.IncRef() + return rc.root + default: + return rc.Context.Value(key) + } +} diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go new file mode 100644 index 000000000..226bc5164 --- /dev/null +++ b/pkg/sentry/fs/host/inode.go @@ -0,0 +1,506 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// inodeOperations implements fs.InodeOperations for an fs.Inodes backed +// by a host file descriptor. +type inodeOperations struct { + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + + // fileState implements fs.CachedFileObject. It exists + // to break a circular load dependency between inodeOperations + // and cachingInodeOps (below). + fileState *inodeFileState `state:"wait"` + + // cachedInodeOps implements memmap.Mappable. + cachingInodeOps *fsutil.CachingInodeOperations + + // readdirMu protects the file offset on the host FD. This is needed + // for readdir because getdents must use the kernel offset, so + // concurrent readdirs must be exclusive. + // + // All read/write functions pass the offset directly to the kernel and + // thus don't need a lock. 
+ readdirMu sync.Mutex `state:"nosave"` +} + +// inodeFileState implements fs.CachedFileObject and otherwise fully +// encapsulates state that needs to be manually loaded on restore for +// this file object. +// +// This unfortunate structure exists because fs.CachingInodeOperations +// defines afterLoad and therefore cannot be lazily loaded (to break a +// circular load dependency between it and inodeOperations). Even with +// lazy loading, this approach defines the dependencies between objects +// and the expected load behavior more concretely. +type inodeFileState struct { + // Common file system state. + mops *superOperations `state:"wait"` + + // descriptor is the backing host fd. + descriptor *descriptor `state:"wait"` + + // Event queue for blocking operations. + queue waiter.Queue `state:"nosave"` + + // sattr is used to restore the inodeOperations. + sattr fs.StableAttr `state:"wait"` + + // savedUAttr is only allocated during S/R. It points to the save-time + // unstable attributes and is used to validate restore-time ones. + // + // Note that these unstable attributes are only used to detect cross-S/R + // external file system metadata changes. They may differ from the + // cached unstable attributes in cachingInodeOps, as that might differ + // from the external file system attributes if there had been WriteOut + // failures. S/R is transparent to Sentry and the latter will continue + // using its cached values after restore. + savedUAttr *fs.UnstableAttr +} + +// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. +func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + // TODO: Using safemem.FromIOReader here is wasteful for two + // reasons: + // + // - Using preadv instead of iterated preads saves on host system calls. + // + // - Host system calls can handle destination memory that would fault in + // gr3 (i.e. 
they can accept safemem.Blocks with NeedSafecopy() == true), + // so the buffering performed by FromIOReader is unnecessary. + // + // This also applies to the write path below. + return safemem.FromIOReader{secio.NewOffsetReader(fd.NewReadWriter(i.FD()), int64(offset))}.ReadToBlocks(dsts) +} + +// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. +func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + return safemem.FromIOWriter{secio.NewOffsetWriter(fd.NewReadWriter(i.FD()), int64(offset))}.WriteFromBlocks(srcs) +} + +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { + if mask.Empty() { + return nil + } + if mask.UID || mask.GID { + return syserror.EPERM + } + if mask.Perms { + if err := syscall.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil { + return err + } + } + if mask.Size { + if err := syscall.Ftruncate(i.FD(), attr.Size); err != nil { + return err + } + } + if mask.AccessTime || mask.ModificationTime { + ts := fs.TimeSpec{ + ATime: attr.AccessTime, + ATimeOmit: !mask.AccessTime, + MTime: attr.ModificationTime, + MTimeOmit: !mask.ModificationTime, + } + if err := setTimestamps(i.FD(), ts); err != nil { + return err + } + } + return nil +} + +// Sync implements fsutil.CachedFileObject.Sync. +func (i *inodeFileState) Sync(ctx context.Context) error { + return syscall.Fsync(i.FD()) +} + +// FD implements fsutil.CachedFileObject.FD. +func (i *inodeFileState) FD() int { + return i.descriptor.value +} + +func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) { + var s syscall.Stat_t + if err := syscall.Fstat(i.FD(), &s); err != nil { + return fs.UnstableAttr{}, err + } + return unstableAttr(i.mops, &s), nil +} + +// inodeOperations implements fs.InodeOperations. 
+var _ fs.InodeOperations = (*inodeOperations)(nil) + +// newInode returns a new fs.Inode backed by the host fd. +func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) { + // Retrieve metadata. + var s syscall.Stat_t + err := syscall.Fstat(fd, &s) + if err != nil { + return nil, err + } + + fileState := &inodeFileState{ + mops: msrc.MountSourceOperations.(*superOperations), + sattr: stableAttr(&s), + } + + // Initialize the wrapped host file descriptor. + fileState.descriptor, err = newDescriptor( + fd, + donated, + saveable, + wouldBlock(&s), + &fileState.queue, + ) + if err != nil { + return nil, err + } + + // Build the fs.InodeOperations. + uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s) + iops := &inodeOperations{ + fileState: fileState, + cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache), + } + + // Return the fs.Inode. + return fs.NewInode(iops, msrc, fileState.sattr), nil +} + +// Mappable implements fs.InodeOperations.Mappable. +func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { + if !canMap(inode) { + return nil + } + return i.cachingInodeOps +} + +// ReturnsWouldBlock returns true if this host fd can return EWOULDBLOCK +// for operations that would block. +func (i *inodeOperations) ReturnsWouldBlock() bool { + return i.fileState.descriptor.wouldBlock +} + +// Release implements fs.InodeOperations.Release. +func (i *inodeOperations) Release(context.Context) { + i.fileState.descriptor.Release() + i.cachingInodeOps.Release() +} + +// Lookup implements fs.InodeOperations.Lookup. +func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + // Get a new fd relative to i at name. 
+ fd, err := open(i, name) + if err != nil { + if err == syserror.ENOENT { + return nil, syserror.ENOENT + } + return nil, err + } + + inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) + if err != nil { + return nil, err + } + + // Return the fs.Dirent. + return fs.NewDirent(inode, name), nil +} + +// Create implements fs.InodeOperations.Create. +func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { + // Create a file relative to i at name. + // + // N.B. We always open this file O_RDWR regardless of flags because a + // future GetFile might want more access. Open allows this regardless + // of perm. + fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode()) + if err != nil { + return nil, err + } + + inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) + if err != nil { + return nil, err + } + + d := fs.NewDirent(inode, name) + defer d.DecRef() + return inode.GetFile(ctx, d, flags) +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode())) +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { + return createLink(i.fileState.FD(), oldname, newname) +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return syserror.EPERM +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. 
+func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.EOPNOTSUPP +} + +// Remove implements fs.InodeOperations.Remove. +func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { + return unlinkAt(i.fileState.FD(), name, false /* dir */) +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + return unlinkAt(i.fileState.FD(), name, true /* dir */) +} + +// Rename implements fs.InodeOperations.Rename. +func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + op, ok := oldParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + np, ok := newParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName) +} + +// Bind implements fs.InodeOperations.Bind. +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { + return syserror.EOPNOTSUPP +} + +// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. +func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return newFile(ctx, d, flags, i, false), nil +} + +// canMap returns true if this fs.Inode can be memory mapped. +func canMap(inode *fs.Inode) bool { + // FIXME: Some obscure character devices can be mapped. + return fs.IsFile(inode.StableAttr) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. 
+func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + // When the kernel supports mapping host FDs, we do so to take + // advantage of the host page cache. We forego updating fs.Inodes + // because the host manages consistency of its own inode structures. + // + // For fs.Inodes that can never be mapped we take advantage of + // synchronizing metadata updates through host caches. + // + // So can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just obtain the attributes. + return i.fileState.unstableAttr(ctx) + } + // No, we're maintaining consistency of metadata ourselves. + return i.cachingInodeOps.UnstableAttr(ctx, inode) +} + +// Check implements fs.InodeOperations.Check. +func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { + return syserror.EPERM +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool { + // Can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just change the timestamps on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode())) == nil + } + // Otherwise update our cached metadata. + return i.cachingInodeOps.SetPermissions(ctx, inode, f) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + // Can we use host kernel metadata caches? 
+ if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just change the timestamps on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return setTimestamps(i.fileState.FD(), ts) + } + // Otherwise update our cached metadata. + return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + // Is the file not memory-mappable? + if !canMap(inode) { + // Then just change the file size on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return syscall.Ftruncate(i.fileState.FD(), size) + } + // Otherwise we need to go through cachingInodeOps, even if the host page + // cache is in use, to invalidate private copies of truncated pages. + return i.cachingInodeOps.Truncate(ctx, inode, size) +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + // Have we been using host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then the metadata is already up to date on the host. + return nil + } + // Otherwise we need to write out cached pages and attributes + // that are dirty. + return i.cachingInodeOps.WriteOut(ctx, inode) +} + +// Readlink implements fs.InodeOperations.Readlink. +func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + return readLink(i.fileState.FD()) +} + +// Getlink implements fs.InodeOperations.Getlink. +func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + if !fs.IsSymlink(i.fileState.sattr) { + return nil, syserror.ENOLINK + } + return nil, fs.ErrResolveViaReadlink +} + +// StatFS implements fs.InodeOperations.StatFS. 
+func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syserror.ENOSYS +} + +// AddLink implements fs.InodeOperations.AddLink. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} + +// readdirAll returns all of the directory entries in i. +func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) { + i.readdirMu.Lock() + defer i.readdirMu.Unlock() + + fd := i.fileState.FD() + + // syscall.ReadDirent will use getdents, which will seek the file past + // the last directory entry. To read the directory entries a second + // time, we need to seek back to the beginning. + if _, err := syscall.Seek(fd, 0, 0); err != nil { + if err == syscall.ESPIPE { + // All directories should be seekable. If this file + // isn't seekable, it is not a directory and we should + // return that more sane error. + err = syscall.ENOTDIR + } + return nil, err + } + + names := make([]string, 0, 100) + for { + // Refill the buffer if necessary + if d.bufp >= d.nbuf { + d.bufp = 0 + // ReadDirent will just do a sys_getdents64 to the kernel. + n, err := syscall.ReadDirent(fd, d.buf) + if err != nil { + return nil, err + } + if n == 0 { + break // EOF + } + d.nbuf = n + } + + var nb int + // Parse the dirent buffer we just get and return the directory names along + // with the number of bytes consumed in the buffer. + nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names) + d.bufp += nb + } + + entries := make(map[string]fs.DentAttr) + for _, filename := range names { + // Lookup the type and host device and inode. 
+ stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW)
+ if lerr == syscall.ENOENT {
+ // File disappeared between readdir and lstat.
+ // Just treat it as if it didn't exist.
+ continue
+ }
+
+ // There was a serious problem, we should probably report it.
+ if lerr != nil {
+ return nil, lerr
+ }
+
+ entries[filename] = fs.DentAttr{
+ Type: nodeType(&stat),
+ InodeID: hostFileDevice.Map(device.MultiDeviceKey{
+ Device: stat.Dev,
+ Inode: stat.Ino,
+ }),
+ }
+ }
+ return entries, nil
+}
diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go
new file mode 100644
index 000000000..80066512a
--- /dev/null
+++ b/pkg/sentry/fs/host/inode_state.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// beforeSave is invoked by stateify.
+func (i *inodeFileState) beforeSave() {
+ if !i.queue.IsEmpty() {
+ panic("event queue must be empty")
+ }
+ if !i.descriptor.donated && i.sattr.Type == fs.RegularFile {
+ uattr, err := i.unstableAttr(context.Background())
+ if err != nil {
+ panic(fmt.Sprintf("failed to get unstable attribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err))
+ }
+ i.savedUAttr = &uattr
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (i *inodeFileState) afterLoad() { + // Initialize the descriptor value. + if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil { + panic(fmt.Sprintf("failed to load value of descriptor: %v", err)) + } + + // Remap the inode number. + var s syscall.Stat_t + if err := syscall.Fstat(i.FD(), &s); err != nil { + panic(fmt.Sprintf("failed to get metadata for fd %d: %v", i.FD(), err)) + } + key := device.MultiDeviceKey{ + Device: s.Dev, + Inode: s.Ino, + } + if !hostFileDevice.Load(key, i.sattr.InodeID) { + // This means there was a conflict at s.Dev and s.Ino with + // another inode mapping: two files that were unique on the + // saved filesystem are no longer unique on this filesystem. + // Since this violates the contract that filesystems cannot + // change across save and restore, error out. + panic(fmt.Sprintf("host %s conflict in host device mappings: %s", key, hostFileDevice)) + } + + if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { + env, ok := fs.CurrentRestoreEnvironment() + if !ok { + panic("missing restore environment") + } + uattr := unstableAttr(i.mops, &s) + if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { + panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)) + } + if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { + panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)) + } + i.savedUAttr = nil + } +} diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go new file mode 100644 index 000000000..0ff87c418 --- /dev/null +++ b/pkg/sentry/fs/host/inode_test.go @@ -0,0 +1,112 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "io/ioutil" + "os" + "path" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// TestMultipleReaddir verifies that multiple Readdir calls return the same +// thing if they use different dir contexts. +func TestMultipleReaddir(t *testing.T) { + p, err := ioutil.TempDir("", "readdir") + if err != nil { + t.Fatalf("Failed to create test dir: %v", err) + } + defer os.RemoveAll(p) + + f, err := os.Create(path.Join(p, "a.txt")) + if err != nil { + t.Fatalf("Failed to create a.txt: %v", err) + } + f.Close() + + f, err = os.Create(path.Join(p, "b.txt")) + if err != nil { + t.Fatalf("Failed to create b.txt: %v", err) + } + f.Close() + + fd, err := open(nil, p) + if err != nil { + t.Fatalf("Failed to open %q: %v", p, err) + } + ctx := contexttest.Context(t) + n, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false) + if err != nil { + t.Fatalf("Failed to create inode: %v", err) + } + + dirent := fs.NewDirent(n, "readdir") + openFile, err := n.GetFile(ctx, dirent, fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("Failed to get file: %v", err) + } + defer openFile.DecRef() + + c1 := &fs.DirCtx{DirCursor: new(string)} + if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c1, 0); err != nil { + 
t.Fatalf("First Readdir failed: %v", err) + } + + c2 := &fs.DirCtx{DirCursor: new(string)} + if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c2, 0); err != nil { + t.Errorf("Second Readdir failed: %v", err) + } + + if _, ok := c1.DentAttrs()["a.txt"]; !ok { + t.Errorf("want a.txt in first Readdir, got %v", c1.DentAttrs()) + } + if _, ok := c1.DentAttrs()["b.txt"]; !ok { + t.Errorf("want b.txt in first Readdir, got %v", c1.DentAttrs()) + } + + if _, ok := c2.DentAttrs()["a.txt"]; !ok { + t.Errorf("want a.txt in second Readdir, got %v", c2.DentAttrs()) + } + if _, ok := c2.DentAttrs()["b.txt"]; !ok { + t.Errorf("want b.txt in second Readdir, got %v", c2.DentAttrs()) + } +} + +// TestCloseFD verifies fds will be closed. +func TestCloseFD(t *testing.T) { + var p [2]int + if err := syscall.Pipe(p[0:]); err != nil { + t.Fatalf("Failed to create pipe %v", err) + } + defer syscall.Close(p[0]) + defer syscall.Close(p[1]) + + // Use the write-end because we will detect if it's closed on the read end. + ctx := contexttest.Context(t) + file, err := NewFile(ctx, p[1], fs.RootOwner) + if err != nil { + t.Fatalf("Failed to create File: %v", err) + } + file.DecRef() + + s := make([]byte, 10) + if c, err := syscall.Read(p[0], s); c != 0 || err != nil { + t.Errorf("want 0, nil (EOF) from read end, got %v, %v", c, err) + } +} diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go new file mode 100644 index 000000000..3c07c3850 --- /dev/null +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -0,0 +1,39 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +func ioctlGetTermios(fd int) (*linux.Termios, error) { + var t linux.Termios + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TCGETS, uintptr(unsafe.Pointer(&t))) + if errno != 0 { + return nil, errno + } + return &t, nil +} + +func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go new file mode 100644 index 000000000..8e36ed7ee --- /dev/null +++ b/pkg/sentry/fs/host/socket.go @@ -0,0 +1,471 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package host + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// endpoint encapsulates the state needed to represent a host Unix socket. +type endpoint struct { + queue waiter.Queue `state:"nosave"` + + // stype is the type of Unix socket. (Ex: unix.SockStream, + // unix.SockSeqpacket, unix.SockDgram) + stype unix.SockType `state:"nosave"` + + // fd is the host fd backing this file. + fd int `state:"nosave"` + + // If srfd >= 0, it is the host fd that fd was imported from. + srfd int `state:"wait"` +} + +func (e *endpoint) init() error { + family, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN) + if err != nil { + return err + } + + if family != syscall.AF_UNIX { + // We only allow Unix sockets. + return syserror.EINVAL + } + + stype, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return err + } + + if err := syscall.SetNonblock(e.fd, true); err != nil { + return err + } + + e.stype = unix.SockType(stype) + if err := fdnotifier.AddFD(int32(e.fd), &e.queue); err != nil { + return err + } + return nil +} + +// newEndpoint creates a new host endpoint. 
+func newEndpoint(fd int, srfd int) (*endpoint, error) {
+ ep := &endpoint{fd: fd, srfd: srfd}
+ if err := ep.init(); err != nil {
+ return nil, err
+ }
+ return ep, nil
+}
+
+// newSocket allocates a new unix socket with host endpoint.
+func newSocket(ctx context.Context, fd int, saveable bool) (*fs.File, error) {
+ ownedfd := fd
+ srfd := -1
+ if saveable {
+ var err error
+ ownedfd, err = syscall.Dup(fd)
+ if err != nil {
+ return nil, err
+ }
+ srfd = fd
+ }
+ ep, err := newEndpoint(ownedfd, srfd)
+ if err != nil {
+ if saveable {
+ syscall.Close(ownedfd)
+ }
+ return nil, err
+ }
+ return unixsocket.New(ctx, ep), nil
+}
+
+// NewSocketWithDirent allocates a new unix socket with host endpoint.
+//
+// This is currently only used by unsaveable Gofer nodes.
+//
+// NewSocketWithDirent takes ownership of f on success.
+func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) {
+ ep, err := newEndpoint(f.FD(), -1)
+ if err != nil {
+ return nil, err
+ }
+
+ // Take ownership of the FD.
+ f.Release()
+
+ return unixsocket.NewWithDirent(ctx, d, ep, flags), nil
+}
+
+// Close implements unix.Endpoint.Close.
+func (e *endpoint) Close() {
+ fdnotifier.RemoveFD(int32(e.fd))
+ syscall.Close(e.fd)
+ e.fd = -1
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (e *endpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) {
+ e.queue.EventRegister(we, mask)
+ fdnotifier.UpdateFD(int32(e.fd))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (e *endpoint) EventUnregister(we *waiter.Entry) {
+ e.queue.EventUnregister(we)
+ fdnotifier.UpdateFD(int32(e.fd))
+}
+
+// Readiness implements unix.Endpoint.Readiness.
+func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fdnotifier.NonBlockingPoll(int32(e.fd), mask)
+}
+
+// Type implements unix.Endpoint.Type.
+func (e *endpoint) Type() unix.SockType { + return e.stype +} + +// Connect implements unix.Endpoint.Connect. +func (e *endpoint) Connect(server unix.BoundEndpoint) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Bind implements unix.Endpoint.Bind. +func (e *endpoint) Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Listen implements unix.Endpoint.Listen. +func (e *endpoint) Listen(backlog int) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Accept implements unix.Endpoint.Accept. +func (e *endpoint) Accept() (unix.Endpoint, *tcpip.Error) { + return nil, tcpip.ErrInvalidEndpointState +} + +// Shutdown implements unix.Endpoint.Shutdown. +func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// GetSockOpt implements unix.Endpoint.GetSockOpt. +func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + _, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_ERROR) + return translateError(err) + case *tcpip.PasscredOption: + // We don't support passcred on host sockets. + *o = 0 + return nil + case *tcpip.SendBufferSizeOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) + *o = tcpip.SendBufferSizeOption(v) + return translateError(err) + case *tcpip.ReceiveBufferSizeOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF) + *o = tcpip.ReceiveBufferSizeOption(v) + return translateError(err) + case *tcpip.ReuseAddressOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR) + *o = tcpip.ReuseAddressOption(v) + return translateError(err) + case *tcpip.ReceiveQueueSizeOption: + return tcpip.ErrQueueSizeNotSupported + } + return tcpip.ErrInvalidEndpointState +} + +// SetSockOpt implements unix.Endpoint.SetSockOpt. 
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +// GetLocalAddress implements unix.Endpoint.GetLocalAddress. +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, nil +} + +// GetRemoteAddress implements unix.Endpoint.GetRemoteAddress. +func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, nil +} + +// Passcred returns whether or not the SO_PASSCRED socket option is +// enabled on this end. +func (e *endpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// ConnectedPasscred returns whether or not the SO_PASSCRED socket option +// is enabled on the connected end. +func (e *endpoint) ConnectedPasscred() bool { + // We don't support credential passing for host sockets. + return false +} + +// SendMsg implements unix.Endpoint.SendMsg. +func (e *endpoint) SendMsg(data [][]byte, controlMessages unix.ControlMessages, to unix.BoundEndpoint) (uintptr, *tcpip.Error) { + if to != nil { + return 0, tcpip.ErrInvalidEndpointState + } + return sendMsg(e.fd, data, controlMessages) +} + +func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages) (uintptr, *tcpip.Error) { + if !controlMessages.Empty() { + return 0, tcpip.ErrInvalidEndpointState + } + n, err := fdWriteVec(fd, data) + return n, translateError(err) +} + +// RecvMsg implements unix.Endpoint.RecvMsg. 
func (e *endpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) {
	// creds is ignored: credential passing is not supported for host
	// sockets.
	return recvMsg(e.fd, data, numRights, peek, addr)
}

// recvMsg reads from the host FD fd, optionally receiving up to numRights
// file descriptors as control data. It returns the bytes read into data, the
// total message length, and any received control messages.
func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) {
	var cm unet.ControlMessage
	if numRights > 0 {
		// Size the control buffer to hold numRights descriptors.
		cm.EnableFDs(int(numRights))
	}
	// cl is the length of control data actually received.
	rl, ml, cl, err := fdReadVec(fd, data, []byte(cm), peek)
	if err == syscall.EAGAIN {
		return 0, 0, unix.ControlMessages{}, tcpip.ErrWouldBlock
	}
	if err != nil {
		return 0, 0, unix.ControlMessages{}, translateError(err)
	}

	// Trim the control data if we received less than the full amount.
	if cl < uint64(len(cm)) {
		cm = cm[:cl]
	}

	// Avoid extra allocations in the case where there isn't any control data.
	if len(cm) == 0 {
		return rl, ml, unix.ControlMessages{}, nil
	}

	fds, err := cm.ExtractFDs()
	if err != nil {
		return 0, 0, unix.ControlMessages{}, translateError(err)
	}

	if len(fds) == 0 {
		return rl, ml, unix.ControlMessages{}, nil
	}
	return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil
}

// NewConnectedEndpoint creates a new unix.Receiver and unix.ConnectedEndpoint
// backed by a host FD that will pretend to be bound at a given sentry path.
func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (unix.Receiver, unix.ConnectedEndpoint, *tcpip.Error) {
	// Register the FD with the notifier so readiness polling works.
	if err := fdnotifier.AddFD(int32(file.FD()), queue); err != nil {
		return nil, nil, translateError(err)
	}

	e := &connectedEndpoint{path: path, queue: queue, file: file}

	// AtomicRefCounters start off with a single reference. We need two
	// because the same object is returned as both the Receiver and the
	// ConnectedEndpoint.
	e.ref.IncRef()

	return e, e, nil
}

// connectedEndpoint is a host FD backed implementation of
// unix.ConnectedEndpoint and unix.Receiver.
//
// connectedEndpoint does not support save/restore for now.
+type connectedEndpoint struct { + queue *waiter.Queue + path string + + // ref keeps track of references to a connectedEndpoint. + ref refs.AtomicRefCount + + // mu protects fd, readClosed and writeClosed. + mu sync.RWMutex + + // file is an *fd.FD containing the FD backing this endpoint. It must be + // set to nil if it has been closed. + file *fd.FD + + // readClosed is true if the FD has read shutdown or if it has been closed. + readClosed bool + + // writeClosed is true if the FD has write shutdown or if it has been + // closed. + writeClosed bool +} + +// Send implements unix.ConnectedEndpoint.Send. +func (c *connectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + if c.writeClosed { + return 0, false, tcpip.ErrClosedForSend + } + n, err := sendMsg(c.file.FD(), data, controlMessages) + // There is no need for the callee to call SendNotify because sendMsg uses + // the host's sendmsg(2) and the host kernel's queue. + return n, false, err +} + +// SendNotify implements unix.ConnectedEndpoint.SendNotify. +func (c *connectedEndpoint) SendNotify() {} + +// CloseSend implements unix.ConnectedEndpoint.CloseSend. +func (c *connectedEndpoint) CloseSend() { + c.mu.Lock() + c.writeClosed = true + c.mu.Unlock() +} + +// CloseNotify implements unix.ConnectedEndpoint.CloseNotify. +func (c *connectedEndpoint) CloseNotify() {} + +// Writable implements unix.ConnectedEndpoint.Writable. +func (c *connectedEndpoint) Writable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + if c.writeClosed { + return true + } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0 +} + +// Passcred implements unix.ConnectedEndpoint.Passcred. +func (c *connectedEndpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// GetLocalAddress implements unix.ConnectedEndpoint.GetLocalAddress. 
func (c *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
	// Report the sentry path this endpoint pretends to be bound at.
	return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil
}

// EventUpdate implements unix.ConnectedEndpoint.EventUpdate.
func (c *connectedEndpoint) EventUpdate() {
	c.mu.RLock()
	defer c.mu.RUnlock()
	// Only refresh the notifier while the FD is still valid.
	if c.file.FD() != -1 {
		fdnotifier.UpdateFD(int32(c.file.FD()))
	}
}

// Recv implements unix.Receiver.Recv.
func (c *connectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, unix.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if c.readClosed {
		return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive
	}
	rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil)
	// There is no need for the callee to call RecvNotify because recvMsg uses
	// the host's recvmsg(2) and the host kernel's queue.
	return rl, ml, cm, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, err
}

// close releases all resources related to the endpoint.
func (c *connectedEndpoint) close() {
	fdnotifier.RemoveFD(int32(c.file.FD()))
	c.file.Close()
	// A nil file marks the endpoint as closed.
	c.file = nil
}

// RecvNotify implements unix.Receiver.RecvNotify. It is a no-op: queueing
// happens in the host kernel.
func (c *connectedEndpoint) RecvNotify() {}

// CloseRecv implements unix.Receiver.CloseRecv.
func (c *connectedEndpoint) CloseRecv() {
	c.mu.Lock()
	c.readClosed = true
	c.mu.Unlock()
}

// Readable implements unix.Receiver.Readable.
func (c *connectedEndpoint) Readable() bool {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if c.readClosed {
		// A read-closed endpoint always reports readable.
		return true
	}
	return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0
}

// SendQueuedSize implements unix.Receiver.SendQueuedSize.
func (c *connectedEndpoint) SendQueuedSize() int64 {
	// SendQueuedSize isn't supported for host sockets because we don't allow the
	// sentry to call ioctl(2). -1 signals "unavailable".
	return -1
}

// RecvQueuedSize implements unix.Receiver.RecvQueuedSize.
func (c *connectedEndpoint) RecvQueuedSize() int64 {
	// RecvQueuedSize isn't supported for host sockets because we don't allow the
	// sentry to call ioctl(2). -1 signals "unavailable".
	return -1
}

// SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize.
func (c *connectedEndpoint) SendMaxQueueSize() int64 {
	v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF)
	if err != nil {
		// -1 signals "unavailable".
		return -1
	}
	return int64(v)
}

// RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize.
func (c *connectedEndpoint) RecvMaxQueueSize() int64 {
	v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_RCVBUF)
	if err != nil {
		// -1 signals "unavailable".
		return -1
	}
	return int64(v)
}

// Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release.
func (c *connectedEndpoint) Release() {
	// close runs once the final reference is dropped.
	c.ref.DecRefWithDestructor(c.close)
}

// translateError converts a host syscall error to a *tcpip.Error.
func translateError(err error) *tcpip.Error {
	if err == nil {
		return nil
	}
	// NOTE(review): this assumes err is always a syscall.Errno; any other
	// error type would panic in the type assertion — confirm callers only
	// pass raw errno values.
	return rawfile.TranslateErrno(err.(syscall.Errno))
}
diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go
new file mode 100644
index 000000000..6acabd55a
--- /dev/null
+++ b/pkg/sentry/fs/host/socket_state.go
@@ -0,0 +1,39 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package host

import (
	"fmt"
	"syscall"
)

// beforeSave is invoked by stateify.
+func (ep *endpoint) beforeSave() { + if ep.srfd < 0 { + panic("only host file descriptors provided at sentry startup can be saved") + } +} + +// afterLoad is invoked by stateify. +func (ep *endpoint) afterLoad() { + fd, err := syscall.Dup(ep.srfd) + if err != nil { + panic(fmt.Sprintf("failed to dup restored fd %d: %v", ep.srfd, err)) + } + ep.fd = fd + if err := ep.init(); err != nil { + panic(fmt.Sprintf("Could not restore host socket fd %d: %v", ep.srfd, err)) + } +} diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go new file mode 100644 index 000000000..80c46dcfa --- /dev/null +++ b/pkg/sentry/fs/host/socket_test.go @@ -0,0 +1,401 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "reflect" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +var ( + // Make sure that connectedEndpoint implements unix.ConnectedEndpoint. 
+ _ = unix.ConnectedEndpoint(new(connectedEndpoint)) + + // Make sure that connectedEndpoint implements unix.Receiver. + _ = unix.Receiver(new(connectedEndpoint)) +) + +func getFl(fd int) (uint32, error) { + fl, _, err := syscall.RawSyscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if err == 0 { + return uint32(fl), nil + } + return 0, err +} + +func TestSocketIsBlocking(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + + fl, err := getFl(pair[0]) + if err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Fatalf("Expected socket %v to be blocking", pair[0]) + } + if fl, err = getFl(pair[1]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Fatalf("Expected socket %v to be blocking", pair[1]) + } + sock, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) failed => %v", pair[0], err) + } + defer sock.DecRef() + // Test that the socket now is non blocking. + if fl, err = getFl(pair[0]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) + } + if fl&syscall.O_NONBLOCK != syscall.O_NONBLOCK { + t.Errorf("Expected socket %v to have becoming non blocking", pair[0]) + } + if fl, err = getFl(pair[1]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Errorf("Did not expect socket %v to become non blocking", pair[1]) + } +} + +func TestSocketWritev(t *testing.T) { + // Using socketpair here because it's already connected. 
+ pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + socket, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer socket.DecRef() + buf := []byte("hello world\n") + n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf)) + if err != nil { + t.Fatalf("socket writev failed: %v", err) + } + + if n != int64(len(buf)) { + t.Fatalf("socket writev wrote incorrect bytes: %d", n) + } +} + +func TestSocketWritevLen0(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + socket, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer socket.DecRef() + n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil)) + if err != nil { + t.Fatalf("socket writev failed: %v", err) + } + + if n != 0 { + t.Fatalf("socket writev wrote incorrect bytes: %d", n) + } +} + +func TestSocketSendMsgLen0(t *testing.T) { + // Using socketpair here because it's already connected. 
+ pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + sfile, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer sfile.DecRef() + + s := sfile.FileOperations.(socket.Socket) + n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, unix.ControlMessages{}) + if n != 0 { + t.Fatalf("socket sendmsg() failed: %v wrote: %d", terr, n) + } + + if terr != nil { + t.Fatalf("socket sendmsg() failed: %v", terr) + } +} + +func TestListen(t *testing.T) { + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) + } + sfile1, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer sfile1.DecRef() + socket1 := sfile1.FileOperations.(socket.Socket) + + sfile2, err := newSocket(contexttest.Context(t), pair[1], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[1], err) + } + defer sfile2.DecRef() + socket2 := sfile2.FileOperations.(socket.Socket) + + // Socketpairs can not be listened to. + if err := socket1.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket1.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } + if err := socket2.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket2.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } + + // Create a Unix socket, do not bind it. 
+ sock, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) + } + sfile3, err := newSocket(contexttest.Context(t), sock, false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", sock, err) + } + defer sfile3.DecRef() + socket3 := sfile3.FileOperations.(socket.Socket) + + // This socket is not bound so we can't listen on it. + if err := socket3.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket3.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } +} + +func TestSend(t *testing.T) { + e := connectedEndpoint{writeClosed: true} + if _, _, err := e.Send(nil, unix.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend { + t.Errorf("Got %#v.Send() = %v, want = %v", e, err, tcpip.ErrClosedForSend) + } +} + +func TestRecv(t *testing.T) { + e := connectedEndpoint{readClosed: true} + if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != tcpip.ErrClosedForReceive { + t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, tcpip.ErrClosedForReceive) + } +} + +func TestPasscred(t *testing.T) { + e := connectedEndpoint{} + if got, want := e.Passcred(), false; got != want { + t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want) + } +} + +func TestGetLocalAddress(t *testing.T) { + e := connectedEndpoint{path: "foo"} + want := tcpip.FullAddress{Addr: tcpip.Address("foo")} + if got, err := e.GetLocalAddress(); err != nil || got != want { + t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil) + } +} + +func TestQueuedSize(t *testing.T) { + e := connectedEndpoint{} + tests := []struct { + name string + f func() int64 + }{ + {"SendQueuedSize", e.SendQueuedSize}, + {"RecvQueuedSize", e.RecvQueuedSize}, + } + + for _, test := range tests { + if got, want := test.f(), int64(-1); got != want { + t.Errorf("Got %#v.%s() = %d, want = %d", e, test.name, got, want) + } + } +} + 
+func TestReadable(t *testing.T) { + e := connectedEndpoint{readClosed: true} + if got, want := e.Readable(), true; got != want { + t.Errorf("Got %#v.Readable() = %t, want = %t", e, got, want) + } +} + +func TestWritable(t *testing.T) { + e := connectedEndpoint{writeClosed: true} + if got, want := e.Writable(), true; got != want { + t.Errorf("Got %#v.Writable() = %t, want = %t", e, got, want) + } +} + +func TestRelease(t *testing.T) { + f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + want := &connectedEndpoint{queue: c.queue} + want.ref.DecRef() + fdnotifier.AddFD(int32(c.file.FD()), nil) + c.Release() + if !reflect.DeepEqual(c, want) { + t.Errorf("got = %#v, want = %#v", c, want) + } +} + +func TestClose(t *testing.T) { + type testCase struct { + name string + cep *connectedEndpoint + addFD bool + f func() + want *connectedEndpoint + } + + var tests []testCase + + // nil is the value used by connectedEndpoint to indicate a closed file. + // Non-nil files are used to check if the file gets closed. 
+ + f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + tests = append(tests, testCase{ + name: "First CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} + tests = append(tests, testCase{ + name: "Second CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + tests = append(tests, testCase{ + name: "First CloseSend", + cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} + tests = append(tests, testCase{ + name: "Second CloseSend", + cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} + tests = 
append(tests, testCase{ + name: "CloseSend then CloseRecv", + cep: c, + addFD: true, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} + tests = append(tests, testCase{ + name: "CloseRecv then CloseSend", + cep: c, + addFD: true, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} + tests = append(tests, testCase{ + name: "Full close then CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} + tests = append(tests, testCase{ + name: "Full close then CloseSend", + cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + for _, test := range tests { + if test.addFD { + fdnotifier.AddFD(int32(test.cep.file.FD()), nil) + } + if test.f(); !reflect.DeepEqual(test.cep, test.want) { + t.Errorf("%s: got = %#v, want = %#v", test.name, test.cep, test.want) + } + } +} diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go new file mode 100644 
index 000000000..bf8da6867 --- /dev/null +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -0,0 +1,82 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" +) + +// buildIovec builds an iovec slice from the given []byte slice. +func buildIovec(bufs [][]byte) (uintptr, []syscall.Iovec) { + var length uintptr + iovecs := make([]syscall.Iovec, 0, 10) + for i := range bufs { + if l := len(bufs[i]); l > 0 { + length += uintptr(l) + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(l), + }) + } + } + return length, iovecs +} + +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) { + flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) + if peek { + flags |= syscall.MSG_PEEK + } + + length, iovecs := buildIovec(bufs) + + var msg syscall.Msghdr + if len(control) != 0 { + msg.Control = &control[0] + msg.Controllen = uint64(len(control)) + } + + if len(iovecs) != 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) + if e != 0 { + return 0, 0, 0, e + } + + if n > length { + return length, n, msg.Controllen, nil + } + + return n, n, msg.Controllen, nil +} + +func fdWriteVec(fd int, bufs [][]byte) (uintptr, error) { + _, iovecs := buildIovec(bufs) + + var msg syscall.Msghdr + 
if len(iovecs) > 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) + if e != 0 { + return 0, e + } + + return n, nil +} diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go new file mode 100644 index 000000000..74c703eb7 --- /dev/null +++ b/pkg/sentry/fs/host/util.go @@ -0,0 +1,197 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "os" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func open(parent *inodeOperations, name string) (int, error) { + if parent == nil && !path.IsAbs(name) { + return -1, syserror.EINVAL + } + name = path.Clean(name) + + // Don't follow through symlinks. + flags := syscall.O_NOFOLLOW + + if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil { + return fd, nil + } + // Retry as read-only. + if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil { + return fd, nil + } + + // Retry as write-only. 
+ if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil { + return fd, nil + } + + // Retry as a symlink, by including O_PATH as an option. + fd, err := openAt(parent, name, linux.O_PATH|flags, 0) + if err == nil { + return fd, nil + } + + // Everything failed. + return -1, err +} + +func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) { + if parent == nil { + return syscall.Open(name, flags, uint32(perm)) + } + return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm)) +} + +func nodeType(s *syscall.Stat_t) fs.InodeType { + switch x := (s.Mode & syscall.S_IFMT); x { + case syscall.S_IFLNK: + return fs.Symlink + case syscall.S_IFIFO: + return fs.Pipe + case syscall.S_IFCHR: + return fs.CharacterDevice + case syscall.S_IFBLK: + return fs.BlockDevice + case syscall.S_IFSOCK: + return fs.Socket + case syscall.S_IFDIR: + return fs.Directory + case syscall.S_IFREG: + return fs.RegularFile + default: + // This shouldn't happen, but just in case... + log.Warningf("unknown host file type %d: assuming regular", x) + return fs.RegularFile + } +} + +func wouldBlock(s *syscall.Stat_t) bool { + typ := nodeType(s) + return typ == fs.Pipe || typ == fs.Socket || typ == fs.CharacterDevice +} + +func stableAttr(s *syscall.Stat_t) fs.StableAttr { + return fs.StableAttr{ + Type: nodeType(s), + DeviceID: hostFileDevice.DeviceID(), + InodeID: hostFileDevice.Map(device.MultiDeviceKey{ + Device: s.Dev, + Inode: s.Ino, + }), + BlockSize: int64(s.Blksize), + } +} + +func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner { + // User requested no translation, just return actual owner. + if mo.dontTranslateOwnership { + return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)} + } + + // Show only IDs relevant to the sandboxed task. I.e. if we not own the + // file, no sandboxed task can own the file. 
In that case, we + // use OverflowID for UID, implying that the IDs are not mapped in the + // "root" user namespace. + // + // E.g. + // sandbox's host EUID/EGID is 1/1. + // some_dir's host UID/GID is 2/1. + // Task that mounted this fs has virtualized EUID/EGID 5/5. + // + // If you executed `ls -n` in the sandboxed task, it would show: + // drwxwrxwrx [...] 65534 5 [...] some_dir + + // Files are owned by OverflowID by default. + owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)} + + // If we own file on host, let mounting task's initial EUID own + // the file. + if s.Uid == hostUID { + owner.UID = mo.mounter.UID + } + + // If our group matches file's group, make file's group match + // the mounting task's initial EGID. + for _, gid := range hostGIDs { + if s.Gid == gid { + owner.GID = mo.mounter.GID + break + } + } + return owner +} + +func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr { + return fs.UnstableAttr{ + Size: s.Size, + Usage: s.Blocks * 512, + Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), + Owner: owner(mo, s), + AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), + ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), + StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), + Links: s.Nlink, + } +} + +type dirInfo struct { + buf []byte // buffer for directory I/O. + nbuf int // length of buf; return value from ReadDirent. + bufp int // location of next record in buf. +} + +// isBlockError unwraps os errors and checks if they are caused by EAGAIN or +// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. 
+func isBlockError(err error) bool { + if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK { + return true + } + if pe, ok := err.(*os.PathError); ok { + return isBlockError(pe.Err) + } + return false +} + +func hostEffectiveKIDs() (uint32, []uint32, error) { + gids, err := os.Getgroups() + if err != nil { + return 0, nil, err + } + egids := make([]uint32, len(gids)) + for i, gid := range gids { + egids[i] = uint32(gid) + } + return uint32(os.Geteuid()), append(egids, uint32(os.Getegid())), nil +} + +var hostUID uint32 +var hostGIDs []uint32 + +func init() { + hostUID, hostGIDs, _ = hostEffectiveKIDs() +} diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go new file mode 100644 index 000000000..c38d2392d --- /dev/null +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -0,0 +1,137 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package host + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" +) + +func createLink(fd int, name string, linkName string) error { + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + linkNamePtr, err := syscall.BytePtrFromString(linkName) + if err != nil { + return err + } + _, _, errno := syscall.Syscall( + syscall.SYS_SYMLINKAT, + uintptr(unsafe.Pointer(namePtr)), + uintptr(fd), + uintptr(unsafe.Pointer(linkNamePtr))) + if errno != 0 { + return errno + } + return nil +} + +func readLink(fd int) (string, error) { + // Buffer sizing copied from os.Readlink. + for l := 128; ; l *= 2 { + b := make([]byte, l) + n, _, errno := syscall.Syscall6( + syscall.SYS_READLINKAT, + uintptr(fd), + uintptr(unsafe.Pointer(syscall.StringBytePtr(""))), + uintptr(unsafe.Pointer(&b[0])), + uintptr(l), + 0, 0) + if n < 0 { + n = 0 + } + if errno != 0 { + return "", errno + } + if n < uintptr(l) { + return string(b[:n]), nil + } + } +} + +func unlinkAt(fd int, name string, dir bool) error { + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + var flags uintptr + if dir { + flags = linux.AT_REMOVEDIR + } + _, _, errno := syscall.Syscall( + syscall.SYS_UNLINKAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + flags, + ) + if errno != 0 { + return errno + } + return nil +} + +func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec { + if omit { + return syscall.Timespec{0, linux.UTIME_OMIT} + } + if setSysTime { + return syscall.Timespec{0, linux.UTIME_NOW} + } + return syscall.NsecToTimespec(t.Nanoseconds()) +} + +func setTimestamps(fd int, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + var sts [2]syscall.Timespec + sts[0] = timespecFromTimestamp(ts.ATime, ts.ATimeOmit, ts.ATimeSetSystemTime) + sts[1] = 
timespecFromTimestamp(ts.MTime, ts.MTimeOmit, ts.MTimeSetSystemTime) + _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(fd), + 0, /* path */ + uintptr(unsafe.Pointer(&sts)), + 0, /* flags */ + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return stat, err + } + _, _, errno := syscall.Syscall6( + syscall.SYS_NEWFSTATAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + uintptr(unsafe.Pointer(&stat)), + uintptr(flags), + 0, 0) + if errno != 0 { + return stat, errno + } + return stat, nil +} diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go new file mode 100644 index 000000000..c5f5c9c0d --- /dev/null +++ b/pkg/sentry/fs/host/wait_test.go @@ -0,0 +1,70 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package host + +import ( + "syscall" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestWait(t *testing.T) { + var fds [2]int + err := syscall.Pipe(fds[:]) + if err != nil { + t.Fatalf("Unable to create pipe: %v", err) + } + + defer syscall.Close(fds[1]) + + ctx := contexttest.Context(t) + file, err := NewFile(ctx, fds[0], fs.RootOwner) + if err != nil { + syscall.Close(fds[0]) + t.Fatalf("NewFile failed: %v", err) + } + + defer file.DecRef() + + r := file.Readiness(waiter.EventIn) + if r != 0 { + t.Fatalf("File is ready for read when it shouldn't be.") + } + + e, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&e, waiter.EventIn) + defer file.EventUnregister(&e) + + // Check that there are no notifications yet. + if len(ch) != 0 { + t.Fatalf("Channel is non-empty") + } + + // Write to the pipe, so it should be writable now. + syscall.Write(fds[1], []byte{1}) + + // Check that we get a notification. We need to yield the current thread + // so that the fdnotifier can deliver notifications, so we use a + // 1-second timeout instead of just checking the length of the channel. + select { + case <-ch: + case <-time.After(1 * time.Second): + t.Fatalf("Channel not notified") + } +} diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go new file mode 100644 index 000000000..b624f4182 --- /dev/null +++ b/pkg/sentry/fs/inode.go @@ -0,0 +1,455 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// Inode is a file system object that can be simulatenously referenced by different +// components of the VFS (Dirent, fs.File, etc). +type Inode struct { + // AtomicRefCount is our reference count. + refs.AtomicRefCount + + // InodeOperations is the file system specific behavior of the Inode. + InodeOperations InodeOperations + + // StableAttr are stable cached attributes of the Inode. + StableAttr StableAttr + + // LockCtx is the file lock context. It manages its own sychronization and tracks + // regions of the Inode that have locks held. + LockCtx LockCtx + + // Watches is the set of inotify watches for this inode. + Watches *Watches + + // MountSource is the mount source this Inode is a part of. + MountSource *MountSource + + // overlay is the overlay entry for this Inode. + overlay *overlayEntry +} + +// LockCtx is an Inode's lock context and contains different personalities of locks; both +// Posix and BSD style locks are supported. 
+// +// Note that in Linux fcntl(2) and flock(2) locks are _not_ cooperative, because race and +// deadlock conditions make merging them prohibitive. We do the same and keep them oblivious +// to each other but provide a "context" as a convenient container. +type LockCtx struct { + // Posix is a set of POSIX-style regional advisory locks, see fcntl(2). + Posix lock.Locks + + // BSD is a set of BSD-style advisory file wide locks, see flock(2). + BSD lock.Locks +} + +// NewInode constructs an Inode from InodeOperations, a MountSource, and stable attributes. +// +// NewInode takes a reference on msrc. +func NewInode(iops InodeOperations, msrc *MountSource, sattr StableAttr) *Inode { + msrc.IncRef() + return &Inode{ + InodeOperations: iops, + StableAttr: sattr, + Watches: newWatches(), + MountSource: msrc, + } +} + +// DecRef drops a reference on the Inode. +func (i *Inode) DecRef() { + i.DecRefWithDestructor(i.destroy) +} + +// destroy releases the Inode and releases the msrc reference taken. +func (i *Inode) destroy() { + // FIXME: Context is not plumbed here. + ctx := context.Background() + if err := i.WriteOut(ctx); err != nil { + // FIXME: Mark as warning again once noatime is + // properly supported. + log.Debugf("Inode %+v, failed to sync all metadata: %v", i.StableAttr, err) + } + + // If this inode is being destroyed because it was unlinked, queue a + // deletion event. This may not be the case for inodes being revalidated. + if i.Watches.unlinked { + i.Watches.Notify("", linux.IN_DELETE_SELF, 0) + } + + // Remove references from the watch owners to the watches on this inode, + // since the watches are about to be GCed. Note that we don't need to worry + // about the watch pins since if there were any active pins, this inode + // wouldn't be in the destructor. 
+ i.Watches.targetDestroyed() + + // Overlay resources should be released synchronously, since they may + // trigger more Inode.destroy calls which must themselves be handled + // synchronously, like the WriteOut call above. + if i.overlay != nil { + i.overlay.release() + i.MountSource.DecRef() + return + } + + // Regular (non-overlay) resources may be released asynchronously. + Async(func() { + i.InodeOperations.Release(ctx) + i.MountSource.DecRef() + }) +} + +// Mappable calls i.InodeOperations.Mappable. +func (i *Inode) Mappable() memmap.Mappable { + if i.overlay != nil { + // In an overlay, Mappable is always implemented by + // the overlayEntry metadata to synchronize memory + // access of files with copy up. But first check if + // the Inodes involved would be mappable in the first + // place. + i.overlay.copyMu.RLock() + ok := i.overlay.isMappableLocked() + i.overlay.copyMu.RUnlock() + if !ok { + return nil + } + return i.overlay + } + return i.InodeOperations.Mappable(i) +} + +// WriteOut calls i.InodeOperations.WriteOut with i as the Inode. +func (i *Inode) WriteOut(ctx context.Context) error { + if i.overlay != nil { + return overlayWriteOut(ctx, i.overlay) + } + return i.InodeOperations.WriteOut(ctx, i) +} + +// Lookup calls i.InodeOperations.Lookup with i as the directory. +func (i *Inode) Lookup(ctx context.Context, name string) (*Dirent, error) { + if i.overlay != nil { + return overlayLookup(ctx, i.overlay, i, name) + } + return i.InodeOperations.Lookup(ctx, i, name) +} + +// Create calls i.InodeOperations.Create with i as the directory. +func (i *Inode) Create(ctx context.Context, d *Dirent, name string, flags FileFlags, perm FilePermissions) (*File, error) { + if i.overlay != nil { + return overlayCreate(ctx, i.overlay, d, name, flags, perm) + } + return i.InodeOperations.Create(ctx, i, name, flags, perm) +} + +// CreateDirectory calls i.InodeOperations.CreateDirectory with i as the directory. 
+func (i *Inode) CreateDirectory(ctx context.Context, d *Dirent, name string, perm FilePermissions) error { + if i.overlay != nil { + return overlayCreateDirectory(ctx, i.overlay, d, name, perm) + } + return i.InodeOperations.CreateDirectory(ctx, i, name, perm) +} + +// CreateLink calls i.InodeOperations.CreateLink with i as the directory. +func (i *Inode) CreateLink(ctx context.Context, d *Dirent, oldname string, newname string) error { + if i.overlay != nil { + return overlayCreateLink(ctx, i.overlay, d, oldname, newname) + } + return i.InodeOperations.CreateLink(ctx, i, oldname, newname) +} + +// CreateHardLink calls i.InodeOperations.CreateHardLink with i as the directory. +func (i *Inode) CreateHardLink(ctx context.Context, d *Dirent, target *Dirent, name string) error { + if i.overlay != nil { + return overlayCreateHardLink(ctx, i.overlay, d, target, name) + } + return i.InodeOperations.CreateHardLink(ctx, i, target.Inode, name) +} + +// CreateFifo calls i.InodeOperations.CreateFifo with i as the directory. +func (i *Inode) CreateFifo(ctx context.Context, d *Dirent, name string, perm FilePermissions) error { + if i.overlay != nil { + return overlayCreateFifo(ctx, i.overlay, d, name, perm) + } + return i.InodeOperations.CreateFifo(ctx, i, name, perm) +} + +// Remove calls i.InodeOperations.Remove/RemoveDirectory with i as the directory. +func (i *Inode) Remove(ctx context.Context, d *Dirent, remove *Dirent) error { + if i.overlay != nil { + return overlayRemove(ctx, i.overlay, d, remove) + } + switch remove.Inode.StableAttr.Type { + case Directory, SpecialDirectory: + return i.InodeOperations.RemoveDirectory(ctx, i, remove.name) + default: + return i.InodeOperations.Remove(ctx, i, remove.name) + } +} + +// Rename calls i.InodeOperations.Rename with the given arguments. 
+func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string) error { + if i.overlay != nil { + return overlayRename(ctx, i.overlay, oldParent, renamed, newParent, newName) + } + return i.InodeOperations.Rename(ctx, oldParent.Inode, renamed.name, newParent.Inode, newName) +} + +// Bind calls i.InodeOperations.Bind with i as the directory. +func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, perm FilePermissions) error { + if i.overlay != nil { + return overlayBind(ctx, i.overlay, name, data, perm) + } + return i.InodeOperations.Bind(ctx, i, name, data, perm) +} + +// BoundEndpoint calls i.InodeOperations.BoundEndpoint with i as the Inode. +func (i *Inode) BoundEndpoint(path string) unix.BoundEndpoint { + if i.overlay != nil { + return overlayBoundEndpoint(i.overlay, path) + } + return i.InodeOperations.BoundEndpoint(i, path) +} + +// GetFile calls i.InodeOperations.GetFile with the given arguments. +func (i *Inode) GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File, error) { + if i.overlay != nil { + return overlayGetFile(ctx, i.overlay, d, flags) + } + return i.InodeOperations.GetFile(ctx, d, flags) +} + +// UnstableAttr calls i.InodeOperations.UnstableAttr with i as the Inode. +func (i *Inode) UnstableAttr(ctx context.Context) (UnstableAttr, error) { + if i.overlay != nil { + return overlayUnstableAttr(ctx, i.overlay) + } + return i.InodeOperations.UnstableAttr(ctx, i) +} + +// Getxattr calls i.InodeOperations.Getxattr with i as the Inode. +func (i *Inode) Getxattr(name string) ([]byte, error) { + if i.overlay != nil { + return overlayGetxattr(i.overlay, name) + } + return i.InodeOperations.Getxattr(i, name) +} + +// Listxattr calls i.InodeOperations.Listxattr with i as the Inode. 
+func (i *Inode) Listxattr() (map[string]struct{}, error) { + if i.overlay != nil { + return overlayListxattr(i.overlay) + } + return i.InodeOperations.Listxattr(i) +} + +// CheckPermission will check if the caller may access this file in the +// requested way for reading, writing, or executing. +// +// CheckPermission is like Linux's fs/namei.c:inode_permission. It +// - checks file system mount flags, +// - and utilizes InodeOperations.Check to check capabilities and modes. +func (i *Inode) CheckPermission(ctx context.Context, p PermMask) error { + // First check the outer-most mounted filesystem. + if p.Write && i.MountSource.Flags.ReadOnly { + return syserror.EROFS + } + + if i.overlay != nil { + // CheckPermission requires some special handling for + // an overlay. + // + // Writes will always be redirected to an upper filesystem, + // so ignore all lower layers being read-only. + // + // But still honor the upper-most filesystem's mount flags; + // we should not attempt to modify the writable layer if it + // is mounted read-only. + if p.Write && overlayUpperMountSource(i.MountSource).Flags.ReadOnly { + return syserror.EROFS + } + } + + return i.check(ctx, p) +} + +func (i *Inode) check(ctx context.Context, p PermMask) error { + if i.overlay != nil { + return overlayCheck(ctx, i.overlay, p) + } + if !i.InodeOperations.Check(ctx, i, p) { + return syserror.EACCES + } + return nil +} + +// SetPermissions calls i.InodeOperations.SetPermissions with i as the Inode. +func (i *Inode) SetPermissions(ctx context.Context, d *Dirent, f FilePermissions) bool { + if i.overlay != nil { + return overlaySetPermissions(ctx, i.overlay, d, f) + } + return i.InodeOperations.SetPermissions(ctx, i, f) +} + +// SetOwner calls i.InodeOperations.SetOwner with i as the Inode. 
+func (i *Inode) SetOwner(ctx context.Context, d *Dirent, o FileOwner) error { + if i.overlay != nil { + return overlaySetOwner(ctx, i.overlay, d, o) + } + return i.InodeOperations.SetOwner(ctx, i, o) +} + +// SetTimestamps calls i.InodeOperations.SetTimestamps with i as the Inode. +func (i *Inode) SetTimestamps(ctx context.Context, d *Dirent, ts TimeSpec) error { + if i.overlay != nil { + return overlaySetTimestamps(ctx, i.overlay, d, ts) + } + return i.InodeOperations.SetTimestamps(ctx, i, ts) +} + +// Truncate calls i.InodeOperations.Truncate with i as the Inode. +func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { + if i.overlay != nil { + return overlayTruncate(ctx, i.overlay, d, size) + } + return i.InodeOperations.Truncate(ctx, i, size) +} + +// Readlink calls i.InodeOperations.Readlnk with i as the Inode. +func (i *Inode) Readlink(ctx context.Context) (string, error) { + if i.overlay != nil { + return overlayReadlink(ctx, i.overlay) + } + return i.InodeOperations.Readlink(ctx, i) +} + +// Getlink calls i.InodeOperations.Getlink. +func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) { + if i.overlay != nil { + return overlayGetlink(ctx, i.overlay) + } + return i.InodeOperations.Getlink(ctx, i) +} + +// AddLink calls i.InodeOperations.AddLink. +func (i *Inode) AddLink() { + if i.overlay != nil { + // FIXME: Remove this from InodeOperations altogether. + // + // This interface (including DropLink and NotifyStatusChange) + // is only used by ramfs to update metadata of children. These + // filesystems should _never_ have overlay Inodes cached as + // children. So explicitly disallow this scenario and avoid plumbing + // Dirents through to do copy up. + panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.AddLink() +} + +// DropLink calls i.InodeOperations.DropLink. +func (i *Inode) DropLink() { + if i.overlay != nil { + // Same as AddLink. 
+ panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.DropLink() +} + +// NotifyStatusChange calls i.InodeOperations.NotifyStatusChange. +func (i *Inode) NotifyStatusChange(ctx context.Context) { + if i.overlay != nil { + // Same as AddLink. + panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.NotifyStatusChange(ctx) +} + +// IsVirtual calls i.InodeOperations.IsVirtual. +func (i *Inode) IsVirtual() bool { + if i.overlay != nil { + // An overlay configuration does not support virtual files. + return false + } + return i.InodeOperations.IsVirtual() +} + +// StatFS calls i.InodeOperations.StatFS. +func (i *Inode) StatFS(ctx context.Context) (Info, error) { + if i.overlay != nil { + return overlayStatFS(ctx, i.overlay) + } + return i.InodeOperations.StatFS(ctx) +} + +// HandleOps extracts HandleOperations from i. +func (i *Inode) HandleOps() HandleOperations { + if i.overlay != nil { + return overlayHandleOps(i.overlay) + } + if h, ok := i.InodeOperations.(HandleOperations); ok { + return h + } + return nil +} + +// CheckOwnership checks whether `ctx` owns this Inode or may act as its owner. +// Compare Linux's fs/inode.c:inode_owner_or_capable(). +func (i *Inode) CheckOwnership(ctx context.Context) bool { + uattr, err := i.UnstableAttr(ctx) + if err != nil { + return false + } + creds := auth.CredentialsFromContext(ctx) + if uattr.Owner.UID == creds.EffectiveKUID { + return true + } + if creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(uattr.Owner.UID).Ok() { + return true + } + return false +} + +// CheckCapability checks whether `ctx` has capability `cp` with respect to +// operations on this Inode. +// +// Compare Linux's kernel/capability.c:capable_wrt_inode_uidgid(). 
Note that +// this function didn't exist in Linux 3.11.10, but was added by upstream +// 23adbe12ef7d "fs,userns: Change inode_capable to capable_wrt_inode_uidgid" +// to fix local privilege escalation CVE-2014-4014. +func (i *Inode) CheckCapability(ctx context.Context, cp linux.Capability) bool { + uattr, err := i.UnstableAttr(ctx) + if err != nil { + return false + } + creds := auth.CredentialsFromContext(ctx) + if !creds.UserNamespace.MapFromKUID(uattr.Owner.UID).Ok() { + return false + } + if !creds.UserNamespace.MapFromKGID(uattr.Owner.GID).Ok() { + return false + } + return creds.HasCapability(cp) +} diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go new file mode 100644 index 000000000..358bbecdf --- /dev/null +++ b/pkg/sentry/fs/inode_inotify.go @@ -0,0 +1,166 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" +) + +// Watches is the collection of inotify watches on an inode. +type Watches struct { + // mu protects the fields below. + mu sync.RWMutex `state:"nosave"` + + // ws is the map of active watches in this collection, keyed by the inotify + // instance id of the owner. + ws map[uint64]*Watch + + // unlinked indicates whether the target inode was ever unlinked. 
This is a + // hack to figure out if we should queue a IN_DELETE_SELF event when this + // watches collection is being destroyed, since otherwise we have no way of + // knowing if the target inode is going down due to a deletion or + // revalidation. + unlinked bool +} + +func newWatches() *Watches { + return &Watches{ + ws: make(map[uint64]*Watch), + } +} + +// MarkUnlinked indicates the target for this set of watches to be unlinked. +// This has implications for the IN_EXCL_UNLINK flag. +func (w *Watches) MarkUnlinked() { + w.mu.Lock() + defer w.mu.Unlock() + w.unlinked = true +} + +// Lookup returns a matching watch with the given id. Returns nil if no such +// watch exists. Note that the result returned by this method only remains valid +// if the inotify instance owning the watch is locked, preventing modification +// of the returned watch and preventing the replacement of the watch by another +// one from the same instance (since there may be at most one watch per +// instance, per target). +func (w *Watches) Lookup(id uint64) *Watch { + w.mu.Lock() + defer w.mu.Unlock() + return w.ws[id] +} + +// Add adds watch into this set of watches. The watch being added must be unique +// - its ID() should not collide with any existing watches. +func (w *Watches) Add(watch *Watch) { + w.mu.Lock() + defer w.mu.Unlock() + + // Sanity check, the new watch shouldn't collide with an existing + // watch. Silently replacing an existing watch would result in a ref leak on + // this inode. We could handle this collision by calling Unpin() on the + // existing watch, but then we end up leaking watch descriptor ids at the + // inotify level. + if _, exists := w.ws[watch.ID()]; exists { + panic(fmt.Sprintf("Watch collision with ID %+v", watch.ID())) + } + w.ws[watch.ID()] = watch +} + +// Remove removes a watch with the given id from this set of watches. The caller +// is responsible for generating any watch removal event, as appropriate. 
The +// provided id must match an existing watch in this collection. +func (w *Watches) Remove(id uint64) { + w.mu.Lock() + defer w.mu.Unlock() + + if w.ws == nil { + // This watch set is being destroyed. The thread executing the + // destructor is already in the process of deleting all our watches. We + // got here with no refs on the inode because we raced with the + // destructor notifying all the watch owners of the inode's destruction. + // See the comment in Watches.TargetDestroyed for why this race exists. + return + } + + watch, ok := w.ws[id] + if !ok { + // While there's technically no problem with silently ignoring a missing + // watch, this is almost certainly a bug. + panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id)) + } + delete(w.ws, watch.ID()) +} + +// Notify queues a new event with all watches in this set. +func (w *Watches) Notify(name string, events, cookie uint32) { + // N.B. We don't defer the unlocks because Notify is in the hot path of + // all IO operations, and the defer costs too much for small IO + // operations. + w.mu.RLock() + for _, watch := range w.ws { + if name != "" && w.unlinked && !watch.NotifyParentAfterUnlink() { + // IN_EXCL_UNLINK - By default, when watching events on the children + // of a directory, events are generated for children even after they + // have been unlinked from the directory. This can result in large + // numbers of uninteresting events for some applications (e.g., if + // watching /tmp, in which many applications create temporary files + // whose names are immediately unlinked). Specifying IN_EXCL_UNLINK + // changes the default behavior, so that events are not generated + // for children after they have been unlinked from the watched + // directory. -- inotify(7) + // + // We know we're dealing with events for a parent when the name + // isn't empty. 
+ continue + } + watch.Notify(name, events, cookie) + } + w.mu.RUnlock() +} + +// Unpin unpins dirent from all watches in this set. +func (w *Watches) Unpin(d *Dirent) { + w.mu.RLock() + defer w.mu.RUnlock() + for _, watch := range w.ws { + watch.Unpin(d) + } +} + +// targetDestroyed is called by the inode destructor to notify the watch owners +// of the impending destruction of the watch target. +func (w *Watches) targetDestroyed() { + var ws map[uint64]*Watch + + // We can't hold w.mu while calling watch.TargetDestroyed to preserve lock + // ordering w.r.t to the owner inotify instances. Instead, atomically move + // the watches map into a local variable so we can iterate over it safely. + // + // Because of this however, it is possible for the watches' owners to reach + // this inode while the inode has no refs. This is still safe because the + // owners can only reach the inode until this function finishes calling + // watch.TargetDestroyed() below and the inode is guaranteed to exist in the + // meanwhile. But we still have to be very careful not to rely on inode + // state that may have been already destroyed. + w.mu.Lock() + ws = w.ws + w.ws = nil + w.mu.Unlock() + + for _, watch := range ws { + watch.TargetDestroyed() + } +} diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go new file mode 100644 index 000000000..b33980178 --- /dev/null +++ b/pkg/sentry/fs/inode_operations.go @@ -0,0 +1,385 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "errors" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +var ( + // ErrResolveViaReadlink is a special error value returned by + // InodeOperations.Getlink() to indicate that a link should be + // resolved automatically by walking to the path returned by + // InodeOperations.Readlink(). + ErrResolveViaReadlink = errors.New("link should be resolved via Readlink()") +) + +// TimeSpec contains access and modification timestamps. If either ATimeOmit or +// MTimeOmit is true, then the corresponding timestamp should not be updated. +// If either ATimeSetSystemTime or MTimeSetSystemTime are set then the +// corresponding timestamp should be ignored and the time will be set to the +// current system time. +type TimeSpec struct { + ATime ktime.Time + ATimeOmit bool + ATimeSetSystemTime bool + MTime ktime.Time + MTimeOmit bool + MTimeSetSystemTime bool +} + +// InodeOperations are operations on an Inode that diverge per file system. +// +// Objects that implement InodeOperations may cache file system "private" +// data that is useful for implementing these methods. In contrast, Inode +// contains state that is common to all Inodes; this state may be optionally +// used by InodeOperations. An object that implements InodeOperations may +// not take a reference on an Inode. +type InodeOperations interface { + // Release releases all private file system data held by this object. + // Once Release is called, this object is dead (no other methods will + // ever be called). + Release(context.Context) + + // Lookup loads an Inode at name under dir into a Dirent. 
The name + // is a valid component path: it contains no "/"s nor is the empty + // string. + // + // Lookup may return one of: + // + // * A nil Dirent and a non-nil error. If the reason that Lookup failed + // was because the name does not exist under Inode, then must return + // syserror.ENOENT. + // + // * If name does not exist under dir and the file system wishes this + // fact to be cached, a non-nil Dirent containing a nil Inode and a + // nil error. This is a negative Dirent and must have exactly one + // reference (at-construction reference). + // + // * If name does exist under this dir, a non-nil Dirent containing a + // non-nil Inode, and a nil error. File systems that take extra + // references on this Dirent should implement DirentOperations. + Lookup(ctx context.Context, dir *Inode, name string) (*Dirent, error) + + // Create creates an Inode at name under dir and returns a new File + // whose Dirent backs the new Inode. Implementations must ensure that + // name does not already exist. Create may return one of: + // + // * A nil File and a non-nil error. + // + // * A non-nil File and a nil error. File.Dirent will be a new Dirent, + // with a single reference held by File. File systems that take extra + // references on this Dirent should implement DirentOperations. + // + // The caller must ensure that this operation is permitted. + Create(ctx context.Context, dir *Inode, name string, flags FileFlags, perm FilePermissions) (*File, error) + + // CreateDirectory creates a new directory under this dir. + // CreateDirectory should otherwise do the same as Create. + // + // The caller must ensure that this operation is permitted. + CreateDirectory(ctx context.Context, dir *Inode, name string, perm FilePermissions) error + + // CreateLink creates a symbolic link under dir between newname + // and oldname. CreateLink should otherwise do the same as Create. + // + // The caller must ensure that this operation is permitted. 
+ CreateLink(ctx context.Context, dir *Inode, oldname string, newname string) error + + // CreateHardLink creates a hard link under dir between the target + // Inode and name. Implementations must ensure that name does not + // already exist. + // + // The caller must ensure this operation is permitted. + CreateHardLink(ctx context.Context, dir *Inode, target *Inode, name string) error + + // CreateFifo creates a new named pipe under dir at name. + // Implementations must ensure that an Inode at name does not + // already exist. + // + // The caller must ensure that this operation is permitted. + CreateFifo(ctx context.Context, dir *Inode, name string, perm FilePermissions) error + + // Remove removes the given named non-directory under dir. + // + // The caller must ensure that this operation is permitted. + // + // TODO: merge Remove and RemoveDirectory, Remove + // just needs a type flag. + Remove(ctx context.Context, dir *Inode, name string) error + + // RemoveDirectory removes the given named directory under dir. + // + // The caller must ensure that this operation is permitted. + // + // RemoveDirectory should check that the directory to be + // removed is empty. + RemoveDirectory(ctx context.Context, dir *Inode, name string) error + + // Rename atomically renames oldName under oldParent to newName + // under newParent where oldParent and newParent are directories. + // + // Implementations are responsible for rejecting renames that + // replace non-empty directories. + Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string) error + + // Bind binds a new socket under dir at the given name. + // Implementations must ensure that name does not already exist. + // + // The caller must ensure that this operation is permitted. 
+ Bind(ctx context.Context, dir *Inode, name string, data unix.BoundEndpoint, perm FilePermissions) error + + // BoundEndpoint returns the socket endpoint at path stored in + // or generated by an Inode. + // + // The path is only relevant for generated endpoint because stored + // endpoints already know their path. It is ok for the endpoint to + // hold onto their path because the only way to change a bind + // address is to rebind the socket. + // + // This is valid iff the type of the Inode is a Socket, which + // generally implies that this Inode was created via CreateSocket. + // + // If there is no socket endpoint available, nil will be returned. + BoundEndpoint(inode *Inode, path string) unix.BoundEndpoint + + // GetFile returns a new open File backed by a Dirent and FileFlags. + // It may block as long as it is done with ctx. + // + // The returned File will uniquely back an application fd. + GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File, error) + + // UnstableAttr returns the most up-to-date "unstable" attributes of + // an Inode, where "unstable" means that they change in response to + // file system events. + UnstableAttr(ctx context.Context, inode *Inode) (UnstableAttr, error) + + // Getxattr retrieves the value of extended attribute name. Inodes that + // do not support extended attributes return EOPNOTSUPP. Inodes that + // support extended attributes but don't have a value at name return + // ENODATA. + Getxattr(inode *Inode, name string) ([]byte, error) + + // Setxattr sets the value of extended attribute name. Inodes that + // do not support extended attributes return EOPNOTSUPP. + Setxattr(inode *Inode, name string, value []byte) error + + // Listxattr returns the set of all extended attributes names that + // have values. Inodes that do not support extended attributes return + // EOPNOTSUPP. 
+ Listxattr(inode *Inode) (map[string]struct{}, error) + + // Check determines whether an Inode can be accessed with the + // requested permission mask using the context (which gives access + // to Credentials and UserNamespace). + Check(ctx context.Context, inode *Inode, p PermMask) bool + + // SetPermissions sets new permissions for an Inode. Returns false + // if it was not possible to set the new permissions. + // + // The caller must ensure that this operation is permitted. + SetPermissions(ctx context.Context, inode *Inode, f FilePermissions) bool + + // SetOwner sets the ownership for this file. + // + // If either UID or GID are set to auth.NoID, its value will not be + // changed. + // + // The caller must ensure that this operation is permitted. + SetOwner(ctx context.Context, inode *Inode, owner FileOwner) error + + // SetTimestamps sets the access and modification timestamps of an + // Inode according to the access and modification times in the TimeSpec. + // + // If either ATimeOmit or MTimeOmit is set, then the corresponding + // timestamp is not updated. + // + // If either ATimeSetSystemTime or MTimeSetSystemTime is true, that + // timestamp is set to the current time instead. + // + // The caller must ensure that this operation is permitted. + SetTimestamps(ctx context.Context, inode *Inode, ts TimeSpec) error + + // Truncate changes the size of an Inode. Truncate should not check + // permissions internally, as it is used for both sys_truncate and + // sys_ftruncate. + // + // Implementations need not check that length >= 0. + Truncate(ctx context.Context, inode *Inode, size int64) error + + // WriteOut writes cached Inode state to a backing filesystem in a + // synchronous manner. + // + // File systems that do not cache metadata or data via an Inode + // implement WriteOut as a no-op. File systems that are entirely in + // memory also implement WriteOut as a no-op. 
Otherwise file systems + // call Inode.Sync to write back page cached data and cached metadata + // followed by syncing writeback handles. + // + // It derives from include/linux/fs.h:super_operations->write_inode. + WriteOut(ctx context.Context, inode *Inode) error + + // Readlink reads the symlink path of an Inode. + // + // Readlink is permitted to return a different path depending on ctx, + // the request originator. + // + // The caller must ensure that this operation is permitted. + // + // Readlink should check that Inode is a symlink and its content is + // at least readable. + Readlink(ctx context.Context, inode *Inode) (string, error) + + // Getlink resolves a symlink to a target *Dirent. + // + // Filesystems that can resolve the link by walking to the path returned + // by Readlink should return (nil, ErrResolveViaReadlink), which + // triggers link resolution via Realink and Lookup. + // + // Some links cannot be followed by Lookup. In this case, Getlink can + // return the Dirent of the link target. The caller holds a reference + // to the Dirent. Filesystems that return a non-nil *Dirent from Getlink + // cannot participate in an overlay because it is impossible for the + // overlay to ascertain whether or not the *Dirent should contain an + // overlayEntry. + // + // Any error returned from Getlink other than ErrResolveViaReadlink + // indicates the caller's inability to traverse this Inode as a link + // (e.g. syserror.ENOLINK indicates that the Inode is not a link, + // syscall.EPERM indicates that traversing the link is not allowed, etc). + Getlink(context.Context, *Inode) (*Dirent, error) + + // Mappable returns a memmap.Mappable that provides memory mappings of the + // Inode's data. Mappable may return nil if this is not supported. The + // returned Mappable must remain valid until InodeOperations.Release is + // called. + Mappable(*Inode) memmap.Mappable + + // The below methods require cleanup. 
+ + // AddLink increments the hard link count of an Inode. + // + // Remove in favor of Inode.IncLink. + AddLink() + + // DropLink decrements the hard link count of an Inode. + // + // Remove in favor of Inode.DecLink. + DropLink() + + // NotifyStatusChange sets the status change time to the current time. + // + // Remove in favor of updating the Inode's cached status change time. + NotifyStatusChange(ctx context.Context) + + // IsVirtual indicates whether or not this corresponds to a virtual + // resource. + // + // If IsVirtual returns true, then caching will be disabled for this + // node, and fs.Dirent.Freeze() will not stop operations on the node. + // + // Remove in favor of freezing specific mounts. + IsVirtual() bool + + // StatFS returns a filesystem Info implementation or an error. If + // the filesystem does not support this operation (maybe in the future + // it will), then ENOSYS should be returned. + // + // Move to MountSourceOperations. + StatFS(context.Context) (Info, error) + + HandleOperations +} + +// HandleOperations are extended InodeOperations that are only implemented +// for file systems that use fs/handle.go:Handle to generate open Files. +// +// Handle is deprecated; these methods are deprecated as well. +// +// Filesystems are encouraged to implement the File interface directly +// instead of using Handle. To indicate that the below methods should never +// be called, embed DeprecatedFileOperations to satisfy this interface. +type HandleOperations interface { + waiter.Waitable + + // DeprecatedPreadv is deprecated in favor of filesystems + // implementing File.Preadv directly. + // + // DeprecatedPreadv reads up to dst.NumBytes() bytes into dst, starting at + // the given offset, and returns the number of bytes read. + // + // Preadv may return a partial read result before EOF is reached. + // + // If a symlink, Preadv reads the target value of the symlink. + // + // Preadv should not check for readable permissions. 
+ DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) + + // DeprecatedPwritev is deprecated in favor of filesystems + // implementing File.Pwritev directly. + // + // DeprecatedPwritev writes up to src.NumBytes() bytes from src to the + // Inode, starting at the given offset and returns the number of bytes + // written. + // + // Pwritev should not check that the Inode has writable permissions. + DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) + + // DeprecatedReaddir is deprecated in favor of filesystems + // implementing File.Readdir directly. + // + // DeprecatedReaddir emits directory entries by calling dirCtx.EmitDir, + // beginning with the entry at offset. + // + // Entries for "." and ".." must *not* be included. + // + // If the offset returned is the same as the argument offset, then + // nothing has been serialized. This is equivalent to reaching EOF. + // In this case serializer.Written() should return 0. + // + // The order of entries to emit must be consistent between Readdir + // calls, and must start with the given offset. + // + // The caller must ensure that this operation is permitted. + DeprecatedReaddir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) + + // DeprecatedFsync is deprecated in favor of filesystems implementing + // File.Fsync directly. + // + // DeprecatedFsync syncs a file. + DeprecatedFsync() error + + // DeprecatedMappable is deprecated in favor of filesystems implementing + // File.Mappable directly. + // + // DeprecatedMappable returns a Mappable if the Inode can be mapped. + DeprecatedMappable(ctx context.Context, inode *Inode) (memmap.Mappable, bool) + + // DeprecatedFlush is deprecated in favor of filesystems implementing + // File.Flush directly. + // + // DeprecatedFlush flushes a file. + // + // Implementations may choose to free up memory or complete pending I/O + // but also may implement Flush as a no-op. 
+ DeprecatedFlush() error +} diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go new file mode 100644 index 000000000..343150bb8 --- /dev/null +++ b/pkg/sentry/fs/inode_overlay.go @@ -0,0 +1,555 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "strings" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +func overlayHasWhiteout(parent *Inode, name string) bool { + buf, err := parent.Getxattr(XattrOverlayWhiteout(name)) + return err == nil && string(buf) == "y" +} + +func overlayCreateWhiteout(parent *Inode, name string) error { + return parent.InodeOperations.Setxattr(parent, XattrOverlayWhiteout(name), []byte("y")) +} + +func overlayWriteOut(ctx context.Context, o *overlayEntry) error { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper == nil { + return nil + } + return o.upper.InodeOperations.WriteOut(ctx, o.upper) +} + +func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name string) (*Dirent, error) { + parent.copyMu.RLock() + defer parent.copyMu.RUnlock() + + // Assert that there is at least one upper or lower entry. 
+ if parent.upper == nil && parent.lower == nil { + panic("invalid overlayEntry, needs at least one Inode") + } + + var upperInode *Inode + var lowerInode *Inode + + // Does the parent directory exist in the upper file system? + if parent.upper != nil { + // First check if a file object exists in the upper file system. + // A file could have been created over a whiteout, so we need to + // check if something exists in the upper file system first. + child, err := parent.upper.Lookup(ctx, name) + if err != nil && err != syserror.ENOENT { + // We encountered an error that an overlay cannot handle, + // we must propagate it to the caller. + return nil, err + } + if child != nil { + defer child.DecRef() + + // Is the child non-negative? + if !child.IsNegative() { + upperInode = child.Inode + upperInode.IncRef() + } + } + + // Are we done? + if overlayHasWhiteout(parent.upper, name) { + if upperInode == nil { + return NewNegativeDirent(name), nil + } + entry, err := newOverlayEntry(ctx, upperInode, nil, false) + if err != nil { + // Don't leak resources. + upperInode.DecRef() + return nil, err + } + return NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil + } + } + + // Check the lower file system. We do this unconditionally (even for + // non-directories) because we may need to use stable attributes from + // the lower filesystem (e.g. device number, inode number) that were + // visible before a copy up. + if parent.lower != nil { + // Check the lower file system. + child, err := parent.lower.Lookup(ctx, name) + // Same song and dance as above. + if err != nil && err != syserror.ENOENT { + // Don't leak resources. + if upperInode != nil { + upperInode.DecRef() + } + return nil, err + } + if child != nil { + defer child.DecRef() + + // Is the child negative? + if !child.IsNegative() { + // Did we find something in the upper filesystem? We can + // only use it if the types match. 
+ if upperInode == nil || upperInode.StableAttr.Type == child.Inode.StableAttr.Type { + lowerInode = child.Inode + lowerInode.IncRef() + } + } + } + } + + // Was all of this for naught? + if upperInode == nil && lowerInode == nil { + // Return a negative Dirent indicating that nothing was found. + return NewNegativeDirent(name), nil + } + + // Did we find a lower Inode? Remember this because we may decide we don't + // actually need the lower Inode (see below). + lowerExists := lowerInode != nil + + // If we found something in the upper filesystem and the lower filesystem, + // use the stable attributes from the lower filesystem. If we don't do this, + // then it may appear that the file was magically recreated across copy up. + if upperInode != nil && lowerInode != nil { + // Steal attributes. + upperInode.StableAttr = lowerInode.StableAttr + + // For non-directories, the lower filesystem resource is strictly + // unnecessary because we don't need to copy-up and we will always + // operate (e.g. read/write) on the upper Inode. + if !IsDir(upperInode.StableAttr) { + lowerInode.DecRef() + lowerInode = nil + } + } + + // Phew, finally done. + entry, err := newOverlayEntry(ctx, upperInode, lowerInode, lowerExists) + if err != nil { + // Well, not quite, we failed at the last moment, how depressing. + // Be sure not to leak resources. + if upperInode != nil { + upperInode.DecRef() + } + if lowerInode != nil { + lowerInode.DecRef() + } + return nil, err + } + return NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil +} + +func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name string, flags FileFlags, perm FilePermissions) (*File, error) { + // Dirent.Create takes renameMu if the Inode is an overlay Inode. 
+ if err := copyUpLockedForRename(ctx, parent); err != nil { + return nil, err + } + + upperFile, err := o.upper.InodeOperations.Create(ctx, o.upper, name, flags, perm) + if err != nil { + return nil, err + } + + // Take another reference on the upper file's inode, which will be + // owned by the overlay entry. + upperFile.Dirent.Inode.IncRef() + entry, err := newOverlayEntry(ctx, upperFile.Dirent.Inode, nil, false) + if err != nil { + cleanupUpper(ctx, o.upper, name) + return nil, err + } + + // NOTE: Replace the Dirent with a transient Dirent, since + // we are about to create the real Dirent: an overlay Dirent. + // + // This ensures the *fs.File returned from overlayCreate is in the same + // state as the *fs.File returned by overlayGetFile, where the upper + // file has a transient Dirent. + // + // This is necessary for Save/Restore, as otherwise the upper Dirent + // (which has no path as it is unparented and never reachable by the + // user) will clobber the real path for the underlying Inode. + upperFile.Dirent.Inode.IncRef() + upperDirent := NewTransientDirent(upperFile.Dirent.Inode) + upperFile.Dirent.DecRef() + upperFile.Dirent = upperDirent + + // Create the overlay inode and dirent. We need this to construct the + // overlay file. + overlayInode := newOverlayInode(ctx, entry, parent.Inode.MountSource) + // d will own the inode reference. + overlayDirent := NewDirent(overlayInode, name) + // The overlay file created below with NewFile will take a reference on + // the overlayDirent, and it should be the only thing holding a + // reference at the time of creation, so we must drop this reference. + defer overlayDirent.DecRef() + + // Create a new overlay file that wraps the upper file. 
+ flags.Pread = upperFile.Flags().Pread + flags.Pwrite = upperFile.Flags().Pwrite + overlayFile := NewFile(ctx, overlayDirent, flags, &overlayFileOperations{upper: upperFile}) + + return overlayFile, nil +} + +func overlayCreateDirectory(ctx context.Context, o *overlayEntry, parent *Dirent, name string, perm FilePermissions) error { + // Dirent.CreateDirectory takes renameMu if the Inode is an overlay + // Inode. + if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + return o.upper.InodeOperations.CreateDirectory(ctx, o.upper, name, perm) +} + +func overlayCreateLink(ctx context.Context, o *overlayEntry, parent *Dirent, oldname string, newname string) error { + // Dirent.CreateLink takes renameMu if the Inode is an overlay Inode. + if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + return o.upper.InodeOperations.CreateLink(ctx, o.upper, oldname, newname) +} + +func overlayCreateHardLink(ctx context.Context, o *overlayEntry, parent *Dirent, target *Dirent, name string) error { + // Dirent.CreateHardLink takes renameMu if the Inode is an overlay + // Inode. + if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + if err := copyUpLockedForRename(ctx, target); err != nil { + return err + } + return o.upper.InodeOperations.CreateHardLink(ctx, o.upper, target.Inode.overlay.upper, name) +} + +func overlayCreateFifo(ctx context.Context, o *overlayEntry, parent *Dirent, name string, perm FilePermissions) error { + // Dirent.CreateFifo takes renameMu if the Inode is an overlay Inode. + if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + return o.upper.InodeOperations.CreateFifo(ctx, o.upper, name, perm) +} + +func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child *Dirent) error { + // Dirent.Remove and Dirent.RemoveDirectory take renameMu if the Inode + // is an overlay Inode. 
+ if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + child.Inode.overlay.copyMu.RLock() + defer child.Inode.overlay.copyMu.RUnlock() + if child.Inode.overlay.upper != nil { + if child.Inode.StableAttr.Type == Directory { + if err := o.upper.InodeOperations.RemoveDirectory(ctx, o.upper, child.name); err != nil { + return err + } + } else { + if err := o.upper.InodeOperations.Remove(ctx, o.upper, child.name); err != nil { + return err + } + } + } + if child.Inode.overlay.lowerExists { + return overlayCreateWhiteout(o.upper, child.name) + } + return nil +} + +func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string) error { + // To be able to copy these up below, they have to be part of an + // overlay file system. + // + // Maybe some day we can allow the more complicated case of + // non-overlay X overlay renames, but that's not necessary right now. + if renamed.Inode.overlay == nil || newParent.Inode.overlay == nil || oldParent.Inode.overlay == nil { + return syserror.EXDEV + } + + // Check here if the file to be replaced exists and is a non-empty + // directory. If we copy up first, we may end up copying the directory + // but none of its children, so the directory will appear empty in the + // upper fs, which will then allow the rename to proceed when it should + // return ENOTEMPTY. + replaced, err := newParent.Inode.Lookup(ctx, newName) + if err != nil && err != syserror.ENOENT { + return err + } + if err == nil && !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { + children, err := readdirOne(ctx, replaced) + if err != nil { + return err + } + + // readdirOne ensures that "." and ".." are not + // included among the returned children, so we don't + // need to bother checking for them. 
+ if len(children) > 0 { + return syserror.ENOTEMPTY + } + } + if err := copyUpLockedForRename(ctx, renamed); err != nil { + return err + } + if err := copyUpLockedForRename(ctx, newParent); err != nil { + return err + } + oldName := renamed.name + if err := o.upper.InodeOperations.Rename(ctx, oldParent.Inode.overlay.upper, oldName, newParent.Inode.overlay.upper, newName); err != nil { + return err + } + if renamed.Inode.overlay.lowerExists { + return overlayCreateWhiteout(oldParent.Inode.overlay.upper, oldName) + } + return nil +} + +func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.BoundEndpoint, perm FilePermissions) error { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + // We do not support doing anything exciting with sockets unless there + // is already a directory in the upper filesystem. + if o.upper == nil { + return syserror.EOPNOTSUPP + } + return o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm) +} + +func overlayBoundEndpoint(o *overlayEntry, path string) unix.BoundEndpoint { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if o.upper != nil { + return o.upper.InodeOperations.BoundEndpoint(o.upper, path) + } + // If a socket is already in the lower file system, allow connections + // to it. 
+ return o.lower.InodeOperations.BoundEndpoint(o.lower, path) +} + +func overlayGetFile(ctx context.Context, o *overlayEntry, d *Dirent, flags FileFlags) (*File, error) { + if flags.Write { + if err := copyUp(ctx, d); err != nil { + return nil, err + } + } + + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if o.upper != nil { + upper, err := overlayFile(ctx, o.upper, flags) + if err != nil { + return nil, err + } + flags.Pread = upper.Flags().Pread + flags.Pwrite = upper.Flags().Pwrite + return NewFile(ctx, d, flags, &overlayFileOperations{upper: upper}), nil + } + + lower, err := overlayFile(ctx, o.lower, flags) + if err != nil { + return nil, err + } + flags.Pread = lower.Flags().Pread + flags.Pwrite = lower.Flags().Pwrite + return NewFile(ctx, d, flags, &overlayFileOperations{lower: lower}), nil +} + +func overlayUnstableAttr(ctx context.Context, o *overlayEntry) (UnstableAttr, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.UnstableAttr(ctx) + } + return o.lower.UnstableAttr(ctx) +} + +func overlayGetxattr(o *overlayEntry, name string) ([]byte, error) { + // Don't forward the value of the extended attribute if it would + // unexpectedly change the behavior of a wrapping overlay layer. + if strings.HasPrefix(name, XattrOverlayPrefix) { + return nil, syserror.ENODATA + } + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.Getxattr(name) + } + return o.lower.Getxattr(name) +} + +func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + var names map[string]struct{} + var err error + if o.upper != nil { + names, err = o.upper.Listxattr() + } else { + names, err = o.lower.Listxattr() + } + for name := range names { + // Same as overlayGetxattr, we shouldn't forward along + // overlay attributes. 
+ if strings.HasPrefix(name, XattrOverlayPrefix) { + delete(names, name) + } + } + return names, err +} + +func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.check(ctx, p) + } + if p.Write { + // Since writes will be redirected to the upper filesystem, the lower + // filesystem need not be writable, but must be readable for copy-up. + p.Write = false + p.Read = true + } + return o.lower.check(ctx, p) +} + +func overlaySetPermissions(ctx context.Context, o *overlayEntry, d *Dirent, f FilePermissions) bool { + if err := copyUp(ctx, d); err != nil { + return false + } + return o.upper.InodeOperations.SetPermissions(ctx, o.upper, f) +} + +func overlaySetOwner(ctx context.Context, o *overlayEntry, d *Dirent, owner FileOwner) error { + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.InodeOperations.SetOwner(ctx, o.upper, owner) +} + +func overlaySetTimestamps(ctx context.Context, o *overlayEntry, d *Dirent, ts TimeSpec) error { + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.InodeOperations.SetTimestamps(ctx, o.upper, ts) +} + +func overlayTruncate(ctx context.Context, o *overlayEntry, d *Dirent, size int64) error { + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.InodeOperations.Truncate(ctx, o.upper, size) +} + +func overlayReadlink(ctx context.Context, o *overlayEntry) (string, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.Readlink(ctx) + } + return o.lower.Readlink(ctx) +} + +func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) { + var dirent *Dirent + var err error + + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if o.upper != nil { + dirent, err = o.upper.Getlink(ctx) + } else { + dirent, err = o.lower.Getlink(ctx) + } + if dirent != nil { + // This dirent is likely bogus (its Inode likely doesn't contain + 
// the right overlayEntry). So we're forced to drop it on the + // ground and claim that jumping around the filesystem like this + // is not supported. + name, _ := dirent.FullName(nil) + dirent.DecRef() + + // Claim that the path is not accessible. + err = syserror.EACCES + log.Warningf("Getlink not supported in overlay for %q", name) + } + return nil, err +} + +func overlayStatFS(ctx context.Context, o *overlayEntry) (Info, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + var i Info + var err error + if o.upper != nil { + i, err = o.upper.StatFS(ctx) + } else { + i, err = o.lower.StatFS(ctx) + } + if err != nil { + return Info{}, err + } + + i.Type = linux.OVERLAYFS_SUPER_MAGIC + + return i, nil +} + +func overlayHandleOps(o *overlayEntry) HandleOperations { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.HandleOps() + } + return o.lower.HandleOps() +} + +// NewTestOverlayDir returns an overlay Inode for tests. +func NewTestOverlayDir(ctx context.Context, upper *Inode, lower *Inode) *Inode { + fs := &overlayFilesystem{} + msrc := NewMountSource(&overlayMountSourceOperations{ + upper: NewNonCachingMountSource(fs, MountSourceFlags{}), + lower: NewNonCachingMountSource(fs, MountSourceFlags{}), + }, fs, MountSourceFlags{}) + overlay := &overlayEntry{ + upper: upper, + lower: lower, + } + return newOverlayInode(ctx, overlay, msrc) +} + +// TestHasUpperFS returns true if i is an overlay Inode and it has a pointer +// to an Inode on an upper filesystem. +func (i *Inode) TestHasUpperFS() bool { + return i.overlay != nil && i.overlay.upper != nil +} + +// TestHasLowerFS returns true if i is an overlay Inode and it has a pointer +// to an Inode on a lower filesystem. 
+func (i *Inode) TestHasLowerFS() bool { + return i.overlay != nil && i.overlay.lower != nil +} diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go new file mode 100644 index 000000000..684d54bd2 --- /dev/null +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -0,0 +1,251 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs_test + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func TestLookup(t *testing.T) { + ctx := contexttest.Context(t) + for _, test := range []struct { + // Test description. + desc string + + // Lookup parameters. + dir *fs.Inode + name string + + // Want from lookup. 
+ err error + found bool + hasUpper bool + hasLower bool + }{ + { + desc: "no upper, lower has name", + dir: fs.NewTestOverlayDir(ctx, + nil, /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: false, + hasLower: true, + }, + { + desc: "no lower, upper has name", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* upper */ + nil, /* lower */ + ), + name: "a", + found: true, + hasUpper: true, + hasLower: false, + }, + { + desc: "upper and lower, only lower has name", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "b", + dir: false, + }, + }, nil), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: false, + hasLower: true, + }, + { + desc: "upper and lower, only upper has name", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "b", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: true, + hasLower: false, + }, + { + desc: "upper and lower, both have file", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: true, + hasLower: false, + }, + { + desc: "upper and lower, both have directory", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: true, + }, + }, nil), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: true, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: true, + hasLower: true, + }, + { + desc: 
"upper and lower, upper negative masks lower file", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, nil, []string{"a"}), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: false, + hasUpper: false, + hasLower: false, + }, + { + desc: "upper and lower, upper negative does not mask lower file", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, nil, []string{"b"}), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: false, + hasLower: true, + }, + } { + t.Run(test.desc, func(t *testing.T) { + dirent, err := test.dir.Lookup(ctx, test.name) + if err != test.err { + t.Fatalf("lookup got error %v, want %v", err, test.err) + } + if test.found && dirent.IsNegative() { + t.Fatalf("lookup expected to find %q, got negative dirent", test.name) + } + if !test.found { + return + } + if hasUpper := dirent.Inode.TestHasUpperFS(); hasUpper != test.hasUpper { + t.Fatalf("lookup got upper filesystem %v, want %v", hasUpper, test.hasUpper) + } + if hasLower := dirent.Inode.TestHasLowerFS(); hasLower != test.hasLower { + t.Errorf("lookup got lower filesystem %v, want %v", hasLower, test.hasLower) + } + }) + } +} + +type dir struct { + fs.InodeOperations + + // list of negative child names. 
+ negative []string +} + +func (d *dir) Getxattr(inode *fs.Inode, name string) ([]byte, error) { + for _, n := range d.negative { + if name == fs.XattrOverlayWhiteout(n) { + return []byte("y"), nil + } + } + return nil, syserror.ENOATTR +} + +type dirContent struct { + name string + dir bool +} + +func newTestRamfsDir(ctx context.Context, contains []dirContent, negative []string) *fs.Inode { + msrc := fs.NewCachingMountSource(nil, fs.MountSourceFlags{}) + contents := make(map[string]*fs.Inode) + for _, c := range contains { + if c.dir { + contents[c.name] = newTestRamfsDir(ctx, nil, nil) + } else { + contents[c.name] = fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), msrc, fs.StableAttr{Type: fs.RegularFile}) + } + } + dops := ramfstest.NewDir(ctx, contents, fs.FilePermissions{ + User: fs.PermMask{Read: true, Execute: true}, + }) + return fs.NewInode(&dir{ + InodeOperations: dops, + negative: negative, + }, msrc, fs.StableAttr{Type: fs.Directory}) +} diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go new file mode 100644 index 000000000..9f50cb800 --- /dev/null +++ b/pkg/sentry/fs/inotify.go @@ -0,0 +1,329 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fs + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Inotify represents an inotify instance created by inotify_init(2) or +// inotify_init1(2). Inotify implements the FileOperations interface. +// +// Lock ordering: +// Inotify.mu -> Inode.Watches.mu -> Watch.mu -> Inotify.evMu +type Inotify struct { + // Unique identifier for this inotify instance. We don't just reuse the + // inotify fd because fds can be duped. These should not be exposed to the + // user, since we may aggressively reuse an id on S/R. + id uint64 + + // evMu *only* protects the event queue. We need a separate lock because + // while queuing events, a watch needs to lock the event queue, and using mu + // for that would violate lock ordering since at that point the calling + // goroutine already holds Watch.target.Watches.mu. + evMu sync.Mutex `state:"nosave"` + + waiter.Queue `state:"nosave"` + + // A list of pending events for this inotify instance. Protected by evMu. + events ilist.List + + // A scratch buffer, use to serialize inotify events. Use allocate this + // ahead of time and reuse performance. Protected by evMu. + scratch []byte + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // The next watch descriptor number to use for this inotify instance. Note + // that Linux starts numbering watch descriptors from 1. + nextWatch int32 + + // Map from watch descriptors to watch objects. + watches map[int32]*Watch +} + +// NewInotify constructs a new Inotify instance. 
+func NewInotify(ctx context.Context) *Inotify { + return &Inotify{ + id: uniqueid.GlobalFromContext(ctx), + scratch: make([]byte, inotifyEventBaseSize), + nextWatch: 1, // Linux starts numbering watch descriptors from 1. + watches: make(map[int32]*Watch), + } +} + +// Release implements FileOperations.Release. Release removes all watches and +// frees all resources for an inotify instance. +func (i *Inotify) Release() { + // We need to hold i.mu to avoid a race with concurrent calls to + // Inotify.targetDestroyed from Watches. There's no risk of Watches + // accessing this Inotify after the destructor ends, because we remove all + // references to it below. + i.mu.Lock() + defer i.mu.Unlock() + for _, w := range i.watches { + // Remove references to the watch from the watch target. We don't need + // to worry about the references from the owner instance, since we're in + // the owner's destructor. + w.target.Watches.Remove(w.ID()) + // Don't leak any references to the target, held by pins in the watch. + w.destroy() + } +} + +// Readiness implements waiter.Waitable.Readiness. +// +// Readiness indicates whether there are pending events for an inotify instance. +func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + i.evMu.Lock() + defer i.evMu.Unlock() + + if !i.events.Empty() { + ready |= waiter.EventIn + } + + return mask & ready +} + +// Seek implements FileOperations.Seek. +func (*Inotify) Seek(context.Context, *File, SeekWhence, int64) (int64, error) { + return 0, syserror.ESPIPE +} + +// Readdir implements FileOperatons.Readdir. +func (*Inotify) Readdir(context.Context, *File, DentrySerializer) (int64, error) { + return 0, syserror.ENOTDIR +} + +// Write implements FileOperations.Write. +func (*Inotify) Write(context.Context, *File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements FileOperations.Read. 
+func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() < inotifyEventBaseSize { + return 0, syserror.EINVAL + } + + i.evMu.Lock() + defer i.evMu.Unlock() + + if i.events.Empty() { + // Nothing to read yet, tell caller to block. + return 0, syserror.ErrWouldBlock + } + + var writeLen int64 + for e := i.events.Front(); e != nil; e = e.Next() { + event := e.(*Event) + + // Does the buffer have enough remaining space to hold the event we're + // about to write out? + if dst.NumBytes() < int64(event.sizeOf()) { + if writeLen > 0 { + // Buffer wasn't big enough for all pending events, but we did + // write some events out. + return writeLen, nil + } + return 0, syserror.EINVAL + } + + // Linux always dequeues an available event as long as there's enough + // buffer space to copy it out, even if the copy below fails. Emulate + // this behaviour. + i.events.Remove(e) + + // Buffer has enough space, copy event to the read buffer. + n, err := event.CopyTo(ctx, i.scratch, dst) + if err != nil { + return 0, err + } + + writeLen += n + dst = dst.DropFirst64(n) + } + return writeLen, nil +} + +// Fsync implements FileOperations.Fsync. +func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error { + return syserror.EINVAL +} + +// Flush implements FileOperations.Flush. +func (*Inotify) Flush(context.Context, *File) error { + return nil +} + +// ConfigureMMap implements FileOperations.ConfigureMMap. +func (*Inotify) ConfigureMMap(context.Context, *File, *memmap.MMapOpts) error { + return syserror.ENODEV +} + +// Ioctl implements fs.FileOperations.Ioctl. 
+func (i *Inotify) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Int() { + case linux.FIONREAD: + i.evMu.Lock() + defer i.evMu.Unlock() + var n uint32 + for e := i.events.Front(); e != nil; e = e.Next() { + event := e.(*Event) + n += uint32(event.sizeOf()) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], n) + _, err := io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func (i *Inotify) queueEvent(ev *Event) { + i.evMu.Lock() + defer i.evMu.Unlock() + + // Check if we should coalesce the event we're about to queue with the last + // one currently in the queue. Events are coalesced if they are identical. + if last := i.events.Back(); last != nil { + if ev.equals(last.(*Event)) { + // "Coalesce" the two events by simply not queuing the new one. We + // don't need to raise a waiter.EventIn notification because no new + // data is available for reading. + return + } + } + + i.events.PushBack(ev) + i.Queue.Notify(waiter.EventIn) +} + +// newWatchLocked creates and adds a new watch to target. +func (i *Inotify) newWatchLocked(target *Dirent, mask uint32) *Watch { + wd := i.nextWatch + i.nextWatch++ + + watch := &Watch{ + owner: i, + wd: wd, + mask: mask, + target: target.Inode, + pins: make(map[*Dirent]bool), + } + + i.watches[wd] = watch + + // Grab an extra reference to target to prevent it from being evicted from + // memory. This ref is dropped during either watch removal, target + // destruction, or inotify instance destruction. See callers of Watch.Unpin. + watch.Pin(target) + target.Inode.Watches.Add(watch) + + return watch +} + +// targetDestroyed is called by w to notify i that w's target is gone. This +// automatically generates a watch removal event. 
+func (i *Inotify) targetDestroyed(w *Watch) { + i.mu.Lock() + _, found := i.watches[w.wd] + delete(i.watches, w.wd) + i.mu.Unlock() + + if found { + i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0)) + } +} + +// AddWatch constructs a new inotify watch and adds it to the target dirent. It +// returns the watch descriptor returned by inotify_add_watch(2). +func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 { + // Note: Locking this inotify instance protects the result returned by + // Lookup() below. With the lock held, we know for sure the lookup result + // won't become stale because it's impossible for *this* instance to + // add/remove watches on target. + i.mu.Lock() + defer i.mu.Unlock() + + // Does the target already have a watch from this inotify instance? + if existing := target.Inode.Watches.Lookup(i.id); existing != nil { + // This may be a watch on a different dirent pointing to the + // same inode. Obtain an extra reference if necessary. + existing.Pin(target) + + if mergeMask := mask&linux.IN_MASK_ADD != 0; mergeMask { + // "Add (OR) events to watch mask for this pathname if it already + // exists (instead of replacing mask)." -- inotify(7) + existing.mask |= mask + } else { + existing.mask = mask + } + return existing.wd + } + + // No existing watch, create a new watch. + watch := i.newWatchLocked(target, mask) + return watch.wd +} + +// RmWatch implements watcher.Watchable.RmWatch. +// +// RmWatch looks up an inotify watch for the given 'wd' and configures the +// target dirent to stop sending events to this inotify instance. +func (i *Inotify) RmWatch(wd int32) error { + i.mu.Lock() + + // Find the watch we were asked to remove. + watch, ok := i.watches[wd] + if !ok { + i.mu.Unlock() + return syserror.EINVAL + } + + // Remove the watch from this instance. + delete(i.watches, wd) + + // Remove the watch from the watch target. 
+ watch.target.Watches.Remove(watch.ID()) + + // The watch is now isolated and we can safely drop the instance lock. We + // need to do so because watch.destroy() acquires Watch.mu, which cannot be + // aquired with Inotify.mu held. + i.mu.Unlock() + + // Generate the event for the removal. + i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) + + // Remove all pins. + watch.destroy() + + return nil +} diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go new file mode 100644 index 000000000..217915ba4 --- /dev/null +++ b/pkg/sentry/fs/inotify_event.go @@ -0,0 +1,138 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// inotifyEventBaseSize is the base size of linux's struct inotify_event. This +// must be a power 2 for rounding below. +const inotifyEventBaseSize = 16 + +// Event represents a struct inotify_event from linux. +type Event struct { + ilist.Entry + + wd int32 + mask uint32 + cookie uint32 + + // len is computed based on the name field is set automatically by + // Event.setName. It should be 0 when no name is set; otherwise it is the + // length of the name slice. + len uint32 + + // The name field has special padding requirements and should only be set by + // calling Event.setName. 
+ name []byte +} + +func newEvent(wd int32, name string, events, cookie uint32) *Event { + e := &Event{ + wd: wd, + mask: events, + cookie: cookie, + } + if name != "" { + e.setName(name) + } + return e +} + +// paddedBytes converts a go string to a null-terminated c-string, padded with +// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes +// in the 's' plus at least one null byte. +func paddedBytes(s string, l uint32) []byte { + if l < uint32(len(s)+1) { + panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!") + } + b := make([]byte, l) + copy(b, s) + + // b was zero-value initialized during make(), so the rest of the slice is + // already filled with null bytes. + + return b +} + +// setName sets the optional name for this event. +func (e *Event) setName(name string) { + // We need to pad the name such that the entire event length ends up a + // multiple of inotifyEventBaseSize. + unpaddedLen := len(name) + 1 + // Round up to nearest multiple of inotifyEventBaseSize. + e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1)) + // Make sure we haven't overflowed and wrapped around when rounding. + if unpaddedLen > int(e.len) { + panic("Overflow when rounding inotify event size, the 'name' field was too big.") + } + e.name = paddedBytes(name, e.len) +} + +func (e *Event) sizeOf() int { + s := inotifyEventBaseSize + int(e.len) + if s < inotifyEventBaseSize { + panic("overflow") + } + return s +} + +// CopyTo serializes this event to dst. buf is used as a scratch buffer to +// construct the output. We use a buffer allocated ahead of time for +// performance. buf must be at least inotifyEventBaseSize bytes. 
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) { + usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd)) + usermem.ByteOrder.PutUint32(buf[4:], e.mask) + usermem.ByteOrder.PutUint32(buf[8:], e.cookie) + usermem.ByteOrder.PutUint32(buf[12:], e.len) + + writeLen := 0 + + n, err := dst.CopyOut(ctx, buf) + if err != nil { + return 0, err + } + writeLen += n + dst = dst.DropFirst(n) + + if e.len > 0 { + n, err = dst.CopyOut(ctx, e.name) + if err != nil { + return 0, err + } + writeLen += n + } + + // Sanity check. + if writeLen != e.sizeOf() { + panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %v, wrote %v.", e.sizeOf(), writeLen)) + } + + return int64(writeLen), nil +} + +func (e *Event) equals(other *Event) bool { + return e.wd == other.wd && + e.mask == other.mask && + e.cookie == other.cookie && + e.len == other.len && + bytes.Equal(e.name, other.name) +} diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go new file mode 100644 index 000000000..ff6ec6e3e --- /dev/null +++ b/pkg/sentry/fs/inotify_watch.go @@ -0,0 +1,129 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// Watch represents a particular inotify watch created by inotify_add_watch. 
+// +// While a watch is active, it ensures the target inode is pinned in memory by +// holding an extra ref on each dirent known (by inotify) to point to the +// inode. These are known as pins. For a full discussion, see +// fs/g3doc/inotify.md. +type Watch struct { + // Inotify instance which owns this watch. + owner *Inotify + + // Descriptor for this watch. This is unique across an inotify instance. + wd int32 + + // Events being monitored via this watch. + mask uint32 + + // The inode being watched. Note that we don't directly hold a reference on + // this inode. Instead we hold a reference on the dirent(s) containing the + // inode, which we record in pins. + target *Inode + + // unpinned indicates whether we have a hard reference on target. This field + // may only be modified through atomic ops. + unpinned uint32 + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // pins is the set of dirents this watch is currently pinning in memory by + // holding a reference to them. See Pin()/Unpin(). + pins map[*Dirent]bool +} + +// ID returns the id of the inotify instance that owns this watch. +func (w *Watch) ID() uint64 { + return w.owner.id +} + +// NotifyParentAfterUnlink indicates whether the parent of the watched object +// should continue to be notified of events after the target has been +// unlinked. +func (w *Watch) NotifyParentAfterUnlink() bool { + return w.mask&linux.IN_EXCL_UNLINK == 0 +} + +// isRenameEvent returns true if eventMask describes a rename event. +func isRenameEvent(eventMask uint32) bool { + return eventMask&(linux.IN_MOVED_FROM|linux.IN_MOVED_TO|linux.IN_MOVE_SELF) != 0 +} + +// Notify queues a new event on this watch. +func (w *Watch) Notify(name string, events uint32, cookie uint32) { + unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS + effectiveMask := unmaskableBits | w.mask + matchedEvents := effectiveMask & events + + if matchedEvents == 0 { + // We weren't watching for this event. 
+ return + } + + w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) +} + +// Pin acquires a new ref on dirent, which pins the dirent in memory while +// the watch is active. Calling Pin for a second time on the same dirent for +// the same watch is a no-op. +func (w *Watch) Pin(d *Dirent) { + w.mu.Lock() + defer w.mu.Unlock() + if !w.pins[d] { + w.pins[d] = true + d.IncRef() + } +} + +// Unpin drops any extra refs held on dirent due to a previous Pin +// call. Calling Unpin multiple times for the same dirent, or on a dirent +// without a corresponding Pin call is a no-op. +func (w *Watch) Unpin(d *Dirent) { + w.mu.Lock() + defer w.mu.Unlock() + if w.pins[d] { + delete(w.pins, d) + d.DecRef() + } +} + +// TargetDestroyed notifies the owner of the watch that the watch target is +// gone. The owner should release its own references to the watcher upon +// receiving this notification. +func (w *Watch) TargetDestroyed() { + w.owner.targetDestroyed(w) +} + +// destroy prepares the watch for destruction. It unpins all dirents pinned by +// this watch. Destroy does not cause any new events to be generated. The caller +// is responsible for ensuring there are no outstanding references to this +// watch. 
+func (w *Watch) destroy() { + w.mu.Lock() + defer w.mu.Unlock() + for d := range w.pins { + d.DecRef() + } + w.pins = nil +} diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD new file mode 100644 index 000000000..c15dde800 --- /dev/null +++ b/pkg/sentry/fs/lock/BUILD @@ -0,0 +1,72 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "lock_state", + srcs = [ + "lock.go", + "lock_range.go", + "lock_set.go", + ], + out = "lock_state.go", + package = "lock", +) + +go_template_instance( + name = "lock_range", + out = "lock_range.go", + package = "lock", + prefix = "Lock", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "lock_set", + out = "lock_set.go", + consts = { + "minDegree": "3", + }, + package = "lock", + prefix = "Lock", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "LockRange", + "Value": "Lock", + "Functions": "lockSetFunctions", + }, +) + +go_library( + name = "lock", + srcs = [ + "lock.go", + "lock_range.go", + "lock_set.go", + "lock_set_functions.go", + "lock_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/state", + "//pkg/waiter", + ], +) + +go_test( + name = "lock_test", + size = "small", + srcs = [ + "lock_range_test.go", + "lock_test.go", + ], + embed = [":lock"], +) diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go new file mode 100644 index 000000000..24d54c989 --- /dev/null +++ b/pkg/sentry/fs/lock/lock.go @@ -0,0 +1,457 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lock is the API for POSIX-style advisory regional file locks and +// BSD-style full file locks. +// +// Callers needing to enforce these types of locks, like sys_fcntl, can call +// LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are +// specific to a unique file (unique device/inode pair) and for this reason +// should not be shared between files. +// +// A Lock has a set of holders identified by UniqueID. Normally this is the +// pid of the thread attempting to acquire the lock. +// +// Since these are advisory locks, they do not need to be integrated into +// Reads/Writes and for this reason there is no way to *check* if a lock is +// held. One can only attempt to take a lock or unlock an existing lock. +// +// A Lock in a set of Locks is typed: it is either a read lock with any number +// of readers and no writer, or a write lock with no readers. +// +// As expected from POSIX, any attempt to acquire a write lock on a file region +// when there already exists a write lock held by a different uid will fail. Any +// attempt to acquire a write lock on a file region when there is more than one +// reader will fail. Any attempt to acquire a read lock on a file region when +// there is already a writer will fail. +// +// In special cases, a read lock may be upgraded to a write lock and a write lock +// can be downgraded to a read lock. This can only happen if: +// +// * read lock upgrade to write lock: There can be only one reader and the reader +// must be the same as the requested write lock holder. 
+// +// * write lock downgrade to read lock: The writer must be the same as the requested +// read lock holder. +// +// UnlockRegion always succeeds. If LockRegion fails the caller should normally +// interpret this as "try again later". +package lock + +import ( + "fmt" + "math" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// LockType is a type of regional file lock. +type LockType int + +// UniqueID is a unique identifier of the holder of a regional file lock. +type UniqueID uint64 + +const ( + // ReadLock describes a POSIX regional file lock to be taken + // read only. There may be multiple of these locks on a single + // file region as long as there is no writer lock on the same + // region. + ReadLock LockType = iota + + // WriteLock describes a POSIX regional file lock to be taken + // write only. There may be only a single holder of this lock + // and no read locks. + WriteLock +) + +// LockEOF is the maximal possible end of a regional file lock. +const LockEOF = math.MaxUint64 + +// Lock is a regional file lock. It consists of either a single writer +// or a set of readers. +// +// A Lock may be upgraded from a read lock to a write lock only if there +// is a single reader and that reader has the same uid as the write lock. +// +// A Lock may be downgraded from a write lock to a read lock only if +// the write lock's uid is the same as the read lock. +type Lock struct { + // Readers are the set of read lock holders identified by UniqueID. + // If len(Readers) > 0 then HasWriter must be false. + Readers map[UniqueID]bool + + // HasWriter indicates that this is a write lock held by a single + // UniqueID. + HasWriter bool + + // Writer is only valid if HasWriter is true. It identifies a + // single write lock holder. + Writer UniqueID +} + +// Locks is a thread-safe wrapper around a LockSet. +type Locks struct { + // mu protects locks below. 
+ mu sync.Mutex `state:"nosave"` + + // locks is the set of region locks currently held on an Inode. + locks LockSet + + // blockedQueue is the queue of waiters that are waiting on a lock. + blockedQueue waiter.Queue +} + +// Blocker is the interface used for blocking locks. Passing a nil Blocker +// will be treated as non-blocking. +type Blocker interface { + Block(C chan struct{}) error +} + +const ( + // EventMaskAll is the mask we will always use for locks, by using the + // same mask all the time we can wake up everyone anytime the lock + // changes state. + EventMaskAll waiter.EventMask = 0xFFFF +) + +// LockRegion attempts to acquire a typed lock for the uid on a region +// of a file. Returns true if successful in locking the region. If false +// is returned, the caller should normally interpret this as "try again later" if +// acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode. +// Blocker is the interface used to provide blocking behavior, passing a nil Blocker +// will result in non-blocking behavior. +func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool { + for { + l.mu.Lock() + + // Blocking locks must run in a loop because we'll be woken up whenever an unlock event + // happens for this lock. We will then attempt to take the lock again and if it fails + // continue blocking. + res := l.locks.lock(uid, t, r) + if !res && block != nil { + e, ch := waiter.NewChannelEntry(nil) + l.blockedQueue.EventRegister(&e, EventMaskAll) + l.mu.Unlock() + if err := block.Block(ch); err != nil { + // We were interrupted, the caller can translate this to EINTR if applicable. + l.blockedQueue.EventUnregister(&e) + return false + } + l.blockedQueue.EventUnregister(&e) + continue // Try again now that someone has unlocked. + } + + l.mu.Unlock() + return res + } +} + +// UnlockRegion attempts to release a lock for the uid on a region of a file. 
+// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) { + l.mu.Lock() + defer l.mu.Unlock() + l.locks.unlock(uid, r) + + // Now that we've released the lock, we need to wake up any waiters. + l.blockedQueue.Notify(EventMaskAll) +} + +// makeLock returns a new typed Lock that has either uid as its only reader +// or uid as its only writer. +func makeLock(uid UniqueID, t LockType) Lock { + value := Lock{Readers: make(map[UniqueID]bool)} + switch t { + case ReadLock: + value.Readers[uid] = true + case WriteLock: + value.HasWriter = true + value.Writer = uid + default: + panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) + } + return value +} + +// isHeld returns true if uid is a holder of Lock. +func (l Lock) isHeld(uid UniqueID) bool { + if l.HasWriter && l.Writer == uid { + return true + } + return l.Readers[uid] +} + +// lock sets uid as a holder of a typed lock on Lock. +// +// Preconditions: canLock is true for the range containing this Lock. +func (l *Lock) lock(uid UniqueID, t LockType) { + switch t { + case ReadLock: + // If we are already a reader, then this is a no-op. + if l.Readers[uid] { + return + } + // We cannot downgrade a write lock to a read lock unless the + // uid is the same. + if l.HasWriter { + if l.Writer != uid { + panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) + } + // Ensure that there is only one reader if upgrading. + l.Readers = make(map[UniqueID]bool) + // Ensure that there is no longer a writer. + l.HasWriter = false + } + l.Readers[uid] = true + return + case WriteLock: + // If we are already the writer, then this is a no-op. + if l.HasWriter && l.Writer == uid { + return + } + // We can only upgrade a read lock to a write lock if there + // is only one reader and that reader has the same uid as + // the write lock. 
+ if readers := len(l.Readers); readers > 0 { + if readers != 1 { + panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers)) + } + if !l.Readers[uid] { + panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers)) + } + } + // Ensure that there is only a writer. + l.Readers = make(map[UniqueID]bool) + l.HasWriter = true + l.Writer = uid + default: + panic(fmt.Sprintf("lock: invalid lock type %d", t)) + } +} + +// lockable returns true if check returns true for every Lock in LockRange. +// Further, check should return true if Lock meets the callers requirements +// for locking Lock. +func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool { + // Get our starting point. + seg := l.LowerBoundSegment(r.Start) + for seg.Ok() && seg.Start() < r.End { + // Note that we don't care about overruning the end of the + // last segment because if everything checks out we'll just + // split the last segment. + if !check(seg.Value()) { + return false + } + // Jump to the next segment, ignoring gaps, for the same + // reason we ignored the first gap. + seg = seg.NextSegment() + } + // No conflict, we can get a lock for uid over the entire range. + return true +} + +// canLock returns true if uid will be able to take a Lock of type t on the +// entire range specified by LockRange. +func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { + switch t { + case ReadLock: + return l.lockable(r, func(value Lock) bool { + // If there is no writer, there's no problem adding + // another reader. + if !value.HasWriter { + return true + } + // If there is a writer, then it must be the same uid + // in order to downgrade the lock to a read lock. + return value.Writer == uid + }) + case WriteLock: + return l.lockable(r, func(value Lock) bool { + // If there are only readers. 
+ if !value.HasWriter { + // Then this uid can only take a write lock if + // this is a private upgrade, meaning that the + // only reader is uid. + return len(value.Readers) == 1 && value.Readers[uid] + } + // If the uid is already a writer on this region, then + // adding a write lock would be a no-op. + return value.Writer == uid + }) + default: + panic(fmt.Sprintf("canLock: invalid lock type %d", t)) + } +} + +// lock returns true if uid took a lock of type t on the entire range of LockRange. +// +// Preconditions: r.Start <= r.End (will panic otherwise). +func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { + if r.Start > r.End { + panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End)) + } + + // Don't attempt to insert anything with a range of 0 and treat this + // as a successful no-op. + if r.Length() == 0 { + return true + } + + // Do a first-pass check. We *could* hold onto the segments we + // checked if canLock would return true, but traversing the segment + // set should be fast and this keeps things simple. + if !l.canLock(uid, t, r) { + return false + } + // Get our starting point. + seg, gap := l.Find(r.Start) + if gap.Ok() { + // Fill in the gap and get the next segment to modify. + seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment() + } else if seg.Start() < r.Start { + // Get our first segment to modify. + _, seg = l.Split(seg, r.Start) + } + for seg.Ok() && seg.Start() < r.End { + // Split the last one if necessary. + if seg.End() > r.End { + seg, _ = l.SplitUnchecked(seg, r.End) + } + + // Set the lock on the segment. This is guaranteed to + // always be safe, given canLock above. + value := seg.ValuePtr() + value.lock(uid, t) + + // Fill subsequent gaps. + gap = seg.NextGap() + if gr := gap.Range().Intersect(r); gr.Length() > 0 { + seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment() + } else { + seg = gap.NextSegment() + } + } + return true +} + +// unlock is always successful. 
If uid has no locks held for the range LockRange, +// unlock is a no-op. +// +// Preconditions: same as lock. +func (l *LockSet) unlock(uid UniqueID, r LockRange) { + if r.Start > r.End { + panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End)) + } + + // Same as setlock. + if r.Length() == 0 { + return + } + + // Get our starting point. + seg := l.LowerBoundSegment(r.Start) + for seg.Ok() && seg.Start() < r.End { + // If this segment doesn't have a lock from uid then + // there is no need to fragment the set with Isolate (below). + // In this case just move on to the next segment. + if !seg.Value().isHeld(uid) { + seg = seg.NextSegment() + continue + } + + // Ensure that if we need to unlock a sub-segment that + // we don't unlock/remove that entire segment. + seg = l.Isolate(seg, r) + + value := seg.Value() + var remove bool + if value.HasWriter && value.Writer == uid { + // If we are unlocking a writer, then since there can + // only ever be one writer and no readers, then this + // lock should always be removed from the set. + remove = true + } else if value.Readers[uid] { + // If uid is the last reader, then just remove the entire + // segment. + if len(value.Readers) == 1 { + remove = true + } else { + // Otherwise we need to remove this reader without + // affecting any other segment's readers. To do + // this, we need to make a copy of the Readers map + // and not add this uid. + newValue := Lock{Readers: make(map[UniqueID]bool)} + for k, v := range value.Readers { + if k != uid { + newValue.Readers[k] = v + } + } + seg.SetValue(newValue) + } + } + if remove { + seg = l.Remove(seg).NextSegment() + } else { + seg = seg.NextSegment() + } + } +} + +// ComputeRange takes a positive file offset and computes the start of a LockRange +// using start (relative to offset) and the end of the LockRange using length. 
The +// values of start and length may be negative but the resulting LockRange must +// preserve that LockRange.Start < LockRange.End and LockRange.Start > 0. +func ComputeRange(start, length, offset int64) (LockRange, error) { + offset += start + // fcntl(2): "l_start can be a negative number provided the offset + // does not lie before the start of the file" + if offset < 0 { + return LockRange{}, syscall.EINVAL + } + + // fcntl(2): Specifying 0 for l_len has the special meaning: lock all + // bytes starting at the location specified by l_whence and l_start + // through to the end of file, no matter how large the file grows. + end := uint64(LockEOF) + if length > 0 { + // fcntl(2): If l_len is positive, then the range to be locked + // covers bytes l_start up to and including l_start+l_len-1. + // + // Since LockRange.End is exclusive we need not -1 from length.. + end = uint64(offset + length) + } else if length < 0 { + // fcntl(2): If l_len is negative, the interval described by + // lock covers bytes l_start+l_len up to and including l_start-1. + // + // Since LockRange.End is exclusive we need not -1 from offset. + signedEnd := offset + // Add to offset using a negative length (subtract). + offset += length + if offset < 0 { + return LockRange{}, syscall.EINVAL + } + if signedEnd < offset { + return LockRange{}, syscall.EOVERFLOW + } + // At this point signedEnd cannot be negative, + // since we asserted that offset is not negative + // and it is not less than offset. + end = uint64(signedEnd) + } + // Offset is guaranteed to be positive at this point. + return LockRange{Start: uint64(offset), End: end}, nil +} diff --git a/pkg/sentry/fs/lock/lock_range_test.go b/pkg/sentry/fs/lock/lock_range_test.go new file mode 100644 index 000000000..06a37c701 --- /dev/null +++ b/pkg/sentry/fs/lock/lock_range_test.go @@ -0,0 +1,136 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lock + +import ( + "syscall" + "testing" +) + +func TestComputeRange(t *testing.T) { + tests := []struct { + // Description of test. + name string + + // Requested start of the lock range. + start int64 + + // Requested length of the lock range, + // can be negative :( + length int64 + + // Pre-computed file offset based on whence. + // Will be added to start. + offset int64 + + // Expected error. + err error + + // If error is nil, the expected LockRange. 
+ LockRange + }{ + { + name: "offset, start, and length all zero", + LockRange: LockRange{Start: 0, End: LockEOF}, + }, + { + name: "zero offset, zero start, positive length", + start: 0, + length: 4096, + offset: 0, + LockRange: LockRange{Start: 0, End: 4096}, + }, + { + name: "zero offset, negative start", + start: -4096, + offset: 0, + err: syscall.EINVAL, + }, + { + name: "large offset, negative start, positive length", + start: -2048, + length: 2048, + offset: 4096, + LockRange: LockRange{Start: 2048, End: 4096}, + }, + { + name: "large offset, negative start, zero length", + start: -2048, + length: 0, + offset: 4096, + LockRange: LockRange{Start: 2048, End: LockEOF}, + }, + { + name: "zero offset, zero start, negative length", + start: 0, + length: -4096, + offset: 0, + err: syscall.EINVAL, + }, + { + name: "large offset, zero start, negative length", + start: 0, + length: -4096, + offset: 4096, + LockRange: LockRange{Start: 0, End: 4096}, + }, + { + name: "offset, start, and length equal, length is negative", + start: 1024, + length: -1024, + offset: 1024, + LockRange: LockRange{Start: 1024, End: 2048}, + }, + { + name: "offset, start, and length equal, start is negative", + start: -1024, + length: 1024, + offset: 1024, + LockRange: LockRange{Start: 0, End: 1024}, + }, + { + name: "offset, start, and length equal, offset is negative", + start: 1024, + length: 1024, + offset: -1024, + LockRange: LockRange{Start: 0, End: 1024}, + }, + { + name: "offset, start, and length equal, all negative", + start: -1024, + length: -1024, + offset: -1024, + err: syscall.EINVAL, + }, + { + name: "offset, start, and length equal, all positive", + start: 1024, + length: 1024, + offset: 1024, + LockRange: LockRange{Start: 2048, End: 3072}, + }, + } + + for _, test := range tests { + rng, err := ComputeRange(test.start, test.length, test.offset) + if err != test.err { + t.Errorf("%s: lockRange(%d, %d, %d) got error %v, want %v", test.name, test.start, test.length, test.offset, 
err, test.err) + continue + } + if err == nil && rng != test.LockRange { + t.Errorf("%s: lockRange(%d, %d, %d) got LockRange %v, want %v", test.name, test.start, test.length, test.offset, rng, test.LockRange) + } + } +} diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go new file mode 100644 index 000000000..e16f485be --- /dev/null +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lock + +import ( + "math" +) + +// LockSet maps a set of Locks into a file. The key is the file offset. + +type lockSetFunctions struct{} + +func (lockSetFunctions) MinKey() uint64 { + return 0 +} + +func (lockSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (lockSetFunctions) ClearValue(l *Lock) { + *l = Lock{} +} + +func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) (Lock, bool) { + // Merge only if the Readers/Writers are identical. 
+ if len(val1.Readers) != len(val2.Readers) { + return Lock{}, false + } + for k := range val1.Readers { + if !val2.Readers[k] { + return Lock{}, false + } + } + if val1.HasWriter != val2.HasWriter { + return Lock{}, false + } + if val1.HasWriter { + if val1.Writer != val2.Writer { + return Lock{}, false + } + } + return val1, true +} + +func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) { + // Copy the segment so that split segments don't contain map references + // to other segments. + val0 := Lock{Readers: make(map[UniqueID]bool)} + for k, v := range val.Readers { + val0.Readers[k] = v + } + val0.HasWriter = val.HasWriter + val0.Writer = val.Writer + + return val, val0 +} diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go new file mode 100644 index 000000000..c60f5f7a2 --- /dev/null +++ b/pkg/sentry/fs/lock/lock_test.go @@ -0,0 +1,1059 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package lock + +import ( + "reflect" + "testing" +) + +type entry struct { + Lock + LockRange +} + +func equals(e0, e1 []entry) bool { + if len(e0) != len(e1) { + return false + } + for i := range e0 { + for k := range e0[i].Lock.Readers { + if !e1[i].Lock.Readers[k] { + return false + } + } + for k := range e1[i].Lock.Readers { + if !e0[i].Lock.Readers[k] { + return false + } + } + if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) { + return false + } + if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter { + return false + } + if e0[i].Lock.Writer != e1[i].Lock.Writer { + return false + } + } + return true +} + +// fill a LockSet with consecutive region locks. Will panic if +// LockRanges are not consecutive. +func fill(entries []entry) LockSet { + l := LockSet{} + for _, e := range entries { + gap := l.FindGap(e.LockRange.Start) + if !gap.Ok() { + panic("cannot insert into existing segment") + } + l.Insert(gap, e.LockRange, e.Lock) + } + return l +} + +func TestCanLockEmpty(t *testing.T) { + l := LockSet{} + + // Expect to be able to take any locks given that the set is empty. 
+ eof := l.FirstGap().End() + r := LockRange{0, eof} + if !l.canLock(1, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 1) + } + if !l.canLock(2, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 2) + } + if !l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 1) + } + if !l.canLock(2, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 2) + } +} + +func TestCanLock(t *testing.T) { + // + -------------- + ---------- + -------------- + --------- + + // | Readers 1 & 2 | Readers 1 | Readers 1 & 3 | Writer 1 | + // + ------------- + ---------- + -------------- + --------- + + // 0 1024 2048 3072 4096 + l := fill([]entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + LockRange: LockRange{1024, 2048}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 3: true}}, + LockRange: LockRange{2048, 3072}, + }, + { + Lock: Lock{HasWriter: true, Writer: 1}, + LockRange: LockRange{3072, 4096}, + }, + }) + + // Now that we have a mildly interesting layout, try some checks on different + // ranges, uids, and lock types. + // + // Expect to be able to extend the read lock, despite the writer lock, because + // the writer has the same uid as the requested read lock. + r := LockRange{0, 8192} + if !l.canLock(1, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 1) + } + // Expect to *not* be able to extend the read lock since there is an overlapping + // writer region locked by someone other than the uid. 
+ if l.canLock(2, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got true, want false", ReadLock, r, 2) + } + // Expect to be able to extend the read lock if there are only other readers in + // the way. + r = LockRange{64, 3072} + if !l.canLock(2, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 2) + } + // Expect to be able to set a read lock beyond the range of any existing locks. + r = LockRange{4096, 10240} + if !l.canLock(2, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 2) + } + + // Expect to not be able to take a write lock with other readers in the way. + r = LockRange{0, 8192} + if l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got true, want false", WriteLock, r, 1) + } + // Expect to be able to extend the write lock for the same uid. + r = LockRange{3072, 8192} + if !l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 1) + } + // Expect to not be able to overlap a write lock for two different uids. + if l.canLock(2, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got true, want false", WriteLock, r, 2) + } + // Expect to be able to set a write lock that is beyond the range of any + // existing locks. + r = LockRange{8192, 10240} + if !l.canLock(2, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 2) + } + // Expect to be able to upgrade a read lock (any portion of it). 
+ r = LockRange{1024, 2048} + if !l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 1) + } + r = LockRange{1080, 2000} + if !l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 1) + } +} + +func TestSetLock(t *testing.T) { + tests := []struct { + // description of test. + name string + + // LockSet entries to pre-fill. + before []entry + + // Description of region to lock: + // + // start is the file offset of the lock. + start uint64 + // end is the end file offset of the lock. + end uint64 + // uid of lock attempter. + uid UniqueID + // lock type requested. + lockType LockType + + // success is true if taking the above + // lock should succeed. + success bool + + // Expected layout of the set after locking + // if success is true. + after []entry + }{ + { + name: "set zero length ReadLock on empty set", + start: 0, + end: 0, + uid: 0, + lockType: ReadLock, + success: true, + }, + { + name: "set zero length WriteLock on empty set", + start: 0, + end: 0, + uid: 0, + lockType: WriteLock, + success: true, + }, + { + name: "set ReadLock on empty set", + start: 0, + end: LockEOF, + uid: 0, + lockType: ReadLock, + success: true, + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + }, + { + name: "set WriteLock on empty set", + start: 0, + end: LockEOF, + uid: 0, + lockType: WriteLock, + success: true, + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + }, + { + name: "set ReadLock on WriteLock same uid", + // + 
----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 0, + lockType: ReadLock, + success: true, + // + ----------- + --------------------------- + + // | Readers 0 | Writer 0 | + // + ----------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 4096}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "set WriteLock on ReadLock same uid", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 0, + lockType: WriteLock, + success: true, + // + ----------- + --------------------------- + + // | Writer 0 | Readers 0 | + // + ----------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "set ReadLock on WriteLock different uid", + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 1, + lockType: ReadLock, + success: false, + }, + { + name: "set WriteLock on ReadLock different uid", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 
max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 1, + lockType: WriteLock, + success: false, + }, + { + name: "split ReadLock for overlapping lock at start 0", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 1, + lockType: ReadLock, + success: true, + // + -------------- + --------------------------- + + // | Readers 0 & 1 | Readers 0 | + // + -------------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "split ReadLock for overlapping lock at non-zero start", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 4096, + end: 8192, + uid: 1, + lockType: ReadLock, + success: true, + // + ---------- + -------------- + ----------- + + // | Readers 0 | Readers 0 & 1 | Readers 0 | + // + ---------- + -------------- + ----------- + + // 0 4096 8192 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{4096, 8192}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{8192, LockEOF}, + }, + }, + }, + { + name: "fill front gap with ReadLock", + // + --------- + 
---------------------------- + + // | gap | Readers 0 | + // + --------- + ---------------------------- + + // 0 1024 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, LockEOF}, + }, + }, + start: 0, + end: 8192, + uid: 0, + lockType: ReadLock, + success: true, + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + }, + { + name: "fill end gap with ReadLock", + // + ---------------------------- + + // | Readers 0 | + // + ---------------------------- + + // 0 4096 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 4096}, + }, + }, + start: 1024, + end: LockEOF, + uid: 0, + lockType: ReadLock, + success: true, + // Note that this is not merged after lock does a Split. This is + // fine because the two locks will still *behave* as one. In other + // words we can fragment any lock all we want and semantically it + // makes no difference. 
+ // + // + ----------- + --------------------------- + + // | Readers 0 | Readers 0 | + // + ----------- + --------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, LockEOF}, + }, + }, + }, + { + name: "fill gap with ReadLock and split", + // + --------- + ---------------------------- + + // | gap | Readers 0 | + // + --------- + ---------------------------- + + // 0 1024 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 1, + lockType: ReadLock, + success: true, + // + --------- + ------------- + ------------- + + // | Reader 1 | Readers 0 & 1 | Reader 0 | + // + ----------+ ------------- + ------------- + + // 0 1024 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{1024, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "upgrade ReadLock to WriteLock for single uid fill gap", + // + ------------- + --------- + --- + ------------- + + // | Readers 0 & 1 | Readers 0 | gap | Readers 0 & 2 | + // + ------------- + --------- + --- + ------------- + + // 0 1024 2048 4096 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, 2048}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 1024, + end: 4096, + uid: 0, + lockType: WriteLock, + success: true, + // + ------------- + -------- + ------------- + 
+ // | Readers 0 & 1 | Writer 0 | Readers 0 & 2 | + // + ------------- + -------- + ------------- + + // 0 1024 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{1024, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "upgrade ReadLock to WriteLock for single uid keep gap", + // + ------------- + --------- + --- + ------------- + + // | Readers 0 & 1 | Readers 0 | gap | Readers 0 & 2 | + // + ------------- + --------- + --- + ------------- + + // 0 1024 2048 4096 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, 2048}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 1024, + end: 3072, + uid: 0, + lockType: WriteLock, + success: true, + // + ------------- + -------- + --- + ------------- + + // | Readers 0 & 1 | Writer 0 | gap | Readers 0 & 2 | + // + ------------- + -------- + --- + ------------- + + // 0 1024 3072 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{1024, 3072}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "fail to upgrade ReadLock to WriteLock with conflicting Reader", + // + ------------- + --------- + + // | Readers 0 & 1 | Readers 0 | + // + ------------- + --------- + + // 0 1024 2048 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: 
Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, 2048}, + }, + }, + start: 0, + end: 2048, + uid: 0, + lockType: WriteLock, + success: false, + }, + { + name: "take WriteLock on whole file if all uids are the same", + // + ------------- + --------- + --------- + ---------- + + // | Writer 0 | Readers 0 | Readers 0 | Readers 0 | + // + ------------- + --------- + --------- + ---------- + + // 0 1024 2048 4096 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, 2048}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{2048, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 0, + end: LockEOF, + uid: 0, + lockType: WriteLock, + success: true, + // We do not manually merge locks. Semantically a fragmented lock + // held by the same uid will behave as one lock so it makes no difference. 
+ // + // + ------------- + ---------------------------- + + // | Writer 0 | Writer 0 | + // + ------------- + ---------------------------- + + // 0 1024 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{1024, LockEOF}, + }, + }, + }, + } + + for _, test := range tests { + l := fill(test.before) + + r := LockRange{Start: test.start, End: test.end} + success := l.lock(test.uid, test.lockType, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } + + if success != test.success { + t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success) + continue + } + + if success { + if !equals(got, test.after) { + t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) + } + } + } +} + +func TestUnlock(t *testing.T) { + tests := []struct { + // description of test. + name string + + // LockSet entries to pre-fill. + before []entry + + // Description of region to unlock: + // + // start is the file start of the lock. + start uint64 + // end is the end file start of the lock. + end uint64 + // uid of lock holder. + uid UniqueID + + // Expected layout of the set after unlocking. 
+ after []entry + }{ + { + name: "unlock zero length on empty set", + start: 0, + end: 0, + uid: 0, + }, + { + name: "unlock on empty set (no-op)", + start: 0, + end: LockEOF, + uid: 0, + }, + { + name: "unlock uid not locked (no-op)", + // + --------------------------- + + // | Readers 1 & 2 | + // + --------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 1024, + end: 4096, + uid: 0, + // + --------------------------- + + // | Readers 1 & 2 | + // + --------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + }, + { + name: "unlock ReadLock over entire file", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: LockEOF, + uid: 0, + }, + { + name: "unlock WriteLock over entire file", + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: LockEOF, + uid: 0, + }, + { + name: "unlock partial ReadLock (start)", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 0, + // + ------ + --------------------------- + + // | gap | Readers 0 | + // +------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: 
map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock partial WriteLock (start)", + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 0, + // + ------ + --------------------------- + + // | gap | Writer 0 | + // +------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock partial ReadLock (end)", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 4096, + end: LockEOF, + uid: 0, + // + --------------------------- + + // | Readers 0 | + // +---------------------------- + + // 0 4096 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 4096}, + }, + }, + }, + { + name: "unlock partial WriteLock (end)", + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 4096, + end: LockEOF, + uid: 0, + // + --------------------------- + + // | Writer 0 | + // +---------------------------- + + // 0 4096 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 4096}, + }, + }, + }, + { + name: "unlock for single uid", + // + ------------- + --------- + ------------------- + + // | Readers 0 & 1 | Writer 0 | Readers 0 & 1 & 2 | + // + ------------- + --------- + ------------------- + 
+ // 0 1024 4096 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{1024, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 0, + end: LockEOF, + uid: 0, + // + --------- + --- + --------------- + + // | Readers 1 | gap | Readers 1 & 2 | + // + --------- + --- + --------------- + + // 0 1024 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock subsection locked", + // + ------------------------------- + + // | Readers 0 & 1 & 2 | + // + ------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 1024, + end: 4096, + uid: 0, + // + ----------------- + ------------- + ----------------- + + // | Readers 0 & 1 & 2 | Readers 1 & 2 | Readers 0 & 1 & 2 | + // + ----------------- + ------------- + ----------------- + + // 0 1024 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{1024, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock mid-gap to increase gap", + // + --------- + ----- + ------------------- + + // | Writer 0 | gap | Readers 0 & 1 | + // + --------- + ----- + ------------------- + + // 0 1024 4096 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: 
LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 8, + end: 2048, + uid: 0, + // + --------- + ----- + ------------------- + + // | Writer 0 | gap | Readers 0 & 1 | + // + --------- + ----- + ------------------- + + // 0 8 4096 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 8}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock split region on uid mid-gap", + // + --------- + ----- + ------------------- + + // | Writer 0 | gap | Readers 0 & 1 | + // + --------- + ----- + ------------------- + + // 0 1024 4096 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 2048, + end: 8192, + uid: 0, + // + --------- + ----- + --------- + ------------- + + // | Writer 0 | gap | Readers 1 | Readers 0 & 1 | + // + --------- + ----- + --------- + ------------- + + // 0 1024 4096 8192 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + LockRange: LockRange{4096, 8192}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{8192, LockEOF}, + }, + }, + }, + } + + for _, test := range tests { + l := fill(test.before) + + r := LockRange{Start: test.start, End: test.end} + l.unlock(test.uid, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } + if !equals(got, test.after) { + t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) + } + } +} diff --git a/pkg/sentry/fs/mock.go 
b/pkg/sentry/fs/mock.go new file mode 100644 index 000000000..b3bfa5268 --- /dev/null +++ b/pkg/sentry/fs/mock.go @@ -0,0 +1,177 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MockInodeOperations implements InodeOperations for testing Inodes. +type MockInodeOperations struct { + InodeOperations + + UAttr UnstableAttr + + createCalled bool + createDirectoryCalled bool + createLinkCalled bool + renameCalled bool + walkCalled bool +} + +// NewMockInode returns a mock *Inode using MockInodeOperations. +func NewMockInode(ctx context.Context, msrc *MountSource, sattr StableAttr) *Inode { + return NewInode(NewMockInodeOperations(ctx), msrc, sattr) +} + +// NewMockInodeOperations returns a *MockInodeOperations. +func NewMockInodeOperations(ctx context.Context) *MockInodeOperations { + return &MockInodeOperations{ + UAttr: WithCurrentTime(ctx, UnstableAttr{ + Perms: FilePermsFromMode(0777), + }), + } +} + +// MockMountSourceOps implements fs.MountSourceOperations. +type MockMountSourceOps struct { + MountSourceOperations + keep bool + revalidate bool +} + +// NewMockMountSource returns a new *MountSource using MockMountSourceOps. 
+func NewMockMountSource(cache *DirentCache) *MountSource { + var keep bool + if cache != nil { + keep = cache.maxSize > 0 + } + return &MountSource{ + MountSourceOperations: &MockMountSourceOps{keep: keep}, + fscache: cache, + children: make(map[*MountSource]struct{}), + } +} + +// Revalidate implements fs.MountSourceOperations.Revalidate. +func (n *MockMountSourceOps) Revalidate(*Dirent) bool { + return n.revalidate +} + +// Keep implements fs.MountSourceOperations.Keep. +func (n *MockMountSourceOps) Keep(dirent *Dirent) bool { + return n.keep +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (n *MockInodeOperations) WriteOut(context.Context, *Inode) error { + return nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (n *MockInodeOperations) UnstableAttr(context.Context, *Inode) (UnstableAttr, error) { + return n.UAttr, nil +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (n *MockInodeOperations) IsVirtual() bool { + return false +} + +// Lookup implements fs.InodeOperations.Lookup. +func (n *MockInodeOperations) Lookup(ctx context.Context, dir *Inode, p string) (*Dirent, error) { + n.walkCalled = true + return NewDirent(NewInode(&MockInodeOperations{}, dir.MountSource, StableAttr{}), p), nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (n *MockInodeOperations) SetPermissions(context.Context, *Inode, FilePermissions) bool { + return false +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (*MockInodeOperations) SetOwner(context.Context, *Inode, FileOwner) error { + return syserror.EINVAL +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (n *MockInodeOperations) SetTimestamps(context.Context, *Inode, TimeSpec) error { + return nil +} + +// Create implements fs.InodeOperations.Create. 
+func (n *MockInodeOperations) Create(ctx context.Context, dir *Inode, p string, flags FileFlags, perms FilePermissions) (*File, error) { + n.createCalled = true + d := NewDirent(NewInode(&MockInodeOperations{}, dir.MountSource, StableAttr{}), p) + return &File{Dirent: d}, nil +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (n *MockInodeOperations) CreateLink(_ context.Context, dir *Inode, oldname string, newname string) error { + n.createLinkCalled = true + return nil +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (n *MockInodeOperations) CreateDirectory(context.Context, *Inode, string, FilePermissions) error { + n.createDirectoryCalled = true + return nil +} + +// Rename implements fs.InodeOperations.Rename. +func (n *MockInodeOperations) Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string) error { + n.renameCalled = true + return nil +} + +// Check implements fs.InodeOperations.Check. +func (n *MockInodeOperations) Check(ctx context.Context, inode *Inode, p PermMask) bool { + return ContextCanAccessFile(ctx, inode, p) +} + +// Release implements fs.InodeOperations.Release. +func (n *MockInodeOperations) Release(context.Context) {} + +// Truncate implements fs.InodeOperations.Truncate. +func (n *MockInodeOperations) Truncate(ctx context.Context, inode *Inode, size int64) error { + return nil +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (n *MockInodeOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, nil +} + +// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. +func (n *MockInodeOperations) DeprecatedReaddir(context.Context, *DirCtx, int) (int, error) { + return 0, nil +} + +// Remove implements fs.InodeOperations.Remove. 
+func (n *MockInodeOperations) Remove(context.Context, *Inode, string) error { + return nil +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (n *MockInodeOperations) RemoveDirectory(context.Context, *Inode, string) error { + return nil +} + +// Getlink implements fs.InodeOperations.Getlink. +func (n *MockInodeOperations) Getlink(context.Context, *Inode) (*Dirent, error) { + return nil, syserror.ENOLINK +} diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go new file mode 100644 index 000000000..a2943b097 --- /dev/null +++ b/pkg/sentry/fs/mount.go @@ -0,0 +1,298 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "bytes" + "fmt" + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/refs" +) + +// DirentOperations provide file systems greater control over how long a Dirent stays pinned +// in core. Implementations must not take Dirent.mu. +type DirentOperations interface { + // Revalidate returns true if the Dirent is stale and its InodeOperations needs to be reloaded. Revalidate + // will never be called on a Dirent that is mounted. + Revalidate(dirent *Dirent) bool + + // Keep returns true if the Dirent should be kept in memory for as long as possible + // beyond any active references. + Keep(dirent *Dirent) bool +} + +// MountSourceOperations contains filesystem specific operations. 
+type MountSourceOperations interface { + // TODO: Add: + // + // StatFS() (Info, error) + // BlockSize() int64 + // FS() Filesystem + + // DirentOperations provide optional extra management of Dirents. + DirentOperations + + // Destroy destroys the MountSource. + Destroy() + + // Below are MountSourceOperations that do not conform to Linux. + + // ResetInodeMappings clears all mappings of Inodes before SaveInodeMapping + // is called. + ResetInodeMappings() + + // SaveInodeMappings is called during saving to store, for each reachable + // Inode in the mounted filesystem, a mapping of Inode.StableAttr.InodeID + // to the Inode's path relative to its mount point. If an Inode is + // reachable at more than one path due to hard links, it is unspecified + // which path is mapped. Filesystems that do not use this information to + // restore inodes can make SaveInodeMappings a no-op. + SaveInodeMapping(inode *Inode, path string) +} + +// InodeMappings defines a fmt.Stringer MountSource Inode mappings. +type InodeMappings map[uint64]string + +// String implements fmt.Stringer.String. +func (i InodeMappings) String() string { + var mappingsBuf bytes.Buffer + mappingsBuf.WriteString("\n") + for ino, name := range i { + mappingsBuf.WriteString(fmt.Sprintf("\t%q\t\tinode number %d\n", name, ino)) + } + return mappingsBuf.String() +} + +// MountSource represents a source of file objects. +// +// MountSource corresponds to struct super_block in Linux. +// +// A mount source may represent a physical device (or a partition of a physical +// device) or a virtual source of files such as procfs for a specific PID +// namespace. There should be only one mount source per logical device. E.g. +// there should be only procfs mount source for a given PID namespace. +// +// A mount source represents files as inodes. Every inode belongs to exactly +// one mount source. Each file object may only be represented using one inode +// object in a sentry instance. 
+// +// This is an amalgamation of structs super_block, vfsmount, and mount, while +// MountSourceOperations is akin to struct super_operations. +// +// Hence, mount source also contains common mounted file system state, such as +// mount flags, the root Dirent, and children mounts. For now, this +// amalgamation implies that a mount source cannot be shared by multiple mounts +// (e.g. cannot be mounted at different locations). +// +// TODO: Move mount-specific information out of MountSource. +type MountSource struct { + refs.AtomicRefCount + + // MountSourceOperations defines filesystem specific behavior. + MountSourceOperations + + // Filesystem is the filesystem backing the mount. Can be nil if there + // is no filesystem backing the mount. + Filesystem Filesystem + + // Flags are the flags that this filesystem was mounted with. + Flags MountSourceFlags + + // fscache keeps Dirents pinned beyond application references to them. + // It must be flushed before kernel.SaveTo. + fscache *DirentCache `state:"nosave"` + + // direntRefs is the sum of references on all Dirents in this MountSource. + // + // direntRefs is increased when a Dirent in MountSource is IncRef'd, and + // decreased when a Dirent in MountSource is DecRef'd. + // + // To cleanly unmount a MountSource, one must check that no direntRefs are + // held anymore. To check, one must hold root.parent.dirMu of the + // MountSource's root Dirent before reading direntRefs to prevent further + // walks to Dirents in this MountSource. + // + // direntRefs must be atomically changed. + direntRefs uint64 + + // mu protects the fields below, which are set by the MountNamespace + // during MountSource/Unmount. + mu sync.Mutex `state:"nosave"` + + // id is a unique id for this mount. + id uint64 + + // root is the root Dirent of this mount. + root *Dirent + + // parent is the parent MountSource, or nil if this MountSource is the root. 
+ parent *MountSource + + // children are the child MountSources of this MountSource. + children map[*MountSource]struct{} +} + +// defaultDirentCacheSize is the number of Dirents that the VFS can hold an extra +// reference on. +const defaultDirentCacheSize uint64 = 1000 + +// NewMountSource returns a new MountSource. Filesystem may be nil if there is no +// filesystem backing the mount. +func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags MountSourceFlags) *MountSource { + return &MountSource{ + MountSourceOperations: mops, + Flags: flags, + Filesystem: filesystem, + fscache: NewDirentCache(defaultDirentCacheSize), + children: make(map[*MountSource]struct{}), + } +} + +// Parent returns the parent mount, or nil if this mount is the root. +func (msrc *MountSource) Parent() *MountSource { + msrc.mu.Lock() + defer msrc.mu.Unlock() + return msrc.parent +} + +// ID returns the ID of this mount. +func (msrc *MountSource) ID() uint64 { + msrc.mu.Lock() + defer msrc.mu.Unlock() + return msrc.id +} + +// Children returns the (immediate) children of this MountSource. +func (msrc *MountSource) Children() []*MountSource { + msrc.mu.Lock() + defer msrc.mu.Unlock() + + ms := make([]*MountSource, 0, len(msrc.children)) + for c := range msrc.children { + ms = append(ms, c) + } + return ms +} + +// Submounts returns all mounts that are descendants of this mount. +func (msrc *MountSource) Submounts() []*MountSource { + var ms []*MountSource + for _, c := range msrc.Children() { + ms = append(ms, c) + ms = append(ms, c.Submounts()...) + } + return ms +} + +// Root returns the root dirent of this mount. +func (msrc *MountSource) Root() *Dirent { + msrc.mu.Lock() + defer msrc.mu.Unlock() + return msrc.root +} + +// DirentRefs returns the current mount direntRefs. +func (msrc *MountSource) DirentRefs() uint64 { + return atomic.LoadUint64(&msrc.direntRefs) +} + +// IncDirentRefs increases direntRefs. 
+func (msrc *MountSource) IncDirentRefs() { + atomic.AddUint64(&msrc.direntRefs, 1) +} + +// DecDirentRefs decrements direntRefs. +func (msrc *MountSource) DecDirentRefs() { + if atomic.AddUint64(&msrc.direntRefs, ^uint64(0)) == ^uint64(0) { + panic("Decremented zero mount reference direntRefs") + } +} + +func (msrc *MountSource) destroy() { + if c := msrc.DirentRefs(); c != 0 { + panic(fmt.Sprintf("MountSource with non-zero direntRefs is being destroyed: %d", c)) + } + msrc.MountSourceOperations.Destroy() +} + +// DecRef drops a reference on the MountSource. +func (msrc *MountSource) DecRef() { + msrc.DecRefWithDestructor(msrc.destroy) +} + +// FlushDirentRefs drops all references held by the MountSource on Dirents. +func (msrc *MountSource) FlushDirentRefs() { + msrc.fscache.Invalidate() +} + +// NewCachingMountSource returns a generic mount that will cache dirents +// aggressively. Filesystem may be nil if there is no backing filesystem. +func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { + return NewMountSource(&SimpleMountSourceOperations{ + keep: true, + }, filesystem, flags) +} + +// NewNonCachingMountSource returns a generic mount that will never cache dirents. +// Filesystem may be nil if there is no backing filesystem. +func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { + return NewMountSource(&SimpleMountSourceOperations{ + keep: false, + }, filesystem, flags) +} + +// SimpleMountSourceOperations implements MountSourceOperations. +type SimpleMountSourceOperations struct { + keep bool +} + +// Revalidate implements MountSourceOperations.Revalidate. +func (*SimpleMountSourceOperations) Revalidate(*Dirent) bool { + return false +} + +// Keep implements MountSourceOperations.Keep. +func (smo *SimpleMountSourceOperations) Keep(*Dirent) bool { + return smo.keep +} + +// ResetInodeMappings implements MountSourceOperations.ResetInodeMappings. 
+func (*SimpleMountSourceOperations) ResetInodeMappings() {} + +// SaveInodeMapping implements MountSourceOperations.SaveInodeMapping. +func (*SimpleMountSourceOperations) SaveInodeMapping(*Inode, string) {} + +// Destroy implements MountSourceOperations.Destroy. +func (*SimpleMountSourceOperations) Destroy() {} + +// Info defines attributes of a filesystem. +type Info struct { + // Type is the filesystem type magic value. + Type uint64 + + // TotalBlocks is the total data blocks in the filesystem. + TotalBlocks uint64 + + // FreeBlocks is the number of free blocks available. + FreeBlocks uint64 + + // TotalFiles is the total file nodes in the filesystem. + TotalFiles uint64 + + // FreeFiles is the number of free file nodes. + FreeFiles uint64 +} diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go new file mode 100644 index 000000000..16c25e46c --- /dev/null +++ b/pkg/sentry/fs/mount_overlay.go @@ -0,0 +1,95 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import "gvisor.googlesource.com/gvisor/pkg/sentry/context" + +// overlayMountSourceOperations implements MountSourceOperations for an overlay +// mount point. 
+type overlayMountSourceOperations struct { + upper *MountSource + lower *MountSource +} + +func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *MountSource { + upper.IncRef() + lower.IncRef() + return NewMountSource(&overlayMountSourceOperations{ + upper: upper, + lower: lower, + }, &overlayFilesystem{}, flags) +} + +// Revalidate panics if the upper or lower MountSource require that dirent be +// revalidated. Otherwise always returns false. +func (o *overlayMountSourceOperations) Revalidate(dirent *Dirent) bool { + if o.upper.Revalidate(dirent) || o.lower.Revalidate(dirent) { + panic("an overlay cannot revalidate file objects") + } + return false +} + +// Keep returns true if either upper or lower MountSource require that the +// dirent be kept in memory. +func (o *overlayMountSourceOperations) Keep(dirent *Dirent) bool { + return o.upper.Keep(dirent) || o.lower.Keep(dirent) +} + +// ResetInodeMappings propagates the call to both upper and lower MountSource. +func (o *overlayMountSourceOperations) ResetInodeMappings() { + o.upper.ResetInodeMappings() + o.lower.ResetInodeMappings() +} + +// SaveInodeMapping propagates the call to both upper and lower MountSource. +func (o *overlayMountSourceOperations) SaveInodeMapping(inode *Inode, path string) { + inode.overlay.copyMu.RLock() + defer inode.overlay.copyMu.RUnlock() + if inode.overlay.upper != nil { + o.upper.SaveInodeMapping(inode.overlay.upper, path) + } + if inode.overlay.lower != nil { + o.lower.SaveInodeMapping(inode.overlay.lower, path) + } +} + +// Destroy drops references on the upper and lower MountSource. +func (o *overlayMountSourceOperations) Destroy() { + o.upper.DecRef() + o.lower.DecRef() +} + +// type overlayFilesystem is the filesystem for overlay mounts. +type overlayFilesystem struct{} + +// Name implements Filesystem.Name. +func (ofs *overlayFilesystem) Name() string { + return "overlayfs" +} + +// Flags implements Filesystem.Flags. 
+func (ofs *overlayFilesystem) Flags() FilesystemFlags { + return 0 +} + +// AllowUserMount implements Filesystem.AllowUserMount. +func (ofs *overlayFilesystem) AllowUserMount() bool { + return false +} + +// Mount implements Filesystem.Mount. +func (ofs *overlayFilesystem) Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) { + panic("overlayFilesystem.Mount should not be called!") +} diff --git a/pkg/sentry/fs/mount_state.go b/pkg/sentry/fs/mount_state.go new file mode 100644 index 000000000..f5ed1dd8d --- /dev/null +++ b/pkg/sentry/fs/mount_state.go @@ -0,0 +1,25 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// afterLoad is invoked by stateify. +// +// Beyond the cache, this method's existence is required to ensure that this +// object is not marked "complete" until all dependent objects are also marked +// "complete". Implementations (e.g. see gofer_state.go) reach into the +// MountSourceOperations through this object, this is necessary on restore. +func (msrc *MountSource) afterLoad() { + msrc.fscache = NewDirentCache(defaultDirentCacheSize) +} diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go new file mode 100644 index 000000000..3a053c154 --- /dev/null +++ b/pkg/sentry/fs/mount_test.go @@ -0,0 +1,216 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" +) + +// cacheReallyContains iterates through the dirent cache to determine whether +// it contains the given dirent. +func cacheReallyContains(cache *DirentCache, d *Dirent) bool { + for i := cache.list.Front(); i != nil; i = i.Next() { + if i == d { + return true + } + } + return false +} + +// TestMountSourceOnlyCachedOnce tests that a Dirent that is mounted over only ends +// up in a single Dirent Cache. NOTE: Having a dirent in multiple +// caches causes major consistency issues. +func TestMountSourceOnlyCachedOnce(t *testing.T) { + ctx := contexttest.Context(t) + + rootCache := NewDirentCache(100) + rootInode := NewMockInode(ctx, NewMockMountSource(rootCache), StableAttr{ + Type: Directory, + }) + mm, err := NewMountNamespace(ctx, rootInode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) + } + rootDirent := mm.Root() + defer rootDirent.DecRef() + + // Get a child of the root which we will mount over. Note that the + // MockInodeOperations causes Walk to always succeed. + child, err := rootDirent.Walk(ctx, rootDirent, "child") + if err != nil { + t.Fatalf("failed to walk to child dirent: %v", err) + } + child.maybeExtendReference() // Cache. + + // Ensure that the root cache contains the child. 
+ if !cacheReallyContains(rootCache, child) { + t.Errorf("wanted rootCache to contain child dirent, but it did not") + } + + // Create a new cache and inode, and mount it over child. + submountCache := NewDirentCache(100) + submountInode := NewMockInode(ctx, NewMockMountSource(submountCache), StableAttr{ + Type: Directory, + }) + if err := mm.Mount(ctx, child, submountInode); err != nil { + t.Fatalf("failed to mount over child: %v", err) + } + + // Walk to the child again. + child2, err := rootDirent.Walk(ctx, rootDirent, "child") + if err != nil { + t.Fatalf("failed to walk to child dirent: %v", err) + } + + // Should have a different Dirent than before. + if child == child2 { + t.Fatalf("expected %v not equal to %v, but they are the same", child, child2) + } + + // Neither of the caches should no contain the child. + if cacheReallyContains(rootCache, child) { + t.Errorf("wanted rootCache not to contain child dirent, but it did") + } + if cacheReallyContains(submountCache, child) { + t.Errorf("wanted submountCache not to contain child dirent, but it did") + } +} + +// Test that mounts have proper parent/child relationships. 
+func TestMountSourceParentChildRelationship(t *testing.T) { + ctx := contexttest.Context(t) + + rootCache := NewDirentCache(100) + rootInode := NewMockInode(ctx, NewMockMountSource(rootCache), StableAttr{ + Type: Directory, + }) + mm, err := NewMountNamespace(ctx, rootInode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) + } + rootDirent := mm.Root() + defer rootDirent.DecRef() + + // Add mounts at the following paths: + paths := []string{ + "/foo", + "/foo/bar", + "/foo/bar/baz", + "/foo/qux", + "/waldo", + } + + for _, p := range paths { + d, err := mm.FindLink(ctx, rootDirent, nil, p, 0) + if err != nil { + t.Fatalf("could not find path %q in mount manager: %v", p, err) + } + submountInode := NewMockInode(ctx, NewMockMountSource(nil), StableAttr{ + Type: Directory, + }) + if err := mm.Mount(ctx, d, submountInode); err != nil { + t.Fatalf("could not mount at %q: %v", p, err) + } + } + + // mm root should contain all submounts (and does not include the root + // mount). + allMountSources := rootDirent.Inode.MountSource.Submounts() + if err := mountPathsAre(rootDirent, allMountSources, paths...); err != nil { + t.Error(err) + } + + // Each mount should have a unique ID. + foundIDs := make(map[uint64]struct{}) + for _, m := range allMountSources { + id := m.ID() + if _, ok := foundIDs[id]; ok { + t.Errorf("got multiple mounts with id %d", id) + } + foundIDs[id] = struct{}{} + } + + // Root mount should have no parent. + rootMountSource := mm.root.Inode.MountSource + if p := rootMountSource.Parent(); p != nil { + t.Errorf("root.Parent got %v wanted nil", p) + } + + // Root mount should have 2 children: foo and waldo. + rootChildren := rootMountSource.Children() + if err := mountPathsAre(rootDirent, rootChildren, "/foo", "/waldo"); err != nil { + t.Error(err) + } + // All root mount children should have root as parent. 
+ for _, c := range rootChildren { + if p := c.Parent(); p != rootMountSource { + t.Errorf("root mount child got parent %+v, wanted root mount", p) + } + } + + // "foo" mount should have two children: /foo/bar, and /foo/qux. + d, err := mm.FindLink(ctx, rootDirent, nil, "/foo", 0) + if err != nil { + t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) + } + fooMountSource := d.Inode.MountSource + fooMountSourceChildren := fooMountSource.Children() + if err := mountPathsAre(rootDirent, fooMountSourceChildren, "/foo/bar", "/foo/qux"); err != nil { + t.Error(err) + } + // Each child should have fooMountSource as parent. + for _, c := range fooMountSourceChildren { + if p := c.Parent(); p != fooMountSource { + t.Errorf("foo mount child got parent %+v, wanted foo mount", p) + } + } + // Submounts of foo are /foo/bar, /foo/qux, and /foo/bar/baz. + if err := mountPathsAre(rootDirent, fooMountSource.Submounts(), "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { + t.Error(err) + } + + // "waldo" mount should have no submounts or children. 
+ waldo, err := mm.FindLink(ctx, rootDirent, nil, "/waldo", 0) + if err != nil { + t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) + } + waldoMountSource := waldo.Inode.MountSource + if got := len(waldoMountSource.Children()); got != 0 { + t.Errorf("waldo got %d children, wanted 0", got) + } + if got := len(waldoMountSource.Submounts()); got != 0 { + t.Errorf("waldo got %d children, wanted 0", got) + } +} + +func mountPathsAre(root *Dirent, got []*MountSource, want ...string) error { + if len(got) != len(want) { + return fmt.Errorf("mount paths have different lengths: got %d want %d", len(got), len(want)) + } + gotPaths := make(map[string]struct{}, len(got)) + for _, g := range got { + n, _ := g.Root().FullName(root) + gotPaths[n] = struct{}{} + } + for _, w := range want { + if _, ok := gotPaths[w]; !ok { + return fmt.Errorf("no mount with path %q found", w) + } + } + return nil +} diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go new file mode 100644 index 000000000..1e6b5b70e --- /dev/null +++ b/pkg/sentry/fs/mounts.go @@ -0,0 +1,511 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package fs

import (
	"fmt"
	"sync"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/refs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// DefaultTraversalLimit provides a sensible default traversal limit that may
// be passed to FindInode and FindLink. You may want to provide other options in
// individual syscall implementations, but for internal functions this will be
// sane.
const DefaultTraversalLimit = 10

// MountNamespace defines a collection of mounts.
type MountNamespace struct {
	refs.AtomicRefCount

	// userns is the user namespace associated with this mount namespace.
	//
	// All privileged operations on this mount namespace must have
	// appropriate capabilities in this userns.
	//
	// userns is immutable.
	userns *auth.UserNamespace

	// root is the root directory.
	root *Dirent

	// mu protects mounts and mountID counter.
	mu sync.Mutex `state:"nosave"`

	// mounts is a map of the last mounted Dirent -> stack of old Dirents
	// that were mounted over, with the oldest mounted Dirent first and
	// more recent mounted Dirents at the end of the slice.
	//
	// A reference to all Dirents in mounts (keys and values) must be held
	// to ensure the Dirents are recoverable when unmounting.
	mounts map[*Dirent][]*Dirent

	// mountID is the next mount id to assign. Id 1 is reserved for the
	// root mount (see NewMountNamespace), so this starts at 2.
	mountID uint64
}

// NewMountNamespace returns a new MountNamespace, with the provided node at the
// root, and the given cache size. A root must always be provided.
func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) {
	creds := auth.CredentialsFromContext(ctx)

	root.MountSource.mu.Lock()
	defer root.MountSource.mu.Unlock()

	// Set the root dirent and id on the root mount. The root mount always
	// takes mount id 1; ids for subsequent mounts are assigned from
	// mountID, which therefore starts at 2.
	d := NewDirent(root, "/")
	root.MountSource.root = d
	root.MountSource.id = 1

	return &MountNamespace{
		userns:  creds.UserNamespace,
		root:    d,
		mounts:  make(map[*Dirent][]*Dirent),
		mountID: 2,
	}, nil
}

// UserNamespace returns the user namespace associated with this mount manager.
func (mns *MountNamespace) UserNamespace() *auth.UserNamespace {
	return mns.userns
}

// Root returns the MountNamespace's root Dirent and increments its reference
// count. The caller must call DecRef when finished.
func (mns *MountNamespace) Root() *Dirent {
	mns.root.IncRef()
	return mns.root
}

// FlushMountSourceRefs flushes extra references held by MountSources for all active mount points;
// see fs/mount.go:MountSource.FlushDirentRefs.
func (mns *MountNamespace) FlushMountSourceRefs() {
	mns.mu.Lock()
	defer mns.mu.Unlock()
	mns.flushMountSourceRefsLocked()
}

// flushMountSourceRefsLocked flushes MountSource Dirent references for every
// active mount point (keys and overmounted Dirents alike) and for the root.
//
// Precondition: mns.mu must be held.
func (mns *MountNamespace) flushMountSourceRefsLocked() {
	// Flush mounts' MountSource references.
	for current, stack := range mns.mounts {
		current.Inode.MountSource.FlushDirentRefs()
		for _, prev := range stack {
			prev.Inode.MountSource.FlushDirentRefs()
		}
	}

	// Flush root's MountSource references.
	mns.root.Inode.MountSource.FlushDirentRefs()
}

// destroy drops root and mounts dirent references and closes any original nodes.
//
// After destroy is called, the MountNamespace may continue to be referenced (for
// example via /proc/mounts), but should free all resources and shouldn't have
// Find* methods called.
func (mns *MountNamespace) destroy() {
	mns.mu.Lock()
	defer mns.mu.Unlock()

	// Flush all mounts' MountSource references to Dirents. This allows for mount
	// points to be torn down since there should be no remaining references after
	// this and DecRef below.
	mns.flushMountSourceRefsLocked()

	// Teardown mounts.
	for current, mp := range mns.mounts {
		// Drop the mount reference on all mounted dirents.
		for _, d := range mp {
			d.DecRef()
		}
		current.DecRef()
	}
	mns.mounts = nil

	// Drop reference on the root.
	mns.root.DecRef()

	// Wait for asynchronous work (queued by dropping Dirent references
	// above) to complete before destroying this MountNamespace.
	AsyncBarrier()
}

// DecRef implements RefCounter.DecRef with destructor mns.destroy.
func (mns *MountNamespace) DecRef() {
	mns.DecRefWithDestructor(mns.destroy)
}

// Freeze freezes the entire mount tree.
func (mns *MountNamespace) Freeze() {
	mns.mu.Lock()
	defer mns.mu.Unlock()

	// We only want to freeze Dirents with active references, not Dirents referenced
	// by a mount's MountSource.
	mns.flushMountSourceRefsLocked()

	// Freeze the entire shebang.
	mns.root.Freeze()
}

// withMountLocked prevents further walks to `node`, because `node` is about to
// be a mount point.
//
// Lock order established here: mns.mu -> renameMu -> node.parent.dirMu ->
// node.parent.mu -> node.mu. fn runs with all of these held.
func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error {
	mns.mu.Lock()
	defer mns.mu.Unlock()

	renameMu.Lock()
	defer renameMu.Unlock()

	// Linux allows mounting over the root (?). It comes with a strange set
	// of semantics. We'll just not do this for now.
	if node.parent == nil {
		return syserror.EBUSY
	}

	// For both mount and unmount, we take this lock so we can swap out the
	// appropriate child in parent.children.
	//
	// For unmount, this also ensures that if `node` is a mount point, the
	// underlying mount's MountSource.direntRefs cannot increase by preventing
	// walks to node.
	node.parent.dirMu.Lock()
	defer node.parent.dirMu.Unlock()

	node.parent.mu.Lock()
	defer node.parent.mu.Unlock()

	// We need not take node.dirMu since we have parent.dirMu.

	// We need to take node.mu, so that we can check for deletion.
	node.mu.Lock()
	defer node.mu.Unlock()

	return fn()
}

// Mount mounts a `inode` over the subtree at `node`.
func (mns *MountNamespace) Mount(ctx context.Context, node *Dirent, inode *Inode) error {
	return mns.withMountLocked(node, func() error {
		// replacement already has one reference taken; this is the mount
		// reference.
		replacement, err := node.mount(ctx, inode)
		if err != nil {
			return err
		}

		// Set child/parent dirent relationship.
		parentMountSource := node.Inode.MountSource
		childMountSource := inode.MountSource
		parentMountSource.mu.Lock()
		defer parentMountSource.mu.Unlock()
		childMountSource.mu.Lock()
		defer childMountSource.mu.Unlock()

		parentMountSource.children[childMountSource] = struct{}{}
		childMountSource.parent = parentMountSource

		// Set the mount's root dirent and id. mns.mu is held by
		// withMountLocked, so incrementing mountID here is safe.
		childMountSource.root = replacement
		childMountSource.id = mns.mountID
		mns.mountID++

		// Drop node from its dirent cache.
		node.dropExtendedReference()

		// If node is already a mount point, push node on the stack so it can
		// be recovered on unmount.
		if stack, ok := mns.mounts[node]; ok {
			mns.mounts[replacement] = append(stack, node)
			delete(mns.mounts, node)
			return nil
		}

		// Was not already mounted, just add another mount point.
		// Take a reference on node so it can be recovered on unmount.
		node.IncRef()
		mns.mounts[replacement] = []*Dirent{node}
		return nil
	})
}

// Unmount ensures no references to the MountSource remain and removes `node` from
// this subtree. The subtree formerly mounted in `node`'s place will be
// restored. node's MountSource will be destroyed as soon as the last reference to
// `node` is dropped, as no references to Dirents within will remain.
//
// If detachOnly is set, Unmount merely removes `node` from the subtree, but
// allows existing references to the MountSource remain. E.g. if an open file still
// refers to Dirents in MountSource, the Unmount will succeed anyway and MountSource will
// be destroyed at a later time when all references to Dirents within are
// dropped.
+// +// The caller must hold a reference to node from walking to it. +func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly bool) error { + // This takes locks to prevent further walks to Dirents in this mount + // under the assumption that `node` is the root of the mount. + return mns.withMountLocked(node, func() error { + origs, ok := mns.mounts[node] + if !ok { + // node is not a mount point. + return syserror.EINVAL + } + + if len(origs) == 0 { + panic("cannot unmount initial dirent") + } + + if !detachOnly { + m := node.Inode.MountSource + + // Lock the parent MountSource first, if it exists. We are + // holding mns.Lock, so the parent can not change out + // from under us. + parent := m.Parent() + if parent != nil { + parent.mu.Lock() + defer parent.mu.Unlock() + } + + // Lock the mount that is being unmounted. + m.mu.Lock() + defer m.mu.Unlock() + + if m.parent != nil { + // Sanity check. + if _, ok := m.parent.children[m]; !ok { + panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent)) + } + delete(m.parent.children, m) + m.parent = nil + } + + // Flush all references on the mounted node. + m.FlushDirentRefs() + + // At this point, exactly two references must be held + // to mount: one mount reference on node, and one due + // to walking to node. + // + // We must also be guaranteed that no more references + // can be taken on mount. This is why withMountLocked + // must be held at this point to prevent any walks to + // and from node. + if refs := m.DirentRefs(); refs < 2 { + panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs)) + } else if refs != 2 { + return syserror.EBUSY + } + } + + original := origs[len(origs)-1] + if err := node.unmount(ctx, original); err != nil { + return err + } + + switch { + case len(origs) > 1: + mns.mounts[original] = origs[:len(origs)-1] + case len(origs) == 1: + // Drop mount reference taken at the end of + // MountNamespace.Mount. 
+ original.DecRef() + } + + delete(mns.mounts, node) + return nil + }) +} + +// FindLink returns an Dirent from a given node, which may be a symlink. +// +// The root argument is treated as the root directory, and FindLink will not +// return anything above that. The wd dirent provides the starting directory, +// and may be nil which indicates the root should be used. You must call DecRef +// on the resulting Dirent when you are no longer using the object. +// +// If wd is nil, then the root will be used as the working directory. If the +// path is absolute, this has no functional impact. +// +// Precondition: root must be non-nil. +// Precondition: the path must be non-empty. +func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { + if root == nil { + panic("MountNamespace.FindInode: root must not be nil") + } + if len(path) == 0 { + panic("MountNamespace.FindInode: path is empty") + } + + // Split the path. + first, remainder := SplitFirst(path) + + // Where does this walk originate? + current := wd + if current == nil { + current = root + } + for first == "/" { + // Special case: it's possible that we have nothing to walk at + // all. This is necessary since we're resplitting the path. + if remainder == "" { + root.IncRef() + return root, nil + } + + // Start at the root and advance the path component so that the + // walk below can proceed. Note at this point, it handles the + // no-op walk case perfectly fine. + current = root + first, remainder = SplitFirst(remainder) + } + + current.IncRef() // Transferred during walk. + + for { + // Check that the file is a directory and that we have + // permissions to walk. + // + // Note that we elide this check for the root directory as an + // optimization; a non-executable root may still be walked. A + // non-directory root is hopeless. + if current != root { + if !IsDir(current.Inode.StableAttr) { + current.DecRef() // Drop reference from above. 
+ return nil, syserror.ENOTDIR + } + if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { + current.DecRef() // Drop reference from above. + return nil, err + } + } + + // Move to the next level. + next, err := current.Walk(ctx, root, first) + if err != nil { + // Allow failed walks to cache the dirent, because no + // children will acquire a reference at the end. + current.maybeExtendReference() + current.DecRef() + return nil, err + } + + // Drop old reference. + current.DecRef() + + if remainder != "" { + // Ensure it's resolved, unless it's the last level. + // + // See resolve for reference semantics; on err next + // will have one dropped. + current, err = mns.resolve(ctx, root, next, maxTraversals) + if err != nil { + return nil, err + } + } else { + // Allow the file system to take an extra reference on the + // found child. This will hold a reference on the containing + // directory, so the whole tree will be implicitly cached. + next.maybeExtendReference() + return next, nil + } + + // Move to the next element. + first, remainder = SplitFirst(remainder) + } +} + +// FindInode is identical to FindLink except the return value is resolved. +// +//go:nosplit +func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { + d, err := mns.FindLink(ctx, root, wd, path, maxTraversals) + if err != nil { + return nil, err + } + + // See resolve for reference semantics; on err d will have the + // reference dropped. + return mns.resolve(ctx, root, d, maxTraversals) +} + +// resolve resolves the given link. +// +// If successful, a reference is dropped on node and one is acquired on the +// caller's behalf for the returned dirent. +// +// If not successful, a reference is _also_ dropped on the node and an error +// returned. This is for convenience in using resolve directly as a return +// value. 
+func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxTraversals uint) (*Dirent, error) { + // Resolve the path. + target, err := node.Inode.Getlink(ctx) + + switch err { + case nil: + // Make sure we didn't exhaust the traversal budget. + if maxTraversals == 0 { + target.DecRef() + return nil, syscall.ELOOP + } + + node.DecRef() // Drop the original reference. + return target, nil + + case syscall.ENOLINK: + // Not a symlink. + return node, nil + + case ErrResolveViaReadlink: + defer node.DecRef() // See above. + + // First, check if we should traverse. + if maxTraversals == 0 { + return nil, syscall.ELOOP + } + + // Read the target path. + targetPath, err := node.Inode.Readlink(ctx) + if err != nil { + return nil, err + } + + // Find the node; we resolve relative to the current symlink's parent. + d, err := mns.FindInode(ctx, root, node.parent, targetPath, maxTraversals-1) + if err != nil { + return nil, err + } + + return d, err + + default: + node.DecRef() // Drop for err; see above. + + // Propagate the error. + return nil, err + } +} + +// SyncAll calls Dirent.SyncAll on the root. +func (mns *MountNamespace) SyncAll(ctx context.Context) { + mns.mu.Lock() + defer mns.mu.Unlock() + mns.root.SyncAll(ctx) +} diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go new file mode 100644 index 000000000..8669f3a38 --- /dev/null +++ b/pkg/sentry/fs/mounts_test.go @@ -0,0 +1,102 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and
// limitations under the License.

package fs_test

import (
	"testing"

	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test"
)

// Creates a new MountNamespace with filesystem:
// / (root dir)
// |-foo (dir)
// |-bar (file)
func createMountNamespace(ctx context.Context) (*fs.MountNamespace, error) {
	perms := fs.FilePermsFromMode(0777)
	m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{})

	barFile := ramfstest.NewFile(ctx, perms)
	fooDir := ramfstest.NewDir(ctx, map[string]*fs.Inode{
		"bar": fs.NewInode(barFile, m, fs.StableAttr{Type: fs.RegularFile}),
	}, perms)
	rootDir := ramfstest.NewDir(ctx, map[string]*fs.Inode{
		"foo": fs.NewInode(fooDir, m, fs.StableAttr{Type: fs.Directory}),
	}, perms)

	return fs.NewMountNamespace(ctx, fs.NewInode(rootDir, m, fs.StableAttr{Type: fs.Directory}))
}

// TestFindLink exercises MountNamespace.FindLink path resolution (absolute
// and relative paths, ".", "..", and redundant slashes) against the in-memory
// tree built by createMountNamespace, plus a set of paths expected to fail.
func TestFindLink(t *testing.T) {
	ctx := contexttest.Context(t)
	mm, err := createMountNamespace(ctx)
	if err != nil {
		t.Fatalf("createMountNamespace failed: %v", err)
	}

	root := mm.Root()
	defer root.DecRef()
	foo, err := root.Walk(ctx, root, "foo")
	if err != nil {
		t.Fatalf("Error walking to foo: %v", err)
	}

	// Positive cases.
	for _, tc := range []struct {
		findPath string
		wd       *fs.Dirent
		wantPath string
	}{
		{".", root, "/"},
		{".", foo, "/foo"},
		{"..", foo, "/"},
		{"../../..", foo, "/"},
		{"///foo", foo, "/foo"},
		{"/foo", foo, "/foo"},
		{"/foo/bar", foo, "/foo/bar"},
		{"/foo/.///./bar", foo, "/foo/bar"},
		{"/foo///bar", foo, "/foo/bar"},
		{"/foo/../foo/bar", foo, "/foo/bar"},
		{"foo/bar", root, "/foo/bar"},
		{"foo////bar", root, "/foo/bar"},
		{"bar", foo, "/foo/bar"},
	} {
		wdPath, _ := tc.wd.FullName(root)
		if d, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, 0); err != nil {
			t.Errorf("FindLink(%q, wd=%q) failed: %v", tc.findPath, wdPath, err)
		} else if got, _ := d.FullName(root); got != tc.wantPath {
			t.Errorf("FindLink(%q, wd=%q) got dirent %q, want %q", tc.findPath, wdPath, got, tc.wantPath)
		}
	}

	// Negative cases.
	for _, tc := range []struct {
		findPath string
		wd       *fs.Dirent
	}{
		{"bar", root},
		{"/bar", root},
		{"/foo/../../bar", root},
		{"foo", foo},
	} {
		wdPath, _ := tc.wd.FullName(root)
		if _, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, 0); err == nil {
			t.Errorf("FindLink(%q, wd=%q) did not return error", tc.findPath, wdPath)
		}
	}
}
+ +package fs + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// OffsetPageEnd returns the file offset rounded up to the nearest +// page boundary. OffsetPageEnd panics if rounding up causes overflow, +// which shouldn't be possible given that offset is an int64. +func OffsetPageEnd(offset int64) uint64 { + end, ok := usermem.Addr(offset).RoundUp() + if !ok { + panic("impossible overflow") + } + return uint64(end) +} + +// ReadEndOffset returns an exclusive end offset for a read operation +// so that the read does not overflow an int64 nor size. +// +// Parameters: +// - offset: the starting offset of the read. +// - length: the number of bytes to read. +// - size: the size of the file. +// +// Postconditions: The returned offset is >= offset. +func ReadEndOffset(offset int64, length int64, size int64) int64 { + if offset >= size { + return offset + } + end := offset + length + // Don't overflow. + if end < offset || end > size { + end = size + } + return end +} + +// WriteEndOffset returns an exclusive end offset for a write operation +// so that the write does not overflow an int64. +// +// Parameters: +// - offset: the starting offset of the write. +// - length: the number of bytes to write. +// +// Postconditions: The returned offset is >= offset. +func WriteEndOffset(offset int64, length int64) int64 { + return ReadEndOffset(offset, length, math.MaxInt64) +} diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go new file mode 100644 index 000000000..40eed3feb --- /dev/null +++ b/pkg/sentry/fs/overlay.go @@ -0,0 +1,268 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// The virtual filesystem implements an overlay configuration. For a high-level +// description, see README.md. +// +// Note on whiteouts: +// +// This implementation does not use the "Docker-style" whiteouts (symlinks with +// ".wh." prefix). Instead upper filesystem directories support a set of extended +// attributes to encode whiteouts: "trusted.overlay.whiteout.". This +// gives flexibility to persist whiteouts independently of the filesystem layout +// while additionally preventing name conflicts with files prefixed with ".wh.". +// +// Known deficiencies: +// +// - The device number of two files under the same overlay mount point may be +// different. This can happen if a file is found in the lower filesystem (takes +// the lower filesystem device) and another file is created in the upper +// filesystem (takes the upper filesystem device). This may appear odd but +// should not break applications. +// +// - Registered events on files (i.e. for notification of read/write readiness) +// are not copied across copy up. This is fine in the common case of files that +// do not block. For files that do block, like pipes and sockets, copy up is not +// supported. +// +// - Hardlinks in a lower filesystem are broken by copy up. 
// For this reason, no
// attempt is made to preserve link count across copy up.
//
// - The maximum length of an extended attribute name is the same as the maximum
// length of a file path in Linux (XATTR_NAME_MAX == NAME_MAX). This means that
// whiteout attributes, if set directly on the host, are limited additionally by
// the extra whiteout prefix length (file paths must be strictly shorter than
// NAME_MAX). This is not a problem for in-memory filesystems which don't enforce
// XATTR_NAME_MAX.

const (
	// XattrOverlayPrefix is the prefix for extended attributes that affect
	// the behavior of an overlay.
	XattrOverlayPrefix = "trusted.overlay."

	// XattrOverlayWhiteoutPrefix is the prefix for extended attributes
	// that indicate that a whiteout exists.
	XattrOverlayWhiteoutPrefix = XattrOverlayPrefix + "whiteout."
)

// XattrOverlayWhiteout returns the extended attribute used to record a
// whiteout for name, i.e. the attribute a directory sets when it wishes to
// mask the existence of name in a lower layer.
func XattrOverlayWhiteout(name string) string {
	return XattrOverlayWhiteoutPrefix + name
}

// NewOverlayRoot produces the root of an overlay.
//
// Preconditions:
//
// - upper and lower must be non-nil.
// - lower should not expose character devices, pipes, or sockets, because
// copying up these types of files is not supported.
// - upper and lower must not require that file objects be revalidated.
// - upper and lower must not have dynamic file/directory content.
func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) {
	if !IsDir(upper.StableAttr) {
		return nil, fmt.Errorf("upper Inode is not a directory")
	}
	if !IsDir(lower.StableAttr) {
		return nil, fmt.Errorf("lower Inode is not a directory")
	}

	msrc := newOverlayMountSource(upper.MountSource, lower.MountSource, flags)
	overlay, err := newOverlayEntry(ctx, upper, lower, true)
	if err != nil {
		// Don't leak the mount source created above.
		msrc.DecRef()
		return nil, err
	}

	return newOverlayInode(ctx, overlay, msrc), nil
}

// newOverlayInode creates a new Inode for an overlay.
func newOverlayInode(ctx context.Context, o *overlayEntry, msrc *MountSource) *Inode {
	// The overlay Inode mirrors the stable attributes of whichever layer
	// currently backs the file: upper if present, else lower.
	var inode *Inode
	if o.upper != nil {
		inode = NewInode(nil, msrc, o.upper.StableAttr)
	} else {
		inode = NewInode(nil, msrc, o.lower.StableAttr)
	}
	inode.overlay = o
	return inode
}

// overlayEntry is the overlay metadata of an Inode. It implements Mappable.
type overlayEntry struct {
	// lowerExists is true if an Inode exists for this file in the lower
	// filesystem. If lowerExists is true, then the overlay must create
	// a whiteout entry when renaming and removing this entry to mask the
	// lower Inode.
	//
	// Note that this is distinct from actually holding onto a non-nil
	// lower Inode (below). The overlay does not need to keep a lower Inode
	// around unless it needs to operate on it, but it always needs to know
	// whether the lower Inode exists to correctly execute a rename or
	// remove operation.
	lowerExists bool

	// lower is an Inode from a lower filesystem. Modifications are
	// never made on this Inode.
	lower *Inode

	// copyMu serializes copy-up for operations above
	// mm.MemoryManager.mappingMu in the lock order.
	copyMu sync.RWMutex `state:"nosave"`

	// mapsMu serializes copy-up for operations between
	// mm.MemoryManager.mappingMu and mm.MemoryManager.activeMu in the lock
	// order.
+ mapsMu sync.Mutex `state:"nosave"` + + // mappings tracks memory mappings of this Mappable so they can be removed + // from the lower filesystem Mappable and added to the upper filesystem + // Mappable when copy up occurs. It is strictly unnecessary after copy-up. + // + // mappings is protected by mapsMu. + mappings memmap.MappingSet + + // dataMu serializes copy-up for operations below mm.MemoryManager.activeMu + // in the lock order. + dataMu sync.RWMutex `state:"nosave"` + + // upper is an Inode from an upper filesystem. It is non-nil if + // the file exists in the upper filesystem. It becomes non-nil + // when the Inode that owns this overlayEntry is modified. + // + // upper is protected by all of copyMu, mapsMu, and dataMu. Holding any of + // these locks is sufficient to read upper; holding all three for writing + // is required to mutate it. + upper *Inode +} + +// newOverlayEntry returns a new overlayEntry. +func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExists bool) (*overlayEntry, error) { + if upper == nil && lower == nil { + panic("invalid overlayEntry, needs at least one Inode") + } + if upper != nil && upper.overlay != nil { + panic("nested writable layers are not supported") + } + // Check for supported lower filesystem types. + if lower != nil { + switch lower.StableAttr.Type { + case RegularFile, Directory, Symlink, Socket: + default: + // We don't support copying up from character devices, + // named pipes, or anything weird (like proc files). + log.Warningf("%s not supported in lower filesytem", lower.StableAttr.Type) + return nil, syserror.EINVAL + } + } + return &overlayEntry{ + lowerExists: lowerExists, + lower: lower, + upper: upper, + }, nil +} + +func (o *overlayEntry) release() { + // We drop a reference on upper and lower file system Inodes + // rather than releasing them, because in-memory filesystems + // may hold an extra reference to these Inodes so that they + // stay in memory. 
	if o.upper != nil {
		o.upper.DecRef()
	}
	if o.lower != nil {
		o.lower.DecRef()
	}
}

// overlayUpperMountSource gives the upper mount of an overlay mount.
//
// The caller may not use this MountSource past the lifetime of overlayMountSource and may
// not call DecRef on it.
func overlayUpperMountSource(overlayMountSource *MountSource) *MountSource {
	return overlayMountSource.MountSourceOperations.(*overlayMountSourceOperations).upper
}

// inodeLocked returns the Inode that currently backs this entry: upper if it
// exists, otherwise lower.
//
// Preconditions: At least one of o.copyMu, o.mapsMu, or o.dataMu must be locked.
func (o *overlayEntry) inodeLocked() *Inode {
	if o.upper != nil {
		return o.upper
	}
	return o.lower
}

// isMappableLocked reports whether the backing Inode supports memory mapping.
//
// Preconditions: At least one of o.copyMu, o.mapsMu, or o.dataMu must be locked.
func (o *overlayEntry) isMappableLocked() bool {
	return o.inodeLocked().Mappable() != nil
}

// AddMapping implements memmap.Mappable.AddMapping.
func (o *overlayEntry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error {
	o.mapsMu.Lock()
	defer o.mapsMu.Unlock()
	if err := o.inodeLocked().Mappable().AddMapping(ctx, ms, ar, offset); err != nil {
		return err
	}
	// Mirror the mapping into o.mappings so it can be transferred to the
	// upper Mappable if copy-up occurs later.
	o.mappings.AddMapping(ms, ar, offset)
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (o *overlayEntry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) {
	o.mapsMu.Lock()
	defer o.mapsMu.Unlock()
	o.inodeLocked().Mappable().RemoveMapping(ctx, ms, ar, offset)
	o.mappings.RemoveMapping(ms, ar, offset)
}

// CopyMapping implements memmap.Mappable.CopyMapping.
+func (o *overlayEntry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + o.mapsMu.Lock() + defer o.mapsMu.Unlock() + if err := o.inodeLocked().Mappable().CopyMapping(ctx, ms, srcAR, dstAR, offset); err != nil { + return err + } + o.mappings.AddMapping(ms, dstAR, offset) + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (o *overlayEntry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + o.dataMu.RLock() + defer o.dataMu.RUnlock() + return o.inodeLocked().Mappable().Translate(ctx, required, optional, at) +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (o *overlayEntry) InvalidateUnsavable(ctx context.Context) error { + o.mapsMu.Lock() + defer o.mapsMu.Unlock() + return o.inodeLocked().Mappable().InvalidateUnsavable(ctx) +} diff --git a/pkg/sentry/fs/path.go b/pkg/sentry/fs/path.go new file mode 100644 index 000000000..b74f6ed8c --- /dev/null +++ b/pkg/sentry/fs/path.go @@ -0,0 +1,92 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// TrimTrailingSlashes trims any trailing slashes. +// +// The returned boolean indicates whether any changes were made. +// +//go:nosplit +func TrimTrailingSlashes(dir string) (trimmed string, changed bool) { + // Trim the trailing slash, except for root. 
func TrimTrailingSlashes(dir string) (trimmed string, changed bool) {
	// Peel trailing '/' characters one at a time; a lone "/" (root) is kept.
	trimmed = dir
	for len(trimmed) > 1 && trimmed[len(trimmed)-1] == '/' {
		trimmed = trimmed[:len(trimmed)-1]
		changed = true
	}
	return trimmed, changed
}

// SplitLast splits the given path into a directory and a file.
//
// The "absoluteness" of the path is preserved, but dir is always stripped of
// trailing slashes.
//
//go:nosplit
func SplitLast(path string) (dir, file string) {
	path, _ = TrimTrailingSlashes(path)
	if path == "" {
		return ".", "."
	}
	if path == "/" {
		return "/", "."
	}

	// Locate the final '/' by scanning backwards.
	sep := -1
	for i := len(path) - 1; i >= 0; i-- {
		if path[i] == '/' {
			sep = i
			break
		}
	}
	if sep < 0 {
		// No separator at all: a bare relative name.
		return ".", path
	}
	if sep == 0 {
		// "/foo" form: the leading slash is kept so the result still
		// reads as an absolute path.
		return "/", path[1:]
	}
	// Strip any slashes that now trail the directory portion.
	dir, _ = TrimTrailingSlashes(path[:sep])
	return dir, path[sep+1:]
}

// SplitFirst splits the given path into a first directory and the remainder.
//
// If remainder is empty, then the path is a single element.
//
//go:nosplit
func SplitFirst(path string) (current, remainder string) {
	path, _ = TrimTrailingSlashes(path)
	if path == "" {
		return ".", ""
	}

	// Locate the first '/' by scanning forwards.
	sep := -1
	for i := 0; i < len(path); i++ {
		if path[i] == '/' {
			sep = i
			break
		}
	}
	if sep < 0 {
		// Single component.
		return path, ""
	}
	if sep == 0 {
		// Absolute path: the leading slash itself is the first element.
		return "/", path[1:]
	}
	current, remainder = path[:sep], path[sep+1:]
	// Drop redundant slashes at the head of the remainder.
	for len(remainder) > 0 && remainder[0] == '/' {
		remainder = remainder[1:]
	}
	return current, remainder
}
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "testing" +) + +// TestSplitLast tests variants of path splitting. +func TestSplitLast(t *testing.T) { + cases := []struct { + path string + dir string + file string + }{ + {path: "/", dir: "/", file: "."}, + {path: "/.", dir: "/", file: "."}, + {path: "/./", dir: "/", file: "."}, + {path: "/./.", dir: "/.", file: "."}, + {path: "/././", dir: "/.", file: "."}, + {path: "/./..", dir: "/.", file: ".."}, + {path: "/./../", dir: "/.", file: ".."}, + {path: "/..", dir: "/", file: ".."}, + {path: "/../", dir: "/", file: ".."}, + {path: "/../.", dir: "/..", file: "."}, + {path: "/.././", dir: "/..", file: "."}, + {path: "/../..", dir: "/..", file: ".."}, + {path: "/../../", dir: "/..", file: ".."}, + + {path: "", dir: ".", file: "."}, + {path: ".", dir: ".", file: "."}, + {path: "./", dir: ".", file: "."}, + {path: "./.", dir: ".", file: "."}, + {path: "././", dir: ".", file: "."}, + {path: "./..", dir: ".", file: ".."}, + {path: "./../", dir: ".", file: ".."}, + {path: "..", dir: ".", file: ".."}, + {path: "../", dir: ".", file: ".."}, + {path: "../.", dir: "..", file: "."}, + {path: ".././", dir: "..", file: "."}, + {path: "../..", dir: "..", file: ".."}, + {path: "../../", dir: "..", file: ".."}, + + {path: "/foo", dir: "/", file: "foo"}, + {path: "/foo/", dir: "/", file: "foo"}, + {path: "/foo/.", dir: "/foo", file: "."}, + {path: "/foo/./", dir: "/foo", file: "."}, + {path: 
"/foo/./.", dir: "/foo/.", file: "."}, + {path: "/foo/./..", dir: "/foo/.", file: ".."}, + {path: "/foo/..", dir: "/foo", file: ".."}, + {path: "/foo/../", dir: "/foo", file: ".."}, + {path: "/foo/../.", dir: "/foo/..", file: "."}, + {path: "/foo/../..", dir: "/foo/..", file: ".."}, + + {path: "/foo/bar", dir: "/foo", file: "bar"}, + {path: "/foo/bar/", dir: "/foo", file: "bar"}, + {path: "/foo/bar/.", dir: "/foo/bar", file: "."}, + {path: "/foo/bar/./", dir: "/foo/bar", file: "."}, + {path: "/foo/bar/./.", dir: "/foo/bar/.", file: "."}, + {path: "/foo/bar/./..", dir: "/foo/bar/.", file: ".."}, + {path: "/foo/bar/..", dir: "/foo/bar", file: ".."}, + {path: "/foo/bar/../", dir: "/foo/bar", file: ".."}, + {path: "/foo/bar/../.", dir: "/foo/bar/..", file: "."}, + {path: "/foo/bar/../..", dir: "/foo/bar/..", file: ".."}, + + {path: "foo", dir: ".", file: "foo"}, + {path: "foo", dir: ".", file: "foo"}, + {path: "foo/", dir: ".", file: "foo"}, + {path: "foo/.", dir: "foo", file: "."}, + {path: "foo/./", dir: "foo", file: "."}, + {path: "foo/./.", dir: "foo/.", file: "."}, + {path: "foo/./..", dir: "foo/.", file: ".."}, + {path: "foo/..", dir: "foo", file: ".."}, + {path: "foo/../", dir: "foo", file: ".."}, + {path: "foo/../.", dir: "foo/..", file: "."}, + {path: "foo/../..", dir: "foo/..", file: ".."}, + {path: "foo/", dir: ".", file: "foo"}, + {path: "foo/.", dir: "foo", file: "."}, + + {path: "foo/bar", dir: "foo", file: "bar"}, + {path: "foo/bar/", dir: "foo", file: "bar"}, + {path: "foo/bar/.", dir: "foo/bar", file: "."}, + {path: "foo/bar/./", dir: "foo/bar", file: "."}, + {path: "foo/bar/./.", dir: "foo/bar/.", file: "."}, + {path: "foo/bar/./..", dir: "foo/bar/.", file: ".."}, + {path: "foo/bar/..", dir: "foo/bar", file: ".."}, + {path: "foo/bar/../", dir: "foo/bar", file: ".."}, + {path: "foo/bar/../.", dir: "foo/bar/..", file: "."}, + {path: "foo/bar/../..", dir: "foo/bar/..", file: ".."}, + {path: "foo/bar/", dir: "foo", file: "bar"}, + {path: "foo/bar/.", dir: 
"foo/bar", file: "."}, + } + + for _, c := range cases { + dir, file := SplitLast(c.path) + if dir != c.dir || file != c.file { + t.Errorf("SplitLast(%q) got (%q, %q), expected (%q, %q)", c.path, dir, file, c.dir, c.file) + } + } +} + +// TestSplitFirst tests variants of path splitting. +func TestSplitFirst(t *testing.T) { + cases := []struct { + path string + first string + remainder string + }{ + {path: "/", first: "/", remainder: ""}, + {path: "/.", first: "/", remainder: "."}, + {path: "///.", first: "/", remainder: "//."}, + {path: "/.///", first: "/", remainder: "."}, + {path: "/./.", first: "/", remainder: "./."}, + {path: "/././", first: "/", remainder: "./."}, + {path: "/./..", first: "/", remainder: "./.."}, + {path: "/./../", first: "/", remainder: "./.."}, + {path: "/..", first: "/", remainder: ".."}, + {path: "/../", first: "/", remainder: ".."}, + {path: "/../.", first: "/", remainder: "../."}, + {path: "/.././", first: "/", remainder: "../."}, + {path: "/../..", first: "/", remainder: "../.."}, + {path: "/../../", first: "/", remainder: "../.."}, + + {path: "", first: ".", remainder: ""}, + {path: ".", first: ".", remainder: ""}, + {path: "./", first: ".", remainder: ""}, + {path: ".///", first: ".", remainder: ""}, + {path: "./.", first: ".", remainder: "."}, + {path: "././", first: ".", remainder: "."}, + {path: "./..", first: ".", remainder: ".."}, + {path: "./../", first: ".", remainder: ".."}, + {path: "..", first: "..", remainder: ""}, + {path: "../", first: "..", remainder: ""}, + {path: "../.", first: "..", remainder: "."}, + {path: ".././", first: "..", remainder: "."}, + {path: "../..", first: "..", remainder: ".."}, + {path: "../../", first: "..", remainder: ".."}, + + {path: "/foo", first: "/", remainder: "foo"}, + {path: "/foo/", first: "/", remainder: "foo"}, + {path: "/foo///", first: "/", remainder: "foo"}, + {path: "/foo/.", first: "/", remainder: "foo/."}, + {path: "/foo/./", first: "/", remainder: "foo/."}, + {path: "/foo/./.", 
first: "/", remainder: "foo/./."}, + {path: "/foo/./..", first: "/", remainder: "foo/./.."}, + {path: "/foo/..", first: "/", remainder: "foo/.."}, + {path: "/foo/../", first: "/", remainder: "foo/.."}, + {path: "/foo/../.", first: "/", remainder: "foo/../."}, + {path: "/foo/../..", first: "/", remainder: "foo/../.."}, + + {path: "/foo/bar", first: "/", remainder: "foo/bar"}, + {path: "///foo/bar", first: "/", remainder: "//foo/bar"}, + {path: "/foo///bar", first: "/", remainder: "foo///bar"}, + {path: "/foo/bar/.", first: "/", remainder: "foo/bar/."}, + {path: "/foo/bar/./", first: "/", remainder: "foo/bar/."}, + {path: "/foo/bar/./.", first: "/", remainder: "foo/bar/./."}, + {path: "/foo/bar/./..", first: "/", remainder: "foo/bar/./.."}, + {path: "/foo/bar/..", first: "/", remainder: "foo/bar/.."}, + {path: "/foo/bar/../", first: "/", remainder: "foo/bar/.."}, + {path: "/foo/bar/../.", first: "/", remainder: "foo/bar/../."}, + {path: "/foo/bar/../..", first: "/", remainder: "foo/bar/../.."}, + + {path: "foo", first: "foo", remainder: ""}, + {path: "foo", first: "foo", remainder: ""}, + {path: "foo/", first: "foo", remainder: ""}, + {path: "foo///", first: "foo", remainder: ""}, + {path: "foo/.", first: "foo", remainder: "."}, + {path: "foo/./", first: "foo", remainder: "."}, + {path: "foo/./.", first: "foo", remainder: "./."}, + {path: "foo/./..", first: "foo", remainder: "./.."}, + {path: "foo/..", first: "foo", remainder: ".."}, + {path: "foo/../", first: "foo", remainder: ".."}, + {path: "foo/../.", first: "foo", remainder: "../."}, + {path: "foo/../..", first: "foo", remainder: "../.."}, + {path: "foo/", first: "foo", remainder: ""}, + {path: "foo/.", first: "foo", remainder: "."}, + + {path: "foo/bar", first: "foo", remainder: "bar"}, + {path: "foo///bar", first: "foo", remainder: "bar"}, + {path: "foo/bar/", first: "foo", remainder: "bar"}, + {path: "foo/bar/.", first: "foo", remainder: "bar/."}, + {path: "foo/bar/./", first: "foo", remainder: "bar/."}, + 
{path: "foo/bar/./.", first: "foo", remainder: "bar/./."}, + {path: "foo/bar/./..", first: "foo", remainder: "bar/./.."}, + {path: "foo/bar/..", first: "foo", remainder: "bar/.."}, + {path: "foo/bar/../", first: "foo", remainder: "bar/.."}, + {path: "foo/bar/../.", first: "foo", remainder: "bar/../."}, + {path: "foo/bar/../..", first: "foo", remainder: "bar/../.."}, + {path: "foo/bar/", first: "foo", remainder: "bar"}, + {path: "foo/bar/.", first: "foo", remainder: "bar/."}, + } + + for _, c := range cases { + first, remainder := SplitFirst(c.path) + if first != c.first || remainder != c.remainder { + t.Errorf("SplitFirst(%q) got (%q, %q), expected (%q, %q)", c.path, first, remainder, c.first, c.remainder) + } + } +} diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD new file mode 100644 index 000000000..18372cfbf --- /dev/null +++ b/pkg/sentry/fs/proc/BUILD @@ -0,0 +1,95 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "proc_state", + srcs = [ + "cpuinfo.go", + "exec_args.go", + "fds.go", + "file.go", + "filesystems.go", + "fs.go", + "loadavg.go", + "meminfo.go", + "mounts.go", + "net.go", + "proc.go", + "stat.go", + "sys.go", + "sys_net.go", + "task.go", + "uid_gid_map.go", + "uptime.go", + "version.go", + ], + out = "proc_state.go", + package = "proc", +) + +go_library( + name = "proc", + srcs = [ + "cpuinfo.go", + "exec_args.go", + "fds.go", + "file.go", + "filesystems.go", + "fs.go", + "loadavg.go", + "meminfo.go", + "mounts.go", + "net.go", + "proc.go", + "proc_state.go", + "stat.go", + "sys.go", + "sys_net.go", + "task.go", + "uid_gid_map.go", + "uptime.go", + "version.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + 
"//pkg/sentry/fs", + "//pkg/sentry/fs/proc/device", + "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/mm", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", + "//pkg/syserror", + ], +) + +go_test( + name = "proc_test", + size = "small", + srcs = [ + "net_test.go", + "sys_net_test.go", + ], + embed = [":proc"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/inet", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md new file mode 100644 index 000000000..c510ee63a --- /dev/null +++ b/pkg/sentry/fs/proc/README.md @@ -0,0 +1,317 @@ +This document tracks what is implemented in procfs. Refer to +Documentation/filesystems/proc.txt in the Linux project for information about +procfs generally. + +**NOTE**: This document is not guaranteed to be up to date. If you find an +inconsistency, please file a bug. 
+ +[TOC] +## Kernel data + +The following files are implemented: + +| File /proc/ | Content | +| :------------------------ | :----------------------------------------------- | +| [cpuinfo](#cpuinfo) | Info about the CPU | +| [filesystem](#filesystem) | Supported filesystems | +| [loadavg](#loadavg) | Load average of last 1, 5 & 15 minutes | +| [meminfo](#meminfo) | Overall memory info | +| [stat](#stat) | Overall kernel statistics | +| [sys](#sys) | Change parameters within the kernel | +| [uptime](#uptime) | Wall clock since boot, combined idle time of all | +: : cpus : +| [version](#version) | Kernel version | + +### cpuinfo + +```bash +$ cat /proc/cpuinfo +processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 45 +model name : unknown +stepping : unknown +cpu MHz : 1234.588 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx xsaveopt +bogomips : 1234.59 +clflush size : 64 +cache_alignment : 64 +address sizes : 46 bits physical, 48 bits virtual +power management: + +... +``` + +Notable divergences: + +Field name | Notes +:--------------- | :--------------------------------------- +model name | Always unknown +stepping | Always unknown +fpu | Always yes +fpu_exception | Always yes +wp | Always yes +bogomips | Bogus value (matches cpu MHz) +clflush size | Always 64 +cache_alignment | Always 64 +address sizes | Always 46 bits physical, 48 bits virtual +power management | Always blank + +Otherwise fields are derived from the SentryCPUIDSpec proto config. 
+ +### filesystem + +```bash +$ cat /proc/filesystems +nodev 9p +nodev devtmpfs +nodev proc +nodev ramdiskfs +nodev sysfs +nodev tmpfs +``` + +Notable divergences: + +Filesystem | Notes +:--------- | :-------------------------------------------------------- +ramdiskfs | No Linux equivalent, see the SentryRamdiskFS proto config + +### loadavg + +```bash +$ cat /proc/loadavg +0.00 0.00 0.00 0/0 0 +``` + +Column | Notes +:------------------------------------ | :---------- +CPU.IO utilization in last 1 minute | Always zero +CPU.IO utilization in last 5 minutes | Always zero +CPU.IO utilization in last 15 minutes | Always zero +Num currently running processes | Always zero +Total num processes | Always zero + +TODO: Populate the columns with accurate statistics. +### meminfo + +```bash +$ cat /proc/meminfo +MemTotal: 2097152 kB +MemFree: 2083540 kB +MemAvailable: 2083540 kB +Buffers: 0 kB +Cached: 4428 kB +SwapCache: 0 kB +Active: 10812 kB +Inactive: 2216 kB +Active(anon): 8600 kB +Inactive(anon): 0 kB +Active(file): 2212 kB +Inactive(file): 2216 kB +Unevictable: 0 kB +Mlocked: 0 kB +SwapTotal: 0 kB +SwapFree: 0 kB +Dirty: 0 kB +Writeback: 0 kB +AnonPages: 8600 kB +Mapped: 4428 kB +Shmem: 0 kB + +``` + +Notable divergences: + +Field name | Notes +:---------------- | :-------------------------------------------------------- +Buffers | Always zero, no block devices +SwapCache | Always zero, no swap +Inactive(anon) | Always zero, see SwapCache +Unevictable | Always zero TODO +Mlocked | Always zero TODO +SwapTotal | Always zero, no swap +SwapFree | Always zero, no swap +Dirty | Always zero TODO +Writeback | Always zero TODO +MemAvailable | Uses the same value as MemFree since there is no swap. 
+Slab | Missing +SReclaimable | Missing +SUnreclaim | Missing +KernelStack | Missing +PageTables | Missing +NFS_Unstable | Missing +Bounce | Missing +WritebackTmp | Missing +CommitLimit | Missing +Committed_AS | Missing +VmallocTotal | Missing +VmallocUsed | Missing +VmallocChunk | Missing +HardwareCorrupted | Missing +AnonHugePages | Missing +ShmemHugePages | Missing +ShmemPmdMapped | Missing +HugePages_Total | Missing +HugePages_Free | Missing +HugePages_Rsvd | Missing +HugePages_Surp | Missing +Hugepagesize | Missing +DirectMap4k | Missing +DirectMap2M | Missing +DirectMap1G | Missing + +See [Memory +Accounting](pkg/sentry/usage/g3doc/memory-accounting.md) +for general caveats. + +### stat + +```bash +$ cat /proc/stat +cpu 0 0 0 0 0 0 0 0 0 0 +cpu0 0 0 0 0 0 0 0 0 0 0 +cpu1 0 0 0 0 0 0 0 0 0 0 +cpu2 0 0 0 0 0 0 0 0 0 0 +cpu3 0 0 0 0 0 0 0 0 0 0 +cpu4 0 0 0 0 0 0 0 0 0 0 +cpu5 0 0 0 0 0 0 0 0 0 0 +cpu6 0 0 0 0 0 0 0 0 0 0 +cpu7 0 0 0 0 0 0 0 0 0 0 +intr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ctxt 0 +btime 1504040968 +processes 0 +procs_running 0 +procs_blocked 0 +softirq 0 0 0 0 0 0 0 0 0 0 0 +``` + +All fields except for `btime` are always zero. +TODO: Populate with accurate fields. 
+ +### sys + +```bash +$ ls /proc/sys +kernel vm +``` + +Directory | Notes +:-------- | :---------------------------- +abi | Missing +debug | Missing +dev | Missing +fs | Missing +kernel | Contains hostname (only) +net | Missing +user | Missing +vm | Contains mmap_min_addr (only) + +### uptime + +```bash +$ cat /proc/uptime +3204.62 0.00 +``` + +Column | Notes +:------------------------------- | :---------------------------- +Total num seconds system running | Time since procfs was mounted +Number of seconds idle | Always zero + +### version + +```bash +$ cat /proc/version +Linux version 3.11.10 #1 SMP Fri Nov 29 10:47:50 PST 2013 +``` + +## Process-specific data + +The following files are implemented: + +File /proc/PID | Content +:------------------ | :--------------------------------------------------- +[auxv](#auxv) | Copy of auxiliary vector for the process +[cmdline](#cmdline) | Command line arguments +[comm](#comm) | Command name associated with the process +[exe](#exe) | Symlink to the process's executable +[fd](#fd) | Directory containing links to open file descriptors +[fdinfo](#fdinfo) | Information associated with open file descriptors +[gid_map](#gid_map) | Mappings for group IDs inside the user namespace +[io](#io) | IO statistics +[maps](#maps) | Memory mappings (anon, executables, library files) +[ns](#ns) | Directory containing info about supported namespaces +[stat](#stat) | Process statistics +[status](#status) | Process status in human readable format +[task](#task) | Directory containing info about running threads +[uid_map](#uid_map) | Mappings for user IDs inside the user namespace + +### auxv + +TODO + +### cmdline + +TODO + +### comm + +TODO + +### exe + +TODO + +### fd + +TODO + +### fdinfo + +TODO + +### gid_map + +TODO + +### io + +Only has data for rchar, wchar, syscr, and syscw. + +TODO: add more detail. 
+ +### maps + +TODO + +### ns + +TODO + +### stat + +Only has data for pid, comm, state, ppid, utime, stime, cutime, cstime, +num_threads, and exit_signal. + +TODO: add more detail. + +### status + +Statically created, most of the fields have no data. + +TODO: add more detail. + +### task + +TODO + +### uid_map + +TODO diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go new file mode 100644 index 000000000..f80aaa5b1 --- /dev/null +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -0,0 +1,64 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// cpuinfo is a file describing the CPU capabilities. +// +// Presently cpuinfo never changes, so it doesn't need to be a SeqFile. +type cpuinfo struct { + ramfs.Entry + + // k is the system kernel. + k *kernel.Kernel +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (c *cpuinfo) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + features := c.k.FeatureSet() + if features == nil { + // Kernel is always initialized with a FeatureSet. 
+ panic("cpuinfo read with nil FeatureSet") + } + + contents := make([]byte, 0, 1024) + for i, max := uint(0), c.k.ApplicationCores(); i < max; i++ { + contents = append(contents, []byte(features.CPUInfo(i))...) + } + if offset >= int64(len(contents)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, contents[offset:]) + return int64(n), err +} + +func (p *proc) newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + f := &cpuinfo{ + k: p.k, + } + f.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + + return newFile(f, msrc, fs.SpecialFile, nil) +} diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD new file mode 100644 index 000000000..b62062bd7 --- /dev/null +++ b/pkg/sentry/fs/proc/device/BUILD @@ -0,0 +1,11 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "device", + srcs = ["device.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device", + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sentry/device"], +) diff --git a/pkg/sentry/fs/proc/device/device.go b/pkg/sentry/fs/proc/device/device.go new file mode 100644 index 000000000..6194afe88 --- /dev/null +++ b/pkg/sentry/fs/proc/device/device.go @@ -0,0 +1,23 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package device contains the proc device to avoid dependency loops. 
+package device + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// ProcDevice is the kernel proc device. +var ProcDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go new file mode 100644 index 000000000..0e1523bf1 --- /dev/null +++ b/pkg/sentry/fs/proc/exec_args.go @@ -0,0 +1,129 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// execArgType enumerates the types of exec arguments that are exposed through +// proc. +type execArgType int + +const ( + cmdlineExecArg execArgType = iota + environExecArg +) + +// execArgFile is a file containing the exec args (either cmdline or environ) +// for a given task. +type execArgFile struct { + ramfs.Entry + + // arg is the type of exec argument this file contains. + arg execArgType + + // t is the Task to read the exec arg line from. + t *kernel.Task +} + +// newExecArgFile creates a file containing the exec args of the given type. 
+func newExecArgFile(t *kernel.Task, msrc *fs.MountSource, arg execArgType) *fs.Inode { + if arg != cmdlineExecArg && arg != environExecArg { + panic(fmt.Sprintf("unknown exec arg type %v", arg)) + } + f := &execArgFile{ + arg: arg, + t: t, + } + f.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(f, msrc, fs.SpecialFile, t) +} + +// DeprecatedPreadv reads the exec arg from the process's address space.. +func (f *execArgFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // N.B. Linux 4.2 eliminates the arbitrary one page limit. + if offset > usermem.PageSize { + return 0, io.EOF + } + dst = dst.TakeFirst64(usermem.PageSize - offset) + + m, err := getTaskMM(f.t) + if err != nil { + return 0, err + } + defer m.DecUsers(ctx) + + // Figure out the bounds of the exec arg we are trying to read. + var execArgStart, execArgEnd usermem.Addr + switch f.arg { + case cmdlineExecArg: + execArgStart, execArgEnd = m.ArgvStart(), m.ArgvEnd() + case environExecArg: + execArgStart, execArgEnd = m.EnvvStart(), m.EnvvEnd() + default: + panic(fmt.Sprintf("unknown exec arg type %v", f.arg)) + } + if execArgStart == 0 || execArgEnd == 0 { + // Don't attempt to read before the start/end are set up. + return 0, io.EOF + } + + start, ok := execArgStart.AddLength(uint64(offset)) + if !ok { + return 0, io.EOF + } + if start >= execArgEnd { + return 0, io.EOF + } + + length := int(execArgEnd - start) + if dstlen := dst.NumBytes(); int64(length) > dstlen { + length = int(dstlen) + } + + buf := make([]byte, length) + // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true + // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading + // cmdline and environment"). + copyN, copyErr := m.CopyIn(ctx, start, buf, usermem.IOOpts{}) + if copyN == 0 { + // Nothing to copy. 
+ return 0, copyErr + } + buf = buf[:copyN] + + // TODO: On Linux, if the NUL byte at the end of the + // argument vector has been overwritten, it continues reading the + // environment vector as part of the argument vector. + + n, dstErr := dst.CopyOut(ctx, buf) + if dstErr != nil { + return int64(n), dstErr + } + return int64(n), copyErr +} diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go new file mode 100644 index 000000000..2eca9ac31 --- /dev/null +++ b/pkg/sentry/fs/proc/fds.go @@ -0,0 +1,258 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "sort" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// walkDescriptors finds the descriptor (file-flag pair) for the fd identified +// by p, and calls the toInodeOperations callback with that descriptor. This is a helper +// method for implementing fs.InodeOperations.Lookup. 
+func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDFlags) *fs.Inode) (*fs.Inode, error) { + n, err := strconv.ParseUint(p, 10, 64) + if err != nil { + // Not found. + return nil, syserror.ENOENT + } + + var file *fs.File + var flags kernel.FDFlags + t.WithMuLocked(func(t *kernel.Task) { + if fdm := t.FDMap(); fdm != nil { + file, flags = fdm.GetDescriptor(kdefs.FD(n)) + } + }) + if file == nil { + return nil, syserror.ENOENT + } + return toInode(file, flags), nil +} + +// readDescriptors reads fds in the task starting at offset, and calls the +// toDentAttr callback for each to get a DentAttr, which it then emits. This is +// a helper for implementing fs.InodeOperations.Readdir. +func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int, toDentAttr func(int) fs.DentAttr) (int, error) { + var fds kernel.FDs + t.WithMuLocked(func(t *kernel.Task) { + if fdm := t.FDMap(); fdm != nil { + fds = fdm.GetFDs() + } + }) + + fdInts := make([]int, 0, len(fds)) + for _, fd := range fds { + fdInts = append(fdInts, int(fd)) + } + + // Find the fd to start at. + idx := sort.SearchInts(fdInts, offset) + if idx == len(fdInts) { + return offset, nil + } + fdInts = fdInts[idx:] + + var fd int + for _, fd = range fdInts { + name := strconv.FormatUint(uint64(fd), 10) + if err := c.DirEmit(name, toDentAttr(fd)); err != nil { + // Returned offset is the next fd to serialize. + return fd, err + } + } + // We serialized them all. Next offset should be higher than last + // serialized fd. + return fd + 1, nil +} + +// fd is a single file in /proc/TID/fd/. +type fd struct { + ramfs.Symlink + *fs.File +} + +// newFD returns a new fd based on an existing file. +// +// This inherits one reference to the file. 
+func newFd(t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { + fd := &fd{File: f} + // RootOwner by default, is overridden in UnstableAttr() + fd.InitSymlink(t, fs.RootOwner, "") + return newFile(fd, msrc, fs.Symlink, t) +} + +// GetFile returns the fs.File backing this fd. The dirent and flags +// arguments are ignored. +func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) { + // Take a reference on the fs.File. + f.File.IncRef() + return f.File, nil +} + +// Readlink returns the current target. +func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + n, _ := f.Dirent.FullName(root) + return n, nil +} + +// Getlink implements fs.InodeOperations.Getlink. +func (f *fd) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + f.Dirent.IncRef() + return f.Dirent, nil +} + +// Truncate is ignored. +func (f *fd) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// Close releases the reference on the file. +func (f *fd) Close() error { + f.DecRef() + return nil +} + +// fdDir implements /proc/TID/fd. +type fdDir struct { + ramfs.Dir + + // We hold a reference on the task's fdmap but only keep an indirect + // task pointer to avoid Dirent loading circularity caused by fdmap's + // potential back pointers into the dirent tree. + t *kernel.Task +} + +// newFdDir creates a new fdDir. +func newFdDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + f := &fdDir{t: t} + f.InitDir(t, nil, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true, Execute: true}}) + return newFile(f, msrc, fs.SpecialDirectory, t) +} + +// Check implements InodeOperations.Check. +// +// This is to match Linux, which uses a special permission handler to guarantee +// that a process can still access /proc/self/fd after it has executed +// setuid. See fs/proc/fd.c:proc_fd_permission. 
+func (f *fdDir) Check(ctx context.Context, inode *fs.Inode, req fs.PermMask) bool { + if fs.ContextCanAccessFile(ctx, inode, req) { + return true + } + if t := kernel.TaskFromContext(ctx); t != nil { + // Allow access if the task trying to access it is in the + // thread group corresponding to this directory. + // + // N.B. Technically, in Linux 3.11, this compares what would be + // the equivalent of task pointers. However, this was fixed + // later in 54708d2858e7 ("proc: actually make + // proc_fd_permission() thread-friendly"). + if f.t.ThreadGroup() == t.ThreadGroup() { + return true + } + } + return false +} + +// Lookup loads an Inode in /proc/TID/fd into a Dirent. +func (f *fdDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + n, err := walkDescriptors(f.t, p, func(file *fs.File, _ kernel.FDFlags) *fs.Inode { + return newFd(f.t, file, dir.MountSource) + }) + if err != nil { + return nil, err + } + return fs.NewDirent(n, p), nil +} + +// DeprecatedReaddir lists fds in /proc/TID/fd. +func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + return readDescriptors(f.t, dirCtx, offset, func(fd int) fs.DentAttr { + return fs.GenericDentAttr(fs.Symlink, device.ProcDevice) + }) +} + +// fdInfo is a single file in /proc/TID/fdinfo/. +type fdInfo struct { + ramfs.File + + flags kernel.FDFlags +} + +// newFdInfo returns a new fdInfo based on an existing file. +func newFdInfo(t *kernel.Task, _ *fs.File, flags kernel.FDFlags, msrc *fs.MountSource) *fs.Inode { + fdi := &fdInfo{flags: flags} + fdi.InitFile(t, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true}}) + // TODO: Get pos, locks, and other data. For now we only + // have flags. + // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + fdi.Append([]byte(fmt.Sprintf("flags: %08o\n", flags))) + return newFile(fdi, msrc, fs.SpecialFile, t) +} + +// DeprecatedPwritev implements fs.HandleOperations.DeprecatedPwritev. 
+func (*fdInfo) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + return 0, ramfs.ErrInvalidOp +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*fdInfo) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + return ramfs.ErrInvalidOp +} + +// fdInfoDir implements /proc/TID/fdinfo. It embeds an fdDir, but overrides +// Lookup and Readdir. +type fdInfoDir struct { + ramfs.Dir + + t *kernel.Task +} + +// newFdInfoDir creates a new fdInfoDir. +func newFdInfoDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + fdid := &fdInfoDir{t: t} + fdid.InitDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0500)) + return newFile(fdid, msrc, fs.SpecialDirectory, t) +} + +// Lookup loads an fd in /proc/TID/fdinfo into a Dirent. +func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + n, err := walkDescriptors(fdid.t, p, func(file *fs.File, flags kernel.FDFlags) *fs.Inode { + return newFdInfo(fdid.t, file, flags, dir.MountSource) + }) + if err != nil { + return nil, err + } + return fs.NewDirent(n, p), nil +} + +// DeprecatedReaddir lists fds in /proc/TID/fdinfo. +func (fdid *fdInfoDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + return readDescriptors(fdid.t, dirCtx, offset, func(fd int) fs.DentAttr { + return fs.GenericDentAttr(fs.RegularFile, device.ProcDevice) + }) +} diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go new file mode 100644 index 000000000..9a433cdf8 --- /dev/null +++ b/pkg/sentry/fs/proc/file.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type file struct { + fs.InodeOperations + + // nodeType is the file type of this file. + nodeType fs.InodeType + + // t is the associated kernel task that owns this file. + t *kernel.Task +} + +func newFile(node fs.InodeOperations, msrc *fs.MountSource, nodeType fs.InodeType, t *kernel.Task) *fs.Inode { + iops := &file{node, nodeType, t} + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: nodeType, + } + return fs.NewInode(iops, msrc, sattr) +} + +// UnstableAttr returns all attributes of this file. +func (f *file) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, err := f.InodeOperations.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } + if f.t != nil { + uattr.Owner = fs.FileOwnerFromContext(f.t) + } + return uattr, nil +} diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go new file mode 100644 index 000000000..fe4de18ba --- /dev/null +++ b/pkg/sentry/fs/proc/filesystems.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" +) + +// filesystemsData backs /proc/filesystems. +type filesystemsData struct{} + +// NeedsUpdate returns true on the first generation. The set of registered file +// systems doesn't change so there's no need to generate SeqData more than once. +func (*filesystemsData) NeedsUpdate(generation int64) bool { + return generation == 0 +} + +// ReadSeqFileData returns data for the SeqFile reader. +// SeqData, the current generation and where in the file the handle corresponds to. +func (*filesystemsData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + // We don't ever expect to see a non-nil SeqHandle. + if h != nil { + return nil, 0 + } + + // Generate the file contents. + var buf bytes.Buffer + for _, sys := range fs.GetFilesystems() { + nodev := "nodev" + if sys.Flags()&fs.FilesystemRequiresDev != 0 { + nodev = "" + } + // Matches the format of fs/filesystems.c:filesystems_proc_show. + fmt.Fprintf(&buf, "%s\t%s\n", nodev, sys.Name()) + } + + // Return the SeqData and advance the generation counter. 
+	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*filesystemsData)(nil)}}, 1
+}
diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go
new file mode 100644
index 000000000..072d00beb
--- /dev/null
+++ b/pkg/sentry/fs/proc/fs.go
@@ -0,0 +1,69 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"fmt"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// filesystem is a procfs.
+type filesystem struct{}
+
+func init() {
+	fs.RegisterFilesystem(&filesystem{})
+}
+
+// FilesystemName is the name under which the filesystem is registered.
+// Name matches fs/proc/root.c:proc_fs_type.name.
+const FilesystemName = "proc"
+
+// Name is the name of the file system.
+func (*filesystem) Name() string {
+	return FilesystemName
+}
+
+// AllowUserMount allows users to mount(2) this file system.
+func (*filesystem) AllowUserMount() bool {
+	return true
+}
+
+// Flags returns that there is nothing special about this file system.
+//
+// In Linux, proc returns FS_USERNS_VISIBLE | FS_USERNS_MOUNT, see fs/proc/root.c.
+func (*filesystem) Flags() fs.FilesystemFlags {
+	return 0
+}
+
+// Mount returns the root of a procfs that can be positioned in the vfs.
+func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) {
+	// device is always ignored.
+
+	// Parse generic comma-separated key=value options, this file system expects them.
+	options := fs.GenericMountSourceOptions(data)
+
+	// Proc options parsing checks for either a gid= or hidepid= and barfs on
+	// anything else, see fs/proc/root.c:proc_parse_options. Since we don't know
+	// what to do with gid= or hidepid=, we blow up if we get any options.
+	if len(options) > 0 {
+		return nil, fmt.Errorf("unsupported mount options: %v", options)
+	}
+
+	// Construct the procfs root. Since procfs files are all virtual, we
+	// never want them cached.
+	return New(ctx, fs.NewNonCachingMountSource(f, flags))
+}
diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go
new file mode 100644
index 000000000..694cde656
--- /dev/null
+++ b/pkg/sentry/fs/proc/loadavg.go
@@ -0,0 +1,51 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile"
+)
+
+// loadavgData backs /proc/loadavg.
+type loadavgData struct{}
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (*loadavgData) NeedsUpdate(generation int64) bool {
+	return true
+}
+
+func (d *loadavgData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	if h != nil {
+		return nil, 0
+	}
+
+	var buf bytes.Buffer
+
+	// TODO: Include real data in fields.
+	// Column 1-3: CPU and IO utilization of the last 1, 5, and 15 minute periods.
+ // Column 4-5: currently running processes and the total number of processes. + // Column 6: the last process ID used. + fmt.Fprintf(&buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) + + return []seqfile.SeqData{ + { + Buf: buf.Bytes(), + Handle: (*loadavgData)(nil), + }, + }, 0 +} diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go new file mode 100644 index 000000000..489f796e5 --- /dev/null +++ b/pkg/sentry/fs/proc/meminfo.go @@ -0,0 +1,82 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// meminfoData backs /proc/meminfo. +type meminfoData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*meminfoData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
+func (d *meminfoData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + mem := d.k.Platform.Memory() + mem.UpdateUsage() + snapshot, totalUsage := usage.MemoryAccounting.Copy() + totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage) + anon := snapshot.Anonymous + snapshot.Tmpfs + file := snapshot.PageCache + snapshot.Mapped + // We don't actually have active/inactive LRUs, so just make up numbers. + activeFile := (file / 2) &^ (usermem.PageSize - 1) + inactiveFile := file - activeFile + + var buf bytes.Buffer + fmt.Fprintf(&buf, "MemTotal: %8d kB\n", totalSize/1024) + memFree := (totalSize - totalUsage) / 1024 + // We use MemFree as MemAvailable because we don't swap. + // TODO: When reclaim is implemented the value of MemAvailable + // should change. + fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree) + fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(&buf, "Buffers: 0 kB\n") // memory usage by block devices + fmt.Fprintf(&buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) + // Emulate a system with no swap, which disables inactivation of anon pages. 
+ fmt.Fprintf(&buf, "SwapCache: 0 kB\n") + fmt.Fprintf(&buf, "Active: %8d kB\n", (anon+activeFile)/1024) + fmt.Fprintf(&buf, "Inactive: %8d kB\n", inactiveFile/1024) + fmt.Fprintf(&buf, "Active(anon): %8d kB\n", anon/1024) + fmt.Fprintf(&buf, "Inactive(anon): 0 kB\n") + fmt.Fprintf(&buf, "Active(file): %8d kB\n", activeFile/1024) + fmt.Fprintf(&buf, "Inactive(file): %8d kB\n", inactiveFile/1024) + fmt.Fprintf(&buf, "Unevictable: 0 kB\n") // TODO + fmt.Fprintf(&buf, "Mlocked: 0 kB\n") // TODO + fmt.Fprintf(&buf, "SwapTotal: 0 kB\n") + fmt.Fprintf(&buf, "SwapFree: 0 kB\n") + fmt.Fprintf(&buf, "Dirty: 0 kB\n") + fmt.Fprintf(&buf, "Writeback: 0 kB\n") + fmt.Fprintf(&buf, "AnonPages: %8d kB\n", anon/1024) + fmt.Fprintf(&buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know + fmt.Fprintf(&buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*meminfoData)(nil)}}, 0 +} diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go new file mode 100644 index 000000000..76092567d --- /dev/null +++ b/pkg/sentry/fs/proc/mounts.go @@ -0,0 +1,176 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "bytes" + "fmt" + "sort" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// forEachMountSource runs f for the process root mount and each mount that is a +// descendant of the root. +func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { + // All mount points must be relative to the rootDir, and mounts outside + // will be excluded. + rootDir := t.FSContext().RootDirectory() + defer rootDir.DecRef() + + if rootDir.Inode == nil { + panic(fmt.Sprintf("root dirent has nil inode: %+v", rootDir)) + } + if rootDir.Inode.MountSource == nil { + panic(fmt.Sprintf("root dirent has nil mount: %+v", rootDir)) + } + + ms := append(rootDir.Inode.MountSource.Submounts(), rootDir.Inode.MountSource) + sort.Slice(ms, func(i, j int) bool { + return ms[i].ID() < ms[j].ID() + }) + for _, m := range ms { + mountPath, desc := m.Root().FullName(rootDir) + if !desc { + // MountSources that are not descendants of the chroot jail are ignored. + continue + } + + fn(mountPath, m) + } +} + +// mountInfoFile is used to implement /proc/[pid]/mountinfo. +type mountInfoFile struct { + t *kernel.Task +} + +// NeedsUpdate implements SeqSource.NeedsUpdate. +func (mif *mountInfoFile) NeedsUpdate(_ int64) bool { + return true +} + +// ReadSeqFileData implements SeqSource.ReadSeqFileData. +func (mif *mountInfoFile) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if handle != nil { + return nil, 0 + } + + var buf bytes.Buffer + forEachMountSource(mif.t, func(mountPath string, m *fs.MountSource) { + // Format: + // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + // (1) MountSource ID. + fmt.Fprintf(&buf, "%d ", m.ID()) + + // (2) Parent ID (or this ID if there is no parent). 
+		pID := m.ID()
+		if p := m.Parent(); p != nil {
+			pID = p.ID()
+		}
+		fmt.Fprintf(&buf, "%d ", pID)
+
+		// (3) Major:Minor device ID. We don't have a superblock, so we
+		// just use the root inode device number.
+		sa := m.Root().Inode.StableAttr
+		fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
+
+		// (4) Root: the pathname of the directory in the filesystem
+		// which forms the root of this mount.
+		//
+		// NOTE: This will always be "/" until we implement
+		// bind mounts.
+		fmt.Fprintf(&buf, "/ ")
+
+		// (5) Mount point (relative to process root).
+		fmt.Fprintf(&buf, "%s ", mountPath)
+
+		// (6) Mount options.
+		opts := "rw"
+		if m.Flags.ReadOnly {
+			opts = "ro"
+		}
+		if m.Flags.NoAtime {
+			opts += ",noatime"
+		}
+		fmt.Fprintf(&buf, "%s ", opts)
+
+		// (7) Optional fields: zero or more fields of the form "tag[:value]".
+		// (8) Separator: the end of the optional fields is marked by a single hyphen.
+		fmt.Fprintf(&buf, "- ")
+
+		// (9) Filesystem type.
+		name := "none"
+		if m.Filesystem != nil {
+			name = m.Filesystem.Name()
+		}
+		fmt.Fprintf(&buf, "%s ", name)
+
+		// (10) Mount source: filesystem-specific information or "none".
+		fmt.Fprintf(&buf, "none ")
+
+		// (11) Superblock options. Only "ro/rw" is supported for now,
+		// and is the same as the filesystem option.
+		fmt.Fprintf(&buf, "%s\n", opts)
+	})
+
+	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountInfoFile)(nil)}}, 0
+}
+
+// mountsFile is used to implement /proc/[pid]/mounts.
+type mountsFile struct {
+	t *kernel.Task
+}
+
+// NeedsUpdate implements SeqSource.NeedsUpdate.
+func (mf *mountsFile) NeedsUpdate(_ int64) bool {
+	return true
+}
+
+// ReadSeqFileData implements SeqSource.ReadSeqFileData.
+func (mf *mountsFile) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	if handle != nil {
+		return nil, 0
+	}
+
+	var buf bytes.Buffer
+	forEachMountSource(mf.t, func(mountPath string, m *fs.MountSource) {
+		// Format (tab-separated):
+		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+		//
+		// We use the filesystem name as the first field, since there
+		// is no real block device we can point to, and we also should
+		// not expose anything about the remote filesystem.
+		//
+		// Only ro/rw option is supported for now.
+		//
+		// The "needs dump" and fsck flags are always 0, which is allowed.
+		opts := "rw"
+		if m.Flags.ReadOnly {
+			opts = "ro"
+		}
+		name := "none"
+		if m.Filesystem != nil {
+			name = m.Filesystem.Name()
+		}
+		fmt.Fprintf(&buf, "%s\t%s\t%s\t%s\t%d\t%d\n", "none", mountPath, name, opts, 0, 0)
+	})
+
+	return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0
+}
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
new file mode 100644
index 000000000..6e464857a
--- /dev/null
+++ b/pkg/sentry/fs/proc/net.go
@@ -0,0 +1,151 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package proc + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" +) + +// newNet creates a new proc net entry. +func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + if s := p.k.NetworkStack(); s != nil && s.SupportsIPv6() { + d.AddChild(ctx, "dev", seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc)) + d.AddChild(ctx, "if_inet6", seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)) + } + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +// ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. +type ifinet6 struct { + s inet.Stack `state:"nosave"` // S/R-FIXME +} + +func (n *ifinet6) contents() []string { + var lines []string + nics := n.s.Interfaces() + for id, naddrs := range n.s.InterfaceAddrs() { + nic, ok := nics[id] + if !ok { + // NIC was added after NICNames was called. We'll just + // ignore it. + continue + } + + for _, a := range naddrs { + // IPv6 only. + if a.Family != linux.AF_INET6 { + continue + } + + // Fields: + // IPv6 address displayed in 32 hexadecimal chars without colons + // Netlink device number (interface index) in hexadecimal (use nic id) + // Prefix length in hexadecimal + // Scope value (use 0) + // Interface flags + // Device name + lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) + } + } + return lines +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*ifinet6) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
+func (n *ifinet6) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var data []seqfile.SeqData + for _, l := range n.contents() { + data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*ifinet6)(nil)}) + } + + return data, 0 +} + +// netDev implements seqfile.SeqSource for /proc/net/dev. +type netDev struct { + s inet.Stack `state:"nosave"` // S/R-FIXME +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (n *netDev) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. See Linux's +// net/core/net-procfs.c:dev_seq_show. +func (n *netDev) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + interfaces := n.s.Interfaces() + contents := make([]string, 2, 2+len(interfaces)) + // Add the table header. From net/core/net-procfs.c:dev_seq_show. + contents[0] = "Inter-| Receive | Transmit\n" + contents[1] = " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n" + + for _, i := range interfaces { + // TODO: Collect stats from each inet.Stack + // implementation (hostinet, epsocket, and rpcinet). + + // Implements the same format as + // net/core/net-procfs.c:dev_seq_printf_stats. 
+ l := fmt.Sprintf("%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + i.Name, + // Received + 0, // bytes + 0, // packets + 0, // errors + 0, // dropped + 0, // fifo + 0, // frame + 0, // compressed + 0, // multicast + // Transmitted + 0, // bytes + 0, // packets + 0, // errors + 0, // dropped + 0, // fifo + 0, // frame + 0, // compressed + 0) // multicast + contents = append(contents, l) + } + + var data []seqfile.SeqData + for _, l := range contents { + data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*ifinet6)(nil)}) + } + + return data, 0 +} diff --git a/pkg/sentry/fs/proc/net_test.go b/pkg/sentry/fs/proc/net_test.go new file mode 100644 index 000000000..a31a20494 --- /dev/null +++ b/pkg/sentry/fs/proc/net_test.go @@ -0,0 +1,74 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" +) + +func newIPv6TestStack() *inet.TestStack { + s := inet.NewTestStack() + s.SupportsIPv6Flag = true + return s +} + +func TestIfinet6NoAddresses(t *testing.T) { + n := &ifinet6{s: newIPv6TestStack()} + if got := n.contents(); got != nil { + t.Errorf("Got n.contents() = %v, want = %v", got, nil) + } +} + +func TestIfinet6(t *testing.T) { + s := newIPv6TestStack() + s.InterfacesMap[1] = inet.Interface{Name: "eth0"} + s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), + }, + } + s.InterfacesMap[2] = inet.Interface{Name: "eth1"} + s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), + }, + } + want := map[string]struct{}{ + "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {}, + "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, + } + + n := &ifinet6{s: s} + contents := n.contents() + if len(contents) != len(want) { + t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) + } + got := map[string]struct{}{} + for _, l := range contents { + got[l] = struct{}{} + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("Got n.contents() = %v, want = %v", got, want) + } +} diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go new file mode 100644 index 000000000..459eb7e62 --- /dev/null +++ b/pkg/sentry/fs/proc/proc.go @@ -0,0 +1,182 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package proc implements a partial in-memory file system for procfs.
+package proc
+
+import (
+	"fmt"
+	"sort"
+	"strconv"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+)
+
+// proc is a root proc node.
+type proc struct {
+	ramfs.Dir
+
+	// k is the Kernel containing this proc node.
+	k *kernel.Kernel
+
+	// pidns is the PID namespace of the task that mounted the proc filesystem
+	// that this node represents.
+	pidns *kernel.PIDNamespace
+}
+
+// New returns the root node of a partial simple procfs.
+func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) {
+	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		return nil, fmt.Errorf("procfs requires a kernel")
+	}
+	pidns := kernel.PIDNamespaceFromContext(ctx)
+	if pidns == nil {
+		return nil, fmt.Errorf("procfs requires a PID namespace")
+	}
+
+	p := &proc{k: k, pidns: pidns}
+	p.InitDir(ctx, map[string]*fs.Inode{
+		// Note that these are just the static members. There are
+		// dynamic members populated in Readdir and Lookup below.
+ "filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), + "loadavg": seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc), + "meminfo": seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc), + "mounts": newMountsSymlink(ctx, msrc), + "stat": seqfile.NewSeqFileInode(ctx, &statData{k}, msrc), + "version": seqfile.NewSeqFileInode(ctx, &versionData{k}, msrc), + }, fs.RootOwner, fs.FilePermsFromMode(0555)) + + p.AddChild(ctx, "cpuinfo", p.newCPUInfo(ctx, msrc)) + p.AddChild(ctx, "uptime", p.newUptime(ctx, msrc)) + + return newFile(p, msrc, fs.SpecialDirectory, nil), nil +} + +// self is a magical link. +type self struct { + ramfs.Symlink + + pidns *kernel.PIDNamespace +} + +// newSelf returns a new "self" node. +func (p *proc) newSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + s := &self{pidns: p.pidns} + s.InitSymlink(ctx, fs.RootOwner, "") + return newFile(s, msrc, fs.Symlink, nil) +} + +// Readlink implements fs.InodeOperations.Readlink. +func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if t := kernel.TaskFromContext(ctx); t != nil { + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + if tgid == 0 { + return "", ramfs.ErrNotFound + } + return strconv.FormatUint(uint64(tgid), 10), nil + } + + // Who is reading this link? + return "", ramfs.ErrInvalidOp +} + +// Lookup loads an Inode at name into a Dirent. +func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + // Is it one of the static ones? + dirent, walkErr := p.Dir.Lookup(ctx, dir, name) + if walkErr == nil { + return dirent, nil + } + + // Is it a dynamic element? 
+ nfs := map[string]func() *fs.Inode{ + "net": func() *fs.Inode { return p.newNetDir(ctx, dir.MountSource) }, + "self": func() *fs.Inode { return p.newSelf(ctx, dir.MountSource) }, + "sys": func() *fs.Inode { return p.newSysDir(ctx, dir.MountSource) }, + } + if nf, ok := nfs[name]; ok { + return fs.NewDirent(nf(), name), nil + } + + // Try to lookup a corresponding task. + tid, err := strconv.ParseUint(name, 10, 64) + if err != nil { + // Ignore the parse error and return the original. + return nil, walkErr + } + + // Grab the other task. + otherTask := p.pidns.TaskWithID(kernel.ThreadID(tid)) + if otherTask == nil { + // Per above. + return nil, walkErr + } + + // Wrap it in a taskDir. + td := newTaskDir(otherTask, dir.MountSource, p.pidns, true) + return fs.NewDirent(td, name), nil +} + +// Readdir synthesizes proc contents. +func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + // Serialize normal contents. + _, err := p.Dir.DeprecatedReaddir(ctx, dirCtx, offset) + if err != nil { + return offset, err + } + + m := make(map[string]fs.DentAttr) + var names []string + + // Add special files. + m["sys"] = fs.GenericDentAttr(fs.SpecialFile, device.ProcDevice) + names = append(names, "sys") + + // Collect tasks. + // Per linux we only include it in directory listings if it's the leader. + // But for whatever crazy reason, you can still walk to the given node. 
+ for _, tg := range p.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + name := strconv.FormatUint(uint64(tg.ID()), 10) + m[name] = fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) + names = append(names, name) + } + } + + if offset >= len(m) { + return offset, nil + } + sort.Strings(names) + names = names[offset:] + for _, name := range names { + if err := dirCtx.DirEmit(name, m[name]); err != nil { + return offset, err + } + offset++ + } + return offset, err +} + +// newMountsSymlink returns a symlink to "self/mounts" +func newMountsSymlink(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + s := &ramfs.Symlink{} + s.InitSymlink(ctx, fs.RootOwner, "self/mounts") + return newFile(s, msrc, fs.Symlink, nil) +} diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD new file mode 100644 index 000000000..48dd25e5b --- /dev/null +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -0,0 +1,55 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "seqfile_state", + srcs = [ + "seqfile.go", + ], + out = "seqfile_state.go", + package = "seqfile", +) + +go_library( + name = "seqfile", + srcs = [ + "seqfile.go", + "seqfile_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/proc/device", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + ], +) + +go_stateify( + name = "seqfile_test_state", + srcs = ["seqfile_test.go"], + out = "seqfile_test_state.go", + package = "seqfile", +) + +go_test( + name = "seqfile_test", + size = "small", + srcs = [ + "seqfile_test.go", + "seqfile_test_state.go", + ], + embed = [":seqfile"], + deps = [ + "//pkg/sentry/context/contexttest", + 
"//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go new file mode 100644 index 000000000..e37a85869 --- /dev/null +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -0,0 +1,232 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seqfile + +import ( + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// SeqHandle is a helper handle to seek in the file. +type SeqHandle interface{} + +// SeqData holds the data for one unit in the file. +type SeqData struct { + // The data to be returned to the user. + Buf []byte + + // A seek handle used to find the next valid unit in ReadSeqFiledata. + Handle SeqHandle +} + +// SeqSource is a data source for a SeqFile file. +type SeqSource interface { + // NeedsUpdate returns true if the consumer of SeqData should call + // ReadSeqFileData again. Generation is the generation returned by + // ReadSeqFile or 0. + NeedsUpdate(generation int64) bool + + // Returns a slice of SeqData ordered by unit and the current + // generation. 
The first entry in the slice is greater than the handle. + // If handle is nil then all known records are returned. Generation + // must always be greater than 0. + ReadSeqFileData(handle SeqHandle) ([]SeqData, int64) +} + +// SeqGenerationCounter is a counter to keep track if the SeqSource should be +// updated. SeqGenerationCounter is not thread-safe and should be protected +// with a mutex. +type SeqGenerationCounter struct { + // The generation that the SeqData is at. + generation int64 +} + +// SetGeneration sets the generation to the new value, be careful to not set it +// to a value less than current. +func (s *SeqGenerationCounter) SetGeneration(generation int64) { + s.generation = generation +} + +// Update increments the current generation. +func (s *SeqGenerationCounter) Update() { + s.generation++ +} + +// Generation returns the current generation counter. +func (s *SeqGenerationCounter) Generation() int64 { + return s.generation +} + +// IsCurrent returns whether the given generation is current or not. +func (s *SeqGenerationCounter) IsCurrent(generation int64) bool { + return s.Generation() == generation +} + +// SeqFile is used to provide dynamic files that can be ordered by record. +type SeqFile struct { + ramfs.Entry + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + SeqSource + + source []SeqData + generation int64 + lastRead int64 +} + +// NewSeqFile returns a seqfile suitable for use by external consumers. +func NewSeqFile(ctx context.Context, source SeqSource) *SeqFile { + s := &SeqFile{SeqSource: source} + s.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + return s +} + +// NewSeqFileInode returns an Inode with SeqFile InodeOperations. 
+func NewSeqFileInode(ctx context.Context, source SeqSource, msrc *fs.MountSource) *fs.Inode { + iops := NewSeqFile(ctx, source) + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(iops, msrc, sattr) +} + +// UnstableAttr returns unstable attributes of the SeqFile. +func (s *SeqFile) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, _ := s.Entry.UnstableAttr(ctx, inode) + uattr.ModificationTime = ktime.NowFromContext(ctx) + return uattr, nil +} + +// findIndexAndOffset finds the unit that corresponds to a certain offset. +// Returns the unit and the offset within the unit. If there are not enough +// units len(data) and leftover offset is returned. +func findIndexAndOffset(data []SeqData, offset int64) (int, int64) { + for i, buf := range data { + l := int64(len(buf.Buf)) + if offset < l { + return i, offset + } + offset -= l + } + return len(data), offset +} + +// DeprecatedPreadv reads from the file at the given offset. +func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + + s.Entry.NotifyAccess(ctx) + defer func() { s.lastRead = offset }() + + updated := false + + // Try to find where we should start reading this file. + i, recordOffset := findIndexAndOffset(s.source, offset) + if i == len(s.source) { + // Ok, we're at EOF. Let's first check to see if there might be + // more data available to us. If there is more data, add it to + // the end and try reading again. + if !s.SeqSource.NeedsUpdate(s.generation) { + return 0, io.EOF + } + oldLen := len(s.source) + s.updateSourceLocked(len(s.source)) + updated = true + // We know that we had consumed everything up until this point + // so we search in the new slice instead of starting over. 
+ i, recordOffset = findIndexAndOffset(s.source[oldLen:], recordOffset) + i += oldLen + // i is at most the length of the slice which is + // len(s.source) - oldLen. So at most i will be equal to + // len(s.source). + if i == len(s.source) { + return 0, io.EOF + } + } + + var done int64 + // We're reading parts of a record, finish reading the current object + // before continuing on to the next. We don't refresh our data source + // before this record is completed. + if recordOffset != 0 { + n, err := dst.CopyOut(ctx, s.source[i].Buf[recordOffset:]) + done += int64(n) + dst = dst.DropFirst(n) + if dst.NumBytes() == 0 || err != nil { + return done, err + } + i++ + } + + // Next/New unit, update the source file if necessary. Make an extra + // check to see if we've seeked backwards and if so always update our + // data source. + if !updated && (s.SeqSource.NeedsUpdate(s.generation) || s.lastRead > offset) { + s.updateSourceLocked(i) + // recordOffset is 0 here and we won't update records behind the + // current one so recordOffset is still 0 even though source + // just got updated. Just read the next record. + } + + // Finish by reading all the available data. + for _, buf := range s.source[i:] { + n, err := dst.CopyOut(ctx, buf.Buf) + done += int64(n) + dst = dst.DropFirst(n) + if dst.NumBytes() == 0 || err != nil { + return done, err + } + } + + // If the file shrank (entries not yet read were removed above) + // while we tried to read we can end up with nothing read. + if done == 0 && dst.NumBytes() != 0 { + return 0, io.EOF + } + return done, nil +} + +// updateSourceLocked requires that s.mu is held. +func (s *SeqFile) updateSourceLocked(record int) { + var h SeqHandle + if record == 0 { + h = nil + } else { + h = s.source[record-1].Handle + } + // Save what we have previously read. + s.source = s.source[:record] + var newSource []SeqData + newSource, s.generation = s.SeqSource.ReadSeqFileData(h) + s.source = append(s.source, newSource...) 
+} + +// DeprecatedPwritev is always denied. +func (*SeqFile) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ramfs.ErrDenied +} diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go new file mode 100644 index 000000000..0bf39ad82 --- /dev/null +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -0,0 +1,272 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seqfile + +import ( + "bytes" + "fmt" + "io" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type seqTest struct { + actual []SeqData + update bool +} + +func (s *seqTest) Init() { + var sq []SeqData + // Create some SeqData. + for i := 0; i < 10; i++ { + var b []byte + for j := 0; j < 10; j++ { + b = append(b, byte(i)) + } + sq = append(sq, SeqData{ + Buf: b, + Handle: &testHandle{i: i}, + }) + } + s.actual = sq +} + +// NeedsUpdate reports whether we need to update the data we've previously read. +func (s *seqTest) NeedsUpdate(int64) bool { + return s.update +} + +// ReadSeqFiledata returns a slice of SeqData which contains elements +// greater than the handle. 
+func (s *seqTest) ReadSeqFileData(handle SeqHandle) ([]SeqData, int64) { + if handle == nil { + return s.actual, 0 + } + h := *handle.(*testHandle) + var ret []SeqData + for _, b := range s.actual { + // We want the next one. + h2 := *b.Handle.(*testHandle) + if h2.i > h.i { + ret = append(ret, b) + } + } + return ret, 0 +} + +// Flatten a slice of slices into one slice. +func flatten(buf ...[]byte) []byte { + var flat []byte + for _, b := range buf { + flat = append(flat, b...) + } + return flat +} + +type testHandle struct { + i int +} + +type testTable struct { + offset int64 + readBufferSize int + expectedData []byte + expectedError error +} + +func runTableTests(ctx context.Context, table []testTable, n fs.InodeOperations) error { + for _, tt := range table { + data := make([]byte, tt.readBufferSize) + resultLen, err := n.DeprecatedPreadv(ctx, usermem.BytesIOSequence(data), tt.offset) + if err != tt.expectedError { + return fmt.Errorf("t.Preadv(len: %v, offset: %v) (error) => %v expected %v", tt.readBufferSize, tt.offset, err, tt.expectedError) + } + expectedLen := int64(len(tt.expectedData)) + if resultLen != expectedLen { + // We make this just an error so we wall through and print the data below. + return fmt.Errorf("t.Preadv(len: %v, offset: %v) (size) => %v expected %v", tt.readBufferSize, tt.offset, resultLen, expectedLen) + } + if !bytes.Equal(data[:expectedLen], tt.expectedData) { + return fmt.Errorf("t.Preadv(len: %v, offset: %v) (data) => %v expected %v", tt.readBufferSize, tt.offset, data[:expectedLen], tt.expectedData) + } + } + return nil +} + +func TestSeqFile(t *testing.T) { + testSource := &seqTest{} + testSource.Init() + + // Create a file that can be R/W. + m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + ctx := contexttest.Context(t) + contents := map[string]*fs.Inode{ + "foo": NewSeqFileInode(ctx, testSource, m), + } + root := ramfstest.NewDir(ctx, contents, fs.FilePermsFromMode(0777)) + + // How about opening it? 
+ inode := fs.NewInode(root, m, fs.StableAttr{Type: fs.Directory}) + dirent2, err := root.Lookup(ctx, inode, "foo") + if err != nil { + t.Fatalf("failed to walk to foo for n2: %v", err) + } + n2 := dirent2.Inode.InodeOperations + + // Writing? + if _, err := n2.DeprecatedPwritev(nil, usermem.BytesIOSequence([]byte("test")), 0); err == nil { + t.Fatalf("managed to write to n2: %v", err) + } + + // How about reading? + dirent3, err := root.Lookup(ctx, inode, "foo") + if err != nil { + t.Fatalf("failed to walk to foo: %v", err) + } + n3 := dirent3.Inode.InodeOperations + + if n2 != n3 { + t.Error("got n2 != n3, want same") + } + + testSource.update = true + + table := []testTable{ + // Read past the end. + {100, 4, []byte{}, io.EOF}, + {110, 4, []byte{}, io.EOF}, + {200, 4, []byte{}, io.EOF}, + // Read a truncated first line. + {0, 4, testSource.actual[0].Buf[:4], nil}, + // Read the whole first line. + {0, 10, testSource.actual[0].Buf, nil}, + // Read the whole first line + 5 bytes of second line. + {0, 15, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf[:5]), nil}, + // First 4 bytes of the second line. + {10, 4, testSource.actual[1].Buf[:4], nil}, + // Read the two first lines. + {0, 20, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf), nil}, + // Read three lines. + {0, 30, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf, testSource.actual[2].Buf), nil}, + // Read everything, but use a bigger buffer than necessary. + {0, 150, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf, testSource.actual[2].Buf, testSource.actual[3].Buf, testSource.actual[4].Buf, testSource.actual[5].Buf, testSource.actual[6].Buf, testSource.actual[7].Buf, testSource.actual[8].Buf, testSource.actual[9].Buf), nil}, + // Read the last 3 bytes. 
+ {97, 10, testSource.actual[9].Buf[7:], nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed with testSource.update = %v : %v", testSource.update, err) + } + + // Disable updates and do it again. + testSource.update = false + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed with testSource.update = %v: %v", testSource.update, err) + } +} + +// Test that we behave correctly when the file is updated. +func TestSeqFileFileUpdated(t *testing.T) { + testSource := &seqTest{} + testSource.Init() + testSource.update = true + + // Create a file that can be R/W. + m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + ctx := contexttest.Context(t) + contents := map[string]*fs.Inode{ + "foo": NewSeqFileInode(ctx, testSource, m), + } + root := ramfstest.NewDir(ctx, contents, fs.FilePermsFromMode(0777)) + + // How about opening it? + inode := fs.NewInode(root, m, fs.StableAttr{Type: fs.Directory}) + dirent2, err := root.Lookup(ctx, inode, "foo") + if err != nil { + t.Fatalf("failed to walk to foo for n2: %v", err) + } + n2 := dirent2.Inode.InodeOperations + + table := []testTable{ + {0, 16, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf[:6]), nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed: %v", err) + } + // Delete the first entry. + cut := testSource.actual[0].Buf + testSource.actual = testSource.actual[1:] + + table = []testTable{ + // Try reading buffer 0 with an offset. This will not delete the old data. + {1, 5, cut[1:6], nil}, + // Reset our file by reading at offset 0. + {0, 10, testSource.actual[0].Buf, nil}, + {16, 14, flatten(testSource.actual[1].Buf[6:], testSource.actual[2].Buf), nil}, + // Read the same data a second time. + {16, 14, flatten(testSource.actual[1].Buf[6:], testSource.actual[2].Buf), nil}, + // Read the following two lines. 
+ {30, 20, flatten(testSource.actual[3].Buf, testSource.actual[4].Buf), nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after removing first entry: %v", err) + } + + // Add a new duplicate line in the middle (6666...) + after := testSource.actual[5:] + testSource.actual = testSource.actual[:4] + // Note the list must be sorted. + testSource.actual = append(testSource.actual, after[0]) + testSource.actual = append(testSource.actual, after...) + + table = []testTable{ + {50, 20, flatten(testSource.actual[4].Buf, testSource.actual[5].Buf), nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after adding middle entry: %v", err) + } + // This will be used in a later test. + oldTestData := testSource.actual + + // Delete everything. + testSource.actual = testSource.actual[:0] + table = []testTable{ + {20, 20, []byte{}, io.EOF}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after removing all entries: %v", err) + } + // Restore some of the data. + testSource.actual = oldTestData[:1] + table = []testTable{ + {6, 20, testSource.actual[0].Buf[6:], nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after adding first entry back: %v", err) + } + + // Re-extend the data + testSource.actual = oldTestData + table = []testTable{ + {30, 20, flatten(testSource.actual[3].Buf, testSource.actual[4].Buf), nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after extending testSource: %v", err) + } +} diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go new file mode 100644 index 000000000..dee836a05 --- /dev/null +++ b/pkg/sentry/fs/proc/stat.go @@ -0,0 +1,139 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// statData backs /proc/stat. +type statData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*statData) NeedsUpdate(generation int64) bool { + return true +} + +// cpuStats contains the breakdown of CPU time for /proc/stat. +type cpuStats struct { + // user is time spent in userspace tasks with non-positive niceness. + user uint64 + + // nice is time spent in userspace tasks with positive niceness. + nice uint64 + + // system is time spent in non-interrupt kernel context. + system uint64 + + // idle is time spent idle. + idle uint64 + + // ioWait is time spent waiting for IO. + ioWait uint64 + + // irq is time spent in interrupt context. + irq uint64 + + // softirq is time spent in software interrupt context. + softirq uint64 + + // steal is involuntary wait time. + steal uint64 + + // guest is time spent in guests with non-positive niceness. + guest uint64 + + // guestNice is time spent in guests with positive niceness. + guestNice uint64 +} + +// String implements fmt.Stringer. +func (c cpuStats) String() string { + return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
+func (s *statData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + + // TODO: We currently export only zero CPU stats. We could + // at least provide some aggregate stats. + var cpu cpuStats + fmt.Fprintf(&buf, "cpu %s\n", cpu) + + for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { + fmt.Fprintf(&buf, "cpu%d %s\n", c, cpu) + } + + // The total number of interrupts is dependent on the CPUs and PCI + // devices on the system. See arch_probe_nr_irqs. + // + // Since we don't report real interrupt stats, just choose an arbitrary + // value from a representative VM. + const numInterrupts = 256 + + // The Kernel doesn't handle real interrupts, so report all zeroes. + // TODO: We could count page faults as #PF. + fmt.Fprintf(&buf, "intr 0") // total + for i := 0; i < numInterrupts; i++ { + fmt.Fprintf(&buf, " 0") + } + fmt.Fprintf(&buf, "\n") + + // Total number of context switches. + // TODO: Count this. + fmt.Fprintf(&buf, "ctxt 0\n") + + // CLOCK_REALTIME timestamp from boot, in seconds. + fmt.Fprintf(&buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) + + // Total number of clones. + // TODO: Count this. + fmt.Fprintf(&buf, "processes 0\n") + + // Number of runnable tasks. + // TODO: Count this. + fmt.Fprintf(&buf, "procs_running 0\n") + + // Number of tasks waiting on IO. + // TODO: Count this. + fmt.Fprintf(&buf, "procs_blocked 0\n") + + // Number of each softirq handled. + fmt.Fprintf(&buf, "softirq 0") // total + for i := 0; i < linux.NumSoftIRQ; i++ { + fmt.Fprintf(&buf, " 0") + } + fmt.Fprintf(&buf, "\n") + + return []seqfile.SeqData{ + { + Buf: buf.Bytes(), + Handle: (*statData)(nil), + }, + }, 0 +} diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go new file mode 100644 index 000000000..4323f3650 --- /dev/null +++ b/pkg/sentry/fs/proc/sys.go @@ -0,0 +1,117 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// hostname is a file containing the system hostname. +type hostname struct { + ramfs.Entry +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (hostname) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + utsns := kernel.UTSNamespaceFromContext(ctx) + contents := []byte(utsns.HostName() + "\n") + + if offset >= int64(len(contents)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, contents[offset:]) + return int64(n), err +} + +func (p *proc) newHostname(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + h := &hostname{} + h.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(h, msrc, fs.SpecialFile, nil) +} + +// mmapMinAddrData backs /proc/sys/vm/mmap_min_addr. +type mmapMinAddrData struct { + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*mmapMinAddrData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
+func (d *mmapMinAddrData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + return []seqfile.SeqData{ + { + Buf: []byte(fmt.Sprintf("%d\n", d.k.Platform.MinUserAddress())), + Handle: (*mmapMinAddrData)(nil), + }, + }, 0 +} + +type overcommitMemory struct{} + +func (*overcommitMemory) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource. +func (*overcommitMemory) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + return []seqfile.SeqData{ + { + Buf: []byte("0\n"), + Handle: (*overcommitMemory)(nil), + }, + }, 0 +} + +func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + d.AddChild(ctx, "hostname", p.newHostname(ctx, msrc)) + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +func (p *proc) newVMDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + d.AddChild(ctx, "mmap_min_addr", seqfile.NewSeqFileInode(ctx, &mmapMinAddrData{p.k}, msrc)) + d.AddChild(ctx, "overcommit_memory", seqfile.NewSeqFileInode(ctx, &overcommitMemory{}, msrc)) + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +func (p *proc) newSysDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + d.AddChild(ctx, "kernel", p.newKernelDir(ctx, msrc)) + d.AddChild(ctx, "vm", p.newVMDir(ctx, msrc)) + d.AddChild(ctx, "net", p.newSysNetDir(ctx, msrc)) + return newFile(d, msrc, fs.SpecialDirectory, nil) +} diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go new file mode 100644 index 000000000..db44c95cb --- /dev/null +++ b/pkg/sentry/fs/proc/sys_net.go @@ -0,0 +1,188 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type tcpMemDir int + +const ( + tcpRMem tcpMemDir = iota + tcpWMem +) + +type tcpMem struct { + ramfs.Entry + s inet.Stack + size inet.TCPBufferSize + dir tcpMemDir +} + +func newTCPMem(s inet.Stack, size inet.TCPBufferSize, dir tcpMemDir) *tcpMem { + return &tcpMem{s: s, size: size, dir: dir} +} + +func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, size inet.TCPBufferSize, dir tcpMemDir) *fs.Inode { + tm := newTCPMem(s, size, dir) + tm.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0644)) + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(tm, msrc, sattr) +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. 
+func (m *tcpMem) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + s := fmt.Sprintf("%d\t%d\t%d\n", m.size.Min, m.size.Default, m.size.Max) + n, err := dst.CopyOut(ctx, []byte(s)) + return int64(n), err +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*tcpMem) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + src = src.TakeFirst(usermem.PageSize - 1) + + buf := []int32{int32(m.size.Min), int32(m.size.Default), int32(m.size.Max)} + n, cperr := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) + size := inet.TCPBufferSize{ + Min: int(buf[0]), + Default: int(buf[1]), + Max: int(buf[2]), + } + var err error + switch m.dir { + case tcpRMem: + err = m.s.SetTCPReceiveBufferSize(size) + case tcpWMem: + err = m.s.SetTCPSendBufferSize(size) + default: + panic(fmt.Sprintf("unknown tcpMem.dir: %v", m.dir)) + } + if err != nil { + return n, err + } + return n, cperr +} + +type tcpSack struct { + ramfs.Entry + s inet.Stack `state:"nosave"` // S/R-FIXME +} + +func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + ts := &tcpSack{s: s} + ts.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0644)) + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(ts, msrc, sattr) +} + +func (s *tcpSack) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + + sack, err := s.s.TCPSACKEnabled() + if err != nil { + return 0, err + } + + val := "0\n" + if sack { + // Technically, 
this is not quite compatible with Linux. Linux + // stores these as an integer, so if you write "2" into + // tcp_sack, you should get 2 back. Tough luck. + val = "1\n" + } + n, err := dst.CopyOut(ctx, []byte(val)) + return int64(n), err +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (s *tcpSack) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return n, err + } + return n, s.s.SetTCPSACKEnabled(v != 0) +} + +func newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + + // Add tcp_rmem. + if rs, err := s.TCPReceiveBufferSize(); err == nil { + d.AddChild(ctx, "tcp_rmem", newTCPMemInode(ctx, msrc, s, rs, tcpRMem)) + } + + // Add tcp_wmem. + if ss, err := s.TCPSendBufferSize(); err == nil { + d.AddChild(ctx, "tcp_wmem", newTCPMemInode(ctx, msrc, s, ss, tcpWMem)) + } + + // Add tcp_sack. 
+ d.AddChild(ctx, "tcp_sack", newTCPSackInode(ctx, msrc, s)) + + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + if s := p.k.NetworkStack(); s != nil { + d.AddChild(ctx, "ipv4", newSysNetIPv4Dir(ctx, msrc, s)) + } + return newFile(d, msrc, fs.SpecialDirectory, nil) +} diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go new file mode 100644 index 000000000..7ba392346 --- /dev/null +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -0,0 +1,121 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestQuerySendBufferSize(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + s.TCPSendBufSize = inet.TCPBufferSize{100, 200, 300} + tm := newTCPMem(s, s.TCPSendBufSize, tcpWMem) + + buf := make([]byte, 100) + dst := usermem.BytesIOSequence(buf) + n, err := tm.DeprecatedPreadv(ctx, dst, 0) + if err != nil { + t.Fatalf("DeprecatedPreadv failed: %v", err) + } + + if got, want := string(buf[:n]), "100\t200\t300\n"; got != want { + t.Fatalf("Bad string: got %v, want %v", got, want) + } +} + +func TestQueryRecvBufferSize(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + s.TCPRecvBufSize = inet.TCPBufferSize{100, 200, 300} + tm := newTCPMem(s, s.TCPRecvBufSize, tcpRMem) + + buf := make([]byte, 100) + dst := usermem.BytesIOSequence(buf) + n, err := tm.DeprecatedPreadv(ctx, dst, 0) + if err != nil { + t.Fatalf("DeprecatedPreadv failed: %v", err) + } + + if got, want := string(buf[:n]), "100\t200\t300\n"; got != want { + t.Fatalf("Bad string: got %v, want %v", got, want) + } +} + +var cases = []struct { + str string + initial inet.TCPBufferSize + final inet.TCPBufferSize +}{ + { + str: "", + initial: inet.TCPBufferSize{1, 2, 3}, + final: inet.TCPBufferSize{1, 2, 3}, + }, + { + str: "100\n", + initial: inet.TCPBufferSize{1, 100, 200}, + final: inet.TCPBufferSize{100, 100, 200}, + }, + { + str: "100 200 300\n", + initial: inet.TCPBufferSize{1, 2, 3}, + final: inet.TCPBufferSize{100, 200, 300}, + }, +} + +func TestConfigureSendBufferSize(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + for _, c := range cases { + s.TCPSendBufSize = c.initial + tm := newTCPMem(s, c.initial, tcpWMem) + + // Write the values. 
+ src := usermem.BytesIOSequence([]byte(c.str)) + if n, err := tm.DeprecatedPwritev(ctx, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("DeprecatedPwritev, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) + } + + // Read the values from the stack and check them. + if s.TCPSendBufSize != c.final { + t.Errorf("TCPSendBufferSize, case = %q: got %v, wanted %v", c.str, s.TCPSendBufSize, c.final) + } + } +} + +func TestConfigureRecvBufferSize(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + for _, c := range cases { + s.TCPRecvBufSize = c.initial + tm := newTCPMem(s, c.initial, tcpRMem) + + // Write the values. + src := usermem.BytesIOSequence([]byte(c.str)) + if n, err := tm.DeprecatedPwritev(ctx, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("DeprecatedPwritev, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) + } + + // Read the values from the stack and check them. + if s.TCPRecvBufSize != c.final { + t.Errorf("TCPRecvBufferSize, case = %q: got %v, wanted %v", c.str, s.TCPRecvBufSize, c.final) + } + } +} diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go new file mode 100644 index 000000000..3e9a1e50e --- /dev/null +++ b/pkg/sentry/fs/proc/task.go @@ -0,0 +1,567 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "bytes" + "fmt" + "io" + "sort" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// getTaskMM returns t's MemoryManager. If getTaskMM succeeds, the MemoryManager's +// users count is incremented, and must be decremented by the caller when it is +// no longer in use. +func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { + if t.ExitState() == kernel.TaskExitDead { + return nil, syserror.ESRCH + } + var m *mm.MemoryManager + t.WithMuLocked(func(t *kernel.Task) { + m = t.MemoryManager() + }) + if m == nil || !m.IncUsers() { + return nil, io.EOF + } + return m, nil +} + +// taskDir represents a task-level directory. +type taskDir struct { + ramfs.Dir + + // t is the associated kernel task that owns this file. + t *kernel.Task +} + +// newTaskDir creates a new proc task entry. +func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace, showSubtasks bool) *fs.Inode { + d := &taskDir{t: t} + // TODO: Set EUID/EGID based on dumpability. + d.InitDir(t, map[string]*fs.Inode{ + "auxv": newAuxvec(t, msrc), + "cmdline": newExecArgFile(t, msrc, cmdlineExecArg), + "comm": newComm(t, msrc), + "environ": newExecArgFile(t, msrc, environExecArg), + "exe": newExe(t, msrc), + "fd": newFdDir(t, msrc), + "fdinfo": newFdInfoDir(t, msrc), + "gid_map": newGIDMap(t, msrc), + // TODO: This is incorrect for /proc/[pid]/task/[tid]/io, i.e. 
if + // showSubtasks is false: + // http://lxr.free-electrons.com/source/fs/proc/base.c?v=3.11#L2980 + "io": newIO(t, msrc), + "maps": newMaps(t, msrc), + "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "ns": newNamespaceDir(t, msrc), + "stat": newTaskStat(t, msrc, showSubtasks, pidns), + "status": newStatus(t, msrc, pidns), + "uid_map": newUIDMap(t, msrc), + }, fs.RootOwner, fs.FilePermsFromMode(0555)) + if showSubtasks { + d.AddChild(t, "task", newSubtasks(t, msrc, pidns)) + } + return newFile(d, msrc, fs.SpecialDirectory, t) +} + +// subtasks represents a /proc/TID/task directory. +type subtasks struct { + ramfs.Dir + + t *kernel.Task + + pidns *kernel.PIDNamespace +} + +func newSubtasks(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { + s := &subtasks{t: t, pidns: pidns} + s.InitDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newFile(s, msrc, fs.SpecialDirectory, t) +} + +// UnstableAttr returns unstable attributes of the subtasks. +func (s *subtasks) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, err := s.Dir.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } + // We can't rely on ramfs' implementation because the task directories are + // generated dynamically. + uattr.Links = uint64(2 + s.t.ThreadGroup().Count()) + return uattr, nil +} + +// Lookup loads an Inode in a task's subtask directory into a Dirent. 
+func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + tid, err := strconv.ParseUint(p, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + + task := s.pidns.TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return nil, syserror.ENOENT + } + if task.ThreadGroup() != s.t.ThreadGroup() { + return nil, syserror.ENOENT + } + + td := newTaskDir(task, dir.MountSource, s.pidns, false) + return fs.NewDirent(td, p), nil +} + +// DeprecatedReaddir lists a task's subtask directory. +func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + tasks := s.t.ThreadGroup().MemberIDs(s.pidns) + taskInts := make([]int, 0, len(tasks)) + for _, tid := range tasks { + taskInts = append(taskInts, int(tid)) + } + + // Find the task to start at. + idx := sort.SearchInts(taskInts, offset) + if idx == len(taskInts) { + return offset, nil + } + taskInts = taskInts[idx:] + + var tid int + for _, tid = range taskInts { + name := strconv.FormatUint(uint64(tid), 10) + attr := fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) + if err := dirCtx.DirEmit(name, attr); err != nil { + // Returned offset is next tid to serialize. + return tid, err + } + } + // We serialized them all. Next offset should be higher than last + // serialized tid. + return tid + 1, nil +} + +// exe is an fs.InodeOperations symlink for the /proc/PID/exe file. +type exe struct { + ramfs.Symlink + + t *kernel.Task +} + +func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + exeSymlink := &exe{t: t} + exeSymlink.InitSymlink(t, fs.RootOwner, "") + return newFile(exeSymlink, msrc, fs.Symlink, t) +} + +func (e *exe) executable() (d *fs.Dirent, err error) { + e.t.WithMuLocked(func(t *kernel.Task) { + mm := t.MemoryManager() + if mm == nil { + // TODO: Check shouldn't allow Readlink once the + // Task is zombied. 
+			err = syserror.EACCES
+			return
+		}
+
+		// The MemoryManager may be destroyed, in which case
+		// MemoryManager.destroy will simply set the executable to nil
+		// (with locks held).
+		d = mm.Executable()
+		if d == nil {
+			err = syserror.ENOENT
+		}
+	})
+	return
+}
+
+// Readlink implements fs.InodeOperations.
+func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
+	if !kernel.ContextCanTrace(ctx, e.t, false) {
+		return "", syserror.EACCES
+	}
+
+	// Pull out the executable for /proc/TID/exe.
+	exec, err := e.executable()
+	if err != nil {
+		return "", err
+	}
+	defer exec.DecRef()
+
+	root := fs.RootFromContext(ctx)
+	if root == nil {
+		// This doesn't correspond to anything in Linux because the vfs is
+		// global there.
+		return "", syserror.EINVAL
+	}
+	defer root.DecRef()
+	n, _ := exec.FullName(root)
+	return n, nil
+}
+
+// namespaceFile represents a file in the namespacefs, such as the files in
+// /proc/[pid]/ns.
+type namespaceFile struct {
+	ramfs.Symlink
+
+	t *kernel.Task
+}
+
+func newNamespaceFile(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode {
+	n := &namespaceFile{t: t}
+	n.InitSymlink(t, fs.RootOwner, "")
+
+	// TODO: Namespace symlinks should contain the namespace name and the
+	// inode number for the namespace instance, so for example user:[123456]. We
+	// currently fake the inode number by sticking the symlink inode in its
+	// place.
+	n.Target = fmt.Sprintf("%s:[%d]", name, device.ProcDevice.NextIno())
+
+	return newFile(n, msrc, fs.Symlink, t)
+}
+
+// Getlink implements fs.InodeOperations.Getlink.
+func (n *namespaceFile) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) {
+	if !kernel.ContextCanTrace(ctx, n.t, false) {
+		return nil, syserror.EACCES
+	}
+
+	// Create a new regular file to fake the namespace file.
+ node := &ramfs.Entry{} + node.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0777)) + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.RegularFile, + } + return fs.NewDirent(fs.NewInode(node, inode.MountSource, sattr), n.Symlink.Target), nil +} + +func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(t, map[string]*fs.Inode{ + "net": newNamespaceFile(t, msrc, "net"), + "pid": newNamespaceFile(t, msrc, "pid"), + "user": newNamespaceFile(t, msrc, "user"), + }, fs.RootOwner, fs.FilePermsFromMode(0511)) + return newFile(d, msrc, fs.SpecialDirectory, t) +} + +// mapsData implements seqfile.SeqSource for /proc/[pid]/maps. +type mapsData struct { + t *kernel.Task +} + +func newMaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &mapsData{t}), msrc, fs.SpecialFile, t) +} + +func (md *mapsData) mm() *mm.MemoryManager { + var tmm *mm.MemoryManager + md.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + // No additional reference is taken on mm here. This is safe + // because MemoryManager.destroy is required to leave the + // MemoryManager in a state where it's still usable as a SeqSource. + tmm = mm + } + }) + return tmm +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (md *mapsData) NeedsUpdate(generation int64) bool { + if mm := md.mm(); mm != nil { + return mm.NeedsUpdate(generation) + } + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
+func (md *mapsData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if mm := md.mm(); mm != nil { + return mm.ReadSeqFileData(md.t.AsyncContext(), h) + } + return []seqfile.SeqData{}, 0 +} + +type taskStatData struct { + t *kernel.Task + + // If tgstats is true, accumulate fault stats (not implemented) and CPU + // time across all tasks in t's thread group. + tgstats bool + + // pidns is the PID namespace associated with the proc filesystem that + // includes the file using this statData. + pidns *kernel.PIDNamespace +} + +func newTaskStat(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool, pidns *kernel.PIDNamespace) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t) +} + +// NeedsUpdate returns whether the generation is old or not. +func (s *taskStatData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData returns data for the SeqFile reader. +// SeqData, the current generation and where in the file the handle corresponds to. 
+func (s *taskStatData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + + fmt.Fprintf(&buf, "%d ", s.pidns.IDOfTask(s.t)) + fmt.Fprintf(&buf, "(%s) ", s.t.Name()) + fmt.Fprintf(&buf, "%c ", s.t.StateStatus()[0]) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(&buf, "%d ", ppid) + fmt.Fprintf(&buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) + fmt.Fprintf(&buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) + fmt.Fprintf(&buf, "0 0 " /* tty_nr tpgid */) + fmt.Fprintf(&buf, "0 " /* flags */) + fmt.Fprintf(&buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) + var cputime usage.CPUStats + if s.tgstats { + cputime = s.t.ThreadGroup().CPUStats() + } else { + cputime = s.t.CPUStats() + } + fmt.Fprintf(&buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + cputime = s.t.ThreadGroup().JoinedChildCPUStats() + fmt.Fprintf(&buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + fmt.Fprintf(&buf, "%d %d ", s.t.Priority(), s.t.Niceness()) + fmt.Fprintf(&buf, "%d ", s.t.ThreadGroup().Count()) + fmt.Fprintf(&buf, "0 0 " /* itrealvalue starttime */) + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(&buf, "%d %d ", vss, rss/usermem.PageSize) + fmt.Fprintf(&buf, "0 0 0 0 0 0 " /* rsslim startcode endcode startstack kstkesp kstkeip */) + fmt.Fprintf(&buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) + fmt.Fprintf(&buf, "0 0 " /* nswap cnswap */) + terminationSignal := linux.Signal(0) + if s.t == s.t.ThreadGroup().Leader() { + terminationSignal = s.t.ThreadGroup().TerminationSignal() + } + fmt.Fprintf(&buf, "%d ", 
terminationSignal) + fmt.Fprintf(&buf, "0 0 0 " /* processor rt_priority policy */) + fmt.Fprintf(&buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) + fmt.Fprintf(&buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) + fmt.Fprintf(&buf, "0\n" /* exit_code */) + + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*taskStatData)(nil)}}, 0 +} + +// statusData implements seqfile.SeqSource for /proc/[pid]/status. +type statusData struct { + t *kernel.Task + pidns *kernel.PIDNamespace +} + +func newStatus(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &statusData{t, pidns}), msrc, fs.SpecialFile, t) +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (s *statusData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (s *statusData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + fmt.Fprintf(&buf, "Name:\t%s\n", s.t.Name()) + fmt.Fprintf(&buf, "State:\t%s\n", s.t.StateStatus()) + fmt.Fprintf(&buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) + fmt.Fprintf(&buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(&buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) + if tracer := s.t.Tracer(); tracer != nil { + tpid = s.pidns.IDOfTask(tracer) + } + fmt.Fprintf(&buf, "TracerPid:\t%d\n", tpid) + var fds int + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if fdm := t.FDMap(); fdm != nil { + fds = fdm.Size() + } + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(&buf, "FDSize:\t%d\n", fds) + fmt.Fprintf(&buf, "VmSize:\t%d kB\n", vss>>10) + 
fmt.Fprintf(&buf, "VmRSS:\t%d kB\n", rss>>10) + fmt.Fprintf(&buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) + creds := s.t.Credentials() + fmt.Fprintf(&buf, "CapInh:\t%016x\n", creds.InheritableCaps) + fmt.Fprintf(&buf, "CapPrm:\t%016x\n", creds.PermittedCaps) + fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps) + fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps) + fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0 +} + +// ioUsage is the /proc//io and /proc//task//io data provider. +type ioUsage interface { + // IOUsage returns the io usage data. + IOUsage() *usage.IO +} + +type ioData struct { + ioUsage +} + +func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) +} + +// NeedsUpdate returns whether the generation is old or not. +func (i *ioData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData returns data for the SeqFile reader. +// SeqData, the current generation and where in the file the handle corresponds to. +func (i *ioData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + io := usage.IO{} + io.Accumulate(i.IOUsage()) + + var buf bytes.Buffer + fmt.Fprintf(&buf, "char: %d\n", io.CharsRead) + fmt.Fprintf(&buf, "wchar: %d\n", io.CharsWritten) + fmt.Fprintf(&buf, "syscr: %d\n", io.ReadSyscalls) + fmt.Fprintf(&buf, "syscw: %d\n", io.WriteSyscalls) + fmt.Fprintf(&buf, "read_bytes: %d\n", io.BytesRead) + fmt.Fprintf(&buf, "write_bytes: %d\n", io.BytesWritten) + fmt.Fprintf(&buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) + + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*ioData)(nil)}}, 0 +} + +// comm is a file containing the command name for a task. +// +// On Linux, /proc/[pid]/comm is writable, and writing to the comm file changes +// the thread name. 
We don't implement this yet as there are no known users of +// this feature. +type comm struct { + ramfs.Entry + + t *kernel.Task +} + +// newComm returns a new comm file. +func newComm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + c := &comm{t: t} + c.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(c, msrc, fs.SpecialFile, t) +} + +// DeprecatedPreadv reads the current command name. +func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + buf := []byte(c.t.Name() + "\n") + if offset >= int64(len(buf)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, buf[offset:]) + return int64(n), err +} + +// auxvec is a file containing the auxiliary vector for a task. +type auxvec struct { + ramfs.Entry + + t *kernel.Task +} + +// newAuxvec returns a new auxvec file. +func newAuxvec(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + a := &auxvec{t: t} + a.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0400)) + return newFile(a, msrc, fs.SpecialFile, t) +} + +// DeprecatedPreadv reads the current auxiliary vector. +func (a *auxvec) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + m, err := getTaskMM(a.t) + if err != nil { + return 0, err + } + defer m.DecUsers(ctx) + auxv := m.Auxv() + + // Space for buffer with AT_NULL (0) terminator at the end. 
+ size := (len(auxv) + 1) * 16 + if offset >= int64(size) { + return 0, io.EOF + } + + buf := make([]byte, size) + for i, e := range auxv { + usermem.ByteOrder.PutUint64(buf[16*i:], e.Key) + usermem.ByteOrder.PutUint64(buf[16*i+8:], uint64(e.Value)) + } + + n, err := dst.CopyOut(ctx, buf[offset:]) + return int64(n), err +} diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go new file mode 100644 index 000000000..a2a070bdd --- /dev/null +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -0,0 +1,152 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// An idMapSeqSource is a seqfile.SeqSource that returns UID or GID mappings +// from a task's user namespace. +type idMapSeqSource struct { + t *kernel.Task + gids bool +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (imss *idMapSeqSource) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
+func (imss *idMapSeqSource) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + var start int + if handle != nil { + start = handle.(*idMapSeqHandle).value + } + var entries []auth.IDMapEntry + if imss.gids { + entries = imss.t.UserNamespace().GIDMap() + } else { + entries = imss.t.UserNamespace().UIDMap() + } + var data []seqfile.SeqData + i := 1 + for _, e := range entries { + if i > start { + data = append(data, seqfile.SeqData{ + Buf: idMapLineFromEntry(e), + Handle: &idMapSeqHandle{i}, + }) + } + i++ + } + return data, 0 +} + +// TODO: Fix issue requiring idMapSeqHandle wrapping an int. +type idMapSeqHandle struct { + value int +} + +type idMapSeqFile struct { + seqfile.SeqFile +} + +// newUIDMap returns a new uid_map file. +func newUIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newIDMap(t, msrc, false /* gids */) +} + +// newGIDMap returns a new gid_map file. +func newGIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newIDMap(t, msrc, true /* gids */) +} + +func newIDMap(t *kernel.Task, msrc *fs.MountSource, gids bool) *fs.Inode { + imsf := &idMapSeqFile{seqfile.SeqFile{SeqSource: &idMapSeqSource{ + t: t, + gids: gids, + }}} + imsf.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0644)) + return newFile(imsf, msrc, fs.SpecialFile, t) +} + +func (imsf *idMapSeqFile) source() *idMapSeqSource { + return imsf.SeqFile.SeqSource.(*idMapSeqSource) +} + +// "There is an (arbitrary) limit on the number of lines in the file. As at +// Linux 3.18, the limit is five lines." - user_namespaces(7) +const maxIDMapLines = 5 + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (imsf *idMapSeqFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // "In addition, the number of bytes written to the file must be less than + // the system page size, and the write must be performed at the start of + // the file ..." 
- user_namespaces(7) + srclen := src.NumBytes() + if srclen >= usermem.PageSize || offset != 0 { + return 0, syserror.EINVAL + } + b := make([]byte, srclen) + if _, err := src.CopyIn(ctx, b); err != nil { + return 0, err + } + lines := bytes.SplitN(bytes.TrimSpace(b), []byte("\n"), maxIDMapLines+1) + if len(lines) > maxIDMapLines { + return 0, syserror.EINVAL + } + entries := make([]auth.IDMapEntry, len(lines)) + for i, l := range lines { + e, err := idMapEntryFromLine(string(l)) + if err != nil { + return 0, syserror.EINVAL + } + entries[i] = e + } + t := imsf.source().t + var err error + if imsf.source().gids { + err = t.UserNamespace().SetGIDMap(ctx, entries) + } else { + err = t.UserNamespace().SetUIDMap(ctx, entries) + } + if err != nil { + return 0, err + } + return int64(len(b)), nil +} + +func idMapLineFromEntry(e auth.IDMapEntry) []byte { + var b bytes.Buffer + fmt.Fprintf(&b, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) + return b.Bytes() +} + +func idMapEntryFromLine(line string) (auth.IDMapEntry, error) { + var e auth.IDMapEntry + _, err := fmt.Sscan(line, &e.FirstID, &e.FirstParentID, &e.Length) + return e, err +} diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go new file mode 100644 index 000000000..4679d5821 --- /dev/null +++ b/pkg/sentry/fs/proc/uptime.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// uptime is a file containing the system uptime. +type uptime struct { + ramfs.Entry + + // The "start time" of the sandbox. + startTime ktime.Time +} + +// newUptime returns a new uptime file. +func (p *proc) newUptime(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + u := &uptime{ + startTime: ktime.NowFromContext(ctx), + } + u.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(u, msrc, fs.SpecialFile, nil) +} + +// DeprecatedPreadv reads the current uptime. +func (u *uptime) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + now := ktime.NowFromContext(ctx) + // Pretend that we've spent zero time sleeping (second number). + s := []byte(fmt.Sprintf("%.2f 0.00\n", now.Sub(u.startTime).Seconds())) + if offset >= int64(len(s)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, s[offset:]) + return int64(n), err +} diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go new file mode 100644 index 000000000..df3040d37 --- /dev/null +++ b/pkg/sentry/fs/proc/version.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// versionData backs /proc/version. +type versionData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*versionData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (v *versionData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + init := v.k.GlobalInit() + if init == nil { + // Attempted to read before the init Task is created. This can + // only occur during startup, which should never need to read + // this file. + panic("Attempted to read version before initial Task is available") + } + + // /proc/version takes the form: + // + // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) + // (COMPILER_VERSION) VERSION" + // + // where: + // - SYSNAME, RELEASE, and VERSION are the same as returned by + // sys_utsname + // - COMPILE_USER is the user that build the kernel + // - COMPILE_HOST is the hostname of the machine on which the kernel + // was built + // - COMPILER_VERSION is the version reported by the building compiler + // + // Since we don't really want to expose build information to + // applications, those fields are omitted. 
+ // + // FIXME: Using Version from the init task SyscallTable + // disregards the different version a task may have (e.g., in a uts + // namespace). + ver := init.Leader().SyscallTable().Version + return []seqfile.SeqData{ + { + Buf: []byte(fmt.Sprintf("%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)), + Handle: (*versionData)(nil), + }, + }, 0 +} diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD new file mode 100644 index 000000000..663a1aeb9 --- /dev/null +++ b/pkg/sentry/fs/ramfs/BUILD @@ -0,0 +1,62 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "ramfs_state", + srcs = [ + "dir.go", + "file.go", + "ramfs.go", + "socket.go", + "symlink.go", + ], + out = "ramfs_state.go", + package = "ramfs", +) + +go_library( + name = "ramfs", + srcs = [ + "dir.go", + "file.go", + "ramfs.go", + "ramfs_state.go", + "socket.go", + "symlink.go", + "tree.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/amutex", + "//pkg/log", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/safemem", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "ramfs_test", + size = "small", + srcs = ["tree_test.go"], + embed = [":ramfs"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + ], +) diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go new file mode 100644 index 000000000..bf4cd8dfd --- /dev/null +++ b/pkg/sentry/fs/ramfs/dir.go @@ -0,0 +1,364 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// CreateOps represents operations to create different file types. +type CreateOps struct { + // NewDir creates a new directory. + NewDir func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) + + // NewFile creates a new file. + NewFile func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) + + // NewSymlink creates a new symlink with permissions 0777. + NewSymlink func(ctx context.Context, dir *fs.Inode, target string) (*fs.Inode, error) + + // NewBoundEndpoint creates a new socket. + NewBoundEndpoint func(ctx context.Context, dir *fs.Inode, ep unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Inode, error) + + // NewFifo creates a new fifo. + NewFifo func(ctx context.Context, dir *fs.Inode, perm fs.FilePermissions) (*fs.Inode, error) +} + +// Dir represents a single directory in the filesystem. +type Dir struct { + Entry + + // CreateOps may be provided. + // + // These may only be modified during initialization (while the application + // is not running). 
No synchronization is performed when accessing these
+	// operations during syscalls.
+	*CreateOps `state:"nosave"`
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// children are inodes that are in this directory. A reference is held
+	// on each inode while it is in the map.
+	children map[string]*fs.Inode
+
+	// dentryMap is a sortedDentryMap containing entries for all children.
+	// Its entries are kept up-to-date with d.children.
+	dentryMap *fs.SortedDentryMap
+}
+
+// InitDir initializes a directory.
+func (d *Dir) InitDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions) {
+	d.InitEntry(ctx, owner, perms)
+	if contents == nil {
+		contents = make(map[string]*fs.Inode)
+	}
+	d.children = contents
+	// Build the entries map ourselves, rather than calling addChildLocked,
+	// because it will be faster.
+	entries := make(map[string]fs.DentAttr, len(contents))
+	for name, inode := range contents {
+		entries[name] = fs.DentAttr{
+			Type:    inode.StableAttr.Type,
+			InodeID: inode.StableAttr.InodeID,
+		}
+	}
+	d.dentryMap = fs.NewSortedDentryMap(entries)
+
+	// Directories have an extra link, corresponding to '.'.
+	d.AddLink()
+}
+
+// addChildLocked adds the child inode, inheriting its reference.
+func (d *Dir) addChildLocked(name string, inode *fs.Inode) {
+	d.children[name] = inode
+	d.dentryMap.Add(name, fs.DentAttr{
+		Type:    inode.StableAttr.Type,
+		InodeID: inode.StableAttr.InodeID,
+	})
+
+	// If the child is a directory, increment this dir's link count,
+	// corresponding to '..' from the subdirectory.
+	if fs.IsDir(inode.StableAttr) {
+		d.AddLink()
+	}
+
+	// Given we're now adding this inode to the directory we must also
+	// increase its link count. Similarly we decremented it in removeChildLocked.
+	inode.AddLink()
+}
+
+// AddChild adds a child to this dir.
+func (d *Dir) AddChild(ctx context.Context, name string, inode *fs.Inode) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	d.addChildLocked(name, inode)
+}
+
+// FindChild returns (child, true) if the directory contains name.
+func (d *Dir) FindChild(name string) (*fs.Inode, bool) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	child, ok := d.children[name]
+	return child, ok
+}
+
+// removeChildLocked attempts to remove an entry from this directory.
+// This Entry's mutex must be held. It returns the removed Inode.
+func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, error) {
+	inode, ok := d.children[name]
+	if !ok {
+		return nil, ErrNotFound
+	}
+
+	delete(d.children, name)
+	d.dentryMap.Remove(name)
+	d.Entry.NotifyModification(ctx)
+
+	// If the child was a subdirectory, then we must decrement this dir's
+	// link count which was the child's ".." directory entry.
+	if fs.IsDir(inode.StableAttr) {
+		d.DropLink()
+	}
+
+	// Update ctime.
+	inode.NotifyStatusChange(ctx)
+
+	// Given we're now removing this inode from the directory we must also
+	// decrease its link count. Similarly it is increased in addChildLocked.
+	inode.DropLink()
+
+	return inode, nil
+}
+
+// RemoveEntry attempts to remove an entry from this directory.
+func (d *Dir) RemoveEntry(ctx context.Context, name string) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	inode, err := d.removeChildLocked(ctx, name)
+	if err != nil {
+		return err
+	}
+
+	// Remove our reference on the inode.
+	inode.DecRef()
+	return nil
+}
+
+// Remove removes the named non-directory.
+func (d *Dir) Remove(ctx context.Context, dir *fs.Inode, name string) error {
+	return d.RemoveEntry(ctx, name)
+}
+
+// RemoveDirectory removes the named directory.
+func (d *Dir) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + d.mu.Lock() + defer d.mu.Unlock() + + n, err := d.walkLocked(ctx, name) + if err != nil { + return err + } + dirCtx := &fs.DirCtx{} + if _, err := n.HandleOps().DeprecatedReaddir(ctx, dirCtx, 0); err != nil { + return err + } + if len(dirCtx.DentAttrs()) > 0 { + return ErrNotEmpty + } + inode, err := d.removeChildLocked(ctx, name) + if err != nil { + return err + } + + // Remove our reference on the inode. + inode.DecRef() + + return err +} + +// Lookup loads an inode at p into a Dirent. +func (d *Dir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + d.mu.Lock() + defer d.mu.Unlock() + + inode, err := d.walkLocked(ctx, p) + if err != nil { + return nil, err + } + + // Take a reference on the inode before returning it. This reference + // is owned by the dirent we are about to create. + inode.IncRef() + return fs.NewDirent(inode, p), nil +} + +// walkLocked must be called with this Entry's mutex held. +func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { + d.Entry.NotifyAccess(ctx) + + // Lookup a child node. + if inode, ok := d.children[p]; ok { + return inode, nil + } + + // fs.InodeOperations.Lookup returns syserror.ENOENT if p + // does not exist. + return nil, syserror.ENOENT +} + +// createInodeOperationsCommon creates a new child node at this dir by calling +// makeInodeOperations. It is the common logic for creating a new child. 
+func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, makeInodeOperations func() (*fs.Inode, error)) (*fs.Inode, error) { + d.mu.Lock() + defer d.mu.Unlock() + + if _, ok := d.children[name]; ok { + return nil, syscall.EEXIST + } + + inode, err := makeInodeOperations() + if err != nil { + return nil, err + } + + d.addChildLocked(name, inode) + d.Entry.NotifyModification(ctx) + + return inode, nil +} + +// Create creates a new Inode with the given name and returns its File. +func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perms fs.FilePermissions) (*fs.File, error) { + if d.CreateOps == nil || d.CreateOps.NewFile == nil { + return nil, ErrDenied + } + + inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + return d.NewFile(ctx, dir, perms) + }) + if err != nil { + return nil, err + } + + // Take an extra ref on inode, which will be owned by the dirent. + inode.IncRef() + + // Create the Dirent and corresponding file. + created := fs.NewDirent(inode, name) + defer created.DecRef() + return created.Inode.GetFile(ctx, created, flags) +} + +// CreateLink returns a new link. +func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { + if d.CreateOps == nil || d.CreateOps.NewSymlink == nil { + return ErrDenied + } + _, err := d.createInodeOperationsCommon(ctx, newname, func() (*fs.Inode, error) { + return d.NewSymlink(ctx, dir, oldname) + }) + return err +} + +// CreateHardLink creates a new hard link. +func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { + d.mu.Lock() + defer d.mu.Unlock() + + // Take an extra reference on the inode and add it to our children. + target.IncRef() + + // The link count will be incremented in addChildLocked. + d.addChildLocked(name, target) + d.Entry.NotifyModification(ctx) + + // Update ctime. 
+ target.NotifyStatusChange(ctx) + + return nil +} + +// CreateDirectory returns a new subdirectory. +func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { + if d.CreateOps == nil || d.CreateOps.NewDir == nil { + return ErrDenied + } + _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + return d.NewDir(ctx, dir, perms) + }) + // TODO: Support updating status times, as those should be + // updated by links. + return err +} + +// Bind implements fs.InodeOperations.Bind. +func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perms fs.FilePermissions) error { + if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { + return ErrDenied + } + _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + return d.NewBoundEndpoint(ctx, dir, ep, perms) + }) + if err == syscall.EEXIST { + return syscall.EADDRINUSE + } + return err +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { + if d.CreateOps == nil || d.CreateOps.NewFifo == nil { + return ErrDenied + } + _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + return d.NewFifo(ctx, dir, perms) + }) + return err +} + +func (d *Dir) readdirLocked(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + // Serialize the entries in dentryMap. + n, err := fs.GenericReaddir(dirCtx, d.dentryMap) + + // Touch the access time. + d.Entry.NotifyAccess(ctx) + + return offset + n, err +} + +// DeprecatedReaddir emits the entries contained in this directory. 
+func (d *Dir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + d.mu.Lock() + defer d.mu.Unlock() + return d.readdirLocked(ctx, dirCtx, offset) +} + +// DeprecatedPreadv always returns ErrIsDirectory +func (*Dir) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ErrIsDirectory +} + +// DeprecatedPwritev always returns ErrIsDirectory +func (*Dir) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ErrIsDirectory +} diff --git a/pkg/sentry/fs/ramfs/file.go b/pkg/sentry/fs/ramfs/file.go new file mode 100644 index 000000000..e8363c3e2 --- /dev/null +++ b/pkg/sentry/fs/ramfs/file.go @@ -0,0 +1,148 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// File represents a unique file. It uses a simple byte slice as storage, and +// thus should only be used for small files. +// +// A File is not mappable. +type File struct { + Entry + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // data tracks backing data for the file. 
+ data []byte +} + +// InitFile initializes a file. +func (f *File) InitFile(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions) { + f.InitEntry(ctx, owner, perms) +} + +// UnstableAttr returns unstable attributes of this ramfs file. +func (f *File) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + f.mu.Lock() + defer f.mu.Unlock() + + uattr, _ := f.Entry.UnstableAttr(ctx, inode) + uattr.Size = int64(len(f.data)) + uattr.Usage = f.usageLocked() + + return uattr, nil +} + +// usageLocked returns the disk usage. Caller must hold f.mu. +func (f *File) usageLocked() int64 { + return int64(len(f.data)) +} + +// Append appends the given data. This is for internal use. +func (f *File) Append(data []byte) { + f.mu.Lock() + defer f.mu.Unlock() + f.data = append(f.data, data...) +} + +// Truncate truncates this node. +func (f *File) Truncate(ctx context.Context, inode *fs.Inode, l int64) error { + f.mu.Lock() + defer f.mu.Unlock() + if l < int64(len(f.data)) { + // Remove excess bytes. + f.data = f.data[:l] + return nil + } else if l > int64(len(f.data)) { + // Create a new slice with size l, and copy f.data into it. + d := make([]byte, l) + copy(d, f.data) + f.data = d + } + f.Entry.NotifyModification(ctx) + return nil +} + +// ReadAt implements io.ReaderAt. +func (f *File) ReadAt(data []byte, offset int64) (int, error) { + if offset < 0 { + return 0, ErrInvalidOp + } + if offset >= int64(len(f.data)) { + return 0, io.EOF + } + n := copy(data, f.data[offset:]) + // Did we read past the end? + if offset+int64(len(data)) >= int64(len(f.data)) { + return n, io.EOF + } + return n, nil +} + +// DeprecatedPreadv reads into a collection of slices from a given offset. 
+func (f *File) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + f.mu.Lock() + defer f.mu.Unlock() + if offset >= int64(len(f.data)) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, f.data[offset:]) + if n > 0 { + f.Entry.NotifyAccess(ctx) + } + return int64(n), err +} + +// WriteAt implements io.WriterAt. +func (f *File) WriteAt(data []byte, offset int64) (int, error) { + if offset < 0 { + return 0, ErrInvalidOp + } + newLen := offset + int64(len(data)) + if newLen < 0 { + // Overflow. + return 0, syserror.EINVAL + } + if newLen > int64(len(f.data)) { + // Copy f.data into new slice with expanded length. + d := make([]byte, newLen) + copy(d, f.data) + f.data = d + } + return copy(f.data[offset:], data), nil +} + +// DeprecatedPwritev writes from a collection of slices at a given offset. +func (f *File) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + f.mu.Lock() + defer f.mu.Unlock() + n, err := src.CopyInTo(ctx, safemem.FromIOWriter{secio.NewOffsetWriter(f, offset)}) + if n > 0 { + f.Entry.NotifyModification(ctx) + } + return n, err +} diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go new file mode 100644 index 000000000..04f2d38de --- /dev/null +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -0,0 +1,433 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+// Package ramfs implements an in-memory file system that can be associated with
+// any device.
+package ramfs
+
+import (
+	"errors"
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+var (
+	// ErrInvalidOp indicates the operation is not valid.
+	ErrInvalidOp = errors.New("invalid operation")
+
+	// ErrDenied indicates the operation was denied.
+	ErrDenied = errors.New("operation denied")
+
+	// ErrNotFound indicates that a node was not found on a walk.
+	ErrNotFound = errors.New("node not found")
+
+	// ErrCrossDevice indicates a cross-device link or rename.
+	ErrCrossDevice = errors.New("can't link across filesystems")
+
+	// ErrIsDirectory indicates that the operation failed because
+	// the node is a directory.
+	ErrIsDirectory = errors.New("is a directory")
+
+	// ErrNotDirectory indicates that the operation failed because
+	// the node is not a directory.
+	ErrNotDirectory = errors.New("not a directory")
+
+	// ErrNotEmpty indicates that the operation failed because the
+	// directory is not empty.
+	ErrNotEmpty = errors.New("directory not empty")
+)
+
+// Entry represents common internal state for file and directory nodes.
+// This may be used by other packages to easily create ramfs files.
+type Entry struct {
+	waiter.AlwaysReady    `state:"nosave"`
+	fsutil.NoMappable     `state:"nosave"`
+	fsutil.NoopWriteOut   `state:"nosave"`
+	fsutil.InodeNotSocket `state:"nosave"`
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// unstable is unstable attributes.
+ unstable fs.UnstableAttr + + // xattrs are the extended attributes of the Entry. + xattrs map[string][]byte +} + +// InitEntry initializes an entry. +func (e *Entry) InitEntry(ctx context.Context, owner fs.FileOwner, p fs.FilePermissions) { + e.InitEntryWithAttr(ctx, fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: p, + // Always start unlinked. + Links: 0, + })) +} + +// InitEntryWithAttr initializes an entry with a complete set of attributes. +func (e *Entry) InitEntryWithAttr(ctx context.Context, uattr fs.UnstableAttr) { + e.unstable = uattr + e.xattrs = make(map[string][]byte) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (e *Entry) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable, nil +} + +// Check implements fs.InodeOperations.Check. +func (*Entry) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (e *Entry) Getxattr(inode *fs.Inode, name string) ([]byte, error) { + e.mu.Lock() + defer e.mu.Unlock() + if value, ok := e.xattrs[name]; ok { + return value, nil + } + return nil, syserror.ENOATTR +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (e *Entry) Setxattr(inode *fs.Inode, name string, value []byte) error { + e.mu.Lock() + defer e.mu.Unlock() + e.xattrs[name] = value + return nil +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (e *Entry) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { + e.mu.Lock() + defer e.mu.Unlock() + names := make(map[string]struct{}, len(e.xattrs)) + for name := range e.xattrs { + names[name] = struct{}{} + } + return names, nil +} + +// GetFile returns a fs.File backed by the dirent argument and flags. 
+func (*Entry) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), nil +} + +// SetPermissions always sets the permissions. +func (e *Entry) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + e.mu.Lock() + defer e.mu.Unlock() + e.unstable.Perms = p + e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) + return true +} + +// SetOwner always sets ownership. +func (e *Entry) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + e.mu.Lock() + defer e.mu.Unlock() + if owner.UID.Ok() { + e.unstable.Owner.UID = owner.UID + } + if owner.GID.Ok() { + e.unstable.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps sets the timestamps. +func (e *Entry) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + e.mu.Lock() + defer e.mu.Unlock() + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + e.unstable.AccessTime = now + } else { + e.unstable.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + e.unstable.ModificationTime = now + } else { + e.unstable.ModificationTime = ts.MTime + } + } + e.unstable.StatusChangeTime = now + return nil +} + +// NotifyStatusChange updates the status change time (ctime). +func (e *Entry) NotifyStatusChange(ctx context.Context) { + e.mu.Lock() + defer e.mu.Unlock() + e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// StatusChangeTime returns the last status change time for this node. +func (e *Entry) StatusChangeTime() ktime.Time { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable.StatusChangeTime +} + +// NotifyModification updates the modification time and the status change time. 
+func (e *Entry) NotifyModification(ctx context.Context) { + e.mu.Lock() + defer e.mu.Unlock() + now := ktime.NowFromContext(ctx) + e.unstable.ModificationTime = now + e.unstable.StatusChangeTime = now +} + +// ModificationTime returns the last modification time for this node. +func (e *Entry) ModificationTime() ktime.Time { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable.ModificationTime +} + +// NotifyAccess updates the access time. +func (e *Entry) NotifyAccess(ctx context.Context) { + e.mu.Lock() + defer e.mu.Unlock() + now := ktime.NowFromContext(ctx) + e.unstable.AccessTime = now +} + +// AccessTime returns the last access time for this node. +func (e *Entry) AccessTime() ktime.Time { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable.AccessTime +} + +// Permissions returns permissions on this entry. +func (e *Entry) Permissions() fs.FilePermissions { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable.Perms +} + +// Lookup is not supported by default. +func (*Entry) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { + return nil, ErrInvalidOp +} + +// Create is not supported by default. +func (*Entry) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { + return nil, ErrInvalidOp +} + +// CreateLink is not supported by default. +func (*Entry) CreateLink(context.Context, *fs.Inode, string, string) error { + return ErrInvalidOp +} + +// CreateHardLink is not supported by default. +func (*Entry) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return ErrInvalidOp +} + +// IsVirtual returns true. +func (*Entry) IsVirtual() bool { + return true +} + +// CreateDirectory is not supported by default. +func (*Entry) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return ErrInvalidOp +} + +// Bind is not supported by default. 
+func (*Entry) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { + return ErrInvalidOp +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. CreateFifo is not supported by +// default. +func (*Entry) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return ErrInvalidOp +} + +// Remove is not supported by default. +func (*Entry) Remove(context.Context, *fs.Inode, string) error { + return ErrInvalidOp +} + +// RemoveDirectory is not supported by default. +func (*Entry) RemoveDirectory(context.Context, *fs.Inode, string) error { + return ErrInvalidOp +} + +// StatFS always returns ENOSYS. +func (*Entry) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syscall.ENOSYS +} + +// Rename implements fs.InodeOperations.Rename. +func (e *Entry) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName) +} + +// Rename renames from a *ramfs.Dir to another *ramfs.Dir. +func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string) error { + op, ok := oldParent.(*Dir) + if !ok { + return ErrCrossDevice + } + np, ok := newParent.(*Dir) + if !ok { + return ErrCrossDevice + } + + np.mu.Lock() + defer np.mu.Unlock() + + // Check whether the ramfs entry to be replaced is a non-empty directory. + if replaced, ok := np.children[newName]; ok { + if fs.IsDir(replaced.StableAttr) { + // FIXME: simplify by pinning children of ramfs-backed directories + // in the Dirent tree: this allows us to generalize ramfs operations without + // relying on an implementation of Readdir (which may do anything, like require + // that the file be open ... which would be reasonable). 
+			dirCtx := &fs.DirCtx{}
+			_, err := replaced.HandleOps().DeprecatedReaddir(ctx, dirCtx, 0)
+			if err != nil {
+				return err
+			}
+			attrs := dirCtx.DentAttrs()
+
+			// ramfs-backed directories should not contain "." and "..", but we do this
+			// just in case.
+			delete(attrs, ".")
+			delete(attrs, "..")
+
+			// If the directory to be replaced is not empty, reject the rename.
+			if len(attrs) != 0 {
+				return ErrNotEmpty
+			}
+		}
+	}
+
+	// Be careful, we may have already grabbed this mutex above.
+	if op != np {
+		op.mu.Lock()
+		defer op.mu.Unlock()
+	}
+
+	// Do the swap.
+	n := op.children[oldName]
+	op.removeChildLocked(ctx, oldName)
+	np.addChildLocked(newName, n)
+
+	// Update ctime.
+	n.NotifyStatusChange(ctx)
+
+	return nil
+}
+
+// Truncate is not supported by default.
+func (*Entry) Truncate(context.Context, *fs.Inode, int64) error {
+	return ErrInvalidOp
+}
+
+// Readlink always returns ENOLINK.
+func (*Entry) Readlink(context.Context, *fs.Inode) (string, error) {
+	return "", syscall.ENOLINK
+}
+
+// Getlink always returns ENOLINK.
+func (*Entry) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
+	return nil, syscall.ENOLINK
+}
+
+// Release is a no-op.
+func (e *Entry) Release(context.Context) {}
+
+// AddLink implements InodeOperations.AddLink.
+func (e *Entry) AddLink() {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.unstable.Links++
+}
+
+// DropLink implements InodeOperations.DropLink.
+func (e *Entry) DropLink() {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.unstable.Links--
+}
+
+// DeprecatedReaddir is not supported by default.
+func (*Entry) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) {
+	return 0, ErrNotDirectory
+}
+
+// DeprecatedPreadv always returns ErrInvalidOp.
+func (*Entry) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) {
+	return 0, ErrInvalidOp
+}
+
+// DeprecatedPwritev always returns ErrInvalidOp.
+func (*Entry) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ErrInvalidOp +} + +// DeprecatedFsync is a noop. +func (*Entry) DeprecatedFsync() error { + // Ignore, this is in memory. + return nil +} + +// DeprecatedFlush always returns nil. +func (*Entry) DeprecatedFlush() error { + return nil +} + +// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. +func (*Entry) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { + return nil, false +} + +func init() { + // Register ramfs errors. + syserror.AddErrorTranslation(ErrInvalidOp, syscall.EINVAL) + syserror.AddErrorTranslation(ErrDenied, syscall.EACCES) + syserror.AddErrorTranslation(ErrNotFound, syscall.ENOENT) + syserror.AddErrorTranslation(ErrCrossDevice, syscall.EXDEV) + syserror.AddErrorTranslation(ErrIsDirectory, syscall.EISDIR) + syserror.AddErrorTranslation(ErrNotDirectory, syscall.ENOTDIR) + syserror.AddErrorTranslation(ErrNotEmpty, syscall.ENOTEMPTY) +} diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go new file mode 100644 index 000000000..b0c79325f --- /dev/null +++ b/pkg/sentry/fs/ramfs/socket.go @@ -0,0 +1,42 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ramfs + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// Socket represents a socket. +type Socket struct { + Entry + + // ep is the bound endpoint. + ep unix.BoundEndpoint +} + +// InitSocket initializes a socket. +func (s *Socket) InitSocket(ctx context.Context, ep unix.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions) { + s.InitEntry(ctx, owner, perms) + s.ep = ep +} + +// BoundEndpoint returns the socket data. +func (s *Socket) BoundEndpoint(*fs.Inode, string) unix.BoundEndpoint { + // ramfs only supports stored sentry internal sockets. Only gofer sockets + // care about the path argument. + return s.ep +} diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go new file mode 100644 index 000000000..9bbf78619 --- /dev/null +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -0,0 +1,72 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// Symlink represents a symlink. +type Symlink struct { + Entry + + mu sync.Mutex `state:"nosave"` + + // Target is the symlink target. + Target string +} + +// InitSymlink initializes a symlink, pointing to the given target. +// A symlink is assumed to always have permissions 0777. 
+func (s *Symlink) InitSymlink(ctx context.Context, owner fs.FileOwner, target string) { + s.InitEntry(ctx, owner, fs.FilePermsFromMode(0777)) + s.Target = target +} + +// UnstableAttr returns all attributes of this ramfs symlink. +func (s *Symlink) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, _ := s.Entry.UnstableAttr(ctx, inode) + uattr.Size = int64(len(s.Target)) + uattr.Usage = uattr.Size + return uattr, nil +} + +// Check implements InodeOperations.Check. +func (s *Symlink) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions on a symlink is always rejected. +func (s *Symlink) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool { + return false +} + +// Readlink reads the symlink value. +func (s *Symlink) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { + s.mu.Lock() + defer s.mu.Unlock() + + s.Entry.NotifyAccess(ctx) + return s.Target, nil +} + +// Getlink returns ErrResolveViaReadlink, falling back to walking to the result +// of Readlink(). 
+func (*Symlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + return nil, fs.ErrResolveViaReadlink +} diff --git a/pkg/sentry/fs/ramfs/test/BUILD b/pkg/sentry/fs/ramfs/test/BUILD new file mode 100644 index 000000000..074b0f5ad --- /dev/null +++ b/pkg/sentry/fs/ramfs/test/BUILD @@ -0,0 +1,31 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "test_state", + srcs = [ + "test.go", + ], + out = "test_state.go", + package = "test", +) + +go_library( + name = "test", + testonly = 1, + srcs = [ + "test.go", + "test_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs", + "//pkg/state", + ], +) diff --git a/pkg/sentry/fs/ramfs/test/test.go b/pkg/sentry/fs/ramfs/test/test.go new file mode 100644 index 000000000..fb669558f --- /dev/null +++ b/pkg/sentry/fs/ramfs/test/test.go @@ -0,0 +1,46 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package test provides a simple ramfs-based filesystem for use in testing. 
+package test + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" +) + +// Dir is a simple ramfs.Dir that supports save/restore as-is. +type Dir struct { + ramfs.Dir +} + +// NewDir returns a simple ramfs directory with the passed contents. +func NewDir(ctx context.Context, contents map[string]*fs.Inode, perms fs.FilePermissions) *Dir { + d := &Dir{} + d.InitDir(ctx, contents, fs.RootOwner, perms) + return d +} + +// File is a simple ramfs.File that supports save/restore as-is. +type File struct { + ramfs.File +} + +// NewFile returns a simple ramfs File. +func NewFile(ctx context.Context, perms fs.FilePermissions) *File { + f := &File{} + f.InitFile(ctx, fs.RootOwner, perms) + return f +} diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go new file mode 100644 index 000000000..1fb335f74 --- /dev/null +++ b/pkg/sentry/fs/ramfs/tree.go @@ -0,0 +1,71 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "fmt" + "path" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// MakeDirectoryTree constructs a ramfs tree of all directories containing +// subdirs. 
Each element of subdir must be a clean path, and cannot be empty or +// "/". +func MakeDirectoryTree(ctx context.Context, msrc *fs.MountSource, subdirs []string) (*fs.Inode, error) { + root := emptyDir(ctx, msrc) + for _, subdir := range subdirs { + if path.Clean(subdir) != subdir { + return nil, fmt.Errorf("cannot add subdir at an unclean path: %q", subdir) + } + if subdir == "" || subdir == "/" { + return nil, fmt.Errorf("cannot add subdir at %q", subdir) + } + makeSubdir(ctx, msrc, root.InodeOperations.(*Dir), subdir) + } + return root, nil +} + +// makeSubdir installs into root each component of subdir. The final component is +// a *ramfs.Dir. +func makeSubdir(ctx context.Context, msrc *fs.MountSource, root *Dir, subdir string) { + for _, c := range strings.Split(subdir, "/") { + if len(c) == 0 { + continue + } + child, ok := root.FindChild(c) + if !ok { + child = emptyDir(ctx, msrc) + root.AddChild(ctx, c, child) + } + root = child.InodeOperations.(*Dir) + } +} + +// emptyDir returns an empty *ramfs.Dir that is traversable but not writable. +func emptyDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + dir := &Dir{} + dir.InitDir(ctx, make(map[string]*fs.Inode), fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(dir, msrc, fs.StableAttr{ + DeviceID: anon.PseudoDevice.DeviceID(), + InodeID: anon.PseudoDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Directory, + }) +} diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go new file mode 100644 index 000000000..68e2929d5 --- /dev/null +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +func TestMakeDirectoryTree(t *testing.T) { + mount := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + + for _, test := range []struct { + name string + subdirs []string + }{ + { + name: "abs paths", + subdirs: []string{ + "/tmp", + "/tmp/a/b", + "/tmp/a/c/d", + "/tmp/c", + "/proc", + "/dev/a/b", + "/tmp", + }, + }, + { + name: "rel paths", + subdirs: []string{ + "tmp", + "tmp/a/b", + "tmp/a/c/d", + "tmp/c", + "proc", + "dev/a/b", + "tmp", + }, + }, + } { + ctx := contexttest.Context(t) + tree, err := MakeDirectoryTree(ctx, mount, test.subdirs) + if err != nil { + t.Errorf("%s: failed to make ramfs tree, got error %v, want nil", test.name, err) + continue + } + + // Expect to be able to find each of the paths. + mm, err := fs.NewMountNamespace(ctx, tree) + if err != nil { + t.Errorf("%s: failed to create mount manager: %v", test.name, err) + continue + } + root := mm.Root() + defer mm.DecRef() + + for _, p := range test.subdirs { + if _, err := mm.FindInode(ctx, root, nil, p, 0); err != nil { + t.Errorf("%s: failed to find node %s: %v", test.name, p, err) + break + } + } + } +} diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go new file mode 100644 index 000000000..b4ac85a27 --- /dev/null +++ b/pkg/sentry/fs/restore.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fs
+
+import (
+	"sync"
+)
+
+// RestoreEnvironment is the restore environment for file systems. It consists
+// of things that change across save and restore and therefore cannot be saved
+// in the object graph.
+type RestoreEnvironment struct {
+	// MountSources maps Filesystem.Name() to mount arguments.
+	MountSources map[string][]MountArgs
+
+	// ValidateFileSize indicates file size should not change across S/R.
+	ValidateFileSize bool
+
+	// ValidateFileTimestamp indicates file modification timestamp should
+	// not change across S/R.
+	ValidateFileTimestamp bool
+}
+
+// MountArgs holds arguments to Mount.
+type MountArgs struct {
+	// Dev corresponds to the devname argument of Mount.
+	Dev string
+
+	// Flags corresponds to the flags argument of Mount.
+	Flags MountSourceFlags
+
+	// Data corresponds to the data argument of Mount.
+	Data string
+}
+
+// restoreEnv holds the fs package global RestoreEnvironment.
+var restoreEnv = struct {
+	mu  sync.Mutex
+	env RestoreEnvironment
+	set bool
+}{}
+
+// SetRestoreEnvironment sets the RestoreEnvironment. Must be called before
+// state.Load and only once.
+func SetRestoreEnvironment(r RestoreEnvironment) { + restoreEnv.mu.Lock() + defer restoreEnv.mu.Unlock() + if restoreEnv.set { + panic("RestoreEnvironment may only be set once") + } + restoreEnv.env = r + restoreEnv.set = true +} + +// CurrentRestoreEnvironment returns the current, read-only RestoreEnvironment. +// If no RestoreEnvironment was ever set, returns (_, false). +func CurrentRestoreEnvironment() (RestoreEnvironment, bool) { + restoreEnv.mu.Lock() + defer restoreEnv.mu.Unlock() + e := restoreEnv.env + set := restoreEnv.set + return e, set +} diff --git a/pkg/sentry/fs/save.go b/pkg/sentry/fs/save.go new file mode 100644 index 000000000..bf2a85143 --- /dev/null +++ b/pkg/sentry/fs/save.go @@ -0,0 +1,77 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// SaveInodeMappings saves a mapping of path -> inode ID for every +// user-reachable Dirent. +// +// The entire kernel must be frozen to call this, and filesystem state must not +// change between SaveInodeMappings and state.Save, otherwise the saved state +// of any MountSource may be incoherent. 
+func SaveInodeMappings() { + mountsSeen := make(map[*MountSource]struct{}) + for dirent := range allDirents.dirents { + if _, ok := mountsSeen[dirent.Inode.MountSource]; !ok { + dirent.Inode.MountSource.ResetInodeMappings() + mountsSeen[dirent.Inode.MountSource] = struct{}{} + } + } + + for dirent := range allDirents.dirents { + if dirent.Inode != nil { + // We cannot trust the root provided in the mount due + // to the overlay. We can trust the overlay to delegate + // SaveInodeMappings to the right underlying + // filesystems, though. + root := dirent + for !root.mounted && root.parent != nil { + root = root.parent + } + + // Add the mapping. + n, reachable := dirent.FullName(root) + if !reachable { + // Something has gone seriously wrong if we can't reach our root. + panic(fmt.Sprintf("Unreachable root on dirent file %s", n)) + } + dirent.Inode.MountSource.SaveInodeMapping(dirent.Inode, n) + } + } +} + +// SaveFileFsyncError converts an fs.File.Fsync error to an error that +// indicates that the fs.File was not synced sufficiently to be saved. +func SaveFileFsyncError(err error) error { + switch err { + case nil: + // We succeeded, everything is great. + return nil + case syscall.EBADF, syscall.EINVAL, syscall.EROFS, syscall.ENOSYS, syscall.EPERM: + // These errors mean that the underlying node might not be syncable, + // which we expect to be reported as such even from the gofer. + log.Infof("failed to sync during save: %v", err) + return nil + default: + // We failed in some way that indicates potential data loss. + return fmt.Errorf("failed to sync: %v, data loss may occur", err) + } +} diff --git a/pkg/sentry/fs/seek.go b/pkg/sentry/fs/seek.go new file mode 100644 index 000000000..1268726c2 --- /dev/null +++ b/pkg/sentry/fs/seek.go @@ -0,0 +1,43 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// SeekWhence determines seek direction. +type SeekWhence int + +const ( + // SeekSet sets the absolute offset. + SeekSet SeekWhence = iota + + // SeekCurrent sets relative to the current position. + SeekCurrent + + // SeekEnd sets relative to the end of the file. + SeekEnd +) + +// String returns a human readable string for whence. +func (s SeekWhence) String() string { + switch s { + case SeekSet: + return "Set" + case SeekCurrent: + return "Current" + case SeekEnd: + return "End" + default: + return "Unknown" + } +} diff --git a/pkg/sentry/fs/sync.go b/pkg/sentry/fs/sync.go new file mode 100644 index 000000000..9738a8f22 --- /dev/null +++ b/pkg/sentry/fs/sync.go @@ -0,0 +1,43 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// SyncType enumerates ways in which a File can be synced. +type SyncType int + +const ( + // SyncAll indicates that modified in-memory metadata and data should + // be written to backing storage. SyncAll implies SyncBackingStorage. 
+ SyncAll SyncType = iota + + // SyncData indicates that along with modified in-memory data, only + // metadata needed to access that data needs to be written. + // + // For example, changes to access time or modification time do not + // need to be written because they are not necessary for a data read + // to be handled correctly, unlike the file size. + // + // The aim of SyncData is to reduce disk activity for applications + // that do not require all metadata to be synchronized with the disk, + // see fdatasync(2). File systems that implement SyncData as SyncAll + // do not support this optimization. + // + // SyncData implies SyncBackingStorage. + SyncData + + // SyncBackingStorage indicates that in-flight write operations to + // backing storage should be flushed. + SyncBackingStorage +) diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD new file mode 100644 index 000000000..0ae2cbac8 --- /dev/null +++ b/pkg/sentry/fs/sys/BUILD @@ -0,0 +1,34 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "sys_state", + srcs = [ + "fs.go", + "sys.go", + ], + out = "sys_state.go", + package = "sys", +) + +go_library( + name = "sys", + srcs = [ + "device.go", + "fs.go", + "sys.go", + "sys_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/usermem", + "//pkg/state", + ], +) diff --git a/pkg/sentry/fs/sys/device.go b/pkg/sentry/fs/sys/device.go new file mode 100644 index 000000000..54e414d1b --- /dev/null +++ b/pkg/sentry/fs/sys/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys
+
+import "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+
+// sysfsDevice is the sysfs virtual device.
+var sysfsDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go
new file mode 100644
index 000000000..f25f648c3
--- /dev/null
+++ b/pkg/sentry/fs/sys/fs.go
@@ -0,0 +1,56 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// filesystem is a sysfs.
+type filesystem struct{}
+
+func init() {
+	fs.RegisterFilesystem(&filesystem{})
+}
+
+// FilesystemName is the name under which the filesystem is registered.
+// Name matches fs/sysfs/mount.c:sysfs_fs_type.name.
+const FilesystemName = "sysfs"
+
+// Name is the name of the file system.
+func (*filesystem) Name() string {
+	return FilesystemName
+}
+
+// AllowUserMount allows users to mount(2) this file system.
+func (*filesystem) AllowUserMount() bool { + return true +} + +// Flags returns that there is nothing special about this file system. +// +// In Linux, sysfs returns FS_USERNS_VISIBLE | FS_USERNS_MOUNT, see fs/sysfs/mount.c. +func (*filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns a sysfs root which can be positioned in the vfs. +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // device is always ignored. + // sysfs ignores data, see fs/sysfs/mount.c:sysfs_mount. + + return New(ctx, fs.NewNonCachingMountSource(f, flags)), nil +} diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go new file mode 100644 index 000000000..ccf56f644 --- /dev/null +++ b/pkg/sentry/fs/sys/sys.go @@ -0,0 +1,57 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sys implements a sysfs filesystem. 
+package sys + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type Dir struct { + ramfs.Dir +} + +func newDir(ctx context.Context, msrc *fs.MountSource, contents map[string]*fs.Inode) *fs.Inode { + d := &Dir{} + d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(d, msrc, fs.StableAttr{ + DeviceID: sysfsDevice.DeviceID(), + InodeID: sysfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialDirectory, + }) +} + +// New returns the root node of a partial simple sysfs. +func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + return newDir(ctx, msrc, map[string]*fs.Inode{ + // Add a basic set of top-level directories. In Linux, these + // are dynamically added depending on the KConfig. Here we just + // add the most common ones. + "block": newDir(ctx, msrc, nil), + "bus": newDir(ctx, msrc, nil), + "class": newDir(ctx, msrc, nil), + "dev": newDir(ctx, msrc, nil), + "devices": newDir(ctx, msrc, nil), + "firmware": newDir(ctx, msrc, nil), + "fs": newDir(ctx, msrc, nil), + "kernel": newDir(ctx, msrc, nil), + "module": newDir(ctx, msrc, nil), + "power": newDir(ctx, msrc, nil), + }) +} diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD new file mode 100644 index 000000000..7fddc29f4 --- /dev/null +++ b/pkg/sentry/fs/timerfd/BUILD @@ -0,0 +1,35 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "timerfd_state", + srcs = [ + "timerfd.go", + ], + out = "timerfd_state.go", + package = "timerfd", +) + +go_library( + name = "timerfd", + srcs = [ + "timerfd.go", + "timerfd_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd", + visibility = 
["//pkg/sentry:internal"], + deps = [ + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go new file mode 100644 index 000000000..ae58f6fd7 --- /dev/null +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -0,0 +1,144 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package timerfd implements the semantics of Linux timerfd objects as +// described by timerfd_create(2). +package timerfd + +import ( + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// TimerOperations implements fs.FileOperations for timerfds. 
+type TimerOperations struct { + fsutil.ZeroSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + events waiter.Queue `state:"nosave"` + timer *ktime.Timer + + // val is the number of timer expirations since the last successful call to + // Readv, Preadv, or SetTime. val is accessed using atomic memory + // operations. + val uint64 +} + +// NewFile returns a timerfd File that receives time from c. +func NewFile(ctx context.Context, c ktime.Clock) *fs.File { + dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[timerfd]") + tops := &TimerOperations{} + tops.timer = ktime.NewTimer(c, tops) + // Timerfds reject writes, but the Write flag must be set in order to + // ensure that our Writev/Pwritev methods actually get called to return + // the correct errors. + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, tops) +} + +// Release implements fs.FileOperations.Release. +func (t *TimerOperations) Release() { + t.timer.Destroy() +} + +// PauseTimer pauses the associated Timer. +func (t *TimerOperations) PauseTimer() { + t.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (t *TimerOperations) ResumeTimer() { + t.timer.Resume() +} + +// Clock returns the associated Timer's Clock. +func (t *TimerOperations) Clock() ktime.Clock { + return t.timer.Clock() +} + +// GetTime returns the associated Timer's setting and the time at which it was +// observed. +func (t *TimerOperations) GetTime() (ktime.Time, ktime.Setting) { + return t.timer.Get() +} + +// SetTime atomically changes the associated Timer's setting, resets the number +// of expirations to 0, and returns the previous setting and the time at which +// it was observed. 
+func (t *TimerOperations) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { + return t.timer.SwapAnd(s, func() { atomic.StoreUint64(&t.val, 0) }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (t *TimerOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + if atomic.LoadUint64(&t.val) != 0 { + ready |= waiter.EventIn + } + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (t *TimerOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + t.events.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (t *TimerOperations) EventUnregister(e *waiter.Entry) { + t.events.EventUnregister(e) +} + +// Read implements fs.FileOperations.Read. +func (t *TimerOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + const sizeofUint64 = 8 + if dst.NumBytes() < sizeofUint64 { + return 0, syserror.EINVAL + } + if val := atomic.SwapUint64(&t.val, 0); val != 0 { + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], val) + if _, err := dst.CopyOut(ctx, buf[:]); err != nil { + // Linux does not undo consuming the number of expirations even if + // writing to userspace fails. + return 0, err + } + return sizeofUint64, nil + } + return 0, syserror.ErrWouldBlock +} + +// Write implements fs.FileOperations.Write. +func (t *TimerOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EINVAL +} + +// Notify implements ktime.TimerListener.Notify. +func (t *TimerOperations) Notify(exp uint64) { + atomic.AddUint64(&t.val, exp) + t.events.Notify(waiter.EventIn) +} + +// Destroy implements ktime.TimerListener.Destroy. 
+func (t *TimerOperations) Destroy() {} diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD new file mode 100644 index 000000000..be4e695d3 --- /dev/null +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -0,0 +1,64 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "tmpfs_state", + srcs = [ + "file_regular.go", + "fs.go", + "inode_file.go", + "tmpfs.go", + ], + out = "tmpfs_state.go", + package = "tmpfs", +) + +go_library( + name = "tmpfs", + srcs = [ + "device.go", + "file_regular.go", + "fs.go", + "inode_file.go", + "tmpfs.go", + "tmpfs_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "tmpfs_test", + size = "small", + srcs = ["file_test.go"], + embed = [":tmpfs"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/tmpfs/device.go b/pkg/sentry/fs/tmpfs/device.go new file mode 100644 index 000000000..e588b3440 --- /dev/null +++ b/pkg/sentry/fs/tmpfs/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// tmpfsDevice is the kernel tmpfs device. +var tmpfsDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go new file mode 100644 index 000000000..9811d90bc --- /dev/null +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// regularFileOperations implements fs.FileOperations for a regular +// tmpfs file. 
+type regularFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.NoopRelease `state:"nosave"` + fsutil.GenericSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoopFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + // iops is the InodeOperations of a regular tmpfs file. It is + // guaranteed to be the same as file.Dirent.Inode.InodeOperations, + // see operations that take fs.File below. + iops *fileInodeOperations +} + +// Read implements fs.FileOperations.Read. +func (r *regularFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + return r.iops.read(ctx, dst, offset) +} + +// Write implements fs.FileOperations.Write. +func (r *regularFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + return r.iops.write(ctx, src, offset) +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (r *regularFileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + return fsutil.GenericConfigureMMap(file, r.iops, opts) +} diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go new file mode 100644 index 000000000..f064eb1ac --- /dev/null +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -0,0 +1,73 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "bytes" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func newFileInode(ctx context.Context) *fs.Inode { + m := fs.NewCachingMountSource(&Filesystem{}, fs.MountSourceFlags{}) + iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.WithCurrentTime(ctx, fs.UnstableAttr{}), platform.FromContext(ctx)) + return fs.NewInode(iops, m, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.RegularFile, + }) +} + +func newFile(ctx context.Context) *fs.File { + inode := newFileInode(ctx) + f, _ := inode.GetFile(ctx, fs.NewDirent(inode, "stub"), fs.FileFlags{Read: true, Write: true}) + return f +} + +// Allocate once, write twice. 
+func TestGrow(t *testing.T) { + ctx := contexttest.Context(t) + f := newFile(ctx) + defer f.DecRef() + + abuf := bytes.Repeat([]byte{'a'}, 68) + n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0) + if n != int64(len(abuf)) || err != nil { + t.Fatalf("DeprecatedPwritev got (%d, %v) want (%d, nil)", n, err, len(abuf)) + } + + bbuf := bytes.Repeat([]byte{'b'}, 856) + n, err = f.Pwritev(ctx, usermem.BytesIOSequence(bbuf), 68) + if n != int64(len(bbuf)) || err != nil { + t.Fatalf("DeprecatedPwritev got (%d, %v) want (%d, nil)", n, err, len(bbuf)) + } + + rbuf := make([]byte, len(abuf)+len(bbuf)) + n, err = f.Preadv(ctx, usermem.BytesIOSequence(rbuf), 0) + if n != int64(len(rbuf)) || err != nil { + t.Fatalf("DeprecatedPreadv got (%d, %v) want (%d, nil)", n, err, len(rbuf)) + } + + if want := append(abuf, bbuf...); !bytes.Equal(rbuf, want) { + t.Fatalf("Read %v, want %v", rbuf, want) + } +} diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go new file mode 100644 index 000000000..639a19b0d --- /dev/null +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -0,0 +1,131 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "fmt" + "regexp" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +const ( + // Set initial permissions for the root directory. + modeKey = "mode" + + // UID for the root directory. + rootUIDKey = "uid" + + // GID for the root directory. + rootGIDKey = "gid" + + // TODO: support a tmpfs size limit. + // size = "size" + + // default permissions are read/write/execute. + defaultMode = 0777 +) + +// modeRegexp is the expected format of the mode option. +var modeRegexp = regexp.MustCompile("0[0-7][0-7][0-7]") + +// Filesystem is a tmpfs. +type Filesystem struct{} + +func init() { + fs.RegisterFilesystem(&Filesystem{}) +} + +// FilesystemName is the name underwhich the filesystem is registered. +// Name matches mm/shmem.c:shmem_fs_type.name. +const FilesystemName = "tmpfs" + +// Name is the name of the file system. +func (*Filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount allows users to mount(2) this file system. +func (*Filesystem) AllowUserMount() bool { + return true +} + +// Flags returns that there is nothing special about this file system. +// +// In Linux, tmpfs returns FS_USERNS_MOUNT, see mm/shmem.c. +func (*Filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns a tmpfs root that can be positioned in the vfs. +func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // device is always ignored. + + // Parse generic comma-separated key=value options, this file system expects them. + options := fs.GenericMountSourceOptions(data) + + // Parse the root directory permissions. 
+ perms := fs.FilePermsFromMode(defaultMode) + if m, ok := options[modeKey]; ok { + if !modeRegexp.MatchString(m) { + return nil, fmt.Errorf("unsupported mode value: 'mode=%s'", m) + } + // It's basically impossible that we error out at this point, + // maybe we should panic. + i, err := strconv.ParseUint(m, 8, 32) + if err != nil { + return nil, fmt.Errorf("mode value not parsable 'mode=%s': %v", m, err) + } + perms = fs.FilePermsFromMode(linux.FileMode(i)) + delete(options, modeKey) + } + + creds := auth.CredentialsFromContext(ctx) + owner := fs.FileOwnerFromContext(ctx) + if uidstr, ok := options[rootUIDKey]; ok { + uid, err := strconv.ParseInt(uidstr, 10, 32) + if err != nil { + return nil, fmt.Errorf("uid value not parsable 'uid=%d': %v", uid, err) + } + owner.UID = creds.UserNamespace.MapToKUID(auth.UID(uid)) + delete(options, rootUIDKey) + } + + if gidstr, ok := options[rootGIDKey]; ok { + gid, err := strconv.ParseInt(gidstr, 10, 32) + if err != nil { + return nil, fmt.Errorf("gid value not parsable 'gid=%d': %v", gid, err) + } + owner.GID = creds.UserNamespace.MapToKGID(auth.GID(gid)) + delete(options, rootGIDKey) + } + + // Fail if the caller passed us more options than we can parse. They may be + // expecting us to set something we can't set. + if len(options) > 0 { + return nil, fmt.Errorf("unsupported mount options: %v", options) + } + + // Construct a mount which will cache dirents. + msrc := fs.NewCachingMountSource(f, flags) + + // Construct the tmpfs root. + return NewDir(ctx, nil, owner, perms, msrc, platform.FromContext(ctx)), nil +} diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go new file mode 100644 index 000000000..66bc934ae --- /dev/null +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -0,0 +1,492 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"io"
	"sync"

	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// fileInodeOperations implements fs.InodeOperations for a regular tmpfs file.
// These files are backed by FrameRegions allocated from a platform.Memory,
// and may be directly mapped.
//
// The tmpfs file memory is backed by FrameRegions, each of which is reference
// counted. frames maintains a single reference on each of the FrameRegions.
// Since these contain the contents of the file, the reference may only be
// decremented once this file is both deleted and all handles to the file have
// been closed.
//
// Mappable users may also call IncRefOn/DecRefOn, generally to indicate that
// they plan to use MapInto to map the file into an AddressSpace. These calls
// include an InvalidatorRegion associated with that reference. When the
// referenced portion of the file is removed (with Truncate), the associated
// InvalidatorRegion is invalidated.
//
// Lock order: attrMu -> mapsMu -> dataMu (see field comments below).
type fileInodeOperations struct {
	fsutil.DeprecatedFileOperations `state:"nosave"`
	fsutil.InodeNotDirectory        `state:"nosave"`
	fsutil.InodeNotSocket           `state:"nosave"`
	fsutil.InodeNotSymlink          `state:"nosave"`
	fsutil.NoopWriteOut             `state:"nosave"`

	// platform is used to allocate memory that stores the file's contents.
	platform platform.Platform

	// memUsage is the default memory usage that will be reported by this file.
	memUsage usage.MemoryKind

	// attrMu protects attr.
	attrMu sync.Mutex `state:"nosave"`

	// attr contains the unstable metadata for the file.
	//
	// attr is protected by attrMu. attr.Unstable.Size is protected by both
	// attrMu and dataMu; reading it requires locking either mutex, while
	// mutating it requires locking both.
	attr fsutil.InMemoryAttributes

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the file into memmap.MappingSpaces.
	//
	// mappings is protected by mapsMu.
	mappings memmap.MappingSet

	// dataMu protects data (and, jointly with attrMu, attr.Unstable.Size).
	dataMu sync.RWMutex `state:"nosave"`

	// data maps offsets into the file to offsets into platform.Memory() that
	// store the file's data.
	//
	// data is protected by dataMu.
	data fsutil.FileRangeSet
}

// NewInMemoryFile returns a new file backed by p.Memory().
//
// usage determines the memory-accounting category for the file's frames;
// uattr provides the initial unstable attributes.
func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr, p platform.Platform) fs.InodeOperations {
	return &fileInodeOperations{
		attr: fsutil.InMemoryAttributes{
			Unstable: uattr,
		},
		platform: p,
		memUsage: usage,
	}
}

// Release implements fs.InodeOperations.Release. It drops all memory frames
// backing the file's contents.
func (f *fileInodeOperations) Release(context.Context) {
	f.dataMu.Lock()
	defer f.dataMu.Unlock()
	f.data.DropAll(f.platform.Memory())
}

// Mappable implements fs.InodeOperations.Mappable. The file itself is the
// memmap.Mappable (see AddMapping/Translate below).
func (f *fileInodeOperations) Mappable(*fs.Inode) memmap.Mappable {
	return f
}

// Rename implements fs.InodeOperations.Rename.
+func (*fileInodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, d, flags, ®ularFileOperations{iops: f}), nil +} + +// UnstableAttr returns unstable attributes of this tmpfs file. +func (f *fileInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + f.attrMu.Lock() + defer f.attrMu.Unlock() + f.dataMu.RLock() + defer f.dataMu.RUnlock() + attr := f.attr.Unstable + attr.Usage = int64(f.data.Span()) + return attr, nil +} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (f *fileInodeOperations) Getxattr(inode *fs.Inode, name string) ([]byte, error) { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.Getxattr(name) +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (f *fileInodeOperations) Setxattr(inode *fs.Inode, name string, value []byte) error { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.Setxattr(name, value) +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (f *fileInodeOperations) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.Listxattr() +} + +// Check implements fs.InodeOperations.Check. +func (f *fileInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. 
// SetPermissions implements fs.InodeOperations.SetPermissions.
func (f *fileInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool {
	f.attrMu.Lock()
	defer f.attrMu.Unlock()
	return f.attr.SetPermissions(ctx, p)
}

// SetTimestamps implements fs.InodeOperations.SetTimestamps.
func (f *fileInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
	f.attrMu.Lock()
	defer f.attrMu.Unlock()
	return f.attr.SetTimestamps(ctx, ts)
}

// SetOwner implements fs.InodeOperations.SetOwner.
func (f *fileInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
	f.attrMu.Lock()
	defer f.attrMu.Unlock()
	return f.attr.SetOwner(ctx, owner)
}

// Truncate implements fs.InodeOperations.Truncate.
//
// Growing the file only updates Size. Shrinking additionally invalidates
// existing translations of the truncated pages (so no mapping can still
// reach them) and then drops the backing frames. Locks are taken in the
// order attrMu -> mapsMu -> dataMu; note dataMu is released before taking
// mapsMu and reacquired after, to respect that ordering.
func (f *fileInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
	f.attrMu.Lock()
	defer f.attrMu.Unlock()

	// Both attrMu and dataMu must be held to mutate Size.
	f.dataMu.Lock()
	oldSize := f.attr.Unstable.Size
	if oldSize != size {
		f.attr.Unstable.Size = size
		f.attr.TouchModificationTime(ctx)
	}
	f.dataMu.Unlock()

	// Nothing left to do unless shrinking the file.
	if oldSize <= size {
		return nil
	}

	oldpgend := fs.OffsetPageEnd(oldSize)
	newpgend := fs.OffsetPageEnd(size)

	// Invalidate past translations of truncated pages.
	if newpgend != oldpgend {
		f.mapsMu.Lock()
		f.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/shmem.c:shmem_setattr() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		f.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated pages,
	// and can remove them.
	f.dataMu.Lock()
	defer f.dataMu.Unlock()
	f.data.Truncate(uint64(size), f.platform.Memory())

	return nil
}

// AddLink implements fs.InodeOperations.AddLink.
+func (f *fileInodeOperations) AddLink() { + f.attrMu.Lock() + f.attr.Unstable.Links++ + f.attrMu.Unlock() +} + +// DropLink implements fs.InodeOperations.DropLink. +func (f *fileInodeOperations) DropLink() { + f.attrMu.Lock() + f.attr.Unstable.Links-- + f.attrMu.Unlock() +} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (f *fileInodeOperations) NotifyStatusChange(ctx context.Context) { + f.attrMu.Lock() + f.attr.TouchStatusChangeTime(ctx) + f.attrMu.Unlock() +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (*fileInodeOperations) IsVirtual() bool { + return true +} + +// StatFS implements fs.InodeOperations.StatFS. +func (*fileInodeOperations) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} + +func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + // Zero length reads for tmpfs are no-ops. + if dst.NumBytes() == 0 { + return 0, nil + } + + // Have we reached EOF? We check for this again in + // fileReadWriter.ReadToBlocks to avoid holding f.attrMu (which would + // serialize reads) or f.dataMu (which would violate lock ordering), but + // check here first (before calling into MM) since reading at EOF is + // common: getting a return value of 0 from a read syscall is the only way + // to detect EOF. + // + // TODO: Separate out f.attr.Size and use atomics instead of + // f.dataMu. + f.dataMu.RLock() + size := f.attr.Unstable.Size + f.dataMu.RUnlock() + if offset >= size { + return 0, io.EOF + } + + n, err := dst.CopyOutFrom(ctx, &fileReadWriter{f, offset}) + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + f.attrMu.Lock() + f.attr.TouchAccessTime(ctx) + f.attrMu.Unlock() + return n, err +} + +func (f *fileInodeOperations) write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // Zero length writes for tmpfs are no-ops. 
+ if src.NumBytes() == 0 { + return 0, nil + } + + f.attrMu.Lock() + defer f.attrMu.Unlock() + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). + f.attr.TouchModificationTime(ctx) + return src.CopyInTo(ctx, &fileReadWriter{f, offset}) +} + +type fileReadWriter struct { + f *fileInodeOperations + offset int64 +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + rw.f.dataMu.RLock() + defer rw.f.dataMu.RUnlock() + + // Compute the range to read. + if rw.offset >= rw.f.attr.Unstable.Size { + return 0, io.EOF + } + end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.f.attr.Unstable.Size) + if end == rw.offset { // dsts.NumBytes() == 0? + return 0, nil + } + + mem := rw.f.platform.Memory() + var done uint64 + seg, gap := rw.f.data.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Tmpfs holes are zero-filled. + gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := safemem.ZeroSeq(dst) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. 
+func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + rw.f.dataMu.Lock() + defer rw.f.dataMu.Unlock() + + // Compute the range to write. + end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) + if end == rw.offset { // srcs.NumBytes() == 0? + return 0, nil + } + + defer func() { + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.offset > rw.f.attr.Unstable.Size { + rw.f.attr.Unstable.Size = rw.offset + } + }() + + mem := rw.f.platform.Memory() + // Page-aligned mr for when we need to allocate memory. RoundUp can't + // overflow since end is an int64. + pgstartaddr := usermem.Addr(rw.offset).RoundDown() + pgendaddr, _ := usermem.Addr(end).RoundUp() + pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)} + + var done uint64 + seg, gap := rw.f.data.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) + if err != nil { + return done, err + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Allocate memory for the write. + gapMR := gap.Range().Intersect(pgMR) + fr, err := mem.Allocate(gapMR.Length(), rw.f.memUsage) + if err != nil { + return done, err + } + + // Write to that memory as usual. + seg, gap = rw.f.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// AddMapping implements memmap.Mappable.AddMapping. 
// AddMapping implements memmap.Mappable.AddMapping.
func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error {
	f.mapsMu.Lock()
	defer f.mapsMu.Unlock()
	f.mappings.AddMapping(ms, ar, offset)
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) {
	f.mapsMu.Lock()
	defer f.mapsMu.Unlock()
	f.mappings.RemoveMapping(ms, ar, offset)
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (f *fileInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error {
	// Copying a mapping is equivalent to adding one at the destination range.
	return f.AddMapping(ctx, ms, dstAR, offset)
}

// Translate implements memmap.Mappable.Translate.
//
// Offsets beyond the (page-rounded) file size translate to a BusError;
// in-range holes are backed by newly allocated zeroed frames via data.Fill.
func (f *fileInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
	f.dataMu.Lock()
	defer f.dataMu.Unlock()

	// Constrain translations to f.attr.Unstable.Size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := fs.OffsetPageEnd(f.attr.Unstable.Size)
	var buserr error
	if required.End > pgend {
		buserr = &memmap.BusError{io.EOF}
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	mem := f.platform.Memory()
	cerr := f.data.Fill(ctx, required, optional, mem, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
		// Newly-allocated pages are zeroed, so we don't need to do anything.
		return dsts.NumBytes(), nil
	})

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := f.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   mem,
			Offset: seg.FileRangeOf(segMR).Start,
		})
		translatedEnd = segMR.End
	}

	// Don't return the error returned by f.data.Fill if it occurred outside of
	// required.
	if translatedEnd < required.End && cerr != nil {
		return ts, cerr
	}
	return ts, buserr
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
// Nothing here is unsavable, so there is nothing to invalidate.
func (f *fileInodeOperations) InvalidateUnsavable(ctx context.Context) error {
	return nil
}

// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tmpfs is a filesystem implementation backed by memory.
package tmpfs

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
)

// fsInfo is the fs.Info returned by every tmpfs StatFS implementation.
var fsInfo = fs.Info{
	Type: linux.TMPFS_MAGIC,

	// TODO: allow configuring a tmpfs size and enforce it.
	TotalBlocks: 0,
	FreeBlocks:  0,
}

// rename implements fs.InodeOperations.Rename for tmpfs nodes.
//
// Both parents must be tmpfs directories; anything else is treated as a
// cross-device rename and rejected.
func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error {
	op, ok := oldParent.InodeOperations.(*Dir)
	if !ok {
		return ramfs.ErrCrossDevice
	}
	np, ok := newParent.InodeOperations.(*Dir)
	if !ok {
		return ramfs.ErrCrossDevice
	}
	return ramfs.Rename(ctx, &op.Dir, oldName, &np.Dir, newName)
}

// Dir is a directory.
type Dir struct {
	ramfs.Dir

	// platform is used to allocate storage for tmpfs Files.
	platform platform.Platform
}

// NewDir returns a new directory inode with the given contents, owner and
// permissions.
func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource, platform platform.Platform) *fs.Inode {
	d := &Dir{platform: platform}
	d.InitDir(ctx, contents, owner, perms)

	// Manually set the CreateOps.
	d.CreateOps = d.newCreateOps()

	return fs.NewInode(d, msrc, fs.StableAttr{
		DeviceID:  tmpfsDevice.DeviceID(),
		InodeID:   tmpfsDevice.NextIno(),
		BlockSize: usermem.PageSize,
		Type:      fs.Directory,
	})
}

// afterLoad is invoked by stateify.
func (d *Dir) afterLoad() {
	// Per NewDir, manually set the CreateOps.
	// NOTE(review): presumably CreateOps (function values) does not survive
	// save/restore and must be rebuilt here — confirm against the state
	// package's handling of funcs.
	d.Dir.CreateOps = d.newCreateOps()
}

// newCreateOps builds the custom CreateOps for this Dir.
// newCreateOps builds the custom CreateOps for this Dir: every node type
// created inside a tmpfs directory is itself a tmpfs node, allocated on the
// same platform and counted against the same device.
func (d *Dir) newCreateOps() *ramfs.CreateOps {
	return &ramfs.CreateOps{
		NewDir: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) {
			return NewDir(ctx, nil, fs.FileOwnerFromContext(ctx), perms, dir.MountSource, d.platform), nil
		},
		NewFile: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) {
			uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{
				Owner: fs.FileOwnerFromContext(ctx),
				Perms: perms,
				// Always start unlinked.
				Links: 0,
			})
			iops := NewInMemoryFile(ctx, usage.Tmpfs, uattr, d.platform)
			return fs.NewInode(iops, dir.MountSource, fs.StableAttr{
				DeviceID:  tmpfsDevice.DeviceID(),
				InodeID:   tmpfsDevice.NextIno(),
				BlockSize: usermem.PageSize,
				Type:      fs.RegularFile,
			}), nil
		},
		NewSymlink: func(ctx context.Context, dir *fs.Inode, target string) (*fs.Inode, error) {
			return NewSymlink(ctx, target, fs.FileOwnerFromContext(ctx), dir.MountSource), nil
		},
		NewBoundEndpoint: func(ctx context.Context, dir *fs.Inode, socket unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Inode, error) {
			return NewSocket(ctx, socket, fs.FileOwnerFromContext(ctx), perms, dir.MountSource), nil
		},
		NewFifo: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) {
			return NewFifo(ctx, fs.FileOwnerFromContext(ctx), perms, dir.MountSource), nil
		},
	}
}

// Rename implements fs.InodeOperations.Rename.
func (d *Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error {
	return rename(ctx, oldParent, oldName, newParent, newName)
}

// StatFS implements fs.InodeOperations.StatFS.
func (*Dir) StatFS(context.Context) (fs.Info, error) {
	return fsInfo, nil
}

// Symlink is a symlink.
type Symlink struct {
	ramfs.Symlink
}

// NewSymlink returns a new symlink with the provided permissions.
+func NewSymlink(ctx context.Context, target string, owner fs.FileOwner, msrc *fs.MountSource) *fs.Inode { + s := &Symlink{} + s.InitSymlink(ctx, owner, target) + return fs.NewInode(s, msrc, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Symlink, + }) +} + +// Rename implements fs.InodeOperations.Rename. +func (s *Symlink) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// StatFS returns the tmpfs info. +func (s *Symlink) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} + +// Socket is a socket. +type Socket struct { + ramfs.Socket +} + +// NewSocket returns a new socket with the provided permissions. +func NewSocket(ctx context.Context, socket unix.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { + s := &Socket{} + s.InitSocket(ctx, socket, owner, perms) + return fs.NewInode(s, msrc, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Socket, + }) +} + +// Rename implements fs.InodeOperations.Rename. +func (s *Socket) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// StatFS returns the tmpfs info. +func (s *Socket) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} + +// Fifo is a tmpfs named pipe. +type Fifo struct { + ramfs.Entry +} + +// NewFifo creates a new named pipe. 
+func NewFifo(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { + f := &Fifo{} + f.InitEntry(ctx, owner, perms) + iops := pipe.NewInodeOperations(f, pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)) + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Pipe, + }) +} + +// Rename implements fs.InodeOperations.Rename. +func (f *Fifo) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// StatFS returns the tmpfs info. +func (*Fifo) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD new file mode 100644 index 000000000..90b350410 --- /dev/null +++ b/pkg/sentry/fs/tty/BUILD @@ -0,0 +1,63 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "tty_state", + srcs = [ + "dir.go", + "fs.go", + "inode.go", + "line_discipline.go", + "master.go", + "slave.go", + "terminal.go", + ], + out = "tty_state.go", + package = "tty", +) + +go_library( + name = "tty", + srcs = [ + "dir.go", + "fs.go", + "inode.go", + "line_discipline.go", + "master.go", + "slave.go", + "terminal.go", + "tty_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + 
+go_test( + name = "tty_test", + size = "small", + srcs = ["tty_test.go"], + embed = [":tty"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go new file mode 100644 index 000000000..2c5b2aed6 --- /dev/null +++ b/pkg/sentry/fs/tty/dir.go @@ -0,0 +1,398 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tty provide pseudoterminals via a devpts filesystem. +package tty + +import ( + "fmt" + "math" + "strconv" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// dirInodeOperations is the root of a devpts mount. +// +// This indirectly manages all terminals within the mount. +// +// New Terminals are created by masterInodeOperations.GetFile, which registers +// the slave Inode in the this directory for discovery via Lookup/Readdir. The +// slave inode is unregistered when the master file is Released, as the slave +// is no longer discoverable at that point. 
//
// References on the underlying Terminal are held by masterFileOperations and
// slaveInodeOperations.
//
// masterInodeOperations and slaveInodeOperations hold a pointer to
// dirInodeOperations, which is reference counted by the refcount their
// corresponding Dirents hold on their parent (this directory).
//
// dirInodeOperations implements fs.InodeOperations.
type dirInodeOperations struct {
	fsutil.DeprecatedFileOperations
	fsutil.InodeNotSocket
	fsutil.InodeNotRenameable
	fsutil.InodeNotSymlink
	fsutil.InodeNoExtendedAttributes
	fsutil.NoMappable
	fsutil.NoopWriteOut

	// msrc is the super block this directory is on.
	//
	// TODO: Plumb this through instead of storing it here.
	msrc *fs.MountSource

	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// attr contains the UnstableAttrs.
	attr fsutil.InMemoryAttributes

	// master is the master PTY inode.
	master *fs.Inode

	// slaves contains the slave inodes reachable from the directory.
	//
	// A new slave is added by allocateTerminal and is removed by
	// masterFileOperations.Release.
	//
	// A reference is held on every slave in the map.
	slaves map[uint32]*fs.Inode

	// dentryMap is a SortedDentryMap used to implement Readdir containing
	// the master and all entries in slaves.
	dentryMap *fs.SortedDentryMap

	// next is the next pty index to use.
	//
	// TODO: reuse indices when ptys are closed.
	next uint32
}

// Compile-time check that dirInodeOperations satisfies fs.InodeOperations.
var _ fs.InodeOperations = (*dirInodeOperations)(nil)

// newDir creates a new dir with a ptmx file and no terminals.
// newDir creates a new dir with a ptmx file and no terminals.
func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
	d := &dirInodeOperations{
		attr: fsutil.InMemoryAttributes{
			Unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{
				Owner: fs.RootOwner,
				Perms: fs.FilePermsFromMode(0555),
			}),
		},
		msrc:      m,
		slaves:    make(map[uint32]*fs.Inode),
		dentryMap: fs.NewSortedDentryMap(nil),
	}
	// Linux devpts uses a default mode of 0000 for ptmx which can be
	// changed with the ptmxmode mount option. However, that default is not
	// useful here (since we'd *always* need the mount option, so it is
	// accessible by default).
	d.master = newMasterInode(ctx, d, fs.RootOwner, fs.FilePermsFromMode(0666))
	d.dentryMap.Add("ptmx", fs.DentAttr{
		Type:    d.master.StableAttr.Type,
		InodeID: d.master.StableAttr.InodeID,
	})

	return fs.NewInode(d, m, fs.StableAttr{
		DeviceID: ptsDevice.DeviceID(),
		// N.B. Linux always uses inode id 1 for the directory. See
		// fs/devpts/inode.c:devpts_fill_super.
		//
		// TODO: Since ptsDevice must be shared between
		// different mounts, we must not assign fixed numbers.
		InodeID:   ptsDevice.NextIno(),
		BlockSize: usermem.PageSize,
		Type:      fs.Directory,
	})
}

// Release implements fs.InodeOperations.Release.
//
// Drops the reference on the master inode. Every slave must already have
// been released (each master Release unregisters its slave); remaining
// slaves indicate a reference-counting bug, hence the panic.
func (d *dirInodeOperations) Release(ctx context.Context) {
	d.master.DecRef()
	if len(d.slaves) != 0 {
		panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
	}
}

// Lookup implements fs.InodeOperations.Lookup.
//
// The only entries are "ptmx" and the decimal index of each live slave.
func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) {
	d.mu.Lock()
	defer d.mu.Unlock()

	// Master?
	if name == "ptmx" {
		d.master.IncRef()
		return fs.NewDirent(d.master, name), nil
	}

	// Slave number?
	n, err := strconv.ParseUint(name, 10, 32)
	if err != nil {
		// Not found.
		return nil, syserror.ENOENT
	}

	s, ok := d.slaves[uint32(n)]
	if !ok {
		return nil, syserror.ENOENT
	}

	s.IncRef()
	return fs.NewDirent(s, name), nil
}

// Create implements fs.InodeOperations.Create.
//
// Creation is never allowed.
func (d *dirInodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) {
	return nil, syserror.EACCES
}

// CreateDirectory implements fs.InodeOperations.CreateDirectory.
//
// Creation is never allowed.
func (d *dirInodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
	return syserror.EACCES
}

// CreateLink implements fs.InodeOperations.CreateLink.
//
// Creation is never allowed.
func (d *dirInodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error {
	return syserror.EACCES
}

// CreateHardLink implements fs.InodeOperations.CreateHardLink.
//
// Creation is never allowed.
func (d *dirInodeOperations) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error {
	return syserror.EACCES
}

// CreateFifo implements fs.InodeOperations.CreateFifo.
//
// Creation is never allowed.
func (d *dirInodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error {
	return syserror.EACCES
}

// Remove implements fs.InodeOperations.Remove.
//
// Removal is never allowed.
func (d *dirInodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error {
	return syserror.EPERM
}

// RemoveDirectory implements fs.InodeOperations.RemoveDirectory.
//
// Removal is never allowed.
func (d *dirInodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error {
	return syserror.EPERM
}

// Bind implements fs.InodeOperations.Bind.
+func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { + return syserror.EPERM +} + +// GetFile implements fs.InodeOperations.GetFile. +func (d *dirInodeOperations) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &dirFileOperations{di: d}), nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (d *dirInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + d.mu.Lock() + defer d.mu.Unlock() + return d.attr.Unstable, nil +} + +// Check implements fs.InodeOperations.Check. +func (d *dirInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (d *dirInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + d.mu.Lock() + defer d.mu.Unlock() + return d.attr.SetPermissions(ctx, p) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (d *dirInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + d.mu.Lock() + defer d.mu.Unlock() + return d.attr.SetOwner(ctx, owner) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (d *dirInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + d.mu.Lock() + defer d.mu.Unlock() + return d.attr.SetTimestamps(ctx, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (d *dirInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + return syserror.EINVAL +} + +// AddLink implements fs.InodeOperations.AddLink. +func (d *dirInodeOperations) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. 
+func (d *dirInodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (d *dirInodeOperations) NotifyStatusChange(ctx context.Context) { + d.mu.Lock() + defer d.mu.Unlock() + + d.attr.TouchStatusChangeTime(ctx) +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (d *dirInodeOperations) IsVirtual() bool { + return true +} + +// StatFS implements fs.InodeOperations.StatFS. +func (d *dirInodeOperations) StatFS(ctx context.Context) (fs.Info, error) { + return fs.Info{ + Type: linux.DEVPTS_SUPER_MAGIC, + }, nil +} + +// allocateTerminal creates a new Terminal and installs a pts node for it. +// +// The caller must call DecRef when done with the returned Terminal. +func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, error) { + d.mu.Lock() + defer d.mu.Unlock() + + n := d.next + if n == math.MaxUint32 { + return nil, syserror.ENOMEM + } + + if _, ok := d.slaves[n]; ok { + panic(fmt.Sprintf("pty index collision; index %d already exists", n)) + } + + t := newTerminal(ctx, d, n) + d.next++ + + // The reference returned by newTerminal is returned to the caller. + // Take another for the slave inode. + t.IncRef() + + // Create a pts node. The owner is based on the context that opens + // ptmx. + creds := auth.CredentialsFromContext(ctx) + uid, gid := creds.EffectiveKUID, creds.EffectiveKGID + slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666)) + + d.slaves[n] = slave + d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{ + Type: slave.StableAttr.Type, + InodeID: slave.StableAttr.InodeID, + }) + + return t, nil +} + +// masterClose is called when the master end of t is closed. +func (d *dirInodeOperations) masterClose(t *Terminal) { + d.mu.Lock() + defer d.mu.Unlock() + + // The slave end disappears from the directory when the master end is + // closed, even if the slave end is open elsewhere. + // + // N.B. 
since we're using a backdoor method to remove a directory entry + // we won't properly fire inotify events like Linux would. + s, ok := d.slaves[t.n] + if !ok { + panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d)) + } + + s.DecRef() + delete(d.slaves, t.n) + d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10)) +} + +// dirFileOperations are the fs.FileOperations for the directory. +// +// This is nearly identical to fsutil.DirFileOperations, except that it takes +// df.di.mu in IterateDir. +type dirFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.NoopRelease `state:"nosave"` + fsutil.GenericSeek `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + // di is the inode operations. + di *dirInodeOperations + + // dirCursor contains the name of the last directory entry that was + // serialized. + dirCursor string +} + +var _ fs.FileOperations = (*dirFileOperations)(nil) + +// IterateDir implements DirIterator.IterateDir. +func (df *dirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + df.di.mu.Lock() + defer df.di.mu.Unlock() + + n, err := fs.GenericReaddir(dirCtx, df.di.dentryMap) + return offset + n, err +} + +// Readdir implements FileOperations.Readdir. +func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &df.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, df, root, dirCtx, file.Offset()) +} + +// Read implements FileOperations.Read +func (df *dirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileOperations.Write. 
+func (df *dirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+	return 0, syserror.EISDIR
+}
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go
new file mode 100644
index 000000000..f5e7a3162
--- /dev/null
+++ b/pkg/sentry/fs/tty/fs.go
@@ -0,0 +1,95 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tty
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/device"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// ptsDevice is the pseudo-filesystem device.
+var ptsDevice = device.NewAnonDevice()
+
+// filesystem is a devpts filesystem.
+//
+// This devpts is always in the new "multi-instance" mode. i.e., it contains a
+// ptmx device tied to this mount.
+type filesystem struct{}
+
+func init() {
+	fs.RegisterFilesystem(&filesystem{})
+}
+
+// Name matches drivers/devpts/inode.c:devpts_fs_type.name.
+func (*filesystem) Name() string {
+	return "devpts"
+}
+
+// AllowUserMount allows users to mount(2) this file system.
+func (*filesystem) AllowUserMount() bool {
+	// TODO: Users may mount this once the terminals are in a
+	// usable state.
+	return false
+}
+
+// Flags returns that there is nothing special about this file system.
+func (*filesystem) Flags() fs.FilesystemFlags {
+	return 0
+}
+
+// Mount returns a devpts root that can be positioned in the vfs.
+func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) {
+	// device is always ignored.
+
+	// No options are supported.
+	if data != "" {
+		return nil, syserror.EINVAL
+	}
+
+	return newDir(ctx, fs.NewMountSource(&superOperations{}, f, flags)), nil
+}
+
+// superOperations implements fs.MountSourceOperations, preventing caching.
+type superOperations struct{}
+
+// Revalidate implements fs.DirentOperations.Revalidate.
+//
+// It always returns true, forcing a Lookup for all entries.
+//
+// Slave entries are dropped from dir when their master is closed, so an
+// existing slave Dirent in the tree is not sufficient to guarantee that it
+// still exists on the filesystem.
+func (superOperations) Revalidate(*fs.Dirent) bool {
+	return true
+}
+
+// Keep implements fs.DirentOperations.Keep.
+//
+// Keep returns false because Revalidate would force a lookup on cached entries
+// anyway.
+func (superOperations) Keep(*fs.Dirent) bool {
+	return false
+}
+
+// ResetInodeMappings implements MountSourceOperations.ResetInodeMappings.
+func (superOperations) ResetInodeMappings() {}
+
+// SaveInodeMapping implements MountSourceOperations.SaveInodeMapping.
+func (superOperations) SaveInodeMapping(*fs.Inode, string) {}
+
+// Destroy implements MountSourceOperations.Destroy.
+func (superOperations) Destroy() {}
diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go
new file mode 100644
index 000000000..04b9a7727
--- /dev/null
+++ b/pkg/sentry/fs/tty/inode.go
@@ -0,0 +1,143 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations are the base fs.InodeOperations for master and slave Inodes. +// +// inodeOperations does not implement: +// +// * fs.InodeOperations.Release +// * fs.InodeOperations.GetFile +type inodeOperations struct { + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // uattr is the inode's UnstableAttr. + uattr fs.UnstableAttr +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + i.mu.Lock() + defer i.mu.Unlock() + return i.uattr, nil +} + +// Check implements fs.InodeOperations.Check. 
+func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + i.mu.Lock() + defer i.mu.Unlock() + i.uattr.Perms = p + i.uattr.StatusChangeTime = ktime.NowFromContext(ctx) + return true +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + i.mu.Lock() + defer i.mu.Unlock() + if owner.UID.Ok() { + i.uattr.Owner.UID = owner.UID + } + if owner.GID.Ok() { + i.uattr.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + i.mu.Lock() + defer i.mu.Unlock() + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATime.IsZero() { + i.uattr.AccessTime = now + } else { + i.uattr.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTime.IsZero() { + i.uattr.ModificationTime = now + } else { + i.uattr.ModificationTime = ts.MTime + } + } + i.uattr.StatusChangeTime = now + return nil +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + return syserror.EINVAL +} + +// AddLink implements fs.InodeOperations.AddLink. +func (i *inodeOperations) AddLink() { +} + +// DropLink implements fs.InodeOperations.DropLink. +func (i *inodeOperations) DropLink() { +} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. 
+func (i *inodeOperations) NotifyStatusChange(ctx context.Context) { + i.mu.Lock() + defer i.mu.Unlock() + i.uattr.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (i *inodeOperations) IsVirtual() bool { + return true +} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { + return fs.Info{ + Type: linux.DEVPTS_SUPER_MAGIC, + }, nil +} diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go new file mode 100644 index 000000000..fde4e7941 --- /dev/null +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -0,0 +1,342 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "bytes" + "sync" + "unicode/utf8" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + spacesPerTab = 8 +) + +// lineDiscipline dictates how input and output are handled between the +// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// modify control characters (e.g. Ctrl-C for SIGINT), etc. 
The following man +// pages are good resources for how to affect the line discipline: +// +// * termios(3) +// * tty_ioctl(4) +// +// This file corresponds most closely to drivers/tty/n_tty.c. +// +// lineDiscipline has a simple structure but supports a multitude of options +// (see the above man pages). It consists of two queues of bytes: one from the +// terminal master to slave (the input queue) and one from slave to master (the +// output queue). When bytes are written to one end of the pty, the line +// discipline reads the bytes, modifies them or takes special action if +// required, and enqueues them to be read by the other end of the pty: +// +// input from terminal +-------------+ input to process (e.g. bash) +// +------------------------>| input queue |---------------------------+ +// | +-------------+ | +// | | +// | v +// masterFD slaveFD +// ^ | +// | | +// | output to terminal +--------------+ output from process | +// +------------------------| output queue |<--------------------------+ +// +--------------+ +// +// Lock order: +// inMu +// outMu +// termiosMu +type lineDiscipline struct { + // inMu protects inQueue. + inMu sync.Mutex `state:"nosave"` + + // inQueue is the input queue of the terminal. + inQueue queue + + // outMu protects outQueue. + outMu sync.Mutex `state:"nosave"` + + // outQueue is the output queue of the terminal. + outQueue queue + + // termiosMu protects termios. + termiosMu sync.Mutex `state:"nosave"` + + // termios is the terminal configuration used by the lineDiscipline. + termios linux.KernelTermios + + // column is the location in a row of the cursor. This is important for + // handling certain special characters like backspace. + column int +} + +// getTermios gets the linux.Termios for the tty. +func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + // We must copy a Termios struct, not KernelTermios. 
+ t := l.termios.ToTermios() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// setTermios sets a linux.Termios for the tty. +func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + // We must copy a Termios struct, not KernelTermios. + var t linux.Termios + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + l.termios.FromTermios(t) + return 0, err +} + +func (l *lineDiscipline) masterReadiness() waiter.EventMask { + l.inMu.Lock() + defer l.inMu.Unlock() + l.outMu.Lock() + defer l.outMu.Unlock() + return l.inQueue.writeReadiness() | l.outQueue.readReadiness() +} + +func (l *lineDiscipline) slaveReadiness() waiter.EventMask { + l.inMu.Lock() + defer l.inMu.Unlock() + l.outMu.Lock() + defer l.outMu.Unlock() + return l.outQueue.writeReadiness() | l.inQueue.readReadiness() +} + +// queue represents one of the input or output queues between a pty master and +// slave. +type queue struct { + waiter.Queue `state:"nosave"` + buf bytes.Buffer `state:".([]byte)"` +} + +// saveBuf is invoked by stateify. +func (q *queue) saveBuf() []byte { + return append([]byte(nil), q.buf.Bytes()...) +} + +// loadBuf is invoked by stateify. +func (q *queue) loadBuf(b []byte) { + q.buf.Write(b) +} + +// readReadiness returns whether q is ready to be read from. +// +// Preconditions: q's mutex must be held. +func (q *queue) readReadiness() waiter.EventMask { + ready := waiter.EventMask(0) + if q.buf.Len() > 0 { + ready |= waiter.EventIn + } + return ready +} + +// writeReadiness returns whether q is ready to be written to. 
+func (q *queue) writeReadiness() waiter.EventMask { + return waiter.EventOut +} + +func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.inMu.Lock() + defer l.inMu.Unlock() + return l.queueRead(ctx, dst, &l.inQueue) +} + +func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.inMu.Lock() + defer l.inMu.Unlock() + return l.queueWrite(ctx, src, &l.inQueue, false) +} + +func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.outMu.Lock() + defer l.outMu.Unlock() + return l.queueRead(ctx, dst, &l.outQueue) +} + +func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.outMu.Lock() + defer l.outMu.Unlock() + return l.queueWrite(ctx, src, &l.outQueue, true) +} + +// queueRead reads from q to userspace. +// +// Preconditions: q's lock must be held. +func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, q *queue) (int64, error) { + // Copy bytes out to user-space. queueRead doesn't have to do any + // processing or other extra work -- that's all taken care of when + // writing to a queue. + n, err := q.buf.WriteTo(dst.Writer(ctx)) + + // If state changed, notify any waiters. If nothing was available to + // read, let the caller know we could block. + if n > 0 { + q.Notify(waiter.EventOut) + } else if err == nil { + return 0, syserror.ErrWouldBlock + } + return int64(n), err +} + +// queueWrite writes to q from userspace. `output` is whether the queue being +// written to should be subject to output processing (i.e. whether it is the +// output queue). +// +// Precondition: q's lock must be held. +func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, q *queue, output bool) (int64, error) { + // TODO: Use CopyInTo/safemem to avoid extra copying. + // Get the bytes to write from user-space. 
+ b := make([]byte, src.NumBytes()) + n, err := src.CopyIn(ctx, b) + if err != nil { + return 0, err + } + b = b[:n] + + // If state changed, notify any waiters. If we were unable to write + // anything, let the caller know we could block. + if n > 0 { + q.Notify(waiter.EventIn) + } else { + return 0, syserror.ErrWouldBlock + } + + // Optionally perform line discipline transformations depending on + // whether we're writing to the input queue or output queue. + var buf *bytes.Buffer + l.termiosMu.Lock() + if output { + buf = l.transformOutput(b) + } else { + buf = l.transformInput(b) + } + l.termiosMu.Unlock() + + // Enqueue buf at the end of the queue. + buf.WriteTo(&q.buf) + return int64(n), err +} + +// transformOutput does ouput processing for one end of the pty. See +// drivers/tty/n_tty.c:do_output_char for an analagous kernel function. +// +// Precondition: l.termiosMu must be held. +func (l *lineDiscipline) transformOutput(buf []byte) *bytes.Buffer { + if !l.termios.OEnabled(linux.OPOST) { + return bytes.NewBuffer(buf) + } + + var ret bytes.Buffer + for len(buf) > 0 { + c := l.removeRune(&buf) + switch c { + case '\n': + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + if l.termios.OEnabled(linux.ONLCR) { + ret.Write([]byte{'\r', '\n'}) + continue + } + case '\r': + if l.termios.OEnabled(linux.ONOCR) && l.column == 0 { + continue + } + if l.termios.OEnabled(linux.OCRNL) { + c = '\n' + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + break + } + l.column = 0 + case '\t': + spaces := spacesPerTab - l.column%spacesPerTab + if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { + l.column += spaces + ret.Write(bytes.Repeat([]byte{' '}, 8)) + continue + } + l.column += spaces + case '\b': + if l.column > 0 { + l.column-- + } + default: + l.column++ + } + ret.WriteRune(c) + } + return &ret +} + +// transformInput does input processing for one end of the pty. Characters +// read are transformed according to flags set in the termios struct. 
See +// drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel +// function. +// +// Precondition: l.termiosMu must be held. +func (l *lineDiscipline) transformInput(buf []byte) *bytes.Buffer { + var ret bytes.Buffer + for len(buf) > 0 { + c := l.removeRune(&buf) + switch c { + case '\r': + if l.termios.IEnabled(linux.IGNCR) { + continue + } + if l.termios.IEnabled(linux.ICRNL) { + c = '\n' + } + case '\n': + if l.termios.IEnabled(linux.INLCR) { + c = '\r' + } + } + ret.WriteRune(c) + } + return &ret +} + +// removeRune removes and returns the first rune from the byte array. The +// buffer's length is updated accordingly. +func (l *lineDiscipline) removeRune(b *[]byte) rune { + var c rune + var size int + // If UTF-8 support is enabled, runes might be multiple bytes. + if l.termios.IEnabled(linux.IUTF8) { + c, size = utf8.DecodeRune(*b) + } else { + c = rune((*b)[0]) + size = 1 + } + *b = (*b)[size:] + return c +} diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go new file mode 100644 index 000000000..3c47ee517 --- /dev/null +++ b/pkg/sentry/fs/tty/master.go @@ -0,0 +1,173 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tty + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// masterInodeOperations are the fs.InodeOperations for the master end of the +// Terminal (ptmx file). +type masterInodeOperations struct { + inodeOperations + + // d is the containing dir. + d *dirInodeOperations +} + +var _ fs.InodeOperations = (*masterInodeOperations)(nil) + +// newMasterInode creates an Inode for the master end of a terminal. +func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { + iops := &masterInodeOperations{ + inodeOperations: inodeOperations{ + uattr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: p, + Links: 1, + // Size and Blocks are always 0. + }), + }, + d: d, + } + + return fs.NewInode(iops, d.msrc, fs.StableAttr{ + DeviceID: ptsDevice.DeviceID(), + // N.B. Linux always uses inode id 2 for ptmx. See + // fs/devpts/inode.c:mknod_ptmx. + // + // TODO: Since ptsDevice must be shared between + // different mounts, we must not assign fixed numbers. + InodeID: ptsDevice.NextIno(), + Type: fs.CharacterDevice, + // See fs/devpts/inode.c:devpts_fill_super. + BlockSize: 1024, + // The PTY master effectively has two different major/minor + // device numbers. + // + // This one is returned by stat for both opened and unopened + // instances of this inode. + // + // When the inode is opened (GetFile), a new device number is + // allocated based on major UNIX98_PTY_MASTER_MAJOR and the tty + // index as minor number. However, this device number is only + // accessible via ioctl(TIOCGDEV) and /proc/TID/stat. 
+ DeviceFileMajor: linux.TTYAUX_MAJOR, + DeviceFileMinor: linux.PTMX_MINOR, + }) +} + +// Release implements fs.InodeOperations.Release. +func (mi *masterInodeOperations) Release(ctx context.Context) { +} + +// GetFile implements fs.InodeOperations.GetFile. +// +// It allocates a new terminal. +func (mi *masterInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + t, err := mi.d.allocateTerminal(ctx) + if err != nil { + return nil, err + } + + return fs.NewFile(ctx, d, flags, &masterFileOperations{ + d: mi.d, + t: t, + }), nil +} + +// masterFileOperations are the fs.FileOperations for the master end of a terminal. +type masterFileOperations struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + + // d is the containing dir. + d *dirInodeOperations + + // t is the connected Terminal. + t *Terminal +} + +var _ fs.FileOperations = (*masterFileOperations)(nil) + +// Release implements fs.FileOperations.Release. +func (mf *masterFileOperations) Release() { + mf.d.masterClose(mf.t) + mf.t.DecRef() +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (mf *masterFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + mf.t.ld.inQueue.EventRegister(e, mask) + mf.t.ld.outQueue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (mf *masterFileOperations) EventUnregister(e *waiter.Entry) { + mf.t.ld.inQueue.EventUnregister(e) + mf.t.ld.outQueue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (mf *masterFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return mf.t.ld.masterReadiness() +} + +// Read implements fs.FileOperations.Read. 
+func (mf *masterFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + return mf.t.ld.outputQueueRead(ctx, dst) +} + +// Write implements fs.FileOperations.Write. +func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return mf.t.ld.inputQueueWrite(ctx, src) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Uint() { + case linux.TCGETS: + // N.B. TCGETS on the master actually returns the configuration + // of the slave end. + return mf.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + // N.B. TCSETS on the master actually affects the configuration + // of the slave end. + return mf.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO: This should drain the output queue first. + return mf.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + case linux.TIOCSPTLCK: + // TODO: Implement pty locking. For now just pretend we do. + return 0, nil + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go new file mode 100644 index 000000000..9178071a4 --- /dev/null +++ b/pkg/sentry/fs/tty/slave.go @@ -0,0 +1,151 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// slaveInodeOperations are the fs.InodeOperations for the slave end of the +// Terminal (pts file). +type slaveInodeOperations struct { + inodeOperations + + // d is the containing dir. + d *dirInodeOperations + + // t is the connected Terminal. + t *Terminal +} + +var _ fs.InodeOperations = (*slaveInodeOperations)(nil) + +// newSlaveInode creates an fs.Inode for the slave end of a terminal. +// +// newSlaveInode takes ownership of t. +func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { + iops := &slaveInodeOperations{ + inodeOperations: inodeOperations{ + uattr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: p, + Links: 1, + // Size and Blocks are always 0. + }), + }, + d: d, + t: t, + } + + return fs.NewInode(iops, d.msrc, fs.StableAttr{ + DeviceID: ptsDevice.DeviceID(), + // N.B. Linux always uses inode id = tty index + 3. See + // fs/devpts/inode.c:devpts_pty_new. + // + // TODO: Since ptsDevice must be shared between + // different mounts, we must not assign fixed numbers. + InodeID: ptsDevice.NextIno(), + Type: fs.CharacterDevice, + // See fs/devpts/inode.c:devpts_fill_super. + BlockSize: 1024, + DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR, + DeviceFileMinor: t.n, + }) +} + +// Release implements fs.InodeOperations.Release. 
+func (si *slaveInodeOperations) Release(ctx context.Context) { + si.t.DecRef() +} + +// GetFile implements fs.InodeOperations.GetFile. +// +// This may race with destruction of the terminal. If the terminal is gone, it +// returns ENOENT. +func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil +} + +// slaveFileOperations are the fs.FileOperations for the slave end of a terminal. +type slaveFileOperations struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + + // si is the inode operations. + si *slaveInodeOperations +} + +var _ fs.FileOperations = (*slaveFileOperations)(nil) + +// Release implements fs.FileOperations.Release. +func (sf *slaveFileOperations) Release() { +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + sf.si.t.ld.outQueue.EventRegister(e, mask) + sf.si.t.ld.inQueue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) { + sf.si.t.ld.outQueue.EventUnregister(e) + sf.si.t.ld.inQueue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return sf.si.t.ld.slaveReadiness() +} + +// Read implements fs.FileOperations.Read. +func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + return sf.si.t.ld.inputQueueRead(ctx, dst) +} + +// Write implements fs.FileOperations.Write. 
+func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return sf.si.t.ld.outputQueueWrite(ctx, src) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Uint() { + case linux.TCGETS: + return sf.si.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + return sf.si.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO: This should drain the output queue first. + return sf.si.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go new file mode 100644 index 000000000..6ae713a32 --- /dev/null +++ b/pkg/sentry/fs/tty/terminal.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// Terminal is a pseudoterminal. +type Terminal struct { + refs.AtomicRefCount + + // n is the terminal index. + n uint32 + + // d is the containing directory. 
+ d *dirInodeOperations + + // ld is the line discipline of the terminal. + ld lineDiscipline +} + +func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { + termios := linux.DefaultSlaveTermios + return &Terminal{ + d: d, + n: n, + ld: lineDiscipline{termios: termios}, + } +} diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go new file mode 100644 index 000000000..0c7560ed7 --- /dev/null +++ b/pkg/sentry/fs/tty/tty_test.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestSimpleMasterToSlave(t *testing.T) { + ld := lineDiscipline{termios: linux.DefaultSlaveTermios} + ctx := contexttest.Context(t) + inBytes := []byte("hello, tty\n") + src := usermem.BytesIOSequence(inBytes) + outBytes := make([]byte, 32) + dst := usermem.BytesIOSequence(outBytes) + + // Write to the input queue. + nw, err := ld.inputQueueWrite(ctx, src) + if err != nil { + t.Fatalf("error writing to input queue: %v", err) + } + if nw != int64(len(inBytes)) { + t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes)) + } + + // Read from the input queue. 
+ nr, err := ld.inputQueueRead(ctx, dst) + if err != nil { + t.Fatalf("error reading from input queue: %v", err) + } + if nr != int64(len(inBytes)) { + t.Fatalf("read wrong length: got %d, want %d", nr, len(inBytes)) + } + + outStr := string(outBytes[:nr]) + inStr := string(inBytes) + if outStr != inStr { + t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr) + } +} diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD new file mode 100644 index 000000000..9457618d8 --- /dev/null +++ b/pkg/sentry/hostcpu/BUILD @@ -0,0 +1,20 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "hostcpu", + srcs = [ + "getcpu_amd64.s", + "hostcpu.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu", + visibility = ["//:sandbox"], +) + +go_test( + name = "hostcpu_test", + size = "small", + srcs = ["hostcpu_test.go"], + embed = [":hostcpu"], +) diff --git a/pkg/sentry/hostcpu/getcpu_amd64.s b/pkg/sentry/hostcpu/getcpu_amd64.s new file mode 100644 index 000000000..7f6247d81 --- /dev/null +++ b/pkg/sentry/hostcpu/getcpu_amd64.s @@ -0,0 +1,24 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "textflag.h" + +// func GetCPU() (cpu uint32) +TEXT ·GetCPU(SB), NOSPLIT, $0-4 + BYTE $0x0f; BYTE $0x01; BYTE $0xf9; // RDTSCP + // On Linux, the bottom 12 bits of IA32_TSC_AUX are CPU and the upper 20 + // are node. See arch/x86/entry/vdso/vma.c:vgetcpu_cpu_init(). + ANDL $0xfff, CX + MOVL CX, cpu+0(FP) + RET diff --git a/pkg/sentry/hostcpu/hostcpu.go b/pkg/sentry/hostcpu/hostcpu.go new file mode 100644 index 000000000..fa46499ad --- /dev/null +++ b/pkg/sentry/hostcpu/hostcpu.go @@ -0,0 +1,67 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostcpu provides utilities for working with CPU information provided +// by a host Linux kernel. +package hostcpu + +import ( + "fmt" + "io/ioutil" + "strconv" + "strings" + "unicode" +) + +// GetCPU returns the caller's current CPU number, without using the Linux VDSO +// (which is not available to the sentry) or the getcpu(2) system call (which +// is relatively slow). +func GetCPU() uint32 + +// MaxPossibleCPU returns the highest possible CPU number, which is guaranteed +// not to change for the lifetime of the host kernel. 
func MaxPossibleCPU() (uint32, error) {
	// "possible" is immutable for the lifetime of the host kernel, so a
	// single read suffices.
	const path = "/sys/devices/system/cpu/possible"
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return 0, err
	}
	contents := string(data)
	// The file format is produced by, in Linux:
	// drivers/base/cpu.c:show_cpus_attr() =>
	// include/linux/cpumask.h:cpumask_print_to_pagebuf() =>
	// lib/bitmap.c:bitmap_print_to_pagebuf()
	max, err := maxValueInLinuxBitmap(contents)
	if err != nil {
		return 0, fmt.Errorf("invalid %s (%q): %v", path, contents, err)
	}
	return uint32(max), nil
}

// maxValueInLinuxBitmap returns the maximum value specified in str, which is a
// string emitted by Linux's lib/bitmap.c:bitmap_print_to_pagebuf(list=true),
// e.g. "0-63" or "0,2-5".
func maxValueInLinuxBitmap(str string) (uint64, error) {
	trimmed := strings.TrimSpace(str)
	// The maximum is always the final decimal number in the list, so drop
	// everything up to and including the last non-digit rune.
	if idx := strings.LastIndexFunc(trimmed, func(r rune) bool {
		return !unicode.IsDigit(r)
	}); idx != -1 {
		trimmed = trimmed[idx+1:]
	}
	// ParseUint returns (0, err) on failure, matching our error contract.
	return strconv.ParseUint(trimmed, 10, 64)
}
+ +package hostcpu + +import ( + "fmt" + "testing" +) + +func TestMaxValueInLinuxBitmap(t *testing.T) { + for _, test := range []struct { + str string + max uint64 + }{ + {"0", 0}, + {"0\n", 0}, + {"0,2", 2}, + {"0-63", 63}, + {"0-3,8-11", 11}, + } { + t.Run(fmt.Sprintf("%q", test.str), func(t *testing.T) { + max, err := maxValueInLinuxBitmap(test.str) + if err != nil || max != test.max { + t.Errorf("maxValueInLinuxBitmap: got (%d, %v), wanted (%d, nil)", max, err, test.max) + } + }) + } +} + +func TestMaxValueInLinuxBitmapErrors(t *testing.T) { + for _, str := range []string{"", "\n"} { + t.Run(fmt.Sprintf("%q", str), func(t *testing.T) { + max, err := maxValueInLinuxBitmap(str) + if err == nil { + t.Errorf("maxValueInLinuxBitmap: got (%d, nil), wanted (_, error)", max) + } + t.Log(err) + }) + } +} diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD new file mode 100644 index 000000000..207cdb692 --- /dev/null +++ b/pkg/sentry/inet/BUILD @@ -0,0 +1,28 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], # Apache 2.0 +) + +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "inet_state", + srcs = ["inet.go"], + out = "inet_state.go", + package = "inet", +) + +go_library( + name = "inet", + srcs = [ + "inet.go", + "inet_state.go", + "test_stack.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/inet", + deps = [ + "//pkg/state", + ], +) diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go new file mode 100644 index 000000000..e4b326993 --- /dev/null +++ b/pkg/sentry/inet/inet.go @@ -0,0 +1,99 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// Stack represents a TCP/IP stack.
type Stack interface {
	// Interfaces returns all network interfaces as a mapping from interface
	// indexes to interface properties. Interface indices are strictly positive
	// integers.
	Interfaces() map[int32]Interface

	// InterfaceAddrs returns all network interface addresses as a mapping from
	// interface indexes to a slice of associated interface address properties.
	InterfaceAddrs() map[int32][]InterfaceAddr

	// SupportsIPv6 returns true if the stack supports IPv6 connectivity.
	SupportsIPv6() bool

	// TCPReceiveBufferSize returns TCP receive buffer size settings.
	TCPReceiveBufferSize() (TCPBufferSize, error)

	// SetTCPReceiveBufferSize attempts to change TCP receive buffer size
	// settings.
	SetTCPReceiveBufferSize(size TCPBufferSize) error

	// TCPSendBufferSize returns TCP send buffer size settings.
	TCPSendBufferSize() (TCPBufferSize, error)

	// SetTCPSendBufferSize attempts to change TCP send buffer size settings.
	SetTCPSendBufferSize(size TCPBufferSize) error

	// TCPSACKEnabled returns true if RFC 2018 TCP Selective Acknowledgements
	// are enabled.
	TCPSACKEnabled() (bool, error)

	// SetTCPSACKEnabled attempts to change TCP selective acknowledgement
	// settings.
	SetTCPSACKEnabled(enabled bool) error
}

// Interface contains information about a network interface.
type Interface struct {
	// Keep these fields sorted in the order they appear in rtnetlink(7).

	// DeviceType is the device type, a Linux ARPHRD_* constant.
	DeviceType uint16

	// Flags is the device flags; see netdevice(7), under "Ioctls",
	// "SIOCGIFFLAGS, SIOCSIFFLAGS".
	Flags uint32

	// Name is the device name.
	Name string

	// Addr is the hardware device address.
	Addr []byte
}

// InterfaceAddr contains information about a network interface address.
type InterfaceAddr struct {
	// Keep these fields sorted in the order they appear in rtnetlink(7).

	// Family is the address family, a Linux AF_* constant.
	Family uint8

	// PrefixLen is the address prefix length.
	PrefixLen uint8

	// Flags is the address flags.
	Flags uint8

	// Addr is the actual address.
	Addr []byte
}

// TCPBufferSize contains settings controlling TCP buffer sizing.
type TCPBufferSize struct {
	// Min is the minimum size.
	Min int

	// Default is the default size.
	Default int

	// Max is the maximum size.
	Max int
}
+type TestStack struct { + InterfacesMap map[int32]Interface + InterfaceAddrsMap map[int32][]InterfaceAddr + SupportsIPv6Flag bool + TCPRecvBufSize TCPBufferSize + TCPSendBufSize TCPBufferSize + TCPSACKFlag bool +} + +// NewTestStack returns a TestStack with no network interfaces. The value of +// all other options is unspecified; tests that rely on specific values must +// set them explicitly. +func NewTestStack() *TestStack { + return &TestStack{ + InterfacesMap: make(map[int32]Interface), + InterfaceAddrsMap: make(map[int32][]InterfaceAddr), + } +} + +// Interfaces implements Stack.Interfaces. +func (s *TestStack) Interfaces() map[int32]Interface { + return s.InterfacesMap +} + +// InterfaceAddrs implements Stack.InterfaceAddrs. +func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr { + return s.InterfaceAddrsMap +} + +// SupportsIPv6 implements Stack.SupportsIPv6. +func (s *TestStack) SupportsIPv6() bool { + return s.SupportsIPv6Flag +} + +// TCPReceiveBufferSize implements Stack.TCPReceiveBufferSize. +func (s *TestStack) TCPReceiveBufferSize() (TCPBufferSize, error) { + return s.TCPRecvBufSize, nil +} + +// SetTCPReceiveBufferSize implements Stack.SetTCPReceiveBufferSize. +func (s *TestStack) SetTCPReceiveBufferSize(size TCPBufferSize) error { + s.TCPRecvBufSize = size + return nil +} + +// TCPSendBufferSize implements Stack.TCPSendBufferSize. +func (s *TestStack) TCPSendBufferSize() (TCPBufferSize, error) { + return s.TCPSendBufSize, nil +} + +// SetTCPSendBufferSize implements Stack.SetTCPSendBufferSize. +func (s *TestStack) SetTCPSendBufferSize(size TCPBufferSize) error { + s.TCPSendBufSize = size + return nil +} + +// TCPSACKEnabled implements Stack.TCPSACKEnabled. +func (s *TestStack) TCPSACKEnabled() (bool, error) { + return s.TCPSACKFlag, nil +} + +// SetTCPSACKEnabled implements Stack.SetTCPSACKEnabled. 
+func (s *TestStack) SetTCPSACKEnabled(enabled bool) error { + s.TCPSACKFlag = enabled + return nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD new file mode 100644 index 000000000..62794cff5 --- /dev/null +++ b/pkg/sentry/kernel/BUILD @@ -0,0 +1,234 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "kernel_state", + srcs = [ + "abstract_socket_namespace.go", + "fd_map.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "pending_signals.go", + "pending_signals_list.go", + "process_group_list.go", + "ptrace.go", + "rseq.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_list.go", + "task_resources.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_syscall.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "timer.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + out = "kernel_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"], + package = "kernel", +) + +go_template_instance( + name = "pending_signals_list", + out = "pending_signals_list.go", + package = "kernel", + prefix = "pendingSignal", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*pendingSignal", + }, +) + +go_template_instance( + name = "process_group_list", + out = "process_group_list.go", + package = "kernel", + prefix = "processGroup", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*ProcessGroup", + }, +) + +go_template_instance( + name = "seqatomic_taskgoroutineschedinfo", + out = "seqatomic_taskgoroutineschedinfo.go", + package = 
"kernel", + suffix = "TaskGoroutineSchedInfo", + template = "//pkg/sync:generic_seqatomic", + types = { + "Value": "TaskGoroutineSchedInfo", + }, +) + +go_template_instance( + name = "session_list", + out = "session_list.go", + package = "kernel", + prefix = "session", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Session", + }, +) + +go_template_instance( + name = "task_list", + out = "task_list.go", + package = "kernel", + prefix = "task", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Task", + }, +) + +go_library( + name = "kernel", + srcs = [ + "abstract_socket_namespace.go", + "context.go", + "fd_map.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "kernel_state.go", + "pending_signals.go", + "pending_signals_list.go", + "process_group_list.go", + "ptrace.go", + "rseq.go", + "seccomp.go", + "seqatomic_taskgoroutineschedinfo.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_acct.go", + "task_block.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_identity.go", + "task_list.go", + "task_log.go", + "task_net.go", + "task_resources.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_stop.go", + "task_syscall.go", + "task_usermem.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "timer.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/binary", + "//pkg/bits", + "//pkg/bpf", + "//pkg/cpuid", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fs/timerfd", + "//pkg/sentry/hostcpu", + 
"//pkg/sentry/inet", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/socket/netlink/port", + "//pkg/sentry/time", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/state/statefile", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "kernel_test", + size = "small", + srcs = [ + "fd_map_test.go", + "table_test.go", + "task_test.go", + "timekeeper_test.go", + ], + embed = [":kernel"], + deps = [ + "//pkg/abi", + "//pkg/sentry/arch", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs/filetest", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/platform", + "//pkg/sentry/time", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/README.md b/pkg/sentry/kernel/README.md new file mode 100644 index 000000000..3306780d6 --- /dev/null +++ b/pkg/sentry/kernel/README.md @@ -0,0 +1,106 @@ +This package contains: + +- A (partial) emulation of the "core Linux kernel", which governs task + execution and scheduling, system call dispatch, and signal handling. See + below for details. + +- The top-level interface for the sentry's Linux kernel emulation in general, + used by the `main` function of all versions of the sentry. This interface + revolves around the `Env` type (defined in `kernel.go`). + +# Background + +In Linux, each schedulable context is referred to interchangeably as a "task" or +"thread". Tasks can be divided into userspace and kernel tasks. 
In the sentry, +scheduling is managed by the Go runtime, so each schedulable context is a +goroutine; only "userspace" (application) contexts are referred to as tasks, and +represented by Task objects. (From this point forward, "task" refers to the +sentry's notion of a task unless otherwise specified.) + +At a high level, Linux application threads can be thought of as repeating a "run +loop": + +- Some amount of application code is executed in userspace. + +- A trap (explicit syscall invocation, hardware interrupt or exception, etc.) + causes control flow to switch to the kernel. + +- Some amount of kernel code is executed in kernelspace, e.g. to handle the + cause of the trap. + +- The kernel "returns from the trap" into application code. + +Analogously, each task in the sentry is associated with a *task goroutine* that +executes that task's run loop (`Task.run` in `task_run.go`). However, the +sentry's task run loop differs in structure in order to support saving execution +state to, and resuming execution from, checkpoints. + +While in kernelspace, a Linux thread can be descheduled (cease execution) in a +variety of ways: + +- It can yield or be preempted, becoming temporarily descheduled but still + runnable. At present, the sentry delegates scheduling of runnable threads to + the Go runtime. + +- It can exit, becoming permanently descheduled. The sentry's equivalent is + returning from `Task.run`, terminating the task goroutine. + +- It can enter interruptible sleep, a state in which it can be woken by a + caller-defined wakeup or the receipt of a signal. In the sentry, interruptible + sleep (which is ambiguously referred to as *blocking*) is implemented by + making all events that can end blocking (including signal notifications) + communicated via Go channels and using `select` to multiplex wakeup sources; + see `task_block.go`. + +- It can enter uninterruptible sleep, a state in which it can only be woken by a + caller-defined wakeup. 
Killable sleep is a closely related variant in which + the task can also be woken by SIGKILL. (These definitions also include Linux's + "group-stopped" (`TASK_STOPPED`) and "ptrace-stopped" (`TASK_TRACED`) states.) + +To maximize compatibility with Linux, sentry checkpointing appears as a spurious +signal-delivery interrupt on all tasks; interrupted system calls return `EINTR` +or are automatically restarted as usual. However, these semantics require that +uninterruptible and killable sleeps do not appear to be interrupted. In other +words, the state of the task, including its progress through the interrupted +operation, must be preserved by checkpointing. For many such sleeps, the wakeup +condition is application-controlled, making it infeasible to wait for the sleep +to end before checkpointing. Instead, we must support checkpointing progress +through sleeping operations. + +# Implementation + +We break the task's control flow graph into *states*, delimited by: + +1. Points where uninterruptible and killable sleeps may occur. For example, +there exists a state boundary between signal dequeueing and signal delivery +because there may be an intervening ptrace signal-delivery-stop. + +2. Points where sleep-induced branches may "rejoin" normal execution. For +example, the syscall exit state exists because it can be reached immediately +following a synchronous syscall, or after a task that is sleeping in `execve()` +or `vfork()` resumes execution. + +3. Points containing large branches. This is strictly for organizational +purposes. For example, the state that processes interrupt-signaled conditions is +kept separate from the main "app" state to reduce the size of the latter. + +4. `SyscallReinvoke`, which does not correspond to anything in Linux, and exists +solely to serve the autosave feature. 
+ +![dot -Tsvg -Goverlap=false -orun_states.svg run_states.dot](g3doc/run_states.dot "Task control flow graph") + +States before which a stop may occur are represented as implementations of the +`taskRunState` interface named `run(state)`, allowing them to be saved and +restored. States that cannot be immediately preceded by a stop are simply `Task` +methods named `do(state)`. + +Conditions that can require task goroutines to cease execution for unknown +lengths of time are called *stops*. Stops are divided into *internal stops*, +which are stops whose start and end conditions are implemented within the +sentry, and *external stops*, which are stops whose start and end conditions are +not known to the sentry. Hence all uninterruptible and killable sleeps are +internal stops, and the existence of a pending checkpoint operation is an +external stop. Internal stops are reified into instances of the `TaskStop` type, +while external stops are merely counted. The task run loop alternates between +checking for stops and advancing the task's state. This allows checkpointing to +hold tasks in a stopped state while waiting for all tasks in the system to stop. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go new file mode 100644 index 000000000..014c4a3bf --- /dev/null +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -0,0 +1,108 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +type abstractEndpoint struct { + ep unix.BoundEndpoint + wr *refs.WeakRef + name string + ns *AbstractSocketNamespace +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (e *abstractEndpoint) WeakRefGone() { + e.ns.mu.Lock() + if e.ns.endpoints[e.name].ep == e.ep { + delete(e.ns.endpoints, e.name) + } + e.ns.mu.Unlock() +} + +// AbstractSocketNamespace is used to implement the Linux abstract socket functionality. +type AbstractSocketNamespace struct { + mu sync.Mutex `state:"nosave"` + + // Keeps mapping from name to endpoint. + endpoints map[string]abstractEndpoint +} + +// NewAbstractSocketNamespace returns a new AbstractSocketNamespace. +func NewAbstractSocketNamespace() *AbstractSocketNamespace { + return &AbstractSocketNamespace{ + endpoints: make(map[string]abstractEndpoint), + } +} + +// A boundEndpoint wraps a unix.BoundEndpoint to maintain a reference on its +// backing object. +type boundEndpoint struct { + unix.BoundEndpoint + rc refs.RefCounter +} + +// Release implements unix.BoundEndpoint.Release. +func (e *boundEndpoint) Release() { + e.rc.DecRef() + e.BoundEndpoint.Release() +} + +// BoundEndpoint retrieves the endpoint bound to the given name. The return +// value is nil if no endpoint was bound. +func (a *AbstractSocketNamespace) BoundEndpoint(name string) unix.BoundEndpoint { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + return nil + } + + rc := ep.wr.Get() + if rc == nil { + delete(a.endpoints, name) + return nil + } + + return &boundEndpoint{ep.ep, rc} +} + +// Bind binds the given socket. +// +// When the last reference managed by rc is dropped, ep may be removed from the +// namespace. 
+func (a *AbstractSocketNamespace) Bind(name string, ep unix.BoundEndpoint, rc refs.RefCounter) error { + a.mu.Lock() + defer a.mu.Unlock() + + if ep, ok := a.endpoints[name]; ok { + if rc := ep.wr.Get(); rc != nil { + rc.DecRef() + return syscall.EADDRINUSE + } + } + + ae := abstractEndpoint{ep: ep, name: name, ns: a} + ae.wr = refs.NewWeakRef(rc, &ae) + a.endpoints[name] = ae + return nil +} diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD new file mode 100644 index 000000000..7f0680b88 --- /dev/null +++ b/pkg/sentry/kernel/auth/BUILD @@ -0,0 +1,73 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "auth_state", + srcs = [ + "credentials.go", + "id.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + out = "auth_state.go", + package = "auth", +) + +go_template_instance( + name = "id_map_range", + out = "id_map_range.go", + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_range", + types = { + "T": "uint32", + }, +) + +go_template_instance( + name = "id_map_set", + out = "id_map_set.go", + consts = { + "minDegree": "3", + }, + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint32", + "Range": "idMapRange", + "Value": "uint32", + "Functions": "idMapFunctions", + }, +) + +go_library( + name = "auth", + srcs = [ + "auth.go", + "auth_state.go", + "capability_set.go", + "context.go", + "credentials.go", + "id.go", + "id_map.go", + "id_map_functions.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/bits", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/state", + "//pkg/syserror", 
+ ], +) diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go new file mode 100644 index 000000000..c49a6b852 --- /dev/null +++ b/pkg/sentry/kernel/auth/auth.go @@ -0,0 +1,22 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package auth implements an access control model that is a subset of Linux's. +// +// The auth package supports two kinds of access controls: user/group IDs and +// capabilities. Each resource in the security model is associated with a user +// namespace; "privileged" operations check that the operator's credentials +// have the required user/group IDs or capabilities within the user namespace +// of accessed resources. +package auth diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go new file mode 100644 index 000000000..5b8164c49 --- /dev/null +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" +) + +// A CapabilitySet is a set of capabilities implemented as a bitset. The zero +// value of CapabilitySet is a set containing no capabilities. +type CapabilitySet uint64 + +// AllCapabilities is a CapabilitySet containing all valid capabilities. +var AllCapabilities = CapabilitySetOf(linux.MaxCapability+1) - 1 + +// CapabilitySetOf returns a CapabilitySet containing only the given +// capability. +func CapabilitySetOf(cp linux.Capability) CapabilitySet { + return CapabilitySet(bits.MaskOf64(int(cp))) +} + +// CapabilitySetOfMany returns a CapabilitySet containing the given capabilities. +func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet { + var cs uint64 + for _, cp := range cps { + cs |= bits.MaskOf64(int(cp)) + } + return CapabilitySet(cs) +} + +// TaskCapabilities represents all the capability sets for a task. Each of these +// sets is explained in greater detail in capabilities(7). +type TaskCapabilities struct { + // Permitted is a limiting superset for the effective capabilities that + // the thread may assume. + PermittedCaps CapabilitySet + // Inheritable is a set of capabilities preserved across an execve(2). + InheritableCaps CapabilitySet + // Effective is the set of capabilities used by the kernel to perform + // permission checks for the thread. + EffectiveCaps CapabilitySet + // Bounding is a limiting superset for the capabilities that a thread + // can add to its inheritable set using capset(2). + BoundingCaps CapabilitySet + // Ambient is a set of capabilities that are preserved across an + // execve(2) of a program that is not privileged. 
+ AmbientCaps CapabilitySet +} diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go new file mode 100644 index 000000000..914589b28 --- /dev/null +++ b/pkg/sentry/kernel/auth/context.go @@ -0,0 +1,36 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the auth package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCredentials is a Context.Value key for Credentials. + CtxCredentials contextID = iota +) + +// CredentialsFromContext returns a copy of the Credentials used by ctx, or a +// set of Credentials with no capabilities if ctx does not have Credentials. +func CredentialsFromContext(ctx context.Context) *Credentials { + if v := ctx.Value(CtxCredentials); v != nil { + return v.(*Credentials) + } + return NewAnonymousCredentials() +} diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go new file mode 100644 index 000000000..b832b28fe --- /dev/null +++ b/pkg/sentry/kernel/auth/credentials.go @@ -0,0 +1,227 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Credentials contains information required to authorize privileged operations +// in a user namespace. +type Credentials struct { + // Real/effective/saved user/group IDs in the root user namespace. None of + // these should ever be NoID. + RealKUID KUID + EffectiveKUID KUID + SavedKUID KUID + RealKGID KGID + EffectiveKGID KGID + SavedKGID KGID + + // Filesystem user/group IDs are not implemented. "... setfsuid() is + // nowadays unneeded and should be avoided in new applications (likewise + // for setfsgid(2))." - setfsuid(2) + + // Supplementary groups used by set/getgroups. + // + // ExtraKGIDs slices are immutable, allowing multiple Credentials with the + // same ExtraKGIDs to share the same slice. + ExtraKGIDs []KGID + + // The capability sets applicable to this set of credentials. + PermittedCaps CapabilitySet + InheritableCaps CapabilitySet + EffectiveCaps CapabilitySet + BoundingCaps CapabilitySet + // Ambient capabilities are not introduced until Linux 4.3. + + // KeepCaps is the flag for PR_SET_KEEPCAPS which allow capabilities to be + // maintained after a switch from root user to non-root user via setuid(). + KeepCaps bool + + // The user namespace associated with the owner of the credentials. + UserNamespace *UserNamespace +} + +// NewAnonymousCredentials returns a set of credentials with no capabilities in +// any user namespace. 
+func NewAnonymousCredentials() *Credentials { + // Create a new root user namespace. Since the new namespace's owner is + // KUID 0 and the returned credentials have non-zero KUID/KGID, the + // returned credentials do not have any capabilities in the new namespace. + // Since the new namespace is not part of any existing user namespace + // hierarchy, the returned credentials do not have any capabilities in any + // other namespace. + return &Credentials{ + RealKUID: NobodyKUID, + EffectiveKUID: NobodyKUID, + SavedKUID: NobodyKUID, + RealKGID: NobodyKGID, + EffectiveKGID: NobodyKGID, + SavedKGID: NobodyKGID, + UserNamespace: NewRootUserNamespace(), + } +} + +// NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e. +// global root) in user namespace ns. +func NewRootCredentials(ns *UserNamespace) *Credentials { + // I can't find documentation for this anywhere, but it's correct for the + // inheritable capability set to be initially empty (the capabilities test + // checks for this property). + return &Credentials{ + RealKUID: RootKUID, + EffectiveKUID: RootKUID, + SavedKUID: RootKUID, + RealKGID: RootKGID, + EffectiveKGID: RootKGID, + SavedKGID: RootKGID, + PermittedCaps: AllCapabilities, + EffectiveCaps: AllCapabilities, + BoundingCaps: AllCapabilities, + UserNamespace: ns, + } +} + +// NewUserCredentials returns a set of credentials based on the given UID, GIDs, +// and capabilities in a given namespace. If all arguments are their zero +// values, this returns the same credentials as NewRootCredentials. +func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials { + creds := NewRootCredentials(ns) + + // Set the UID. + uid := kuid + creds.RealKUID = uid + creds.EffectiveKUID = uid + creds.SavedKUID = uid + + // Set GID. + gid := kgid + creds.RealKGID = gid + creds.EffectiveKGID = gid + creds.SavedKGID = gid + + // Set additional GIDs. 
+ creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) + + // Set capabilities. If capabilities aren't specified, we default to + // all capabilities. + if capabilities != nil { + creds.PermittedCaps = capabilities.PermittedCaps + creds.EffectiveCaps = capabilities.EffectiveCaps + creds.BoundingCaps = capabilities.BoundingCaps + creds.InheritableCaps = capabilities.InheritableCaps + // TODO: Support ambient capabilities. + } else { + // If no capabilities are specified, grant the same capabilities + // that NewRootCredentials does. + creds.PermittedCaps = AllCapabilities + creds.EffectiveCaps = AllCapabilities + creds.BoundingCaps = AllCapabilities + } + + return creds +} + +// Fork generates an identical copy of a set of credentials. +func (c *Credentials) Fork() *Credentials { + nc := new(Credentials) + *nc = *c // Copy-by-value; this is legal for all fields. + return nc +} + +// InGroup returns true if c is in group kgid. Compare Linux's +// kernel/groups.c:in_group_p(). +func (c *Credentials) InGroup(kgid KGID) bool { + if c.EffectiveKGID == kgid { + return true + } + for _, extraKGID := range c.ExtraKGIDs { + if extraKGID == kgid { + return true + } + } + return false +} + +// HasCapabilityIn returns true if c has capability cp in ns. +func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool { + for { + // "1. A process has a capability inside a user namespace if it is a member + // of that namespace and it has the capability in its effective capability + // set." - user_namespaces(7) + if c.UserNamespace == ns { + return CapabilitySetOf(cp)&c.EffectiveCaps != 0 + } + // "3. ... A process that resides in the parent of the user namespace and + // whose effective user ID matches the owner of the namespace has all + // capabilities in the namespace." + if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner { + return true + } + // "2. 
If a process has a capability in a user namespace, then it has that + // capability in all child (and further removed descendant) namespaces as + // well." + if ns.parent == nil { + return false + } + ns = ns.parent + } +} + +// HasCapability returns true if c has capability cp in its user namespace. +func (c *Credentials) HasCapability(cp linux.Capability) bool { + return c.HasCapabilityIn(cp, c.UserNamespace) +} + +// UseUID checks that c can use uid in its user namespace, then translates it +// to the root user namespace. +// +// The checks UseUID does are common, but you should verify that it's doing +// exactly what you want. +func (c *Credentials) UseUID(uid UID) (KUID, error) { + // uid must be mapped. + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return NoID, syserror.EINVAL + } + // If c has CAP_SETUID, then it can use any UID in its user namespace. + if c.HasCapability(linux.CAP_SETUID) { + return kuid, nil + } + // Otherwise, c must already have the UID as its real, effective, or saved + // set-user-ID. + if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { + return kuid, nil + } + return NoID, syserror.EPERM +} + +// UseGID checks that c can use gid in its user namespace, then translates it +// to the root user namespace. +func (c *Credentials) UseGID(gid GID) (KGID, error) { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return NoID, syserror.EINVAL + } + if c.HasCapability(linux.CAP_SETGID) { + return kgid, nil + } + if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { + return kgid, nil + } + return NoID, syserror.EPERM +} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go new file mode 100644 index 000000000..37522b018 --- /dev/null +++ b/pkg/sentry/kernel/auth/id.go @@ -0,0 +1,121 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" +) + +// UID is a user ID in an unspecified user namespace. +type UID uint32 + +// GID is a group ID in an unspecified user namespace. +type GID uint32 + +// In the root user namespace, user/group IDs have a 1-to-1 relationship with +// the users/groups they represent. In other user namespaces, this is not the +// case; for example, two different unmapped users may both "have" the overflow +// UID. This means that it is generally only valid to compare user and group +// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such +// IDs to emphasize this distinction. ("k" is for "key", as in "unique key". +// Linux also uses the prefix "k", but I think they mean "kernel".) + +// KUID is a user ID in the root user namespace. +type KUID uint32 + +// KGID is a group ID in the root user namespace. +type KGID uint32 + +const ( + // NoID is uint32(-1). -1 is consistently used as a special value, in Linux + // and by extension in the auth package, to mean "no ID": + // + // - ID mapping returns -1 if the ID is not mapped. + // + // - Most set*id() syscalls accept -1 to mean "do not change this ID". + NoID = math.MaxUint32 + + // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The + // "overflow UID" is usually [1] used when translating a user ID between + // namespaces fails because the ID is not mapped. (We don't implement this + // file, so the overflow UID is constant.) 
+ // + // [1] "There is one notable case where unmapped user and group IDs are not + // converted to the corresponding overflow ID value. When viewing a uid_map + // or gid_map file in which there is no mapping for the second field, that + // field is displayed as 4294967295 (-1 as an unsigned integer);" - + // user_namespaces(7) + OverflowUID = UID(65534) + OverflowGID = GID(65534) + + // NobodyKUID is the user ID usually reserved for the least privileged user + // "nobody". + NobodyKUID = KUID(65534) + NobodyKGID = KGID(65534) + + // RootKUID is the user ID usually used for the most privileged user "root". + RootKUID = KUID(0) + RootKGID = KGID(0) + RootUID = UID(0) + RootGID = GID(0) +) + +// Ok returns true if uid is not -1. +func (uid UID) Ok() bool { + return uid != NoID +} + +// Ok returns true if gid is not -1. +func (gid GID) Ok() bool { + return gid != NoID +} + +// Ok returns true if kuid is not -1. +func (kuid KUID) Ok() bool { + return kuid != NoID +} + +// Ok returns true if kgid is not -1. +func (kgid KGID) Ok() bool { + return kgid != NoID +} + +// OrOverflow returns uid if it is valid and the overflow UID otherwise. +func (uid UID) OrOverflow() UID { + if uid.Ok() { + return uid + } + return OverflowUID +} + +// OrOverflow returns gid if it is valid and the overflow GID otherwise. +func (gid GID) OrOverflow() GID { + if gid.Ok() { + return gid + } + return OverflowGID +} + +// In translates kuid into user namespace ns. If kuid is not mapped in ns, In +// returns NoID. +func (kuid KUID) In(ns *UserNamespace) UID { + return ns.MapFromKUID(kuid) +} + +// In translates kgid into user namespace ns. If kgid is not mapped in ns, In +// returns NoID. +func (kgid KGID) In(ns *UserNamespace) GID { + return ns.MapFromKGID(kgid) +} diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go new file mode 100644 index 000000000..6adb33530 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map.go @@ -0,0 +1,283 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. +func (ns *UserNamespace) MapFromKUID(kuid KUID) UID { + if ns.parent == nil { + return UID(kuid) + } + return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid)))) +} + +// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns. +func (ns *UserNamespace) MapFromKGID(kgid KGID) GID { + if ns.parent == nil { + return GID(kgid) + } + return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid)))) +} + +// MapToKUID translates uid, a UID in ns, to a UID in the root namespace. +func (ns *UserNamespace) MapToKUID(uid UID) KUID { + if ns.parent == nil { + return KUID(uid) + } + return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))) +} + +// MapToKGID translates gid, a GID in ns, to a GID in the root namespace. 
+func (ns *UserNamespace) MapToKGID(gid GID) KGID { + if ns.parent == nil { + return KGID(gid) + } + return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))) +} + +func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 { + if id == NoID { + return NoID + } + ns.mu.Lock() + defer ns.mu.Unlock() + if it := m.FindSegment(id); it.Ok() { + return it.Value() + (id - it.Start()) + } + return NoID +} + +// allIDsMapped returns true if all IDs in the range [start, end) are mapped in +// m. +// +// Preconditions: end >= start. +func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { + ns.mu.Lock() + defer ns.mu.Unlock() + return m.SpanRange(idMapRange{start, end}) == end-start +} + +// An IDMapEntry represents a mapping from a range of contiguous IDs in a user +// namespace to an equally-sized range of contiguous IDs in the namespace's +// parent. +type IDMapEntry struct { + // FirstID is the first ID in the range in the namespace. + FirstID uint32 + + // FirstParentID is the first ID in the range in the parent namespace. + FirstParentID uint32 + + // Length is the number of IDs in the range. + Length uint32 +} + +// SetUIDMap instructs ns to translate UIDs as specified by entries. +// +// Note: SetUIDMap does not place an upper bound on the number of entries, but +// Linux does. This restriction is implemented in SetUIDMap's caller, the +// implementation of /proc/[pid]/uid_map. +func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + // "After the creation of a new user namespace, the uid_map file of *one* + // of the processes in the namespace may be written to *once* to define the + // mapping of user IDs in the new user namespace. An attempt to write more + // than once to a uid_map file in a user namespace fails with the error + // EPERM. Similar rules apply for gid_map files." 
- user_namespaces(7) + if !ns.uidMapFromParent.IsEmpty() { + return syserror.EPERM + } + // "At least one line must be written to the file." + if len(entries) == 0 { + return syserror.EINVAL + } + // """ + // In order for a process to write to the /proc/[pid]/uid_map + // (/proc/[pid]/gid_map) file, all of the following requirements must be + // met: + // + // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability + // in the user namespace of the process pid. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { + return syserror.EPERM + } + // "2. The writing process must either be in the user namespace of the process + // pid or be in the parent user namespace of the process pid." + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + // """ + // 3. (see trySetUIDMap) + // + // 4. One of the following two cases applies: + // + // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability + // in the parent user namespace. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) { + // """ + // * Or otherwise all of the following restrictions apply: + // + // + The data written to uid_map (gid_map) must consist of a single line + // that maps the writing process' effective user ID (group ID) in the + // parent user namespace to a user ID (group ID) in the user namespace. + // """ + if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { + return syserror.EPERM + } + // """ + // + The writing process must have the same effective user ID as the + // process that created the user namespace. + // """ + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + } + // trySetUIDMap leaves data in maps if it fails. 
+ if err := ns.trySetUIDMap(entries); err != nil { + ns.uidMapFromParent.RemoveAll() + ns.uidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { + for _, e := range entries { + // Determine upper bounds and check for overflow. This implicitly + // checks for NoID. + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + // "3. The mapped user IDs (group IDs) must in turn have a mapping in + // the parent user namespace." + // Only the root namespace has a nil parent, and root is assigned + // mappings when it's created, so SetUIDMap would have returned EPERM + // without reaching this point if ns is root. + if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + // If either of these Adds fail, we have an overlapping range. + if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// SetGIDMap instructs ns to translate GIDs as specified by entries. 
+func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + if !ns.gidMapFromParent.IsEmpty() { + return syserror.EPERM + } + if len(entries) == 0 { + return syserror.EINVAL + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { + return syserror.EPERM + } + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { + if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { + return syserror.EPERM + } + // It's correct for this to still be UID. + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + // "In the case of gid_map, use of the setgroups(2) system call must + // first be denied by writing "deny" to the /proc/[pid]/setgroups file + // (see below) before writing to gid_map." (This file isn't implemented + // in the version of Linux we're emulating; see comment in + // UserNamespace.) + } + if err := ns.trySetGIDMap(entries); err != nil { + ns.gidMapFromParent.RemoveAll() + ns.gidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { + for _, e := range entries { + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// UIDMap returns the user ID mappings configured for ns. If no mappings +// have been configured, UIDMap returns nil. 
+func (ns *UserNamespace) UIDMap() []IDMapEntry { + return ns.getIDMap(&ns.uidMapToParent) +} + +// GIDMap returns the group ID mappings configured for ns. If no mappings +// have been configured, GIDMap returns nil. +func (ns *UserNamespace) GIDMap() []IDMapEntry { + return ns.getIDMap(&ns.gidMapToParent) +} + +func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry { + ns.mu.Lock() + defer ns.mu.Unlock() + var entries []IDMapEntry + for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() { + entries = append(entries, IDMapEntry{ + FirstID: it.Start(), + FirstParentID: it.Value(), + Length: it.Range().Length(), + }) + } + return entries +} diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go new file mode 100644 index 000000000..889291d96 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -0,0 +1,45 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +// idMapFunctions "implements" generic interface segment.Functions for +// idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one +// user namespace to non-overlapping ranges of contiguous IDs in another user +// namespace. Each such ID mapping is implemented as a range-to-value mapping +// in the set such that [range.Start(), range.End()) => [value, value + +// range.Length()). 
+type idMapFunctions struct{} + +func (idMapFunctions) MinKey() uint32 { + return 0 +} + +func (idMapFunctions) MaxKey() uint32 { + return NoID +} + +func (idMapFunctions) ClearValue(*uint32) {} + +func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) { + // Mapped ranges have to be contiguous. + if val1+r1.Length() != val2 { + return 0, false + } + return val1, true +} + +func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) { + return val, val + (split - r.Start) +} diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go new file mode 100644 index 000000000..0980aeadf --- /dev/null +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -0,0 +1,130 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// A UserNamespace represents a user namespace. See user_namespaces(7) for +// details. +type UserNamespace struct { + // parent is this namespace's parent. If this is the root namespace, parent + // is nil. The parent pointer is immutable. + parent *UserNamespace + + // owner is the effective UID of the namespace's creator in the root + // namespace. owner is immutable. + owner KUID + + // mu protects the following fields. 
+ // + // If mu will be locked in multiple UserNamespaces, it must be locked in + // descendant namespaces before ancestors. + mu sync.Mutex `state:"nosave"` + + // Mappings of user/group IDs between this namespace and its parent. + // + // All ID maps, once set, cannot be changed. This means that successful + // UID/GID translations cannot be racy. + uidMapFromParent idMapSet + uidMapToParent idMapSet + gidMapFromParent idMapSet + gidMapToParent idMapSet + + // TODO: Consider supporting disabling setgroups(2), which "was + // added in Linux 3.19, but was backported to many earlier stable kernel + // series, because it addresses a security issue" - user_namespaces(7). (It + // was not backported to 3.11.10, which we are currently imitating.) +} + +// NewRootUserNamespace returns a UserNamespace that is appropriate for a +// system's root user namespace. +func NewRootUserNamespace() *UserNamespace { + var ns UserNamespace + // """ + // The initial user namespace has no parent namespace, but, for + // consistency, the kernel provides dummy user and group ID mapping files + // for this namespace. Looking at the uid_map file (gid_map is the same) + // from a shell in the initial namespace shows: + // + // $ cat /proc/$$/uid_map + // 0 0 4294967295 + // """ - user_namespaces(7) + for _, m := range []*idMapSet{ + &ns.uidMapFromParent, + &ns.uidMapToParent, + &ns.gidMapFromParent, + &ns.gidMapToParent, + } { + if !m.Add(idMapRange{0, math.MaxUint32}, 0) { + panic("Failed to insert into empty ID map") + } + } + return &ns +} + +// Root returns the root of the user namespace tree containing ns. +func (ns *UserNamespace) Root() *UserNamespace { + for ns.parent != nil { + ns = ns.parent + } + return ns +} + +// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user +// namespaces." 
- user_namespaces(7) +const maxUserNamespaceDepth = 32 + +func (ns *UserNamespace) depth() int { + var i int + for ns != nil { + i++ + ns = ns.parent + } + return i +} + +// NewChildUserNamespace returns a new user namespace created by a caller with +// credentials c. +func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { + if c.UserNamespace.depth() >= maxUserNamespaceDepth { + // "... Calls to unshare(2) or clone(2) that would cause this limit to + // be exceeded fail with the error EUSERS." - user_namespaces(7) + return nil, syserror.EUSERS + } + // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective + // user ID or the effective group ID of the caller does not have a mapping + // in the parent namespace (see user_namespaces(7))." - clone(2) + // "CLONE_NEWUSER requires that the user ID and group ID of the calling + // process are mapped to user IDs and group IDs in the user namespace of + // the calling process at the time of the call." - unshare(2) + if !c.EffectiveKUID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + if !c.EffectiveKGID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + return &UserNamespace{ + parent: c.UserNamespace, + owner: c.EffectiveKUID, + // "When a user namespace is created, it starts without a mapping of + // user IDs (group IDs) to the parent user namespace." - + // user_namespaces(7) + }, nil +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go new file mode 100644 index 000000000..261ca6f7a --- /dev/null +++ b/pkg/sentry/kernel/context.go @@ -0,0 +1,135 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the kernel package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCanTrace is a Context.Value key for a function with the same + // signature and semantics as kernel.Task.CanTrace. + CtxCanTrace contextID = iota + + // CtxKernel is a Context.Value key for a Kernel. + CtxKernel + + // CtxPIDNamespace is a Context.Value key for a PIDNamespace. + CtxPIDNamespace + + // CtxTask is a Context.Value key for a Task. + CtxTask + + // CtxUTSNamespace is a Context.Value key for a UTSNamespace. + CtxUTSNamespace + + // CtxIPCNamespace is a Context.Value key for a IPCNamespace. + CtxIPCNamespace +) + +// ContextCanTrace returns true if ctx is permitted to trace t, in the same sense +// as kernel.Task.CanTrace. +func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool { + if v := ctx.Value(CtxCanTrace); v != nil { + return v.(func(*Task, bool) bool)(t, attach) + } + return false +} + +// KernelFromContext returns the Kernel in which ctx is executing, or nil if +// there is no such Kernel. +func KernelFromContext(ctx context.Context) *Kernel { + if v := ctx.Value(CtxKernel); v != nil { + return v.(*Kernel) + } + return nil +} + +// PIDNamespaceFromContext returns the PID namespace in which ctx is executing, +// or nil if there is no such PID namespace. 
+func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace { + if v := ctx.Value(CtxPIDNamespace); v != nil { + return v.(*PIDNamespace) + } + return nil +} + +// UTSNamespaceFromContext returns the UTS namespace in which ctx is executing, +// or nil if there is no such UTS namespace. +func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { + if v := ctx.Value(CtxUTSNamespace); v != nil { + return v.(*UTSNamespace) + } + return nil +} + +// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, +// or nil if there is no such IPC namespace. +func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { + if v := ctx.Value(CtxIPCNamespace); v != nil { + return v.(*IPCNamespace) + } + return nil +} + +// TaskFromContext returns the Task associated with ctx, or nil if there is no +// such Task. +func TaskFromContext(ctx context.Context) *Task { + if v := ctx.Value(CtxTask); v != nil { + return v.(*Task) + } + return nil +} + +// AsyncContext returns a context.Context that may be used by goroutines that +// do work on behalf of t and therefore share its contextual values, but are +// not t's task goroutine (e.g. asynchronous I/O). +func (t *Task) AsyncContext() context.Context { + return taskAsyncContext{t: t} +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Value implements context.Context.Value. 
+func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD new file mode 100644 index 000000000..04651d961 --- /dev/null +++ b/pkg/sentry/kernel/epoll/BUILD @@ -0,0 +1,52 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "epoll_autogen_state", + srcs = [ + "epoll.go", + "epoll_state.go", + ], + out = "epoll_autogen_state.go", + package = "epoll", +) + +go_library( + name = "epoll", + srcs = [ + "epoll.go", + "epoll_autogen_state.go", + "epoll_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/ilist", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/waiter", + ], +) + +go_test( + name = "epoll_test", + size = "small", + srcs = [ + "epoll_test.go", + ], + embed = [":epoll"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs/filetest", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go new file mode 100644 index 000000000..b572fcd7e --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -0,0 +1,466 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package epoll provides an implementation of Linux's IO event notification +// facility. See epoll(7) for more details. +package epoll + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Event describes the event mask that was observed and the user data to be +// returned when one of the events occurs. It has this format to match the linux +// format to avoid extra copying/allocation when writing events to userspace. +type Event struct { + // Events is the event mask containing the set of events that have been + // observed on an entry. + Events uint32 + + // Data is an opaque 64-bit value provided by the caller when adding the + // entry, and returned to the caller when the entry reports an event. + Data [2]int32 +} + +// EntryFlags is a bitmask that holds an entry's flags. +type EntryFlags int + +// Valid entry flags. +const ( + OneShot EntryFlags = 1 << iota + EdgeTriggered +) + +// FileIdentifier identifies a file. We cannot use just the FD because it could +// potentially be reassigned. 
We also cannot use just the file pointer because +// it is possible to have multiple entries for the same file object as long as +// they are created with different FDs (i.e., the FDs point to the same file). +type FileIdentifier struct { + File *fs.File + Fd kdefs.FD +} + +// pollEntry holds all the state associated with an event poll entry, that is, +// a file being observed by an event poll object. +type pollEntry struct { + ilist.Entry + file *refs.WeakRef `state:"manual"` + id FileIdentifier `state:"wait"` + userData [2]int32 + waiter waiter.Entry `state:"manual"` + mask waiter.EventMask + flags EntryFlags + + epoll *EventPoll + + // We cannot save the current list pointer as it points into EventPoll + // struct, while state framework currently does not support such + // in-struct pointers. Instead, EventPoll will properly set this field + // in its loading logic. + curList *ilist.List `state:"nosave"` +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +// weakReferenceGone is called when the file in the weak reference is destroyed. +// The poll entry is removed in response to this. +func (p *pollEntry) WeakRefGone() { + p.epoll.RemoveEntry(p.id) +} + +// EventPoll holds all the state associated with an event poll object, that is, +// collection of files to observe and their current state. +type EventPoll struct { + fsutil.PipeSeek `state:"zerovalue"` + fsutil.NotDirReaddir `state:"zerovalue"` + fsutil.NoFsync `state:"zerovalue"` + fsutil.NoopFlush `state:"zerovalue"` + fsutil.NoMMap `state:"zerovalue"` + fsutil.NoIoctl `state:"zerovalue"` + + // Wait queue is used to notify interested parties when the event poll + // object itself becomes readable or writable. + waiter.Queue + + // files is the map of all the files currently being observed, it is + // protected by mu. + mu sync.Mutex `state:"nosave"` + files map[FileIdentifier]*pollEntry + + // listsMu protects manipulation of the lists below. 
It needs to be a + // different lock to avoid circular lock acquisition order involving + // the wait queue mutexes and mu. The full order is mu, observed file + // wait queue mutex, then listsMu; this allows listsMu to be acquired + // when readyCallback is called. + // + // An entry is always in one of the following lists: + // readyList -- when there's a chance that it's ready to have + // events delivered to epoll waiters. Given that being + // ready is a transient state, the Readiness() and + // readEvents() functions always call the entry's file + // Readiness() function to confirm it's ready. + // waitingList -- when there's no chance that the entry is ready, + // so it's waiting for the readyCallback to be called + // on it before it gets moved to the readyList. + // disabledList -- when the entry is disabled. This happens when + // a one-shot entry gets delivered via readEvents(). + listsMu sync.Mutex `state:"nosave"` + readyList ilist.List + waitingList ilist.List + disabledList ilist.List +} + +// cycleMu is used to serialize all the cycle checks. This is only used when +// an event poll file is added as an entry to another event poll. Such checks +// are serialized to avoid lock acquisition order inversion: if a thread is +// adding A to B, and another thread is adding B to A, each would acquire A's +// and B's mutexes in reverse order, and could cause deadlocks. Having this +// lock prevents this by allowing only one check at a time to happen. +// +// We do the cycle check to prevent callers from introducing potentially +// infinite recursions. If a caller were to add A to B and then B to A, for +// event poll A to know if it's readable, it would need to check event poll B, +// which in turn would need event poll A and so on indefinitely. +var cycleMu sync.Mutex + +// NewEventPoll allocates and initializes a new event poll object. +func NewEventPoll(ctx context.Context) *fs.File { + // name matches fs/eventpoll.c:epoll_create1. 
+ dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) + return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ + files: make(map[FileIdentifier]*pollEntry), + }) +} + +// Release implements fs.FileOperations.Release. +func (e *EventPoll) Release() { + // We need to take the lock now because files may be attempting to + // remove entries in parallel if they get destroyed. + e.mu.Lock() + defer e.mu.Unlock() + + // Go through all entries and clean up. + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + entry.file.Drop() + } +} + +// Read implements fs.FileOperations.Read. +func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// Write implements fs.FileOperations.Write. +func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// eventsAvailable determines if 'e' has events available for delivery. +func (e *EventPoll) eventsAvailable() bool { + e.listsMu.Lock() + + for it := e.readyList.Front(); it != nil; { + entry := it.(*pollEntry) + it = it.Next() + + // If the entry is ready, we know 'e' has at least one entry + // ready for delivery. + ready := entry.id.File.Readiness(entry.mask) + if ready != 0 { + e.listsMu.Unlock() + return true + } + + // Entry is not ready, so move it to waiting list. + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } + + e.listsMu.Unlock() + + return false +} + +// Readiness determines if the event poll object is currently readable (i.e., +// if there are pending events for delivery). +func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + if (mask&waiter.EventIn) != 0 && e.eventsAvailable() { + ready |= waiter.EventIn + } + + return ready +} + +// ReadEvents returns up to max available events. 
+func (e *EventPoll) ReadEvents(max int) []Event { + var local ilist.List + var ret []Event + + e.listsMu.Lock() + + // Go through all entries we believe may be ready. + for it := e.readyList.Front(); it != nil && len(ret) < max; { + entry := it.(*pollEntry) + it = it.Next() + + // Check the entry's readiness. It it's not really ready, we + // just put it back in the waiting list and move on to the next + // entry. + ready := entry.id.File.Readiness(entry.mask) & entry.mask + if ready == 0 { + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + + continue + } + + // Add event to the array that will be returned to caller. + ret = append(ret, Event{ + Events: uint32(ready), + Data: entry.userData, + }) + + // The entry is consumed, so we must move it to the disabled + // list in case it's one-shot, or back to the wait list if it's + // edge-triggered. If it's neither, we leave it in the ready + // list so that its readiness can be checked the next time + // around; however, we must move it to the end of the list so + // that other events can be delivered as well. + e.readyList.Remove(entry) + if entry.flags&OneShot != 0 { + e.disabledList.PushBack(entry) + entry.curList = &e.disabledList + } else if entry.flags&EdgeTriggered != 0 { + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } else { + local.PushBack(entry) + } + } + + e.readyList.PushBackList(&local) + + e.listsMu.Unlock() + + return ret +} + +// readyCallback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +type readyCallback struct{} + +// Callback implements waiter.EntryCallback.Callback. 
+func (*readyCallback) Callback(w *waiter.Entry) { + entry := w.Context.(*pollEntry) + e := entry.epoll + + e.listsMu.Lock() + + if entry.curList == &e.waitingList { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList + + e.Notify(waiter.EventIn) + } + + e.listsMu.Unlock() +} + +// initEntryReadiness initializes the entry's state with regards to its +// readiness by placing it in the appropriate list and registering for +// notifications. +func (e *EventPoll) initEntryReadiness(entry *pollEntry) { + // A new entry starts off in the waiting list. + e.listsMu.Lock() + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + e.listsMu.Unlock() + + // Register for event notifications. + f := entry.id.File + f.EventRegister(&entry.waiter, entry.mask) + + // Check if the file happens to already be in a ready state. + ready := f.Readiness(entry.mask) & entry.mask + if ready != 0 { + (*readyCallback).Callback(nil, &entry.waiter) + } +} + +// observes checks if event poll object e is directly or indirectly observing +// event poll object ep. It uses a bounded recursive depth-first search. +func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool { + // If we reached the maximum depth, we'll consider that we found it + // because we don't want to allow chains that are too long. + if depthLeft <= 0 { + return true + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Go through each observed file and check if it is or observes ep. + for id := range e.files { + f, ok := id.File.FileOperations.(*EventPoll) + if !ok { + continue + } + + if f == ep || f.observes(ep, depthLeft-1) { + return true + } + } + + return false +} + +// AddEntry adds a new file to the collection of files observed by e. +func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + // Acquire cycle check lock if another event poll is being added. 
+ ep, ok := id.File.FileOperations.(*EventPoll) + if ok { + cycleMu.Lock() + defer cycleMu.Unlock() + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file already has an entry. + if _, ok := e.files[id]; ok { + return syscall.EEXIST + } + + // Check if a cycle would be created. We use 4 as the limit because + // that's the value used by linux and we want to emulate it. + if ep != nil { + if e == ep { + return syscall.EINVAL + } + + if ep.observes(e, 4) { + return syscall.ELOOP + } + } + + // Create new entry and add it to map. + // + // N.B. Even though we are creating a weak reference here, we know it + // won't trigger a callback because we hold a reference to the file + // throughout the execution of this function. + entry := &pollEntry{ + id: id, + userData: data, + epoll: e, + flags: flags, + waiter: waiter.Entry{Callback: &readyCallback{}}, + mask: mask, + } + entry.waiter.Context = entry + e.files[id] = entry + entry.file = refs.NewWeakRef(id.File, entry) + + // Initialize the readiness state of the new entry. + e.initEntryReadiness(entry) + + return nil +} + +// UpdateEntry updates the flags, mask and user data associated with a file that +// is already part of the collection of observed files. +func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister the old mask and remove entry from the list it's in, so + // readyCallback is guaranteed to not be called on this entry anymore. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove entry from whatever list it's in. This ensure that no other + // threads have access to this entry as the only way left to find it + // is via e.files, but we hold e.mu, which prevents that. 
+ e.listsMu.Lock() + entry.curList.Remove(entry) + e.listsMu.Unlock() + + // Initialize new readiness state. + entry.flags = flags + entry.mask = mask + entry.userData = data + e.initEntryReadiness(entry) + + return nil +} + +// RemoveEntry a files from the collection of observed files. +func (e *EventPoll) RemoveEntry(id FileIdentifier) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister from file first so that no concurrent attempts will be + // made to manipulate the file. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove from the current list. + e.listsMu.Lock() + entry.curList.Remove(entry) + entry.curList = nil + e.listsMu.Unlock() + + // Remove file from map, and drop weak reference. + delete(e.files, id) + entry.file.Drop() + + return nil +} + +// UnregisterEpollWaiters removes the epoll waiter objects from the waiting +// queues. This is different from Release() as the file is not dereferenced. +func (e *EventPoll) UnregisterEpollWaiters() { + e.mu.Lock() + defer e.mu.Unlock() + + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go new file mode 100644 index 000000000..dabb32f49 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -0,0 +1,51 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// afterLoad is invoked by stateify. +func (p *pollEntry) afterLoad() { + p.waiter = waiter.Entry{Callback: &readyCallback{}} + p.waiter.Context = p + p.file = refs.NewWeakRef(p.id.File, p) + p.id.File.EventRegister(&p.waiter, p.mask) +} + +// afterLoad is invoked by stateify. +func (e *EventPoll) afterLoad() { + e.listsMu.Lock() + defer e.listsMu.Unlock() + + for _, ls := range []*ilist.List{&e.waitingList, &e.readyList, &e.disabledList} { + for it := ls.Front(); it != nil; it = it.Next() { + it.(*pollEntry).curList = ls + } + } + + for it := e.waitingList.Front(); it != nil; it = it.Next() { + p := it.(*pollEntry) + if p.id.File.Readiness(p.mask) != 0 { + e.waitingList.Remove(p) + e.readyList.PushBack(p) + p.curList = &e.readyList + e.Notify(waiter.EventIn) + } + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go new file mode 100644 index 000000000..bc869fc13 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -0,0 +1,54 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package epoll + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestFileDestroyed(t *testing.T) { + f := filetest.NewTestFile(t) + id := FileIdentifier{f, 12} + + efile := NewEventPoll(contexttest.Context(t)) + e := efile.FileOperations.(*EventPoll) + if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { + t.Fatalf("addEntry failed: %v", err) + } + + // Check that we get an event reported twice in a row. + evt := e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + evt = e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + // Destroy the file. Check that we get no more events. + f.DecRef() + + evt = e.ReadEvents(1) + if len(evt) != 0 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 0, len(evt)) + } + +} diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD new file mode 100644 index 000000000..2d5a3c693 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -0,0 +1,46 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "eventfd_state", + srcs = [ + "eventfd.go", + ], + out = "eventfd_state.go", + package = "eventfd", +) + +go_library( + name = "eventfd", + srcs = [ + "eventfd.go", + "eventfd_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + 
size = "small", + srcs = ["eventfd_test.go"], + embed = [":eventfd"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go new file mode 100644 index 000000000..c9333719e --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd provides an implementation of Linux's file-based event +// notification. +package eventfd + +import ( + "math" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// EventOperations represents an event with the semantics of Linux's file-based event +// notification (eventfd). +type EventOperations struct { + fsutil.NoopRelease `state:"nosave"` + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + // Mutex that protects accesses to the fields of this event. 
+ mu sync.Mutex `state:"nosave"` + + // Queue is used to notify interested parties when the event object + // becomes readable or writable. + waiter.Queue `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool +} + +// New creates a new event object with the supplied initial value and mode. +func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { + // name matches fs/eventfd.c:eventfd_file_create. + dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]") + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ + val: initVal, + semMode: semMode, + }) +} + +// Read implements fs.FileOperations.Read. +func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements fs.FileOperations.Write. +func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error { + e.mu.Lock() + + // We can't complete the read if the value is currently zero. + if e.val == 0 { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if e.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + e.val-- + } else { + val = e.val + e.val = 0 + } + + e.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. 
+ e.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return e.Signal(val) +} + +// Signal is an internal function to signal the event fd. +func (e *EventOperations) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + e.mu.Lock() + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-e.val { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + e.val += val + e.mu.Unlock() + + // Always trigger a notification. + e.Notify(waiter.EventIn) + + return nil +} + +// Readiness returns the ready events for the event fd. +func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + e.mu.Lock() + if e.val > 0 { + ready |= waiter.EventIn + } + + if e.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + e.mu.Unlock() + + return mask & ready +} diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go new file mode 100644 index 000000000..71326b62f --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -0,0 +1,78 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestEventfd(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, initVal, false) + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + event.EventRegister(&w, waiter.EventIn) + defer event.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := event.Writev(ctx, usermem.BytesIOSequence(data)) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. + select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventfdStat(t *testing.T) { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, 0, false) + + // Create and submit an stat request. + uattr, err := event.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + t.Fatalf("eventfd stat request failed: %v", err) + } + if uattr.Size != 0 { + t.Fatal("EventFD size should be 0") + } +} diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go new file mode 100644 index 000000000..ef73125fd --- /dev/null +++ b/pkg/sentry/kernel/fd_map.go @@ -0,0 +1,340 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "sort" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +// FDs is an ordering of FD's that can be made stable. +type FDs []kdefs.FD + +func (f FDs) Len() int { + return len(f) +} + +func (f FDs) Swap(i, j int) { + f[i], f[j] = f[j], f[i] +} + +func (f FDs) Less(i, j int) bool { + return f[i] < f[j] +} + +// FDFlags define flags for an individual descriptor. +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// descriptor holds the details about a file descriptor, namely a pointer the +// file itself and the descriptor flags. +type descriptor struct { + file *fs.File + flags FDFlags +} + +// FDMap is used to manage File references and flags. +type FDMap struct { + refs.AtomicRefCount + k *Kernel + files map[kdefs.FD]descriptor + mu sync.RWMutex `state:"nosave"` + uid uint64 +} + +// ID returns a unique identifier for this FDMap. +func (f *FDMap) ID() uint64 { + return f.uid +} + +// NewFDMap allocates a new FDMap that may be used by tasks in k. 
+func (k *Kernel) NewFDMap() *FDMap { + return &FDMap{ + k: k, + files: make(map[kdefs.FD]descriptor), + uid: atomic.AddUint64(&k.fdMapUids, 1), + } +} + +// destroy removes all of the file descriptors from the map. +func (f *FDMap) destroy() { + f.RemoveIf(func(*fs.File, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDMap) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDMap) Size() int { + f.mu.RLock() + defer f.mu.RUnlock() + + return len(f.files) +} + +// String is a stringer for FDMap. +func (f *FDMap) String() string { + f.mu.RLock() + defer f.mu.RUnlock() + + var b bytes.Buffer + for k, v := range f.files { + n, _ := v.file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n)) + } + return b.String() +} + +// NewFDFrom allocates a new FD guaranteed to be the lowest number available +// greater than or equal to from. This property is important as Unix programs +// tend to count on this allocation order. +func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) { + if fd < 0 { + // Don't accept negative FDs. + return 0, syscall.EINVAL + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Finds the lowest fd not in the handles map. + lim := limitSet.Get(limits.NumberOfFiles) + for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ { + if _, ok := f.files[i]; !ok { + file.IncRef() + f.files[i] = descriptor{file, flags} + return i, nil + } + } + + return -1, syscall.EMFILE +} + +// NewFDAt sets the file reference for the given FD. If there is an +// active reference for that FD, the ref count for that existing reference +// is decremented. +func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error { + if fd < 0 { + // Don't accept negative FDs. 
+ return syscall.EBADF + } + + // In this one case we do not do a defer of the Unlock. The + // reason is that we must have done all the work needed for + // discarding any old open file before we return to the + // caller. In other words, the DecRef(), below, must have + // completed by the time we return to the caller to ensure + // side effects are, in fact, effected. A classic example is + // dup2(fd1, fd2); if fd2 was already open, it must be closed, + // and we don't want to resume the caller until it is; we have + // to block on the DecRef(). Hence we can not just do a 'go + // oldfile.DecRef()', since there would be no guarantee that + // it would be done before we the caller resumed. Since we + // must wait for the DecRef() to finish, and that could take + // time, it's best to first call f.muUnlock beore so we are + // not blocking other uses of this FDMap on the DecRef() call. + f.mu.Lock() + oldDesc, oldExists := f.files[fd] + lim := limitSet.Get(limits.NumberOfFiles).Cur + // if we're closing one then the effective limit is one + // more than the actual limit. + if oldExists && lim != limits.Infinity { + lim++ + } + if lim != limits.Infinity && fd >= kdefs.FD(lim) { + f.mu.Unlock() + return syscall.EMFILE + } + + file.IncRef() + f.files[fd] = descriptor{file, flags} + f.mu.Unlock() + + if oldExists { + oldDesc.file.DecRef() + } + return nil +} + +// SetFlags sets the flags for the given file descriptor, if it is valid. +func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) { + f.mu.Lock() + defer f.mu.Unlock() + + desc, ok := f.files[fd] + if !ok { + return + } + + f.files[fd] = descriptor{desc.file, flags} +} + +// GetDescriptor returns a reference to the file and the flags for the FD. It +// bumps its reference count as well. It returns nil if there is no File +// for the FD, i.e. if the FD is invalid. The caller must use DecRef +// when they are done. 
+func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) { + f.mu.RLock() + defer f.mu.RUnlock() + + if desc, ok := f.files[fd]; ok { + desc.file.IncRef() + return desc.file, desc.flags + } + return nil, FDFlags{} +} + +// GetFile returns a reference to the File for the FD and bumps +// its reference count as well. It returns nil if there is no File +// for the FD, i.e. if the FD is invalid. The caller must use DecRef +// when they are done. +func (f *FDMap) GetFile(fd kdefs.FD) *fs.File { + f.mu.RLock() + if desc, ok := f.files[fd]; ok { + desc.file.IncRef() + f.mu.RUnlock() + return desc.file + } + f.mu.RUnlock() + return nil +} + +// fds returns an ordering of FDs. +func (f *FDMap) fds() FDs { + fds := make(FDs, 0, len(f.files)) + for fd := range f.files { + fds = append(fds, fd) + } + sort.Sort(fds) + return fds +} + +// GetFDs returns a list of valid fds. +func (f *FDMap) GetFDs() FDs { + f.mu.RLock() + defer f.mu.RUnlock() + return f.fds() +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDMap) GetRefs() []*fs.File { + f.mu.RLock() + defer f.mu.RUnlock() + + fds := f.fds() + fs := make([]*fs.File, 0, len(fds)) + for _, fd := range fds { + desc := f.files[fd] + desc.file.IncRef() + fs = append(fs, desc.file) + } + return fs +} + +// Fork returns an independent FDMap pointing to the same descriptors. +func (f *FDMap) Fork() *FDMap { + f.mu.RLock() + defer f.mu.RUnlock() + + clone := f.k.NewFDMap() + + // Grab a extra reference for every file. + for fd, desc := range f.files { + desc.file.IncRef() + clone.files[fd] = desc + } + + // That's it! + return clone +} + +// unlock releases all file locks held by this FDMap's uid. Must only be +// called on a non-nil *fs.File. 
+func (f *FDMap) unlock(file *fs.File) { + id := lock.UniqueID(f.ID()) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF}) +} + +// inotifyFileClose generates the appropriate inotify events for f being closed. +func inotifyFileClose(f *fs.File) { + var ev uint32 + d := f.Dirent + + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + + if f.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + + d.InotifyEvent(ev, 0) +} + +// Remove removes an FD from the FDMap, and returns (File, true) if a File +// one was found. Callers are expected to decrement the reference count on +// the File. Otherwise returns (nil, false). +func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) { + f.mu.Lock() + desc := f.files[fd] + delete(f.files, fd) + f.mu.Unlock() + if desc.file != nil { + f.unlock(desc.file) + inotifyFileClose(desc.file) + return desc.file, true + } + return nil, false +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) { + var removed []*fs.File + f.mu.Lock() + for fd, desc := range f.files { + if desc.file != nil && cond(desc.file, desc.flags) { + delete(f.files, fd) + removed = append(removed, desc.file) + } + } + f.mu.Unlock() + + for _, file := range removed { + f.unlock(file) + inotifyFileClose(file) + file.DecRef() + } +} diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go new file mode 100644 index 000000000..e1ac900e8 --- /dev/null +++ b/pkg/sentry/kernel/fd_map_test.go @@ -0,0 +1,134 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +const ( + // maxFD is the maximum FD to try to create in the map. + // This number of open files has been seen in the wild. + maxFD = 2 * 1024 +) + +func newTestFDMap() *FDMap { + return &FDMap{ + files: make(map[kdefs.FD]descriptor), + } +} + +// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap, +// until there is no room, then makes sure that NewFDAt works +// and also that if we remove one and add one that works too. +func TestFDMapMany(t *testing.T) { + file := filetest.NewTestFile(t) + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) + + f := newTestFDMap() + for i := 0; i < maxFD; i++ { + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) + } + } + + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error") + } + + if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } +} + +// TestFDMap does a set of simple tests to make sure simple adds, +// removes, GetRefs, and DecRefs work. The ordering is just weird +// enough that a table-driven approach seemed clumsy. 
+func TestFDMap(t *testing.T) { + file := filetest.NewTestFile(t) + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}) + + f := newTestFDMap() + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) + } + + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") + } + + largeLimit := limits.Limit{maxFD, maxFD} + limitSet.Set(limits.NumberOfFiles, largeLimit) + + if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if fd != kdefs.FD(1) { + t.Fatalf("Added an FD to a resized map: got %v, want 1", fd) + } + + if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) + } + + if ref := f.GetFile(1); ref == nil { + t.Fatalf("f.GetFile(1): got nil, wanted %v", file) + } + + if ref := f.GetFile(2); ref != nil { + t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref) + } + + ref, ok := f.Remove(1) + if !ok { + t.Fatalf("f.Remove(1) for an existing FD: failed, want success") + } + ref.DecRef() + + if ref, ok := f.Remove(1); ok { + ref.DecRef() + t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") + } + +} + +func TestDescriptorFlags(t *testing.T) { + file := filetest.NewTestFile(t) + f := newTestFDMap() + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) + + if err := f.NewFDAt(2, file, FDFlags{CloseOnExec: true}, limitSet); err != nil { + t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) + } + + newFile, flags := 
f.GetDescriptor(2) + if newFile == nil { + t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile) + } + + if !flags.CloseOnExec { + t.Fatalf("new File flags %d don't match original %d\n", flags, 0) + } +} diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go new file mode 100644 index 000000000..9aa6fa951 --- /dev/null +++ b/pkg/sentry/kernel/fs_context.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// FSContext contains filesystem context. +// +// This includes umask and working directory. +type FSContext struct { + refs.AtomicRefCount + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // root is the filesystem root. Will be nil iff the FSContext has been + // destroyed. + root *fs.Dirent + + // cwd is the current working directory. Will be nil iff the FSContext + // has been destroyed. + cwd *fs.Dirent + + // umask is the current file mode creation mask. When a thread using this + // context invokes a syscall that creates a file, bits set in umask are + // removed from the permissions that the file is created with. + umask uint +} + +// newFSContext returns a new filesystem context. 
+func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + return &FSContext{ + root: root, + cwd: cwd, + umask: umask, + } +} + +// destroy is the destructor for an FSContext. +// +// This will call DecRef on both root and cwd Dirents. If either call to +// DecRef returns an error, then it will be propigated. If both calls to +// DecRef return an error, then the one from root.DecRef will be propigated. +// +// Note that there may still be calls to WorkingDirectory() or RootDirectory() +// (that return nil). This is because valid references may still be held via +// proc files or other mechanisms. +func (f *FSContext) destroy() { + f.root.DecRef() + f.root = nil + + f.cwd.DecRef() + f.cwd = nil +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FSContext) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Fork forks this FSContext. +// +// This is not a valid call after destroy. +func (f *FSContext) Fork() *FSContext { + f.mu.Lock() + defer f.mu.Unlock() + f.cwd.IncRef() + f.root.IncRef() + return &FSContext{ + cwd: f.cwd, + root: f.root, + umask: f.umask, + } +} + +// WorkingDirectory returns the current working directory. +// You should call DecRef on the returned Dirent when finished. +// +// This will return nil if called after destroy(). +func (f *FSContext) WorkingDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + if f.cwd != nil { + f.cwd.IncRef() + } + return f.cwd +} + +// SetWorkingDirectory sets the current working directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after destroy. 
+func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetWorkingDirectory called with nil dirent") + } + if f.cwd == nil { + panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d)) + } + f.mu.Lock() + defer f.mu.Unlock() + old := f.cwd + f.cwd = d + d.IncRef() + old.DecRef() +} + +// RootDirectory returns the current filesystem root. +// You should call DecRef on the returned Dirent when finished. +// +// This will return nil if called after destroy(). +func (f *FSContext) RootDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + f.root.IncRef() + return f.root +} + +// SetRootDirectory sets the root directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetRootDirectory called with nil dirent") + } + if f.root == nil { + panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d)) + } + f.mu.Lock() + defer f.mu.Unlock() + old := f.root + f.root = d + d.IncRef() + old.DecRef() +} + +// Umask returns the current umask. +func (f *FSContext) Umask() uint { + f.mu.Lock() + defer f.mu.Unlock() + return f.umask +} + +// SwapUmask atomically sets the current umask and returns the old umask. 
+func (f *FSContext) SwapUmask(mask uint) uint { + f.mu.Lock() + defer f.mu.Unlock() + old := f.umask + f.umask = mask + return old +} diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD new file mode 100644 index 000000000..de9897c58 --- /dev/null +++ b/pkg/sentry/kernel/futex/BUILD @@ -0,0 +1,48 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "futex", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Waiter", + }, +) + +go_stateify( + name = "futex_state", + srcs = [ + "futex.go", + "waiter_list.go", + ], + out = "futex_state.go", + package = "futex", +) + +go_library( + name = "futex", + srcs = [ + "futex.go", + "futex_state.go", + "waiter_list.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/state", + "//pkg/syserror", + ], +) + +go_test( + name = "futex_test", + size = "small", + srcs = ["futex_test.go"], + embed = [":futex"], +) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go new file mode 100644 index 000000000..b3ba57a2c --- /dev/null +++ b/pkg/sentry/kernel/futex/futex.go @@ -0,0 +1,405 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package futex provides an implementation of the futex interface as found in +// the Linux kernel. It allows one to easily transform Wait() calls into waits +// on a channel, which is useful in a Go-based kernel, for example. +package futex + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Checker abstracts memory accesses. This is useful because the "addresses" +// used in this package may not be real addresses (they could be indices of an +// array, for example), or they could be mapped via some special mechanism. +// +// TODO: Replace this with usermem.IO. +type Checker interface { + // Check should validate that given address contains the given value. + // If it does not contain the value, syserror.EAGAIN must be returned. + // Any other error may be returned, which will be propagated. + Check(addr uintptr, val uint32) error + + // Op should atomically perform the operation encoded in op on the data + // pointed to by addr, then apply the comparison encoded in op to the + // original value at addr, returning the result. + // Note that op is an opaque operation whose behaviour is defined + // outside of the futex manager. + Op(addr uintptr, op uint32) (bool, error) +} + +// Waiter is the struct which gets enqueued into buckets for wake up routines +// and requeue routines to scan and notify. Once a Waiter has been enqueued by +// WaitPrepare(), callers may listen on C for wake up events. +type Waiter struct { + // Synchronization: + // + // - A Waiter that is not enqueued in a bucket is exclusively owned (no + // synchronization applies). + // + // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this, + // waiterEntry, complete, and addr are protected by the bucket.mu ("bucket + // lock") of the containing bucket, and bitmask is immutable. 
complete and + // addr are additionally mutated using atomic memory operations, ensuring + // that they can be read using atomic memory operations without holding the + // bucket lock. + // + // - A Waiter is only guaranteed to be no longer queued after calling + // WaitComplete(). + + // waiterEntry links Waiter into bucket.waiters. + waiterEntry + + // complete is 1 if the Waiter was removed from its bucket by a wakeup and + // 0 otherwise. + complete int32 + + // C is sent to when the Waiter is woken. + C chan struct{} + + // addr is the address being waited on. + addr uintptr + + // The bitmask we're waiting on. + // This is used the case of a FUTEX_WAKE_BITSET. + bitmask uint32 +} + +// NewWaiter returns a new unqueued Waiter. +func NewWaiter() *Waiter { + return &Waiter{ + C: make(chan struct{}, 1), + } +} + +// bucket holds a list of waiters for a given address hash. +type bucket struct { + // mu protects waiters and contained Waiter state. See comment in Waiter. + mu sync.Mutex `state:"nosave"` + + waiters waiterList `state:"zerovalue"` +} + +// wakeLocked wakes up to n waiters matching the bitmask at the addr for this +// bucket and returns the number of waiters woken. +// +// Preconditions: b.mu must be locked. +func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if w.addr != addr || w.bitmask&bitmask == 0 { + // Not matching. + w = w.Next() + continue + } + + // Remove from the bucket and wake the waiter. + woke := w + w = w.Next() // Next iteration. + b.waiters.Remove(woke) + woke.C <- struct{}{} + + // NOTE: The above channel write establishes a write barrier + // according to the memory model, so nothing may be ordered + // around it. Since we've dequeued w and will never touch it + // again, we can safely store 1 to w.complete here and allow + // the WaitComplete() to short-circuit grabbing the bucket + // lock. 
If they somehow miss the w.complete, we are still + // holding the lock, so we can know that they won't dequeue w, + // assume it's free and have the below operation afterwards. + atomic.StoreInt32(&woke.complete, 1) + done++ + } + return done +} + +// requeueLocked takes n waiters from the bucket and moves them to naddr on the +// bucket "to". +// +// Preconditions: b and to must be locked. +func (b *bucket) requeueLocked(to *bucket, addr, naddr uintptr, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if w.addr != addr { + // Not matching. + w = w.Next() + continue + } + + requeued := w + w = w.Next() // Next iteration. + b.waiters.Remove(requeued) + atomic.StoreUintptr(&requeued.addr, naddr) + to.waiters.PushBack(requeued) + done++ + } + return done +} + +const ( + // bucketCount is the number of buckets per Manager. By having many of + // these we reduce contention when concurrent yet unrelated calls are made. + bucketCount = 1 << bucketCountBits + bucketCountBits = 10 +) + +func checkAddr(addr uintptr) error { + // Ensure the address is aligned. + // It must be a DWORD boundary. + if addr&0x3 != 0 { + return syserror.EINVAL + } + + return nil +} + +// bucketIndexForAddr returns the index into Manager.buckets for addr. +func bucketIndexForAddr(addr uintptr) uintptr { + // - The bottom 2 bits of addr must be 0, per checkAddr. + // + // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 + // for a canonical address, and (on all existing platforms) bit 47 must be + // 0 for an application address. + // + // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful" + // bits. We choose one of the simplest possible hash functions that at + // least uses all 45 useful bits in the output, given that bucketCountBits + // == 10. 
This hash function also has the property that it will usually map + // adjacent addresses to adjacent buckets, slightly improving memory + // locality when an application synchronization structure uses multiple + // nearby futexes. + // + // Note that despite the large number of arithmetic operations in the + // function, many components can be computed in parallel, such that the + // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This + // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + + // (addr >> 42)" without any additional grouping, the compiler puts all 4 + // additions in the critical path. + h1 := (addr >> 2) + (addr >> 12) + (addr >> 22) + h2 := (addr >> 32) + (addr >> 42) + return (h1 + h2) % bucketCount +} + +// Manager holds futex state for a single virtual address space. +type Manager struct { + buckets [bucketCount]bucket +} + +// NewManager returns an initialized futex manager. +// N.B. we use virtual address to tag futexes, so it only works for private +// (within a single process) futex. +func NewManager() *Manager { + return &Manager{} +} + +// lockBucket returns a locked bucket for the given addr. +// +// Preconditions: checkAddr(addr) == nil. +func (m *Manager) lockBucket(addr uintptr) *bucket { + b := &m.buckets[bucketIndexForAddr(addr)] + b.mu.Lock() + return b +} + +// lockBuckets returns locked buckets for the given addrs. +// +// Preconditions: checkAddr(addr1) == checkAddr(addr2) == nil. +func (m *Manager) lockBuckets(addr1 uintptr, addr2 uintptr) (*bucket, *bucket) { + i1 := bucketIndexForAddr(addr1) + i2 := bucketIndexForAddr(addr2) + b1 := &m.buckets[i1] + b2 := &m.buckets[i2] + + // Ensure that buckets are locked in a consistent order (lowest index + // first) to avoid circular locking. 
+ switch { + case i1 < i2: + b1.mu.Lock() + b2.mu.Lock() + case i2 < i1: + b2.mu.Lock() + b1.mu.Lock() + default: + b1.mu.Lock() + } + + return b1, b2 +} + +// Wake wakes up to n waiters matching the bitmask on the given addr. +// The number of waiters woken is returned. +func (m *Manager) Wake(addr uintptr, bitmask uint32, n int) (int, error) { + if err := checkAddr(addr); err != nil { + return 0, err + } + + b := m.lockBucket(addr) + // This function is very hot; avoid defer. + r := b.wakeLocked(addr, bitmask, n) + b.mu.Unlock() + return r, nil +} + +func (m *Manager) doRequeue(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) { + if err := checkAddr(addr); err != nil { + return 0, err + } + if err := checkAddr(naddr); err != nil { + return 0, err + } + + b1, b2 := m.lockBuckets(addr, naddr) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + // Check our value. + // This only applied for RequeueCmp(). + if c != nil { + if err := c.Check(addr, val); err != nil { + return 0, err + } + } + + // Wake the number required. + done := b1.wakeLocked(addr, ^uint32(0), nwake) + + // Requeue the number required. + b1.requeueLocked(b2, addr, naddr, nreq) + + return done, nil +} + +// Requeue wakes up to nwake waiters on the given addr, and unconditionally +// requeues up to nreq waiters on naddr. +func (m *Manager) Requeue(addr uintptr, naddr uintptr, nwake int, nreq int) (int, error) { + return m.doRequeue(nil, addr, 0, naddr, nwake, nreq) +} + +// RequeueCmp atomically checks that the addr contains val (via the Checker), +// wakes up to nwake waiters on addr and then unconditionally requeues nreq +// waiters on naddr. 
+func (m *Manager) RequeueCmp(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) { + return m.doRequeue(c, addr, val, naddr, nwake, nreq) +} + +// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 +// waiters unconditionally from addr1, and, based on the original value at addr2 +// and a comparison encoded in op, wakes up to nwake2 waiters from addr2. +// It returns the total number of waiters woken. +func (m *Manager) WakeOp(c Checker, addr1 uintptr, addr2 uintptr, nwake1 int, nwake2 int, op uint32) (int, error) { + if err := checkAddr(addr1); err != nil { + return 0, err + } + if err := checkAddr(addr2); err != nil { + return 0, err + } + + b1, b2 := m.lockBuckets(addr1, addr2) + + done := 0 + cond, err := c.Op(addr2, op) + if err == nil { + // Wake up up to nwake1 entries from the first bucket. + done = b1.wakeLocked(addr1, ^uint32(0), nwake1) + + // Wake up up to nwake2 entries from the second bucket if the + // operation yielded true. + if cond { + done += b2.wakeLocked(addr2, ^uint32(0), nwake2) + } + } + + b1.mu.Unlock() + if b2 != b1 { + b2.mu.Unlock() + } + return done, err +} + +// WaitPrepare atomically checks that addr contains val (via the Checker), then +// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the +// Waiter must be subsequently removed by calling WaitComplete, whether or not +// a wakeup is received on w.C. +func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, val uint32, bitmask uint32) error { + if err := checkAddr(addr); err != nil { + return err + } + + // Prepare the Waiter before taking the bucket lock. + w.complete = 0 + select { + case <-w.C: + default: + } + w.addr = addr + w.bitmask = bitmask + + b := m.lockBucket(addr) + // This function is very hot; avoid defer. + + // Perform our atomic check. + if err := c.Check(addr, val); err != nil { + b.mu.Unlock() + return err + } + + // Add the waiter to the bucket. 
+ b.waiters.PushBack(w) + + b.mu.Unlock() + return nil +} + +// WaitComplete must be called when a Waiter previously added by WaitPrepare is +// no longer eligible to be woken. +func (m *Manager) WaitComplete(w *Waiter) { + // Can we short-circuit acquiring the lock? + // This is the happy path where a notification + // was received and we don't need to dequeue this + // waiter from any list (or take any locks). + if atomic.LoadInt32(&w.complete) != 0 { + return + } + + // Take the bucket lock. Note that without holding the bucket lock, the + // waiter is not guaranteed to stay in that bucket, so after we take the + // bucket lock, we must ensure that the bucket hasn't changed: if it + // happens to have changed, we release the old bucket lock and try again + // with the new bucket; if it hasn't changed, we know it won't change now + // because we hold the lock. + var b *bucket + for { + addr := atomic.LoadUintptr(&w.addr) + b = m.lockBucket(addr) + // We still have to use an atomic load here, because if w was racily + // requeued then w.addr is not protected by b.mu. + if addr == atomic.LoadUintptr(&w.addr) { + break + } + b.mu.Unlock() + } + + // Remove waiter from the bucket. w.complete can only be stored with b.mu + // locked, so this load doesn't need to use sync/atomic. + if w.complete == 0 { + b.waiters.Remove(w) + } + b.mu.Unlock() +} diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go new file mode 100644 index 000000000..7b81358ec --- /dev/null +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -0,0 +1,500 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package futex + +import ( + "math" + "runtime" + "sync" + "sync/atomic" + "syscall" + "testing" + "unsafe" +) + +const ( + testMutexSize = 4 + testMutexLocked uint32 = 1 + testMutexUnlocked uint32 = 0 +) + +// testData implements the Checker interface, and allows us to +// treat the address passed for futex operations as an index in +// a byte slice for testing simplicity. +type testData []byte + +func newTestData(size uint) testData { + return make([]byte, size) +} + +func (t testData) Check(addr uintptr, val uint32) error { + if val != atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))) { + return syscall.EAGAIN + } + return nil +} + +func (t testData) Op(addr uintptr, val uint32) (bool, error) { + return val == 0, nil +} + +// testMutex ties together a testData slice, an address, and a +// futex manager in order to implement the sync.Locker interface. +// Beyond being used as a Locker, this is a simple mechanism for +// changing the underlying values for simpler tests. +type testMutex struct { + a uintptr + d testData + m *Manager +} + +func newTestMutex(addr uintptr, d testData, m *Manager) *testMutex { + return &testMutex{a: addr, d: d, m: m} +} + +// Lock acquires the testMutex. +// This may wait for it to be available via the futex manager. +func (t *testMutex) Lock() { + for { + // Attempt to grab the lock. + if atomic.CompareAndSwapUint32( + ((*uint32)(unsafe.Pointer(&t.d[t.a]))), + testMutexUnlocked, + testMutexLocked) { + // Lock held. + return + } + + // Wait for it to be "not locked". 
+ w := NewWaiter() + err := t.m.WaitPrepare(w, t.d, t.a, testMutexLocked, ^uint32(0)) + if err == syscall.EAGAIN { + continue + } + if err != nil { + // Should never happen. + panic("WaitPrepare returned unexpected error: " + err.Error()) + } + <-w.C + t.m.WaitComplete(w) + } +} + +// Unlock releases the testMutex. +// This will notify any waiters via the futex manager. +func (t *testMutex) Unlock() { + // Unlock. + atomic.StoreUint32(((*uint32)(unsafe.Pointer(&t.d[t.a]))), testMutexUnlocked) + + // Notify all waiters. + t.m.Wake(t.a, ^uint32(0), math.MaxInt32) +} + +func TestFutexWake(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. + // (This won't trigger the wake in testMutex) + w := NewWaiter() + m.WaitPrepare(w, d, 0, testMutexUnlocked, ^uint32(0)) + + // Wake the single thread. + if _, err := m.Wake(0, ^uint32(0), 1); err != nil { + t.Error("wake error:", err) + } + + <-w.C + m.WaitComplete(w) +} + +func TestFutexWakeBitmask(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. + // (This won't trigger the wake in testMutex) + w := NewWaiter() + m.WaitPrepare(w, d, 0, testMutexUnlocked, 0x0000ffff) + + // Wake the single thread, not using the bitmask. + if _, err := m.Wake(0, 0xffff0000, 1); err != nil { + t.Error("wake non-matching bitmask error:", err) + } + + select { + case <-w.C: + t.Error("w is alive?") + default: + } + + // Now use a matching bitmask. + if _, err := m.Wake(0, 0x00000001, 1); err != nil { + t.Error("wake matching bitmask error:", err) + } + + <-w.C + m.WaitComplete(w) +} + +func TestFutexWakeTwo(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. 
+ // (This won't trigger the wake in testMutex) + w1 := NewWaiter() + w2 := NewWaiter() + w3 := NewWaiter() + m.WaitPrepare(w1, d, 0, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w2, d, 0, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w3, d, 0, testMutexUnlocked, ^uint32(0)) + + // Wake exactly two threads. + if _, err := m.Wake(0, ^uint32(0), 2); err != nil { + t.Error("wake error:", err) + } + + // Ensure exactly two are alive. + // We don't get guarantees about exactly which two, + // (although we expect them to be w1 and w2). + awake := 0 + for { + select { + case <-w1.C: + awake++ + case <-w2.C: + awake++ + case <-w3.C: + awake++ + default: + if awake != 2 { + t.Error("awake != 2?") + } + + // Success. + return + } + } +} + +func TestFutexWakeUnrelated(t *testing.T) { + m := NewManager() + d := newTestData(2 * testMutexSize) + + // Wait for it to be locked. + w1 := NewWaiter() + w2 := NewWaiter() + m.WaitPrepare(w1, d, 0*testMutexSize, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w2, d, 1*testMutexSize, testMutexUnlocked, ^uint32(0)) + + // Wake only the second one. + if _, err := m.Wake(1*testMutexSize, ^uint32(0), 2); err != nil { + t.Error("wake error:", err) + } + + // Ensure only r2 is alive. + select { + case <-w1.C: + t.Error("w1 is alive?") + default: + } + <-w2.C +} + +// This function was shamelessly stolen from mutex_test.go. 
+func HammerMutex(l sync.Locker, loops int, cdone chan bool) { + for i := 0; i < loops; i++ { + l.Lock() + runtime.Gosched() + l.Unlock() + } + cdone <- true +} + +func TestFutexStress(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + tm := newTestMutex(0*testMutexSize, d, m) + c := make(chan bool) + + for i := 0; i < 10; i++ { + go HammerMutex(tm, 1000, c) + } + + for i := 0; i < 10; i++ { + <-c + } +} + +func TestWakeOpEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 0 { + t.Fatalf("Invalid number of wakes: want 0, got %d", n) + } +} + +func TestWakeOpFirstNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 0. + n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSecondNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 4. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 4. 
+ n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 4. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 4. + n, err := m.WakeOp(d, 0, 4, 10, 10, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 0 { + t.Fatalf("Invalid number of wakes: want 0, got %d", n) + } +} + +func TestWakeOpAllNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Add two waiters on address 4. + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Wake up all waiters on both addresses. 
+ n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 4 { + t.Fatalf("Invalid number of wakes: want 4, got %d", n) + } +} + +func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Add two waiters on address 4. + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Wake up all waiters on both addresses. + n, err := m.WakeOp(d, 0, 4, 10, 10, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSameAddress(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. 
+ w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Use the same address, with one at most one waiter from each. + n, err := m.WakeOp(d, 0, 0, 1, 1, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSameAddressFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Use the same address, with one at most one waiter from each. 
+ n, err := m.WakeOp(d, 0, 0, 1, 1, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 1 { + t.Fatalf("Invalid number of wakes: want 1, got %d", n) + } +} diff --git a/pkg/sentry/kernel/g3doc/run_states.dot b/pkg/sentry/kernel/g3doc/run_states.dot new file mode 100644 index 000000000..7861fe1f5 --- /dev/null +++ b/pkg/sentry/kernel/g3doc/run_states.dot @@ -0,0 +1,99 @@ +digraph { + subgraph { + App; + } + subgraph { + Interrupt; + InterruptAfterSignalDeliveryStop; + } + subgraph { + Syscall; + SyscallAfterPtraceEventSeccomp; + SyscallEnter; + SyscallAfterSyscallEnterStop; + SyscallAfterSysemuStop; + SyscallInvoke; + SyscallAfterPtraceEventClone; + SyscallAfterExecStop; + SyscallAfterVforkStop; + SyscallReinvoke; + SyscallExit; + } + subgraph { + Vsyscall; + VsyscallAfterPtraceEventSeccomp; + VsyscallInvoke; + } + subgraph { + Exit; + ExitMain; // leave thread group, release resources, reparent children, kill PID namespace and wait if TGID 1 + ExitNotify; // signal parent/tracer, become waitable + ExitDone; // represented by t.runState == nil + } + + // Task exit + Exit -> ExitMain; + ExitMain -> ExitNotify; + ExitNotify -> ExitDone; + + // Execution of untrusted application code + App -> App; + + // Interrupts (usually signal delivery) + App -> Interrupt; + Interrupt -> Interrupt; // if other interrupt conditions may still apply + Interrupt -> Exit; // if killed + + // Syscalls + App -> Syscall; + Syscall -> SyscallEnter; + SyscallEnter -> SyscallInvoke; + SyscallInvoke -> SyscallExit; + SyscallExit -> App; + + // exit, exit_group + SyscallInvoke -> Exit; + + // execve + SyscallInvoke -> SyscallAfterExecStop; + SyscallAfterExecStop -> SyscallExit; + SyscallAfterExecStop -> App; // fatal signal pending + + // vfork + SyscallInvoke -> SyscallAfterVforkStop; + SyscallAfterVforkStop -> SyscallExit; + + // Vsyscalls + App -> Vsyscall; + Vsyscall -> VsyscallInvoke; + Vsyscall -> App; // fault while reading return address from stack + VsyscallInvoke 
-> App; + + // ptrace-specific branches + Interrupt -> InterruptAfterSignalDeliveryStop; + InterruptAfterSignalDeliveryStop -> Interrupt; + SyscallEnter -> SyscallAfterSyscallEnterStop; + SyscallAfterSyscallEnterStop -> SyscallInvoke; + SyscallAfterSyscallEnterStop -> SyscallExit; // skipped by tracer + SyscallAfterSyscallEnterStop -> App; // fatal signal pending + SyscallEnter -> SyscallAfterSysemuStop; + SyscallAfterSysemuStop -> SyscallExit; + SyscallAfterSysemuStop -> App; // fatal signal pending + SyscallInvoke -> SyscallAfterPtraceEventClone; + SyscallAfterPtraceEventClone -> SyscallExit; + SyscallAfterPtraceEventClone -> SyscallAfterVforkStop; + + // seccomp + Syscall -> App; // SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_KILL, SECCOMP_RET_TRACE without tracer + Syscall -> SyscallAfterPtraceEventSeccomp; // SECCOMP_RET_TRACE + SyscallAfterPtraceEventSeccomp -> SyscallEnter; + SyscallAfterPtraceEventSeccomp -> SyscallExit; // skipped by tracer + SyscallAfterPtraceEventSeccomp -> App; // fatal signal pending + Vsyscall -> VsyscallAfterPtraceEventSeccomp; + VsyscallAfterPtraceEventSeccomp -> VsyscallInvoke; + VsyscallAfterPtraceEventSeccomp -> App; + + // Autosave + SyscallInvoke -> SyscallReinvoke; + SyscallReinvoke -> SyscallInvoke; +} diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go new file mode 100644 index 000000000..78737f58f --- /dev/null +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -0,0 +1,43 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore" +) + +// IPCNamespace represents an IPC namespace. +type IPCNamespace struct { + semaphores *semaphore.Registry +} + +// NewIPCNamespace creates a new IPC namespace. +func NewIPCNamespace() *IPCNamespace { + return &IPCNamespace{ + semaphores: semaphore.NewRegistry(), + } +} + +// SemaphoreRegistry returns the semanphore set registry for this namespace. +func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { + return i.semaphores +} + +// IPCNamespace returns the task's IPC namespace. +func (t *Task) IPCNamespace() *IPCNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.ipcns +} diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD new file mode 100644 index 000000000..b6c00042a --- /dev/null +++ b/pkg/sentry/kernel/kdefs/BUILD @@ -0,0 +1,10 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "kdefs", + srcs = ["kdefs.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", + visibility = ["//:sandbox"], +) diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go new file mode 100644 index 000000000..bbb476544 --- /dev/null +++ b/pkg/sentry/kernel/kdefs/kdefs.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kdefs defines common kernel definitions. +// +package kdefs + +// FD is a File Descriptor. +type FD int32 diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go new file mode 100644 index 000000000..0932965e0 --- /dev/null +++ b/pkg/sentry/kernel/kernel.go @@ -0,0 +1,957 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernel provides an emulation of the Linux kernel. +// +// See README.md for a detailed overview. +// +// Lock order (outermost locks must be taken first): +// +// Kernel.extMu +// TaskSet.mu +// SignalHandlers.mu +// Task.mu +// +// Locking SignalHandlers.mu in multiple SignalHandlers requires locking +// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same +// time requires locking all of their signal mutexes first. 
+package kernel + +import ( + "fmt" + "io" + "path/filepath" + "sync" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/state" +) + +// Kernel represents an emulated Linux kernel. It must be initialized by calling +// Init() or LoadFrom(). +type Kernel struct { + // extMu serializes external changes to the Kernel with calls to + // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel + // remains frozen for the duration of the call; it requires that the Kernel + // is paused as a precondition, which ensures that none of the tasks + // running within the Kernel can affect its state, but extMu is required to + // ensure that concurrent users of the Kernel *outside* the Kernel's + // control cannot affect its state by calling e.g. + // Kernel.SendExternalSignal.) + extMu sync.Mutex `state:"nosave"` + + // started is true if Start has been called. 
Unless otherwise specified, + // all Kernel fields become immutable once started becomes true. + started bool `state:"nosave"` + + // All of the following fields are immutable unless otherwise specified. + + // Platform is the platform that is used to execute tasks in the + // created Kernel. It is embedded so that Kernel can directly serve as + // Platform in mm logic and also serve as platform.MemoryProvider in + // filemem S/R logic. + platform.Platform `state:"nosave"` + + // See InitKernelArgs for the meaning of these fields. + featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + networkStack inet.Stack `state:"nosave"` + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + + // mounts holds the state of the virtual filesystem. mounts is initially + // nil, and must be set by calling Kernel.SetRootMountNamespace before + // Kernel.CreateProcess can succeed. + mounts *fs.MountNamespace + + // globalInit is the thread group whose leader has ID 1 in the root PID + // namespace. globalInit is stored separately so that it is accessible even + // after all tasks in the thread group have exited, such that ID 1 is no + // longer mapped. + // + // globalInit is mutable until it is assigned by the first successful call + // to CreateProcess, and is protected by extMu. + globalInit *ThreadGroup + + // realtimeClock is a ktime.Clock based on timekeeper's Realtime. + realtimeClock *timekeeperClock + + // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. + monotonicClock *timekeeperClock + + // syslog is the kernel log. + syslog syslog + + // cpuClock is incremented every linux.ClockTick. cpuClock is used to + // measure task CPU usage, since sampling monotonicClock twice on every + // syscall turns out to be unreasonably expensive. 
This is similar to how + // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), + // although Linux also uses scheduler timing information to improve + // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do + // since "preeemptive" scheduling is managed by the Go runtime, which + // doesn't provide this information. + // + // cpuClock is mutable, and is accessed using atomic memory operations. + cpuClock uint64 + + // cpuClockTicker increments cpuClock. + cpuClockTicker *ktime.Timer `state:"nosave"` + + // fdMapUids is an ever-increasing counter for generating FDMap uids. + // + // fdMapUids is mutable, and is accessed using atomic memory operations. + fdMapUids uint64 + + // uniqueID is used to generate unique identifiers. + // + // uniqueID is mutable, and is accessed using atomic memory operations. + uniqueID uint64 + + // nextInotifyCookie is a monotonically increasing counter used for + // generating unique inotify event cookies. + // + // nextInotifyCookie is mutable, and is accesed using atomic memory + // operations. + nextInotifyCookie uint32 + + // netlinkPorts manages allocation of netlink socket port IDs. + netlinkPorts *port.Manager + + // exitErr is the error causing the sandbox to exit, if any. It is + // protected by extMu. + exitErr error +} + +// InitKernelArgs holds arguments to Init. +type InitKernelArgs struct { + // FeatureSet is the emulated CPU feature set. + FeatureSet *cpuid.FeatureSet + + // Timekeeper manages time for all tasks in the system. + Timekeeper *Timekeeper + + // RootUserNamespace is the root user namespace. + RootUserNamespace *auth.UserNamespace + + // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. + NetworkStack inet.Stack + + // ApplicationCores is the number of logical CPUs visible to sandboxed + // applications. 
The set of logical CPU IDs is [0, ApplicationCores); thus + // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the + // most significant bit in cpu_possible_mask + 1. + ApplicationCores uint + + // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU + // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a + // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it + // will be overridden. + UseHostCores bool + + // ExtraAuxv contains additional auxiliary vector entries that are added to + // each process by the ELF loader. + ExtraAuxv []arch.AuxEntry + + // Vdso holds the VDSO and its parameter page. + Vdso *loader.VDSO + + // RootUTSNamespace is the root UTS namepsace. + RootUTSNamespace *UTSNamespace + + // RootIPCNamespace is the root IPC namepsace. + RootIPCNamespace *IPCNamespace +} + +// Init initialize the Kernel with no tasks. +// +// Callers must manually set Kernel.Platform before caling Init. +func (k *Kernel) Init(args InitKernelArgs) error { + if args.FeatureSet == nil { + return fmt.Errorf("FeatureSet is nil") + } + if args.Timekeeper == nil { + return fmt.Errorf("Timekeeper is nil") + } + if args.RootUserNamespace == nil { + return fmt.Errorf("RootUserNamespace is nil") + } + if args.ApplicationCores == 0 { + return fmt.Errorf("ApplicationCores is 0") + } + + k.featureSet = args.FeatureSet + k.timekeeper = args.Timekeeper + k.tasks = newTaskSet() + k.rootUserNamespace = args.RootUserNamespace + k.rootUTSNamespace = args.RootUTSNamespace + k.rootIPCNamespace = args.RootIPCNamespace + k.networkStack = args.NetworkStack + k.applicationCores = args.ApplicationCores + if args.UseHostCores { + k.useHostCores = true + maxCPU, err := hostcpu.MaxPossibleCPU() + if err != nil { + return fmt.Errorf("Failed to get maximum CPU number: %v", err) + } + minAppCores := uint(maxCPU) + 1 + if k.applicationCores < minAppCores { + log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", 
k.applicationCores, minAppCores) + k.applicationCores = minAppCores + } + } + k.extraAuxv = args.ExtraAuxv + k.vdso = args.Vdso + k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} + k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.netlinkPorts = port.New() + + return nil +} + +// SaveTo saves the state of k to w. +// +// Preconditions: The kernel must be paused throughout the call to SaveTo. +func (k *Kernel) SaveTo(w io.Writer) error { + saveStart := time.Now() + ctx := k.SupervisorContext() + + // Do not allow other Kernel methods to affect it while it's being saved. + k.extMu.Lock() + defer k.extMu.Unlock() + + // Stop time. + k.pauseTimeLocked() + defer k.resumeTimeLocked() + + // Flush write operations on open files so data reaches backing storage. + if err := k.tasks.flushWritesToFiles(ctx); err != nil { + return err + } + + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters() + + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + k.mounts.FlushMountSourceRefs() + + // Ensure that all pending asynchronous work is complete: + // - inode and mount release + // - asynchronuous IO + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. 
+ if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Save the kernel state. + kernelStart := time.Now() + var stats state.Stats + if err := state.Save(w, k, &stats); err != nil { + return err + } + log.Infof("Kernel save stats: %s", &stats) + log.Infof("Kernel save took [%s].", time.Since(kernelStart)) + + // Save the memory state. + // + // FIXME: In the future, this should not be dispatched via + // an abstract memory type. This should be dispatched to a single + // memory implementation that belongs to the kernel. (There is + // currently a single implementation anyways, it just needs to be + // "unabstracted" and reparented appropriately.) + memoryStart := time.Now() + if err := k.Platform.Memory().SaveTo(w); err != nil { + return err + } + log.Infof("Memory save took [%s].", time.Since(memoryStart)) + + log.Infof("Overall save took [%s].", time.Since(saveStart)) + + return nil +} + +func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + if fdmap := t.FDMap(); fdmap != nil { + for _, desc := range fdmap.files { + if flags := desc.file.Flags(); !flags.Write { + continue + } + if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + continue + } + // Here we need all metadata synced. + syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + if err := fs.SaveFileFsyncError(syncErr); err != nil { + name, _ := desc.file.Dirent.FullName(nil /* root */) + return fmt.Errorf("%q was not sufficiently synced: %v", name, err) + } + } + } + } + return nil +} + +// Preconditions: The kernel must be paused. 
+func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { + invalidated := make(map[*mm.MemoryManager]struct{}) + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t := range k.tasks.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if mm := t.tc.MemoryManager; mm != nil { + if _, ok := invalidated[mm]; !ok { + if err := mm.InvalidateUnsavable(ctx); err != nil { + return err + } + invalidated[mm] = struct{}{} + } + } + // I really wish we just had a sync.Map of all MMs... + if r, ok := t.runState.(*runSyscallAfterExecStop); ok { + if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +func (ts *TaskSet) unregisterEpollWaiters() { + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + if fdmap := t.FDMap(); fdmap != nil { + for _, desc := range fdmap.files { + if desc.file != nil { + if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + } + } + } + } +} + +// LoadFrom returns a new Kernel loaded from args. +func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error { + loadStart := time.Now() + if p == nil { + return fmt.Errorf("Platform is nil") + } + + k.Platform = p + k.networkStack = net + + initAppCores := k.applicationCores + + // Load the kernel state. + kernelStart := time.Now() + var stats state.Stats + if err := state.Load(r, k, &stats); err != nil { + return err + } + log.Infof("Kernel load stats: %s", &stats) + log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + + // Load the memory state. + // + // See the note in SaveTo. 
+ memoryStart := time.Now() + if err := k.Platform.Memory().LoadFrom(r); err != nil { + return err + } + log.Infof("Memory load took [%s].", time.Since(memoryStart)) + + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + fs.AsyncBarrier() + + log.Infof("Overall load took [%s]", time.Since(loadStart)) + + // Applications may size per-cpu structures based on k.applicationCores, so + // it can't change across save/restore. When we are virtualizing CPU + // numbers, this isn't a problem. However, when we are exposing host CPU + // assignments, we can't tolerate an increase in the number of host CPUs, + // which could result in getcpu(2) returning CPUs that applications expect + // not to exist. + if k.useHostCores && initAppCores > k.applicationCores { + return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) + } + + return nil +} + +// Destroy releases resources owned by k. +// +// Preconditions: There must be no task goroutines running in k. +func (k *Kernel) Destroy() { + if k.mounts != nil { + k.mounts.DecRef() + k.mounts = nil + } +} + +// UniqueID returns a unique identifier. +func (k *Kernel) UniqueID() uint64 { + id := atomic.AddUint64(&k.uniqueID, 1) + if id == 0 { + panic("unique identifier generator wrapped around") + } + return id +} + +// CreateProcessArgs holds arguments to kernel.CreateProcess. +type CreateProcessArgs struct { + // Filename is the filename to load. + // + // If this is provided as "", then the file will be guessed via Argv[0]. + Filename string + + // Argvv is a list of arguments. + Argv []string + + // Envv is a list of environment variables. + Envv []string + + // WorkingDirectory is the initial working directory. + // + // This defaults to the root if empty. + WorkingDirectory string + + // Credentials is the initial credentials. 
+ Credentials *auth.Credentials + + // FDMap is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDMap. + FDMap *FDMap + + // Umask is the initial umask. + Umask uint + + // Limits is the initial resource limits. + Limits *limits.LimitSet + + // MaxSymlinkTraversals is the maximum number of symlinks to follow + // during resolution. + MaxSymlinkTraversals uint + + // UTSNamespace is the initial UTS namespace. + UTSNamespace *UTSNamespace + + // IPCNamespace is the initial IPC namespace. + IPCNamespace *IPCNamespace +} + +// NewContext returns a context.Context that represents the task that will be +// created by args.NewContext(k). +func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { + return &createProcessContext{ + Logger: log.Log(), + k: k, + args: args, + } +} + +// createProcessContext is a context.Context that represents the context +// associated with a task that is being created. +type createProcessContext struct { + context.NoopSleeper + log.Logger + k *Kernel + args *CreateProcessArgs +} + +// Value implements context.Context.Value. +func (ctx *createProcessContext) Value(key interface{}) interface{} { + switch key { + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + // "The new task ... is in the root PID namespace." 
- + // Kernel.CreateProcess + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.args.UTSNamespace + case CtxIPCNamespace: + return ctx.args.IPCNamespace + case auth.CtxCredentials: + return ctx.args.Credentials + case fs.CtxRoot: + if ctx.k.mounts == nil { + return nil + } + return ctx.k.mounts.Root() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + return ctx.args.Limits + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + default: + return nil + } +} + +// CreateProcess creates a new task in a new thread group with the given +// options. The new task has no parent and is in the root PID namespace. +// +// If k.Start() has already been called, the created task will begin running +// immediately. Otherwise, it will be started when k.Start() is called. +// +// CreateProcess has no analogue in Linux; it is used to create the initial +// application task, as well as processes started by the control server. +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { + k.extMu.Lock() + defer k.extMu.Unlock() + log.Infof("EXEC: %v", args.Argv) + + if k.mounts == nil { + return nil, fmt.Errorf("no kernel MountNamespace") + } + + tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + ctx := args.NewContext(k) + + // Grab the root directory. + root := fs.RootFromContext(ctx) + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. + if args.WorkingDirectory != "" { + var err error + wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals) + if err != nil { + return nil, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + + if args.Filename == "" { + // Was anything provided? 
+ if len(args.Argv) == 0 { + return nil, fmt.Errorf("no filename or command provided") + } + if !filepath.IsAbs(args.Argv[0]) { + return nil, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + } + args.Filename = args.Argv[0] + } + + // Create a fresh task context. + tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + if err != nil { + return nil, err + } + tr := newTaskResources(args.FDMap, newFSContext(root, wd, args.Umask)) + // NewTask unconditionally takes ownership of tr, so we never have to call + // tr.release. + + // Create the task. + config := &TaskConfig{ + Kernel: k, + ThreadGroup: tg, + TaskContext: tc, + TaskResources: tr, + Credentials: args.Credentials, + UTSNamespace: args.UTSNamespace, + IPCNamespace: args.IPCNamespace, + AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + } + t, err := k.tasks.NewTask(config) + if err != nil { + return nil, err + } + + // Success. + if k.started { + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) + } else if k.globalInit == nil { + k.globalInit = tg + } + return tg, nil +} + +// Start starts execution of all tasks in k. +// +// Preconditions: Start may be called exactly once. +func (k *Kernel) Start() error { + k.extMu.Lock() + defer k.extMu.Unlock() + + if k.globalInit == nil { + return fmt.Errorf("kernel contains no tasks") + } + if k.started { + return fmt.Errorf("kernel already started") + } + + k.started = true + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k}) + k.cpuClockTicker.Swap(ktime.Setting{ + Enabled: true, + Period: linux.ClockTick, + }) + // If k was created by LoadKernelFrom, timers were stopped during + // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, + // this is a no-op. + k.resumeTimeLocked() + // Start task goroutines. 
+ k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t, tid := range k.tasks.Root.tids { + t.Start(tid) + } + return nil +} + +// pauseTimeLocked pauses all Timers and Timekeeper updates. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) pauseTimeLocked() { + // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before + // Kernel.Start(). + if k.cpuClockTicker != nil { + k.cpuClockTicker.Pause() + } + + // By precondition, nothing else can be interacting with PIDNamespace.tids + // or FDMap.files, so we can iterate them without synchronization. (We + // can't hold the TaskSet mutex when pausing thread group timers because + // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet + // mutex, while holding the Timer mutex.) + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.tm.pause() + } + // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. + if fdm := t.tr.FDMap; fdm != nil { + for _, desc := range fdm.files { + if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + } + } + } + k.timekeeper.PauseUpdates() +} + +// resumeTimeLocked resumes all Timers and Timekeeper updates. If +// pauseTimeLocked has not been previously called, resumeTimeLocked has no +// effect. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) resumeTimeLocked() { + if k.cpuClockTicker != nil { + k.cpuClockTicker.Resume() + } + + k.timekeeper.ResumeUpdates() + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.tm.resume() + } + if fdm := t.tr.FDMap; fdm != nil { + for _, desc := range fdm.files { + if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + } + } + } +} + +// WaitExited blocks until all tasks in k have exited. 
func (k *Kernel) WaitExited() {
	k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status es. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(es ExitStatus) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.Kill(es)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks in k have stopped. Multiple calls to Pause nest and require
// an equal number of calls to Unpause to resume execution.
func (k *Kernel) Pause() {
	k.extMu.Lock()
	k.tasks.BeginExternalStop()
	// NOTE(review): extMu is released before waiting on runningGoroutines,
	// presumably so other Kernel methods are not blocked for the duration of
	// the stop — confirm against TaskSet's locking rules.
	k.extMu.Unlock()
	k.tasks.runningGoroutines.Wait()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.EndExternalStop()
}

// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Returns false if signal could not be sent because the Kernel is not fully
// initialized yet.
func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.sendExternalSignal(info, context)
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
	return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
	return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
	return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
	return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
	return k.rootUTSNamespace
}

// RootIPCNamespace returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
	return k.rootIPCNamespace
}

// RootMountNamespace returns the MountNamespace.
func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.mounts
}

// SetRootMountNamespace sets the MountNamespace.
func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.mounts = mounts
}

// NetworkStack returns the network stack. NetworkStack may return nil if no
// network stack is available.
func (k *Kernel) NetworkStack() inet.Stack {
	return k.networkStack
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.globalInit
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
	return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
	return k.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
	return k.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
	// Atomic load: cpuClock is advanced concurrently via atomic.AddUint64 by
	// kernelCPUClockListener.Notify.
	return atomic.LoadUint64(&k.cpuClock)
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
	return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value, all other values
// representable in a uint32 are allowed.
+func (k *Kernel) GenerateInotifyCookie() uint32 { + id := atomic.AddUint32(&k.nextInotifyCookie, 1) + // Wrap-around is explicitly allowed for inotify event cookies. + if id == 0 { + id = atomic.AddUint32(&k.nextInotifyCookie, 1) + } + return id +} + +// NetlinkPorts returns the netlink port manager. +func (k *Kernel) NetlinkPorts() *port.Manager { + return k.netlinkPorts +} + +// ExitError returns the sandbox error that caused the kernel to exit. +func (k *Kernel) ExitError() error { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.exitErr +} + +// SetExitError sets the sandbox error that caused the kernel to exit, if one is +// not already set. +func (k *Kernel) SetExitError(err error) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.exitErr == nil { + k.exitErr = err + } +} + +// SupervisorContext returns a Context with maximum privileges in k. It should +// only be used by goroutines outside the control of the emulated kernel +// defined by e. +// +// Callers are responsible for ensuring that the returned Context is not used +// concurrently with changes to the Kernel. +func (k *Kernel) SupervisorContext() context.Context { + return supervisorContext{ + Logger: log.Log(), + k: k, + } +} + +type supervisorContext struct { + context.NoopSleeper + log.Logger + k *Kernel +} + +// Value implements context.Context. +func (ctx supervisorContext) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + // The supervisor context can trace anything. (None of + // supervisorContext's users are expected to invoke ptrace, but ptrace + // permissions are required for certain file accesses.) + return func(*Task, bool) bool { return true } + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.k.rootUTSNamespace + case CtxIPCNamespace: + return ctx.k.rootIPCNamespace + case auth.CtxCredentials: + // The supervisor context is global root. 
+ return auth.NewRootCredentials(ctx.k.rootUserNamespace) + case fs.CtxRoot: + return ctx.k.mounts.Root() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + // No limits apply. + return limits.NewLimitSet() + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + default: + return nil + } +} + +type kernelCPUClockListener struct { + k *Kernel +} + +// Notify implements ktime.TimerListener.Notify. +func (l kernelCPUClockListener) Notify(exp uint64) { + atomic.AddUint64(&l.k.cpuClock, exp) +} + +// Destroy implements ktime.TimerListener.Destroy. +func (l kernelCPUClockListener) Destroy() { +} diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD new file mode 100644 index 000000000..c7779e1d5 --- /dev/null +++ b/pkg/sentry/kernel/memevent/BUILD @@ -0,0 +1,31 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "memevent", + srcs = ["memory_events.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent", + visibility = ["//:sandbox"], + deps = [ + ":memory_events_go_proto", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/sentry/kernel", + "//pkg/sentry/usage", + ], +) + +proto_library( + name = "memory_events_proto", + srcs = ["memory_events.proto"], + visibility = ["//visibility:public"], +) + +go_proto_library( + name = "memory_events_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto", + proto = ":memory_events_proto", + visibility = ["//visibility:public"], +) diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go new file mode 100644 index 000000000..ecc9151de --- /dev/null +++ 
b/pkg/sentry/kernel/memevent/memory_events.go @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memevent implements the memory usage events controller, which +// periodically emits events via the eventchannel. +package memevent + +import ( + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// MemoryEvents describes the configuration for the global memory event emitter. +type MemoryEvents struct { + k *kernel.Kernel + + // The period is how often to emit an event. The memory events goroutine + // will ensure a minimum of one event is emitted per this period, regardless + // how of much memory usage has changed. + period time.Duration + + // Writing to this channel indicates the memory goroutine should stop. + stop chan struct{} + + // done is used to signal when the memory event goroutine has exited. + done sync.WaitGroup +} + +// New creates a new MemoryEvents. +func New(k *kernel.Kernel, period time.Duration) *MemoryEvents { + return &MemoryEvents{ + k: k, + period: period, + stop: make(chan struct{}), + } +} + +// Stop stops the memory usage events emitter goroutine. 
Stop must not be called +// concurrently with Start and may only be called once. +func (m *MemoryEvents) Stop() { + close(m.stop) + m.done.Wait() +} + +// Start starts the memory usage events emitter goroutine. Start must not be +// called concurrently with Stop and may only be called once. +func (m *MemoryEvents) Start() { + if m.period == 0 { + return + } + go m.run() // S/R-SAFE: doesn't interact with saved state. +} + +func (m *MemoryEvents) run() { + m.done.Add(1) + + ticker := time.NewTicker(m.period) + defer ticker.Stop() + + for { + select { + case <-m.stop: + m.done.Done() + return + case <-ticker.C: + m.emit() + } + } +} + +func (m *MemoryEvents) emit() { + totalPlatform, err := m.k.Platform.Memory().TotalUsage() + if err != nil { + log.Warningf("Failed to fetch memory usage for memory events: %v", err) + return + } + snapshot, _ := usage.MemoryAccounting.Copy() + total := totalPlatform + snapshot.Mapped + + eventchannel.Emit(&pb.MemoryUsageEvent{Total: total}) +} diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto new file mode 100644 index 000000000..e6e0bd628 --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -0,0 +1,25 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +// MemoryUsageEvent describes the memory usage of the sandbox at a single +// instant in time. 
These messages are emitted periodically on the eventchannel. +message MemoryUsageEvent { + // The total memory usage of the sandboxed application in bytes, calculated + // using the 'fast' method. + uint64 total = 1; +} diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go new file mode 100644 index 000000000..d8701f47a --- /dev/null +++ b/pkg/sentry/kernel/pending_signals.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + // stdSignalCap is the maximum number of instances of a given standard + // signal that may be pending. ("[If] multiple instances of a standard + // signal are delivered while that signal is currently blocked, then only + // one instance is queued.") - signal(7) + stdSignalCap = 1 + + // rtSignalCap is the maximum number of instances of a given realtime + // signal that may be pending. + // + // TODO: In Linux, the minimum signal queue size is + // RLIMIT_SIGPENDING, which is by default max_threads/2. + rtSignalCap = 32 +) + +// pendingSignals holds a collection of pending signals. The zero value of +// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; +// users must provide synchronization. 
+type pendingSignals struct { + // signals contains all pending signals. + // + // Note that signals is zero-indexed, but signal 1 is the first valid + // signal, so signals[0] contains signals with signo 1 etc. This offset is + // usually handled by using Signal.index(). + signals [linux.SignalMaximum]pendingSignalQueue + + // Bit i of pendingSet is set iff there is at least one signal with signo + // i+1 pending. + pendingSet linux.SignalSet +} + +// pendingSignalQueue holds a pendingSignalList for a single signal number. +type pendingSignalQueue struct { + pendingSignalList + length int +} + +type pendingSignal struct { + // pendingSignalEntry links into a pendingSignalList. + pendingSignalEntry + *arch.SignalInfo +} + +// enqueue enqueues the given signal. enqueue returns true on success and false +// on failure (if the given signal's queue is full). +// +// Preconditions: info represents a valid signal. +func (p *pendingSignals) enqueue(info *arch.SignalInfo) bool { + sig := linux.Signal(info.Signo) + q := &p.signals[sig.Index()] + if sig.IsStandard() { + if q.length >= stdSignalCap { + return false + } + } else if q.length >= rtSignalCap { + return false + } + q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info}) + q.length++ + p.pendingSet |= linux.SignalSetOf(sig) + return true +} + +// dequeue dequeues and returns any pending signal not masked by mask. If no +// unmasked signals are pending, dequeue returns nil. +func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo { + // "Real-time signals are delivered in a guaranteed order. Multiple + // real-time signals of the same type are delivered in the order they were + // sent. If different real-time signals are sent to a process, they are + // delivered starting with the lowest-numbered signal. (I.e., low-numbered + // signals have highest priority.) By contrast, if multiple standard + // signals are pending for a process, the order in which they are delivered + // is unspecified. 
If both standard and real-time signals are pending for a + // process, POSIX leaves it unspecified which is delivered first. Linux, + // like many other implementations, gives priority to standard signals in + // this case." - signal(7) + lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask)) + if lowestPendingUnblockedBit >= linux.SignalMaximum { + return nil + } + return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1)) +} + +func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo { + q := &p.signals[sig.Index()] + ps := q.pendingSignalList.Front() + if ps == nil { + return nil + } + q.pendingSignalList.Remove(ps) + q.length-- + if q.length == 0 { + p.pendingSet &^= linux.SignalSetOf(sig) + } + return ps.SignalInfo +} + +// discardSpecific causes all pending signals with number sig to be discarded. +func (p *pendingSignals) discardSpecific(sig linux.Signal) { + q := &p.signals[sig.Index()] + q.pendingSignalList.Reset() + q.length = 0 + p.pendingSet &^= linux.SignalSetOf(sig) +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD new file mode 100644 index 000000000..ca9825f9d --- /dev/null +++ b/pkg/sentry/kernel/pipe/BUILD @@ -0,0 +1,68 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "pipe_state", + srcs = [ + "buffers.go", + "node.go", + "pipe.go", + "reader.go", + "reader_writer.go", + "writer.go", + ], + out = "pipe_state.go", + package = "pipe", +) + +go_library( + name = "pipe", + srcs = [ + "buffers.go", + "device.go", + "node.go", + "pipe.go", + "pipe_state.go", + "reader.go", + "reader_writer.go", + "writer.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/ilist", + "//pkg/log", + "//pkg/refs", + 
"//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) + +go_test( + name = "pipe_test", + size = "small", + srcs = [ + "node_test.go", + "pipe_test.go", + ], + embed = [":pipe"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go new file mode 100644 index 000000000..f300537c5 --- /dev/null +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -0,0 +1,50 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/ilist" +) + +// Buffer encapsulates a queueable byte buffer that can +// easily be truncated. It is designed only for use with pipes. +type Buffer struct { + ilist.Entry + data []byte +} + +// newBuffer initializes a Buffer. +func newBuffer(buf []byte) *Buffer { + return &Buffer{data: buf} +} + +// bytes returns the bytes contained in the buffer. +func (b *Buffer) bytes() []byte { + return b.data +} + +// size returns the number of bytes contained in the buffer. +func (b *Buffer) size() int { + return len(b.data) +} + +// truncate removes the first n bytes from the buffer. 
+func (b *Buffer) truncate(n int) int { + if n > len(b.data) { + panic("Trying to truncate past end of array.") + } + b.data = b.data[n:] + return len(b.data) +} diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go new file mode 100644 index 000000000..8d383577a --- /dev/null +++ b/pkg/sentry/kernel/pipe/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// pipeDevice is used for all pipe files. +var pipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go new file mode 100644 index 000000000..5b47427ef --- /dev/null +++ b/pkg/sentry/kernel/pipe/node.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package pipe + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/amutex" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations wraps fs.InodeOperations operations with common pipe opening semantics. +type inodeOperations struct { + fs.InodeOperations + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // p is the underlying Pipe object representing this fifo. + p *Pipe + + // Channels for synchronizing the creation of new readers and writers of + // this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on save, + // and either automatically restart (via ERESTARTSYS) or return EINTR on + // resume. On restarts via ERESTARTSYS, the appropriate channel will be + // recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +// NewInodeOperations creates a new pipe fs.InodeOperations. +func NewInodeOperations(base fs.InodeOperations, p *Pipe) fs.InodeOperations { + return &inodeOperations{ + InodeOperations: base, + p: p, + } +} + +// GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking +// semantics during open: +// +// "Normally, opening the FIFO blocks until the other end is opened also. A +// process can open a FIFO in nonblocking mode. In this case, opening for +// read-only will succeed even if no-one has opened on the write side yet, +// opening for write-only will fail with ENXIO (no such device or address) +// unless the other end has already been opened. Under Linux, opening a FIFO +// for read and write will succeed both in blocking and nonblocking mode. POSIX +// leaves this behavior undefined. This can be used to open a FIFO for writing +// while there are no readers available." 
- fifo(7) +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + i.mu.Lock() + defer i.mu.Unlock() + + switch { + case flags.Read && !flags.Write: // O_RDONLY. + r := i.p.ROpen(ctx) + i.newHandleLocked(&i.rWakeup) + + if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { + if !i.waitFor(&i.wWakeup, ctx) { + r.DecRef() + return nil, syserror.ErrInterrupted + } + } + + // By now, either we're doing a nonblocking open or we have a writer. On + // a nonblocking read-only open, the open succeeds even if no-one has + // opened the write side yet. + return r, nil + + case flags.Write && !flags.Read: // O_WRONLY. + w := i.p.WOpen(ctx) + i.newHandleLocked(&i.wWakeup) + + if i.p.isNamed && !i.p.HasReaders() { + // On a nonblocking, write-only open, the open fails with ENXIO if the + // read side isn't open yet. + if flags.NonBlocking { + w.DecRef() + return nil, syserror.ENXIO + } + + if !i.waitFor(&i.rWakeup, ctx) { + w.DecRef() + return nil, syserror.ErrInterrupted + } + } + return w, nil + + case flags.Read && flags.Write: // O_RDWR. + // Pipes opened for read-write always succeed without blocking. + rw := i.p.RWOpen(ctx) + i.newHandleLocked(&i.rWakeup) + i.newHandleLocked(&i.wWakeup) + return rw, nil + + default: + return nil, syserror.EINVAL + } +} + +// waitFor blocks until at least one reader/writer of the underlying pipe is +// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this +// function will block for either readers or writers, depending on where +// 'wakeupChan' points. +// +// f.mu must be held by the caller. waitFor returns with f.mu held, but it will +// drop f.mu before blocking for any reader/writers. +func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { + // Ideally this function would simply use a condition variable. However, the + wait needs to be interruptible via 'sleeper', so we must synchronize via a + channel.
The synchronization below relies on the fact that closing a + // channel unblocks all receives on the channel. + + // Does an appropriate wakeup channel already exist? If not, create a new + // one. This is all done under f.mu to avoid races. + if *wakeupChan == nil { + *wakeupChan = make(chan struct{}) + } + + // Grab a local reference to the wakeup channel since it may disappear as + // soon as we drop f.mu. + wakeup := *wakeupChan + + // Drop the lock and prepare to sleep. + i.mu.Unlock() + cancel := sleeper.SleepStart() + + // Wait for either a new reader/write to be signalled via 'wakeup', or + // for the sleep to be cancelled. + select { + case <-wakeup: + sleeper.SleepFinish(true) + case <-cancel: + sleeper.SleepFinish(false) + } + + // Take the lock and check if we were woken. If we were woken and + // interrupted, the former takes priority. + i.mu.Lock() + select { + case <-wakeup: + return true + default: + return false + } +} + +// newHandleLocked signals a new pipe reader or writer depending on where +// 'wakeupChan' points. This unblocks any corresponding reader or writer +// waiting for the other end of the channel to be opened, see Fifo.waitFor. +// +// i.mu must be held. +func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) { + if *wakeupChan != nil { + close(*wakeupChan) + *wakeupChan = nil + } +} diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go new file mode 100644 index 000000000..cc1ebf4f6 --- /dev/null +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -0,0 +1,308 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type sleeper struct { + context.Context + ch chan struct{} +} + +func newSleeperContext(t *testing.T) context.Context { + return &sleeper{ + Context: contexttest.Context(t), + ch: make(chan struct{}), + } +} + +func (s *sleeper) SleepStart() <-chan struct{} { + return s.ch +} + +func (s *sleeper) SleepFinish(bool) { +} + +func (s *sleeper) Cancel() { + s.ch <- struct{}{} +} + +type openResult struct { + *fs.File + error +} + +func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) { + file, err := n.GetFile(ctx, nil, flags) + if err != nil { + t.Fatalf("open with flags %+v failed: %v", flags, err) + } + if doneChan != nil { + doneChan <- struct{}{} + } + return file, err +} + +func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) { + file, err := n.GetFile(ctx, nil, flags) + if resChan != nil { + resChan <- openResult{file, err} + } + return file, err +} + +func newNamedPipe(t *testing.T) *Pipe { + return NewPipe(contexttest.Context(t), true, DefaultPipeSize, usermem.PageSize) +} + +func newAnonPipe(t *testing.T) *Pipe { + return NewPipe(contexttest.Context(t), 
false, DefaultPipeSize, usermem.PageSize) +} + +// assertRecvBlocks ensures that a recv attempt on c blocks for at least +// blockDuration. This is useful for checking that a goroutine that is supposed +// to be executing a blocking operation is actually blocking. +func assertRecvBlocks(t *testing.T, c <-chan struct{}, blockDuration time.Duration, failMsg string) { + select { + case <-c: + t.Fatalf(failMsg) + case <-time.After(blockDuration): + // Ok, blocked for the required duration. + } +} + +func TestReadOpenBlocksForWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Verify that the open for read is blocking. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "open for read not blocking with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone +} + +func TestWriteOpenBlocksForReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Verify that the open for write is blocking + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write not blocking with no readers") + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone1 := make(chan struct{}) + rDone2 := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone1) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone2) + + assertRecvBlocks(t, rDone1, time.Millisecond*100, + "open for read didn't block with no writers") + assertRecvBlocks(t, rDone2, time.Millisecond*100, + "open 
for read didn't block with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone2 + <-rDone1 +} + +func TestClosedReaderBlocksWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) + rFile.DecRef() + + wDone := make(chan struct{}) + // This open for write should block because the reader is now gone. + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write didn't block with no concurrent readers") + + // Open for read again. This should unblock the open for write. + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestReadWriteOpenNeverBlocks(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rwDone := make(chan struct{}) + // Open for read-write never wait for a reader or writer, even if the + // nonblocking flag is not set. 
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true, NonBlocking: false}, rwDone) + <-rwDone +} + +func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-rDone +} + +func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-wDone +} + +func TestBlockedOpenIsCancellable(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + done := make(chan openResult) + go testOpen(ctx, t, f, fs.FileFlags{Read: true}, done) + select { + case <-done: + t.Fatalf("open for read didn't block with no writers") + case <-time.After(time.Millisecond * 100): + // Ok. + } + + ctx.(*sleeper).Cancel() + // If the cancel on the sleeper didn't work, the open for read would never + // return. 
+ res := <-done + if res.error != syserror.ErrInterrupted { + t.Fatalf("Cancellation didn't cause GetFile to return fs.ErrInterrupted, got %v.", + res.error) + } +} + +func TestNonblockingReadOpenNoWriters(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } +} + +func TestNonblockingWriteOpenNoReaders(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO { + t.Fatalf("Nonblocking open for write failed unexpected error %v.", err) + } +} + +func TestNonBlockingReadOpenWithWriter(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Open for write blocks since there are no readers yet. + assertRecvBlocks(t, wDone, time.Millisecond*100, + "Open for write didn't block with no reader.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } + + // Open for write should now be unblocked. + <-wDone +} + +func TestNonBlockingWriteOpenWithReader(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Open for write blocked, since no reader yet. 
+ assertRecvBlocks(t, rDone, time.Millisecond*100, + "Open for reader didn't block with no writer.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for write failed with error %v.", err) + } + + // Open for write should now be unblocked. + <-rDone +} + +func TestAnonReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newAnonPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true}, nil); err != nil { + t.Fatalf("open anon pipe for read failed: %v", err) + } +} + +func TestAnonWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newAnonPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true}, nil); err != nil { + t.Fatalf("open anon pipe for write failed: %v", err) + } +} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go new file mode 100644 index 000000000..1656c6ff3 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -0,0 +1,335 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipe provides an in-memory implementation of a unidirectional +// pipe. +// +// The goal of this pipe is to emulate the pipe syscall in all of its +// edge cases and guarantees of atomic IO. 
+package pipe + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// DefaultPipeSize is the system-wide default size of a pipe in bytes. +const DefaultPipeSize = 65536 + +// Pipe is an encapsulation of a platform-independent pipe. +// It manages a buffered byte queue shared between a reader/writer +// pair. +type Pipe struct { + waiter.Queue `state:"nosave"` + + // Whether this is a named or anonymous pipe. + isNamed bool + + // The dirent backing this pipe. Shared by all readers and writers. + dirent *fs.Dirent + + // The buffered byte queue. + data ilist.List + + // Max size of the pipe in bytes. When this max has been reached, + // writers will get EWOULDBLOCK. + max int + + // Current size of the pipe in bytes. + size int + + // Max number of bytes the pipe can guarantee to read or write + // atomically. + atomicIOBytes int + + // The number of active readers for this pipe. Load/store atomically. + readers int32 + + // The number of active writers for this pipe. Load/store atomically. + writers int32 + + // This flag indicates if this pipe ever had a writer. Note that this does + // not necessarily indicate there is *currently* a writer, just that there + // has been a writer at some point since the pipe was created. + // + // Protected by mu. + hadWriter bool + + // Lock protecting all pipe internal state. + mu sync.Mutex `state:"nosave"` +} + +// NewPipe initializes and returns a pipe. A pipe created by this function is +// persistent, and will remain valid even without any open fds to it. Named +// pipes for mknod(2) are created via this function.
Note that the +// implementation of blocking semantics for opening the read and write ends of a +// named pipe are left to filesystems. +func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *Pipe { + p := &Pipe{ + isNamed: isNamed, + max: sizeBytes, + atomicIOBytes: atomicIOBytes, + } + + // Build the fs.Dirent of this pipe, shared by all fs.Files associated + // with this pipe. + ino := pipeDevice.NextIno() + base := fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ + FSType: linux.PIPEFS_MAGIC, + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }) + sattr := fs.StableAttr{ + Type: fs.Pipe, + DeviceID: pipeDevice.DeviceID(), + InodeID: ino, + BlockSize: int64(atomicIOBytes), + } + // There is no real filesystem backing this pipe, so we pass in a nil + // Filesystem. + sb := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + p.dirent = fs.NewDirent(fs.NewInode(NewInodeOperations(base, p), sb, sattr), fmt.Sprintf("pipe:[%d]", ino)) + + return p +} + +// NewConnectedPipe initializes a pipe and returns a pair of objects (which +// implement kio.File) representing the read and write ends of the pipe. A pipe +// created by this function becomes invalid as soon as either the read or write +// end is closed, and errors on subsequent operations on either end. Pipes +// for pipe(2) and pipe2(2) are generally created this way. +func NewConnectedPipe(ctx context.Context, sizeBytes int, atomicIOBytes int) (*fs.File, *fs.File) { + p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) + return p.ROpen(ctx), p.WOpen(ctx) +} + +// ROpen opens the pipe for reading. +func (p *Pipe) ROpen(ctx context.Context) *fs.File { + p.rOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true}, &Reader{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) +} + +// WOpen opens the pipe for writing. 
+func (p *Pipe) WOpen(ctx context.Context) *fs.File { + p.wOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Write: true}, &Writer{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) +} + +// RWOpen opens the pipe for both reading and writing. +func (p *Pipe) RWOpen(ctx context.Context) *fs.File { + p.rOpen() + p.wOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{ + Pipe: p, + }) +} + +// read reads data from the pipe into dst and returns the number of bytes +// read, or returns ErrWouldBlock if the pipe is empty. +func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) { + if !p.HasReaders() { + return 0, syscall.EBADF + } + + // Don't block for a zero-length read even if the pipe is empty. + if dst.NumBytes() == 0 { + return 0, nil + } + + p.mu.Lock() + defer p.mu.Unlock() + // If there is nothing to read at the moment but there is a writer, tell the + // caller to block. + if p.size == 0 { + if !p.HasWriters() { + // There are no writers, return EOF. + return 0, nil + } + return 0, syserror.ErrWouldBlock + } + var n int64 + for b := p.data.Front(); b != nil; b = p.data.Front() { + buffer := b.(*Buffer) + n0, err := dst.CopyOut(ctx, buffer.bytes()) + n += int64(n0) + p.size -= n0 + if buffer.truncate(n0) == 0 { + p.data.Remove(b) + } + dst = dst.DropFirst(n0) + if dst.NumBytes() == 0 || err != nil { + return n, err + } + } + return n, nil +} + +// write writes data from sv into the pipe and returns the number of bytes +// written. If no bytes are written because the pipe is full (or has less than +// atomicIOBytes free capacity), write returns ErrWouldBlock. 
+func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + + if !p.HasWriters() { + return 0, syscall.EBADF + } + if !p.HasReaders() { + return 0, syscall.EPIPE + } + + // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be + // atomic, but requires no atomicity for writes larger than this. However, + // Linux appears to provide stronger semantics than this in practice: + // unmerged writes are done one PAGE_SIZE buffer at a time, so for larger + // writes, the writing of each PIPE_BUF-sized chunk is atomic. We implement + // this by writing at most atomicIOBytes at a time if we can't service the + // write in its entirety. + canWrite := src.NumBytes() + if canWrite > int64(p.max-p.size) { + if p.max-p.size >= p.atomicIOBytes { + canWrite = int64(p.atomicIOBytes) + } else { + return 0, syserror.ErrWouldBlock + } + } + + // Copy data from user memory into a pipe-owned buffer. + buf := make([]byte, canWrite) + n, err := src.CopyIn(ctx, buf) + if n > 0 { + p.data.PushBack(newBuffer(buf[:n])) + p.size += n + } + if int64(n) < src.NumBytes() && err == nil { + // Partial write due to full pipe. + err = syserror.ErrWouldBlock + } + return int64(n), err +} + +// rOpen signals a new reader of the pipe. +func (p *Pipe) rOpen() { + atomic.AddInt32(&p.readers, 1) +} + +// wOpen signals a new writer of the pipe. +func (p *Pipe) wOpen() { + p.mu.Lock() + defer p.mu.Unlock() + p.hadWriter = true + atomic.AddInt32(&p.writers, 1) +} + +// rClose signals that a reader has closed their end of the pipe. +func (p *Pipe) rClose() { + newReaders := atomic.AddInt32(&p.readers, -1) + if newReaders < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders)) + } +} + +// wClose signals that a writer has closed their end of the pipe. 
+func (p *Pipe) wClose() { + newWriters := atomic.AddInt32(&p.writers, -1) + if newWriters < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters)) + } +} + +// HasReaders returns whether the pipe has any active readers. +func (p *Pipe) HasReaders() bool { + return atomic.LoadInt32(&p.readers) > 0 +} + +// HasWriters returns whether the pipe has any active writers. +func (p *Pipe) HasWriters() bool { + return atomic.LoadInt32(&p.writers) > 0 +} + +func (p *Pipe) rReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasReaders() && p.data.Front() != nil { + ready |= waiter.EventIn + } + if !p.HasWriters() && p.hadWriter { + // POLLHUP must be suppressed until the pipe has had at least one writer + // at some point. Otherwise a reader thread may poll and immediately get + // a POLLHUP before the writer ever opens the pipe, which the reader may + // interpret as the writer opening then closing the pipe. + ready |= waiter.EventHUp + } + return ready +} + +// rReadiness returns a mask that states whether the read end of the pipe is +// ready for reading. +func (p *Pipe) rReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() +} + +func (p *Pipe) wReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasWriters() && p.size < p.max { + ready |= waiter.EventOut + } + if !p.HasReaders() { + ready |= waiter.EventErr + } + return ready +} + +// wReadiness returns a mask that states whether the write end of the pipe +// is ready for writing. +func (p *Pipe) wReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.wReadinessLocked() +} + +// rwReadiness returns a mask that states whether a read-write handle to the +// pipe is ready for IO.
+func (p *Pipe) rwReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() | p.wReadinessLocked() +} + +func (p *Pipe) queuedSize() int { + p.mu.Lock() + defer p.mu.Unlock() + return p.size +} diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go new file mode 100644 index 000000000..49ef8c8ac --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -0,0 +1,138 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package pipe + +import ( + "bytes" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestPipeRW(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + wantN := int64(len(msg)) + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if n != wantN || err != nil { + t.Fatalf("Writev: got (%d, %v), wanted (%d, nil)", n, err, wantN) + } + + buf := make([]byte, len(msg)) + n, err = r.Readv(ctx, usermem.BytesIOSequence(buf)) + if n != wantN || err != nil || !bytes.Equal(buf, msg) { + t.Fatalf("Readv: got (%d, %v) %q, wanted (%d, nil) %q", n, err, buf, wantN, msg) + } +} + +func TestPipeReadBlock(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) + if n != 0 || err != syserror.ErrWouldBlock { + t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, syserror.ErrWouldBlock) + } +} + +func TestPipeWriteBlock(t *testing.T) { + const atomicIOBytes = 2 + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 10, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if wantN, wantErr := int64(atomicIOBytes), syserror.ErrWouldBlock; n != wantN || err != wantErr { + t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr) + } +} + +func TestPipeWriteUntilEnd(t *testing.T) { + const atomicIOBytes = 2 + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + + wDone := make(chan struct{}, 0) + 
rDone := make(chan struct{}, 0) + defer func() { + // Signal the reader to stop and wait until it does so. + close(wDone) + <-rDone + }() + + go func() { + defer close(rDone) + // Read from r until done is closed. + ctx := contexttest.Context(t) + buf := make([]byte, len(msg)+1) + dst := usermem.BytesIOSequence(buf) + e, ch := waiter.NewChannelEntry(nil) + r.EventRegister(&e, waiter.EventIn) + defer r.EventUnregister(&e) + for { + n, err := r.Readv(ctx, dst) + dst = dst.DropFirst64(n) + if err == syserror.ErrWouldBlock { + select { + case <-ch: + continue + case <-wDone: + // We expect to have 1 byte left in dst since len(buf) == + // len(msg)+1. + if dst.NumBytes() != 1 || !bytes.Equal(buf[:len(msg)], msg) { + t.Errorf("Reader: got %q (%d bytes remaining), wanted %q", buf, dst.NumBytes(), msg) + } + return + } + } + if err != nil { + t.Fatalf("Readv: got unexpected error %v", err) + } + } + }() + + src := usermem.BytesIOSequence(msg) + e, ch := waiter.NewChannelEntry(nil) + w.EventRegister(&e, waiter.EventOut) + defer w.EventUnregister(&e) + for src.NumBytes() != 0 { + n, err := w.Writev(ctx, src) + src = src.DropFirst64(n) + if err == syserror.ErrWouldBlock { + <-ch + continue + } + if err != nil { + t.Fatalf("Writev: got (%d, %v)", n, err) + } + } +} diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go new file mode 100644 index 000000000..40d5e4943 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Reader satisfies the fs.FileOperations interface for read-only pipes. +// Reader should be used with !fs.FileFlags.Write to reject writes. +type Reader struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +func (r *Reader) Release() { + r.Pipe.rClose() + // Wake up writers. + r.Pipe.Notify(waiter.EventOut) +} + +// Readiness returns the ready events in the underlying pipe. +func (r *Reader) Readiness(mask waiter.EventMask) waiter.EventMask { + return r.Pipe.rReadiness() & mask +} diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go new file mode 100644 index 000000000..dc642a3a6 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -0,0 +1,91 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "fmt" + "math" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// ReaderWriter satisfies the FileOperations interface and services both +// read and write requests. 
This should only be used directly for named pipes. +// pipe(2) and pipe2(2) only support unidirectional pipes and should use +// either pipe.Reader or pipe.Writer. +type ReaderWriter struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + *Pipe +} + +// Release implements fs.FileOperations.Release. +func (rw *ReaderWriter) Release() { + rw.Pipe.rClose() + rw.Pipe.wClose() + // Wake up readers and writers. + rw.Pipe.Notify(waiter.EventIn | waiter.EventOut) +} + +// Read implements fs.FileOperations.Read. +func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + n, err := rw.Pipe.read(ctx, dst) + if n > 0 { + rw.Pipe.Notify(waiter.EventOut) + } + return n, err +} + +// Write implements fs.FileOperations.Write. +func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := rw.Pipe.write(ctx, src) + if n > 0 { + rw.Pipe.Notify(waiter.EventIn) + } + return n, err +} + +// Readiness returns the ready events in the underlying pipe. +func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { + return rw.Pipe.rwReadiness() & mask +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch int(args[1].Int()) { + case syscall.TIOCINQ: + v := rw.queuedSize() + if v > math.MaxInt32 { + panic(fmt.Sprintf("Impossibly large pipe queued size: %d", v)) + } + // Copy result to user-space. 
+ _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syscall.ENOTTY + } +} diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go new file mode 100644 index 000000000..fd13008ac --- /dev/null +++ b/pkg/sentry/kernel/pipe/writer.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Writer satisfies the fs.FileOperations interface for write-only pipes. +// Writer should be used with !fs.FileFlags.Read to reject reads. +type Writer struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +func (w *Writer) Release() { + w.Pipe.wClose() + // Wake up readers. + w.Pipe.Notify(waiter.EventHUp) +} + +// Readiness returns the ready events in the underlying pipe. +func (w *Writer) Readiness(mask waiter.EventMask) waiter.EventMask { + return w.Pipe.wReadiness() & mask +} diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go new file mode 100644 index 000000000..20b1c4cd4 --- /dev/null +++ b/pkg/sentry/kernel/ptrace.go @@ -0,0 +1,1054 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptrace constants from Linux's include/uapi/linux/ptrace.h. +const ( + _PTRACE_EVENT_SECCOMP = 7 + PTRACE_SEIZE = 0x4206 + PTRACE_INTERRUPT = 0x4207 + PTRACE_LISTEN = 0x4208 + PTRACE_PEEKSIGINFO = 0x4209 + PTRACE_GETSIGMASK = 0x420a + PTRACE_SETSIGMASK = 0x420b + _PTRACE_O_EXITKILL = 1 << 20 + _PTRACE_O_TRACESECCOMP = 1 << _PTRACE_EVENT_SECCOMP +) + +// ptraceOptions are the subset of options controlling a task's ptrace behavior +// that are set by ptrace(PTRACE_SETOPTIONS). +type ptraceOptions struct { + // ExitKill is true if the tracee should be sent SIGKILL when the tracer + // exits. + ExitKill bool + + // If SysGood is true, set bit 7 in the signal number for + // syscall-entry-stop and syscall-exit-stop traps delivered to this task's + // tracer. + SysGood bool + + // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE + // events. + TraceClone bool + + // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC + // events. + TraceExec bool + + // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT + // events. + TraceExit bool + + // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK + // events. + TraceFork bool + + // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP + // events. 
+ TraceSeccomp bool + + // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK + // events. + TraceVfork bool + + // TraceVforkDone is true if the tracer wants to receive + // PTRACE_EVENT_VFORK_DONE events. + TraceVforkDone bool +} + +// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry +// and exit. +type ptraceSyscallMode int + +const ( + // ptraceSyscallNone indicates that the task has never ptrace-stopped, or + // that it was resumed from its last ptrace-stop by PTRACE_CONT or + // PTRACE_DETACH. The task's syscalls will not be intercepted. + ptraceSyscallNone ptraceSyscallMode = iota + + // ptraceSyscallIntercept indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a + // syscall, a ptrace-stop will occur. + ptraceSyscallIntercept + + // ptraceSyscallEmu indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time + // the task enters a syscall, the syscall will be skipped, and a + // ptrace-stop will occur. + ptraceSyscallEmu +) + +// CanTrace checks that t is permitted to access target's state, as defined by +// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it +// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access +// mode PTRACE_MODE_READ. +func (t *Task) CanTrace(target *Task, attach bool) bool { + // "1. If the calling thread and the target thread are in the same thread + // group, access is always allowed." - ptrace(2) + // + // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() + // should not deny sub-threads", first released in Linux 3.12), the rule + // only applies if t and target are the same task. But, as that commit + // message puts it, "[any] security check is pointless when the tasks share + // the same ->mm." + if t.tg == target.tg { + return true + } + + // """ + // 2. 
If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped, + // doesn't exist until Linux 4.5). + // + // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the + // caller's real UID and GID for the checks in the next step. (Most APIs + // that check the caller's UID and GID use the effective IDs. For + // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs + // instead.) + // + // 3. Deny access if neither of the following is true: + // + // - The real, effective, and saved-set user IDs of the target match the + // caller's user ID, *and* the real, effective, and saved-set group IDs of + // the target match the caller's group ID. + // + // - The caller has the CAP_SYS_PTRACE capability in the user namespace of + // the target. + // + // 4. Deny access if the target process "dumpable" attribute has a value + // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in + // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in + // the user namespace of the target process. + // + // 5. The kernel LSM security_ptrace_access_check() interface is invoked to + // see if ptrace access is permitted. The results depend on the LSM(s). The + // implementation of this interface in the commoncap LSM performs the + // following steps: + // + // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the + // caller's effective capability set; otherwise (the access mode specifies + // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. + // + // b) Deny access if neither of the following is true: + // + // - The caller and the target process are in the same user namespace, and + // the caller's capabilities are a proper superset of the target process's + // permitted capabilities. + // + // - The caller has the CAP_SYS_PTRACE capability in the target process's + // user namespace. 
+ // + // Note that the commoncap LSM does not distinguish between + // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this + // section: "the commoncap LSM ... is always invoked".) + // """ + callerCreds := t.Credentials() + targetCreds := target.Credentials() + if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { + return true + } + if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { + return false + } + if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { + return false + } + // TODO: dumpability check + if callerCreds.UserNamespace != targetCreds.UserNamespace { + return false + } + if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { + return false + } + // TODO: Yama LSM + return true +} + +// Tracer returns t's ptrace Tracer. +func (t *Task) Tracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +// hasTracer returns true if t has a ptrace tracer attached. +func (t *Task) hasTracer() bool { + // This isn't just inlined into callers so that if Task.Tracer() turns out + // to be too expensive because of e.g. interface conversion, we can switch + // to having a separate atomic flag more easily. + return t.Tracer() != nil +} + +// ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +type ptraceStop struct { + // If frozen is true, the stopped task's tracer is currently operating on + // it, so Task.Kill should not remove the stop. + frozen bool +} + +// Killable implements TaskStop.Killable. +func (s *ptraceStop) Killable() bool { + return !s.frozen +} + +// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been +// killed, the stop is skipped, and beginPtraceStopLocked returns false. +// +// beginPtraceStopLocked does not signal t's tracer or wake it if it is +// waiting. 
+// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) beginPtraceStopLocked() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => + // kernel/sched/core.c:__schedule() => signal_pending_state() check, which + // is what prevents tasks from entering ptrace-stops after being killed. + // Note that if t was SIGKILLed and beingPtraceStopLocked is being called + // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before + // entering the exit path, so t.killable() will no longer return true. This + // is consistent with Linux: "Bugs: ... A SIGKILL signal may still cause a + // PTRACE_EVENT_EXIT stop before actual signal death. This may be changed + // in the future; SIGKILL is meant to always immediately kill tasks even + // under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + if t.killedLocked() { + return false + } + t.beginInternalStopLocked(&ptraceStop{}) + return true +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceTrapLocked(code int32) { + t.ptraceCode = code + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: code, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + if t.beginPtraceStopLocked() { + tracer := t.Tracer() + tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } +} + +// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the +// ptraceStop, temporarily preventing it from being removed by a concurrent +// Task.Kill, and returns true. Otherwise it returns false. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine of t's tracer. 
+func (t *Task) ptraceFreeze() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return false + } + s, ok := t.stop.(*ptraceStop) + if !ok { + return false + } + s.frozen = true + return true +} + +// ptraceUnfreeze ends the effect of a previous successful call to +// ptraceFreeze. +// +// Preconditions: t must be in a frozen ptraceStop. +func (t *Task) ptraceUnfreeze() { + // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, + // preventing its thread group from completing execve. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // Do this even if the task has been killed to ensure a panic if t.stop is + // nil or not a ptraceStop. + t.stop.(*ptraceStop).frozen = false + if t.killedLocked() { + t.endInternalStopLocked() + } +} + +// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, +// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on +// mode and singlestep. +// +// Preconditions: t must be in a frozen ptrace stop. +// +// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace +// stop. +func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.ptraceCode = int32(sig) + t.ptraceSyscallMode = mode + t.ptraceSinglestep = singlestep + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceTraceme() error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if t.hasTracer() { + return syserror.EPERM + } + if t.parent == nil { + // In Linux, only init can not have a parent, and init is assumed never + // to invoke PTRACE_TRACEME. 
In the sentry, TGID 1 is an arbitrary user + // application that may invoke PTRACE_TRACEME; having no parent can + // also occur if all tasks in the parent thread group have exited, and + // failed to find a living thread group to reparent to. The former case + // is treated as if TGID 1 has an exited parent in an invisible + // ancestor PID namespace that is an owner of the root user namespace + // (and consequently has CAP_SYS_PTRACE), and the latter case is a + // special form of the exited parent case below. In either case, + // returning nil here is correct. + return nil + } + if !t.parent.CanTrace(t, true) { + return syserror.EPERM + } + if t.parent.exitState != TaskExitNone { + // Fail silently, as if we were successfully attached but then + // immediately detached. This is consistent with Linux. + return nil + } + t.ptraceTracer.Store(t.parent) + t.parent.ptraceTracees[t] = struct{}{} + return nil +} + +// ptraceAttach implements ptrace(PTRACE_ATTACH, target). t is the caller. +func (t *Task) ptraceAttach(target *Task) error { + if t.tg == target.tg { + return syserror.EPERM + } + if !t.CanTrace(target, true) { + return syserror.EPERM + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.hasTracer() { + return syserror.EPERM + } + // Attaching to zombies and dead tasks is not permitted; the exit + // notification logic relies on this. Linux allows attaching to PF_EXITING + // tasks, though. + if target.exitState >= TaskExitZombie { + return syserror.EPERM + } + target.ptraceTracer.Store(t) + t.ptraceTracees[target] = struct{}{} + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + // Undocumented Linux feature: If the tracee is already group-stopped (and + // consequently will not report the SIGSTOP just sent), force it to leave + // and re-enter the stop so that it will switch to a ptrace-stop. 
+ if target.stop == (*groupStop)(nil) { + target.groupStopRequired = true + target.endInternalStopLocked() + } + target.tg.signalHandlers.mu.Unlock() + return nil +} + +// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the +// caller. +// +// Preconditions: target must be a tracee of t in a frozen ptrace stop. +// +// Postconditions: If ptraceDetach returns nil, target will no longer be in a +// ptrace stop. +func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + target.ptraceCode = int32(sig) + target.forgetTracerLocked() + delete(t.ptraceTracees, target) + return nil +} + +// exitPtrace is called in the exit path to detach all of t's tracees. +func (t *Task) exitPtrace() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + for target := range t.ptraceTracees { + if target.ptraceOpts.ExitKill { + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, false /* group */) + target.tg.signalHandlers.mu.Unlock() + } + // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it + // observes the ptraceCode it set before it entered the stop. I believe + // this is consistent with Linux. + target.forgetTracerLocked() + } + // "nil maps cannot be saved" + t.ptraceTracees = make(map[*Task]struct{}) +} + +// forgetTracerLocked detaches t's tracer and ensures that t is no longer +// ptrace-stopped. +// +// Preconditions: The TaskSet mutex must be locked for writing. 
+func (t *Task) forgetTracerLocked() { + t.ptraceOpts = ptraceOptions{} + t.ptraceSyscallMode = ptraceSyscallNone + t.ptraceSinglestep = false + t.ptraceTracer.Store((*Task)(nil)) + if t.exitTracerNotified && !t.exitTracerAcked { + t.exitTracerAcked = true + t.exitNotifyLocked(true) + } + // If t is ptrace-stopped, but its thread group is in a group stop and t is + // eligible to participate, make it do so. This is essentially the reverse + // of the special case in ptraceAttach, which converts a group stop to a + // ptrace stop. ("Handling of restart from group-stop is currently buggy, + // but the "as planned" behavior is to leave tracee stopped and waiting for + // SIGCONT." - ptrace(2)) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return + } + if _, ok := t.stop.(*ptraceStop); ok { + if t.exitState < TaskExitInitiated && t.tg.groupStopPhase >= groupStopInitiated { + t.groupStopRequired = true + } + t.endInternalStopLocked() + } +} + +// ptraceSignalLocked is called after signal dequeueing to check if t should +// enter ptrace signal-delivery-stop. +// +// Preconditions: The signal mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { + if linux.Signal(info.Signo) == linux.SIGKILL { + return false + } + if !t.hasTracer() { + return false + } + // The tracer might change this signal into a stop signal, in which case + // any SIGCONT received after the signal was originally dequeued should + // cancel it. This is consistent with Linux. + if t.tg.groupStopPhase == groupStopNone { + t.tg.groupStopPhase = groupStopDequeued + } + // Can't lock the TaskSet mutex while holding a signal mutex. 
+ t.tg.signalHandlers.mu.Unlock() + defer t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + tracer := t.Tracer() + if tracer == nil { + return false + } + t.ptraceCode = info.Signo + t.ptraceSiginfo = info + t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + return true +} + +// ptraceSeccomp is called when a seccomp-bpf filter returns action +// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data +// is the lower 16 bits of the filter's return value. +func (t *Task) ptraceSeccomp(data uint16) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceSeccomp { + return false + } + t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") + t.ptraceEventLocked(_PTRACE_EVENT_SECCOMP, uint64(data)) + return true +} + +// ptraceSyscallEnter is called immediately before entering a syscall to check +// if t should enter ptrace syscall-enter-stop. +func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { + if !t.hasTracer() { + return nil, false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.ptraceSyscallMode { + case ptraceSyscallNone: + return nil, false + case ptraceSyscallIntercept: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSyscallEnterStop)(nil), true + case ptraceSyscallEmu: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSysemuStop)(nil), true + } + panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) +} + +// ptraceSyscallExit is called immediately after leaving a syscall to check if +// t should enter ptrace syscall-exit-stop. 
+func (t *Task) ptraceSyscallExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if t.ptraceSyscallMode != ptraceSyscallIntercept { + return + } + t.Debugf("Entering syscall-exit-stop") + t.ptraceSyscallStopLocked() +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceSyscallStopLocked() { + code := int32(linux.SIGTRAP) + if t.ptraceOpts.SysGood { + code |= 0x80 + } + t.ptraceTrapLocked(code) +} + +type ptraceCloneKind int32 + +const ( + // ptraceCloneKindClone represents a call to Task.Clone where + // TerminationSignal is not SIGCHLD and Vfork is false. + ptraceCloneKindClone ptraceCloneKind = iota + + // ptraceCloneKindFork represents a call to Task.Clone where + // TerminationSignal is SIGCHLD and Vfork is false. + ptraceCloneKindFork + + // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is + // true. + ptraceCloneKindVfork +) + +// ptraceClone is called at the end of a clone or fork syscall to check if t +// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK +// stop. child is the new task. 
+func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + event := false + if !opts.Untraced { + switch kind { + case ptraceCloneKindClone: + if t.ptraceOpts.TraceClone { + t.Debugf("Entering PTRACE_EVENT_CLONE stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindFork: + if t.ptraceOpts.TraceFork { + t.Debugf("Entering PTRACE_EVENT_FORK stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindVfork: + if t.ptraceOpts.TraceVfork { + t.Debugf("Entering PTRACE_EVENT_VFORK stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) + event = true + } + default: + panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) + } + } + // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE + // options are in effect, then children created by, respectively, vfork(2) + // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit + // signal set to SIGCHLD, and other kinds of clone(2), are automatically + // attached to the same tracer which traced their parent. SIGSTOP is + // delivered to the children, causing them to enter signal-delivery-stop + // after they exit the system call which created them." - ptrace(2) + // + // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is + // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => + // include/linux/ptrace.h:ptrace_init_task(). + if event || opts.InheritTracer { + tracer := t.Tracer() + if tracer != nil { + child.ptraceTracer.Store(tracer) + tracer.ptraceTracees[child] = struct{}{} + // "Flags are inherited by new tracees created and "auto-attached" + // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or + // PTRACE_O_TRACECLONE options." 
+ child.ptraceOpts = t.ptraceOpts + child.tg.signalHandlers.mu.Lock() + // If the child is PT_SEIZED (currently not possible in the sentry + // because PTRACE_SEIZE is unimplemented, but for future + // reference), Linux just sets JOBCTL_TRAP_STOP instead, so the + // child skips signal-delivery-stop and goes directly to + // group-stop. + // + // The child will self-t.interrupt() when its task goroutine starts + // running, so we don't have to. + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }) + child.tg.signalHandlers.mu.Unlock() + } + } + return event +} + +// ptraceVforkDone is called after the end of a vfork stop to check if t should +// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's +// PID namespace. +func (t *Task) ptraceVforkDone(child ThreadID) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceVforkDone { + return false + } + t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK_DONE, uint64(child)) + return true +} + +// ptraceExec is called at the end of an execve syscall to check if t should +// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID +// namespace, prior to the execve. (If t did not have a tracer at the time +// oldTID was read, oldTID may be 0. This is consistent with Linux.) +func (t *Task) ptraceExec(oldTID ThreadID) { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + // Recheck with the TaskSet mutex locked. Most ptrace points don't need to + // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC + // is special because both TraceExec and !TraceExec do something if a + // tracer is attached. 
+ if !t.hasTracer() { + return + } + if t.ptraceOpts.TraceExec { + t.Debugf("Entering PTRACE_EVENT_EXEC stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_EXEC, uint64(oldTID)) + return + } + // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing + // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] + // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after + // execve(2) returns. This is an ordinary signal (similar to one which can + // be generated by `kill -TRAP`, not a special kind of ptrace-stop. + // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 + // (SI_USER). This signal may be blocked by signal mask, and thus may be + // delivered (much) later." - ptrace(2) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: arch.SignalInfoUser, + }, false /* group */) +} + +// ptraceExit is called early in the task exit path to check if t should enter +// PTRACE_EVENT_EXIT stop. +func (t *Task) ptraceExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceExit { + return + } + t.tg.signalHandlers.mu.Lock() + status := t.exitStatus.Status() + t.tg.signalHandlers.mu.Unlock() + t.Debugf("Entering PTRACE_EVENT_EXIT stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_EXIT, uint64(status)) +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceEventLocked(event int32, msg uint64) { + t.ptraceEventMsg = msg + // """ + // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning + // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An + // additional bit is set in the higher byte of the status word: the value + // status>>8 will be + // + // (SIGTRAP | PTRACE_EVENT_foo << 8). + // + // ... 
+ // + // """ - ptrace(2) + t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) +} + +// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. +func (t *Task) ptraceKill(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + // "This operation is deprecated; do not use it! Instead, send a SIGKILL + // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is + // that it requires the tracee to be in signal-delivery-stop, otherwise it + // may not work (i.e., may complete successfully but won't kill the + // tracee)." - ptrace(2) + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + target.ptraceCode = int32(linux.SIGKILL) + target.endInternalStopLocked() + return nil +} + +// Ptrace implements the ptrace system call. +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { + // PTRACE_TRACEME ignores all other arguments. + if req == syscall.PTRACE_TRACEME { + return t.ptraceTraceme() + } + // All other ptrace requests operate on a current or future tracee + // specified by pid. + target := t.tg.pidns.TaskWithID(pid) + if target == nil { + return syserror.ESRCH + } + + // PTRACE_ATTACH (and PTRACE_SEIZE, which is unimplemented) do not require + // that target is not already a tracee. + if req == syscall.PTRACE_ATTACH { + return t.ptraceAttach(target) + } + // PTRACE_KILL (and PTRACE_INTERRUPT, which is unimplemented) require that + // the target is a tracee, but does not require that it is ptrace-stopped. + if req == syscall.PTRACE_KILL { + return t.ptraceKill(target) + } + // All other ptrace requests require that the target is a ptrace-stopped + // tracee, and freeze the ptrace-stop so the tracee can be operated on. 
+ t.tg.pidns.owner.mu.RLock() + if target.Tracer() != t { + t.tg.pidns.owner.mu.RUnlock() + return syserror.ESRCH + } + if !target.ptraceFreeze() { + t.tg.pidns.owner.mu.RUnlock() + // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, + // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the + // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - + // ptrace(2) + return syserror.ESRCH + } + t.tg.pidns.owner.mu.RUnlock() + // Even if the target has a ptrace-stop active, the tracee's task goroutine + // may not yet have reached Task.doStop; wait for it to do so. This is safe + // because there's no way for target to initiate a ptrace-stop and then + // block (by calling Task.block) before entering it. + // + // Caveat: If tasks were just restored, the tracee's first call to + // Task.Activate (in Task.run) occurs before its first call to Task.doStop, + // which may block if the tracer's address space is active. + t.UninterruptibleSleepStart(true) + target.waitGoroutineStoppedOrExited() + t.UninterruptibleSleepFinish(true) + + // Resuming commands end the ptrace stop, but only if successful. 
+ switch req { + case syscall.PTRACE_DETACH: + if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_CONT: + if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSCALL: + if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSEMU: + if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSEMU_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + } + // All other ptrace requests expect us to unfreeze the stop. + defer target.ptraceUnfreeze() + + switch req { + case syscall.PTRACE_PEEKTEXT, syscall.PTRACE_PEEKDATA: + // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and + // PTRACE_PEEKUSER requests have a different API: they store the result + // at the address specified by the data parameter, and the return value + // is the error flag." 
- ptrace(2) + word := t.Arch().Native(0) + if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ + IgnorePermissions: true, + }); err != nil { + return err + } + _, err := t.CopyOut(data, word) + return err + + case syscall.PTRACE_POKETEXT, syscall.PTRACE_POKEDATA: + _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ + IgnorePermissions: true, + }) + return err + + case syscall.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER + n, err := target.Arch().PtracePeekUser(uintptr(addr)) + if err != nil { + return err + } + _, err = t.CopyOut(data, n) + return err + + case syscall.PTRACE_POKEUSR: // aka PTRACE_POKEUSER + return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) + + case syscall.PTRACE_GETREGS: + // "Copy the tracee's general-purpose ... registers ... to the address + // data in the tracer. ... (addr is ignored.) Note that SPARC systems + // have the meaning of data and addr reversed ..." + _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_GETFPREGS: + _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_GETREGSET: + // "Read the tracee's registers. addr specifies, in an + // architecture-dependent way, the type of registers to be read. ... + // data points to a struct iovec, which describes the destination + // buffer's location and length. On return, the kernel modifies iov.len + // to indicate the actual number of bytes returned." 
- ptrace(2) + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case syscall.PTRACE_SETREGS: + _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_SETFPREGS: + _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_SETREGSET: + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case syscall.PTRACE_GETSIGINFO: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.ptraceSiginfo) + return err + + case syscall.PTRACE_SETSIGINFO: + var info arch.SignalInfo + if _, err := t.CopyIn(data, &info); err != nil { + return err + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + target.ptraceSiginfo = &info + return nil + + case PTRACE_GETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + target.mu.Lock() + 
defer target.mu.Unlock() + _, err := t.CopyOut(data, target.tr.SignalMask) + return err + + case PTRACE_SETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if _, err := t.CopyIn(data, &mask); err != nil { + return err + } + // The target's task goroutine is stopped, so this is safe: + target.SetSignalMask(mask &^ UnblockableSignals) + return nil + + case syscall.PTRACE_SETOPTIONS: + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + validOpts := uintptr(_PTRACE_O_EXITKILL | syscall.PTRACE_O_TRACESYSGOOD | syscall.PTRACE_O_TRACECLONE | + syscall.PTRACE_O_TRACEEXEC | syscall.PTRACE_O_TRACEEXIT | syscall.PTRACE_O_TRACEFORK | + _PTRACE_O_TRACESECCOMP | syscall.PTRACE_O_TRACEVFORK | syscall.PTRACE_O_TRACEVFORKDONE) + if uintptr(data)&^validOpts != 0 { + return syserror.EINVAL + } + target.ptraceOpts = ptraceOptions{ + ExitKill: data&_PTRACE_O_EXITKILL != 0, + SysGood: data&syscall.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: data&syscall.PTRACE_O_TRACECLONE != 0, + TraceExec: data&syscall.PTRACE_O_TRACEEXEC != 0, + TraceExit: data&syscall.PTRACE_O_TRACEEXIT != 0, + TraceFork: data&syscall.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: data&_PTRACE_O_TRACESECCOMP != 0, + TraceVfork: data&syscall.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: data&syscall.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil + + case syscall.PTRACE_GETEVENTMSG: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + return err + + default: + // PEEKSIGINFO is unimplemented but seems to have no users anywhere. + return syserror.EIO + } +} diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go new file mode 100644 index 000000000..635372993 --- /dev/null +++ b/pkg/sentry/kernel/rseq.go @@ -0,0 +1,118 @@ +// Copyright 2018 Google Inc. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// Restartable sequences, as described in https://lwn.net/Articles/650333/.

// RSEQCriticalRegion describes a restartable sequence critical region.
type RSEQCriticalRegion struct {
	// When a task in this thread group has its CPU preempted (as defined by
	// platform.ErrContextCPUPreempted) or has a signal delivered to an
	// application handler while its instruction pointer is in CriticalSection,
	// set the instruction pointer to Restart and application register r10 (on
	// amd64) to the former instruction pointer.

	// CriticalSection is the application address range of the critical region.
	CriticalSection usermem.AddrRange

	// Restart is the application address execution resumes at after an
	// interrupted critical section.
	Restart usermem.Addr
}

// RSEQAvailable returns true if t supports restartable sequences.
func (t *Task) RSEQAvailable() bool {
	return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
}

// RSEQCriticalRegion returns a copy of t's thread group's current restartable
// sequence.
func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion {
	return *t.tg.rscr.Load().(*RSEQCriticalRegion)
}

// SetRSEQCriticalRegion replaces t's thread group's restartable sequence.
//
// Preconditions: t.RSEQAvailable() == true.
func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error {
	// These checks are somewhat more lenient than in Linux, which (bizarrely)
	// requires rscr.CriticalSection to be non-empty and rscr.Restart to be
	// outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0
	// (which disables the critical region).
	if rscr.CriticalSection.Start == 0 {
		// A zero Start disables the region; normalize the remaining fields.
		rscr.CriticalSection.End = 0
		rscr.Restart = 0
		t.tg.rscr.Store(&rscr)
		return nil
	}
	if rscr.CriticalSection.Start >= rscr.CriticalSection.End {
		return syserror.EINVAL
	}
	if rscr.CriticalSection.Contains(rscr.Restart) {
		// The restart point must lie outside the critical section; otherwise
		// the restart itself could be interrupted and re-restarted.
		return syserror.EINVAL
	}
	// TODO: check that rscr.CriticalSection and rscr.Restart are in
	// the application address range, for consistency with Linux
	t.tg.rscr.Store(&rscr)
	return nil
}

// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU
// number.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) RSEQCPUAddr() usermem.Addr {
	return t.rseqCPUAddr
}

// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU
// number.
//
// Preconditions: t.RSEQAvailable() == true. The caller must be running on the
// task goroutine. t's AddressSpace must be active.
func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error {
	t.rseqCPUAddr = addr
	if addr != 0 {
		if err := t.rseqCopyOutCPU(); err != nil {
			// Roll back to the disabled state so later context switches don't
			// keep writing through a bad address.
			t.rseqCPUAddr = 0
			t.rseqCPU = -1
			return syserror.EINVAL // yes, EINVAL, not err or EFAULT
		}
	} else {
		t.rseqCPU = -1
	}
	return nil
}

// rseqCopyOutCPU writes t's current CPU number (as a 4-byte value) to
// t.rseqCPUAddr.
//
// Preconditions: The caller must be running on the task goroutine. t's
// AddressSpace must be active.
func (t *Task) rseqCopyOutCPU() error {
	t.rseqCPU = int32(hostcpu.GetCPU())
	buf := t.CopyScratchBuffer(4)
	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
	_, err := t.CopyOutBytes(t.rseqCPUAddr, buf)
	return err
}

// Preconditions: The caller must be running on the task goroutine.
// rseqInterrupt is called when t is interrupted; if t's instruction pointer
// is inside the thread group's critical region, execution is redirected to
// the region's restart point.
func (t *Task) rseqInterrupt() {
	rscr := t.tg.rscr.Load().(*RSEQCriticalRegion)
	if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) {
		t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart)
		t.Arch().SetIP(uintptr(rscr.Restart))
		// Record the interrupted IP for the application (register r10 on
		// amd64, per RSEQCriticalRegion's comment).
		t.Arch().SetRSEQInterruptedIP(ip)
	}
}
diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD
new file mode 100644
index 000000000..b533c51c4
--- /dev/null
+++ b/pkg/sentry/kernel/sched/BUILD
@@ -0,0 +1,20 @@
package(licenses = ["notice"])  # Apache 2.0

load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
    name = "sched",
    srcs = [
        "cpuset.go",
        "sched.go",
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched",
    visibility = ["//pkg/sentry:internal"],
)

go_test(
    name = "sched_test",
    size = "small",
    srcs = ["cpuset_test.go"],
    embed = [":sched"],
)
diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go
new file mode 100644
index 000000000..0a97603f0
--- /dev/null
+++ b/pkg/sentry/kernel/sched/cpuset.go
@@ -0,0 +1,105 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sched

import "math/bits"

const (
	// bitsPerByte is the number of CPU bits carried by each CPUSet byte.
	bitsPerByte = 8

	// bytesPerLong mirrors sizeof(unsigned long).
	bytesPerLong = 8 // only for 64-bit architectures
)

// CPUSet contains a bitmap to record CPU information.
+// +// Note that this definition is only correct for little-endian architectures, +// since Linux's cpumask_t uses unsigned long. +type CPUSet []byte + +// CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. +func CPUSetSize(num uint) uint { + // NOTE: Applications may expect that the size of a CPUSet in + // bytes is always a multiple of sizeof(unsigned long), since this is true + // in Linux. Thus we always round up. + bytes := (num + bitsPerByte - 1) / bitsPerByte + longs := (bytes + bytesPerLong - 1) / bytesPerLong + return longs * bytesPerLong +} + +// NewCPUSet returns a CPUSet for the given number of CPUs which initially +// contains no CPUs. +func NewCPUSet(num uint) CPUSet { + return CPUSet(make([]byte, CPUSetSize(num))) +} + +// NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which +// are present in the set. +func NewFullCPUSet(num uint) CPUSet { + c := NewCPUSet(num) + var i uint + for ; i < num/bitsPerByte; i++ { + c[i] = 0xff + } + if rem := num % bitsPerByte; rem != 0 { + c[i] = (1 << rem) - 1 + } + return c +} + +// Size returns the size of 'c' in bytes. +func (c CPUSet) Size() uint { + return uint(len(c)) +} + +// NumCPUs returns how many cpus are set in the CPUSet. +func (c CPUSet) NumCPUs() uint { + var n int + for _, b := range c { + n += bits.OnesCount8(b) + } + return uint(n) +} + +// Copy returns a copy of the CPUSet. +func (c CPUSet) Copy() CPUSet { + return append(CPUSet(nil), c...) +} + +// Set sets the bit corresponding to cpu. +func (c *CPUSet) Set(cpu uint) { + (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte) +} + +// ClearAbove clears bits corresponding to cpu and all higher cpus. +func (c *CPUSet) ClearAbove(cpu uint) { + i := cpu / bitsPerByte + if i >= c.Size() { + return + } + (*c)[i] &^= 0xff << (cpu % bitsPerByte) + for i++; i < c.Size(); i++ { + (*c)[i] = 0 + } +} + +// ForEachCPU iterates over the CPUSet and calls fn with the cpu index if +// it's set. 
func (c CPUSet) ForEachCPU(fn func(uint)) {
	for i := uint(0); i < c.Size()*bitsPerByte; i++ {
		// Test bit (i % 8) within byte (i / 8).
		bit := uint(1) << (i & (bitsPerByte - 1))
		if uint(c[i/bitsPerByte])&bit == bit {
			fn(i)
		}
	}
}
diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go
new file mode 100644
index 000000000..8a6e12958
--- /dev/null
+++ b/pkg/sentry/kernel/sched/cpuset_test.go
@@ -0,0 +1,44 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sched

import (
	"testing"
)

// TestNumCPUs verifies that setting the first i bits yields a count of i.
func TestNumCPUs(t *testing.T) {
	for i := uint(0); i < 1024; i++ {
		c := NewCPUSet(i)
		for j := uint(0); j < i; j++ {
			c.Set(j)
		}
		n := c.NumCPUs()
		if n != i {
			t.Errorf("got wrong number of cpus %d, want %d", n, i)
		}
	}
}

// TestClearAbove verifies that ClearAbove(cpu) leaves exactly cpu bits set in
// an initially-full set.
func TestClearAbove(t *testing.T) {
	const n = 1024
	c := NewFullCPUSet(n)
	for i := uint(0); i < n; i++ {
		cpu := n - i
		c.ClearAbove(cpu)
		if got := c.NumCPUs(); got != cpu {
			t.Errorf("iteration %d: got %d cpus, wanted %d", i, got, cpu)
		}
	}
}
diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go
new file mode 100644
index 000000000..f1de1da60
--- /dev/null
+++ b/pkg/sentry/kernel/sched/sched.go
@@ -0,0 +1,16 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package sched implements scheduler related features.
package sched
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
new file mode 100644
index 000000000..b7c4a507f
--- /dev/null
+++ b/pkg/sentry/kernel/seccomp.go
@@ -0,0 +1,205 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/binary"
	"gvisor.googlesource.com/gvisor/pkg/bpf"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// maxSyscallFilterInstructions caps the combined size of all installed
// seccomp-bpf filters; see AppendSyscallFilter.
const maxSyscallFilterInstructions = 1 << 15

// seccompResult is the outcome of applying a task's seccomp filters to a
// system call.
type seccompResult int

const (
	// seccompResultDeny indicates that a syscall should not be executed.
	seccompResultDeny seccompResult = iota

	// seccompResultAllow indicates that a syscall should be executed.
	seccompResultAllow

	// seccompResultKill indicates that the task should be killed immediately,
	// with the exit status indicating that the task was killed by SIGSYS.
	seccompResultKill

	// seccompResultTrace indicates that a ptracer was successfully notified as
	// a result of a SECCOMP_RET_TRACE.
	seccompResultTrace
)

// seccompData is equivalent to struct seccomp_data, which contains the data
// passed to seccomp-bpf filters.
type seccompData struct {
	// nr is the system call number.
	nr int32

	// arch is an AUDIT_ARCH_* value indicating the system call convention.
	arch uint32

	// instructionPointer is the value of the instruction pointer at the time
	// of the system call.
	instructionPointer uint64

	// args contains the first 6 system call arguments.
	args [6]uint64
}

// asBPFInput serializes d into the byte form consumed by the BPF interpreter.
func (d *seccompData) asBPFInput() bpf.Input {
	return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder}
}

// seccompSiginfo constructs the SIGSYS siginfo delivered for a
// SECCOMP_RET_TRAP result.
func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo {
	si := &arch.SignalInfo{
		Signo: int32(linux.SIGSYS),
		Errno: errno,
		Code:  arch.SYS_SECCOMP,
	}
	si.SetCallAddr(uint64(ip))
	si.SetSyscall(sysno)
	si.SetArch(t.SyscallTable().AuditNumber)
	return si
}

// checkSeccompSyscall applies the task's seccomp filters before the execution
// of syscall sysno at instruction pointer ip. (These parameters must be passed
// in because vsyscalls do not use the values in t.Arch().)
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) seccompResult {
	result := t.evaluateSyscallFilters(sysno, args, ip)
	switch result & linux.SECCOMP_RET_ACTION {
	case linux.SECCOMP_RET_TRAP:
		// "Results in the kernel sending a SIGSYS signal to the triggering
		// task without executing the system call. ... The SECCOMP_RET_DATA
		// portion of the return value will be passed as si_errno." -
		// Documentation/prctl/seccomp_filter.txt
		t.SendSignal(seccompSiginfo(t, int32(result&linux.SECCOMP_RET_DATA), sysno, ip))
		return seccompResultDeny

	case linux.SECCOMP_RET_ERRNO:
		// "Results in the lower 16-bits of the return value being passed to
		// userland as the errno without executing the system call."
		t.Arch().SetReturn(-uintptr(result & linux.SECCOMP_RET_DATA))
		return seccompResultDeny

	case linux.SECCOMP_RET_TRACE:
		// "When returned, this value will cause the kernel to attempt to
		// notify a ptrace()-based tracer prior to executing the system call.
		// If there is no tracer present, -ENOSYS is returned to userland and
		// the system call is not executed."
		if t.ptraceSeccomp(uint16(result & linux.SECCOMP_RET_DATA)) {
			return seccompResultTrace
		}
		// This useless-looking temporary is needed because Go.
		tmp := uintptr(syscall.ENOSYS)
		t.Arch().SetReturn(-tmp)
		return seccompResultDeny

	case linux.SECCOMP_RET_ALLOW:
		// "Results in the system call being executed."
		return seccompResultAllow

	case linux.SECCOMP_RET_KILL:
		// "Results in the task exiting immediately without executing the
		// system call. The exit status of the task will be SIGSYS, not
		// SIGKILL."
		fallthrough
	default: // consistent with Linux
		return seccompResultKill
	}
}

// evaluateSyscallFilters runs every installed filter against the syscall and
// returns the most restrictive (lowest SECCOMP_RET_ACTION) result.
func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
	data := seccompData{
		nr:                 sysno,
		arch:               t.tc.st.AuditNumber,
		instructionPointer: uint64(ip),
	}
	// data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
	// we can't do any slicing tricks or even use copy/append here.
	for i, arg := range args {
		if i >= len(data.args) {
			break
		}
		data.args[i] = arg.Uint64()
	}
	input := data.asBPFInput()

	ret := uint32(linux.SECCOMP_RET_ALLOW)
	// "Every filter successfully installed will be evaluated (in reverse
	// order) for each system call the task makes." - kernel/seccomp.c
	for i := len(t.syscallFilters) - 1; i >= 0; i-- {
		thisRet, err := bpf.Exec(t.syscallFilters[i], input)
		if err != nil {
			// A filter that fails to execute is treated as if it returned
			// SECCOMP_RET_KILL, the most restrictive action.
			t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
			thisRet = linux.SECCOMP_RET_KILL
		}
		// "If multiple filters exist, the return value for the evaluation of a
		// given system call will always use the highest precedent value." -
		// Documentation/prctl/seccomp_filter.txt
		//
		// (Note that this contradicts prctl(2): "If the filters permit prctl()
		// calls, then additional filters can be added; they are run in order
		// until the first non-allow result is seen." prctl(2) is incorrect.)
		//
		// "The ordering ensures that a min_t() over composed return values
		// always selects the least permissive choice." -
		// include/uapi/linux/seccomp.h
		if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) {
			ret = thisRet
		}
	}

	return ret
}

// AppendSyscallFilter adds BPF program p as a system call filter.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) AppendSyscallFilter(p bpf.Program) error {
	// Cap the combined length of all syscall filters (plus a penalty of 4
	// instructions per filter beyond the first) to
	// maxSyscallFilterInstructions. (This restriction is inherited from
	// Linux.)
	//
	// NOTE(review): t.syscallFilters is read by the loop below before t.mu is
	// taken; the append at the bottom is performed under t.mu. If anything
	// other than the task goroutine can mutate t.syscallFilters, the length
	// check races with the append — confirm the locking contract for
	// t.syscallFilters.
	totalLength := p.Length()
	for _, f := range t.syscallFilters {
		totalLength += f.Length() + 4
	}
	if totalLength > maxSyscallFilterInstructions {
		return syserror.ENOMEM
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	t.syscallFilters = append(t.syscallFilters, p)
	return nil
}

// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current
// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP)
// and /proc/[pid]/status.
func (t *Task) SeccompMode() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	// Any installed filter puts the task in filter mode; strict mode
	// (SECCOMP_MODE_STRICT) is not represented here.
	if len(t.syscallFilters) > 0 {
		return linux.SECCOMP_MODE_FILTER
	}
	return linux.SECCOMP_MODE_NONE
}
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
new file mode 100644
index 000000000..1656ad126
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -0,0 +1,62 @@
package(licenses = ["notice"])  # Apache 2.0

load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
load("//tools/go_generics:defs.bzl", "go_template_instance")
load("//tools/go_stateify:defs.bzl", "go_stateify")

go_template_instance(
    name = "waiter_list",
    out = "waiter_list.go",
    package = "semaphore",
    prefix = "waiter",
    template = "//pkg/ilist:generic_list",
    types = {
        "Linker": "*waiter",
    },
)

go_stateify(
    name = "semaphore_state",
    srcs = [
        "semaphore.go",
        "waiter_list.go",
    ],
    out = "semaphore_autogen_state.go",
    package = "semaphore",
)

go_library(
    name = "semaphore",
    srcs = [
        "semaphore.go",
        "semaphore_autogen_state.go",
        "waiter_list.go",
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore",
    visibility = ["//pkg/sentry:internal"],
    deps = [
        "//pkg/abi/linux",
        "//pkg/log",
        "//pkg/sentry/context",
        "//pkg/sentry/fs",
        "//pkg/sentry/kernel/auth",
        "//pkg/sentry/kernel/time",
        "//pkg/state",
        "//pkg/state/statefile",
        "//pkg/syserror",
    ],
)

go_test(
    name = "semaphore_test",
    size = "small",
    srcs = ["semaphore_test.go"],
    embed = [":semaphore"],
    deps = [
        "//pkg/abi/linux",
        "//pkg/sentry/context",
        "//pkg/sentry/context/contexttest",
        "//pkg/sentry/kernel/auth",
        "//pkg/syserror",
    ],
)
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
new file mode 100644
index 000000000..19ad5d537
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -0,0 +1,473 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package semaphore implements System V semaphores.
package semaphore

import (
	"sync"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/log"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

const (
	// valueMax is the maximum value of a single semaphore (SEMVMX).
	valueMax = 32767 // SEMVMX

	// semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL).
	semaphoresMax = 32000

	// setsMax is "system-wide limit on the number of semaphore sets" (SEMMNI).
	setsMax = 32000

	// semaphoresTotalMax is "system-wide limit on the number of semaphores"
	// (SEMMNS = SEMMNI*SEMMSL).
	semaphoresTotalMax = 1024000000
)

// Registry maintains a set of semaphores that can be found by key or ID.
type Registry struct {
	// mu protects all fields below.
	mu         sync.Mutex `state:"nosave"`
	semaphores map[int32]*Set
	// lastIDUsed is the most recently assigned set ID; newSet scans forward
	// from it to find the next free ID.
	lastIDUsed int32
}

// Set represents a set of semaphores that can be operated atomically.
type Set struct {
	// ID is a handle that identifies the set.
	ID int32

	// key is a user-provided key that can be shared between processes.
	key int32

	// creator is the user that created the set. Immutable.
	creator fs.FileOwner

	// mu protects all fields below.
	mu         sync.Mutex `state:"nosave"`
	owner      fs.FileOwner
	perms      fs.FilePermissions
	opTime     ktime.Time
	changeTime ktime.Time
	sems       []sem

	// dead is set to true when the set is removed and can't be reached anymore.
	// All waiters must wake up and fail when set is dead.
	dead bool
}

// sem represents a single semaphore from a set.
type sem struct {
	value   int16
	waiters waiterList `state:"zerovalue"`
}

// waiter represents a caller that is waiting for the semaphore value to
// become positive or zero.
type waiter struct {
	waiterEntry

	// value represents how much resource the waiter needs to wake up.
	value int16
	// ch is closed/signaled to wake the waiter.
	ch chan struct{}
}

// NewRegistry creates a new semaphore set registry.
func NewRegistry() *Registry {
	return &Registry{semaphores: make(map[int32]*Set)}
}

// FindOrCreate searches for a semaphore set that matches 'key'. If not found,
// it may create a new one if requested. If private is true, key is ignored and
// a new set is always created. If create is false, it fails if a set cannot
// be found. If exclusive is true, it fails if a set with the same key already
// exists.
func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
	if nsems < 0 || nsems > semaphoresMax {
		return nil, syserror.EINVAL
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	if !private {
		// Look up an existing semaphore.
		if set := r.findByKey(key); set != nil {
			// Check that caller can access semaphore set.
			creds := auth.CredentialsFromContext(ctx)
			if !set.checkPerms(creds, fs.PermsFromMode(mode)) {
				return nil, syserror.EACCES
			}

			// Validate parameters.
			if nsems > int32(set.size()) {
				return nil, syserror.EINVAL
			}
			if create && exclusive {
				// Matches semget(2) with IPC_CREAT|IPC_EXCL on an existing key.
				return nil, syserror.EEXIST
			}
			return set, nil
		}

		if !create {
			// Semaphore not found and should not be created.
			return nil, syserror.ENOENT
		}
	}

	// Zero is only valid if an existing set is found.
	if nsems == 0 {
		return nil, syserror.EINVAL
	}

	// Apply system limits.
	if len(r.semaphores) >= setsMax {
		return nil, syserror.EINVAL
	}
	if r.totalSems() > int(semaphoresTotalMax-nsems) {
		return nil, syserror.EINVAL
	}

	// Finally create a new set.
	owner := fs.FileOwnerFromContext(ctx)
	perms := fs.FilePermsFromMode(mode)
	return r.newSet(ctx, key, owner, owner, perms, nsems)
}

// RemoveID removes set with given 'id' from the registry and marks the set as
// dead. All waiters will be awakened and fail.
func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	set := r.semaphores[id]
	if set == nil {
		return syserror.EINVAL
	}

	// "The effective user ID of the calling process must match the creator or
	// owner of the semaphore set, or the caller must be privileged."
	if !set.checkCredentials(creds) && !set.checkCapability(creds) {
		return syserror.EACCES
	}

	delete(r.semaphores, set.ID)
	set.destroy()
	return nil
}

// newSet registers a new set under the next available ID.
//
// Preconditions: r.mu must be held.
func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) {
	set := &Set{
		key:   key,
		owner: owner,
		// NOTE(review): the creator parameter is ignored here and owner is
		// stored instead. The only caller in view passes the same value for
		// both, so behavior is unchanged today, but this looks like a latent
		// bug — confirm intent before relying on Set.creator.
		creator:    owner,
		perms:      perms,
		changeTime: ktime.NowFromContext(ctx),
		sems:       make([]sem, nsems),
	}

	// Find the next available ID.
	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
		// Handle wrap around.
		if id < 0 {
			id = 0
			continue
		}
		if r.semaphores[id] == nil {
			r.lastIDUsed = id
			r.semaphores[id] = set
			set.ID = id
			return set, nil
		}
	}

	log.Warningf("Semaphore map is full, they must be leaking")
	return nil, syserror.ENOMEM
}

// FindByID looks up a set given an ID.
// SetVal overrides a semaphore value, waking up waiters as needed.
//
// It returns ERANGE if val is outside [0, valueMax] or if num does not name
// a semaphore in the set, and EACCES if the caller lacks alter (write)
// permission.
//
// NOTE(review): Linux semctl(SETVAL) reports EINVAL, not ERANGE, for an
// out-of-range semnum — confirm the intended errno for a bad num.
func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials) error {
	if val < 0 || val > valueMax {
		return syserror.ERANGE
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have alter permission on the semaphore set."
	if !s.checkPerms(creds, fs.PermMask{Write: true}) {
		return syserror.EACCES
	}

	sem := s.findSem(num)
	if sem == nil {
		return syserror.ERANGE
	}

	// TODO: Clear undo entries in all processes
	sem.value = val
	s.changeTime = ktime.NowFromContext(ctx)
	// The new value may satisfy blocked operations; wake eligible waiters.
	sem.wakeWaiters()
	return nil
}
// ExecuteOps attempts to execute a list of operations to the set. It only
// succeeds when all operations can be applied. No changes are made if it
// fails.
//
// On failure, it may return an error (retries are hopeless) or it may return
// a channel that can be waited on before attempting again.
func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials) (chan struct{}, int32, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Did it race with a removal operation?
	if s.dead {
		return nil, 0, syserror.EIDRM
	}

	// Validate the operations: every referenced semaphore must exist
	// (EFBIG otherwise), and track whether any op modifies a value.
	readOnly := true
	for _, op := range ops {
		if s.findSem(int32(op.SemNum)) == nil {
			return nil, 0, syserror.EFBIG
		}
		if op.SemOp != 0 {
			readOnly = false
		}
	}

	// Wait-for-zero-only op lists need read permission; anything that
	// changes a value needs write ("alter") permission.
	if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) {
		return nil, 0, syserror.EACCES
	}

	ch, num, err := s.executeOps(ctx, ops)
	if err != nil {
		return nil, 0, err
	}
	return ch, num, nil
}
// AbortWait notifies that a waiter is giving up and will not wait on the
// channel anymore. num and ch must be the values previously returned to the
// caller by ExecuteOps.
//
// NOTE(review): num indexes s.sems without a range check; callers must only
// pass values obtained from ExecuteOps.
func (s *Set) AbortWait(num int32, ch chan struct{}) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Linear scan for the waiter whose channel matches ch.
	sem := &s.sems[num]
	for w := sem.waiters.Front(); w != nil; w = w.Next() {
		if w.ch == ch {
			sem.waiters.Remove(w)
			return
		}
	}
	// Waiter may not be found in case it raced with wakeWaiters().
}
+ if p.SupersetOf(reqPerms) { + return true + } + + return s.checkCapability(creds) +} + +func (s *Set) destroy() { + s.mu.Lock() + defer s.mu.Unlock() + + // Notify all waiters. Tney will fail on the next attempt to execute + // operations and return error. + s.dead = true + for _, s := range s.sems { + for w := s.waiters.Front(); w != nil; w = w.Next() { + w.ch <- struct{}{} + } + s.waiters.Reset() + } +} + +// wakeWaiters goes over all waiters and checks which of them can be notified. +func (s *sem) wakeWaiters() { + // Note that this will release all waiters waiting for 0 too. + for w := s.waiters.Front(); w != nil; { + if s.value < w.value { + // Still blocked, skip it. + continue + } + w.ch <- struct{}{} + old := w + w = w.Next() + s.waiters.Remove(old) + } +} + +func newWaiter(val int16) *waiter { + return &waiter{ + value: val, + ch: make(chan struct{}, 1), + } +} diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go new file mode 100644 index 000000000..0386586ab --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package semaphore + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { + ch, _, err := set.executeOps(ctx, ops) + if err != nil { + t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops) + } + if block { + if ch == nil { + t.Fatalf("ExecuteOps(ops) got: nil, expected: !nil, ops: %+v", ops) + } + if signalled(ch) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } + } else { + if ch != nil { + t.Fatalf("ExecuteOps(ops) got: %v, expected: nil, ops: %+v", ch, ops) + } + } + return ch +} + +func signalled(ch chan struct{}) bool { + select { + case <-ch: + return true + default: + return false + } +} + +func TestBasic(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } +} + +func TestWaitForZero(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 0}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 0 + chZero1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp 
= 0 + chZero2 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should have been signalled, ops: %+v, set: %+v", ops, set) + } + + ops[0].SemOp = -2 + executeOps(ctx, t, set, ops, false) + if !signalled(chZero1) { + t.Fatalf("ExecuteOps(ops) channel zero 1 should have been signalled, ops: %+v, set: %+v", ops, set) + } + if !signalled(chZero2) { + t.Fatalf("ExecuteOps(ops) channel zero 2 should have been signalled, ops: %+v, set: %+v", ops, set) + } +} + +func TestNoWait(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } + + ops[0].SemOp = 0 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } +} + +func TestUnregister(t *testing.T) { + ctx := contexttest.Context(t) + r := NewRegistry() + set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true) + if err != nil { + t.Fatalf("FindOrCreate() failed, err: %v", err) + } + if got := r.FindByID(set.ID); got.ID != set.ID { + t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set) + } + + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: -1}, + } + chs := make([]chan struct{}, 0, 5) + for i := 0; i < 5; i++ { + ch := executeOps(ctx, t, set, ops, true) + chs = append(chs, ch) + } + + creds := auth.CredentialsFromContext(ctx) + if err := r.RemoveID(set.ID, creds); err != nil { + t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err) + } + if !set.dead { + t.Fatalf("set is 
// SessionID is the public identifier of a Session. Within a given PID
// namespace it equals the thread ID of the session leader at the time the
// session was created (see createSession).
type SessionID ThreadID
// decRef drops a reference.
//
// When the last reference is dropped, the destructor unregisters the session
// from every PID namespace it is visible in (the leader's namespace and all
// of its ancestors) and from the global session list.
//
// Precondition: callers must hold TaskSet.mu for writing.
func (s *Session) decRef() {
	s.refs.DecRefWithDestructor(func() {
		// Remove translations from the leader.
		for ns := s.leader.pidns; ns != nil; ns = ns.parent {
			id := ns.sids[s]
			delete(ns.sids, s)
			delete(ns.sessions, id)
		}

		// Remove from the list of global Sessions.
		s.leader.pidns.owner.sessions.Remove(s)
	})
}
// decRefWithParent drops a reference.
//
// parentPG is per incRefWithParent.
//
// Precondition: callers must hold TaskSet.mu for writing.
func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
	// See incRefWithParent regarding parent == nil.
	if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) {
		pg.ancestors--
	}

	// alive records whether the destructor ran: a fully-destroyed group
	// must not go through handleOrphan afterwards.
	alive := true
	pg.refs.DecRefWithDestructor(func() {
		alive = false // don't bother with handleOrphan.

		// Remove translations from the originator.
		for ns := pg.originator.pidns; ns != nil; ns = ns.parent {
			id := ns.pgids[pg]
			delete(ns.pgids, pg)
			delete(ns.processGroups, id)
		}

		// Remove from the session's list of process groups and release
		// the reference held on the session.
		pg.session.processGroups.Remove(pg)
		pg.session.decRef()
	})
	if alive {
		// The group may have just lost an "ancestor" member; if it became
		// an orphan with stopped jobs, handleOrphan delivers the signals.
		pg.handleOrphan()
	}
}
// createSession creates a new session for a threadgroup.
//
// Precondition: callers must hold TaskSet.mu for writing.
func (tg *ThreadGroup) createSession() error {
	// Get the ID for this thread in the current namespace.
	id := tg.pidns.tids[tg.leader]

	// Check if this ThreadGroup already leads a Session, or
	// if the proposed group is already taken. Only sessions rooted in the
	// same PID namespace are considered.
	for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() {
		if s.leader.pidns != tg.pidns {
			continue
		}
		if s.leader == tg {
			return syserror.EPERM
		}
		if s.id == SessionID(id) {
			return syserror.EPERM
		}
		for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
			if pg.id == ProcessGroupID(id) {
				return syserror.EPERM
			}
		}
	}

	// Create a new Session, with a single reference.
	s := &Session{
		id:     SessionID(id),
		leader: tg,
	}

	// Create a new ProcessGroup, belonging to that Session.
	// This also has a single reference (assigned below).
	//
	// Note that since this is a new session and a new process group, there
	// will be zero ancestors for this process group. (It is an orphan at
	// this point.)
	pg := &ProcessGroup{
		id:         ProcessGroupID(id),
		originator: tg,
		session:    s,
		ancestors:  0,
	}

	// Tie them and return the result.
	s.processGroups.PushBack(pg)
	tg.pidns.owner.sessions.PushBack(s)

	// Leave the current group, and assign the new one.
	if tg.processGroup != nil {
		oldParentPG := tg.parentPG()
		// Children move their ancestry accounting to the new group before
		// this thread group releases its reference on the old one.
		tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
			childTG.processGroup.incRefWithParent(pg)
			childTG.processGroup.decRefWithParent(oldParentPG)
		})
		tg.processGroup.decRefWithParent(oldParentPG)
		tg.processGroup = pg
	} else {
		// The current process group may be nil only in the case of an
		// unparented thread group (i.e. the init process). This would
		// not normally occur, but we allow it for the convenience of
		// CreateSession working from that point. There will be no
		// child processes. We always say that the very first group
		// created has ancestors (avoids checks elsewhere).
		//
		// Note that this mirrors the parent == nil logic in
		// incRef/decRef/reparent, which counts nil as an ancestor.
		tg.processGroup = pg
		tg.processGroup.ancestors++
	}

	// Ensure a translation is added to all namespaces.
	for ns := tg.pidns; ns != nil; ns = ns.parent {
		local := ns.tids[tg.leader]
		ns.sids[s] = SessionID(local)
		ns.sessions[SessionID(local)] = s
		ns.pgids[pg] = ProcessGroupID(local)
		ns.processGroups[ProcessGroupID(local)] = pg
	}

	return nil
}
+func (tg *ThreadGroup) CreateProcessGroup() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + + // Get the ID for this thread in the current namespace. + id := tg.pidns.tids[tg.leader] + + // Per above, check for a Session leader or existing group. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new ProcessGroup, belonging to the current Session. + // + // We manually adjust the ancestors if the parent is in the same + // session. + tg.processGroup.session.incRef() + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: tg.processGroup.session, + } + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { + pg.ancestors++ + } + + // Assign the new process group; adjust children. + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = pg + + // Ensure this translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tids[tg.leader] + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + return nil +} + +// JoinProcessGroup joins an existing process group. +// +// This function will return EACCES if an exec has been performed since fork +// by the given ThreadGroup, and EPERM if the Sessions are not the same or the +// group does not exist. +// +// If checkExec is set, then the join is not permitted after the process has +// executed exec at least once. 
func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error {
	pidns.owner.mu.Lock()
	defer pidns.owner.mu.Unlock()

	// Lookup the ProcessGroup in the caller-supplied namespace.
	pg := pidns.processGroups[pgid]
	if pg == nil {
		return syserror.EPERM
	}

	// Disallow the join if an execve has been performed, per POSIX.
	if checkExec && tg.execed {
		return syserror.EACCES
	}

	// See if it's in the same session as ours.
	if pg.session != tg.processGroup.session {
		return syserror.EPERM
	}

	// Join the group; adjust children. The reference on the new group is
	// taken before the old group's reference is released, so the target
	// group cannot be destroyed in between.
	parentPG := tg.parentPG()
	pg.incRefWithParent(parentPG)
	tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
		childTG.processGroup.incRefWithParent(pg)
		childTG.processGroup.decRefWithParent(tg.processGroup)
	})
	tg.processGroup.decRefWithParent(parentPG)
	tg.processGroup = pg

	return nil
}
// IDOfProcessGroup returns the process group assigned to pg in PID namespace ns.
//
// The same constraints apply as IDOfSession: if pg is not visible in this
// namespace, zero is returned, and it is the caller's responsibility to
// check for that before using the ID.
func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
	pidns.owner.mu.RLock()
	defer pidns.owner.mu.RUnlock()
	return pidns.pgids[pg]
}
// sendExternalSignal is called when an asynchronous signal is sent to the
// sentry ("in sentry context"). On some platforms, it may also be called when
// an asynchronous signal is sent to sandboxed application threads ("in
// application context").
//
// context is used only for debugging to differentiate these cases.
//
// Returns false if signal could not be sent because the Kernel is not fully
// initialized yet.
func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) bool {
	switch linux.Signal(info.Signo) {
	case platform.SignalInterrupt:
		// Assume that a call to platform.Context.Interrupt() misfired.
		// Nothing further to do.
		return true

	case SignalPanic:
		// SignalPanic is also specially handled in sentry setup to ensure that
		// it causes a panic even after tasks exit, but SignalPanic may also
		// be sent here if it is received while in app context.
		panic("Signal-induced panic")

	default:
		log.Infof("Received external signal %d in %s context", info.Signo, context)
		if k.globalInit == nil {
			// Too early in boot: there is no init thread group to
			// deliver the signal to yet.
			log.Warningf("Received external signal %d before init created", info.Signo)
			return false
		}
		// Deliver the signal to the init thread group.
		k.globalInit.SendSignal(info)
	}

	return true
}
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +// SignalHandlers holds information about signal actions. +type SignalHandlers struct { + // mu protects actions, as well as the signal state of all tasks and thread + // groups using this SignalHandlers object. (See comment on + // ThreadGroup.signalHandlers.) + mu sync.Mutex `state:"nosave"` + + // actions is the action to be taken upon receiving each signal. + actions map[linux.Signal]arch.SignalAct +} + +// NewSignalHandlers returns a new SignalHandlers specifying all default +// actions. +func NewSignalHandlers() *SignalHandlers { + return &SignalHandlers{ + actions: make(map[linux.Signal]arch.SignalAct), + } +} + +// Fork returns a copy of sh for a new thread group. +func (sh *SignalHandlers) Fork() *SignalHandlers { + sh2 := NewSignalHandlers() + sh.mu.Lock() + defer sh.mu.Unlock() + for sig, act := range sh.actions { + sh2.actions[sig] = act + } + return sh2 +} + +// CopyForExec returns a copy of sh for a thread group that is undergoing an +// execve. (See comments in Task.finishExec.) 
// dequeueAction returns the SignalAct that should be used to handle sig.
// (The comment previously said "dequeueActionLocked", which does not match
// the function's name.) If the action reports IsResetHandler() — presumably
// SA_RESETHAND semantics, confirm — it is removed, so subsequent deliveries
// of sig fall back to the default action (the map's zero value).
//
// Preconditions: sh.mu must be locked.
func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct {
	act := sh.actions[sig]
	if act.IsResetHandler() {
		delete(sh.actions, sig)
	}
	return act
}
+type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
+
+// MissingFn is a syscall to be called when an implementation is missing.
+type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)
+
+// Possible flags for SyscallFlagsTable.enable.
+const (
+	// syscallPresent indicates that this is not a missing syscall.
+	//
+	// This flag is used internally in SyscallFlagsTable.
+	syscallPresent = 1 << iota
+
+	// StraceEnableLog enables syscall log tracing.
+	StraceEnableLog
+
+	// StraceEnableEvent enables syscall event tracing.
+	StraceEnableEvent
+
+	// ExternalBeforeEnable enables the external hook before syscall execution.
+	ExternalBeforeEnable
+
+	// ExternalAfterEnable enables the external hook after syscall execution.
+	ExternalAfterEnable
+)
+
+// StraceEnableBits combines both strace log and event flags.
+const StraceEnableBits = StraceEnableLog | StraceEnableEvent
+
+// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall
+// basis.
+type SyscallFlagsTable struct {
+	// mu protects writes to the fields below.
+	//
+	// Atomic loads are always allowed. Atomic stores are allowed only
+	// while mu is held.
+	mu sync.Mutex
+
+	// enable contains the enable bits for each syscall.
+	//
+	// missing syscalls have the same value in enable as missingEnable to
+	// avoid an extra branch in Word.
+	enable []uint32
+
+	// missingEnable contains the enable bits for missing syscalls.
+	missingEnable uint32
+}
+
+// init initializes the struct, with all syscalls in table set to enable.
+//
+// max is the largest syscall number in table.
+func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) {
+	e.enable = make([]uint32, max+1)
+	for num := range table {
+		e.enable[num] = syscallPresent
+	}
+}
+
+// Word returns the enable bitfield for sysno.
+func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { + if sysno < uintptr(len(e.enable)) { + return atomic.LoadUint32(&e.enable[sysno]) + } + + return atomic.LoadUint32(&e.missingEnable) +} + +// Enable sets enable bit bit for all syscalls based on s. +// +// Syscalls missing from s are disabled. +// +// Syscalls missing from the initial table passed to Init cannot be added as +// individual syscalls. If present in s they will be ignored. +// +// Callers to Word may see either the old or new value while this function +// is executing. +func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + if missingEnable { + missingVal |= bit + } else { + missingVal &^= bit + } + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + if s[uintptr(num)] { + val |= bit + } else { + val &^= bit + } + atomic.StoreUint32(&e.enable[num], val) + } +} + +// EnableAll sets enable bit bit for all syscalls, present and missing. +func (e *SyscallFlagsTable) EnableAll(bit uint32) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + missingVal |= bit + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + val |= bit + atomic.StoreUint32(&e.enable[num], val) + } +} + +// Stracer traces syscall execution. +type Stracer interface { + // SyscallEnter is called on syscall entry. + // + // The returned private data is passed to SyscallExit. + // + // TODO: remove kernel imports from the strace package so + // that the type can be used directly. 
+ SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} + + // SyscallExit is called on syscall exit. + SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) +} + +// SyscallTable is a lookup table of system calls. Critically, a SyscallTable +// is *immutable*. In order to make supporting suspend and resume sane, they +// must be uniquely registered and may not change during operation. +type SyscallTable struct { + // OS is the operating system that this syscall table implements. + OS abi.OS `state:"wait"` + + // Arch is the architecture that this syscall table targets. + Arch arch.Arch `state:"wait"` + + // The OS version that this syscall table implements. + Version Version `state:"manual"` + + // AuditNumber is a numeric constant that represents the syscall table. If + // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by + // linux/audit.h. + AuditNumber uint32 `state:"manual"` + + // Table is the collection of functions. + Table map[uintptr]SyscallFn `state:"manual"` + + // lookup is a fixed-size array that holds the syscalls (indexed by + // their numbers). It is used for fast look ups. + lookup []SyscallFn `state:"manual"` + + // Emulate is a collection of instruction addresses to emulate. The + // keys are addresses, and the values are system call numbers. + Emulate map[usermem.Addr]uintptr `state:"manual"` + + // The function to call in case of a missing system call. + Missing MissingFn `state:"manual"` + + // Stracer traces this syscall table. + Stracer Stracer `state:"manual"` + + // External is used to handle an external callback. + External func(*Kernel) `state:"manual"` + + // ExternalFilterBefore is called before External is called before the syscall is executed. + // External is not called if it returns false. 
+	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+	// ExternalFilterAfter is called before External is called after the syscall is executed.
+	// External is not called if it returns false.
+	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+	// FeatureEnable stores the strace and one-shot enable bits.
+	FeatureEnable SyscallFlagsTable `state:"manual"`
+}
+
+// allSyscallTables contains all known tables.
+var allSyscallTables []*SyscallTable
+
+// SyscallTables returns a read-only slice of registered SyscallTables.
+func SyscallTables() []*SyscallTable {
+	return allSyscallTables
+}
+
+// LookupSyscallTable returns the SyscallTable for the OS/Arch combination.
+func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
+	for _, s := range allSyscallTables {
+		if s.OS == os && s.Arch == a {
+			return s, true
+		}
+	}
+	return nil, false
+}
+
+// RegisterSyscallTable registers a new syscall table for use by a Kernel.
+func RegisterSyscallTable(s *SyscallTable) {
+	if s.Table == nil {
+		// Ensure non-nil lookup table.
+		s.Table = make(map[uintptr]SyscallFn)
+	}
+	if s.Emulate == nil {
+		// Ensure non-nil emulate table.
+		s.Emulate = make(map[usermem.Addr]uintptr)
+	}
+
+	var max uintptr
+	for num := range s.Table {
+		if num > max {
+			max = num
+		}
+	}
+
+	if max > maxSyscallNum {
+		panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
+	}
+
+	s.lookup = make([]SyscallFn, max+1)
+
+	// Initialize the fast-lookup table.
+	for num, fn := range s.Table {
+		s.lookup[num] = fn
+	}
+
+	s.FeatureEnable.init(s.Table, max)
+
+	if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
+		panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
+	}
+
+	// Save a reference to this table.
+	//
+	// This is required for a Kernel to find the table and for save/restore
+	// operations below.
+	allSyscallTables = append(allSyscallTables, s)
+}
+
+// Lookup returns the syscall implementation, if one exists.
+func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
+	if sysno < uintptr(len(s.lookup)) {
+		return s.lookup[sysno]
+	}
+
+	return nil
+}
+
+// LookupEmulate looks up an emulation syscall number.
+func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
+	sysno, ok := s.Emulate[addr]
+	return sysno, ok
+}
+
+// mapLookup is similar to Lookup, except that it only uses the syscall table,
+// that is, it skips the fast lookup array. This is available for benchmarking.
+func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn {
+	return s.Table[sysno]
+}
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
new file mode 100644
index 000000000..826809a70
--- /dev/null
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -0,0 +1,29 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "fmt"
+
+// afterLoad is invoked by stateify.
+func (s *SyscallTable) afterLoad() {
+	otherTable, ok := LookupSyscallTable(s.OS, s.Arch)
+	if !ok {
+		// Couldn't find a reference?
+		panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch))
+	}
+
+	// Copy the table.
+ *s = *otherTable +} diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go new file mode 100644 index 000000000..31541749e --- /dev/null +++ b/pkg/sentry/kernel/syslog.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "math/rand" + "sync" +) + +// syslog represents a sentry-global kernel log. +// +// Currently, it contains only fun messages for a dmesg easter egg. +type syslog struct { + // mu protects the below. + mu sync.Mutex `state:"nosave"` + + // msg is the syslog message buffer. It is lazily initialized. + msg []byte +} + +// Log returns a copy of the syslog. +func (s *syslog) Log() []byte { + s.mu.Lock() + defer s.mu.Unlock() + + if s.msg != nil { + // Already initialized, just return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o + } + + // Not initialized, create message. 
+ allMessages := []string{ + "Synthesizing system calls...", + "Mounting deweydecimalfs...", + "Moving files to filing cabinet...", + "Digging up root...", + "Constructing home...", + "Segmenting fault lines...", + "Creating bureaucratic processes...", + "Searching for needles in stacks...", + "Preparing for the zombie uprising...", + "Feeding the init monster...", + "Creating cloned children...", + "Daemonizing children...", + "Waiting for children...", + "Gathering forks...", + "Committing treasure map to memory...", + "Reading process obituaries...", + "Searching for socket adapter...", + "Creating process schedule...", + "Generating random numbers by fair dice roll...", + "Rewriting operating system in Javascript...", + "Consulting tar man page...", + "Forking spaghetti code...", + "Checking naughty and nice process list...", + "Checking naughty and nice process list...", // Check it up to twice. + "Granting licence to kill(2)...", // British spelling for British movie. + "Letting the watchdogs out...", + } + + selectMessage := func() string { + i := rand.Intn(len(allMessages)) + m := allMessages[i] + + // Delete the selected message. + allMessages[i] = allMessages[len(allMessages)-1] + allMessages = allMessages[:len(allMessages)-1] + + return m + } + + time := 0.0 + for i := 0; i < 10; i++ { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf("<6>[%11.6f] %s\n", time, selectMessage()))...) + } + + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf("<6>[%11.6f] Ready!\n", time))...) + + // Return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o +} diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go new file mode 100644 index 000000000..71ca75555 --- /dev/null +++ b/pkg/sentry/kernel/table_test.go @@ -0,0 +1,108 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + maxTestSyscall = 1000 +) + +func createSyscallTable() *SyscallTable { + m := make(map[uintptr]SyscallFn) + for i := uintptr(0); i <= maxTestSyscall; i++ { + j := i + m[i] = func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { + return j, nil, nil + } + } + + s := &SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Table: m, + } + + RegisterSyscallTable(s) + return s +} + +func TestTable(t *testing.T) { + table := createSyscallTable() + defer func() { + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} + }() + + // Go through all functions and check that they return the right value. + for i := uintptr(0); i < maxTestSyscall; i++ { + fn := table.Lookup(i) + if fn == nil { + t.Errorf("Syscall %v is set to nil", i) + continue + } + + v, _, _ := fn(nil, arch.SyscallArguments{}) + if v != i { + t.Errorf("Wrong return value for syscall %v: expected %v, got %v", i, i, v) + } + } + + // Check that values outside the range return nil. 
+ for i := uintptr(maxTestSyscall + 1); i < maxTestSyscall+100; i++ { + fn := table.Lookup(i) + if fn != nil { + t.Errorf("Syscall %v is not nil: %v", i, fn) + continue + } + } +} + +func BenchmarkTableLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.Lookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} + +func BenchmarkTableMapLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.mapLookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go new file mode 100644 index 000000000..3d2e035e9 --- /dev/null +++ b/pkg/sentry/kernel/task.go @@ -0,0 +1,606 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kernel + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + ssync "gvisor.googlesource.com/gvisor/pkg/sync" +) + +// Task represents a thread of execution in the untrusted app. It +// includes registers and any thread-specific state that you would +// normally expect. +// +// Each task is associated with a goroutine, called the task goroutine, that +// executes code (application code, system calls, etc.) on behalf of that task. +// See Task.run (task_run.go). +// +// All fields that are "owned by the task goroutine" can only be mutated by the +// task goroutine while it is running. The task goroutine does not require +// synchronization to read these fields, although it still requires +// synchronization as described for those fields to mutate them. +// +// All fields that are "exclusive to the task goroutine" can only be accessed +// by the task goroutine while it is running. The task goroutine does not +// require synchronization to read or write these fields. +type Task struct { + taskNode + + // runState is what the task goroutine is executing if it is not stopped. + // If runState is nil, the task goroutine should exit or has exited. + // runState is exclusive to the task goroutine. 
+ runState taskRunState + + // haveSyscallReturn is true if tc.Arch().Return() represents a value + // returned by a syscall (or set by ptrace after a syscall). + // + // haveSyscallReturn is exclusive to the task goroutine. + haveSyscallReturn bool + + // interruptChan is notified whenever the task goroutine is interrupted + // (usually by a pending signal). interruptChan is effectively a condition + // variable that can be used in select statements. + // + // interruptChan is not saved; because saving interrupts all tasks, + // interruptChan is always notified after restore (see Task.run). + interruptChan chan struct{} `state:"nosave"` + + // gosched contains the current scheduling state of the task goroutine. + // + // gosched is protected by goschedSeq. gosched is owned by the task + // goroutine. + goschedSeq ssync.SeqCount `state:"nosave"` + gosched TaskGoroutineSchedInfo + + // yieldCount is the number of times the task goroutine has called + // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or + // Task.Yield(), voluntarily ceasing execution. + // + // yieldCount is accessed using atomic memory operations. yieldCount is + // owned by the task goroutine. + yieldCount uint64 + + // pendingSignals is the set of pending signals that may be handled only by + // this task. + // + // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu + // (hereafter "the signal mutex"); see comment on + // ThreadGroup.signalHandlers. + pendingSignals pendingSignals + + // If haveSavedSignalMask is true, savedSignalMask is the signal mask that + // should be applied after the task has either delivered one signal to a + // user handler or is about to resume execution in the untrusted + // application. + // + // Both haveSavedSignalMask and savedSignalMask are exclusive to the task + // goroutine. 
+ haveSavedSignalMask bool + savedSignalMask linux.SignalSet + + // signalStack is the alternate signal stack used by signal handlers for + // which the SA_ONSTACK flag is set. + // + // signalStack is exclusive to the task goroutine. + signalStack arch.SignalStack + + // If groupStopRequired is true, the task should enter a group stop in the + // interrupt path. groupStopRequired is not redundant with + // tg.groupStopPhase != groupStopNone, because ptrace allows tracers to + // resume individual tasks from a group stop without ending the group stop + // as a whole. + // + // groupStopRequired is analogous to JOBCTL_TRAP_STOP in Linux, except that + // Linux only uses that flag for ptraced tasks. + // + // groupStopRequired is protected by the signal mutex. + groupStopRequired bool + + // If groupStopAcknowledged is true, the task has already acknowledged that + // it is entering the most recent group stop that has been initiated on its + // thread group. groupStopAcknowledged is only meaningful if + // tg.groupStopPhase == groupStopInitiated. + // + // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. + // + // groupStopAcknowledged is protected by the signal mutex. + groupStopAcknowledged bool + + // If stop is not nil, it is the internally-initiated condition that + // currently prevents the task goroutine from running. + // + // stop is protected by the signal mutex. + stop TaskStop + + // stopCount is the number of active external stops (calls to + // Task.BeginExternalStop that have not been paired with a call to + // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is + // non-zero if the task goroutine should stop. + // + // Mutating stopCount requires both locking the signal mutex and using + // atomic memory operations. Reading stopCount requires either locking the + // signal mutex or using atomic memory operations. 
This allows Task.doStop + // to require only a single atomic read in the common case where stopCount + // is 0. + // + // stopCount is not saved, because external stops cannot be retained across + // a save/restore cycle. (Suppose a sentryctl command issues an external + // stop; after a save/restore cycle, the restored sentry has no knowledge + // of the pre-save sentryctl command, and the stopped task would remain + // stopped forever.) + stopCount int32 `state:"nosave"` + + // endStopCond is signaled when stopCount transitions to 0. The combination + // of stopCount and endStopCond effectively form a sync.WaitGroup, but + // WaitGroup provides no way to read its counter value. + // + // Invariant: endStopCond.L is the signal mutex. (This is not racy because + // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine + // calls sync.Cond.Wait; and only the task goroutine can change the + // identity of the signal mutex, in Task.finishExec.) + endStopCond sync.Cond `state:"nosave"` + + // exitStatus is the task's exit status. + // + // exitStatus is protected by the signal mutex. + exitStatus ExitStatus + + // syscallRestartBlock represents a custom restart function to run in + // restart_syscall(2) to resume an interrupted syscall. + // + // syscallRestartBlock is exclusive to the task goroutine. + syscallRestartBlock SyscallRestartBlock + + // mu protects some of the following fields. + mu sync.Mutex `state:"nosave"` + + // tc and tr form the majority of the task's data. + // + // tc and tr are protected by mu. tc and tr are owned by the task + // goroutine. tr.signalMask is protected by the signal mutex and must be + // written using atomic memory operations (such that reading tr.signalMask + // is safe if the signal mutex is locked or if atomic memory operations are + // used), but is also owned by the task goroutine. + tc TaskContext + tr TaskResources + + // p provides the mechanism by which the task runs code in userspace. 
The p + // interface object is immutable. + p platform.Context `state:"nosave"` + + // k is the Kernel that this task belongs to. The k pointer is immutable. + k *Kernel + + // If vforkParent is not nil, it is the task that created this task with + // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when + // this TaskContext is released. + // + // vforkParent is protected by the TaskSet mutex. + vforkParent *Task + + // exitState is the task's progress through the exit path. + // + // exitState is protected by the TaskSet mutex. exitState is owned by the + // task goroutine. + exitState TaskExitState + + // exitTracerNotified is true if the exit path has either signaled the + // task's tracer to indicate the exit, or determined that no such signal is + // needed. exitTracerNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitTracerNotified is protected by the TaskSet mutex. + exitTracerNotified bool + + // exitTracerAcked is true if exitTracerNotified is true and either the + // task's tracer has acknowledged the exit notification, or the exit path + // has determined that no such notification is needed. + // + // exitTracerAcked is protected by the TaskSet mutex. + exitTracerAcked bool + + // exitParentNotified is true if the exit path has either signaled the + // task's parent to indicate the exit, or determined that no such signal is + // needed. exitParentNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitParentNotified is protected by the TaskSet mutex. + exitParentNotified bool + + // exitParentAcked is true if exitParentNotified is true and either the + // task's parent has acknowledged the exit notification, or the exit path + // has determined that no such acknowledgment is needed. + // + // exitParentAcked is protected by the TaskSet mutex. 
+ exitParentAcked bool + + // goroutineStopped is a WaitGroup whose counter value is 1 when the task + // goroutine is running and 0 when the task goroutine is stopped or has + // exited. + goroutineStopped sync.WaitGroup `state:"nosave"` + + // ptraceTracer is the task that is ptrace-attached to this one. If + // ptraceTracer is nil, this task is not being traced. Note that due to + // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil + // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)). + // + // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic + // operations. This allows paths that wouldn't otherwise lock the TaskSet + // mutex, notably the syscall path, to check if ptraceTracer is nil without + // additional synchronization. + ptraceTracer atomic.Value `state:".(*Task)"` + + // ptraceTracees is the set of tasks that this task is ptrace-attached to. + // + // ptraceTracees is protected by the TaskSet mutex. + ptraceTracees map[*Task]struct{} + + // ptraceOpts contains ptrace options explicitly set by the tracer. If + // ptraceTracer is nil, ptraceOpts is expected to be the zero value. + // + // ptraceOpts is protected by the TaskSet mutex. + ptraceOpts ptraceOptions + + // ptraceSyscallMode controls ptrace behavior around syscall entry and + // exit. + // + // ptraceSyscallMode is protected by the TaskSet mutex. + ptraceSyscallMode ptraceSyscallMode + + // If ptraceSinglestep is true, the next time the task executes application + // code, single-stepping should be enabled. ptraceSinglestep is stored + // independently of the architecture-specific trap flag because tracer + // detaching (which can happen concurrently with the tracee's execution if + // the tracer exits) must disable single-stepping, and the task's + // architectural state is implicitly exclusive to the task goroutine (no + // synchronization occurs before passing registers to SwitchToApp). 
+ // + // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. + // + // ptraceSinglestep is protected by the TaskSet mutex. + ptraceSinglestep bool + + // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the + // time that t entered the ptrace stop, reset to 0 when the tracer + // acknowledges the stop with a wait*() syscall. Otherwise, it is the + // signal number passed to the ptrace operation that ended the last ptrace + // stop on this task. In the latter case, the effect of ptraceCode depends + // on the nature of the ptrace stop; signal-delivery-stop uses it to + // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the + // signal to the task after leaving the stop, and PTRACE_EVENT stops and + // traced group stops ignore it entirely. + // + // Linux contextually stores the equivalent of ptraceCode in + // task_struct::exit_code. + // + // ptraceCode is protected by the TaskSet mutex. + ptraceCode int32 + + // ptraceSiginfo is the value returned to the tracer by + // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). + // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) + // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is + // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which + // is in turn required to distinguish group stops from other ptrace stops, + // per subsection "Group-stop" in ptrace(2)). + // + // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. + // + // ptraceSiginfo is protected by the TaskSet mutex. + ptraceSiginfo *arch.SignalInfo + + // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to + // the tracer by ptrace(PTRACE_GETEVENTMSG). + // + // ptraceEventMsg is protected by the TaskSet mutex. + ptraceEventMsg uint64 + + // The struct that holds the IO-related usage. The ioUsage pointer is + // immutable. 
+ ioUsage *usage.IO + + // logPrefix is a string containing the task's thread ID in the root PID + // namespace, and is prepended to log messages emitted by Task.Infof etc. + logPrefix atomic.Value `state:".(string)"` + + // creds is the task's credentials. + // + // creds is protected by mu. + creds *auth.Credentials + + // utsns is the task's UTS namespace. + // + // utsns is protected by mu. + utsns *UTSNamespace + + // ipcns is the task's IPC namespace. + // + // ipcns is protected by mu. + ipcns *IPCNamespace + + // parentDeathSignal is sent to this task's thread group when its parent exits. + // + // parentDeathSignal is protected by mu. + parentDeathSignal linux.Signal + + // syscallFilters is all seccomp-bpf syscall filters applicable to the + // task, in the order in which they were installed. + // + // syscallFilters is protected by mu. syscallFilters is owned by the task + // goroutine. + syscallFilters []bpf.Program + + // If cleartid is non-zero, treat it as a pointer to a ThreadID in the + // task's virtual address space; when the task exits, set the pointed-to + // ThreadID to 0, and wake any futex waiters. + // + // cleartid is exclusive to the task goroutine. + cleartid usermem.Addr + + // This is mostly a fake cpumask just for sched_set/getaffinity as we + // don't really control the affinity. + // + // Invariant: allowedCPUMask.Size() == + // sched.CPUMaskSize(Kernel.applicationCores). + // + // allowedCPUMask is protected by mu. + allowedCPUMask sched.CPUSet + + // cpu is the fake cpu number returned by getcpu(2). cpu is ignored + // entirely if Kernel.useHostCores is true. + // + // cpu is accessed using atomic memory operations. + cpu int32 + + // This is used to keep track of changes made to a process' priority/niceness. + // It is mostly used to provide some reasonable return value from + // getpriority(2) after a call to setpriority(2) has been made. + // We currently do not actually modify a process' scheduling priority. 
+ // NOTE: This represents the userspace view of priority (nice). + // This means that the value should be in the range [-20, 19]. + // + // niceness is protected by mu. + niceness int + + // This is used to track the numa policy for the current thread. This can be + // modified through a set_mempolicy(2) syscall. Since we always report a + // single numa node, all policies are no-ops. We only track this information + // so that we can return reasonable values if the application calls + // get_mempolicy(2) after setting a non-default policy. Note that in the + // real syscall, nodemask can be longer than 4 bytes, but we always report a + // single node so never need to save more than a single bit. + // + // numaPolicy and numaNodeMask are protected by mu. + numaPolicy int32 + numaNodeMask uint32 + + // If netns is true, the task is in a non-root network namespace. Network + // namespaces aren't currently implemented in full; being in a network + // namespace simply prevents the task from observing any network devices + // (including loopback) or using abstract socket addresses (see unix(7)). + // + // netns is protected by mu. netns is owned by the task goroutine. + netns bool + + // If rseqPreempted is true, before the next call to p.Switch(), interrupt + // RSEQ critical regions as defined by tg.rseq and write the task + // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number + // written to rseqCPUAddr. + // + // If rseqCPUAddr is 0, rseqCPU is -1. + // + // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task + // goroutine. + rseqPreempted bool `state:"nosave"` + rseqCPUAddr usermem.Addr + rseqCPU int32 + + // copyScratchBuffer is a buffer available to CopyIn/CopyOut + // implementations that require an intermediate buffer to copy data + // into/out of. It prevents these buffers from being allocated/zeroed in + // each syscall and eventually garbage collected. + // + // copyScratchBuffer is exclusive to the task goroutine. 
+ copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` + + // blockingTimer is used for blocking timeouts. blockingTimerChan is the + // channel that is sent to when blockingTimer fires. + // + // blockingTimer is exclusive to the task goroutine. + blockingTimer *ktime.Timer `state:"nosave"` + blockingTimerChan <-chan struct{} `state:"nosave"` + + // futexWaiter is used for futex(FUTEX_WAIT) syscalls. + // + // futexWaiter is exclusive to the task goroutine. + futexWaiter *futex.Waiter `state:"nosave"` + + // startTime is the real time at which the task started. It is set when + // a Task is created or invokes execve(2). + // + // startTime is protected by mu. + startTime ktime.Time +} + +func (t *Task) savePtraceTracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +func (t *Task) loadPtraceTracer(tracer *Task) { + t.ptraceTracer.Store(tracer) +} + +func (t *Task) saveLogPrefix() string { + return t.logPrefix.Load().(string) +} + +func (t *Task) loadLogPrefix(prefix string) { + t.logPrefix.Store(prefix) +} + +// afterLoad is invoked by stateify. +func (t *Task) afterLoad() { + t.interruptChan = make(chan struct{}, 1) + t.gosched.State = TaskGoroutineNonexistent + if t.stop != nil { + t.stopCount = 1 + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.p = t.k.Platform.NewContext() + t.rseqPreempted = true + t.futexWaiter = futex.NewWaiter() +} + +// copyScratchBufferLen is the length of the copyScratchBuffer field of the Task +// struct. +const copyScratchBufferLen = 52 + +// TaskMaybe is the interface for extracting Tasks out of things which may be +// or contain Task objects. +type TaskMaybe interface { + // ExtractTask returns the Task. + ExtractTask() *Task +} + +// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut +// functions. It must only be used within those functions and can only be used +// by the task goroutine; it exists to improve performance and thus +// intentionally lacks any synchronization. 
+// +// Callers should pass a constant value as an argument, which will allow the +// compiler to inline and optimize out the if statement below. +func (t *Task) CopyScratchBuffer(size int) []byte { + if size > copyScratchBufferLen { + return make([]byte, size) + } + return t.copyScratchBuffer[:size] +} + +// FutexWaiter returns the Task's futex.Waiter. +func (t *Task) FutexWaiter() *futex.Waiter { + return t.futexWaiter +} + +// ExtractTask implements TaskMaybe.ExtractTask. +func (t *Task) ExtractTask() *Task { + return t +} + +// TaskContext returns t's TaskContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskContext() *TaskContext { + return &t.tc +} + +// TaskResources returns t's TaskResources. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskResources() *TaskResources { + return &t.tr +} + +// WithMuLocked executes f with t.mu locked. +func (t *Task) WithMuLocked(f func(*Task)) { + t.mu.Lock() + defer t.mu.Unlock() + f(t) +} + +// Kernel returns the Kernel containing t. +func (t *Task) Kernel() *Kernel { + return t.k +} + +// Value implements context.Context.Value. +func (t *Task) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + return t.utsns + case CtxIPCNamespace: + return t.ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.creds + case fs.CtxRoot: + return t.FSContext().RootDirectory() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + default: + return nil + } +} + +// SetClearTID sets t's cleartid. 
+// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetClearTID(addr usermem.Addr) { + t.cleartid = addr +} + +// SetSyscallRestartBlock sets the restart block for use in +// restart_syscall(2). After registering a restart block, a syscall should +// return ERESTART_RESTARTBLOCK to request a restart using the block. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { + t.syscallRestartBlock = r +} + +// SyscallRestartBlock returns the currently registered restart block for use in +// restart_syscall(2). This function is *not* idempotent and may be called once +// per syscall. This function must not be called if a restart block has not been +// registered for the current syscall. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SyscallRestartBlock() SyscallRestartBlock { + r := t.syscallRestartBlock + // Explicitly set the restart block to nil so that a future syscall can't + // accidentally reuse it. + t.syscallRestartBlock = nil + return r +} diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go new file mode 100644 index 000000000..ce12cdb64 --- /dev/null +++ b/pkg/sentry/kernel/task_acct.go @@ -0,0 +1,111 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// Accounting, limits, timers. 
+ +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// IOUsage returns the io usage of the thread. +func (t *Task) IOUsage() *usage.IO { + return t.ioUsage +} + +// IOUsage returns the total io usage of all dead and live threads in the group. +func (tg *ThreadGroup) IOUsage() *usage.IO { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + io := *tg.ioUsage + // Account for active tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + io.Accumulate(t.IOUsage()) + } + return &io +} + +// Name returns t's name. +func (t *Task) Name() string { + t.mu.Lock() + defer t.mu.Unlock() + return t.tc.Name +} + +// SetName changes t's name. +func (t *Task) SetName(name string) { + t.mu.Lock() + defer t.mu.Unlock() + t.tc.Name = name + t.Debugf("Set thread name to %q", name) +} + +// SetCPUTimer is used by setrlimit(RLIMIT_CPU) to enforce the hard and soft +// limits on CPU time used by this process. +func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit) { + tg.Timer().applyCPULimits(*l) +} + +// Limits implements context.Context.Limits. +func (t *Task) Limits() *limits.LimitSet { + return t.ThreadGroup().Limits() +} + +// StartTime returns t's start time. +func (t *Task) StartTime() ktime.Time { + t.mu.Lock() + defer t.mu.Unlock() + return t.startTime +} + +// MaxRSS returns the maximum resident set size of the task in bytes. which +// should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or +// RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these +// flags. +func (t *Task) MaxRSS(which int32) uint64 { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + + switch which { + case linux.RUSAGE_SELF, linux.RUSAGE_THREAD: + // If there's an active mm we can use its value. 
+ if mm := t.MemoryManager(); mm != nil { + if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS { + return mmMaxRSS + } + } + return t.tg.maxRSS + case linux.RUSAGE_CHILDREN: + return t.tg.childMaxRSS + case linux.RUSAGE_BOTH: + maxRSS := t.tg.maxRSS + if maxRSS < t.tg.childMaxRSS { + maxRSS = t.tg.childMaxRSS + } + if mm := t.MemoryManager(); mm != nil { + if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS { + return mmMaxRSS + } + } + return maxRSS + default: + // We'll only get here if which is invalid. + return 0 + } +} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go new file mode 100644 index 000000000..9fd24f134 --- /dev/null +++ b/pkg/sentry/kernel/task_block.go @@ -0,0 +1,207 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// BlockWithTimeout blocks t until an event is received from C, the application +// monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), +// or t is interrupted. It returns: +// +// - The remaining timeout, which is guaranteed to be 0 if the timeout expired, +// and is unspecified if haveTimeout is false. +// +// - An error which is nil if an event is received from C, ETIMEDOUT if the timeout +// expired, and syserror.ErrInterrupted if t is interrupted. 
+func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { + if !haveTimeout { + return timeout, t.block(C, nil) + } + + start := t.Kernel().MonotonicClock().Now() + deadline := start.Add(timeout) + err := t.BlockWithDeadline(C, true, deadline) + + // Timeout, explicitly return a remaining duration of 0. + if err == syserror.ETIMEDOUT { + return 0, err + } + + // Compute the remaining timeout. Note that even if block() above didn't + // return due to a timeout, we may have used up any of the remaining time + // since then. We cap the remaining timeout to 0 to make it easier to + // directly use the returned duration. + end := t.Kernel().MonotonicClock().Now() + remainingTimeout := timeout - end.Sub(start) + if remainingTimeout < 0 { + remainingTimeout = 0 + } + + return remainingTimeout, err +} + +// BlockWithDeadline blocks t until an event is received from C, the +// application monotonic clock indicates a time of deadline (only if +// haveDeadline is true), or t is interrupted. It returns nil if an event is +// received from C, ETIMEDOUT if the deadline expired, and +// syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error { + if !haveDeadline { + return t.block(C, nil) + } + + // Start the timeout timer. + t.blockingTimer.Swap(ktime.Setting{ + Enabled: true, + Next: deadline, + }) + + err := t.block(C, t.blockingTimerChan) + + // Stop the timeout timer and drain the channel. + t.blockingTimer.Swap(ktime.Setting{}) + select { + case <-t.blockingTimerChan: + default: + } + + return err +} + +// BlockWithTimer blocks t until an event is received from C or tchan, or t is +// interrupted. It returns nil if an event is received from C, ETIMEDOUT if an +// event is received from tchan, and syserror.ErrInterrupted if t is +// interrupted. 
+// +// Most clients should use BlockWithDeadline or BlockWithTimeout instead. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) BlockWithTimer(C chan struct{}, tchan <-chan struct{}) error { + return t.block(C, tchan) +} + +// Block blocks t until an event is received from C or t is interrupted. It +// returns nil if an event is received from C and syserror.ErrInterrupted if t +// is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Block(C chan struct{}) error { + return t.block(C, nil) +} + +// block blocks a task on one of many events. +// N.B. defer is too expensive to be used here. +func (t *Task) block(C chan struct{}, timerChan <-chan struct{}) error { + // Fast path if the request is already done. + select { + case <-C: + return nil + default: + } + + // Deactive our address space, we don't need it. + interrupt := t.SleepStart() + + select { + case <-C: + t.SleepFinish(true) + return nil + + case <-interrupt: + t.SleepFinish(false) + // Return the indicated error on interrupt. + return syserror.ErrInterrupted + + case <-timerChan: + // We've timed out. + t.SleepFinish(true) + return syserror.ETIMEDOUT + } +} + +// SleepStart implements amutex.Sleeper.SleepStart. +func (t *Task) SleepStart() <-chan struct{} { + t.Deactivate() + t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible) + return t.interruptChan +} + +// SleepFinish implements amutex.Sleeper.SleepFinish. +func (t *Task) SleepFinish(success bool) { + if !success { + // The interrupted notification is consumed only at the top-level + // (Run). Therefore we attempt to reset the pending notification. + // This will also elide our next entry back into the task, so we + // will process signals, state changes, etc. + t.interruptSelf() + } + t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible) + t.Activate() +} + +// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart. 
+func (t *Task) UninterruptibleSleepStart(deactivate bool) { + if deactivate { + t.Deactivate() + } + t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible) +} + +// UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish. +func (t *Task) UninterruptibleSleepFinish(activate bool) { + t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible) + if activate { + t.Activate() + } +} + +// interrupted returns true if interrupt or interruptSelf has been called at +// least once since the last call to interrupted. +func (t *Task) interrupted() bool { + select { + case <-t.interruptChan: + return true + default: + return false + } +} + +// interrupt unblocks the task and interrupts it if it's currently running in +// userspace. +func (t *Task) interrupt() { + t.interruptSelf() + t.p.Interrupt() +} + +// interruptSelf is like Interrupt, but can only be called by the task +// goroutine. +func (t *Task) interruptSelf() { + select { + case t.interruptChan <- struct{}{}: + t.Debugf("Interrupt queued") + default: + t.Debugf("Dropping duplicate interrupt") + } + // platform.Context.Interrupt() is unnecessary since a task goroutine + // calling interruptSelf() cannot also be blocked in + // platform.Context.Switch(). +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go new file mode 100644 index 000000000..3a74abdfb --- /dev/null +++ b/pkg/sentry/kernel/task_clone.go @@ -0,0 +1,475 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SharingOptions controls what resources are shared by a new task created by +// Task.Clone, or an existing task affected by Task.Unshare. +type SharingOptions struct { + // If NewAddressSpace is true, the task should have an independent virtual + // address space. + NewAddressSpace bool + + // If NewSignalHandlers is true, the task should use an independent set of + // signal handlers. + NewSignalHandlers bool + + // If NewThreadGroup is true, the task should be the leader of its own + // thread group. TerminationSignal is the signal that the thread group + // will send to its parent when it exits. If NewThreadGroup is false, + // TerminationSignal is ignored. + NewThreadGroup bool + TerminationSignal linux.Signal + + // If NewPIDNamespace is true: + // + // - In the context of Task.Clone, the new task should be the init task + // (TID 1) in a new PID namespace. + // + // - In the context of Task.Unshare, the task should create a new PID + // namespace, and all subsequent clones of the task should be members of + // the new PID namespace. + NewPIDNamespace bool + + // If NewUserNamespace is true, the task should have an independent user + // namespace. + NewUserNamespace bool + + // If NewNetworkNamespace is true, the task should have an independent + // network namespace. (Note that network namespaces are not really + // implemented; see comment on Task.netns for details.) + NewNetworkNamespace bool + + // If NewFiles is true, the task should use an independent file descriptor + // table. 
+ NewFiles bool + + // If NewFSContext is true, the task should have an independent FSContext. + NewFSContext bool + + // If NewUTSNamespace is true, the task should have an independent UTS + // namespace. + NewUTSNamespace bool + + // If NewIPCNamespace is true, the task should have an independent IPC + // namespace. + NewIPCNamespace bool +} + +// CloneOptions controls the behavior of Task.Clone. +type CloneOptions struct { + // SharingOptions defines the set of resources that the new task will share + // with its parent. + SharingOptions + + // Stack is the initial stack pointer of the new task. If Stack is 0, the + // new task will start with the same stack pointer as its parent. + Stack usermem.Addr + + // If SetTLS is true, set the new task's TLS (thread-local storage) + // descriptor to TLS. If SetTLS is false, TLS is ignored. + SetTLS bool + TLS usermem.Addr + + // If ChildClearTID is true, when the child exits, 0 is written to the + // address ChildTID in the child's memory, and if the write is successful a + // futex wake on the same address is performed. + // + // If ChildSetTID is true, the child's thread ID (in the child's PID + // namespace) is written to address ChildTID in the child's memory. (As in + // Linux, failed writes are silently ignored.) + ChildClearTID bool + ChildSetTID bool + ChildTID usermem.Addr + + // If ParentSetTID is true, the child's thread ID (in the parent's PID + // namespace) is written to address ParentTID in the parent's memory. (As + // in Linux, failed writes are silently ignored.) + // + // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID + // causes the child's thread ID to be written to ptid in both the parent + // and child's memory, but this is a documentation error fixed by + // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). + ParentSetTID bool + ParentTID usermem.Addr + + // If Vfork is true, place the parent in vforkStop until the cloned task + // releases its TaskContext. 
+ Vfork bool + + // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for + // this clone(), and do not ptrace-attach the caller's tracer to the new + // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). + Untraced bool + + // If InheritTracer is true, ptrace-attach the caller's tracer to the new + // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported + // for it. If both Untraced and InheritTracer are true, no event will be + // reported, but tracer inheritance will still occur. + InheritTracer bool +} + +// Clone implements the clone(2) syscall and returns the thread ID of the new +// task in t's PID namespace. Clone may return both a non-zero thread ID and a +// non-nil error. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { + // Since signal actions may refer to application signal handlers by virtual + // address, any set of signal handlers must refer to the same address + // space. + if !opts.NewSignalHandlers && opts.NewAddressSpace { + return 0, nil, syserror.EINVAL + } + // In order for the behavior of thread-group-directed signals to be sane, + // all tasks in a thread group must share signal handlers. + if !opts.NewThreadGroup && opts.NewSignalHandlers { + return 0, nil, syserror.EINVAL + } + // All tasks in a thread group must be in the same PID namespace. + if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { + return 0, nil, syserror.EINVAL + } + // The two different ways of specifying a new PID namespace are + // incompatible. + if opts.NewPIDNamespace && t.childPIDNamespace != nil { + return 0, nil, syserror.EINVAL + } + // Thread groups and FS contexts cannot span user namespaces. 
+ if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { + return 0, nil, syserror.EINVAL + } + + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a + // single clone(2) or unshare(2) call, the user namespace is guaranteed to + // be created first, giving the child (clone(2)) or caller (unshare(2)) + // privileges over the remaining namespaces created by the call." - + // user_namespaces(7) + creds := t.Credentials() + var userns *auth.UserNamespace + if opts.NewUserNamespace { + var err error + // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and + // the caller is in a chroot environment (i.e., the caller's root + // directory does not match the root directory of the mount namespace + // in which it resides)." - clone(2). Neither chroot(2) nor + // user_namespaces(7) document this. + if t.IsChrooted() { + return 0, nil, syserror.EPERM + } + userns, err = creds.NewChildUserNamespace() + if err != nil { + return 0, nil, err + } + } + if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapability(linux.CAP_SYS_ADMIN) { + return 0, nil, syserror.EPERM + } + + utsns := t.UTSNamespace() + if opts.NewUTSNamespace { + // Note that this must happen after NewUserNamespace so we get + // the new userns if there is one. + utsns = t.UTSNamespace().Clone(userns) + } + + ipcns := t.IPCNamespace() + if opts.NewIPCNamespace { + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + ipcns = NewIPCNamespace() + } + + tc, err := t.tc.Fork(t, !opts.NewAddressSpace) + if err != nil { + return 0, nil, err + } + // clone() returns 0 in the child. 
+ tc.Arch.SetReturn(0) + if opts.Stack != 0 { + tc.Arch.SetStack(uintptr(opts.Stack)) + } + if opts.SetTLS { + tc.Arch.StateData().Regs.Fs_base = uint64(opts.TLS) + } + + pidns := t.tg.pidns + if t.childPIDNamespace != nil { + pidns = t.childPIDNamespace + } else if opts.NewPIDNamespace { + pidns = pidns.NewChild(userns) + } + tg := t.tg + parent := t.parent + if opts.NewThreadGroup { + sh := t.tg.signalHandlers + if opts.NewSignalHandlers { + sh = sh.Fork() + } + tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + parent = t + } + cfg := &TaskConfig{ + Kernel: t.k, + Parent: parent, + ThreadGroup: tg, + TaskContext: tc, + TaskResources: t.tr.Fork(!opts.NewFiles, !opts.NewFSContext), + Niceness: t.Niceness(), + Credentials: creds.Fork(), + NetworkNamespaced: t.netns, + AllowedCPUMask: t.CPUMask(), + UTSNamespace: utsns, + IPCNamespace: ipcns, + } + if opts.NewNetworkNamespace { + cfg.NetworkNamespaced = true + } + nt, err := t.tg.pidns.owner.NewTask(cfg) + if err != nil { + if opts.NewThreadGroup { + tg.release() + } + return 0, nil, err + } + + // "A child process created via fork(2) inherits a copy of its parent's + // alternate signal stack settings" - sigaltstack(2). + // + // However kernel/fork.c:copy_process() adds a limitation to this: + // "sigaltstack should be cleared when sharing the same VM". + if opts.NewAddressSpace || opts.Vfork { + nt.SetSignalStack(t.SignalStack()) + } + + if userns != nil { + if err := nt.SetUserNamespace(userns); err != nil { + // This shouldn't be possible: userns was created from nt.creds, so + // nt should have CAP_SYS_ADMIN in userns. + panic("Task.Clone: SetUserNamespace failed: " + err.Error()) + } + } + + // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to + // nt that it must receive before its task goroutine starts running. 
+ tid := nt.k.tasks.Root.IDOfTask(nt) + defer nt.Start(tid) + + // "If fork/clone and execve are allowed by @prog, any child processes will + // be constrained to the same filters and system call ABI as the parent." - + // Documentation/prctl/seccomp_filter.txt + nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...) + if opts.Vfork { + nt.vforkParent = t + } + + if opts.ChildClearTID { + nt.SetClearTID(opts.ChildTID) + } + if opts.ChildSetTID { + // Can't use Task.CopyOut, which assumes AddressSpaceActive. + usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + } + ntid := t.tg.pidns.IDOfTask(nt) + if opts.ParentSetTID { + t.CopyOut(opts.ParentTID, ntid) + } + + kind := ptraceCloneKindClone + if opts.Vfork { + kind = ptraceCloneKindVfork + } else if opts.TerminationSignal == linux.SIGCHLD { + kind = ptraceCloneKindFork + } + if t.ptraceClone(kind, nt, opts) { + if opts.Vfork { + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil + } + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil + } + if opts.Vfork { + t.maybeBeginVforkStop(nt) + return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil + } + return ntid, nil, nil +} + +// maybeBeginVforkStop checks if a previously-started vfork child is still +// running and has not yet released its MM, such that its parent t should enter +// a vforkStop. +// +// Preconditions: The caller must be running on t's task goroutine. 
+func (t *Task) maybeBeginVforkStop(child *Task) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.killedLocked() { + child.vforkParent = nil + return + } + if child.vforkParent == t { + t.beginInternalStopLocked((*vforkStop)(nil)) + } +} + +func (t *Task) unstopVforkParent() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if p := t.vforkParent; p != nil { + p.tg.signalHandlers.mu.Lock() + defer p.tg.signalHandlers.mu.Unlock() + if _, ok := p.stop.(*vforkStop); ok { + p.endInternalStopLocked() + } + // Parent no longer needs to be unstopped. + t.vforkParent = nil + } +} + +type runSyscallAfterPtraceEventClone struct { + vforkChild *Task + + // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's + // PID namespace. vforkChildTID must be stored since the child may exit and + // release its TID before the PTRACE_EVENT stop ends. + vforkChildTID ThreadID +} + +func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { + if r.vforkChild != nil { + t.maybeBeginVforkStop(r.vforkChild) + return &runSyscallAfterVforkStop{r.vforkChildTID} + } + return (*runSyscallExit)(nil) +} + +type runSyscallAfterVforkStop struct { + // childTID has the same meaning as + // runSyscallAfterPtraceEventClone.vforkChildTID. + childTID ThreadID +} + +func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { + t.ptraceVforkDone(r.childTID) + return (*runSyscallExit)(nil) +} + +// Unshare changes the set of resources t shares with other tasks, as specified +// by opts. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Unshare(opts *SharingOptions) error { + // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and + // NewSignalHandlers implies NewAddressSpace. 
All three flags are no-ops if + // t is the only task using its MM, which due to clone(2)'s rules imply + // that it is also the only task using its signal handlers / in its thread + // group, and cause EINVAL to be returned otherwise. + // + // Since we don't count the number of tasks using each address space or set + // of signal handlers, we reject NewSignalHandlers and NewAddressSpace + // altogether, and interpret NewThreadGroup as requiring that t be the only + // member of its thread group. This seems to be logically coherent, in the + // sense that clone(2) allows a task to share signal handlers and address + // spaces with tasks in other thread groups. + if opts.NewAddressSpace || opts.NewSignalHandlers { + return syserror.EINVAL + } + if opts.NewThreadGroup { + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return syserror.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + // This isn't racy because we're the only living task, and therefore + // the only task capable of creating new ones, in our thread group. + } + if opts.NewUserNamespace { + if t.IsChrooted() { + return syserror.EPERM + } + // This temporary is needed because Go. + creds := t.Credentials() + newUserNS, err := creds.NewChildUserNamespace() + if err != nil { + return err + } + err = t.SetUserNamespace(newUserNS) + if err != nil { + return err + } + } + haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) + if opts.NewPIDNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) + } + t.mu.Lock() + defer t.mu.Unlock() + if opts.NewNetworkNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.netns = true + } + if opts.NewUTSNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + // Note that this must happen after NewUserNamespace, so the + // new user namespace is used if there is one. 
+ t.utsns = t.utsns.Clone(t.creds.UserNamespace) + } + if opts.NewIPCNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + t.ipcns = NewIPCNamespace() + } + if opts.NewFiles { + oldFDMap := t.tr.FDMap + t.tr.FDMap = oldFDMap.Fork() + oldFDMap.DecRef() + } + if opts.NewFSContext { + oldFS := t.tr.FSContext + t.tr.FSContext = oldFS.Fork() + oldFS.DecRef() + } + return nil +} + +// vforkStop is a TaskStop imposed on a task that creates a child with +// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its +// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so +// that the child and parent share mappings until the child execve()s into a +// new process image or exits.) +type vforkStop struct{} + +// StopIgnoresKill implements TaskStop.Killable. +func (*vforkStop) Killable() bool { return true } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go new file mode 100644 index 000000000..5c563ba08 --- /dev/null +++ b/pkg/sentry/kernel/task_context.go @@ -0,0 +1,179 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package kernel

import (
	"errors"
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/cpuid"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
	"gvisor.googlesource.com/gvisor/pkg/sentry/loader"
	"gvisor.googlesource.com/gvisor/pkg/sentry/mm"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// ErrNoSyscalls is returned if there is no syscall table.
var ErrNoSyscalls = errors.New("no syscall table found")

// Auxmap contains miscellaneous data for the task.
type Auxmap map[string]interface{}

// TaskContext is the subset of a task's data that is provided by the loader.
type TaskContext struct {
	// Name is the thread name set by the prctl(PR_SET_NAME) system call.
	Name string

	// Arch is the architecture-specific context (registers, etc.)
	Arch arch.Context

	// MemoryManager is the task's address space.
	MemoryManager *mm.MemoryManager

	// fu implements futexes in the address space.
	fu *futex.Manager

	// st is the task's syscall table.
	st *SyscallTable
}

// release releases all resources held by the TaskContext. release is called by
// the task when it execs into a new TaskContext or exits.
func (tc *TaskContext) release() {
	// Nil out pointers so that if the task is saved after release, it doesn't
	// follow the pointers to possibly now-invalid objects.
	if tc.MemoryManager != nil {
		// TODO: drop the user reference on the MM. NOTE(review):
		// context.Background() is used here rather than a task context;
		// presumably DecUsers cannot require task state — confirm.
		tc.MemoryManager.DecUsers(context.Background())
		tc.MemoryManager = nil
	}
	tc.fu = nil
}

// Fork returns a duplicate of tc. The copied TaskContext always has an
// independent arch.Context. If shareAddressSpace is true, the copied
// TaskContext shares an address space with the original; otherwise, the copied
// TaskContext has an independent address space that is initially a duplicate
// of the original's.
func (tc *TaskContext) Fork(ctx context.Context, shareAddressSpace bool) (*TaskContext, error) {
	newTC := &TaskContext{
		Arch: tc.Arch.Fork(),
		st:   tc.st,
	}
	if shareAddressSpace {
		newTC.MemoryManager = tc.MemoryManager
		if newTC.MemoryManager != nil {
			if !newTC.MemoryManager.IncUsers() {
				// Shouldn't be possible since tc.MemoryManager should be a
				// counted user.
				panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager"))
			}
		}
		// Tasks sharing the address space also share the futex manager.
		newTC.fu = tc.fu
	} else {
		newMM, err := tc.MemoryManager.Fork(ctx)
		if err != nil {
			return nil, err
		}
		newTC.MemoryManager = newMM
		// A private address space gets a fresh futex manager.
		// TODO: revisit when shmem is supported.
		newTC.fu = futex.NewManager()
	}
	return newTC, nil
}

// Arch returns t's arch.Context.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) Arch() arch.Context {
	return t.tc.Arch
}

// MemoryManager returns t's MemoryManager. MemoryManager does not take an
// additional reference on the returned MM.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) MemoryManager() *mm.MemoryManager {
	return t.tc.MemoryManager
}

// Futex returns t's futex manager.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) Futex() *futex.Manager {
	return t.tc.fu
}

// SyscallTable returns t's syscall table.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) SyscallTable() *SyscallTable {
	return t.tc.st
}

// Stack returns the userspace stack.
//
// The arch.Stack is constructed positionally from the task's arch context,
// its MM, and the current stack pointer reported by the arch context.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) Stack() *arch.Stack {
	return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())}
}

// LoadTaskImage loads filename into a new TaskContext.
+// +// It takes several arguments: +// * mounts: MountNamespace to lookup filename in +// * root: Root to lookup filename under +// * wd: Working directory to lookup filename under +// * maxTraversals: maximum number of symlinks to follow +// * filename: path to binary to load +// * argv: Binary argv +// * envv: Binary envv +// * fs: Binary FeatureSet +func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) { + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k) + defer m.DecUsers(ctx) + + os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. Yikes. + return nil, ErrNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskContext{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: futex.NewManager(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go new file mode 100644 index 000000000..2285847a2 --- /dev/null +++ b/pkg/sentry/kernel/task_exec.go @@ -0,0 +1,240 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package kernel

// This file implements the machinery behind the execve() syscall. In brief, a
// thread executes an execve() by killing all other threads in its thread
// group, assuming the leader's identity, and then switching process images.
//
// This design is effectively mandated by Linux. From ptrace(2):
//
// """
// execve(2) under ptrace
//     When one thread in a multithreaded process calls execve(2), the
//     kernel destroys all other threads in the process, and resets the
//     thread ID of the execing thread to the thread group ID (process ID).
//     (Or, to put things another way, when a multithreaded process does an
//     execve(2), at completion of the call, it appears as though the
//     execve(2) occurred in the thread group leader, regardless of which
//     thread did the execve(2).) This resetting of the thread ID looks
//     very confusing to tracers:
//
//     * All other threads stop in PTRACE_EVENT_EXIT stop, if the
//       PTRACE_O_TRACEEXIT option was turned on. Then all other threads
//       except the thread group leader report death as if they exited via
//       _exit(2) with exit code 0.
//
//     * The execing tracee changes its thread ID while it is in the
//       execve(2). (Remember, under ptrace, the "pid" returned from
//       waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
//       That is, the tracee's thread ID is reset to be the same as its
//       process ID, which is the same as the thread group leader's thread
//       ID.
//
//     * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
//       option was turned on.
//
//     * If the thread group leader has reported its PTRACE_EVENT_EXIT stop
//       by this time, it appears to the tracer that the dead thread leader
//       "reappears from nowhere". (Note: the thread group leader does not
//       report death via WIFEXITED(status) until there is at least one
//       other live thread. This eliminates the possibility that the
//       tracer will see it dying and then reappearing.) If the thread
//       group leader was still alive, for the tracer this may look as if
//       thread group leader returns from a different system call than it
//       entered, or even "returned from a system call even though it was
//       not in any system call". If the thread group leader was not
//       traced (or was traced by a different tracer), then during
//       execve(2) it will appear as if it has become a tracee of the
//       tracer of the execing tracee.
//
//     All of the above effects are the artifacts of the thread ID change in
//     the tracee.
// """

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// execStop is a TaskStop that a task sets on itself when it wants to execve
// and is waiting for the other tasks in its thread group to exit first.
type execStop struct{}

// Killable implements TaskStop.Killable.
func (*execStop) Killable() bool { return true }

// Execve implements the execve(2) syscall by killing all other tasks in its
// thread group and switching to newTC. Execve always takes ownership of newTC.
//
// Preconditions: The caller must be running Task.doSyscallInvoke on the task
// goroutine.
func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
	// Lock ordering: the TaskSet mutex is taken before the signal mutex.
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()

	if t.tg.exiting || t.tg.execing != nil {
		// We lost to a racing group-exit, kill, or exec from another thread
		// and should just exit.
		newTC.release()
		return nil, syserror.EINTR
	}

	// Cancel any racing group stops.
	t.tg.endGroupStopLocked(false)

	// If the task has any siblings, they have to exit before the exec can
	// continue.
	t.tg.execing = t
	if t.tg.tasks.Front() != t.tg.tasks.Back() {
		// "[All] other threads except the thread group leader report death as
		// if they exited via _exit(2) with exit code 0." - ptrace(2)
		for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
			if t != sibling {
				sibling.killLocked()
			}
		}
		// The last sibling to exit will wake t.
		t.beginInternalStopLocked((*execStop)(nil))
	}

	return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil
}

// The runSyscallAfterExecStop state continues execve(2) after all siblings of
// a thread in the execve syscall have exited.
type runSyscallAfterExecStop struct {
	tc *TaskContext
}

// execute implements taskRunState.execute. It completes the exec: it promotes
// t to thread group leader, resets signal/ptrace/FD state as documented for
// execve(2), and switches t to the new TaskContext.
func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
	t.tg.pidns.owner.mu.Lock()
	t.tg.execing = nil
	if t.killed() {
		t.tg.pidns.owner.mu.Unlock()
		r.tc.release()
		return (*runInterrupt)(nil)
	}
	// We are the thread group leader now. Save our old thread ID for
	// PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this
	// point it will get a PID of 0, but this is consistent with Linux.
	oldTID := ThreadID(0)
	if tracer := t.Tracer(); tracer != nil {
		oldTID = tracer.tg.pidns.tids[t]
	}
	t.promoteLocked()
	// "During an execve(2), the dispositions of handled signals are reset to
	// the default; the dispositions of ignored signals are left unchanged. ...
	// [The] signal mask is preserved across execve(2). ... [The] pending
	// signal set is preserved across an execve(2)." - signal(7)
	//
	// Details:
	//
	// - If the thread group is sharing its signal handlers with another thread
	// group via CLONE_SIGHAND, execve forces the signal handlers to be copied
	// (see Linux's fs/exec.c:de_thread). We're not reference-counting signal
	// handlers, so we always make a copy.
	//
	// - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags,
	// restorer (if present), and mask are always reset. (See Linux's
	// fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.)
	t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec()
	// The stop condition variable must follow the (new) signal mutex.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2)
	t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable}
	// "The termination signal is reset to SIGCHLD (see clone(2))."
	t.tg.terminationSignal = linux.SIGCHLD
	// execed indicates that the process can no longer join a process group
	// in some scenarios (namely, the parent call setpgid(2) on the child).
	// See the JoinProcessGroup function in sessions.go for more context.
	t.tg.execed = true
	// Maximum RSS is preserved across execve(2).
	t.updateRSSLocked()
	// Restartable sequence state is discarded.
	t.rseqPreempted = false
	t.rseqCPUAddr = 0
	t.rseqCPU = -1
	t.tg.rscr.Store(&RSEQCriticalRegion{})
	t.tg.pidns.owner.mu.Unlock()

	// Remove FDs with the CloseOnExec flag set.
	t.FDMap().RemoveIf(func(file *fs.File, flags FDFlags) bool {
		return flags.CloseOnExec
	})

	// Switch to the new process.
	t.MemoryManager().Deactivate()
	t.mu.Lock()
	// Update credentials to reflect the execve. This should precede switching
	// MMs to ensure that dumpability has been reset first, if needed.
	t.updateCredsForExecLocked()
	t.tc.release()
	t.tc = *r.tc
	t.mu.Unlock()
	t.unstopVforkParent()
	// NOTE: All locks must be dropped prior to calling Activate.
	t.MemoryManager().Activate()

	t.ptraceExec(oldTID)
	return (*runSyscallExit)(nil)
}

// promoteLocked makes t the leader of its thread group. If t is already the
// thread group leader, promoteLocked is a no-op.
//
// Preconditions: All other tasks in t's thread group, including the existing
// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
// be locked for writing.
+func (t *Task) promoteLocked() { + oldLeader := t.tg.leader + if t == oldLeader { + return + } + // Swap the leader's TIDs with the execing task's. The latter will be + // released when the old leader is reaped below. + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader] + ns.tids[oldLeader] = oldTID + ns.tids[t] = leaderTID + ns.tasks[oldTID] = oldLeader + ns.tasks[leaderTID] = t + } + + // Inherit the old leader's start time. + oldStartTime := oldLeader.StartTime() + t.mu.Lock() + t.startTime = oldStartTime + t.mu.Unlock() + + t.tg.leader = t + t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) + t.updateLogPrefixLocked() + // Reap the original leader. If it has a tracer, detach it instead of + // waiting for it to acknowledge the original leader's death. + oldLeader.exitParentNotified = true + oldLeader.exitParentAcked = true + if tracer := oldLeader.Tracer(); tracer != nil { + delete(tracer.ptraceTracees, oldLeader) + oldLeader.forgetTracerLocked() + // Notify the tracer that it will no longer be receiving these events + // from the tracee. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue) + } + oldLeader.exitNotifyLocked(false) +} diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go new file mode 100644 index 000000000..3d49ae350 --- /dev/null +++ b/pkg/sentry/kernel/task_exit.go @@ -0,0 +1,1139 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// This file implements the task exit cycle:
//
// - Tasks are asynchronously requested to exit with Task.Kill.
//
// - When able, the task goroutine enters the exit path starting from state
// runExit.
//
// - Other tasks observe completed exits with Task.Wait (which implements the
// wait*() family of syscalls).

import (
	"errors"
	"fmt"
	"strconv"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

// An ExitStatus is a value communicated from an exiting task or thread group
// to the party that reaps it.
type ExitStatus struct {
	// Code is the numeric value passed to the call to exit or exit_group that
	// caused the exit. If the exit was not caused by such a call, Code is 0.
	Code int

	// Signo is the signal that caused the exit. If the exit was not caused by
	// a signal, Signo is 0.
	Signo int
}

// Signaled returns true if the ExitStatus indicates that the exiting task or
// thread group was killed by a signal.
func (es ExitStatus) Signaled() bool {
	return es.Signo != 0
}

// Status returns the numeric representation of the ExitStatus returned by e.g.
// the wait4() system call. The exit code occupies bits 8-15 and the signal
// number occupies the low byte, matching the traditional wait status layout.
func (es ExitStatus) Status() uint32 {
	return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff)
}

// ShellExitCode returns the numeric exit code that Bash would return for an
// exit status of es: 128 + the signal number for signal deaths, otherwise the
// exit code itself.
func (es ExitStatus) ShellExitCode() int {
	if es.Signaled() {
		return 128 + es.Signo
	}
	return es.Code
}

// TaskExitState represents a step in the task exit path.
//
// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
type TaskExitState int

const (
	// TaskExitNone indicates that the task has not begun exiting.
	TaskExitNone TaskExitState = iota

	// TaskExitInitiated indicates that the task goroutine has entered the exit
	// path, and the task is no longer eligible to participate in group stops
	// or group signal handling. TaskExitInitiated is analogous to Linux's
	// PF_EXITING.
	TaskExitInitiated

	// TaskExitZombie indicates that the task has released its resources, and
	// the task no longer prevents a sibling thread from completing execve.
	TaskExitZombie

	// TaskExitDead indicates that the task's thread IDs have been released,
	// and the task no longer prevents its thread group leader from being
	// reaped. ("Reaping" refers to the transitioning of a task from
	// TaskExitZombie to TaskExitDead.)
	TaskExitDead
)

// String implements fmt.Stringer. Unknown states are rendered as their
// numeric value.
func (t TaskExitState) String() string {
	switch t {
	case TaskExitNone:
		return "TaskExitNone"
	case TaskExitInitiated:
		return "TaskExitInitiated"
	case TaskExitZombie:
		return "TaskExitZombie"
	case TaskExitDead:
		return "TaskExitDead"
	default:
		return strconv.Itoa(int(t))
	}
}

// killLocked marks t as killed by enqueueing a SIGKILL, without causing the
// thread-group-affecting side effects SIGKILL usually has.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) killLocked() {
	// Clear killable stops.
	if t.stop != nil && t.stop.Killable() {
		t.endInternalStopLocked()
	}
	t.groupStopRequired = false
	t.pendingSignals.enqueue(&arch.SignalInfo{
		Signo: int32(linux.SIGKILL),
		// Linux just sets SIGKILL in the pending signal bitmask without
		// enqueueing an actual siginfo, such that
		// kernel/signal.c:collect_signal() initializes si_code to SI_USER.
		Code: arch.SignalInfoUser,
	})
	t.interrupt()
}

// killed returns true if t has a SIGKILL pending. killed is analogous to
// Linux's fatal_signal_pending().
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) killed() bool {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.killedLocked()
}

// killedLocked is the signal-mutex-locked form of killed.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) killedLocked() bool {
	return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
}

// PrepareExit indicates an exit with status es.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareExit(es ExitStatus) {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.exitStatus = es
}

// PrepareGroupExit indicates a group exit with status es to t's thread group.
//
// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it
// does not tail-call do_exit(), and except that it *does* set
// Task.exitStatus. (Linux does not do so until within do_exit(), since it
// reuses exit_code for ptrace.)
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareGroupExit(es ExitStatus) {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if t.tg.exiting || t.tg.execing != nil {
		// Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
		// this "group exit" is being executed by the killed sibling of an
		// execing task, then Task.Execve never set t.tg.exitStatus, so it's
		// still the zero value. This is consistent with Linux, both in intent
		// ("all other threads ... report death as if they exited via _exit(2)
		// with exit code 0" - ptrace(2), "execve under ptrace") and in
		// implementation (compare fs/exec.c:de_thread() =>
		// kernel/signal.c:zap_other_threads() and
		// kernel/exit.c:do_group_exit() =>
		// include/linux/sched.h:signal_group_exit()).
		t.exitStatus = t.tg.exitStatus
		return
	}
	t.tg.exiting = true
	t.tg.exitStatus = es
	t.exitStatus = es
	for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
		if sibling != t {
			sibling.killLocked()
		}
	}
}

// Kill requests that all tasks in ts exit as if group exiting with status es.
// Kill does not wait for tasks to exit.
//
// Kill has no analogue in Linux; it's provided for save/restore only.
func (ts *TaskSet) Kill(es ExitStatus) {
	ts.mu.Lock()
	defer ts.mu.Unlock()
	ts.Root.exiting = true
	for t := range ts.Root.tids {
		t.tg.signalHandlers.mu.Lock()
		if !t.tg.exiting {
			t.tg.exiting = true
			t.tg.exitStatus = es
		}
		t.killLocked()
		t.tg.signalHandlers.mu.Unlock()
	}
}

// advanceExitStateLocked checks that t's current exit state is oldExit, then
// sets it to newExit. If t's current exit state is not oldExit,
// advanceExitStateLocked panics.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) {
	if t.exitState != oldExit {
		panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState))
	}
	t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit)
	t.exitState = newExit
}

// runExit is the entry point into the task exit path.
type runExit struct{}

// execute implements taskRunState.execute.
func (*runExit) execute(t *Task) taskRunState {
	t.ptraceExit()
	return (*runExitMain)(nil)
}

// runExitMain performs the bulk of the exit: cleartid handling, resource
// release, ptrace detach, and child reparenting.
type runExitMain struct{}

// execute implements taskRunState.execute.
func (*runExitMain) execute(t *Task) taskRunState {
	lastExiter := t.exitThreadGroup()

	// If the task has a cleartid, and the thread group wasn't killed by a
	// signal, handle that before releasing the MM.
	if t.cleartid != 0 {
		t.tg.signalHandlers.mu.Lock()
		signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
		t.tg.signalHandlers.mu.Unlock()
		if !signaled {
			if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
				t.Futex().Wake(uintptr(t.cleartid), ^uint32(0), 1)
			}
			// If the CopyOut fails, there's nothing we can do.
		}
	}

	// Deactivate the address space before releasing the MM.
	t.Deactivate()

	// Update the max resident set size before releasing t.tc.mm.
	t.tg.pidns.owner.mu.Lock()
	t.updateRSSLocked()
	t.tg.pidns.owner.mu.Unlock()

	// Release all of the task's resources.
	t.mu.Lock()
	t.tc.release()
	t.tr.release()
	t.mu.Unlock()
	t.unstopVforkParent()

	// If this is the last task to exit from the thread group, release the
	// thread group's resources.
	if lastExiter {
		t.tg.release()
	}

	// Detach tracees.
	t.exitPtrace()

	// Reparent the task's children.
	t.exitChildren()

	// Don't tail-call runExitNotify, as exitChildren may have initiated a stop
	// to wait for a PID namespace to die.
	return (*runExitNotify)(nil)
}

// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
// group that it is no longer eligible to participate in group activities. It
// returns true if t is the last task in its thread group to call
// exitThreadGroup.
func (t *Task) exitThreadGroup() bool {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.tg.signalHandlers.mu.Lock()
	// Can't defer unlock: see below.

	t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
	t.tg.activeTasks--
	last := t.tg.activeTasks == 0

	// Ensure that someone will handle the signals we can't.
	t.setSignalMaskLocked(^linux.SignalSet(0))

	// Check if this task's exit interacts with an initiated group stop.
	if t.tg.groupStopPhase != groupStopInitiated {
		t.tg.signalHandlers.mu.Unlock()
		return last
	}
	if t.groupStopAcknowledged {
		// Un-acknowledge the group stop.
		t.tg.groupStopCount--
		t.groupStopAcknowledged = false
		// If the group stop wasn't complete before, then there is still at
		// least one other task that hasn't acknowledged the group stop, so
		// it is still not complete now.
		t.tg.signalHandlers.mu.Unlock()
		return last
	}
	if t.tg.groupStopCount != t.tg.activeTasks {
		t.tg.signalHandlers.mu.Unlock()
		return last
	}
	t.Debugf("Completing group stop")
	t.tg.groupStopPhase = groupStopComplete
	t.tg.groupStopWaitable = true
	sig := t.tg.groupStopSignal
	t.tg.groupContNotify = false
	t.tg.groupContWaitable = false
	// signalStop must be called with t's signal mutex unlocked.
	t.tg.signalHandlers.mu.Unlock()
	if t.tg.leader.parent != nil {
		t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig))
		t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
	}
	return last
}

// exitChildren reparents t's children (sending any configured parent-death
// signals), and, if t is the init process of its PID namespace, kills every
// other process in the namespace.
func (t *Task) exitChildren() {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	newParent := t.findReparentTargetLocked()
	if newParent == nil {
		// "If the init process of a PID namespace terminates, the kernel
		// terminates all of the processes in the namespace via a SIGKILL
		// signal." - pid_namespaces(7)
		t.Debugf("Init process terminating, killing namespace")
		t.tg.pidns.exiting = true
		for other := range t.tg.pidns.tids {
			if other.tg != t.tg {
				other.tg.signalHandlers.mu.Lock()
				other.sendSignalLocked(&arch.SignalInfo{
					Signo: int32(linux.SIGKILL),
				}, false /* group */)
				other.tg.signalHandlers.mu.Unlock()
			}
		}
		// TODO: The init process waits for all processes in the
		// namespace to exit before completing its own exit
		// (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
		// other tasks in the namespace are dead, except possibly for this
		// thread group's leader (which can't be reaped until this task exits).
	}
	// This is correct even if newParent is nil (it ensures that children don't
	// wait for a parent to reap them.)
	for c := range t.children {
		if sig := c.ParentDeathSignal(); sig != 0 {
			siginfo := &arch.SignalInfo{
				Signo: int32(sig),
				Code:  arch.SignalInfoUser,
			}
			siginfo.SetPid(int32(c.tg.pidns.tids[t]))
			siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
			c.tg.signalHandlers.mu.Lock()
			c.sendSignalLocked(siginfo, true /* group */)
			c.tg.signalHandlers.mu.Unlock()
		}
		c.reparentLocked(newParent)
		if newParent != nil {
			newParent.children[c] = struct{}{}
		}
	}
}

// findReparentTargetLocked returns the task to which t's children should be
// reparented. If no such task exists, findReparentTargetLocked returns nil.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) findReparentTargetLocked() *Task {
	// Reparent to any sibling in the same thread group that hasn't begun
	// exiting.
	if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil {
		return t2
	}
	// "A child process that is orphaned within the namespace will be
	// reparented to [the init process for the namespace] ..." -
	// pid_namespaces(7)
	if init := t.tg.pidns.tasks[InitTID]; init != nil {
		return init.tg.anyNonExitingTaskLocked()
	}
	return nil
}

// anyNonExitingTaskLocked returns any task in tg whose exit state is still
// TaskExitNone, or nil if there is none.
//
// Preconditions: The TaskSet mutex must be locked.
func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task {
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		if t.exitState == TaskExitNone {
			return t
		}
	}
	return nil
}

// reparentLocked changes t's parent. The new parent may be nil.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) reparentLocked(parent *Task) {
	oldParent := t.parent
	t.parent = parent
	// If a thread group leader's parent changes, reset the thread group's
	// termination signal to SIGCHLD and re-check exit notification. (Compare
	// kernel/exit.c:reparent_leader().)
	if t != t.tg.leader {
		return
	}
	if oldParent == nil && parent == nil {
		return
	}
	if oldParent != nil && parent != nil && oldParent.tg == parent.tg {
		return
	}
	t.tg.terminationSignal = linux.SIGCHLD
	if t.exitParentNotified && !t.exitParentAcked {
		t.exitParentNotified = false
		t.exitNotifyLocked(false)
	}
}

// When a task exits, other tasks in the system, notably the task's parent and
// ptracer, may want to be notified. The exit notification system ensures that
// interested tasks receive signals and/or are woken from blocking calls to
// wait*() syscalls; these notifications must be resolved before exiting tasks
// can be reaped and disappear from the system.
//
// Each task may have a parent task and/or a tracer task. If both a parent and
// a tracer exist, they may be the same task, different tasks in the same
// thread group, or tasks in different thread groups. (In the last case, Linux
// refers to the task as being ptrace-reparented due to an implementation
// detail; we avoid this terminology to avoid confusion.)
//
// A thread group is *empty* if all non-leader tasks in the thread group are
// dead, and the leader is either a zombie or dead. The exit of a thread group
// leader is never waitable - by either the parent or tracer - until the thread
// group is empty.
//
// There are a few ways for an exit notification to be resolved:
//
// - The exit notification may be acknowledged by a call to Task.Wait with
// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall).
//
// - If the notified party is the parent, and the parent thread group is not
// also the tracer thread group, and the notification signal is SIGCHLD, the
// parent may explicitly ignore the notification (see quote in exitNotify).
// Note that it's possible for the notified party to ignore the signal in other
// cases, but the notification is only resolved under the above conditions.
+// (Actually, there is one exception; see the last paragraph of the "leader, +// has tracer, tracer thread group is parent thread group" case below.) +// +// - If the notified party is the parent, and the parent does not exist, the +// notification is resolved as if ignored. (This is only possible in the +// sentry. In Linux, the only task / thread group without a parent is global +// init, and killing global init causes a kernel panic.) +// +// - If the notified party is a tracer, the tracer may detach the traced task. +// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.) +// +// In addition, if the notified party is the parent, the parent may exit and +// cause the notifying task to be reparented to another thread group. This does +// not resolve the notification; instead, the notification must be resent to +// the new parent. +// +// The series of notifications generated for a given task's exit depend on +// whether it is a thread group leader; whether the task is ptraced; and, if +// so, whether the tracer thread group is the same as the parent thread group. +// +// - Non-leader, no tracer: No notification is generated; the task is reaped +// immediately. +// +// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer +// notification is resolved (by waiting or detaching), the task is reaped. (For +// non-leaders, whether the tracer and parent thread groups are the same is +// irrelevant.) +// +// - Leader, no tracer: The task remains a zombie, with no notification sent, +// until all other tasks in the thread group are dead. (In Linux terms, this +// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks +// are removed from their thread_group list in kernel/exit.c:release_task() => +// __exit_signal() => __unhash_process().) Then the thread group's termination +// signal is sent to the parent. When the parent notification is resolved (by +// waiting or ignoring), the task is reaped. 
+// +// - Leader, has tracer, tracer thread group is not parent thread group: +// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by +// waiting or detaching), and all other tasks in the thread group are dead, the +// thread group's termination signal is sent to the parent. (Note that the +// tracer cannot resolve the exit notification by waiting until the thread +// group is empty.) When the parent notification is resolved, the task is +// reaped. +// +// - Leader, has tracer, tracer thread group is parent thread group: +// +// If all other tasks in the thread group are dead, the thread group's +// termination signal is sent to the parent. At this point, the notification +// can only be resolved by waiting. If the parent detaches from the task as a +// tracer, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// If at least one task in the thread group is not dead, SIGCHLD is sent to the +// parent. At this point, the notification cannot be resolved at all; once the +// thread group becomes empty, it can be resolved only by waiting. If the +// parent detaches from the task as a tracer before all remaining tasks die, +// then exit notification proceeds as in the case where the leader never had a +// tracer. If the parent detaches from the task as a tracer after all remaining +// tasks die, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. 
// In both of the above cases, when the parent detaches from the task as a
// tracer while the thread group is empty, whether or not the parent resolves
// the notification by ignoring it is based on the parent's SIGCHLD signal
// action, whether or not the thread group's termination signal is SIGCHLD
// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()).
//
// There is one final wrinkle: A leader can become a non-leader due to a
// sibling execve. In this case, the execing thread detaches the leader's
// tracer (if one exists) and reaps the leader immediately. In Linux, this is
// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().

// runExitNotify is the run state in which an exiting task makes its exit
// visible to interested tasks. execute returns nil, so this is the final run
// state of the task goroutine.
type runExitNotify struct{}

func (*runExitNotify) execute(t *Task) taskRunState {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
	t.tg.liveTasks--
	// Check if this completes a sibling's execve.
	if t.tg.execing != nil && t.tg.liveTasks == 1 {
		// execing blocks the addition of new tasks to the thread group, so
		// the sole living task must be the execing one.
		e := t.tg.execing
		e.tg.signalHandlers.mu.Lock()
		if _, ok := e.stop.(*execStop); ok {
			e.endInternalStopLocked()
		}
		e.tg.signalHandlers.mu.Unlock()
	}
	t.exitNotifyLocked(false)
	// The task goroutine will now exit.
	return nil
}

// exitNotifyLocked is called after changes to t's state that affect exit
// notification.
//
// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
// thanks to Linux's haphazard implementation of this functionality, such cases
// determine whether parent notifications are ignored based on the parent's
// handling of SIGCHLD, regardless of what the exited task's thread group's
// termination signal is.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
	if t.exitState != TaskExitZombie {
		// Only zombies generate exit notifications.
		return
	}
	if !t.exitTracerNotified {
		t.exitTracerNotified = true
		tracer := t.Tracer()
		if tracer == nil {
			// No tracer exists, so the tracer notification is trivially
			// resolved.
			t.exitTracerAcked = true
		} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
			// Don't set exitParentNotified if t is non-leader, even if the
			// tracer is in the parent thread group, so that if the parent
			// detaches the following call to exitNotifyLocked passes through
			// the !exitParentNotified case below and causes t to be reaped
			// immediately.
			//
			// Tracer notification doesn't care about SIG_IGN/SA_NOCLDWAIT.
			tracer.tg.signalHandlers.mu.Lock()
			tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
			tracer.tg.signalHandlers.mu.Unlock()
			// Wake EventTraceeStop waiters as well since this task will never
			// ptrace-stop again.
			tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
		} else {
			// t is a leader and the tracer is in the parent thread group.
			t.exitParentNotified = true
			sig := linux.SIGCHLD
			if t.tg.tasksCount == 1 {
				// The thread group is empty, so send the thread group's
				// termination signal rather than SIGCHLD.
				sig = t.tg.terminationSignal
			}
			// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
			// (in Linux, the check in do_notify_parent() is gated by
			// !tsk->ptrace.)
			t.parent.tg.signalHandlers.mu.Lock()
			t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
			t.parent.tg.signalHandlers.mu.Unlock()
			// See below for rationale for this event mask.
			t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
		}
	}
	if t.exitTracerAcked && !t.exitParentNotified {
		if t != t.tg.leader {
			// Non-leaders don't notify parents; both notification and ack are
			// trivially resolved.
			t.exitParentNotified = true
			t.exitParentAcked = true
		} else if t.tg.tasksCount == 1 {
			t.exitParentNotified = true
			if t.parent == nil {
				// No parent exists; treat the notification as ignored.
				t.exitParentAcked = true
			} else {
				// "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
				// set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
				// sigaction(2)), then children that terminate do not become
				// zombies and a call to wait() or waitpid() will block until all
				// children have terminated, and then fail with errno set to
				// ECHILD. (The original POSIX standard left the behavior of
				// setting SIGCHLD to SIG_IGN unspecified. Note that even though
				// the default disposition of SIGCHLD is "ignore", explicitly
				// setting the disposition to SIG_IGN results in different
				// treatment of zombie process children.) Linux 2.6 conforms to
				// this specification." - wait(2)
				//
				// Some undocumented Linux-specific details:
				//
				// - All of the above is ignored if the termination signal isn't
				// SIGCHLD.
				//
				// - SA_NOCLDWAIT causes the leader to be immediately reaped, but
				// does not suppress the SIGCHLD.
				signalParent := t.tg.terminationSignal.IsValid()
				t.parent.tg.signalHandlers.mu.Lock()
				if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
					if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
						if act.Handler == arch.SignalActIgnore {
							t.exitParentAcked = true
							signalParent = false
						} else if act.Flags&arch.SignalFlagNoCldWait != 0 {
							t.exitParentAcked = true
						}
					}
				}
				if signalParent {
					t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
				}
				t.parent.tg.signalHandlers.mu.Unlock()
				// If a task in the parent was waiting for a child group stop
				// or continue, it needs to be notified of the exit, because
				// there may be no remaining eligible tasks (so that wait
				// should return ECHILD).
				t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
			}
		}
	}
	if t.exitTracerAcked && t.exitParentAcked {
		// Both notifications are resolved; reap t.
		t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
		// Remove t from every PID namespace in which it is visible.
		for ns := t.tg.pidns; ns != nil; ns = ns.parent {
			tid := ns.tids[t]
			delete(ns.tasks, tid)
			delete(ns.tids, t)
		}
		t.tg.exitedCPUStats.Accumulate(t.CPUStats())
		t.tg.ioUsage.Accumulate(t.ioUsage)
		t.tg.signalHandlers.mu.Lock()
		t.tg.tasks.Remove(t)
		if t.tg.lastTimerSignalTask == t {
			t.tg.lastTimerSignalTask = nil
		}
		t.tg.tasksCount--
		tc := t.tg.tasksCount
		t.tg.signalHandlers.mu.Unlock()
		if tc == 1 && t != t.tg.leader {
			// t's reaping has emptied the thread group, so the (zombie)
			// leader may now be waitable; re-notify for it.
			//
			// Our fromPtraceDetach doesn't matter here (in Linux terms, this
			// is via a call to release_task()).
			t.tg.leader.exitNotifyLocked(false)
		} else if tc == 0 {
			// t was the last task in the thread group; release its process
			// group reference.
			t.tg.processGroup.decRefWithParent(t.tg.parentPG())
		}
		if t.parent != nil {
			delete(t.parent.children, t)
			t.parent = nil
		}
	}
}

// Preconditions: The TaskSet mutex must be locked.
func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo {
	info := &arch.SignalInfo{
		Signo: int32(sig),
	}
	// PID and UID are translated into the receiver's PID and user namespaces
	// respectively.
	info.SetPid(int32(receiver.tg.pidns.tids[t]))
	info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
	if t.exitStatus.Signaled() {
		info.Code = arch.CLD_KILLED
		info.SetStatus(int32(t.exitStatus.Signo))
	} else {
		info.Code = arch.CLD_EXITED
		info.SetStatus(int32(t.exitStatus.Code))
	}
	// TODO: Set utime, stime.
	return info
}

// ExitStatus returns t's exit status, which is only guaranteed to be
// meaningful if t.ExitState() != TaskExitNone.
func (t *Task) ExitStatus() ExitStatus {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.exitStatus
}

// ExitStatus returns the exit status that would be returned by a consuming
// wait*() on tg.
func (tg *ThreadGroup) ExitStatus() ExitStatus {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	// Prefer the group exit status when a group exit is in progress.
	if tg.exiting {
		return tg.exitStatus
	}
	return tg.leader.exitStatus
}

// TerminationSignal returns the thread group's termination signal.
func (tg *ThreadGroup) TerminationSignal() linux.Signal {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.terminationSignal
}

// Task events that can be waited for.
const (
	// EventExit represents an exit notification generated for a child thread
	// group leader or a tracee under the conditions specified in the comment
	// above runExitNotify.
	EventExit waiter.EventMask = 1 << iota

	// EventChildGroupStop occurs when a child thread group completes a group
	// stop (i.e. all tasks in the child thread group have entered a stopped
	// state as a result of a group stop).
	EventChildGroupStop

	// EventTraceeStop occurs when a task that is ptraced by a task in the
	// notified thread group enters a ptrace stop (see ptrace(2)).
	EventTraceeStop

	// EventGroupContinue occurs when a child thread group, or a thread group
	// whose leader is ptraced by a task in the notified thread group, that had
	// initiated or completed a group stop leaves the group stop, due to the
	// child thread group or any task in the child thread group being sent
	// SIGCONT.
	EventGroupContinue
)

// WaitOptions controls the behavior of Task.Wait.
type WaitOptions struct {
	// If SpecificTID is non-zero, only events from the task with thread ID
	// SpecificTID are eligible to be waited for. SpecificTID is resolved in
	// the PID namespace of the waiter (the method receiver of Task.Wait). If
	// no such task exists, or that task would not otherwise be eligible to be
	// waited for by the waiting task, then there are no waitable tasks and
	// Wait will return ECHILD.
	SpecificTID ThreadID

	// If SpecificPGID is non-zero, only events from ThreadGroups with a
	// matching ProcessGroupID are eligible to be waited for. (Same
	// constraints as SpecificTID apply.)
	SpecificPGID ProcessGroupID

	// Terminology note: Per waitpid(2), "a clone child is one which delivers
	// no signal, or a signal other than SIGCHLD to its parent upon
	// termination." In Linux, termination signal is technically a per-task
	// property rather than a per-thread-group property. However, clone()
	// forces no termination signal for tasks created with CLONE_THREAD, and
	// execve() resets the termination signal to SIGCHLD, so all
	// non-group-leader threads have no termination signal and are therefore
	// "clone tasks".

	// If NonCloneTasks is true, events from non-clone tasks are eligible to be
	// waited for.
	NonCloneTasks bool

	// If CloneTasks is true, events from clone tasks are eligible to be waited
	// for.
	CloneTasks bool

	// Events is a bitwise combination of the events defined above that specify
	// what events are of interest to the call to Wait.
	Events waiter.EventMask

	// If ConsumeEvent is true, the Wait should consume the event such that it
	// cannot be returned by a future Wait. Note that if a task exit is
	// consumed in this way, in most cases the task will be reaped.
	ConsumeEvent bool

	// If BlockInterruptErr is not nil, Wait will block until either an event
	// is available or there are no tasks that could produce a waitable event;
	// if that blocking is interrupted, Wait returns BlockInterruptErr. If
	// BlockInterruptErr is nil, Wait will not block.
	BlockInterruptErr error
}

// matchesTask returns true if t is eligible to be waited for under o.
//
// Preconditions: The TaskSet mutex must be locked (for reading or writing).
func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace) bool {
	if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] {
		return false
	}
	if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] {
		return false
	}
	// Leaders with termination signal SIGCHLD are non-clone tasks; everything
	// else is a clone task (see the terminology note on WaitOptions).
	if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD {
		return o.NonCloneTasks
	}
	return o.CloneTasks
}

// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g.
// waitpid(WNOHANG)) that find no waitable events, but determine that waitable
// events may exist in the future. (In contrast, if a non-blocking or blocking
// Wait determines that there are no tasks that can produce a waitable event,
// Task.Wait returns ECHILD.)
var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")

// WaitResult contains information about a waited-for event.
type WaitResult struct {
	// Task is the task that reported the event.
	Task *Task

	// TID is the thread ID of Task in the PID namespace of the task that
	// called Wait (that is, the method receiver of the call to Task.Wait). TID
	// is provided because consuming exit waits cause the thread ID to be
	// deallocated.
	TID ThreadID

	// UID is the real UID of Task in the user namespace of the task that
	// called Wait.
	UID auth.UID

	// Event is exactly one of the events defined above.
	Event waiter.EventMask

	// Status is the numeric status associated with the event.
	Status uint32
}

// Wait waits for an event from a thread group that is a child of t's thread
// group, or a task in such a thread group, or a task that is ptraced by t,
// subject to the options specified in opts.
func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) {
	if opts.BlockInterruptErr == nil {
		// Non-blocking wait: a single pass suffices.
		return t.waitOnce(opts)
	}
	// Register for events before scanning so that events arriving between
	// waitOnce and Block are not missed.
	w, ch := waiter.NewChannelEntry(nil)
	t.tg.eventQueue.EventRegister(&w, opts.Events)
	defer t.tg.eventQueue.EventUnregister(&w)
	for {
		wr, err := t.waitOnce(opts)
		if err != ErrNoWaitableEvent {
			// This includes err == nil.
			return wr, err
		}
		if err := t.Block(ch); err != nil {
			return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr)
		}
	}
}

// waitOnce makes a single non-blocking pass over all eligible children and
// tracees. It returns the first waitable event found; ErrNoWaitableEvent if
// eligible tasks exist but none has a waitable event; or ECHILD if there are
// no eligible tasks at all.
func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
	anyWaitableTasks := false

	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()

	// Without the (unimplemented) __WNOTHREAD flag, a task can wait on the
	// children and tracees of any task in the same thread group.
	for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() {
		for child := range parent.children {
			if !opts.matchesTask(child, parent.tg.pidns) {
				continue
			}
			// Non-leaders don't notify parents on exit and aren't eligible to
			// be waited on.
			if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked {
				anyWaitableTasks = true
				if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil {
					return wr, nil
				}
			}
			// Check for group stops and continues. Tasks that have passed
			// TaskExitInitiated can no longer participate in group stops.
			if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 {
				continue
			}
			if child.exitState >= TaskExitInitiated {
				continue
			}
			// If the waiter is in the same thread group as the task's
			// tracer, do not report its group stops; they will be reported
			// as ptrace stops instead. This also skips checking for group
			// continues, but they'll be checked for when scanning tracees
			// below. (Per kernel/exit.c:wait_consider_task(): "If a
			// ptracer wants to distinguish the two events for its own
			// children, it should create a separate process which takes
			// the role of real parent.")
			if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg {
				continue
			}
			anyWaitableTasks = true
			if opts.Events&EventChildGroupStop != 0 {
				if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil {
					return wr, nil
				}
			}
			if opts.Events&EventGroupContinue != 0 {
				if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil {
					return wr, nil
				}
			}
		}
		for tracee := range parent.ptraceTracees {
			if !opts.matchesTask(tracee, parent.tg.pidns) {
				continue
			}
			// Non-leaders do notify tracers on exit.
			if opts.Events&EventExit != 0 && !tracee.exitTracerAcked {
				anyWaitableTasks = true
				if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil {
					return wr, nil
				}
			}
			if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 {
				continue
			}
			if tracee.exitState >= TaskExitInitiated {
				continue
			}
			anyWaitableTasks = true
			if opts.Events&EventTraceeStop != 0 {
				if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil {
					return wr, nil
				}
			}
			if opts.Events&EventGroupContinue != 0 {
				if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil {
					return wr, nil
				}
			}
		}
	}

	if anyWaitableTasks {
		return nil, ErrNoWaitableEvent
	}
	return nil, syserror.ECHILD
}

// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult {
	if asPtracer && !target.exitTracerNotified {
		return nil
	}
	if !asPtracer && !target.exitParentNotified {
		return nil
	}
	// Zombied thread group leaders are never waitable until their thread group
	// is otherwise empty. Usually this is caught by the
	// target.exitParentNotified check above, but if t is both (in the thread
	// group of) target's tracer and parent, asPtracer may be true.
	if target == target.tg.leader && target.tg.tasksCount != 1 {
		return nil
	}
	pid := t.tg.pidns.tids[target]
	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
	status := target.exitStatus.Status()
	if !opts.ConsumeEvent {
		// Non-consuming wait: report the event without mutating target's
		// state.
		return &WaitResult{
			Task:   target,
			TID:    pid,
			UID:    uid,
			Event:  EventExit,
			Status: status,
		}
	}
	// Surprisingly, the exit status reported by a non-consuming wait can
	// differ from that reported by a consuming wait; the latter will return
	// the group exit code if one is available.
	if target.tg.exiting {
		status = target.tg.exitStatus.Status()
	}
	// t may be (in the thread group of) target's parent, tracer, or both. We
	// don't need to check for !exitTracerAcked because tracees are detached
	// here, and we don't need to check for !exitParentAcked because zombies
	// will be reaped here.
	if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified {
		target.exitTracerAcked = true
		target.ptraceTracer.Store((*Task)(nil))
		delete(t.ptraceTracees, target)
	}
	if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified {
		target.exitParentAcked = true
		if target == target.tg.leader {
			// target.tg.exitedCPUStats doesn't include target.CPUStats() yet,
			// and won't until after target.exitNotifyLocked() (maybe). Include
			// target.CPUStats() explicitly. This is consistent with Linux,
			// which accounts an exited task's cputime to its thread group in
			// kernel/exit.c:release_task() => __exit_signal(), and uses
			// thread_group_cputime_adjusted() in wait_task_zombie().
			t.tg.childCPUStats.Accumulate(target.CPUStats())
			t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats)
			t.tg.childCPUStats.Accumulate(target.tg.childCPUStats)
			// Update t's child max resident set size. The size will be the
			// maximum of this thread's size and all its childrens' sizes.
			if t.tg.childMaxRSS < target.tg.maxRSS {
				t.tg.childMaxRSS = target.tg.maxRSS
			}
			if t.tg.childMaxRSS < target.tg.childMaxRSS {
				t.tg.childMaxRSS = target.tg.childMaxRSS
			}
		}
	}
	// Re-run exit notification now that acks have been recorded; this is what
	// actually reaps target.
	target.exitNotifyLocked(false)
	return &WaitResult{
		Task:   target,
		TID:    pid,
		UID:    uid,
		Event:  EventExit,
		Status: status,
	}
}

// updateRSSLocked updates t.tg.maxRSS.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) updateRSSLocked() {
	if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS {
		t.tg.maxRSS = mmMaxRSS
	}
}

// waitCollectChildGroupStopLocked checks target's thread group for a waitable
// group stop, returning nil if there is none.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult {
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	if !target.tg.groupStopWaitable {
		return nil
	}
	pid := t.tg.pidns.tids[target]
	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
	sig := target.tg.groupStopSignal
	if opts.ConsumeEvent {
		target.tg.groupStopWaitable = false
	}
	return &WaitResult{
		Task:  target,
		TID:   pid,
		UID:   uid,
		Event: EventChildGroupStop,
		// There is no name for these status constants.
		Status: (uint32(sig)&0xff)<<8 | 0x7f,
	}
}

// waitCollectGroupContinueLocked checks target's thread group for a waitable
// group continue, returning nil if there is none.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult {
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	if !target.tg.groupContWaitable {
		return nil
	}
	pid := t.tg.pidns.tids[target]
	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
	if opts.ConsumeEvent {
		target.tg.groupContWaitable = false
	}
	return &WaitResult{
		Task:  target,
		TID:   pid,
		UID:   uid,
		Event: EventGroupContinue,
		// 0xffff is the wait status reported for a continued child (compare
		// wait(2) WIFCONTINUED).
		Status: 0xffff,
	}
}

// waitCollectTraceeStopLocked checks target for a waitable ptrace stop,
// returning nil if there is none.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult {
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	if target.stop == nil {
		return nil
	}
	if _, ok := target.stop.(*ptraceStop); !ok {
		return nil
	}
	if target.ptraceCode == 0 {
		return nil
	}
	pid := t.tg.pidns.tids[target]
	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
	code := target.ptraceCode
	if opts.ConsumeEvent {
		target.ptraceCode = 0
	}
	return &WaitResult{
		Task:   target,
		TID:    pid,
		UID:    uid,
		Event:  EventTraceeStop,
		Status: uint32(code)<<8 | 0x7f,
	}
}

// ExitState returns t's current progress through the exit path.
func (t *Task) ExitState() TaskExitState {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	return t.exitState
}

// ParentDeathSignal returns t's parent death signal.
func (t *Task) ParentDeathSignal() linux.Signal {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.parentDeathSignal
}

// SetParentDeathSignal sets t's parent death signal.
+func (t *Task) SetParentDeathSignal(sig linux.Signal) { + t.mu.Lock() + defer t.mu.Unlock() + t.parentDeathSignal = sig +} diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go new file mode 100644 index 000000000..a51fa9d7e --- /dev/null +++ b/pkg/sentry/kernel/task_identity.go @@ -0,0 +1,557 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Credentials returns t's credentials by value. +func (t *Task) Credentials() auth.Credentials { + t.mu.Lock() + defer t.mu.Unlock() + return *t.creds // Copy out with lock held. +} + +// UserNamespace returns the user namespace associated with the task. +func (t *Task) UserNamespace() *auth.UserNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.UserNamespace +} + +// HasCapabilityIn checks if the task has capability cp in user namespace ns. +func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.HasCapabilityIn(cp, ns) +} + +// HasCapability checks if the task has capability cp in its user namespace. 
func (t *Task) HasCapability(cp linux.Capability) bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.creds.HasCapability(cp)
}

// SetUID implements the semantics of setuid(2).
func (t *Task) SetUID(uid auth.UID) error {
	// setuid considers -1 to be invalid.
	if !uid.Ok() {
		return syserror.EINVAL
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	kuid := t.creds.UserNamespace.MapToKUID(uid)
	if !kuid.Ok() {
		// uid has no mapping in t's user namespace.
		return syserror.EINVAL
	}
	// "setuid() sets the effective user ID of the calling process. If the
	// effective UID of the caller is root (more precisely: if the caller has
	// the CAP_SETUID capability), the real UID and saved set-user-ID are also
	// set." - setuid(2)
	if t.creds.HasCapability(linux.CAP_SETUID) {
		t.setKUIDsUncheckedLocked(kuid, kuid, kuid)
		return nil
	}
	// "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID
	// capability) and uid does not match the real UID or saved set-user-ID of
	// the calling process."
	if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID {
		return syserror.EPERM
	}
	t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID)
	return nil
}

// SetREUID implements the semantics of setreuid(2).
func (t *Task) SetREUID(r, e auth.UID) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	// "Supplying a value of -1 for either the real or effective user ID forces
	// the system to leave that ID unchanged." - setreuid(2)
	newR := t.creds.RealKUID
	if r.Ok() {
		newR = t.creds.UserNamespace.MapToKUID(r)
		if !newR.Ok() {
			return syserror.EINVAL
		}
	}
	newE := t.creds.EffectiveKUID
	if e.Ok() {
		newE = t.creds.UserNamespace.MapToKUID(e)
		if !newE.Ok() {
			return syserror.EINVAL
		}
	}
	if !t.creds.HasCapability(linux.CAP_SETUID) {
		// "Unprivileged processes may only set the effective user ID to the
		// real user ID, the effective user ID, or the saved set-user-ID."
		if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID {
			return syserror.EPERM
		}
		// "Unprivileged users may only set the real user ID to the real user
		// ID or the effective user ID."
		if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID {
			return syserror.EPERM
		}
	}
	// "If the real user ID is set (i.e., ruid is not -1) or the effective user
	// ID is set to a value not equal to the previous real user ID, the saved
	// set-user-ID will be set to the new effective user ID."
	newS := t.creds.SavedKUID
	if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) {
		newS = newE
	}
	t.setKUIDsUncheckedLocked(newR, newE, newS)
	return nil
}

// SetRESUID implements the semantics of the setresuid(2) syscall.
func (t *Task) SetRESUID(r, e, s auth.UID) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	// "Unprivileged user processes may change the real UID, effective UID, and
	// saved set-user-ID, each to one of: the current real UID, the current
	// effective UID or the current saved set-user-ID. Privileged processes (on
	// Linux, those having the CAP_SETUID capability) may set the real UID,
	// effective UID, and saved set-user-ID to arbitrary values. If one of the
	// arguments equals -1, the corresponding value is not changed." -
	// setresuid(2)
	var err error
	newR := t.creds.RealKUID
	if r.Ok() {
		newR, err = t.creds.UseUID(r)
		if err != nil {
			return err
		}
	}
	newE := t.creds.EffectiveKUID
	if e.Ok() {
		newE, err = t.creds.UseUID(e)
		if err != nil {
			return err
		}
	}
	newS := t.creds.SavedKUID
	if s.Ok() {
		newS, err = t.creds.UseUID(s)
		if err != nil {
			return err
		}
	}
	t.setKUIDsUncheckedLocked(newR, newE, newS)
	return nil
}

// Preconditions: t.mu must be locked.
func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
	root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
	oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID
	t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS

	// "1. If one or more of the real, effective or saved set user IDs was
	// previously 0, and as a result of the UID changes all of these IDs have a
	// nonzero value, then all capabilities are cleared from the permitted and
	// effective capability sets." - capabilities(7)
	if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) {
		// prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's
		// "keep capabilities" flag, which determines whether the thread's
		// permitted capability set is cleared when a change is made to the
		// thread's user IDs such that the thread's real UID, effective
		// UID, and saved set-user-ID all become nonzero when at least
		// one of them previously had the value 0. By default, the
		// permitted capability set is cleared when such a change is
		// made; setting the "keep capabilities" flag prevents it from
		// being cleared." (A thread's effective capability set is always
		// cleared when such a credential change is made,
		// regardless of the setting of the "keep capabilities" flag.)
		if !t.creds.KeepCaps {
			t.creds.PermittedCaps = 0
			t.creds.EffectiveCaps = 0
		}
	}
	// """
	// 2. If the effective user ID is changed from 0 to nonzero, then all
	// capabilities are cleared from the effective set.
	//
	// 3. If the effective user ID is changed from nonzero to 0, then the
	// permitted set is copied to the effective set.
	// """
	if oldE == root && newE != root {
		t.creds.EffectiveCaps = 0
	} else if oldE != root && newE == root {
		t.creds.EffectiveCaps = t.creds.PermittedCaps
	}
	// "4. If the filesystem user ID is changed from 0 to nonzero (see
	// setfsuid(2)), then the following capabilities are cleared from the
	// effective set: ..."
	// (filesystem UIDs aren't implemented, nor are any of the capabilities in
	// question)

	// Not documented, but compare Linux's kernel/cred.c:commit_creds().
	if oldE != newE {
		t.parentDeathSignal = 0
	}
}

// SetGID implements the semantics of setgid(2).
func (t *Task) SetGID(gid auth.GID) error {
	// setgid considers -1 to be invalid.
	if !gid.Ok() {
		return syserror.EINVAL
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	kgid := t.creds.UserNamespace.MapToKGID(gid)
	if !kgid.Ok() {
		// gid has no mapping in t's user namespace.
		return syserror.EINVAL
	}
	// Privileged tasks (CAP_SETGID) may set all three GIDs; unprivileged
	// tasks may only set the effective GID to the real or saved GID (compare
	// SetUID).
	if t.creds.HasCapability(linux.CAP_SETGID) {
		t.setKGIDsUncheckedLocked(kgid, kgid, kgid)
		return nil
	}
	if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID {
		return syserror.EPERM
	}
	t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID)
	return nil
}

// SetREGID implements the semantics of setregid(2).
func (t *Task) SetREGID(r, e auth.GID) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	// A value of -1 leaves the corresponding ID unchanged (compare SetREUID).
	newR := t.creds.RealKGID
	if r.Ok() {
		newR = t.creds.UserNamespace.MapToKGID(r)
		if !newR.Ok() {
			return syserror.EINVAL
		}
	}
	newE := t.creds.EffectiveKGID
	if e.Ok() {
		newE = t.creds.UserNamespace.MapToKGID(e)
		if !newE.Ok() {
			return syserror.EINVAL
		}
	}
	if !t.creds.HasCapability(linux.CAP_SETGID) {
		// Unprivileged tasks are restricted exactly as in SetREUID.
		if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID {
			return syserror.EPERM
		}
		if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID {
			return syserror.EPERM
		}
	}
	newS := t.creds.SavedKGID
	if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) {
		newS = newE
	}
	t.setKGIDsUncheckedLocked(newR, newE, newS)
	return nil
}

// SetRESGID implements the semantics of the setresgid(2) syscall.
func (t *Task) SetRESGID(r, e, s auth.GID) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	var err error
	// Each argument that is not Ok() (-1 in setresgid(2)) leaves the
	// corresponding ID unchanged. UseGID presumably both maps the GID into a
	// KGID and checks that t may use it — see auth.Credentials.UseGID.
	newR := t.creds.RealKGID
	if r.Ok() {
		newR, err = t.creds.UseGID(r)
		if err != nil {
			return err
		}
	}
	newE := t.creds.EffectiveKGID
	if e.Ok() {
		newE, err = t.creds.UseGID(e)
		if err != nil {
			return err
		}
	}
	newS := t.creds.SavedKGID
	if s.Ok() {
		newS, err = t.creds.UseGID(s)
		if err != nil {
			return err
		}
	}
	t.setKGIDsUncheckedLocked(newR, newE, newS)
	return nil
}

// setKGIDsUncheckedLocked installs the given real, effective, and saved
// set-group-IDs without performing any permission checks.
//
// Preconditions: t.mu must be locked.
func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
	oldE := t.creds.EffectiveKGID
	t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS

	// Not documented, but compare Linux's kernel/cred.c:commit_creds().
	if oldE != newE {
		t.parentDeathSignal = 0
	}
}

// SetExtraGIDs attempts to change t's supplemental groups. All IDs are
// interpreted as being in t's user namespace.
func (t *Task) SetExtraGIDs(gids []auth.GID) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	// setgroups(2) requires CAP_SETGID.
	if !t.creds.HasCapability(linux.CAP_SETGID) {
		return syserror.EPERM
	}
	kgids := make([]auth.KGID, len(gids))
	for i, gid := range gids {
		kgid := t.creds.UserNamespace.MapToKGID(gid)
		if !kgid.Ok() {
			return syserror.EINVAL
		}
		kgids[i] = kgid
	}
	t.creds.ExtraKGIDs = kgids
	return nil
}

// SetCapabilitySets attempts to change t's permitted, inheritable, and
// effective capability sets.
func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	// "Permitted: This is a limiting superset for the effective capabilities
	// that the thread may assume." - capabilities(7)
	if effective & ^permitted != 0 {
		return syserror.EPERM
	}
	// "It is also a limiting superset for the capabilities that may be added
	// to the inheritable set by a thread that does not have the CAP_SETPCAP
	// capability in its effective set."
	if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) {
		return syserror.EPERM
	}
	// "If a thread drops a capability from its permitted set, it can never
	// reacquire that capability (unless it execve(2)s ..."
	if permitted & ^t.creds.PermittedCaps != 0 {
		return syserror.EPERM
	}
	// "... if a capability is not in the bounding set, then a thread can't add
	// this capability to its inheritable set, even if it was in its permitted
	// capabilities ..."
	if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 {
		return syserror.EPERM
	}
	t.creds.PermittedCaps = permitted
	t.creds.InheritableCaps = inheritable
	t.creds.EffectiveCaps = effective
	return nil
}

// DropBoundingCapability attempts to drop capability cp from t's capability
// bounding set.
func (t *Task) DropBoundingCapability(cp linux.Capability) error {
	t.mu.Lock()
	defer t.mu.Unlock()
	// prctl(2): PR_CAPBSET_DROP requires CAP_SETPCAP.
	if !t.creds.HasCapability(linux.CAP_SETPCAP) {
		return syserror.EPERM
	}
	t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp)
	return nil
}

// SetUserNamespace attempts to move t into ns.
func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
	t.mu.Lock()
	defer t.mu.Unlock()

	// "A process reassociating itself with a user namespace must have the
	// CAP_SYS_ADMIN capability in the target user namespace." - setns(2)
	//
	// If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
	// in ns (by rule 3 in auth.Credentials.HasCapability).
	if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
		return syserror.EPERM
	}

	t.creds.UserNamespace = ns
	// "The child process created by clone(2) with the CLONE_NEWUSER flag
	// starts out with a complete set of capabilities in the new user
	// namespace. Likewise, a process that creates a new user namespace using
	// unshare(2) or joins an existing user namespace using setns(2) gains a
	// full set of capabilities in that namespace."
	t.creds.PermittedCaps = auth.AllCapabilities
	t.creds.InheritableCaps = 0
	t.creds.EffectiveCaps = auth.AllCapabilities
	t.creds.BoundingCaps = auth.AllCapabilities
	// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
	// flag sets the "securebits" flags (see capabilities(7)) to their default
	// values (all flags disabled) in the child (for clone(2)) or caller (for
	// unshare(2), or setns(2)." - user_namespaces(7)
	t.creds.KeepCaps = false

	return nil
}

// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS.
func (t *Task) SetKeepCaps(k bool) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.creds.KeepCaps = k
}

// updateCredsForExec updates t.creds to reflect an execve().
//
// NOTE: We currently do not implement privileged executables
// (set-user/group-ID bits and file capabilities). This allows us to make a lot
// of simplifying assumptions:
//
// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which
// disables the features we don't support anyway, is always set. This
// drastically simplifies this function.
//
// - We don't implement AT_SECURE, because no_new_privs always being set means
// that the conditions that require AT_SECURE never arise. (Compare Linux's
// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().)
//
// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since
// seccomp-bpf is also allowed if the task has no_new_privs set.
//
// - Task.ptraceAttach does not serialize with execve as it does in Linux,
// since no_new_privs being set has the same effect as the presence of an
// unprivileged tracer.
//
// Preconditions: t.mu must be locked.
func (t *Task) updateCredsForExecLocked() {
	// """
	// During an execve(2), the kernel calculates the new capabilities of
	// the process using the following algorithm:
	//
	//     P'(permitted) = (P(inheritable) & F(inheritable)) |
	//                     (F(permitted) & cap_bset)
	//
	//     P'(effective) = F(effective) ? P'(permitted) : 0
	//
	//     P'(inheritable) = P(inheritable)    [i.e., unchanged]
	//
	// where:
	//
	//     P        denotes the value of a thread capability set before the
	//              execve(2)
	//
	//     P'       denotes the value of a thread capability set after the
	//              execve(2)
	//
	//     F        denotes a file capability set
	//
	//     cap_bset is the value of the capability bounding set
	//
	// ...
	//
	// In order to provide an all-powerful root using capability sets, during
	// an execve(2):
	//
	// 1. If a set-user-ID-root program is being executed, or the real user ID
	// of the process is 0 (root) then the file inheritable and permitted sets
	// are defined to be all ones (i.e. all capabilities enabled).
	//
	// 2. If a set-user-ID-root program is being executed, then the file
	// effective bit is defined to be one (enabled).
	//
	// The upshot of the above rules, combined with the capabilities
	// transformations described above, is that when a process execve(2)s a
	// set-user-ID-root program, or when a process with an effective UID of 0
	// execve(2)s a program, it gains all capabilities in its permitted and
	// effective capability sets, except those masked out by the capability
	// bounding set.
	// """ - capabilities(7)
	// (ambient capability sets omitted)
	//
	// As the last paragraph implies, the case of "a set-user-ID root program
	// is being executed" also includes the case where (namespace) root is
	// executing a non-set-user-ID program; the actual check is just based on
	// the effective user ID.
	var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0
	fileEffective := false
	root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
	if t.creds.EffectiveKUID == root || t.creds.RealKUID == root {
		newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps
		if t.creds.EffectiveKUID == root {
			fileEffective = true
		}
	}

	// Now we enter poorly-documented, somewhat confusing territory. (The
	// accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds
	// is not very helpful.) My reading of it is:
	//
	// If at least one of the following is true:
	//
	// A1. The execing task is ptraced, and the tracer did not have
	// CAP_SYS_PTRACE in the execing task's user namespace at the time of
	// PTRACE_ATTACH.
	//
	// A2. The execing task shares its FS context with at least one task in
	// another thread group.
	//
	// A3. The execing task has no_new_privs set.
	//
	// AND at least one of the following is true:
	//
	// B1. The new effective user ID (which may come from set-user-ID, or be the
	// execing task's existing effective user ID) is not equal to the task's
	// real UID.
	//
	// B2. The new effective group ID (which may come from set-group-ID, or be
	// the execing task's existing effective group ID) is not equal to the
	// task's real GID.
	//
	// B3. The new permitted capability set contains capabilities not in the
	// task's permitted capability set.
	//
	// Then:
	//
	// C1. Limit the new permitted capability set to the task's permitted
	// capability set.
	//
	// C2. If either the task does not have CAP_SETUID in its user namespace, or
	// the task has no_new_privs set, force the new effective UID and GID to
	// the task's real UID and GID.
	//
	// But since no_new_privs is always set (A3 is always true), this becomes
	// much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1
	// is a no-op. So we can just do C1 and C2 unconditionally.
	if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID {
		t.creds.EffectiveKUID = t.creds.RealKUID
		t.creds.EffectiveKGID = t.creds.RealKGID
		t.parentDeathSignal = 0
	}
	// (Saved set-user-ID is always set to the new effective user ID, and saved
	// set-group-ID is always set to the new effective group ID, regardless of
	// the above.)
	//
	// At this point EffectiveKUID == RealKUID and EffectiveKGID == RealKGID
	// (forced above if they differed), so assigning the real IDs here is
	// equivalent to assigning the new effective IDs.
	t.creds.SavedKUID = t.creds.RealKUID
	t.creds.SavedKGID = t.creds.RealKGID
	t.creds.PermittedCaps &= newPermitted
	if fileEffective {
		t.creds.EffectiveCaps = t.creds.PermittedCaps
	} else {
		t.creds.EffectiveCaps = 0
	}

	// prctl(2): The "keep capabilities" value will be reset to 0 on subsequent
	// calls to execve(2).
	t.creds.KeepCaps = false

	// "The bounding set is inherited at fork(2) from the thread's parent, and
	// is preserved across an execve(2)". So we're done.
}
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
new file mode 100644
index 000000000..18efacb19
--- /dev/null
+++ b/pkg/sentry/kernel/task_log.go
@@ -0,0 +1,137 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"sort"

	"gvisor.googlesource.com/gvisor/pkg/log"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

const (
	// maxStackDebugBytes is the maximum number of user stack bytes that may be
	// printed by debugDumpStack.
	maxStackDebugBytes = 1024
)

// Infof logs a formatted info message by calling log.Infof.
func (t *Task) Infof(fmt string, v ...interface{}) {
	if log.IsLogging(log.Info) {
		log.Infof(t.logPrefix.Load().(string)+fmt, v...)
	}
}

// Warningf logs a formatted warning message by calling log.Warningf.
func (t *Task) Warningf(fmt string, v ...interface{}) {
	if log.IsLogging(log.Warning) {
		log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
	}
}

// Debugf logs a formatted debug message by calling log.Debugf.
func (t *Task) Debugf(fmt string, v ...interface{}) {
	if log.IsLogging(log.Debug) {
		log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
	}
}

// IsLogging returns true iff this level is being logged.
func (t *Task) IsLogging(level log.Level) bool {
	return log.IsLogging(level)
}

// DebugDumpState logs task state at log level debug.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) DebugDumpState() {
	t.debugDumpRegisters()
	t.debugDumpStack()
	if mm := t.MemoryManager(); mm != nil {
		t.Debugf("Mappings:\n%s", mm)
	}
	t.Debugf("FDMap:\n%s", t.FDMap())
}

// debugDumpRegisters logs register state at log level debug.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) debugDumpRegisters() {
	if !t.IsLogging(log.Debug) {
		return
	}
	regmap, err := t.Arch().RegisterMap()
	if err != nil {
		t.Debugf("Registers: %v", err)
	} else {
		t.Debugf("Registers:")
		// Sort register names so the dump is deterministic.
		var regs []string
		for reg := range regmap {
			regs = append(regs, reg)
		}
		sort.Strings(regs)
		for _, reg := range regs {
			t.Debugf("%-8s = %016x", reg, regmap[reg])
		}
	}
}

// debugDumpStack logs user stack contents at log level debug.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) debugDumpStack() {
	if !t.IsLogging(log.Debug) {
		return
	}
	m := t.MemoryManager()
	if m == nil {
		t.Debugf("Memory manager for task is gone, skipping application stack dump.")
		return
	}
	t.Debugf("Stack:")
	start := usermem.Addr(t.Arch().Stack())
	// Round addr down to a 16-byte boundary.
	start &= ^usermem.Addr(15)
	// Print 16 bytes per line, one byte at a time.
	for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 {
		addr, ok := start.AddLength(offset)
		if !ok {
			break
		}
		var data [16]byte
		n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{
			IgnorePermissions: true,
		})
		// Print as much of the line as we can, even if an error was
		// encountered.
		if n > 0 {
			t.Debugf("%x: % x", addr, data[:n])
		}
		if err != nil {
			t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err)
			break
		}
	}
}

// updateLogPrefixLocked updates the task's cached log prefix to reflect its
// current thread ID.
//
// Preconditions: The task's owning TaskSet.mu must be locked.
func (t *Task) updateLogPrefixLocked() {
	// Use the task's TID in the root PID namespace for logging.
	t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t]))
}
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
new file mode 100644
index 000000000..4df2e53d3
--- /dev/null
+++ b/pkg/sentry/kernel/task_net.go
@@ -0,0 +1,35 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/inet"
)

// IsNetworkNamespaced returns true if t is in a non-root network namespace.
func (t *Task) IsNetworkNamespaced() bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.netns
}

// NetworkContext returns the network stack used by the task. NetworkContext
// may return nil if no network stack is available.
func (t *Task) NetworkContext() inet.Stack {
	// Only the root network namespace is backed by the Kernel's network
	// stack; tasks in any other namespace see no stack at all.
	if t.IsNetworkNamespaced() {
		return nil
	}
	return t.k.networkStack
}
diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go
new file mode 100644
index 000000000..e529f0c2d
--- /dev/null
+++ b/pkg/sentry/kernel/task_resources.go
@@ -0,0 +1,126 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
)

// TaskResources is the subset of a task's data provided by its creator that is
// not provided by the loader.
type TaskResources struct {
	// SignalMask is the set of signals whose delivery is currently blocked.
	//
	// FIXME: Determine if we also need RealSignalMask
	SignalMask linux.SignalSet

	// FSContext is the filesystem context.
	*FSContext

	// FDMap provides access to files to the task.
	*FDMap

	// AbstractSockets tracks abstract sockets that are in use.
	AbstractSockets *AbstractSocketNamespace
}

// newTaskResources returns a new TaskResources, taking an additional reference
// on fdm. (Note the asymmetry: fc is not IncRef'd here, so the caller's
// reference on fc is transferred to the returned TaskResources.)
func newTaskResources(fdm *FDMap, fc *FSContext) *TaskResources {
	fdm.IncRef()
	return &TaskResources{
		FDMap:           fdm,
		FSContext:       fc,
		AbstractSockets: NewAbstractSocketNamespace(),
	}
}

// release releases all resources held by the TaskResources. release is called
// by the task when it exits.
func (tr *TaskResources) release() {
	tr.FDMap.DecRef()
	tr.FDMap = nil
	tr.FSContext.DecRef()
	tr.FSContext = nil
	tr.AbstractSockets = nil
}

// Fork returns a duplicate of tr.
//
// shareFiles and shareFSContext control whether the duplicate shares (takes a
// reference on) or copies the FDMap and FSContext respectively; the
// AbstractSocketNamespace is always shared.
//
// FIXME: Preconditions: When tr is owned by a Task, that task's
// signal mutex must be locked, or Fork must be called by the task's goroutine.
func (tr *TaskResources) Fork(shareFiles bool, shareFSContext bool) *TaskResources {
	var fdmap *FDMap
	if shareFiles {
		fdmap = tr.FDMap
		fdmap.IncRef()
	} else {
		fdmap = tr.FDMap.Fork()
	}

	var fsc *FSContext
	if shareFSContext {
		fsc = tr.FSContext
		fsc.IncRef()
	} else {
		fsc = tr.FSContext.Fork()
	}

	return &TaskResources{
		SignalMask:      tr.SignalMask,
		FDMap:           fdmap,
		FSContext:       fsc,
		AbstractSockets: tr.AbstractSockets,
	}
}

// FDMap returns t's FDMap.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) FDMap() *FDMap {
	return t.tr.FDMap
}

// FSContext returns t's FSContext.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) FSContext() *FSContext {
	return t.tr.FSContext
}

// MountNamespace returns t's MountNamespace. MountNamespace does not take an additional
// reference on the returned MountNamespace.
func (t *Task) MountNamespace() *fs.MountNamespace {
	return t.k.mounts
}

// AbstractSockets returns t's AbstractSocketNamespace.
func (t *Task) AbstractSockets() *AbstractSocketNamespace {
	return t.tr.AbstractSockets
}

// IsChrooted returns true if the root directory of t's FSContext is not the
// root directory of t's MountNamespace.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) IsChrooted() bool {
	realRoot := t.k.mounts.Root()
	defer realRoot.DecRef()
	return t.tr.FSContext.root != realRoot
}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
new file mode 100644
index 000000000..94ce5582b
--- /dev/null
+++ b/pkg/sentry/kernel/task_run.go
@@ -0,0 +1,346 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"bytes"
	"runtime"
	"sync/atomic"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// A taskRunState is a reified state in the task state machine. See README.md
// for details. The canonical list of all run states, as well as transitions
// between them, is given in run_states.dot.
//
// The set of possible states is enumerable and completely defined by the
// kernel package, so taskRunState would ideally be represented by a
// discriminated union. However, Go does not support sum types.
//
// Hence, as with TaskStop, data-free taskRunStates should be represented as
// typecast nils to avoid unnecessary allocation.
type taskRunState interface {
	// execute executes the code associated with this state over the given task
	// and returns the following state. If execute returns nil, the task
	// goroutine should exit.
	//
	// It is valid to tail-call a following state's execute to avoid the
	// overhead of converting the following state to an interface object and
	// checking for stops, provided that the tail-call cannot recurse.
	execute(*Task) taskRunState
}

// run runs the task goroutine.
//
// threadID is a dummy value set to the task's TID in the root PID namespace
// to make it visible in stack dumps. A goroutine for a given task can be
// identified by searching for Task.run()'s argument value.
func (t *Task) run(threadID uintptr) {
	// Construct t.blockingTimer here. We do this here because we can't
	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
	// kernel.timekeeper.SetClocks() hasn't been called yet.
	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
	defer t.blockingTimer.Destroy()
	t.blockingTimerChan = blockingTimerChan

	// Activate our address space.
	t.Activate()
	// The corresponding t.Deactivate occurs in the exit path
	// (runExitMain.execute) so that when
	// Platform.CooperativelySharesAddressSpace() == true, we give up the
	// AddressSpace before the task goroutine finishes executing.

	// Ensure that thread group timers for execution time reflect that this
	// task now exists.
	t.tg.tm.kick()

	// If this is a newly-started task, it should check for participation in
	// group stops. If this is a task resuming after restore, it was
	// interrupted by saving. In either case, the task is initially
	// interrupted.
	t.interruptSelf()

	for {
		// Explanation for this ordering:
		//
		// - A freshly-started task that is stopped should not do anything
		// before it enters the stop.
		//
		// - If taskRunState.execute returns nil, the task goroutine should
		// exit without checking for a stop.
		//
		// - Task.Start won't start Task.run if t.runState is nil, so this
		// ordering is safe.
		t.doStop()
		t.runState = t.runState.execute(t)
		if t.runState == nil {
			t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
			t.goroutineStopped.Done()
			t.tg.liveGoroutines.Done()
			t.tg.pidns.owner.liveGoroutines.Done()
			t.tg.pidns.owner.runningGoroutines.Done()

			// Keep argument alive because stack trace for dead variables may not be correct.
			runtime.KeepAlive(threadID)
			return
		}
	}
}

// doStop is called by Task.run to block until the task is not stopped.
func (t *Task) doStop() {
	// Fast path: no stop is in effect.
	if atomic.LoadInt32(&t.stopCount) == 0 {
		return
	}
	t.Deactivate()
	// NOTE: t.Activate() must be called without any locks held, so
	// this defer must precede the defer for unlocking the signal mutex.
	defer t.Activate()
	t.accountTaskGoroutineEnter(TaskGoroutineStopped)
	defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.tg.pidns.owner.runningGoroutines.Add(-1)
	defer t.tg.pidns.owner.runningGoroutines.Add(1)
	t.goroutineStopped.Add(-1)
	defer t.goroutineStopped.Add(1)
	for t.stopCount > 0 {
		t.endStopCond.Wait()
	}
}

// The runApp state checks for interrupts before executing untrusted
// application code.
type runApp struct{}

// execute runs the application until it is interrupted, faults, makes a
// syscall, or receives a signal, then returns the next run state.
func (*runApp) execute(t *Task) taskRunState {
	if t.interrupted() {
		// Checkpointing instructs tasks to stop by sending an interrupt, so we
		// must check for stops before entering runInterrupt (instead of
		// tail-calling it).
		return (*runInterrupt)(nil)
	}

	// We're about to switch to the application again. If there's still a
	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
	// restart the syscall that was interrupted. If there's a saved signal
	// mask, restore it. (Note that restoring the saved signal mask may unblock
	// a pending signal, causing another interruption, but that signal should
	// not interact with the interrupted syscall.)
	if t.haveSyscallReturn {
		if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
			if sre == ERESTART_RESTARTBLOCK {
				t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
				t.Arch().RestartSyscallWithRestartBlock()
			} else {
				t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
				t.Arch().RestartSyscall()
			}
		}
		t.haveSyscallReturn = false
	}
	if t.haveSavedSignalMask {
		t.SetSignalMask(t.savedSignalMask)
		t.haveSavedSignalMask = false
		if t.interrupted() {
			return (*runInterrupt)(nil)
		}
	}

	// Apply restartable sequences.
	if t.rseqPreempted {
		t.rseqPreempted = false
		if t.rseqCPUAddr != 0 {
			if err := t.rseqCopyOutCPU(); err != nil {
				t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err)
				t.forceSignal(linux.SIGSEGV, false)
				t.SendSignal(sigPriv(linux.SIGSEGV))
				// Re-enter the task run loop for signal delivery.
				return (*runApp)(nil)
			}
		}
		t.rseqInterrupt()
	}

	// Check if we need to enable single-stepping. Tracers expect that the
	// kernel preserves the value of the single-step flag set by PTRACE_SETREGS
	// whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
	// includes our ptrace platform, by the way), so we should only clear the
	// single-step flag if we're responsible for setting it. (clearSinglestep
	// is therefore analogous to Linux's TIF_FORCED_TF.)
	//
	// Strictly speaking, we should also not clear the single-step flag if we
	// single-step through an instruction that sets the single-step flag
	// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
	// own TF. (Famous last words, I know.)
	clearSinglestep := false
	if t.hasTracer() {
		t.tg.pidns.owner.mu.RLock()
		if t.ptraceSinglestep {
			clearSinglestep = !t.Arch().SingleStep()
			t.Arch().SetSingleStep()
		}
		t.tg.pidns.owner.mu.RUnlock()
	}

	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
	info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)

	if clearSinglestep {
		t.Arch().ClearSingleStep()
	}

	switch err {
	case nil:
		// Handle application system call.
		return t.doSyscall()

	case platform.ErrContextInterrupt:
		// Interrupted by platform.Context.Interrupt(). Re-enter the run
		// loop to figure out why.
		return (*runApp)(nil)

	case platform.ErrContextSignal:
		// Looks like a signal has been delivered to us. If it's a synchronous
		// signal (SEGV, SIGBUS, etc.), it should be sent to the application
		// thread that received it.
		sig := linux.Signal(info.Signo)

		// Was it a fault that we should handle internally? If so, this wasn't
		// an application-generated signal and we should continue execution
		// normally.
		if at.Any() {
			addr := usermem.Addr(info.Addr())
			err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
			if err == nil {
				// The fault was handled appropriately.
				// We can resume running the application.
				return (*runApp)(nil)
			}

			// Is this a vsyscall that we need emulate?
			if at.Execute {
				if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
					return t.doVsyscall(addr, sysno)
				}
			}

			// The JVM will trigger these errors constantly, so don't
			// spam logs with this error.
			if err == syserror.EFAULT || err == syserror.EPERM {
				t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
			} else {
				t.Warningf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
			}
			t.DebugDumpState()

			// Continue to signal handling.
			//
			// Convert a BusError error to a SIGBUS from a SIGSEGV. All
			// other info bits stay the same (address, etc.).
			if _, ok := err.(*memmap.BusError); ok {
				sig = linux.SIGBUS
				info.Signo = int32(linux.SIGBUS)
			}
		}

		switch sig {
		case linux.SIGILL:
			// N.B. The debug stuff here is arguably
			// expensive. Don't fret. This gets called
			// about 5 times for a typical application, if
			// that.
			t.Debugf("SIGILL @ %x", t.Arch().IP())

			// Is this a CPUID instruction?
			expected := arch.CPUIDInstruction[:]
			found := make([]byte, len(expected))
			_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
			if err == nil && bytes.Equal(expected, found) {
				// Skip the cpuid instruction.
				t.Arch().CPUIDEmulate(t)
				t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
				break
			}

			// Treat it like any other synchronous signal.
			fallthrough

		case linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
			// Synchronous signal. Send it to ourselves. Assume the signal is
			// legitimate and force it (work around the signal being ignored or
			// blocked) like Linux does. Conveniently, this is even the correct
			// behavior for SIGTRAP from single-stepping.
			t.forceSignal(linux.Signal(sig), false /* unconditional */)
			t.SendSignal(info)

		case platform.SignalInterrupt:
			// Assume that a call to platform.Context.Interrupt() misfired.

		case linux.SIGPROF:
			// It's a profiling interrupt: there's not much
			// we can do. We've already paid a decent cost
			// by intercepting the signal, at this point we
			// simply ignore it.

		default:
			// Asynchronous signal. Let the system deal with it.
			t.k.sendExternalSignal(info, "application")
		}

		return (*runApp)(nil)

	case platform.ErrContextCPUPreempted:
		// Ensure that RSEQ critical sections are interrupted and per-thread
		// CPU values are updated before the next platform.Context.Switch().
		t.rseqPreempted = true
		return (*runApp)(nil)

	default:
		// What happened? Can't continue.
		t.Warningf("Unexpected SwitchToApp error: %v", err)
		t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)})
		return (*runExit)(nil)
	}
}

// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
func (t *Task) waitGoroutineStoppedOrExited() {
	t.goroutineStopped.Wait()
}

// WaitExited blocks until all task goroutines in tg have exited.
//
// WaitExited does not correspond to anything in Linux; it's provided so that
// external callers of Kernel.CreateProcess can wait for the created thread
// group to terminate.
func (tg *ThreadGroup) WaitExited() {
	tg.liveGoroutines.Wait()
}

// Yield yields the processor for the calling task.
func (t *Task) Yield() {
	atomic.AddUint64(&t.yieldCount, 1)
	runtime.Gosched()
}
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
new file mode 100644
index 000000000..b50139077
--- /dev/null
+++ b/pkg/sentry/kernel/task_sched.go
@@ -0,0 +1,329 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// CPU scheduling, real and fake. + +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TaskGoroutineState is a coarse representation of the current execution +// status of a kernel.Task goroutine. +type TaskGoroutineState int + +const ( + // TaskGoroutineNonexistent indicates that the task goroutine has either + // not yet been created by Task.Start() or has returned from Task.run(). + // This must be the zero value for TaskGoroutineState. + TaskGoroutineNonexistent TaskGoroutineState = iota + + // TaskGoroutineRunningSys indicates that the task goroutine is executing + // sentry code. + TaskGoroutineRunningSys + + // TaskGoroutineRunningApp indicates that the task goroutine is executing + // application code. + TaskGoroutineRunningApp + + // TaskGoroutineBlockedInterruptible indicates that the task goroutine is + // blocked in Task.block(), and hence may be woken by Task.interrupt() + // (e.g. due to signal delivery). + TaskGoroutineBlockedInterruptible + + // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is + // stopped outside of Task.block() and Task.doStop(), and hence cannot be + // woken by Task.interrupt(). 
+ TaskGoroutineBlockedUninterruptible + + // TaskGoroutineStopped indicates that the task goroutine is blocked in + // Task.doStop(). TaskGoroutineStopped is similar to + // TaskGoroutineBlockedUninterruptible, but is a separate state to make it + // possible to determine when Task.stop is meaningful. + TaskGoroutineStopped +) + +// TaskGoroutineSchedInfo contains task goroutine scheduling state which must +// be read and updated atomically. +type TaskGoroutineSchedInfo struct { + // Timestamp was the value of Kernel.cpuClock when this + // TaskGoroutineSchedInfo was last updated. + Timestamp uint64 + + // State is the current state of the task goroutine. + State TaskGoroutineState + + // UserTicks is the amount of time the task goroutine has spent executing + // its associated Task's application code, in units of linux.ClockTick. + UserTicks uint64 + + // SysTicks is the amount of time the task goroutine has spent executing in + // the sentry, in units of linux.ClockTick. + SysTicks uint64 +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.gosched.State = state + t.goschedSeq.EndWrite() +} + +// Preconditions: The caller must be running on the task goroutine, and leaving +// a state indicated by a previous call to +// t.accountTaskGoroutineEnter(state). 
+func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != state { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + if state == TaskGoroutineRunningApp { + t.gosched.UserTicks += now - t.gosched.Timestamp + } + t.gosched.Timestamp = now + t.gosched.State = TaskGoroutineRunningSys + t.goschedSeq.EndWrite() +} + +// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. +// Most clients should use t.CPUStats() instead. +func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { + return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched) +} + +// CPUStats returns the CPU usage statistics of t. +func (t *Task) CPUStats() usage.CPUStats { + return t.cpuStatsAt(t.k.CPUClockNow()) +} + +// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is +// monotonic, this is satisfied if now is the result of a previous call to +// Kernel.CPUClockNow().) This requirement exists because otherwise a racing +// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making +// the returned stats non-monotonic. +func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { + tsched := t.TaskGoroutineSchedInfo() + if tsched.Timestamp < now { + // Update stats to reflect execution since the last update to + // t.gosched. 
+ switch tsched.State { + case TaskGoroutineRunningSys: + tsched.SysTicks += now - tsched.Timestamp + case TaskGoroutineRunningApp: + tsched.UserTicks += now - tsched.Timestamp + } + } + return usage.CPUStats{ + UserTime: time.Duration(tsched.UserTicks * uint64(linux.ClockTick)), + SysTime: time.Duration(tsched.SysTicks * uint64(linux.ClockTick)), + VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), + } +} + +// CPUStats returns the combined CPU usage statistics of all past and present +// threads in tg. +func (tg *ThreadGroup) CPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + // Hack to get a pointer to the Kernel. + if tg.leader == nil { + // Per comment on tg.leader, this is only possible if nothing in the + // ThreadGroup has ever executed anyway. + return usage.CPUStats{} + } + now := tg.leader.k.CPUClockNow() + stats := tg.exitedCPUStats + // Account for active tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + stats.Accumulate(t.cpuStatsAt(now)) + } + return stats +} + +// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return +// resource usage statistics for all children of [tg] that have terminated and +// been waited for. These statistics will include the resources used by +// grandchildren, and further removed descendants, if all of the intervening +// descendants waited on their terminated children." +func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.childCPUStats +} + +// StateStatus returns a string representation of the task's current state, +// appropriate for /proc/[pid]/status. 
+func (t *Task) StateStatus() string { + switch s := t.TaskGoroutineSchedInfo().State; s { + case TaskGoroutineNonexistent: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.exitState { + case TaskExitZombie: + return "Z (zombie)" + case TaskExitDead: + return "X (dead)" + default: + // The task goroutine can't exit before passing through + // runExitNotify, so this indicates that the task has been created, + // but the task goroutine hasn't yet started. The Linux equivalent + // is struct task_struct::state == TASK_NEW + // (kernel/fork.c:copy_process() => + // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is + // masked out by TASK_REPORT for /proc/[pid]/status, leaving only + // TASK_RUNNING. + return "R (running)" + } + case TaskGoroutineRunningSys, TaskGoroutineRunningApp: + return "R (running)" + case TaskGoroutineBlockedInterruptible: + return "S (sleeping)" + case TaskGoroutineStopped: + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + switch t.stop.(type) { + case *groupStop: + return "T (stopped)" + case *ptraceStop: + return "t (tracing stop)" + } + fallthrough + case TaskGoroutineBlockedUninterruptible: + // This is the name Linux uses for TASK_UNINTERRUPTIBLE and + // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL): + // fs/proc/array.c:task_state_array. + return "D (disk sleep)" + default: + panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s)) + } +} + +// CPUMask returns a copy of t's allowed CPU mask. +func (t *Task) CPUMask() sched.CPUSet { + t.mu.Lock() + defer t.mu.Unlock() + return t.allowedCPUMask.Copy() +} + +// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of +// mask. +// +// Preconditions: mask.Size() == +// sched.CPUSetSize(t.Kernel().ApplicationCores()). 
+func (t *Task) SetCPUMask(mask sched.CPUSet) error { + if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want { + panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want)) + } + + // Remove CPUs in mask above Kernel.applicationCores. + mask.ClearAbove(t.k.applicationCores) + + // Ensure that at least 1 CPU is still allowed. + if mask.NumCPUs() == 0 { + return syserror.EINVAL + } + + if t.k.useHostCores { + // No-op; pretend the mask was immediately changed back. + return nil + } + + t.tg.pidns.owner.mu.RLock() + rootTID := t.tg.pidns.owner.Root.tids[t] + t.tg.pidns.owner.mu.RUnlock() + + t.mu.Lock() + defer t.mu.Unlock() + t.allowedCPUMask = mask + atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID)) + return nil +} + +// CPU returns the cpu id for a given task. +func (t *Task) CPU() int32 { + if t.k.useHostCores { + return int32(hostcpu.GetCPU()) + } + + return atomic.LoadInt32(&t.cpu) +} + +// assignCPU returns the virtualized CPU number for the task with global TID +// tid and allowedCPUMask allowed. +func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) { + // To pretend that threads are evenly distributed to allowed CPUs, choose n + // to be less than the number of CPUs in allowed ... + n := int(tid) % int(allowed.NumCPUs()) + // ... then pick the nth CPU in allowed. + allowed.ForEachCPU(func(c uint) { + if n--; n == 0 { + cpu = int32(c) + } + }) + return cpu +} + +// Niceness returns t's niceness. +func (t *Task) Niceness() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness +} + +// Priority returns t's priority. +func (t *Task) Priority() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness + 20 +} + +// SetNiceness sets t's niceness to n. +func (t *Task) SetNiceness(n int) { + t.mu.Lock() + defer t.mu.Unlock() + t.niceness = n +} + +// NumaPolicy returns t's current numa policy. 
+func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) { + t.mu.Lock() + defer t.mu.Unlock() + return t.numaPolicy, t.numaNodeMask +} + +// SetNumaPolicy sets t's numa policy. +func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) { + t.mu.Lock() + defer t.mu.Unlock() + t.numaPolicy = policy + t.numaNodeMask = nodeMask +} diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go new file mode 100644 index 000000000..2340256b0 --- /dev/null +++ b/pkg/sentry/kernel/task_signals.go @@ -0,0 +1,1056 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file defines the behavior of task signal handling. + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SignalAction is an internal signal action. +type SignalAction int + +// Available signal actions. +// Note that although we refer the complete set internally, +// the application is only capable of using the Default and +// Ignore actions from the system call interface. +const ( + SignalActionTerm SignalAction = iota + SignalActionCore + SignalActionStop + SignalActionIgnore + SignalActionHandler +) + +// Default signal handler actions. 
Note that for most signals, +// (except SIGKILL and SIGSTOP) these can be overridden by the app. +var defaultActions = map[linux.Signal]SignalAction{ + // POSIX.1-1990 standard. + linux.SIGHUP: SignalActionTerm, + linux.SIGINT: SignalActionTerm, + linux.SIGQUIT: SignalActionCore, + linux.SIGILL: SignalActionCore, + linux.SIGABRT: SignalActionCore, + linux.SIGFPE: SignalActionCore, + linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects + linux.SIGSEGV: SignalActionCore, + linux.SIGPIPE: SignalActionTerm, + linux.SIGALRM: SignalActionTerm, + linux.SIGTERM: SignalActionTerm, + linux.SIGUSR1: SignalActionTerm, + linux.SIGUSR2: SignalActionTerm, + linux.SIGCHLD: SignalActionIgnore, + linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects + linux.SIGSTOP: SignalActionStop, + linux.SIGTSTP: SignalActionStop, + linux.SIGTTIN: SignalActionStop, + linux.SIGTTOU: SignalActionStop, + // POSIX.1-2001 standard. + linux.SIGBUS: SignalActionCore, + linux.SIGPROF: SignalActionTerm, + linux.SIGSYS: SignalActionCore, + linux.SIGTRAP: SignalActionCore, + linux.SIGURG: SignalActionIgnore, + linux.SIGVTALRM: SignalActionTerm, + linux.SIGXCPU: SignalActionCore, + linux.SIGXFSZ: SignalActionCore, + // The rest on linux. + linux.SIGSTKFLT: SignalActionTerm, + linux.SIGIO: SignalActionTerm, + linux.SIGPWR: SignalActionTerm, + linux.SIGWINCH: SignalActionIgnore, +} + +// computeAction figures out what to do given a signal number +// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop, +// and SIGKILL always results in a SignalActionTerm. +// Signal 0 is always ignored as many programs use it for various internal functions +// and don't expect it to do anything. +// +// In the event the signal is not one of these, act.Handler determines what +// happens next. +// If act.Handler is: +// 0, the default action is taken; +// 1, the signal is ignored; +// anything else, the function returns SignalActionHandler. 
+func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction { + switch sig { + case linux.SIGSTOP: + return SignalActionStop + case linux.SIGKILL: + return SignalActionTerm + case linux.Signal(0): + return SignalActionIgnore + } + + switch act.Handler { + case arch.SignalActDefault: + return defaultActions[sig] + case arch.SignalActIgnore: + return SignalActionIgnore + default: + return SignalActionHandler + } +} + +// UnblockableSignals contains the set of signals which cannot be blocked. +var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP) + +// StopSignals is the set of signals whose default action is SignalActionStop. +var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU) + +// dequeueSignalLocked returns a pending unmasked signal. If there are no +// pending unmasked signals, dequeueSignalLocked returns nil. +// +// Preconditions: t.tg.signalHandlers.mu must be locked. +func (t *Task) dequeueSignalLocked() *arch.SignalInfo { + if info := t.pendingSignals.dequeue(t.tr.SignalMask); info != nil { + return info + } + if info := t.tg.pendingSignals.dequeue(t.tr.SignalMask); info != nil { + return info + } + return nil +} + +// TakeSignal returns a pending signal not blocked by mask. Signal handlers are +// not affected. If there are no pending signals not blocked by mask, +// TakeSignal returns a nil SignalInfo. +func (t *Task) TakeSignal(mask linux.SignalSet) *arch.SignalInfo { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if info := t.pendingSignals.dequeue(mask); info != nil { + return info + } + if info := t.tg.pendingSignals.dequeue(mask); info != nil { + return info + } + return nil +} + +// discardSpecificLocked removes all instances of the given signal from all +// signal queues in tg. +// +// Preconditions: The signal mutex must be locked. 
+func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) { + tg.pendingSignals.discardSpecific(sig) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.pendingSignals.discardSpecific(sig) + } +} + +// PendingSignals returns the set of pending signals. +func (t *Task) PendingSignals() linux.SignalSet { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet +} + +// deliverSignal delivers the given signal and returns the following run state. +func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState { + sigact := computeAction(linux.Signal(info.Signo), act) + + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + // Signals that are ignored, cause a thread group stop, or + // terminate the thread group do not interact with interrupted + // syscalls; in Linux terms, they are never returned to the signal + // handling path from get_signal => get_signal_to_deliver. The + // behavior of an interrupted syscall is determined by the first + // signal that is actually handled (by userspace). + if sigact == SignalActionHandler { + switch { + case sre == ERESTARTNOHAND: + fallthrough + case sre == ERESTART_RESTARTBLOCK: + fallthrough + case (sre == ERESTARTSYS && !act.IsRestart()): + t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1))) + default: + t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().RestartSyscall() + } + } + } + } + + switch sigact { + case SignalActionTerm, SignalActionCore: + // "Default action is to terminate the process." 
- signal(7) + t.Debugf("Signal %d: terminating thread group", info.Signo) + t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) + return (*runExit)(nil) + + case SignalActionStop: + // "Default action is to stop the process." + t.initiateGroupStop(info) + + case SignalActionIgnore: + // "Default action is to ignore the signal." + t.Debugf("Signal %d: ignored", info.Signo) + + case SignalActionHandler: + // Try to deliver the signal to the user-configured handler. + t.Debugf("Signal %d: delivering to handler", info.Signo) + if err := t.deliverSignalToHandler(info, act); err != nil { + t.Warningf("Failed to deliver signal %+v to user handler: %v", info, err) + // Send a forced SIGSEGV. If the signal that couldn't be delivered + // was a SIGSEGV, force the handler to SIG_DFL. + t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + } + + default: + panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act))) + } + return (*runInterrupt)(nil) +} + +// deliverSignalToHandler changes the task's userspace state to enter the given +// user-configured handler for the given signal. +func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error { + // Signal delivery to an application handler interrupts restartable + // sequences. + t.rseqInterrupt() + + // Are executing on the main stack, + // or the provided alternate stack? + sp := usermem.Addr(t.Arch().Stack()) + + // N.B. This is a *copy* of the alternate stack that the user's signal + // handler expects to see in its ucontext (even if it's not in use). + alt := t.signalStack + if act.IsOnStack() && alt.IsEnabled() { + alt.SetOnStack() + if !t.OnSignalStack(alt) { + sp = usermem.Addr(alt.Top()) + } + } + + // Set up the signal handler. If we have a saved signal mask, the signal + // handler should run with the current mask, but sigreturn should restore + // the saved one. 
+ st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + mask := t.tr.SignalMask + if t.haveSavedSignalMask { + mask = t.savedSignalMask + } + if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { + return err + } + t.haveSavedSignalMask = false + + // Add our signal mask. + newMask := t.tr.SignalMask | act.Mask + if !act.IsNoDefer() { + newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) + } + t.SetSignalMask(newMask) + + return nil +} + +var ctrlResume = &SyscallControl{ignoreReturn: true} + +// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if +// rt is true). +func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { + st := t.Stack() + sigset, err := t.Arch().SignalRestore(st, rt) + if err != nil { + return nil, err + } + + // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. + t.SetSignalMask(sigset &^ UnblockableSignals) + + // TODO: sys_rt_sigreturn also calls restore_altstack from + // uc.stack, allowing the signal handler to implicitly mutate the signal + // stack. + + return ctrlResume, nil +} + +// SendSignal sends the given signal to t. +// +// The following errors may be returned: +// +// syserror.ESRCH - The task has exited. +// syserror.EINVAL - The signal is not valid. +// syserror.EAGAIN - THe signal is realtime, and cannot be queued. +// +func (t *Task) SendSignal(info *arch.SignalInfo) error { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.sendSignalLocked(info, false /* group */) +} + +// SendGroupSignal sends the given signal to t's thread group. 
+func (t *Task) SendGroupSignal(info *arch.SignalInfo) error { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.sendSignalLocked(info, true /* group */) +} + +// SendSignal sends the given signal to tg, using tg's leader to determine if +// the signal is blocked. +func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.leader.sendSignalLocked(info, true /* group */) +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) onCPULocked(includeSys bool) bool { + // Task is exiting. + if t.exitState != TaskExitNone { + return false + } + + switch t.TaskGoroutineSchedInfo().State { + case TaskGoroutineRunningSys: + return includeSys + case TaskGoroutineRunningApp: + return true + default: + return false + } +} + +// SendTimerSignal mimics the process timer signal delivery behavior in linux: +// signals are delivered to the thread that triggers the timer expiration (see +// kernel/time/posix-cpu-timers.c:check_process_timers(). This +// means +// 1) the thread is running on cpu at the time. +// 2) a thread runs more frequently will get more of those signals. +// +// We approximate this behavior by selecting a running task in a round-robin +// fashion. Statistically, a thread running more often should have a higher +// probability to be selected. +func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // Find the next running threads. + var t *Task + if tg.lastTimerSignalTask == nil { + t = tg.tasks.Front() + } else { + t = tg.lastTimerSignalTask.Next() + } + + // Iterate from lastTimerSignalTask.Next() to the last task in the task list. 
+ for t != nil { + if t.onCPULocked(includeSys) { + tg.lastTimerSignalTask = t + return t.sendSignalLocked(info, true /* group */) + } + t = t.Next() + } + + // t is nil when we reach here. If lastTimerSignalTask is not nil, iterate + // from Front to lastTimerSignalTask. + if tg.lastTimerSignalTask != nil { + for t := tg.tasks.Front(); t != tg.lastTimerSignalTask.Next(); t = t.Next() { + if t.onCPULocked(includeSys) { + tg.lastTimerSignalTask = t + return t.sendSignalLocked(info, true /* group */) + } + } + } + + // No running threads? Just try the leader. + tg.lastTimerSignalTask = tg.leader + return tg.leader.sendSignalLocked(info, true /* group */) +} + +func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { + if t.exitState == TaskExitDead { + return syserror.ESRCH + } + sig := linux.Signal(info.Signo) + if sig == 0 { + return nil + } + if !sig.IsValid() { + return syserror.EINVAL + } + + // Signal side effects apply even if the signal is ultimately discarded. + t.tg.applySignalSideEffectsLocked(sig) + + // TODO: "Only signals for which the "init" process has established a + // signal handler can be sent to the "init" process by other members of the + // PID namespace. This restriction applies even to privileged processes, + // and prevents other members of the PID namespace from accidentally + // killing the "init" process." - pid_namespaces(7). We don't currently do + // this for child namespaces, though we should; we also don't do this for + // the root namespace (the same restriction applies to global init on + // Linux), where whether or not we should is much murkier. In practice, + // most sandboxed applications are not prepared to function as an init + // process. + + // Unmasked, ignored signals are discarded without being queued, unless + // they will be visible to a tracer. 
Even for group signals, it's the + // originally-targeted task's signal mask and tracer that matter; compare + // Linux's kernel/signal.c:__send_signal() => prepare_signal() => + // sig_ignored(). + ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore + if linux.SignalSetOf(sig)&t.tr.SignalMask == 0 && ignored && !t.hasTracer() { + t.Debugf("Discarding ignored signal %d", sig) + return nil + } + + q := &t.pendingSignals + if group { + q = &t.tg.pendingSignals + } + if !q.enqueue(info) { + if sig.IsRealtime() { + return syserror.EAGAIN + } + t.Debugf("Discarding duplicate signal %d", sig) + return nil + } + + // Find a receiver to notify. Note that the task we choose to notify, if + // any, may not be the task that actually dequeues and handles the signal; + // e.g. a racing signal mask change may cause the notified task to become + // ineligible, or a racing sibling task may dequeue the signal first. + if t.canReceiveSignalLocked(sig) { + t.Debugf("Notified of signal %d", sig) + t.interrupt() + return nil + } + if group { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.Debugf("Notified of group signal %d", sig) + nt.interrupt() + return nil + } + } + t.Debugf("No task notified of signal %d", sig) + return nil +} + +func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { + switch { + case linux.SignalSetOf(sig)&StopSignals != 0: + // Stop signals cause all prior SIGCONT to be discarded. (This is + // despite the fact this has little effect since SIGCONT's most + // important effect is applied when the signal is sent in the branch + // below, not when the signal is delivered.) + tg.discardSpecificLocked(linux.SIGCONT) + case sig == linux.SIGCONT: + // "The SIGCONT signal has a side effect of waking up (all threads of) + // a group-stopped process. This side effect happens before + // signal-delivery-stop. 
The tracer can't suppress this side effect (it + // can only suppress signal injection, which only causes the SIGCONT + // handler to not be executed in the tracee, if such a handler is + // installed." - ptrace(2) + tg.endGroupStopLocked(true) + case sig == linux.SIGKILL: + // "SIGKILL does not generate signal-delivery-stop and therefore the + // tracer can't suppress it. SIGKILL kills even within system calls + // (syscall-exit-stop is not generated prior to death by SIGKILL)." - + // ptrace(2) + // + // Note that this differs from ThreadGroup.requestExit in that it + // ignores tg.execing. + if !tg.exiting { + tg.exiting = true + tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)} + } + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.killLocked() + } + } +} + +// canReceiveSignalLocked returns true if t should be interrupted to receive +// the given signal. canReceiveSignalLocked is analogous to Linux's +// kernel/signal.c:wants_signal(), but see below for divergences. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { + // - Do not choose tasks that are blocking the signal. + if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 { + return false + } + // - No need to check Task.exitState, as the exit path sets every bit in the + // signal mask when it transitions from TaskExitNone to TaskExitInitiated. + // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the + // task group via applySignalSideEffects => killLocked. + // - Do not choose stopped tasks, which cannot handle signals. + if t.stop != nil { + return false + } + // - TODO: No special case for when t is also the sending task, + // because the identity of the sender is unknown. + // - Do not choose tasks that have already been interrupted, as they may be + // busy handling another signal. 
+ if len(t.interruptChan) != 0 { + return false + } + return true +} + +// findSignalReceiverLocked returns a task in tg that should be interrupted to +// receive the given signal. If no such task exists, findSignalReceiverLocked +// returns nil. +// +// Linux actually records curr_target to balance the group signal targets. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.canReceiveSignalLocked(sig) { + return t + } + } + return nil +} + +// forceSignal ensures that the task is not ignoring or blocking the given +// signal. If unconditional is true, forceSignal takes action even if the +// signal isn't being ignored or blocked. +func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.forceSignalLocked(sig, unconditional) +} + +func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { + blocked := linux.SignalSetOf(sig)&t.tr.SignalMask != 0 + act := t.tg.signalHandlers.actions[sig] + ignored := act.Handler == arch.SignalActIgnore + if blocked || ignored || unconditional { + act.Handler = arch.SignalActDefault + t.tg.signalHandlers.actions[sig] = act + if blocked { + t.setSignalMaskLocked(t.tr.SignalMask &^ linux.SignalSetOf(sig)) + } + } +} + +// SignalMask returns a copy of t's signal mask. +func (t *Task) SignalMask() linux.SignalSet { + return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.tr.SignalMask))) +} + +// SetSignalMask sets t's signal mask. +// +// Preconditions: SetSignalMask can only be called by the task goroutine. +// t.exitState < TaskExitZombie. +func (t *Task) SetSignalMask(mask linux.SignalSet) { + // By precondition, t prevents t.tg from completing an execve and mutating + // t.tg.signalHandlers, so we can skip the TaskSet mutex. 
+ t.tg.signalHandlers.mu.Lock() + t.setSignalMaskLocked(mask) + t.tg.signalHandlers.mu.Unlock() +} + +// Preconditions: The signal mutex must be locked. +func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { + oldMask := t.tr.SignalMask + atomic.StoreUint64((*uint64)(&t.tr.SignalMask), uint64(mask)) + + // If the new mask blocks any signals that were not blocked by the old + // mask, and at least one such signal is pending in tg.pendingSignals, and + // t has been woken, it could be the case that t was woken to handle that + // signal, but will no longer do so as a result of its new signal mask, so + // we have to pick a replacement. + blocked := mask &^ oldMask + blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet + if blockedGroupPending != 0 && t.interrupted() { + linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.interrupt() + return + } + }) + // We have to re-issue the interrupt consumed by t.interrupted() since + // it might have been for a different reason. + t.interruptSelf() + } + + // Conversely, if the new mask unblocks any signals that were blocked by + // the old mask, and at least one such signal is pending, we may now need + // to handle that signal. + unblocked := oldMask &^ mask + unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet) + if unblockedPending != 0 { + t.interruptSelf() + } +} + +// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's +// comment). +// +// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { + t.savedSignalMask = mask + t.haveSavedSignalMask = true +} + +// SignalStack returns the task-private signal stack. 
+func (t *Task) SignalStack() arch.SignalStack { + return t.signalStack +} + +// OnSignalStack returns true if, when the task resumes running, it will run on +// the task-private signal stack. +func (t *Task) OnSignalStack(s arch.SignalStack) bool { + sp := usermem.Addr(t.Arch().Stack()) + return usermem.Addr(s.Addr) <= sp && sp < usermem.Addr(s.Addr+s.Size) +} + +// SetSignalStack sets the task-private signal stack and clears the +// SignalStackFlagDisable, since we have a signal stack. +func (t *Task) SetSignalStack(alt arch.SignalStack) error { + // Mask out irrelevant parts: only disable matters. + alt.Flags &= arch.SignalStackFlagDisable + t.signalStack = alt + return nil +} + +// SetSignalAct atomically sets the thread group's signal action for signal sig +// to *actptr (if actptr is not nil) and returns the old signal action. +func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) { + if !sig.IsValid() { + return arch.SignalAct{}, syserror.EINVAL + } + + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + sh := tg.signalHandlers + sh.mu.Lock() + defer sh.mu.Unlock() + oldact := sh.actions[sig] + if actptr != nil { + if sig == linux.SIGKILL || sig == linux.SIGSTOP { + return oldact, syserror.EINVAL + } + + act := *actptr + act.Mask &^= UnblockableSignals + sh.actions[sig] = act + // From POSIX, by way of Linux: + // + // "Setting a signal action to SIG_IGN for a signal that is pending + // shall cause the pending signal to be discarded, whether or not it is + // blocked." + // + // "Setting a signal action to SIG_DFL for a signal that is pending and + // whose default action is to ignore the signal (for example, SIGCHLD), + // shall cause the pending signal to be discarded, whether or not it is + // blocked." 
+ if computeAction(sig, act) == SignalActionIgnore { + tg.discardSpecificLocked(sig) + } + } + return oldact, nil +} + +// CopyOutSignalAct converts the given SignalAct into an architecture-specific +// type and then copies it out to task memory. +func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { + n := t.Arch().NewSignalAct() + n.SerializeFrom(s) + _, err := t.CopyOut(addr, n) + return err +} + +// CopyInSignalAct copies an architecture-specific sigaction type from task +// memory and then converts it into a SignalAct. +func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { + n := t.Arch().NewSignalAct() + var s arch.SignalAct + if _, err := t.CopyIn(addr, n); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// CopyOutSignalStack converts the given SignalStack into an +// architecture-specific type and then copies it out to task memory. +func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { + n := t.Arch().NewSignalStack() + n.SerializeFrom(s) + _, err := t.CopyOut(addr, n) + return err +} + +// CopyInSignalStack copies an architecture-specific stack_t from task memory +// and then converts it into a SignalStack. +func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { + n := t.Arch().NewSignalStack() + var s arch.SignalStack + if _, err := t.CopyIn(addr, n); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// groupStop is a TaskStop placed on tasks that have received a stop signal +// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from +// the ptrace man page.) +type groupStop struct{} + +// Killable implements TaskStop.Killable. +func (*groupStop) Killable() bool { return true } + +type groupStopPhase int + +const ( + // groupStopNone indicates that a thread group is not in, or attempting to + // enter or leave, a group stop. 
+ groupStopNone groupStopPhase = iota + + // groupStopDequeued indicates that at least one task in a thread group has + // dequeued a stop signal (or dequeued any signal and entered a + // signal-delivery-stop as a result, which allows ptrace to change the + // signal into a stop signal), but temporarily dropped the signal mutex + // without initiating the group stop. + // + // groupStopDequeued is analogous to JOBCTL_STOP_DEQUEUED in Linux. + groupStopDequeued + + // groupStopInitiated indicates that a task in a thread group has initiated + // a group stop, but not all tasks in the thread group have acknowledged + // entering the group stop. + // + // groupStopInitiated is represented by JOBCTL_STOP_PENDING && + // !SIGNAL_STOP_STOPPED in Linux. + groupStopInitiated + + // groupStopComplete indicates that all tasks in a thread group have + // acknowledged entering the group stop, and the last one to do so has + // notified the thread group's parent. + // + // groupStopComplete is represented by JOBCTL_STOP_PENDING && + // SIGNAL_STOP_STOPPED in Linux. + groupStopComplete +) + +// initiateGroupStop attempts to initiate a group stop based on a +// previously-dequeued stop signal. +// +// Preconditions: The caller must be running on the task goroutine. 
+func (t *Task) initiateGroupStop(info *arch.SignalInfo) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.tg.groupStopPhase != groupStopDequeued { + t.Debugf("Signal %d: not stopping thread group: lost to racing signal", info.Signo) + return + } + if t.tg.exiting { + t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo) + return + } + if t.tg.execing != nil { + t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) + return + } + t.Debugf("Signal %d: stopping thread group", info.Signo) + t.tg.groupStopPhase = groupStopInitiated + t.tg.groupStopSignal = linux.Signal(info.Signo) + t.tg.groupStopCount = 0 + for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { + t2.groupStopRequired = true + t2.groupStopAcknowledged = false + t2.interrupt() + } +} + +// endGroupStopLocked ensures that all prior stop signals received by tg are +// not stopping tg and will not stop tg in the future. If broadcast is true, +// parent and tracer notification will be scheduled if appropriate. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { + // Discard all previously-queued stop signals. + linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) + + if tg.groupStopPhase != groupStopNone { + tg.leader.Debugf("Ending group stop currently in phase %d", tg.groupStopPhase) + if tg.groupStopPhase == groupStopInitiated || tg.groupStopPhase == groupStopComplete { + tg.groupStopSignal = 0 + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if _, ok := t.stop.(*groupStop); ok { + t.endInternalStopLocked() + } + } + if broadcast { + // Instead of notifying the parent here, set groupContNotify so + // that one of the continuing tasks does so. (Linux does + // something similar.) The reason we do this is to keep locking + // sane. 
In order to send a signal to the parent, we need to + // lock its signal mutex, but we're already holding tg's signal + // mutex, and the TaskSet mutex must be locked for writing for + // us to hold two signal mutexes. Since we don't want to + // require this for endGroupStopLocked (which is called from + // signal-sending paths), nor do we want to lose atomicity by + // releasing the mutexes we're already holding, just let the + // continuing thread group deal with it. + tg.groupContNotify = true + tg.groupContInterrupted = tg.groupStopPhase == groupStopInitiated + tg.groupContWaitable = true + } + } + // If groupStopPhase was groupStopDequeued, setting it to groupStopNone + // will cause following calls to initiateGroupStop to recognize that + // the group stop has been cancelled. + tg.groupStopPhase = groupStopNone + } +} + +// signalStop sends a signal to t's thread group of a new group stop, group +// continue, or ptrace stop, if appropriate. code and status are set in the +// signal sent to tg, if any. +// +// Preconditions: The TaskSet mutex must be locked (for reading or writing). +func (t *Task) signalStop(target *Task, code int32, status int32) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] + if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) { + sigchld := &arch.SignalInfo{ + Signo: int32(linux.SIGCHLD), + Code: code, + } + sigchld.SetPid(int32(t.tg.pidns.tids[target])) + sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetStatus(status) + // TODO: Set utime, stime. + t.sendSignalLocked(sigchld, true /* group */) + } +} + +// The runInterrupt state handles conditions indicated by interrupts. 
+type runInterrupt struct{} + +func (*runInterrupt) execute(t *Task) taskRunState { + // Interrupts are de-duplicated (if t is interrupted twice before + // t.interrupted() is called, t.interrupted() will only return true once), + // so early exits from this function must re-enter the runInterrupt state + // to check for more interrupt-signaled conditions. + + t.tg.signalHandlers.mu.Lock() + + // Did we just leave a group stop? + if t.tg.groupContNotify { + t.tg.groupContNotify = false + sig := t.tg.groupStopSignal + intr := t.tg.groupContInterrupted + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.RLock() + // For consistency with Linux, if the parent and (thread group + // leader's) tracer are in the same thread group, deduplicate + // notifications. + notifyParent := t.tg.leader.parent != nil + if tracer := t.tg.leader.ptraceTracer.Load().(*Task); tracer != nil { + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + } + // Sending CLD_STOPPED to the tracer doesn't really make any sense; + // the thread group leader may have already entered the stop and + // notified its tracer accordingly. But it's consistent with + // Linux... + if intr { + tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + if !notifyParent { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) + } else { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) + } + } else { + tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + tracer.tg.eventQueue.Notify(EventGroupContinue) + } + } + if notifyParent { + // If groupContInterrupted, do as Linux does and pretend the group + // stop completed just before it ended. The theoretical behavior in + // this case would be to send a SIGCHLD indicating the completed + // stop, followed by a SIGCHLD indicating the continue. However, + // SIGCHLD is a standard signal, so the latter would always be + // dropped. 
Hence sending only the former is equivalent. + if intr { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) + } else { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) + } + } + t.tg.pidns.owner.mu.RUnlock() + return (*runInterrupt)(nil) + } + + // Do we need to enter a group stop? + if t.groupStopRequired { + t.groupStopRequired = false + sig := t.tg.groupStopSignal + notifyParent := false + if !t.groupStopAcknowledged { + t.groupStopAcknowledged = true + t.tg.groupStopCount++ + if t.tg.groupStopCount == t.tg.activeTasks { + t.Debugf("Completing group stop") + notifyParent = true + t.tg.groupStopPhase = groupStopComplete + t.tg.groupStopWaitable = true + t.tg.groupContNotify = false + t.tg.groupContWaitable = false + } + } + // Drop the signal mutex so we can take the TaskSet mutex. + t.tg.signalHandlers.mu.Unlock() + + t.tg.pidns.owner.mu.RLock() + if t.tg.leader.parent == nil { + notifyParent = false + } + if tracer := t.Tracer(); tracer != nil { + t.ptraceCode = int32(sig) + t.ptraceSiginfo = nil + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) + // For consistency with Linux, if the parent and tracer are in the + // same thread group, deduplicate notification signals. 
+ if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop) + } else { + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + } + } else { + t.tg.signalHandlers.mu.Lock() + if !t.killedLocked() { + t.beginInternalStopLocked((*groupStop)(nil)) + } + t.tg.signalHandlers.mu.Unlock() + } + if notifyParent { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + t.tg.pidns.owner.mu.RUnlock() + + return (*runInterrupt)(nil) + } + + // Are there signals pending? + if info := t.dequeueSignalLocked(); info != nil { + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 && t.tg.groupStopPhase == groupStopNone { + // Indicate that we've dequeued a stop signal before + // unlocking the signal mutex; initiateGroupStop will check + // that the phase hasn't changed (or is at least another + // "stop signal dequeued" phase) after relocking it. + t.tg.groupStopPhase = groupStopDequeued + } + if t.ptraceSignalLocked(info) { + // Dequeueing the signal action must wait until after the + // signal-delivery-stop ends since the tracer can change or + // suppress the signal. + t.tg.signalHandlers.mu.Unlock() + return (*runInterruptAfterSignalDeliveryStop)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) + } + + t.tg.signalHandlers.mu.Unlock() + return (*runApp)(nil) +} + +type runInterruptAfterSignalDeliveryStop struct{} + +func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + // Can't defer unlock: deliverSignal must be called without holding TaskSet + // mutex. 
+ sig := linux.Signal(t.ptraceCode) + defer func() { + t.ptraceSiginfo = nil + }() + if !sig.IsValid() { + t.tg.pidns.owner.mu.Unlock() + return (*runInterrupt)(nil) + } + info := t.ptraceSiginfo + if sig != linux.Signal(info.Signo) { + info.Signo = int32(sig) + info.Errno = 0 + info.Code = arch.SignalInfoUser + // pid isn't a valid field for all signal numbers, but Linux + // doesn't care (kernel/signal.c:ptrace_signal()). + // + // Linux uses t->parent for the tid and uid here, which is the tracer + // if it hasn't detached or the real parent otherwise. + parent := t.parent + if tracer := t.Tracer(); tracer != nil { + parent = tracer + } + if parent == nil { + // Tracer has detached and t was created by Kernel.CreateProcess(). + // Pretend the parent is in an ancestor PID + user namespace. + info.SetPid(0) + info.SetUid(int32(auth.OverflowUID)) + } else { + info.SetPid(int32(t.tg.pidns.tids[parent])) + info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } + } + t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.Unlock() + // If the signal is masked, re-queue it. + if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 { + t.sendSignalLocked(info, false /* group */) + t.tg.signalHandlers.mu.Unlock() + return (*runInterrupt)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) +} diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go new file mode 100644 index 000000000..801cb3395 --- /dev/null +++ b/pkg/sentry/kernel/task_start.go @@ -0,0 +1,252 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TaskConfig defines the configuration of a new Task (see below). +type TaskConfig struct { + // Kernel is the owning Kernel. + *Kernel + + // Parent is the new task's parent. Parent may be nil. + Parent *Task + + // ThreadGroup is the ThreadGroup the new task belongs to. + *ThreadGroup + + // TaskContext is the TaskContext of the new task. + *TaskContext + + // TaskResources is the TaskResources of the new task. + *TaskResources + + // Credentials is the Credentials of the new task. + Credentials *auth.Credentials + + // Niceness is the niceness of the new task. + Niceness int + + // If NetworkNamespaced is true, the new task should observe a non-root + // network namespace. + NetworkNamespaced bool + + // AllowedCPUMask contains the cpus that this task can run on. + AllowedCPUMask sched.CPUSet + + // UTSNamespace is the UTSNamespace of the new task. + UTSNamespace *UTSNamespace + + // IPCNamespace is the IPCNamespace of the new task. + IPCNamespace *IPCNamespace +} + +// NewTask creates a new task defined by TaskConfig. +// Whether or not NewTask is successful, it takes ownership of both TaskContext +// and TaskResources of the TaskConfig. 
+// +// NewTask does not start the returned task; the caller must call Task.Start. +func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { + t, err := ts.newTask(cfg) + if err != nil { + cfg.TaskContext.release() + cfg.TaskResources.release() + return nil, err + } + return t, nil +} + +// newTask is a helper for TaskSet.NewTask that only takes ownership of TaskContext +// and TaskResources of the TaskConfig if it succeeds. +func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { + tg := cfg.ThreadGroup + tc := cfg.TaskContext + t := &Task{ + taskNode: taskNode{ + tg: tg, + parent: cfg.Parent, + children: make(map[*Task]struct{}), + }, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + tr: *cfg.TaskResources, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + creds: cfg.Credentials, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespaced, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + rseqCPU: -1, + futexWaiter: futex.NewWaiter(), + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.ptraceTracer.Store((*Task)(nil)) + // We don't construct t.blockingTimer until Task.run(); see that function + // for justification. + + // Make the new task (and possibly thread group) visible to the rest of + // the system atomically. + ts.mu.Lock() + defer ts.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting || tg.execing != nil { + // If the caller is in the same thread group, then what we return + // doesn't matter too much since the caller will exit before it returns + // to userspace. If the caller isn't in the same thread group, then + // we're in uncharted territory and can return whatever we want. 
+ return nil, syserror.EINTR + } + if err := ts.assignTIDsLocked(t); err != nil { + return nil, err + } + // Below this point, newTask is expected not to fail (there is no rollback + // of assignTIDsLocked or any of the following). + + // Logging on t's behalf will panic if t.logPrefix hasn't been initialized. + // This is the earliest point at which we can do so (since t now has thread + // IDs). + t.updateLogPrefixLocked() + + if t.parent != nil { + t.parent.children[t] = struct{}{} + } + + if tg.leader == nil { + // New thread group. + tg.leader = t + if parentPG := tg.parentPG(); parentPG == nil { + tg.createSession() + } else { + // Inherit the process group. + parentPG.incRefWithParent(parentPG) + tg.processGroup = parentPG + } + } + tg.tasks.PushBack(t) + tg.tasksCount++ + tg.liveTasks++ + tg.activeTasks++ + + // Propagate external TaskSet stops to the new task. + t.stopCount = ts.stopCount + + t.mu.Lock() + defer t.mu.Unlock() + + t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) + + t.startTime = t.k.RealtimeClock().Now() + + return t, nil +} + +// assignTIDsLocked ensures that new task t is visible in all PID namespaces in +// which it should be visible. +// +// Preconditions: ts.mu must be locked for writing. +func (ts *TaskSet) assignTIDsLocked(t *Task) error { + type allocatedTID struct { + ns *PIDNamespace + tid ThreadID + } + var allocatedTIDs []allocatedTID + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid, err := ns.allocateTID() + if err != nil { + // Failure. Remove the tids we already allocated in descendant + // namespaces. + for _, a := range allocatedTIDs { + delete(a.ns.tasks, a.tid) + delete(a.ns.tids, t) + } + return err + } + ns.tasks[tid] = t + ns.tids[t] = tid + allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) + } + return nil +} + +// allocateTID returns an unused ThreadID from ns. +// +// Preconditions: ns.owner.mu must be locked for writing. 
+func (ns *PIDNamespace) allocateTID() (ThreadID, error) { + if ns.exiting { + // "In this case, a subsequent fork(2) into this PID namespace will + // fail with the error ENOMEM; it is not possible to create a new + // processes [sic] in a PID namespace whose init process has + // terminated." - pid_namespaces(7) + return 0, syserror.ENOMEM + } + tid := ns.last + for { + // Next. + tid++ + if tid > TasksLimit { + tid = InitTID + 1 + } + + // Is it available? + _, ok := ns.tasks[tid] + if !ok { + ns.last = tid + return tid, nil + } + + // Did we do a full cycle? + if tid == ns.last { + // No tid available. + return 0, syserror.EAGAIN + } + } +} + +// Start starts the task goroutine. Start must be called exactly once for each +// task returned by NewTask. +// +// 'tid' must be the task's TID in the root PID namespace and it's used for +// debugging purposes only (set as parameter to Task.run to make it visible +// in stack dumps). +func (t *Task) Start(tid ThreadID) { + // If the task was restored, it may be "starting" after having already exited. + if t.runState == nil { + return + } + t.goroutineStopped.Add(1) + t.tg.liveGoroutines.Add(1) + t.tg.pidns.owner.liveGoroutines.Add(1) + t.tg.pidns.owner.runningGoroutines.Add(1) + + // Task is now running in system mode. + t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) + + // Use the task's TID in the root PID namespace to make it visible in stack dumps. + go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops +} diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go new file mode 100644 index 000000000..feaf6cae4 --- /dev/null +++ b/pkg/sentry/kernel/task_stop.go @@ -0,0 +1,226 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements task stops, which represent the equivalent of Linux's
+// uninterruptible sleep states in a way that is compatible with save/restore.
+// Task stops comprise both internal stops (which form part of the task's
+// "normal" control flow) and external stops (which do not); see README.md for
+// details.
+//
+// There are multiple interfaces for interacting with stops because there are
+// multiple cases to consider:
+//
+// - A task goroutine can begin a stop on its associated task (e.g. a
+// vfork() syscall stopping the calling task until the child task releases its
+// MM). In this case, calling Task.interrupt is both unnecessary (the task
+// goroutine obviously cannot be blocked in Task.block or executing application
+// code) and undesirable (as it may spuriously interrupt an in-progress
+// syscall).
+//
+// Beginning internal stops in this case is implemented by
+// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing,
+// there are no instances of this case that begin external stops, except for
+// autosave; however, autosave terminates the sentry without ending the
+// external stop, so the spurious interrupt is moot.
+//
+// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all
+// tasks being stopped in preparation for state checkpointing). 
If the task +// goroutine may be in Task.block or executing application code, it must be +// interrupted by Task.interrupt for it to actually enter the stop; since, +// strictly speaking, we have no way of determining this, we call +// Task.interrupt unconditionally. +// +// Beginning external stops in this case is implemented by +// Task.BeginExternalStop. As of this writing, there are no instances of this +// case that begin internal stops. +// +// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an +// exiting task resuming a sibling task that has been blocked in an execve() +// syscall waiting for other tasks to exit). In this case, Task.endStopCond +// must be notified to kick the task goroutine out of Task.doStop. +// +// Ending internal stops in this case is implemented by +// Task.endInternalStopLocked. Ending external stops in this case is +// implemented by Task.EndExternalStop. +// +// - Hypothetically, a task goroutine can end an internal stop on its +// associated task. As of this writing, there are no instances of this case. +// However, any instances of this case could still use the above functions, +// since notifying Task.endStopCond would be unnecessary but harmless. + +import ( + "fmt" + "sync/atomic" +) + +// A TaskStop is a condition visible to the task control flow graph that +// prevents a task goroutine from running or exiting, i.e. an internal stop. +// +// NOTE: Most TaskStops don't contain any data; they're +// distinguished by their type. The obvious way to implement such a TaskStop +// is: +// +// type groupStop struct{} +// func (groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop(groupStop{}) +// +// However, this doesn't work because the state package can't serialize values, +// only pointers. 
Furthermore, the correctness of save/restore depends on the +// ability to pass a TaskStop to endInternalStop that will compare equal to the +// TaskStop that was passed to beginInternalStop, even if a save/restore cycle +// occurred between the two. As a result, the current idiom is to always use a +// typecast nil for data-free TaskStops: +// +// type groupStop struct{} +// func (*groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop((*groupStop)(nil)) +// +// This is pretty gross, but the alternatives seem grosser. +type TaskStop interface { + // Killable returns true if Task.Kill should end the stop prematurely. + // Killable is analogous to Linux's TASK_WAKEKILL. + Killable() bool +} + +// beginInternalStop indicates the start of an internal stop that applies to t. +// +// Preconditions: The task must not already be in an internal stop (i.e. t.stop +// == nil). The caller must be running on the task goroutine. +func (t *Task) beginInternalStop(s TaskStop) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginInternalStopLocked(s) +} + +// Preconditions: The signal mutex must be locked. All preconditions for +// Task.beginInternalStop also apply. +func (t *Task) beginInternalStopLocked(s TaskStop) { + if t.stop != nil { + panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) + } + t.Debugf("Entering internal stop %#v", s) + t.stop = s + t.beginStopLocked() +} + +// endInternalStopLocked indicates the end of an internal stop that applies to +// t. endInternalStopLocked does not wait for the task to resume. +// +// The caller is responsible for ensuring that the internal stop they expect +// actually applies to t; this requires holding the signal mutex which protects +// t.stop, which is why there is no endInternalStop that locks the signal mutex +// for you. 
+//
+// Preconditions: The signal mutex must be locked. The task must be in an
+// internal stop (i.e. t.stop != nil).
+func (t *Task) endInternalStopLocked() {
+	if t.stop == nil {
+		panic("Attempting to leave non-existent internal stop")
+	}
+	t.Debugf("Leaving internal stop %#v", t.stop)
+	t.stop = nil
+	t.endStopLocked()
+}
+
+// BeginExternalStop indicates the start of an external stop that applies to t.
+// BeginExternalStop does not wait for t's task goroutine to stop.
+func (t *Task) BeginExternalStop() {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	t.beginStopLocked()
+	t.interrupt()
+}
+
+// EndExternalStop indicates the end of an external stop started by a previous
+// call to Task.BeginExternalStop. EndExternalStop does not wait for t's task
+// goroutine to resume.
+func (t *Task) EndExternalStop() {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	t.endStopLocked()
+}
+
+// beginStopLocked increments t.stopCount to indicate that a new internal or
+// external stop applies to t.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) beginStopLocked() {
+	if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 {
+		// Most likely overflow.
+		panic(fmt.Sprintf("Invalid stopCount: %d", newval))
+	}
+}
+
+// endStopLocked decrements t.stopCount to indicate that an existing internal
+// or external stop no longer applies to t.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) endStopLocked() {
+	if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 {
+		panic(fmt.Sprintf("Invalid stopCount: %d", newval))
+	} else if newval == 0 {
+		t.endStopCond.Signal()
+	}
+}
+
+// BeginExternalStop indicates the start of an external stop that applies to
+// all current and future tasks in ts. 
BeginExternalStop does not wait for +// task goroutines to stop. +func (ts *TaskSet) BeginExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount++ + if ts.stopCount <= 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.beginStopLocked() + t.tg.signalHandlers.mu.Unlock() + t.interrupt() + } +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task +// goroutines to resume. +func (ts *TaskSet) EndExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount-- + if ts.stopCount < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.endStopLocked() + t.tg.signalHandlers.mu.Unlock() + } +} diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go new file mode 100644 index 000000000..79f4ff60c --- /dev/null +++ b/pkg/sentry/kernel/task_syscall.go @@ -0,0 +1,434 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kernel + +import ( + "fmt" + "os" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SyscallRestartErrno represents an ERESTART* errno defined in the Linux kernel's +// include/linux/errno.h. These errnos are never returned to userspace +// directly, but are used to communicate the expected behavior of an +// interrupted syscall from the syscall to signal handling. +type SyscallRestartErrno int + +// These numeric values are significant because ptrace syscall exit tracing can +// observe them. +// +// For all of the following errnos, if the syscall is not interrupted by a +// signal delivered to a user handler, the syscall is restarted. +const ( + // ERESTARTSYS is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler without SA_RESTART set, and restarted otherwise. + ERESTARTSYS = SyscallRestartErrno(512) + + // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it + // should always be restarted. + ERESTARTNOINTR = SyscallRestartErrno(513) + + // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler, and restarted otherwise. + ERESTARTNOHAND = SyscallRestartErrno(514) + + // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate + // that it should be restarted using a custom function. The interrupted + // syscall must register a custom restart function by calling + // Task.SetRestartSyscallFn. + ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) +) + +// Error implements error.Error. 
+func (e SyscallRestartErrno) Error() string { + // Descriptions are borrowed from strace. + switch e { + case ERESTARTSYS: + return "to be restarted if SA_RESTART is set" + case ERESTARTNOINTR: + return "to be restarted" + case ERESTARTNOHAND: + return "to be restarted if no handler" + case ERESTART_RESTARTBLOCK: + return "interrupted by signal" + default: + return "(unknown interrupt error)" + } +} + +// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by +// rv, the value in a syscall return register. +func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { + switch int(rv) { + case -int(ERESTARTSYS): + return ERESTARTSYS, true + case -int(ERESTARTNOINTR): + return ERESTARTNOINTR, true + case -int(ERESTARTNOHAND): + return ERESTARTNOHAND, true + case -int(ERESTART_RESTARTBLOCK): + return ERESTART_RESTARTBLOCK, true + default: + return 0, false + } +} + +// SyscallRestartBlock represents the restart block for a syscall restartable +// with a custom function. It encapsulates the state required to restart a +// syscall across a S/R. +type SyscallRestartBlock interface { + Restart(t *Task) (uintptr, error) +} + +// SyscallControl is returned by syscalls to control the behavior of +// Task.doSyscallInvoke. +type SyscallControl struct { + // next is the state that the task goroutine should switch to. If next is + // nil, the task goroutine should continue to syscall exit as usual. + next taskRunState + + // If ignoreReturn is true, Task.doSyscallInvoke should not store any value + // in the task's syscall return value register. + ignoreReturn bool +} + +var ( + // CtrlDoExit is returned by the implementations of the exit and exit_group + // syscalls to enter the task exit path directly, skipping syscall exit + // tracing. + CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} + + // ctrlStopAndReinvokeSyscall is returned by syscalls using the external + // feature before syscall execution. 
This causes Task.doSyscallInvoke + // to return runSyscallReinvoke, allowing Task.run to check for stops + // before immediately re-invoking the syscall (skipping the re-checking + // of seccomp filters and ptrace which would confuse userspace + // tracing). + ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true} + + // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at + // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather + // than tail-calling it, allowing stops to be checked before syscall exit. + ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)} +) + +func (t *Task) invokeExternal() { + t.BeginExternalStop() + go func() { // S/R-SAFE: External control flow. + defer t.EndExternalStop() + t.SyscallTable().External(t.Kernel()) + }() +} + +func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) { + s := t.SyscallTable() + + fe := s.FeatureEnable.Word(sysno) + + var straceContext interface{} + if bits.IsAnyOn32(fe, StraceEnableBits) { + straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe) + } + + if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) { + t.invokeExternal() + // Ensure we check for stops, then invoke the syscall again. + ctrl = ctrlStopAndReinvokeSyscall + } else { + fn := s.Lookup(sysno) + if fn != nil { + // Call our syscall implementation. + rval, ctrl, err = fn(t, args) + } else { + // Use the missing function if not found. + rval, err = t.SyscallTable().Missing(t, sysno, args) + } + } + + if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { + t.invokeExternal() + // Don't reinvoke the syscall. 
+ } + + if bits.IsAnyOn32(fe, StraceEnableBits) { + s.Stracer.SyscallExit(straceContext, t, sysno, rval, err) + } + + return +} + +// doSyscall is the entry point for an invocation of a system call specified by +// the current state of t's registers. +// +// The syscall path is very hot; avoid defer. +func (t *Task) doSyscall() taskRunState { + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + + // Tracers expect to see this between when the task traps into the kernel + // to perform a syscall and when the syscall is actually invoked. + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + + // Check seccomp filters. The nil check is for performance (as seccomp use + // is rare), not needed for correctness. + if t.syscallFilters != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { + case seccompResultDeny: + t.Debugf("Syscall %d: denied by seccomp", sysno) + return (*runSyscallExit)(nil) + case seccompResultAllow: + // ok + case seccompResultKill: + t.Debugf("Syscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + case seccompResultTrace: + t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) + return (*runSyscallAfterPtraceEventSeccomp)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doSyscallEnter(sysno, args) +} + +type runSyscallAfterPtraceEventSeccomp struct{} + +func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." - + // ptrace(2) + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "The tracer can skip the system call by changing the syscall number to + // -1." 
- Documentation/prctl/seccomp_filter.txt + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil).execute(t) + } + args := t.Arch().SyscallArgs() + return t.doSyscallEnter(sysno, args) +} + +func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState { + if next, ok := t.ptraceSyscallEnter(); ok { + return next + } + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallAfterSyscallEnterStop struct{} + +func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(sigPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil) + } + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallAfterSysemuStop struct{} + +func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(sigPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + return (*runSyscallExit)(nil).execute(t) +} + +func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + + if ctrl != nil { + if !ctrl.ignoreReturn { + t.Arch().SetReturn(rval) + } + if ctrl.next != nil { + return ctrl.next + } + } else if err != nil { + t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + t.haveSyscallReturn = true + } else { + t.Arch().SetReturn(rval) + } + + return (*runSyscallExit)(nil).execute(t) +} + +type runSyscallReinvoke struct{} + +func (*runSyscallReinvoke) execute(t *Task) taskRunState { + if t.killed() { + // It's possible that since the last execution, the task has + // been forcibly killed. 
Invoking the system call here could + // result in an infinite loop if it is again preempted by an + // external stop and reinvoked. + return (*runInterrupt)(nil) + } + + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallExit struct{} + +func (*runSyscallExit) execute(t *Task) taskRunState { + t.ptraceSyscallExit() + return (*runApp)(nil) +} + +// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as +// indicated by an execution fault at address addr. doVsyscall returns the +// task's next run state. +func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { + // Grab the caller up front, to make sure there's a sensible stack. + caller := t.Arch().Native(uintptr(0)) + if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { + t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + return (*runApp)(nil) + } + + // For _vsyscalls_, there is no need to translate System V calling convention + // to syscall ABI because they both use RDI, RSI, and RDX for the first three + // arguments and none of the vsyscalls uses more than two arguments. 
+ args := t.Arch().SyscallArgs() + if t.syscallFilters != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { + case seccompResultDeny: + t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) + return (*runApp)(nil) + case seccompResultAllow: + // ok + case seccompResultTrace: + t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) + return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doVsyscallInvoke(sysno, args, caller) +} + +type runVsyscallAfterPtraceEventSeccomp struct { + addr usermem.Addr + sysno uintptr + caller interface{} +} + +func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "... the syscall may not be changed to another system call using the + // orig_rax register. It may only be changed to -1 order [sic] to skip the + // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - + // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip + // causes do_exit(SIGSYS), and changing sp is ignored. + if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr { + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + } + if sysno == ^uintptr(0) { + return (*runApp)(nil) + } + return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) +} + +func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + if ctrl != nil { + t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) + // Set the return value. The stack has already been adjusted. 
+ t.Arch().SetReturn(0) + } else if err == nil { + t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller)) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(uintptr(rval)) + } else { + t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) + if err == syserror.EFAULT { + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + // A return is not emulated in this case. + return (*runApp)(nil) + } + t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + } + t.Arch().SetIP(t.Arch().Value(caller)) + t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) + return (*runApp)(nil) +} + +// ExtractErrno extracts an integer error number from the error. +// The syscall number is purely for context in the error case. Use -1 if +// syscall number is unknown. +func (t *Task) ExtractErrno(err error, sysno int) int { + switch err := err.(type) { + case nil: + return 0 + case syscall.Errno: + return int(err) + case SyscallRestartErrno: + return int(err) + case *memmap.BusError: + // Bus errors may generate SIGBUS, but for syscalls they still + // return EFAULT. See case in task_run.go where the fault is + // handled (and the SIGBUS is delivered). + return int(syscall.EFAULT) + case *os.PathError: + return t.ExtractErrno(err.Err, sysno) + case *os.LinkError: + return t.ExtractErrno(err.Err, sysno) + case *os.SyscallError: + return t.ExtractErrno(err.Err, sysno) + default: + if errno, ok := syserror.TranslateError(err); ok { + return int(errno) + } + } + panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) +} diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go new file mode 100644 index 000000000..82ef858a1 --- /dev/null +++ b/pkg/sentry/kernel/task_test.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" +) + +func TestTaskCPU(t *testing.T) { + for _, test := range []struct { + mask sched.CPUSet + tid ThreadID + cpu int32 + }{ + { + mask: []byte{0xff}, + tid: 1, + cpu: 0, + }, + { + mask: []byte{0xff}, + tid: 10, + cpu: 1, + }, + { + // more than 8 cpus. + mask: []byte{0xff, 0xff}, + tid: 10, + cpu: 9, + }, + { + // missing the first cpu. + mask: []byte{0xfe}, + tid: 1, + cpu: 1, + }, + { + mask: []byte{0xfe}, + tid: 10, + cpu: 3, + }, + { + // missing the fifth cpu. + mask: []byte{0xef}, + tid: 10, + cpu: 2, + }, + } { + assigned := assignCPU(test.mask, test.tid) + if test.cpu != assigned { + t.Errorf("assignCPU(%v, %v) got %v, want %v", test.mask, test.tid, assigned, test.cpu) + } + } + +} diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go new file mode 100644 index 000000000..7a62ab674 --- /dev/null +++ b/pkg/sentry/kernel/task_usermem.go @@ -0,0 +1,298 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// _MAX_RW_COUNT is the maximum size in bytes of a single read or write. +// Reads and writes that exceed this size may be silently truncated. +// (Linux: include/linux/fs.h:MAX_RW_COUNT) +var _MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) + +// Activate ensures that the task has an active address space. +func (t *Task) Activate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Activate(); err != nil { + panic("unable to activate mm: " + err.Error()) + } + } +} + +// Deactivate relinquishes the task's active address space. +func (t *Task) Deactivate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Deactivate(); err != nil { + panic("unable to deactivate mm: " + err.Error()) + } + } +} + +// CopyIn copies a fixed-size value or slice of fixed-size values in from the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) { + return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInBytes is a fast version of CopyIn if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. 
+func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOut copies a fixed-size value or slice of fixed-size values out to the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not writeable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) { + return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOutBytes is a fast version of CopyOut if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInString copies a NUL-terminated string of length at most maxlen in from +// the task's memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { + return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInVector copies a NULL-terminated vector of strings from the task's +// memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// maxElemSize is the maximum size of each individual element. +// +// maxTotalSize is the maximum total length of all elements plus the total +// number of elements. 
For example, the following strings correspond to +// the following set of sizes: +// +// { "a", "b", "c" } => 6 (3 for lengths, 3 for elements) +// { "abc" } => 4 (3 for length, 1 for elements) +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) { + var v []string + for { + argAddr := t.Arch().Native(0) + if _, err := t.CopyIn(addr, argAddr); err != nil { + return v, err + } + if t.Arch().Value(argAddr) == 0 { + break + } + // Each string has a zero terminating byte counted, so copying out a string + // requires at least one byte of space. Also, see the calculation below. + if maxTotalSize <= 0 { + return nil, syserror.ENOMEM + } + thisMax := maxElemSize + if maxTotalSize < thisMax { + thisMax = maxTotalSize + } + arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax) + if err != nil { + return v, err + } + v = append(v, arg) + addr += usermem.Addr(t.Arch().Width()) + maxTotalSize -= len(arg) + 1 + } + return v, nil +} + +// CopyOutIovecs converts src to an array of struct iovecs and copies it to the +// memory mapped at addr. +// +// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the +// task goroutine. t's AddressSpace must be active. 
+func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { + return syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for ; !src.IsEmpty(); src = src.Tail() { + ar := src.Head() + usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) + usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) + if _, err := t.CopyOutBytes(addr, b); err != nil { + return err + } + addr += itemLen + } + + default: + return syserror.ENOSYS + } + + return nil +} + +// CopyInIovecs copies an array of numIovecs struct iovecs from the memory +// mapped at addr, converts them to usermem.AddrRanges, and returns them as a +// usermem.AddrRangeSeq. +// +// CopyInIovecs shares the following properties with Linux's +// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): +// +// - If the length of any AddrRange would exceed the range of an ssize_t, +// CopyInIovecs returns EINVAL. +// +// - If the length of any AddrRange would cause its end to overflow, +// CopyInIovecs returns EFAULT. +// +// - The combined length of all AddrRanges is limited to _MAX_RW_COUNT. If the +// combined length of all AddrRanges would otherwise exceed this amount, ranges +// beyond _MAX_RW_COUNT are silently truncated. +// +// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the +// task goroutine. t's AddressSpace must be active. 
+func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { + if numIovecs == 0 { + return usermem.AddrRangeSeq{}, nil + } + + var dst []usermem.AddrRange + if numIovecs > 1 { + dst = make([]usermem.AddrRange, 0, numIovecs) + } + + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for i := 0; i < numIovecs; i++ { + if _, err := t.CopyInBytes(addr, b); err != nil { + return usermem.AddrRangeSeq{}, err + } + + base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8])) + length := usermem.ByteOrder.Uint64(b[8:16]) + if length > math.MaxInt64 { + return usermem.AddrRangeSeq{}, syserror.EINVAL + } + ar, ok := base.ToRange(length) + if !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + if numIovecs == 1 { + // Special case to avoid allocating dst. + return usermem.AddrRangeSeqOf(ar).TakeFirst(_MAX_RW_COUNT), nil + } + dst = append(dst, ar) + + addr += itemLen + } + + default: + return usermem.AddrRangeSeq{}, syserror.ENOSYS + } + + // Truncate to _MAX_RW_COUNT. + var total uint64 + for i := range dst { + dstlen := uint64(dst[i].Length()) + if rem := uint64(_MAX_RW_COUNT) - total; rem < dstlen { + dst[i].End -= usermem.Addr(dstlen - rem) + dstlen = rem + } + total += dstlen + } + + return usermem.AddrRangeSeqFromSlice(dst), nil +} + +// SingleIOSequence returns a usermem.IOSequence representing [addr, +// addr+length) in t's address space. If length exceeds _MAX_RW_COUNT, it is +// silently truncated. +// +// SingleIOSequence is analogous to Linux's +// lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and +// write syscalls in Linux do not use import_single_range(), but are still +// truncated to _MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) 
+func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if length > _MAX_RW_COUNT { + length = _MAX_RW_COUNT + } + ar, ok := addr.ToRange(uint64(length)) + if !ok { + return usermem.IOSequence{}, syserror.EFAULT + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: usermem.AddrRangeSeqOf(ar), + Opts: opts, + }, nil +} + +// IovecsIOSequence returns a usermem.IOSequence representing the array of +// iovcnt struct iovecs at addr in t's address space. opts applies to the +// returned IOSequence, not the reading of the struct iovec array. +// +// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). +// +// Preconditions: As for Task.CopyInIovecs. +func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { + return usermem.IOSequence{}, syserror.EINVAL + } + ars, err := t.CopyInIovecs(addr, iovcnt) + if err != nil { + return usermem.IOSequence{}, err + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: ars, + Opts: opts, + }, nil +} diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go new file mode 100644 index 000000000..8fffd3446 --- /dev/null +++ b/pkg/sentry/kernel/thread_group.go @@ -0,0 +1,269 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kernel + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// A ThreadGroup is a logical grouping of tasks that has widespread +// significance to other kernel features (e.g. signal handling). ("Thread +// groups" are usually called "processes" in userspace documentation.) +// +// ThreadGroup is a superset of Linux's struct signal_struct. +type ThreadGroup struct { + threadGroupNode + + // signalHandlers is the set of signal handlers used by every task in this + // thread group. (signalHandlers may also be shared with other thread + // groups.) + // + // signalHandlers.mu (hereafter "the signal mutex") protects state related + // to signal handling, as well as state that usually needs to be atomic + // with signal handling, for all ThreadGroups and Tasks using + // signalHandlers. (This is analogous to Linux's use of struct + // sighand_struct::siglock.) + // + // The signalHandlers pointer can only be mutated during an execve + // (Task.finishExec). Consequently, when it's possible for a task in the + // thread group to be completing an execve, signalHandlers is protected by + // the owning TaskSet.mu. Otherwise, it is possible to read the + // signalHandlers pointer without synchronization. In particular, + // completing an execve requires that all other tasks in the thread group + // have exited, so task goroutines do not need the owning TaskSet.mu to + // read the signalHandlers pointer of their thread groups. + signalHandlers *SignalHandlers + + // pendingSignals is the set of pending signals that may be handled by any + // task in this thread group. + // + // pendingSignals is protected by the signal mutex. + pendingSignals pendingSignals + + // lastTimerSignalTask records the last task we deliver a process timer signal to. + // Please see SendTimerSignal for more details. 
+ // + // lastTimerSignalTask is protected by the signal mutex. + lastTimerSignalTask *Task + + // groupStopPhase indicates the state of a group stop in progress on the + // thread group, if any. + // + // groupStopPhase is protected by the signal mutex. + groupStopPhase groupStopPhase + + // groupStopSignal is the signal that caused a group stop to be initiated. + // groupStopSignal is only meaningful if groupStopPhase is + // groupStopInitiated or groupStopComplete. + // + // groupStopSignal is protected by the signal mutex. + groupStopSignal linux.Signal + + // groupStopCount is the number of non-exited tasks in the thread group + // that have acknowledged an initiated group stop. groupStopCount is only + // meaningful if groupStopPhase is groupStopInitiated. + // + // groupStopCount is protected by the signal mutex. + groupStopCount int + + // If groupStopWaitable is true, the thread group is indicating a waitable + // group stop event (as defined by EventChildGroupStop). + // + // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set + // and group_exit_code being non-zero. + // + // groupStopWaitable is protected by the signal mutex. + groupStopWaitable bool + + // If groupContNotify is true, then a SIGCONT has recently ended a group + // stop on this thread group, and the first task to observe it should + // notify its parent. + // + // groupContNotify is protected by the signal mutex. + groupContNotify bool + + // If groupContNotify is true, groupContInterrupted is true iff SIGCONT + // ended a group stop in phase groupStopInitiated. If groupContNotify is + // false, groupContInterrupted is meaningless. + // + // Analogues in Linux: + // + // - groupContNotify && groupContInterrupted is represented by + // SIGNAL_CLD_STOPPED. + // + // - groupContNotify && !groupContInterrupted is represented by + // SIGNAL_CLD_CONTINUED. + // + // - !groupContNotify is represented by neither flag being set. 
+ // + // groupContInterrupted is protected by the signal mutex. + groupContInterrupted bool + + // If groupContWaitable is true, the thread group is indicating a waitable + // continue event (as defined by EventGroupContinue). + // + // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED. + // + // groupContWaitable is protected by the signal mutex. + groupContWaitable bool + + // exiting is true if all tasks in the ThreadGroup should exit. exiting is + // analogous to Linux's SIGNAL_GROUP_EXIT. + // + // exiting is protected by the signal mutex. exiting can only transition + // from false to true. + exiting bool + + // exitStatus is the thread group's exit status. + // + // While exiting is false, exitStatus is protected by the signal mutex. + // When exiting becomes true, exitStatus becomes immutable. + exitStatus ExitStatus + + // terminationSignal is the signal that this thread group's leader will + // send to its parent when it exits. + // + // terminationSignal is protected by the TaskSet mutex. + terminationSignal linux.Signal + + // liveGoroutines is the number of non-exited task goroutines in the thread + // group. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + // tm contains process timers. TimerManager fields are immutable. + tm TimerManager + + // exitedCPUStats is the CPU usage for all exited tasks in the thread + // group. exitedCPUStats is protected by the TaskSet mutex. + exitedCPUStats usage.CPUStats + + // childCPUStats is the CPU usage of all joined descendants of this thread + // group. childCPUStats is protected by the TaskSet mutex. + childCPUStats usage.CPUStats + + // ioUsage is the I/O usage for all exited tasks in the thread group. + // The ioUsage pointer is immutable. 
+ ioUsage *usage.IO
+
+ // maxRSS is the historical maximum resident set size of the thread group, updated when:
+ //
+ // - A task in the thread group exits, since after all tasks have
+ // exited the MemoryManager is no longer reachable.
+ //
+ // - The thread group completes an execve, since this changes
+ // MemoryManagers.
+ //
+ // maxRSS is protected by the TaskSet mutex.
+ maxRSS uint64
+
+ // childMaxRSS is the maximum resident set size in bytes of all joined
+ // descendants of this thread group.
+ //
+ // childMaxRSS is protected by the TaskSet mutex.
+ childMaxRSS uint64
+
+ // Resource limits for this ThreadGroup. The limits pointer is immutable.
+ limits *limits.LimitSet
+
+ // processGroup is the processGroup for this thread group.
+ //
+ // processGroup is protected by the TaskSet mutex.
+ processGroup *ProcessGroup
+
+ // execed indicates an exec has occurred since creation. This will be
+ // set by finishExec, and new ThreadGroups will have this field cleared.
+ // When execed is set, the processGroup may no longer be changed.
+ //
+ // execed is protected by the TaskSet mutex.
+ execed bool
+
+ // rscr is the thread group's RSEQ critical region.
+ rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+}
+
+// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
+// thread group leader will send its parent terminationSignal when it exits.
+// The new thread group isn't visible to the system until a task has been
+// created inside of it by a successful call to TaskSet.NewTask. 
+func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+ tg := &ThreadGroup{
+ threadGroupNode: threadGroupNode{
+ pidns: ns,
+ },
+ signalHandlers: sh,
+ terminationSignal: terminationSignal,
+ ioUsage: &usage.IO{},
+ limits: limits,
+ }
+ tg.tm = newTimerManager(tg, monotonicClock)
+ tg.rscr.Store(&RSEQCriticalRegion{})
+ return tg
+}
+
+// saveRscr is invoked by stateify.
+func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
+ return tg.rscr.Load().(*RSEQCriticalRegion)
+}
+
+// loadRscr is invoked by stateify.
+func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) {
+ tg.rscr.Store(rscr)
+}
+
+// SignalHandlers returns the signal handlers used by tg.
+//
+// Preconditions: The caller must provide the synchronization required to read
+// tg.signalHandlers, as described in the field's comment.
+func (tg *ThreadGroup) SignalHandlers() *SignalHandlers {
+ return tg.signalHandlers
+}
+
+// Timer returns tg's timers.
+func (tg *ThreadGroup) Timer() *TimerManager {
+ return &tg.tm
+}
+
+// Limits returns tg's limits.
+func (tg *ThreadGroup) Limits() *limits.LimitSet {
+ return tg.limits
+}
+
+// release releases the thread group's resources.
+func (tg *ThreadGroup) release() {
+ // This must be done without holding the TaskSet mutex since thread group
+ // timers call SendSignal with Timer.mu locked.
+ tg.tm.destroy()
+}
+
+// forEachChildThreadGroupLocked iterates over all child ThreadGroups.
+//
+// Precondition: TaskSet.mu must be held. 
+func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + for child := range t.children { + if child == child.tg.leader { + fn(child.tg) + } + } + } +} diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go new file mode 100644 index 000000000..440da9dad --- /dev/null +++ b/pkg/sentry/kernel/threads.go @@ -0,0 +1,443 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// TasksLimit is the maximum number of threads for untrusted application. +// Linux doesn't really limit this directly, rather it is limited by total +// memory size, stacks allocated and a global maximum. There's no real reason +// for us to limit it either, (esp. since threads are backed by go routines), +// and we would expect to hit resource limits long before hitting this number. +// However, for correctness, we still check that the user doesn't exceed this +// number. +// +// Note that because of the way futexes are implemented, there *are* in fact +// serious restrictions on valid thread IDs. They are limited to 2^30 - 1 +// (kernel/fork.c:MAX_THREADS). +const TasksLimit = (1 << 16) + +// ThreadID is a generic thread identifier. 
+type ThreadID int32
+
+// String returns a decimal representation of the ThreadID.
+func (tid ThreadID) String() string {
+ return fmt.Sprintf("%d", tid)
+}
+
+// InitTID is the TID given to the first task added to each PID namespace. The
+// thread group led by InitTID is called the namespace's init process. The
+// death of a PID namespace's init process causes all tasks visible in that
+// namespace to be killed.
+const InitTID ThreadID = 1
+
+// A TaskSet comprises all tasks in a system.
+type TaskSet struct {
+ // mu protects all relationships between tasks and thread groups in the
+ // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.)
+ mu sync.RWMutex `state:"nosave"`
+
+ // Root is the root PID namespace, in which all tasks in the TaskSet are
+ // visible. The Root pointer is immutable.
+ Root *PIDNamespace
+
+ // sessions is the set of all sessions.
+ sessions sessionList
+
+ // stopCount is the number of active external stops applicable to all tasks
+ // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been
+ // paired with a call to TaskSet.EndExternalStop). stopCount is protected
+ // by mu.
+ //
+ // stopCount is not saved for the same reason as Task.stopCount; it is
+ // always reset to zero after restore.
+ stopCount int32 `state:"nosave"`
+
+ // liveGoroutines is the number of non-exited task goroutines in the
+ // TaskSet.
+ //
+ // liveGoroutines is not saved; it is reset as task goroutines are
+ // restarted by Task.Start.
+ liveGoroutines sync.WaitGroup `state:"nosave"`
+
+ // runningGoroutines is the number of running task goroutines in the
+ // TaskSet.
+ //
+ // runningGoroutines is not saved; its counter value is required to be zero
+ // at time of save (but note that this is not necessarily the same thing as
+ // sync.WaitGroup's zero value).
+ runningGoroutines sync.WaitGroup `state:"nosave"`
+}
+
+// newTaskSet returns a new, empty TaskSet. 
+func newTaskSet() *TaskSet { + ts := &TaskSet{} + ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace()) + return ts +} + +// forEachThreadGroupLocked applies f to each thread group in ts. +// +// Preconditions: ts.mu must be locked (for reading or writing). +func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { + for t := range ts.Root.tids { + if t == t.tg.leader { + f(t.tg) + } + } +} + +// A PIDNamespace represents a PID namespace, a bimap between thread IDs and +// tasks. See the pid_namespaces(7) man page for further details. +// +// N.B. A task is said to be visible in a PID namespace if the PID namespace +// contains a thread ID that maps to that task. +type PIDNamespace struct { + // owner is the TaskSet that this PID namespace belongs to. The owner + // pointer is immutable. + owner *TaskSet + + // parent is the PID namespace of the process that created this one. If + // this is the root PID namespace, parent is nil. The parent pointer is + // immutable. + // + // Invariant: All tasks that are visible in this namespace are also visible + // in all ancestor namespaces. + parent *PIDNamespace + + // userns is the user namespace with which this PID namespace is + // associated. Privileged operations on this PID namespace must have + // appropriate capabilities in userns. The userns pointer is immutable. + userns *auth.UserNamespace + + // The following fields are protected by owner.mu. + + // last is the last ThreadID to be allocated in this namespace. + last ThreadID + + // tasks is a mapping from ThreadIDs in this namespace to tasks visible in + // the namespace. + tasks map[ThreadID]*Task + + // tids is a mapping from tasks visible in this namespace to their + // identifiers in this namespace. + tids map[*Task]ThreadID + + // sessions is a mapping from SessionIDs in this namespace to sessions + // visible in the namespace. 
+ sessions map[SessionID]*Session
+
+ // sids is a mapping from sessions visible in this namespace to their
+ // identifiers in this namespace.
+ sids map[*Session]SessionID
+
+ // processGroups is a mapping from ProcessGroupIDs in this namespace to
+ // process groups visible in the namespace.
+ processGroups map[ProcessGroupID]*ProcessGroup
+
+ // pgids is a mapping from process groups visible in this namespace to
+ // their identifiers in this namespace.
+ pgids map[*ProcessGroup]ProcessGroupID
+
+ // exiting indicates that the namespace's init process is exiting or has
+ // exited.
+ exiting bool
+}
+
+func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace {
+ return &PIDNamespace{
+ owner: ts,
+ parent: parent,
+ userns: userns,
+ tasks: make(map[ThreadID]*Task),
+ tids: make(map[*Task]ThreadID),
+ sessions: make(map[SessionID]*Session),
+ sids: make(map[*Session]SessionID),
+ processGroups: make(map[ProcessGroupID]*ProcessGroup),
+ pgids: make(map[*ProcessGroup]ProcessGroupID),
+ }
+}
+
+// NewChild returns a new, empty PID namespace that is a child of ns. Authority
+// over the new PID namespace is controlled by userns.
+func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
+ return newPIDNamespace(ns.owner, ns, userns)
+}
+
+// TaskWithID returns the task with thread ID tid in PID namespace ns. If no
+// task has that TID, TaskWithID returns nil.
+func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.tasks[tid]
+}
+
+// ThreadGroupWithID returns the thread group led by the task with thread ID
+// tid in PID namespace ns. If no task has that TID, or if the task with that
+// TID is not a thread group leader, ThreadGroupWithID returns nil. 
+func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + t := ns.tasks[tid] + if t == nil { + return nil + } + if t != t.tg.leader { + return nil + } + return t.tg +} + +// IDOfTask returns the TID assigned to the given task in PID namespace ns. If +// the task is not visible in that namespace, IDOfTask returns 0. (This return +// value is significant in some cases, e.g. getppid() is documented as +// returning 0 if the caller's parent is in an ancestor namespace and +// consequently not visible to the caller.) If the task is nil, IDOfTask returns +// 0. +func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.tids[t] +} + +// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. +// If the task is not visible in that namespace, IDOfThreadGroup returns 0. +func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.tids[tg.leader] +} + +// Tasks returns a snapshot of the tasks in ns. +func (ns *PIDNamespace) Tasks() []*Task { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + tasks := make([]*Task, 0, len(ns.tasks)) + for t := range ns.tids { + tasks = append(tasks, t) + } + return tasks +} + +// ThreadGroups returns a snapshot of the thread groups in ns. +func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + var tgs []*ThreadGroup + for t := range ns.tids { + if t == t.tg.leader { + tgs = append(tgs, t.tg) + } + } + return tgs +} + +// UserNamespace returns the user namespace associated with PID namespace ns. +func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { + return ns.userns +} + +// A threadGroupNode defines the relationship between a thread group and the +// rest of the system. 
Conceptually, threadGroupNode is data belonging to the +// owning TaskSet, as if TaskSet contained a field `nodes +// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, +// threadGroupNode is embedded in the ThreadGroup it represents. +// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose +// threadGroupEntry's methods on ThreadGroup to make it implement +// threadGroupLinker.) +type threadGroupNode struct { + // pidns is the PID namespace containing the thread group and all of its + // member tasks. The pidns pointer is immutable. + pidns *PIDNamespace + + // eventQueue is notified whenever a event of interest to Task.Wait occurs + // in a child of this thread group, or a ptrace tracee of a task in this + // thread group. Events are defined in task_exit.go. + // + // Note that we cannot check and save this wait queue similarly to other + // wait queues, as the queue will not be empty by the time of saving, due + // to the wait sourced from Exec(). + eventQueue waiter.Queue `state:"nosave"` + + // leader is the thread group's leader, which is the oldest task in the + // thread group; usually the last task in the thread group to call + // execve(), or if no such task exists then the first task in the thread + // group, which was created by a call to fork() or clone() without + // CLONE_THREAD. Once a thread group has been made visible to the rest of + // the system by TaskSet.newTask, leader is never nil. + // + // Note that it's possible for the leader to exit without causing the rest + // of the thread group to exit; in such a case, leader will still be valid + // and non-nil, but leader will not be in tasks. + // + // leader is protected by the TaskSet mutex. + leader *Task + + // If execing is not nil, it is a task in the thread group that has killed + // all other tasks so that it can become the thread group leader and + // perform an execve. (execing may already be the thread group leader.) 
+ // + // execing is analogous to Linux's signal_struct::group_exit_task. + // + // execing is protected by the TaskSet mutex. + execing *Task + + // tasks is all tasks in the thread group that have not yet been reaped. + // + // tasks is protected by both the TaskSet mutex and the signal mutex: + // Mutating tasks requires locking the TaskSet mutex for writing *and* + // locking the signal mutex. Reading tasks requires locking the TaskSet + // mutex *or* locking the signal mutex. + tasks taskList + + // tasksCount is the number of tasks in the thread group that have not yet + // been reaped; equivalently, tasksCount is the number of tasks in tasks. + // + // tasksCount is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + tasksCount int + + // liveTasks is the number of tasks in the thread group that have not yet + // reached TaskExitZombie. + // + // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). + liveTasks int + + // activeTasks is the number of tasks in the thread group that have not yet + // reached TaskExitInitiated. + // + // activeTasks is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + activeTasks int +} + +// PIDNamespace returns the PID namespace containing tg. +func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { + return tg.pidns +} + +// TaskSet returns the TaskSet containing tg. +func (tg *ThreadGroup) TaskSet() *TaskSet { + return tg.pidns.owner +} + +// Leader returns tg's leader. +func (tg *ThreadGroup) Leader() *Task { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.leader +} + +// Count returns the number of non-exited threads in the group. 
+func (tg *ThreadGroup) Count() int { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + var count int + for t := tg.tasks.Front(); t != nil; t = t.Next() { + count++ + } + return count +} + +// MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for +// all tasks in tg. +func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + var tasks []ThreadID + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if id, ok := pidns.tids[t]; ok { + tasks = append(tasks, id) + } + } + return tasks +} + +// ID returns tg's leader's thread ID in its own PID namespace. If tg's leader +// is dead, ID returns 0. +func (tg *ThreadGroup) ID() ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.pidns.tids[tg.leader] +} + +// A taskNode defines the relationship between a task and the rest of the +// system. The comments on threadGroupNode also apply to taskNode. +type taskNode struct { + // tg is the thread group that this task belongs to. The tg pointer is + // immutable. + tg *ThreadGroup `state:"wait"` + + // taskEntry links into tg.tasks. Note that this means that + // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread + // group. See threadGroupNode.tasks for synchronization info. + taskEntry + + // parent is the task's parent. parent may be nil. + // + // parent is protected by the TaskSet mutex. + parent *Task + + // children is this task's children. + // + // children is protected by the TaskSet mutex. + children map[*Task]struct{} + + // If childPIDNamespace is not nil, all new tasks created by this task will + // be members of childPIDNamespace rather than this one. (As a corollary, + // this task becomes unable to create sibling tasks in the same thread + // group.) + // + // childPIDNamespace is exclusive to the task goroutine. 
+ childPIDNamespace *PIDNamespace +} + +// ThreadGroup returns the thread group containing t. +func (t *Task) ThreadGroup() *ThreadGroup { + return t.tg +} + +// PIDNamespace returns the PID namespace containing t. +func (t *Task) PIDNamespace() *PIDNamespace { + return t.tg.pidns +} + +// TaskSet returns the TaskSet containing t. +func (t *Task) TaskSet() *TaskSet { + return t.tg.pidns.owner +} + +// Timekeeper returns the system Timekeeper. +func (t *Task) Timekeeper() *Timekeeper { + return t.k.timekeeper +} + +// Parent returns t's parent. +func (t *Task) Parent() *Task { + return t.parent +} + +// ThreadID returns t's thread ID in its own PID namespace. If the task is +// dead, ThreadID returns 0. +func (t *Task) ThreadID() ThreadID { + return t.tg.pidns.IDOfTask(t) +} diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD new file mode 100644 index 000000000..84f31b2dc --- /dev/null +++ b/pkg/sentry/kernel/time/BUILD @@ -0,0 +1,32 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "time_state", + srcs = [ + "time.go", + ], + out = "time_state.go", + package = "time", +) + +go_library( + name = "time", + srcs = [ + "context.go", + "time.go", + "time_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go new file mode 100644 index 000000000..ac4dc01d8 --- /dev/null +++ b/pkg/sentry/kernel/time/context.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the time package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxRealtimeClock is a Context.Value key for the current real time. + CtxRealtimeClock contextID = iota +) + +// RealtimeClockFromContext returns the real time clock associated with context +// ctx. +func RealtimeClockFromContext(ctx context.Context) Clock { + if v := ctx.Value(CtxRealtimeClock); v != nil { + return v.(Clock) + } + return nil +} + +// NowFromContext returns the current real time associated with context ctx. +func NowFromContext(ctx context.Context) Time { + if clk := RealtimeClockFromContext(ctx); clk != nil { + return clk.Now() + } + panic("encountered context without RealtimeClock") +} diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go new file mode 100644 index 000000000..c223c2f19 --- /dev/null +++ b/pkg/sentry/kernel/time/time.go @@ -0,0 +1,649 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package time defines the Timer type, which provides a periodic timer that +// works by sampling a user-provided clock. +package time + +import ( + "fmt" + "math" + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Events that may be generated by a Clock. +const ( + // ClockEventSet occurs when a Clock undergoes a discontinuous change. + ClockEventSet waiter.EventMask = 1 << iota + + // ClockEventRateIncrease occurs when the rate at which a Clock advances + // increases significantly, such that values returned by previous calls to + // Clock.WallTimeUntil may be too large. + ClockEventRateIncrease +) + +// Time represents an instant in time with nanosecond precision. +// +// Time may represent time with respect to any clock and may not have any +// meaning in the real world. +type Time struct { + ns int64 +} + +var ( + // MinTime is the zero time instant, the lowest possible time that can + // be represented by Time. + MinTime = Time{ns: math.MinInt64} + + // MaxTime is the highest possible time that can be represented by + // Time. + MaxTime = Time{ns: math.MaxInt64} + + // ZeroTime represents the zero time in an unspecified Clock's domain. + ZeroTime = Time{ns: 0} +) + +const ( + // MinDuration is the minimum duration representable by time.Duration. + MinDuration = time.Duration(math.MinInt64) + + // MaxDuration is the maximum duration representable by time.Duration. + MaxDuration = time.Duration(math.MaxInt64) +) + +// FromNanoseconds returns a Time representing the point ns nanoseconds after +// an unspecified Clock's zero time. +func FromNanoseconds(ns int64) Time { + return Time{ns} +} + +// FromSeconds returns a Time representing the point s seconds after an +// unspecified Clock's zero time. 
+func FromSeconds(s int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + return Time{s * 1e9} +} + +// FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real +// time Unix clock domain. +func FromUnix(s int64, ns int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + t := s * 1e9 + if t > math.MaxInt64-ns { + return MaxTime + } + return Time{t + ns} +} + +// FromTimespec converts from Linux Timespec to Time. +func FromTimespec(ts linux.Timespec) Time { + return Time{ts.ToNsecCapped()} +} + +// FromTimeval converts a Linux Timeval to Time. +func FromTimeval(tv linux.Timeval) Time { + return Time{tv.ToNsecCapped()} +} + +// Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock +// domain. If t represents walltime, this is nanoseconds since the Unix epoch. +func (t Time) Nanoseconds() int64 { + return t.ns +} + +// Seconds returns seconds elapsed since the zero time in t's Clock domain. If +// t represents walltime, this is seconds since Unix epoch. +func (t Time) Seconds() int64 { + return t.Nanoseconds() / time.Second.Nanoseconds() +} + +// Timespec converts Time to a Linux timespec. +func (t Time) Timespec() linux.Timespec { + return linux.NsecToTimespec(t.Nanoseconds()) +} + +// Unix returns the (seconds, nanoseconds) representation of t such that +// seconds*1e9 + nanoseconds = t. +func (t Time) Unix() (s int64, ns int64) { + s = t.ns / 1e9 + ns = t.ns % 1e9 + return +} + +// TimeT converts Time to a Linux time_t. +func (t Time) TimeT() linux.TimeT { + return linux.NsecToTimeT(t.Nanoseconds()) +} + +// Timeval converts Time to a Linux timeval. +func (t Time) Timeval() linux.Timeval { + return linux.NsecToTimeval(t.Nanoseconds()) +} + +// Add adds the duration of d to t. 
+func (t Time) Add(d time.Duration) Time { + if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) { + return MaxTime + } + if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) { + return MinTime + } + return Time{int64(t.ns) + d.Nanoseconds()} +} + +// AddTime adds the duration of u to t. +func (t Time) AddTime(u Time) Time { + return t.Add(time.Duration(u.ns)) +} + +// Equal reports whether the two times represent the same instant in time. +func (t Time) Equal(u Time) bool { + return t.ns == u.ns +} + +// Before reports whether the instant t is before the instant u. +func (t Time) Before(u Time) bool { + return t.ns < u.ns +} + +// After reports whether the instant t is after the instant u. +func (t Time) After(u Time) bool { + return t.ns > u.ns +} + +// Sub returns the duration of t - u. +// +// N.B. This measure may not make sense for every Time returned by ktime.Clock. +// Callers who need wall time duration can use ktime.Clock.WallTimeUntil to +// estimate that wall time. +func (t Time) Sub(u Time) time.Duration { + dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond + switch { + case u.Add(dur).Equal(t): + return dur + case t.Before(u): + return MinDuration + default: + return MaxDuration + } +} + +// IsMin returns whether t represents the lowest possible time instant. +func (t Time) IsMin() bool { + return t == MinTime +} + +// IsZero returns whether t represents the zero time instant in t's Clock domain. +func (t Time) IsZero() bool { + return t == ZeroTime +} + +// String returns the time represented in nanoseconds as a string. +func (t Time) String() string { + return fmt.Sprintf("%dns", t.Nanoseconds()) +} + +// A Clock is an abstract time source. +type Clock interface { + // Now returns the current time in nanoseconds according to the Clock. + Now() Time + + // WallTimeUntil returns the estimated wall time until Now will return a + // value greater than or equal to t, given that a recent call to Now + // returned now. 
If t has already passed, WallTimeUntil may return 0 or a + // negative value. + // + // WallTimeUntil must be abstract to support Clocks that do not represent + // wall time (e.g. thread group execution timers). Clocks that represent + // wall times may embed the WallRateClock type to obtain an appropriate + // trivial implementation of WallTimeUntil. + // + // WallTimeUntil is used to determine when associated Timers should next + // check for expirations. Returning too small a value may result in + // spurious Timer goroutine wakeups, while returning too large a value may + // result in late expirations. Implementations should usually err on the + // side of underestimating. + WallTimeUntil(t, now Time) time.Duration + + // Waitable methods may be used to subscribe to Clock events. Waiters will + // not be preserved by Save and must be re-established during restore. + // + // Since Clock events are transient, implementations of + // waiter.Waitable.Readiness should return 0. + waiter.Waitable +} + +// WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the +// same rate as wall time. +type WallRateClock struct{} + +// WallTimeUntil implements Clock.WallTimeUntil. +func (WallRateClock) WallTimeUntil(t, now Time) time.Duration { + return t.Sub(now) +} + +// NoClockEvents implements waiter.Waitable for Clocks that do not generate +// events. +type NoClockEvents struct{} + +// Readiness implements waiter.Waitable.Readiness. +func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (NoClockEvents) EventUnregister(e *waiter.Entry) { +} + +// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and +// defining waiter.Waitable.Readiness as required by Clock. 
+type ClockEventsQueue struct { + waiter.Queue +} + +// Readiness implements waiter.Waitable.Readiness. +func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// A TimerListener receives expirations from a Timer. +type TimerListener interface { + // Notify is called when its associated Timer expires. exp is the number of + // expirations. + // + // Notify is called with the associated Timer's mutex locked, so Notify + // must not take any locks that precede Timer.mu in lock order. + // + // Preconditions: exp > 0. + Notify(exp uint64) + + // Destroy is called when the timer is destroyed. + Destroy() +} + +// Setting contains user-controlled mutable Timer properties. +type Setting struct { + // Enabled is true if the timer is running. + Enabled bool + + // Next is the time in nanoseconds of the next expiration. + Next Time + + // Period is the time in nanoseconds between expirations. If Period is + // zero, the timer will not automatically restart after expiring. + // + // Invariant: Period >= 0. + Period time.Duration +} + +// SettingFromSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as a time relative to c.Now(). +func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { + if value < 0 { + return Setting{}, syserror.EINVAL + } + if value == 0 { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: c.Now().Add(value), + Period: interval, + }, nil +} + +// SettingFromAbsSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as an absolute time. 
+func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { + if value.Before(ZeroTime) { + return Setting{}, syserror.EINVAL + } + if value.IsZero() { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: value, + Period: interval, + }, nil +} + +// SpecFromSetting converts a timestamp and a Setting to a (relative value, +// interval) pair, as used by most Linux syscalls that return a struct +// itimerval or struct itimerspec. +func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { + if !s.Enabled { + return 0, s.Period + } + return s.Next.Sub(now), s.Period +} + +// advancedTo returns an updated Setting and a number of expirations after +// the associated Clock indicates a time of now. +// +// Settings may be created by successive calls to advancedTo with decreasing +// values of now (i.e. time may appear to go backward). Supporting this is +// required to support non-monotonic clocks, as well as allowing +// Timer.clock.Now() to be called without holding Timer.mu. +func (s Setting) advancedTo(now Time) (Setting, uint64) { + if !s.Enabled { + return s, 0 + } + if s.Next.After(now) { + return s, 0 + } + if s.Period == 0 { + s.Enabled = false + return s, 1 + } + exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period) + s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp)) + return s, exp +} + +// Timer is an optionally-periodic timer driven by sampling a user-specified +// Clock. Timer's semantics support the requirements of Linux's interval timers +// (setitimer(2), timer_create(2), timerfd_create(2)). +// +// Timers should be created using NewTimer and must be cleaned up by calling +// Timer.Destroy when no longer used. +type Timer struct { + // clock is the time source. clock is immutable. + clock Clock + + // listener is notified of expirations. listener is immutable. + listener TimerListener + + // mu protects the following mutable fields. 
+ mu sync.Mutex `state:"nosave"` + + // setting is the timer setting. setting is protected by mu. + setting Setting + + // paused is true if the Timer is paused. paused is protected by mu. + paused bool + + // kicker is used to wake the Timer goroutine. The kicker pointer is + // immutable, but its state is protected by mu. + kicker *time.Timer `state:"nosave"` + + // entry is registered with clock.EventRegister. entry is immutable. + // + // Per comment in Clock, entry must be re-registered after restore; per + // comment in Timer.Load, this is done in Timer.Resume. + entry waiter.Entry `state:"nosave"` + + // events is the channel that will be notified whenever entry receives an + // event. It is also closed by Timer.Destroy to instruct the Timer + // goroutine to exit. + events chan struct{} `state:"nosave"` +} + +// timerTickEvents are Clock events that require the Timer goroutine to Tick +// prematurely. +const timerTickEvents = ClockEventSet | ClockEventRateIncrease + +// NewTimer returns a new Timer that will obtain time from clock and send +// expirations to listener. The Timer is initially stopped and has no first +// expiration or period configured. +func NewTimer(clock Clock, listener TimerListener) *Timer { + t := &Timer{ + clock: clock, + listener: listener, + } + t.init() + return t +} + +// After waits for the duration to elapse according to clock and then sends a +// notification on the returned channel. The timer is started immediately and +// will fire exactly once. The second return value is the start time used with +// the duration. +// +// Callers must call Timer.Destroy. +func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) { + notifier, tchan := NewChannelNotifier() + t := NewTimer(clock, notifier) + now := clock.Now() + + t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: now.Add(duration), + }) + return t, now, tchan +} + +// init initializes Timer state that is not preserved across save/restore. 
If +// init has already been called, calling it again is a no-op. +// +// Preconditions: t.mu must be locked, or the caller must have exclusive access +// to t. +func (t *Timer) init() { + if t.kicker != nil { + return + } + // If t.kicker is nil, the Timer goroutine can't be running, so we can't + // race with it. + t.kicker = time.NewTimer(0) + t.entry, t.events = waiter.NewChannelEntry(nil) + t.clock.EventRegister(&t.entry, timerTickEvents) + go t.runGoroutine() // S/R-SAFE: synchronized by t.mu +} + +// Destroy releases resources owned by the Timer. A Destroyed Timer must not be +// used again; in particular, a Destroyed Timer should not be Saved. +func (t *Timer) Destroy() { + // Stop the Timer, ensuring that the Timer goroutine will not call + // t.kicker.Reset, before calling t.kicker.Stop. + t.mu.Lock() + t.setting.Enabled = false + t.mu.Unlock() + t.kicker.Stop() + // Unregister t.entry, ensuring that the Clock will not send to t.events, + // before closing t.events to instruct the Timer goroutine to exit. + t.clock.EventUnregister(&t.entry) + close(t.events) + t.listener.Destroy() +} + +func (t *Timer) runGoroutine() { + for { + select { + case <-t.kicker.C: + case _, ok := <-t.events: + if !ok { + // Channel closed by Destroy. + return + } + } + t.Tick() + } +} + +// Tick requests that the Timer immediately check for expirations and +// re-evaluate when it should next check for expirations. +func (t *Timer) Tick() { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + return + } + s, exp := t.setting.advancedTo(now) + t.setting = s + if exp > 0 { + t.listener.Notify(exp) + } + t.resetKickerLocked(now) +} + +// Pause pauses the Timer, ensuring that it does not generate any further +// expirations until Resume is called. If the Timer is already paused, Pause +// has no effect. +func (t *Timer) Pause() { + t.mu.Lock() + defer t.mu.Unlock() + t.paused = true + // t.kicker may be nil if we were restored but never resumed. 
+ if t.kicker != nil { + t.kicker.Stop() + } +} + +// Resume ends the effect of Pause. If the Timer is not paused, Resume has no +// effect. +func (t *Timer) Resume() { + t.mu.Lock() + defer t.mu.Unlock() + if !t.paused { + return + } + t.paused = false + + // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume + // because save/restore will restore Timers before + // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed + // by a kernel.Timekeeper then the Timer goroutine will panic if it calls + // t.clock.Now(). + t.init() + + // Kick the Timer goroutine in case it was already initialized, but the + // Timer goroutine was sleeping. + t.kicker.Reset(0) +} + +// Get returns a snapshot of the Timer's current Setting and the time +// (according to the Timer's Clock) at which the snapshot was taken. +// +// Preconditions: The Timer must not be paused (since its Setting cannot +// be advanced to the current time while it is paused.) +func (t *Timer) Get() (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) + } + s, exp := t.setting.advancedTo(now) + t.setting = s + if exp > 0 { + t.listener.Notify(exp) + } + t.resetKickerLocked(now) + return now, s +} + +// Swap atomically changes the Timer's Setting and returns the Timer's previous +// Setting and the time (according to the Timer's Clock) at which the snapshot +// was taken. Setting s.Enabled to true starts the Timer, while setting +// s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. +func (t *Timer) Swap(s Setting) (Time, Setting) { + return t.SwapAnd(s, nil) +} + +// SwapAnd atomically changes the Timer's Setting, calls f if it is not nil, +// and returns the Timer's previous Setting and the time (according to the +// Timer's Clock) at which the Setting was changed. 
Setting s.Enabled to true +// starts the timer, while setting s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. f cannot call any Timer methods +// since it is called with the Timer mutex locked. +func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) + } + oldS, oldExp := t.setting.advancedTo(now) + if oldExp > 0 { + t.listener.Notify(oldExp) + } + if f != nil { + f() + } + newS, newExp := s.advancedTo(now) + t.setting = newS + if newExp > 0 { + t.listener.Notify(newExp) + } + t.resetKickerLocked(now) + return now, oldS +} + +// Preconditions: t.mu must be locked. +func (t *Timer) resetKickerLocked(now Time) { + if t.setting.Enabled { + // Clock.WallTimeUntil may return a negative value. This is fine; + // time.when treats negative Durations as 0. + t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now)) + } + // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases + // resetKickerLocked will be called from the Timer goroutine itself, in + // which case t.kicker has already fired and t.kicker.Stop will be an + // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer + // => runtime.deltimer). +} + +// Clock returns the Clock used by t. +func (t *Timer) Clock() Clock { + return t.clock +} + +// ChannelNotifier is a TimerListener that sends a message on an empty struct +// channel. +// +// ChannelNotifier cannot be saved or loaded. +type ChannelNotifier struct { + // tchan must be a buffered channel. + tchan chan struct{} +} + +// NewChannelNotifier creates a new channel notifier. +// +// If the notifier is used with a timer, Timer.Destroy will close the channel +// returned here. 
+func NewChannelNotifier() (TimerListener, <-chan struct{}) { + tchan := make(chan struct{}, 1) + return &ChannelNotifier{tchan}, tchan +} + +// Notify implements ktime.TimerListener.Notify. +func (c *ChannelNotifier) Notify(uint64) { + select { + case c.tchan <- struct{}{}: + default: + } +} + +// Destroy implements ktime.TimerListener.Destroy and will close the channel. +func (c *ChannelNotifier) Destroy() { + close(c.tchan) +} diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go new file mode 100644 index 000000000..3f16c1676 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper.go @@ -0,0 +1,270 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// Timekeeper manages all of the kernel clocks. +type Timekeeper struct { + // clocks are the clock sources. + // + // These are not saved directly, as the new machine's clock may behave + // differently. + // + // It is set only once, by SetClocks. + clocks sentrytime.Clocks `state:"nosave"` + + // bootTime is the realtime when the system "booted". i.e., when + // SetClocks was called in the initial (not restored) run. 
+ bootTime ktime.Time + + // monotonicOffset is the offset to apply to the monotonic clock output + // from clocks. + // + // It is set only once, by SetClocks. + monotonicOffset int64 `state:"nosave"` + + // restored indicates that this Timekeeper was restored from a state + // file. + restored bool `state:"nosave"` + + // saveMonotonic is the (offset) value of the monotonic clock at the + // time of save. + // + // It is only valid if restored is true. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveMonotonic int64 + + // saveRealtime is the value of the realtime clock at the time of save. + // + // It is only valid if restored is true. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveRealtime int64 + + // params manages the parameter page. + params *VDSOParamPage + + // mu protects destruction with stop and wg. + mu sync.Mutex `state:"nosave"` + + // stop is used to tell the update goroutine to exit. + stop chan struct{} `state:"nosave"` + + // wg is used to indicate that the update goroutine has exited. + wg sync.WaitGroup `state:"nosave"` +} + +// NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. +// NewTimekeeper does not take ownership of paramPage. +// +// SetClocks must be called on the returned Timekeeper before it is usable. +func NewTimekeeper(platform platform.Platform, paramPage platform.FileRange) (*Timekeeper, error) { + return &Timekeeper{ + params: NewVDSOParamPage(platform, paramPage), + }, nil +} + +// SetClocks the backing clock source. +// +// SetClocks must be called before the Timekeeper is used, and it may not be +// called more than once, as changing the clock source without extra correction +// could cause time discontinuities. +// +// It must also be called after Load. 
+func (t *Timekeeper) SetClocks(c sentrytime.Clocks) {
+	// Update the params, marking them "not ready", as we may need to
+	// restart calibration on this new machine.
+	if t.restored {
+		if err := t.params.Write(func() vdsoParams {
+			return vdsoParams{}
+		}); err != nil {
+			panic("unable to reset VDSO params: " + err.Error())
+		}
+	}
+
+	if t.clocks != nil {
+		panic("SetClocks called on previously-initialized Timekeeper")
+	}
+
+	t.clocks = c
+
+	// Compute the offset of the monotonic clock from the base Clocks.
+	//
+	// In a fresh (not restored) sentry, monotonic time starts at zero.
+	//
+	// In a restored sentry, monotonic time jumps forward by approximately
+	// the same amount as real time. There are no guarantees here, we are
+	// just making a best-effort attempt to make it appear that the app
+	// was simply not scheduled for a long period, rather than that the
+	// real time clock was changed.
+	//
+	// If real time went backwards, it remains the same.
+	wantMonotonic := int64(0)
+
+	nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic)
+	if err != nil {
+		panic("Unable to get current monotonic time: " + err.Error())
+	}
+
+	nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime)
+	if err != nil {
+		panic("Unable to get current realtime: " + err.Error())
+	}
+
+	if t.restored {
+		wantMonotonic = t.saveMonotonic
+		elapsed := nowRealtime - t.saveRealtime
+		if elapsed > 0 {
+			wantMonotonic += elapsed
+		}
+	}
+
+	t.monotonicOffset = wantMonotonic - nowMonotonic
+
+	if !t.restored {
+		// Hold on to the initial "boot" time.
+		t.bootTime = ktime.FromNanoseconds(nowRealtime)
+	}
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.startUpdater()
+}
+
+// startUpdater starts an update goroutine that keeps the clocks updated.
+//
+// mu must be held.
+func (t *Timekeeper) startUpdater() {
+	if t.stop != nil {
+		// Timekeeper already started
+		return
+	}
+	t.stop = make(chan struct{})
+
+	// Keep the clocks up to date.
+ // + // Note that the Go runtime uses host CLOCK_MONOTONIC to service the + // timer, so it may run at a *slightly* different rate from the + // application CLOCK_MONOTONIC. That is fine, as we only need to update + // at approximately this rate. + timer := time.NewTicker(sentrytime.ApproxUpdateInterval) + t.wg.Add(1) + go func() { // S/R-SAFE: stopped during save. + for { + // Start with an update immediately, so the clocks are + // ready ASAP. + + // Call Update within a Write block to prevent the VDSO + // from using the old params between Update and + // Write. + if err := t.params.Write(func() vdsoParams { + monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update() + + var p vdsoParams + if monotonicOk { + p.monotonicReady = 1 + p.monotonicBaseCycles = int64(monotonicParams.BaseCycles) + p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset + p.monotonicFrequency = monotonicParams.Frequency + } + if realtimeOk { + p.realtimeReady = 1 + p.realtimeBaseCycles = int64(realtimeParams.BaseCycles) + p.realtimeBaseRef = int64(realtimeParams.BaseRef) + p.realtimeFrequency = realtimeParams.Frequency + } + + log.Debugf("Updating VDSO parameters: %+v", p) + + return p + }); err != nil { + log.Warningf("Unable to update VDSO parameter page: %v", err) + } + + select { + case <-timer.C: + case <-t.stop: + t.wg.Done() + return + } + } + }() +} + +// stopUpdater stops the update goroutine, blocking until it exits. +// +// mu must be held. +func (t *Timekeeper) stopUpdater() { + if t.stop == nil { + // Updater not running. + return + } + + close(t.stop) + t.wg.Wait() + t.stop = nil +} + +// Destroy destroys the Timekeeper, freeing all associated resources. +func (t *Timekeeper) Destroy() { + t.mu.Lock() + defer t.mu.Unlock() + + t.stopUpdater() +} + +// PauseUpdates stops clock parameter updates. This should only be used when +// Tasks are not running and thus cannot access the clock. 
+func (t *Timekeeper) PauseUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.stopUpdater() +} + +// ResumeUpdates restarts clock parameter updates stopped by PauseUpdates. +func (t *Timekeeper) ResumeUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() +} + +// GetTime returns the current time in nanoseconds. +func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { + if t.clocks == nil { + panic("Timekeeper used before initialized with SetClocks") + } + now, err := t.clocks.GetTime(c) + if err == nil && c == sentrytime.Monotonic { + now += t.monotonicOffset + } + return now, err +} + +// BootTime returns the system boot real time. +func (t *Timekeeper) BootTime() ktime.Time { + return t.bootTime +} diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go new file mode 100644 index 000000000..aee983ac7 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -0,0 +1,41 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// beforeSave is invoked by stateify. +func (t *Timekeeper) beforeSave() { + if t.stop != nil { + panic("pauseUpdates must be called before Save") + } + + // N.B. we want the *offset* monotonic time. 
+ var err error + if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil { + panic("unable to get current monotonic time: " + err.Error()) + } + + if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil { + panic("unable to get current realtime: " + err.Error()) + } +} + +// afterLoad is invoked by stateify. +func (t *Timekeeper) afterLoad() { + t.restored = true +} diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go new file mode 100644 index 000000000..08bacba4f --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -0,0 +1,156 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// mockClocks is a sentrytime.Clocks that simply returns the times in the +// struct. +type mockClocks struct { + monotonic int64 + realtime int64 +} + +// Update implements sentrytime.Clocks.Update. It does nothing. 
+func (*mockClocks) Update() (monotonicParams sentrytime.Parameters, monotonicOk bool, realtimeParam sentrytime.Parameters, realtimeOk bool) {
+	return
+}
+
+// GetTime implements sentrytime.Clocks.GetTime.
+func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) {
+	switch id {
+	case sentrytime.Monotonic:
+		return c.monotonic, nil
+	case sentrytime.Realtime:
+		return c.realtime, nil
+	default:
+		return 0, syserror.EINVAL
+	}
+}
+
+// stateTestClocklessTimekeeper returns a test Timekeeper which has not had
+// SetClocks called.
+func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper {
+	ctx := contexttest.Context(tb)
+	p := platform.FromContext(ctx)
+	fr, err := p.Memory().Allocate(usermem.PageSize, usage.Anonymous)
+	if err != nil {
+		tb.Fatalf("failed to allocate memory: %v", err)
+	}
+	return &Timekeeper{
+		params: NewVDSOParamPage(p, fr),
+	}
+}
+
+func stateTestTimekeeper(tb testing.TB) *Timekeeper {
+	t := stateTestClocklessTimekeeper(tb)
+	t.SetClocks(sentrytime.NewCalibratedClocks())
+	return t
+}
+
+// TestTimekeeperMonotonicZero tests that monotonic time starts at zero.
+func TestTimekeeperMonotonicZero(t *testing.T) {
+	c := &mockClocks{
+		monotonic: 100000,
+	}
+
+	tk := stateTestClocklessTimekeeper(t)
+	tk.SetClocks(c)
+	defer tk.Destroy()
+
+	now, err := tk.GetTime(sentrytime.Monotonic)
+	if err != nil {
+		t.Errorf("GetTime err got %v want nil", err)
+	}
+	if now != 0 {
+		t.Errorf("GetTime got %d want 0", now)
+	}
+
+	c.monotonic += 10
+
+	now, err = tk.GetTime(sentrytime.Monotonic)
+	if err != nil {
+		t.Errorf("GetTime err got %v want nil", err)
+	}
+	if now != 10 {
+		t.Errorf("GetTime got %d want 10", now)
+	}
+}
+
+// TestTimekeeperMonotonicForward tests that monotonic time jumps forward
+// after restore.
+func TestTimekeeperMonotonicForward(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 600000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = true + tk.saveMonotonic = 100000 + tk.saveRealtime = 400000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should jump ahead by 200000 to 300000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 300000 { + t.Errorf("GetTime got %d want 300000", now) + } +} + +// TestTimekeeperMonotonicJumpBackwards tests that monotonic time does not jump +// backwards when realtime goes backwards. +func TestTimekeeperMonotonicJumpBackwards(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 400000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = true + tk.saveMonotonic = 100000 + tk.saveRealtime = 600000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should remain at 100000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees and we don't want to jump the monotonic clock backwards like + // realtime did. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 100000 { + t.Errorf("GetTime got %d want 100000", now) + } +} diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go new file mode 100644 index 000000000..03a3310be --- /dev/null +++ b/pkg/sentry/kernel/timer.go @@ -0,0 +1,282 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// timekeeperClock is a ktime.Clock that reads time from a +// kernel.Timekeeper-managed clock. +type timekeeperClock struct { + tk *Timekeeper + c sentrytime.ClockID + + // Implements ktime.Clock.WallTimeUntil. + ktime.WallRateClock `state:"nosave"` + + // Implements waiter.Waitable. (We have no ability to detect + // discontinuities from external changes to CLOCK_REALTIME). + ktime.NoClockEvents `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tc *timekeeperClock) Now() ktime.Time { + now, err := tc.tk.GetTime(tc.c) + if err != nil { + panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) + } + return ktime.FromNanoseconds(now) +} + +// tgClock is a ktime.Clock that measures the time a thread group has spent +// executing. +type tgClock struct { + tg *ThreadGroup + + // If includeSys is true, the tgClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // tgClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. + ktime.ClockEventsQueue `state:"nosave"` +} + +// UserCPUClock returns a ktime.Clock that measures the time that a thread +// group has spent executing. 
+func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
+	return tg.tm.virtClock
+}
+
+// CPUClock returns a ktime.Clock that measures the time that a thread group
+// has spent executing, including sentry time.
+func (tg *ThreadGroup) CPUClock() ktime.Clock {
+	return tg.tm.profClock
+}
+
+// Now implements ktime.Clock.Now.
+func (tgc *tgClock) Now() ktime.Time {
+	stats := tgc.tg.CPUStats()
+	if tgc.includeSys {
+		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
+	}
+	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
+}
+
+// WallTimeUntil implements ktime.Clock.WallTimeUntil.
+func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
+	// The assumption here is that the time spent in this process (no matter
+	// virtual or prof) should not exceed wall time * active tasks, since
+	// Task.exitThreadGroup stops accounting as it transitions to
+	// TaskExitInitiated.
+	tgc.tg.pidns.owner.mu.RLock()
+	n := tgc.tg.activeTasks
+	tgc.tg.pidns.owner.mu.RUnlock()
+	if n == 0 {
+		if t.Before(now) {
+			return 0
+		}
+		// The timer tick raced with thread group exit, after which no more
+		// tasks can enter the thread group. So tgc.Now() will never advance
+		// again. Return a large delay; the timer should be stopped long before
+		// it comes again anyway.
+		return time.Hour
+	}
+	// This is a lower bound on the amount of time that can elapse before an
+	// associated timer expires, so returning this value tends to result in a
+	// sequence of closely-spaced ticks just before timer expiry. To avoid
+	// this, round up to the nearest ClockTick; CPU usage measurements are
+	// limited to this resolution anyway.
+	remaining := time.Duration(int64(t.Sub(now))/int64(n)) * time.Nanosecond
+	return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
+}
+
+// taskClock is a ktime.Clock that measures the time that a task has spent
+// executing.
+type taskClock struct { + t *Task + + // If includeSys is true, the taskClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // taskClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. TimeUntil wouldn't change its estimation + // based on either of the clock events, so there's no event to be + // notified for. + ktime.NoClockEvents `state:"nosave"` + + // Implements ktime.Clock.WallTimeUntil. + // + // As an upper bound, a task's clock cannot advance faster than CPU + // time. It would have to execute at a rate of more than 1 task-second + // per 1 CPU-second, which isn't possible. + ktime.WallRateClock `state:"nosave"` +} + +// UserCPUClock returns a clock measuring the CPU time the task has spent +// executing application code. +func (t *Task) UserCPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: false} +} + +// CPUClock returns a clock measuring the CPU time the task has spent executing +// application and "kernel" code. +func (t *Task) CPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: true} +} + +// Now implements ktime.Clock.Now. +func (tc *taskClock) Now() ktime.Time { + stats := tc.t.CPUStats() + if tc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// signalNotifier is a ktime.Listener that sends signals to a ThreadGroup. +type signalNotifier struct { + tg *ThreadGroup + signal linux.Signal + realTimer bool + includeSys bool +} + +// Notify implements ktime.TimerListener.Notify. +func (s *signalNotifier) Notify(exp uint64) { + // Since all signals sent using a signalNotifier are standard (not + // real-time) signals, we can ignore the number of expirations and send + // only a single signal. + if s.realTimer { + // real timer signal sent to leader. 
See kernel/time/itimer.c:it_real_fn
+		s.tg.SendSignal(sigPriv(s.signal))
+	} else {
+		s.tg.SendTimerSignal(sigPriv(s.signal), s.includeSys)
+	}
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (s *signalNotifier) Destroy() {}
+
+// TimerManager is a collection of supported process cpu timers.
+type TimerManager struct {
+	// Clocks used to drive thread group execution time timers.
+	virtClock *tgClock
+	profClock *tgClock
+
+	RealTimer      *ktime.Timer
+	VirtualTimer   *ktime.Timer
+	ProfTimer      *ktime.Timer
+	SoftLimitTimer *ktime.Timer
+	HardLimitTimer *ktime.Timer
+}
+
+// newTimerManager returns a new instance of TimerManager.
+func newTimerManager(tg *ThreadGroup, monotonicClock ktime.Clock) TimerManager {
+	virtClock := &tgClock{tg: tg, includeSys: false}
+	profClock := &tgClock{tg: tg, includeSys: true}
+	tm := TimerManager{
+		virtClock: virtClock,
+		profClock: profClock,
+		RealTimer: ktime.NewTimer(monotonicClock, &signalNotifier{
+			tg:         tg,
+			signal:     linux.SIGALRM,
+			realTimer:  true,
+			includeSys: false,
+		}),
+		VirtualTimer: ktime.NewTimer(virtClock, &signalNotifier{
+			tg:         tg,
+			signal:     linux.SIGVTALRM,
+			realTimer:  false,
+			includeSys: false,
+		}),
+		ProfTimer: ktime.NewTimer(profClock, &signalNotifier{
+			tg:         tg,
+			signal:     linux.SIGPROF,
+			realTimer:  false,
+			includeSys: true,
+		}),
+		SoftLimitTimer: ktime.NewTimer(profClock, &signalNotifier{
+			tg:         tg,
+			signal:     linux.SIGXCPU,
+			realTimer:  false,
+			includeSys: true,
+		}),
+		HardLimitTimer: ktime.NewTimer(profClock, &signalNotifier{
+			tg:         tg,
+			signal:     linux.SIGKILL,
+			realTimer:  false,
+			includeSys: true,
+		}),
+	}
+	tm.applyCPULimits(tg.Limits().Get(limits.CPU))
+	return tm
+}
+
+// Save saves this TimerManager.
+
+// destroy destroys all timers.
+func (tm *TimerManager) destroy() { + tm.RealTimer.Destroy() + tm.VirtualTimer.Destroy() + tm.ProfTimer.Destroy() + tm.SoftLimitTimer.Destroy() + tm.HardLimitTimer.Destroy() +} + +func (tm *TimerManager) applyCPULimits(l limits.Limit) { + tm.SoftLimitTimer.Swap(ktime.Setting{ + Enabled: l.Cur != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(l.Cur) * time.Second).Nanoseconds()), + Period: time.Second, + }) + tm.HardLimitTimer.Swap(ktime.Setting{ + Enabled: l.Max != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(l.Max) * time.Second).Nanoseconds()), + }) +} + +// kick is called when the number of threads in the thread group associated +// with tm increases. +func (tm *TimerManager) kick() { + tm.virtClock.Notify(ktime.ClockEventRateIncrease) + tm.profClock.Notify(ktime.ClockEventRateIncrease) +} + +// pause is to pause the timers and stop timer signal delivery. +func (tm *TimerManager) pause() { + tm.RealTimer.Pause() + tm.VirtualTimer.Pause() + tm.ProfTimer.Pause() + tm.SoftLimitTimer.Pause() + tm.HardLimitTimer.Pause() +} + +// resume is to resume the timers and continue timer signal delivery. +func (tm *TimerManager) resume() { + tm.RealTimer.Resume() + tm.VirtualTimer.Resume() + tm.ProfTimer.Resume() + tm.SoftLimitTimer.Resume() + tm.HardLimitTimer.Resume() +} diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go new file mode 100644 index 000000000..58e9b4d1b --- /dev/null +++ b/pkg/sentry/kernel/uts_namespace.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" +) + +// UTSNamespace represents a UTS namespace, a holder of two system identifiers: +// the hostname and domain name. +type UTSNamespace struct { + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + hostName string + domainName string + + // userns is the user namespace associated with the UTSNamespace. + // Privileged operations on this UTSNamespace must have appropriate + // capabilities in userns. + // + // userns is immutable. + userns *auth.UserNamespace +} + +// NewUTSNamespace creates a new UTS namespace. +func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace { + return &UTSNamespace{ + hostName: hostName, + domainName: domainName, + userns: userns, + } +} + +// UTSNamespace returns the task's UTS namespace. +func (t *Task) UTSNamespace() *UTSNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.utsns +} + +// HostName returns the host name of this UTS namespace. +func (u *UTSNamespace) HostName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.hostName +} + +// SetHostName sets the host name of this UTS namespace. +func (u *UTSNamespace) SetHostName(host string) { + u.mu.Lock() + defer u.mu.Unlock() + u.hostName = host +} + +// DomainName returns the domain name of this UTS namespace. +func (u *UTSNamespace) DomainName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.domainName +} + +// SetDomainName sets the domain name of this UTS namespace. 
+func (u *UTSNamespace) SetDomainName(domain string) { + u.mu.Lock() + defer u.mu.Unlock() + u.domainName = domain +} + +// UserNamespace returns the user namespace associated with this UTS namespace. +func (u *UTSNamespace) UserNamespace() *auth.UserNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return u.userns +} + +// Clone makes a copy of this UTS namespace, associating the given user +// namespace. +func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return &UTSNamespace{ + hostName: u.hostName, + domainName: u.domainName, + userns: userns, + } +} diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go new file mode 100644 index 000000000..0bacbea49 --- /dev/null +++ b/pkg/sentry/kernel/vdso.go @@ -0,0 +1,145 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// vdsoParams are the parameters exposed to the VDSO. +// +// They are exposed to the VDSO via a parameter page managed by VDSOParamPage, +// which also includes a sequence counter. 
+type vdsoParams struct { + monotonicReady uint64 + monotonicBaseCycles int64 + monotonicBaseRef int64 + monotonicFrequency uint64 + + realtimeReady uint64 + realtimeBaseCycles int64 + realtimeBaseRef int64 + realtimeFrequency uint64 +} + +// VDSOParamPage manages a VDSO parameter page. +// +// Its memory layout looks like: +// +// type page struct { +// // seq is a sequence counter that protects the fields below. +// seq uint64 +// vdsoParams +// } +// +// Everything in the struct is 8 bytes for easy alignment. +// +// It must be kept in sync with params in vdso/vdso_time.cc. +type VDSOParamPage struct { + // The parameter page is fr, allocated from platform.Memory(). + platform platform.Platform + fr platform.FileRange + + // seq is the current sequence count written to the page. + // + // A write is in progress if bit 1 of the counter is set. + // + // Timekeeper's updater goroutine may call Write before equality is + // checked in state_test_util tests, causing this field to change across + // save / restore. + seq uint64 +} + +// NewVDSOParamPage returns a VDSOParamPage. +// +// Preconditions: +// +// * fr is a single page allocated from platform.Memory(). VDSOParamPage does +// not take ownership of fr; it must remain allocated for the lifetime of the +// VDSOParamPage. +// +// * VDSOParamPage must be the only writer to fr. +// +// * platform.Memory().MapInternal(fr) must return a single safemem.Block. +func NewVDSOParamPage(platform platform.Platform, fr platform.FileRange) *VDSOParamPage { + return &VDSOParamPage{platform: platform, fr: fr} +} + +// access returns a mapping of the param page. 
+func (v *VDSOParamPage) access() (safemem.Block, error) { + bs, err := v.platform.Memory().MapInternal(v.fr, usermem.ReadWrite) + if err != nil { + return safemem.Block{}, err + } + if bs.NumBlocks() != 1 { + panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks())) + } + return bs.Head(), nil +} + +// incrementSeq increments the sequence counter in the param page. +func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error { + next := v.seq + 1 + old, err := safemem.SwapUint64(paramPage, next) + if err != nil { + return err + } + + if old != v.seq { + return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq) + } + + v.seq = next + return nil +} + +// Write updates the VDSO parameters. +// +// Write starts a write block, calls f to get the new parameters, writes +// out the new parameters, then ends the write block. +func (v *VDSOParamPage) Write(f func() vdsoParams) error { + paramPage, err := v.access() + if err != nil { + return err + } + + // Write begin. + next := v.seq + 1 + if next%2 != 1 { + panic("Out-of-order sequence count") + } + + err = v.incrementSeq(paramPage) + if err != nil { + return err + } + + // Get the new params. + p := f() + buf := binary.Marshal(nil, usermem.ByteOrder, p) + + // Skip the sequence counter. + if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil { + panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err)) + } + + // Write end. + return v.incrementSeq(paramPage) +} diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go new file mode 100644 index 000000000..a9e84673f --- /dev/null +++ b/pkg/sentry/kernel/version.go @@ -0,0 +1,33 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// Version defines the application-visible system version. +type Version struct { + // Operating system name (e.g. "Linux"). + Sysname string + + // Operating system release (e.g. "3.11.10-amd64"). + Release string + + // Operating system version. On Linux this takes the shape + // "#VERSION CONFIG_FLAGS TIMESTAMP" + // where: + // - VERSION is a sequence counter incremented on every successful build + // - CONFIG_FLAGS is a space-separated list of major enabled kernel features + // (e.g. "SMP" and "PREEMPT") + // - TIMESTAMP is the build timestamp as returned by `date` + Version string +} diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD new file mode 100644 index 000000000..06c3e72b0 --- /dev/null +++ b/pkg/sentry/limits/BUILD @@ -0,0 +1,39 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "limits_state", + srcs = [ + "limits.go", + ], + out = "limits_state.go", + package = "limits", +) + +go_library( + name = "limits", + srcs = [ + "context.go", + "limits.go", + "limits_state.go", + "linux.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/limits", + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/state", + ], +) + +go_test( + name = "limits_test", + size = "small", + srcs = [ + "limits_test.go", + ], + embed = [":limits"], +) diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go new file 
mode 100644 index 000000000..75e97bf92 --- /dev/null +++ b/pkg/sentry/limits/context.go @@ -0,0 +1,35 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package limits + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the limit package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxLimits is a Context.Value key for a LimitSet. + CtxLimits contextID = iota +) + +// FromContext returns the limits that apply to ctx. +func FromContext(ctx context.Context) *LimitSet { + if v := ctx.Value(CtxLimits); v != nil { + return v.(*LimitSet) + } + return nil +} diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go new file mode 100644 index 000000000..4230ba958 --- /dev/null +++ b/pkg/sentry/limits/limits.go @@ -0,0 +1,128 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package limits provides resource limits. +package limits + +import ( + "sync" + "syscall" +) + +// LimitType defines a type of resource limit. +type LimitType int + +// Set of constants defining the different types of resource limits. +const ( + CPU LimitType = iota + FileSize + Data + Stack + Core + Rss + ProcessCount + NumberOfFiles + MemoryPagesLocked + AS + Locks + SignalsPending + MessageQueueBytes + Nice + RealTimePriority + Rttime +) + +// Infinity is a constant representing a resource with no limit. +const Infinity = ^uint64(0) + +// Limit specifies a system limit. +type Limit struct { + // Cur specifies the current limit. + Cur uint64 + // Max specifies the maximum settable limit. + Max uint64 +} + +// LimitSet represents the Limits that correspond to each LimitType. +type LimitSet struct { + mu sync.Mutex `state:"nosave"` + data map[LimitType]Limit +} + +// NewLimitSet creates a new, empty LimitSet. +func NewLimitSet() *LimitSet { + return &LimitSet{ + data: make(map[LimitType]Limit), + } +} + +// GetCopy returns a clone of the LimitSet. +func (l *LimitSet) GetCopy() *LimitSet { + l.mu.Lock() + defer l.mu.Unlock() + copyData := make(map[LimitType]Limit) + for k, v := range l.data { + copyData[k] = v + } + return &LimitSet{ + data: copyData, + } +} + +// Get returns the resource limit associated with LimitType t. +// If no limit is provided, it defaults to an infinite limit.Infinity. +func (l *LimitSet) Get(t LimitType) Limit { + l.mu.Lock() + defer l.mu.Unlock() + s, ok := l.data[t] + if !ok { + return Limit{Cur: Infinity, Max: Infinity} + } + return s +} + +// GetCapped returns the current value for the limit, capped as specified. +func (l *LimitSet) GetCapped(t LimitType, max uint64) uint64 { + s := l.Get(t) + if s.Cur == Infinity || s.Cur > max { + return max + } + return s.Cur +} + +// SetUnchecked assigns value v to resource of LimitType t. 
+func (l *LimitSet) SetUnchecked(t LimitType, v Limit) { + l.mu.Lock() + defer l.mu.Unlock() + l.data[t] = v +} + +// Set assigns value v to resource of LimitType t and returns the old value. +func (l *LimitSet) Set(t LimitType, v Limit) (Limit, error) { + l.mu.Lock() + defer l.mu.Unlock() + // If a limit is already set, make sure the new limit doesn't + // exceed the previous max limit. + if _, ok := l.data[t]; ok { + if l.data[t].Max < v.Max { + return Limit{}, syscall.EPERM + } + if v.Cur > v.Max { + return Limit{}, syscall.EINVAL + } + } + old := l.data[t] + l.data[t] = v + return old, nil +} diff --git a/pkg/sentry/limits/limits_test.go b/pkg/sentry/limits/limits_test.go new file mode 100644 index 000000000..dd6f80750 --- /dev/null +++ b/pkg/sentry/limits/limits_test.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package limits + +import ( + "syscall" + "testing" +) + +func TestSet(t *testing.T) { + ls := NewLimitSet() + ls.Set(1, Limit{Cur: 50, Max: 50}) + if _, err := ls.Set(1, Limit{Cur: 20, Max: 50}); err != nil { + t.Fatalf("Tried to lower Limit to valid new value: got %v, wanted nil", err) + } + if _, err := ls.Set(1, Limit{Cur: 20, Max: 60}); err != syscall.EPERM { + t.Fatalf("Tried to raise limit.Max to invalid higher value: got %v, wanted syscall.EPERM", err) + } + if _, err := ls.Set(1, Limit{Cur: 60, Max: 50}); err != syscall.EINVAL { + t.Fatalf("Tried to raise limit.Cur to invalid higher value: got %v, wanted syscall.EINVAL", err) + } + if _, err := ls.Set(1, Limit{Cur: 11, Max: 10}); err != syscall.EINVAL { + t.Fatalf("Tried to set new limit with Cur > Max: got %v, wanted syscall.EINVAL", err) + } +} diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go new file mode 100644 index 000000000..8e6a24341 --- /dev/null +++ b/pkg/sentry/limits/linux.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package limits + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// FromLinuxResource maps linux resources to sentry LimitTypes. 
+var FromLinuxResource = map[int]LimitType{ + linux.RLIMIT_CPU: CPU, + linux.RLIMIT_FSIZE: FileSize, + linux.RLIMIT_DATA: Data, + linux.RLIMIT_STACK: Stack, + linux.RLIMIT_CORE: Core, + linux.RLIMIT_RSS: Rss, + linux.RLIMIT_NPROC: ProcessCount, + linux.RLIMIT_NOFILE: NumberOfFiles, + linux.RLIMIT_MEMLOCK: MemoryPagesLocked, + linux.RLIMIT_AS: AS, + linux.RLIMIT_LOCKS: Locks, + linux.RLIMIT_SIGPENDING: SignalsPending, + linux.RLIMIT_MSGQUEUE: MessageQueueBytes, + linux.RLIMIT_NICE: Nice, + linux.RLIMIT_RTPRIO: RealTimePriority, + linux.RLIMIT_RTTIME: Rttime, +} + +// FromLinux maps linux rlimit values to sentry Limits, being careful to handle +// infinities. +func FromLinux(rl uint64) uint64 { + if rl == linux.RLimInfinity { + return Infinity + } + return rl +} + +// ToLinux maps sentry Limits to linux rlimit values, being careful to handle +// infinities. +func ToLinux(l uint64) uint64 { + if l == Infinity { + return linux.RLimInfinity + } + return l +} + +// NewLinuxLimitSet returns a LimitSet whose values match the default rlimits +// in Linux. +func NewLinuxLimitSet() (*LimitSet, error) { + ls := NewLimitSet() + for rlt, rl := range linux.InitRLimits { + lt, ok := FromLinuxResource[rlt] + if !ok { + return nil, fmt.Errorf("unknown rlimit type %v", rlt) + } + ls.SetUnchecked(lt, Limit{ + Cur: FromLinux(rl.Cur), + Max: FromLinux(rl.Max), + }) + } + return ls, nil +} + +// NewLinuxDistroLimitSet returns a new LimitSet whose values are typical +// for a booted Linux distro. +// +// Many Linux init systems adjust the default Linux limits to values more +// expected by the rest of the userspace. NewLinuxDistroLimitSet returns a +// LimitSet with sensible defaults for applications that aren't starting +// their own init system. 
+func NewLinuxDistroLimitSet() (*LimitSet, error) { + ls, err := NewLinuxLimitSet() + if err != nil { + return nil, err + } + + // Adjust ProcessCount to a lower value because GNU bash allocates 16 + // bytes per proc and OOMs if this number is set too high. Value was + // picked arbitrarily. + // + // 1,048,576 ought to be enough for anyone. + l := ls.Get(ProcessCount) + l.Cur = 1 << 20 + ls.Set(ProcessCount, l) + return ls, nil +} diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD new file mode 100644 index 000000000..917ec8cc8 --- /dev/null +++ b/pkg/sentry/loader/BUILD @@ -0,0 +1,59 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_embed_data", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_embed_data( + name = "vdso_bin", + src = "//vdso:vdso.so", + package = "loader", + var = "vdsoBin", +) + +go_stateify( + name = "loader_state", + srcs = [ + "vdso.go", + "vdso_state.go", + ], + out = "loader_state.go", + package = "loader", +) + +go_library( + name = "loader", + srcs = [ + "elf.go", + "interpreter.go", + "loader.go", + "loader_state.go", + "vdso.go", + "vdso_state.go", + ":vdso_bin", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/loader", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/cpuid", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go new file mode 100644 index 000000000..d23dc1096 --- /dev/null +++ b/pkg/sentry/loader/elf.go @@ -0,0 +1,637 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package loader + +import ( + "bytes" + "debug/elf" + "fmt" + "io" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + // elfMagic identifies an ELF file. + elfMagic = "\x7fELF" + + // maxTotalPhdrSize is the maximum combined size of all program + // headers. Linux limits this to one page. + maxTotalPhdrSize = usermem.PageSize +) + +var ( + // header64Size is the size of elf.Header64. + header64Size = int(binary.Size(elf.Header64{})) + + // Prog64Size is the size of elf.Prog64. 
+ prog64Size = int(binary.Size(elf.Prog64{})) +) + +func progFlagsAsPerms(f elf.ProgFlag) usermem.AccessType { + var p usermem.AccessType + if f&elf.PF_R == elf.PF_R { + p.Read = true + } + if f&elf.PF_W == elf.PF_W { + p.Write = true + } + if f&elf.PF_X == elf.PF_X { + p.Execute = true + } + return p +} + +// elfInfo contains the metadata needed to load an ELF binary. +type elfInfo struct { + // os is the target OS of the ELF. + os abi.OS + + // arch is the target architecture of the ELF. + arch arch.Arch + + // entry is the program entry point. + entry usermem.Addr + + // phdrs are the program headers. + phdrs []elf.ProgHeader + + // phdrSize is the size of a single program header in the ELF. + phdrSize int + + // phdrOff is the offset of the program headers in the file. + phdrOff uint64 + + // sharedObject is true if the ELF represents a shared object. + sharedObject bool +} + +// parseHeader parse the ELF header, verifying that this is a supported ELF +// file and returning the ELF program headers. +// +// This is similar to elf.NewFile, except that it is more strict about what it +// accepts from the ELF, and it doesn't parse unnecessary parts of the file. +// +// ctx may be nil if f does not need it. +func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { + // Check ident first; it will tell us the endianness of the rest of the + // structs. + var ident [elf.EI_NIDENT]byte + _, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0) + if err != nil { + log.Infof("Error reading ELF ident: %v", err) + // The entire ident array always exists. + if err == io.EOF || err == io.ErrUnexpectedEOF { + err = syserror.ENOEXEC + } + return elfInfo{}, err + } + + // Only some callers pre-check the ELF magic. 
+ if !bytes.Equal(ident[:len(elfMagic)], []byte(elfMagic)) { + log.Infof("File is not an ELF") + return elfInfo{}, syserror.ENOEXEC + } + + // We only support 64-bit, little endian binaries + if class := elf.Class(ident[elf.EI_CLASS]); class != elf.ELFCLASS64 { + log.Infof("Unsupported ELF class: %v", class) + return elfInfo{}, syserror.ENOEXEC + } + if endian := elf.Data(ident[elf.EI_DATA]); endian != elf.ELFDATA2LSB { + log.Infof("Unsupported ELF endianness: %v", endian) + return elfInfo{}, syserror.ENOEXEC + } + byteOrder := binary.LittleEndian + + if version := elf.Version(ident[elf.EI_VERSION]); version != elf.EV_CURRENT { + log.Infof("Unsupported ELF version: %v", version) + return elfInfo{}, syserror.ENOEXEC + } + // EI_OSABI is ignored by Linux, which is the only OS supported. + os := abi.Linux + + var hdr elf.Header64 + hdrBuf := make([]byte, header64Size) + _, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0) + if err != nil { + log.Infof("Error reading ELF header: %v", err) + // The entire header always exists. + if err == io.EOF || err == io.ErrUnexpectedEOF { + err = syserror.ENOEXEC + } + return elfInfo{}, err + } + binary.Unmarshal(hdrBuf, byteOrder, &hdr) + + // We only support amd64. 
+ if machine := elf.Machine(hdr.Machine); machine != elf.EM_X86_64 { + log.Infof("Unsupported ELF machine %d", machine) + return elfInfo{}, syserror.ENOEXEC + } + a := arch.AMD64 + + var sharedObject bool + elfType := elf.Type(hdr.Type) + switch elfType { + case elf.ET_EXEC: + sharedObject = false + case elf.ET_DYN: + sharedObject = true + default: + log.Infof("Unsupported ELF type %v", elfType) + return elfInfo{}, syserror.ENOEXEC + } + + if int(hdr.Phentsize) != prog64Size { + log.Infof("Unsupported phdr size %d", hdr.Phentsize) + return elfInfo{}, syserror.ENOEXEC + } + totalPhdrSize := prog64Size * int(hdr.Phnum) + if totalPhdrSize < prog64Size { + log.Warningf("No phdrs or total phdr size overflows: prog64Size: %d phnum: %d", prog64Size, int(hdr.Phnum)) + return elfInfo{}, syserror.ENOEXEC + } + if totalPhdrSize > maxTotalPhdrSize { + log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize) + return elfInfo{}, syserror.ENOEXEC + } + + phdrBuf := make([]byte, totalPhdrSize) + _, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) + if err != nil { + log.Infof("Error reading ELF phdrs: %v", err) + // If phdrs were specified, they should all exist. + if err == io.EOF || err == io.ErrUnexpectedEOF { + err = syserror.ENOEXEC + } + return elfInfo{}, err + } + + phdrs := make([]elf.ProgHeader, hdr.Phnum) + for i := range phdrs { + var prog64 elf.Prog64 + binary.Unmarshal(phdrBuf[:prog64Size], byteOrder, &prog64) + phdrBuf = phdrBuf[prog64Size:] + phdrs[i] = elf.ProgHeader{ + Type: elf.ProgType(prog64.Type), + Flags: elf.ProgFlag(prog64.Flags), + Off: prog64.Off, + Vaddr: prog64.Vaddr, + Paddr: prog64.Paddr, + Filesz: prog64.Filesz, + Memsz: prog64.Memsz, + Align: prog64.Align, + } + } + + return elfInfo{ + os: os, + arch: a, + entry: usermem.Addr(hdr.Entry), + phdrs: phdrs, + phdrOff: hdr.Phoff, + phdrSize: prog64Size, + sharedObject: sharedObject, + }, nil +} + +// mapSegment maps a phdr into the Task. 
offset is the offset to apply to +// phdr.Vaddr. +func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error { + // Alignment of vaddr and offset must match. We'll need to map on the + // page boundary. + adjust := usermem.Addr(phdr.Vaddr).PageOffset() + if adjust != usermem.Addr(phdr.Off).PageOffset() { + ctx.Infof("Alignment of vaddr %#x != off %#x", phdr.Vaddr, phdr.Off) + return syserror.ENOEXEC + } + + addr, ok := offset.AddLength(phdr.Vaddr) + if !ok { + // If offset != 0 we should have ensured this would fit. + ctx.Warningf("Computed segment load address overflows: %#x + %#x", phdr.Vaddr, offset) + return syserror.ENOEXEC + } + addr -= usermem.Addr(adjust) + + fileOffset := phdr.Off - adjust + fileSize := phdr.Filesz + adjust + if fileSize < phdr.Filesz { + ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust) + return syserror.ENOEXEC + } + memSize := phdr.Memsz + adjust + if memSize < phdr.Memsz { + ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust) + return syserror.ENOEXEC + } + ms, ok := usermem.Addr(fileSize).RoundUp() + if !ok { + ctx.Infof("fileSize %#x too large", fileSize) + return syserror.ENOEXEC + } + mapSize := uint64(ms) + + prot := progFlagsAsPerms(phdr.Flags) + mopts := memmap.MMapOpts{ + Length: mapSize, + Offset: fileOffset, + Addr: addr, + Fixed: true, + // Linux will happily allow conflicting segments to map over + // one another. + Unmap: true, + Private: true, + Perms: prot, + MaxPerms: usermem.AnyAccess, + } + if err := f.ConfigureMMap(ctx, &mopts); err != nil { + ctx.Infof("File is not memory-mappable: %v", err) + return err + } + if _, err := m.MMap(ctx, mopts); err != nil { + ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err) + return err + } + + // We need to clear the end of the last page that exceeds fileSize so + // we don't map part of the file beyond fileSize. 
+	//
+	// Note that Linux *does not* clear the portion of the first page
+	// before phdr.Off.
+	if mapSize > fileSize {
+		zeroAddr, ok := addr.AddLength(fileSize)
+		if !ok {
+			panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize))
+		}
+		zeroSize := int64(mapSize - fileSize)
+		if zeroSize < 0 {
+			panic(fmt.Sprintf("zeroSize too big? %#x", uint64(zeroSize)))
+		}
+		if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil {
+			ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+usermem.Addr(zeroSize), err)
+			return err
+		}
+	}
+
+	// Allocate more anonymous pages if necessary.
+	if mapSize < memSize {
+		anonAddr, ok := addr.AddLength(mapSize)
+		if !ok {
+			panic(fmt.Sprintf("anonymous memory doesn't fit in pre-sized range? %#x + %#x", addr, mapSize))
+		}
+		anonSize, ok := usermem.Addr(memSize - mapSize).RoundUp()
+		if !ok {
+			ctx.Infof("extra anon pages too large: %#x", memSize-mapSize)
+			return syserror.ENOEXEC
+		}
+
+		if _, err := m.MMap(ctx, memmap.MMapOpts{
+			Length: uint64(anonSize),
+			Addr:   anonAddr,
+			// Fixed without Unmap will fail the mmap if something is
+			// already at addr.
+			Fixed:    true,
+			Private:  true,
+			Perms:    progFlagsAsPerms(phdr.Flags),
+			MaxPerms: usermem.AnyAccess,
+		}); err != nil {
+			ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err)
+			return err
+		}
+	}
+
+	return nil
+}
+
+// loadedELF describes an ELF that has been successfully loaded.
+type loadedELF struct {
+	// os is the target OS of the ELF.
+	os abi.OS
+
+	// arch is the target architecture of the ELF.
+	arch arch.Arch
+
+	// entry is the entry point of the ELF.
+	entry usermem.Addr
+
+	// start is the start of the ELF.
+	start usermem.Addr
+
+	// end is the end of the ELF.
+	end usermem.Addr
+
+	// interpreter is the path to the ELF interpreter.
+	interpreter string
+
+	// phdrAddr is the address of the ELF program headers.
+ phdrAddr usermem.Addr + + // phdrSize is the size of a single program header in the ELF. + phdrSize int + + // phdrNum is the number of program headers. + phdrNum int + + // auxv contains a subset of ELF-specific auxiliary vector entries: + // * AT_PHDR + // * AT_PHENT + // * AT_PHNUM + // * AT_BASE + // * AT_ENTRY + auxv arch.Auxv +} + +// loadParsedELF loads f into mm. +// +// info is the parsed elfInfo from the header. +// +// It does not load the ELF interpreter, or return any auxv entries. +// +// Preconditions: +// * f is an ELF file +func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { + first := true + var start, end usermem.Addr + var interpreter string + for _, phdr := range info.phdrs { + switch phdr.Type { + case elf.PT_LOAD: + vaddr := usermem.Addr(phdr.Vaddr) + if first { + first = false + start = vaddr + } + if vaddr < end { + ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end) + return loadedELF{}, syserror.ENOEXEC + } + var ok bool + end, ok = vaddr.AddLength(phdr.Memsz) + if !ok { + ctx.Infof("PT_LOAD header size overflows. %#x + %#x", vaddr, phdr.Memsz) + return loadedELF{}, syserror.ENOEXEC + } + + case elf.PT_INTERP: + if phdr.Filesz > syscall.PathMax { + ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz) + return loadedELF{}, syserror.ENOEXEC + } + + path := make([]byte, phdr.Filesz) + _, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off)) + if err != nil { + ctx.Infof("Error reading PT_INTERP path: %v", err) + // If an interpreter was specified, it should exist. + if err == io.EOF || err == io.ErrUnexpectedEOF { + err = syserror.ENOEXEC + } + return loadedELF{}, syserror.ENOEXEC + } + + if path[len(path)-1] != 0 { + ctx.Infof("PT_INTERP path not NUL-terminated: %v", path) + return loadedELF{}, syserror.ENOEXEC + } + + // Strip NUL-terminator from string. 
+ interpreter = string(path[:len(path)-1]) + } + } + + // Shared objects don't have fixed load addresses. We need to pick a + // base address big enough to fit all segments, so we first create a + // mapping for the total size just to find a region that is big enough. + // + // It is safe to unmap it immediately with racing with another mapping + // because we are the only one in control of the MemoryManager. + // + // Note that the vaddr of the first PT_LOAD segment is ignored when + // choosing the load address (even if it is non-zero). The vaddr does + // become an offset from that load address. + var offset usermem.Addr + if info.sharedObject { + totalSize := end - start + totalSize, ok := totalSize.RoundUp() + if !ok { + ctx.Infof("ELF PT_LOAD segments too big") + return loadedELF{}, syserror.ENOEXEC + } + + var err error + offset, err = m.MMap(ctx, memmap.MMapOpts{ + Length: uint64(totalSize), + Addr: sharedLoadOffset, + Private: true, + }) + if err != nil { + ctx.Infof("Error allocating address space for shared object: %v", err) + return loadedELF{}, err + } + if err := m.MUnmap(ctx, offset, uint64(totalSize)); err != nil { + panic(fmt.Sprintf("Failed to unmap base address: %v", err)) + } + + start, ok = start.AddLength(uint64(offset)) + if !ok { + panic(fmt.Sprintf("Start %#x + offset %#x overflows?", start, offset)) + } + + end, ok = end.AddLength(uint64(offset)) + if !ok { + panic(fmt.Sprintf("End %#x + offset %#x overflows?", end, offset)) + } + + info.entry, ok = info.entry.AddLength(uint64(offset)) + if !ok { + ctx.Infof("Entrypoint %#x + offset %#x overflows? Is the entrypoint within a segment?", info.entry, offset) + return loadedELF{}, err + } + } + + // Map PT_LOAD segments. + for _, phdr := range info.phdrs { + switch phdr.Type { + case elf.PT_LOAD: + if phdr.Memsz == 0 { + // No need to load segments with size 0, but + // they exist in some binaries. 
+ continue + } + + if err := mapSegment(ctx, m, f, &phdr, offset); err != nil { + ctx.Infof("Failed to map PT_LOAD segment: %+v", phdr) + return loadedELF{}, err + } + } + } + + // This assumes that the first segment contains the ELF headers. This + // may not be true in a malformed ELF, but Linux makes the same + // assumption. + phdrAddr, ok := start.AddLength(info.phdrOff) + if !ok { + ctx.Warningf("ELF start address %#x + phdr offset %#x overflows", start, info.phdrOff) + phdrAddr = 0 + } + + return loadedELF{ + os: info.os, + arch: info.arch, + entry: info.entry, + start: start, + end: end, + interpreter: interpreter, + phdrAddr: phdrAddr, + phdrSize: info.phdrSize, + phdrNum: len(info.phdrs), + }, nil +} + +// loadInitialELF loads f into mm. +// +// It creates an arch.Context for the ELF and prepares the mm for this arch. +// +// It does not load the ELF interpreter, or return any auxv entries. +// +// Preconditions: +// * f is an ELF file +// * f is the first ELF loaded into m +func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { + info, err := parseHeader(ctx, f) + if err != nil { + ctx.Infof("Failed to parse initial ELF: %v", err) + return loadedELF{}, nil, err + } + + // Create the arch.Context now so we can prepare the mmap layout before + // mapping anything. + ac := arch.New(info.arch, fs) + + l, err := m.SetMmapLayout(ac, limits.FromContext(ctx)) + if err != nil { + ctx.Warningf("Failed to set mmap layout: %v", err) + return loadedELF{}, nil, err + } + + // PIELoadAddress tries to move the ELF out of the way of the default + // mmap base to ensure that the initial brk has sufficient space to + // grow. + le, err := loadParsedELF(ctx, m, f, info, ac.PIELoadAddress(l)) + return le, ac, err +} + +// loadInterpreterELF loads f into mm. +// +// The interpreter must be for the same OS/Arch as the initial ELF. +// +// It does not return any auxv entries. 
+// +// Preconditions: +// * f is an ELF file +func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) { + info, err := parseHeader(ctx, f) + if err != nil { + if err == syserror.ENOEXEC { + // Bad interpreter. + err = syserror.ELIBBAD + } + return loadedELF{}, err + } + + if info.os != initial.os { + ctx.Infof("Initial ELF OS %v and interpreter ELF OS %v differ", initial.os, info.os) + return loadedELF{}, syserror.ELIBBAD + } + if info.arch != initial.arch { + ctx.Infof("Initial ELF arch %v and interpreter ELF arch %v differ", initial.arch, info.arch) + return loadedELF{}, syserror.ELIBBAD + } + + // The interpreter is not given a load offset, as its location does not + // affect brk. + return loadParsedELF(ctx, m, f, info, 0) +} + +// loadELF loads f into the Task address space. +// +// If loadELF returns ErrSwitchFile it should be called again with the returned +// path and argv. +// +// Preconditions: +// * f is an ELF file +func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { + bin, ac, err := loadInitialELF(ctx, m, fs, f) + if err != nil { + ctx.Infof("Error loading binary: %v", err) + return loadedELF{}, nil, err + } + + var interp loadedELF + if bin.interpreter != "" { + d, i, err := openPath(ctx, mounts, root, wd, maxTraversals, bin.interpreter) + if err != nil { + ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) + return loadedELF{}, nil, err + } + defer i.DecRef() + // We don't need the Dirent. + d.DecRef() + + interp, err = loadInterpreterELF(ctx, m, i, bin) + if err != nil { + ctx.Infof("Error loading interpreter: %v", err) + return loadedELF{}, nil, err + } + + if interp.interpreter != "" { + // No recursive interpreters! 
+ ctx.Infof("Interpreter requires an interpreter") + return loadedELF{}, nil, syserror.ENOEXEC + } + } + + // ELF-specific auxv entries. + bin.auxv = arch.Auxv{ + arch.AuxEntry{linux.AT_PHDR, bin.phdrAddr}, + arch.AuxEntry{linux.AT_PHENT, usermem.Addr(bin.phdrSize)}, + arch.AuxEntry{linux.AT_PHNUM, usermem.Addr(bin.phdrNum)}, + arch.AuxEntry{linux.AT_ENTRY, bin.entry}, + } + if bin.interpreter != "" { + bin.auxv = append(bin.auxv, arch.AuxEntry{linux.AT_BASE, interp.start}) + + // Start in the interpreter. + // N.B. AT_ENTRY above contains the *original* entry point. + bin.entry = interp.entry + } + + return bin, ac, nil +} diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go new file mode 100644 index 000000000..b8ecbe92f --- /dev/null +++ b/pkg/sentry/loader/interpreter.go @@ -0,0 +1,105 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package loader + +import ( + "bytes" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + // interpreterScriptMagic identifies an interpreter script. + interpreterScriptMagic = "#!" + + // interpMaxLineLength is the maximum length for the first line of an + // interpreter script. + // + // From execve(2): "A maximum line length of 127 characters is allowed + // for the first line in a #! 
executable shell script."
+	interpMaxLineLength = 127
+)
+
+// parseInterpreterScript returns the interpreter path and argv.
+func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv, envv []string) (newpath string, newargv []string, err error) {
+	line := make([]byte, interpMaxLineLength)
+	n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0)
+	// Short read is OK.
+	if err != nil && err != io.ErrUnexpectedEOF {
+		if err == io.EOF {
+			err = syserror.ENOEXEC
+		}
+		return "", []string{}, err
+	}
+	line = line[:n]
+
+	if !bytes.Equal(line[:2], []byte(interpreterScriptMagic)) {
+		return "", []string{}, syserror.ENOEXEC
+	}
+	// Ignore #!.
+	line = line[2:]
+
+	// Ignore everything after newline.
+	// Linux silently truncates the remainder of the line if it exceeds
+	// interpMaxLineLength.
+	//
+	// N.B. i >= 0, not i > 0: a newline immediately after "#!" must also
+	// truncate, otherwise text beyond the first line would be parsed as
+	// the interpreter path.
+	i := bytes.IndexByte(line, '\n')
+	if i >= 0 {
+		line = line[:i]
+	}
+
+	// Skip any whitespace before the interpeter.
+	line = bytes.TrimLeft(line, " \t")
+
+	// Linux only looks for a space or tab delimiting the interpreter and
+	// arg.
+	//
+	// execve(2): "On Linux, the entire string following the interpreter
+	// name is passed as a single argument to the interpreter, and this
+	// string can include white space."
+	interp := line
+	var arg []byte
+	i = bytes.IndexAny(line, " \t")
+	if i >= 0 {
+		interp = line[:i]
+		if i+1 < len(line) {
+			arg = line[i+1:]
+		}
+	}
+
+	// Build the new argument list:
+	//
+	// 1. The interpreter.
+	newargv = append(newargv, string(interp))
+
+	// 2. The optional interpreter argument.
+	if len(arg) > 0 {
+		newargv = append(newargv, string(arg))
+	}
+
+	// 3. The original arguments. The original argv[0] is replaced with the
+	// full script filename.
+	if len(argv) > 0 {
+		argv[0] = filename
+	} else {
+		// []string(unknown) was not valid Go; the intended behavior per
+		// the comment above is an argv consisting of the script filename.
+		argv = []string{filename}
+	}
+	newargv = append(newargv, argv...)
+ + return string(interp), newargv, nil +} diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go new file mode 100644 index 000000000..94c281b72 --- /dev/null +++ b/pkg/sentry/loader/loader.go @@ -0,0 +1,277 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package loader loads a binary into a MemoryManager. +package loader + +import ( + "bytes" + "crypto/rand" + "io" + "path" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// readFull behaves like io.ReadFull for an *fs.File. +func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + var total int64 + for dst.NumBytes() > 0 { + n, err := f.Preadv(ctx, dst, offset+total) + total += n + if err == io.EOF && total != 0 { + return total, io.ErrUnexpectedEOF + } else if err != nil { + return total, err + } + dst = dst.DropFirst64(n) + } + return total, nil +} + +// openPath opens name for loading. +// +// openPath returns the fs.Dirent and an *fs.File for name, which is not +// installed in the Task FDMap. 
The caller takes ownership of both. +// +// name must be a readable, executable, regular file. +func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, name string) (*fs.Dirent, *fs.File, error) { + d, err := mm.FindInode(ctx, root, wd, name, maxTraversals) + if err != nil { + return nil, nil, err + } + defer d.DecRef() + + perms := fs.PermMask{ + // TODO: Linux requires only execute permission, + // not read. However, our backing filesystems may prevent us + // from reading the file without read permission. + // + // Additionally, a task with a non-readable executable has + // additional constraints on access via ptrace and procfs. + Read: true, + Execute: true, + } + if err := d.Inode.CheckPermission(ctx, perms); err != nil { + return nil, nil, err + } + + // If they claim it's a directory, then make sure. + // + // N.B. we reject directories below, but we must first reject + // non-directories passed as directories. + if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) { + return nil, nil, syserror.ENOTDIR + } + + // No exec-ing directories, pipes, etc! + if !fs.IsRegular(d.Inode.StableAttr) { + ctx.Infof("Error regularing %s: %v", name, d.Inode.StableAttr) + return nil, nil, syserror.EACCES + } + + // Create a new file. + file, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + return nil, nil, err + } + + // We must be able to read at arbitrary offsets. + if !file.Flags().Pread { + file.DecRef() + ctx.Infof("%s cannot be read at an offset: %+v", name, file.Flags()) + return nil, nil, syserror.EACCES + } + + // Grab a reference for the caller. + d.IncRef() + return d, file, nil +} + +// allocStack allocates and maps a stack in to any available part of the address space. 
+func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch.Stack, error) { + ar, err := m.MapStack(ctx) + if err != nil { + return nil, err + } + return &arch.Stack{a, m, ar.End}, nil +} + +const ( + // maxLoaderAttempts is the maximum number of attempts to try to load + // an interpreter scripts, to prevent loops. 6 (inital + 5 changes) is + // what the Linux kernel allows (fs/exec.c:search_binary_handler). + maxLoaderAttempts = 6 +) + +// loadPath resolves filename to a binary and loads it. +// +// It returns: +// * loadedELF, description of the loaded binary +// * arch.Context matching the binary arch +// * fs.Dirent of the binary file +// * Possibly updated argv +func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { + for i := 0; i < maxLoaderAttempts; i++ { + d, f, err := openPath(ctx, mounts, root, wd, maxTraversals, filename) + if err != nil { + ctx.Infof("Error opening %s: %v", filename, err) + return loadedELF{}, nil, nil, nil, err + } + defer f.DecRef() + // We will return d in the successful case, but defer a DecRef + // for intermediate loops and failure cases. + defer d.DecRef() + + // Check the header. Is this an ELF or interpreter script? + var hdr [4]uint8 + // N.B. We assume that reading from a regular file cannot block. + _, err = readFull(ctx, f, usermem.BytesIOSequence(hdr[:]), 0) + // Allow unexpected EOF, as a valid executable could be only three + // bytes (e.g., #!a). 
+ if err != nil && err != io.ErrUnexpectedEOF { + if err == io.EOF { + err = syserror.ENOEXEC + } + return loadedELF{}, nil, nil, nil, err + } + + switch { + case bytes.Equal(hdr[:], []byte(elfMagic)): + loaded, ac, err := loadELF(ctx, m, mounts, root, wd, maxTraversals, fs, f) + if err != nil { + ctx.Infof("Error loading ELF: %v", err) + return loadedELF{}, nil, nil, nil, err + } + // An ELF is always terminal. Hold on to d. + d.IncRef() + return loaded, ac, d, argv, err + case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): + newpath, newargv, err := parseInterpreterScript(ctx, filename, f, argv, envv) + if err != nil { + ctx.Infof("Error loading interpreter script: %v", err) + return loadedELF{}, nil, nil, nil, err + } + filename = newpath + argv = newargv + default: + ctx.Infof("Unknown magic: %v", hdr) + return loadedELF{}, nil, nil, nil, syserror.ENOEXEC + } + } + + return loadedELF{}, nil, nil, nil, syserror.ELOOP +} + +// Load loads filename into a MemoryManager. +// +// If Load returns ErrSwitchFile it should be called again with the returned +// path and argv. +// +// Preconditions: +// * The Task MemoryManager is empty. +// * Load is called on the Task goroutine. +func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) { + // Load the binary itself. + loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv) + if err != nil { + ctx.Infof("Failed to load %s: %v", filename, err) + return 0, nil, "", err + } + defer d.DecRef() + + // Load the VDSO. + vdsoAddr, err := loadVDSO(ctx, m, vdso, loaded) + if err != nil { + ctx.Infof("Error loading VDSO: %v", err) + return 0, nil, "", err + } + + // Setup the heap. brk starts at the next page after the end of the + // binary. 
Userspace can assume that the remainer of the page after + // loaded.end is available for its use. + e, ok := loaded.end.RoundUp() + if !ok { + ctx.Warningf("brk overflows: %#x", loaded.end) + return 0, nil, "", syserror.ENOEXEC + } + m.BrkSetup(ctx, e) + + // Allocate our stack. + stack, err := allocStack(ctx, m, ac) + if err != nil { + ctx.Infof("Failed to allocate stack: %v", err) + return 0, nil, "", err + } + + // Push the original filename to the stack, for AT_EXECFN. + execfn, err := stack.Push(filename) + if err != nil { + ctx.Infof("Failed to push exec filename: %v", err) + return 0, nil, "", err + } + + // Push 16 random bytes on the stack which AT_RANDOM will point to. + var b [16]byte + if _, err := rand.Read(b[:]); err != nil { + ctx.Infof("Failed to read random bytes: %v", err) + return 0, nil, "", err + } + random, err := stack.Push(b) + if err != nil { + ctx.Infof("Failed to push random bytes: %v", err) + return 0, nil, "", err + } + + // Add generic auxv entries + auxv := append(loaded.auxv, arch.Auxv{ + arch.AuxEntry{linux.AT_CLKTCK, linux.CLOCKS_PER_SEC}, + arch.AuxEntry{linux.AT_EXECFN, execfn}, + arch.AuxEntry{linux.AT_RANDOM, random}, + arch.AuxEntry{linux.AT_PAGESZ, usermem.PageSize}, + arch.AuxEntry{linux.AT_SYSINFO_EHDR, vdsoAddr}, + }...) + auxv = append(auxv, extraAuxv...) 
+ + sl, err := stack.Load(argv, envv, auxv) + if err != nil { + ctx.Infof("Failed to load stack: %v", err) + return 0, nil, "", err + } + + m.SetArgvStart(sl.ArgvStart) + m.SetArgvEnd(sl.ArgvEnd) + m.SetEnvvStart(sl.EnvvStart) + m.SetEnvvEnd(sl.EnvvEnd) + m.SetAuxv(auxv) + m.SetExecutable(d) + + ac.SetIP(uintptr(loaded.entry)) + ac.SetStack(uintptr(stack.Bottom)) + + name := path.Base(filename) + if len(name) > linux.TASK_COMM_LEN-1 { + name = name[:linux.TASK_COMM_LEN-1] + } + + return loaded.os, ac, name, nil +} diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go new file mode 100644 index 000000000..ce4f6f5d9 --- /dev/null +++ b/pkg/sentry/loader/vdso.go @@ -0,0 +1,382 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package loader + +import ( + "debug/elf" + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// byteReaderFileOperations implements fs.FileOperations for reading +// from a []byte source. +type byteReader struct { + fsutil.NoopRelease + fsutil.PipeSeek + fsutil.NotDirReaddir + fsutil.NoFsync + fsutil.NoopFlush + fsutil.NoMMap + fsutil.NoIoctl + waiter.AlwaysReady + data []byte +} + +type fileContext struct { + context.Context +} + +func (f *fileContext) Value(key interface{}) interface{} { + switch key { + case uniqueid.CtxGlobalUniqueID: + return uint64(0) + default: + return f.Context.Value(key) + } +} + +func newByteReaderFile(data []byte) *fs.File { + dirent := fs.NewTransientDirent(nil) + flags := fs.FileFlags{Read: true, Pread: true} + return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{ + data: data, + }) +} + +func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if offset >= int64(len(b.data)) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, b.data[offset:]) + return int64(n), err +} + +func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset 
int64) (int64, error) { + panic("Write not supported") +} + +// validateVDSO checks that the VDSO can be loaded by loadVDSO. +// +// VDSOs are special (see below). Since we are going to map the VDSO directly +// rather than using a normal loading process, we require that the PT_LOAD +// segments have the same layout in the ELF as they expect to have in memory. +// +// Namely, this means that we must verify: +// * PT_LOAD file offsets are equivalent to the memory offset from the first +// segment. +// * No extra zeroed space (memsz) is required. +// * PT_LOAD segments are in order. +// * No two PT_LOAD segments occupy parts of the same page. +// * PT_LOAD segments don't extend beyond the end of the file. +// +// ctx may be nil if f does not need it. +func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) { + info, err := parseHeader(ctx, f) + if err != nil { + log.Infof("Unable to parse VDSO header: %v", err) + return elfInfo{}, err + } + + var first *elf.ProgHeader + var prev *elf.ProgHeader + var prevEnd usermem.Addr + for i, phdr := range info.phdrs { + if phdr.Type != elf.PT_LOAD { + continue + } + + if first == nil { + first = &info.phdrs[i] + if phdr.Off != 0 { + log.Warningf("First PT_LOAD segment has non-zero file offset") + return elfInfo{}, syserror.ENOEXEC + } + } + + memoryOffset := phdr.Vaddr - first.Vaddr + if memoryOffset != phdr.Off { + log.Warningf("PT_LOAD segment memory offset %#x != file offset %#x", memoryOffset, phdr.Off) + return elfInfo{}, syserror.ENOEXEC + } + + // memsz larger than filesz means that extra zeroed space should be + // provided at the end of the segment. Since we are mapping the ELF + // directly, we don't want to just overwrite part of the ELF with + // zeroes. 
+ if phdr.Memsz != phdr.Filesz { + log.Warningf("PT_LOAD segment memsz %#x != filesz %#x", phdr.Memsz, phdr.Filesz) + return elfInfo{}, syserror.ENOEXEC + } + + start := usermem.Addr(memoryOffset) + end, ok := start.AddLength(phdr.Memsz) + if !ok { + log.Warningf("PT_LOAD segment size overflows: %#x + %#x", start, end) + return elfInfo{}, syserror.ENOEXEC + } + if uint64(end) > size { + log.Warningf("PT_LOAD segment end %#x extends beyond end of file %#x", end, size) + return elfInfo{}, syserror.ENOEXEC + } + + if prev != nil { + if start < prevEnd { + log.Warningf("PT_LOAD segments out of order") + return elfInfo{}, syserror.ENOEXEC + } + + // We mprotect entire pages, so each segment must be in + // its own page. + prevEndPage := prevEnd.RoundDown() + startPage := start.RoundDown() + if prevEndPage >= startPage { + log.Warningf("PT_LOAD segments share a page: %#x", prevEndPage) + return elfInfo{}, syserror.ENOEXEC + } + } + prev = &info.phdrs[i] + prevEnd = end + } + + return info, nil +} + +// VDSO describes a VDSO. +// +// NOTE: to support multiple architectures or operating systems, this +// would need to contain a VDSO for each. +type VDSO struct { + // ParamPage is the VDSO parameter page. This page should be updated to + // inform the VDSO for timekeeping data. + ParamPage *mm.SpecialMappable + + // vdso is the VDSO ELF itself. + vdso *mm.SpecialMappable + + // os is the operating system targeted by the VDSO. + os abi.OS + + // arch is the architecture targeted by the VDSO. + arch arch.Arch + + // phdrs are the VDSO ELF phdrs. + phdrs []elf.ProgHeader `state:".([]elfProgHeader)"` +} + +// PrepareVDSO validates the system VDSO and returns a VDSO, containing the +// param page for updating by the kernel. +func PrepareVDSO(p platform.Platform) (*VDSO, error) { + vdsoFile := newByteReaderFile(vdsoBin) + + // First make sure the VDSO is valid. vdsoFile does not use ctx, so a + // nil context can be passed. 
+ info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin))) + if err != nil { + return nil, err + } + + // Then copy it into a VDSO mapping. + size, ok := usermem.Addr(len(vdsoBin)).RoundUp() + if !ok { + return nil, fmt.Errorf("VDSO size overflows? %#x", len(vdsoBin)) + } + + vdso, err := p.Memory().Allocate(uint64(size), usage.System) + if err != nil { + return nil, fmt.Errorf("unable to allocate VDSO memory: %v", err) + } + + ims, err := p.Memory().MapInternal(vdso, usermem.ReadWrite) + if err != nil { + p.Memory().DecRef(vdso) + return nil, fmt.Errorf("unable to map VDSO memory: %v", err) + } + + _, err = safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(vdsoBin))) + if err != nil { + p.Memory().DecRef(vdso) + return nil, fmt.Errorf("unable to copy VDSO into memory: %v", err) + } + + // Finally, allocate a param page for this VDSO. + paramPage, err := p.Memory().Allocate(usermem.PageSize, usage.System) + if err != nil { + p.Memory().DecRef(vdso) + return nil, fmt.Errorf("unable to allocate VDSO param page: %v", err) + } + + return &VDSO{ + ParamPage: mm.NewSpecialMappable("[vvar]", p, paramPage), + // TODO: Don't advertise the VDSO, as some applications may + // not be able to handle multiple [vdso] hints. + vdso: mm.NewSpecialMappable("", p, vdso), + phdrs: info.phdrs, + }, nil +} + +// loadVDSO loads the VDSO into m. +// +// VDSOs are special. +// +// VDSOs are fully position independent. However, instead of loading a VDSO +// like a normal ELF binary, mapping only the PT_LOAD segments, the Linux +// kernel simply directly maps the entire file into process memory, with very +// little real ELF parsing. +// +// NOTE: This means that userspace can, and unfortunately does, +// depend on parts of the ELF that would normally not be mapped. To maintain +// compatibility with such binaries, we load the VDSO much like Linux. +// +// loadVDSO takes a reference on the VDSO and parameter page FrameRegions. 
+func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (usermem.Addr, error) { + if v == nil { + // Should be used only by tests. + ctx.Warningf("No VDSO provided, skipping VDSO mapping") + return 0, nil + } + + if v.os != bin.os { + ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os) + return 0, syserror.ENOEXEC + } + if v.arch != bin.arch { + ctx.Warningf("Binary ELF arch %v and VDSO ELF arch %v differ", bin.arch, v.arch) + return 0, syserror.ENOEXEC + } + + // Reserve address space for the VDSO and its parameter page, which is + // mapped just before the VDSO. + mapSize := v.vdso.Length() + v.ParamPage.Length() + addr, err := m.MMap(ctx, memmap.MMapOpts{ + Length: mapSize, + Private: true, + }) + if err != nil { + ctx.Infof("Unable to reserve VDSO address space: %v", err) + return 0, err + } + + // Now map the param page. + _, err = m.MMap(ctx, memmap.MMapOpts{ + Length: v.ParamPage.Length(), + MappingIdentity: v.ParamPage, + Mappable: v.ParamPage, + Addr: addr, + Fixed: true, + Unmap: true, + Private: true, + Perms: usermem.Read, + MaxPerms: usermem.Read, + }) + if err != nil { + ctx.Infof("Unable to map VDSO param page: %v", err) + return 0, err + } + + // Now map the VDSO itself. + vdsoAddr, ok := addr.AddLength(v.ParamPage.Length()) + if !ok { + panic(fmt.Sprintf("Part of mapped range overflows? %#x + %#x", addr, v.ParamPage.Length())) + } + _, err = m.MMap(ctx, memmap.MMapOpts{ + Length: v.vdso.Length(), + MappingIdentity: v.vdso, + Mappable: v.vdso, + Addr: vdsoAddr, + Fixed: true, + Unmap: true, + Private: true, + Perms: usermem.Read, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + ctx.Infof("Unable to map VDSO: %v", err) + return 0, err + } + + vdsoEnd, ok := vdsoAddr.AddLength(v.vdso.Length()) + if !ok { + panic(fmt.Sprintf("VDSO mapping overflows? %#x + %#x", vdsoAddr, v.vdso.Length())) + } + + // Set additional protections for the individual segments. 
+ var first *elf.ProgHeader + for i, phdr := range v.phdrs { + if phdr.Type != elf.PT_LOAD { + continue + } + + if first == nil { + first = &v.phdrs[i] + } + + memoryOffset := phdr.Vaddr - first.Vaddr + segAddr, ok := vdsoAddr.AddLength(memoryOffset) + if !ok { + ctx.Warningf("PT_LOAD segment address overflows: %#x + %#x", segAddr, memoryOffset) + return 0, syserror.ENOEXEC + } + segPage := segAddr.RoundDown() + segSize := usermem.Addr(phdr.Memsz) + segSize, ok = segSize.AddLength(segAddr.PageOffset()) + if !ok { + ctx.Warningf("PT_LOAD segment memsize %#x + offset %#x overflows", phdr.Memsz, segAddr.PageOffset()) + return 0, syserror.ENOEXEC + } + segSize, ok = segSize.RoundUp() + if !ok { + ctx.Warningf("PT_LOAD segment size overflows: %#x", phdr.Memsz+segAddr.PageOffset()) + return 0, syserror.ENOEXEC + } + segEnd, ok := segPage.AddLength(uint64(segSize)) + if !ok { + ctx.Warningf("PT_LOAD segment range overflows: %#x + %#x", segAddr, segSize) + return 0, syserror.ENOEXEC + } + if segEnd > vdsoEnd { + ctx.Warningf("PT_LOAD segment ends beyond VDSO: %#x > %#x", segEnd, vdsoEnd) + return 0, syserror.ENOEXEC + } + + perms := progFlagsAsPerms(phdr.Flags) + if perms != usermem.Read { + if err := m.MProtect(segPage, uint64(segSize), perms, false); err != nil { + ctx.Warningf("Unable to set PT_LOAD segment protections %+v at [%#x, %#x): %v", perms, segAddr, segEnd, err) + return 0, syserror.ENOEXEC + } + } + } + + return vdsoAddr, nil +} diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go new file mode 100644 index 000000000..92004ad9e --- /dev/null +++ b/pkg/sentry/loader/vdso_state.go @@ -0,0 +1,47 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package loader + +import ( + "debug/elf" +) + +type elfProgHeader struct { + Type elf.ProgType + Flags elf.ProgFlag + Off uint64 + Vaddr uint64 + Paddr uint64 + Filesz uint64 + Memsz uint64 + Align uint64 +} + +// savePhdrs is invoked by stateify. +func (v *VDSO) savePhdrs() []elfProgHeader { + s := make([]elfProgHeader, 0, len(v.phdrs)) + for _, h := range v.phdrs { + s = append(s, elfProgHeader(h)) + } + return s +} + +// loadPhdrs is invoked by stateify. +func (v *VDSO) loadPhdrs(s []elfProgHeader) { + v.phdrs = make([]elf.ProgHeader, 0, len(s)) + for _, h := range s { + v.phdrs = append(v.phdrs, elf.ProgHeader(h)) + } +} diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD new file mode 100644 index 000000000..7525fea45 --- /dev/null +++ b/pkg/sentry/memmap/BUILD @@ -0,0 +1,71 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "memmap_state", + srcs = [ + "mappable_range.go", + "mapping_set.go", + "mapping_set_impl.go", + ], + out = "memmap_state.go", + package = "memmap", +) + +go_template_instance( + name = "mappable_range", + out = "mappable_range.go", + package = "memmap", + prefix = "Mappable", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "mapping_set_impl", + out = "mapping_set_impl.go", + package = "memmap", + prefix = "Mapping", + template = 
"//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "MappableRange", + "Value": "MappingsOfRange", + "Functions": "mappingSetFunctions", + }, +) + +go_library( + name = "memmap", + srcs = [ + "mappable_range.go", + "mapping_set.go", + "mapping_set_impl.go", + "memmap.go", + "memmap_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/platform", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) + +go_test( + name = "memmap_test", + size = "small", + srcs = ["mapping_set_test.go"], + embed = [":memmap"], + deps = ["//pkg/sentry/usermem"], +) diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go new file mode 100644 index 000000000..0cd42ffbf --- /dev/null +++ b/pkg/sentry/memmap/mapping_set.go @@ -0,0 +1,245 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memmap + +import ( + "fmt" + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// MappingSet maps offsets into a Mappable to mappings of those offsets. It is +// used to implement Mappable.AddMapping and RemoveMapping for Mappables that +// may need to call MappingSpace.Invalidate. 
+// +// type MappingSet + +// MappingsOfRange is the value type of MappingSet, and represents the set of +// all mappings of the corresponding MappableRange. +// +// Using a map offers O(1) lookups in RemoveMapping and +// mappingSetFunctions.Merge. +type MappingsOfRange map[MappingOfRange]struct{} + +// MappingOfRange represents a mapping of a MappableRange. +type MappingOfRange struct { + MappingSpace MappingSpace + AddrRange usermem.AddrRange +} + +func (r MappingOfRange) invalidate(opts InvalidateOpts) { + r.MappingSpace.Invalidate(r.AddrRange, opts) +} + +// String implements fmt.Stringer.String. +func (r MappingOfRange) String() string { + return fmt.Sprintf("%#v", r.AddrRange) +} + +// mappingSetFunctions implements segment.Functions for MappingSet. +type mappingSetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (mappingSetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (mappingSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (mappingSetFunctions) ClearValue(v *MappingsOfRange) { + *v = MappingsOfRange{} +} + +// Merge implements segment.Functions.Merge. +// +// Since each value is a map of MappingOfRanges, values can only be merged if +// all MappingOfRanges in each map have an exact pair in the other map, forming +// one contiguous region. +func (mappingSetFunctions) Merge(r1 MappableRange, val1 MappingsOfRange, r2 MappableRange, val2 MappingsOfRange) (MappingsOfRange, bool) { + if len(val1) != len(val2) { + return nil, false + } + + merged := make(MappingsOfRange, len(val1)) + + // Each MappingOfRange in val1 must have a matching region in val2, forming + // one contiguous region. + for k1 := range val1 { + // We expect val2 to to contain a key that forms a contiguous + // region with k1. 
+ k2 := MappingOfRange{ + MappingSpace: k1.MappingSpace, + AddrRange: usermem.AddrRange{ + Start: k1.AddrRange.End, + End: k1.AddrRange.End + usermem.Addr(r2.Length()), + }, + } + if _, ok := val2[k2]; !ok { + return nil, false + } + + // OK. Add it to the merged map. + merged[MappingOfRange{ + MappingSpace: k1.MappingSpace, + AddrRange: usermem.AddrRange{ + Start: k1.AddrRange.Start, + End: k2.AddrRange.End, + }, + }] = struct{}{} + } + + return merged, true +} + +// Split implements segment.Functions.Split. +func (mappingSetFunctions) Split(r MappableRange, val MappingsOfRange, split uint64) (MappingsOfRange, MappingsOfRange) { + if split <= r.Start || split >= r.End { + panic(fmt.Sprintf("split is not within range %v", r)) + } + + m1 := make(MappingsOfRange, len(val)) + m2 := make(MappingsOfRange, len(val)) + + // split is a value in MappableRange, we need the offset into the + // corresponding MappingsOfRange. + offset := usermem.Addr(split - r.Start) + for k := range val { + k1 := MappingOfRange{ + MappingSpace: k.MappingSpace, + AddrRange: usermem.AddrRange{ + Start: k.AddrRange.Start, + End: k.AddrRange.Start + offset, + }, + } + m1[k1] = struct{}{} + + k2 := MappingOfRange{ + MappingSpace: k.MappingSpace, + AddrRange: usermem.AddrRange{ + Start: k.AddrRange.Start + offset, + End: k.AddrRange.End, + }, + } + m2[k2] = struct{}{} + } + + return m1, m2 +} + +// subsetMapping returns the MappingOfRange that maps subsetRange, given that +// ms maps wholeRange beginning at addr. +// +// For instance, suppose wholeRange = [0x0, 0x2000) and addr = 0x4000, +// indicating that ms maps addresses [0x4000, 0x6000) to MappableRange [0x0, +// 0x2000). Then for subsetRange = [0x1000, 0x2000), subsetMapping returns a +// MappingOfRange for which AddrRange = [0x5000, 0x6000). 
+func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr usermem.Addr) MappingOfRange { + if !wholeRange.IsSupersetOf(subsetRange) { + panic(fmt.Sprintf("%v is not a superset of %v", wholeRange, subsetRange)) + } + + offset := subsetRange.Start - wholeRange.Start + start := addr + usermem.Addr(offset) + return MappingOfRange{ + MappingSpace: ms, + AddrRange: usermem.AddrRange{ + Start: start, + End: start + usermem.Addr(subsetRange.Length()), + }, + } +} + +// AddMapping adds the given mapping and returns the set of MappableRanges that +// previously had no mappings. +// +// Preconditions: As for Mappable.AddMapping. +func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64) []MappableRange { + mr := MappableRange{offset, offset + uint64(ar.Length())} + var mapped []MappableRange + seg, gap := s.Find(mr.Start) + for { + switch { + case seg.Ok() && seg.Start() < mr.End: + seg = s.Isolate(seg, mr) + seg.Value()[subsetMapping(mr, seg.Range(), ms, ar.Start)] = struct{}{} + seg, gap = seg.NextNonEmpty() + + case gap.Ok() && gap.Start() < mr.End: + gapMR := gap.Range().Intersect(mr) + mapped = append(mapped, gapMR) + // Insert a set and continue from the above case. + seg, gap = s.Insert(gap, gapMR, make(MappingsOfRange)), MappingGapIterator{} + + default: + return mapped + } + } +} + +// RemoveMapping removes the given mapping and returns the set of +// MappableRanges that now have no mappings. +// +// Preconditions: As for Mappable.RemoveMapping. +func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64) []MappableRange { + mr := MappableRange{offset, offset + uint64(ar.Length())} + var unmapped []MappableRange + + seg := s.FindSegment(mr.Start) + if !seg.Ok() { + panic(fmt.Sprintf("MappingSet.RemoveMapping(%v): no segment containing %#x: %v", mr, mr.Start, s)) + } + for seg.Ok() && seg.Start() < mr.End { + // Ensure this segment is limited to our range. 
+ seg = s.Isolate(seg, mr) + + // Remove this part of the mapping. + mappings := seg.Value() + delete(mappings, subsetMapping(mr, seg.Range(), ms, ar.Start)) + + if len(mappings) == 0 { + unmapped = append(unmapped, seg.Range()) + seg = s.Remove(seg).NextSegment() + } else { + seg = seg.NextSegment() + } + } + s.MergeAdjacent(mr) + return unmapped +} + +// Invalidate calls MappingSpace.Invalidate for all mappings of offsets in mr. +func (s *MappingSet) Invalidate(mr MappableRange, opts InvalidateOpts) { + for seg := s.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { + segMR := seg.Range() + for m := range seg.Value() { + region := subsetMapping(segMR, segMR.Intersect(mr), m.MappingSpace, m.AddrRange.Start) + region.invalidate(opts) + } + } +} + +// InvalidateAll calls MappingSpace.Invalidate for all mappings of s. +func (s *MappingSet) InvalidateAll(opts InvalidateOpts) { + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for m := range seg.Value() { + m.invalidate(opts) + } + } +} diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go new file mode 100644 index 000000000..10668d404 --- /dev/null +++ b/pkg/sentry/memmap/mapping_set_test.go @@ -0,0 +1,186 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package memmap + +import ( + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type testMappingSpace struct { + // Ideally we'd store the full ranges that were invalidated, rather + // than individual calls to Invalidate, as they are an implementation + // detail, but this is the simplest way for now. + inv []usermem.AddrRange +} + +func (n *testMappingSpace) reset() { + n.inv = []usermem.AddrRange{} +} + +func (n *testMappingSpace) Invalidate(ar usermem.AddrRange, opts InvalidateOpts) { + n.inv = append(n.inv, ar) +} + +func TestAddRemoveMapping(t *testing.T) { + set := MappingSet{} + ms := &testMappingSpace{} + + mapped := set.AddMapping(ms, usermem.AddrRange{0x10000, 0x12000}, 0x1000) + if got, want := mapped, []MappableRange{{0x1000, 0x3000}}; !reflect.DeepEqual(got, want) { + t.Errorf("AddMapping: got %+v, wanted %+v", got, want) + } + + // Mappings (usermem.AddrRanges => memmap.MappableRange): + // [0x10000, 0x12000) => [0x1000, 0x3000) + t.Log(&set) + + mapped = set.AddMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000) + if len(mapped) != 0 { + t.Errorf("AddMapping: got %+v, wanted []", mapped) + } + + // Mappings: + // [0x10000, 0x11000) => [0x1000, 0x2000) + // [0x11000, 0x12000) and [0x20000, 0x21000) => [0x2000, 0x3000) + t.Log(&set) + + mapped = set.AddMapping(ms, usermem.AddrRange{0x30000, 0x31000}, 0x4000) + if got, want := mapped, []MappableRange{{0x4000, 0x5000}}; !reflect.DeepEqual(got, want) { + t.Errorf("AddMapping: got %+v, wanted %+v", got, want) + } + + // Mappings: + // [0x10000, 0x11000) => [0x1000, 0x2000) + // [0x11000, 0x12000) and [0x20000, 0x21000) => [0x2000, 0x3000) + // [0x30000, 0x31000) => [0x4000, 0x5000) + t.Log(&set) + + mapped = set.AddMapping(ms, usermem.AddrRange{0x12000, 0x15000}, 0x3000) + if got, want := mapped, []MappableRange{{0x3000, 0x4000}, {0x5000, 0x6000}}; !reflect.DeepEqual(got, want) { + t.Errorf("AddMapping: got %+v, wanted %+v", got, want) + } + + // Mappings: + // 
[0x10000, 0x11000) => [0x1000, 0x2000) + // [0x11000, 0x12000) and [0x20000, 0x21000) => [0x2000, 0x3000) + // [0x12000, 0x13000) => [0x3000, 0x4000) + // [0x13000, 0x14000) and [0x30000, 0x31000) => [0x4000, 0x5000) + // [0x14000, 0x15000) => [0x5000, 0x6000) + t.Log(&set) + + unmapped := set.RemoveMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0x1000) + if got, want := unmapped, []MappableRange{{0x1000, 0x2000}}; !reflect.DeepEqual(got, want) { + t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) + } + + // Mappings: + // [0x11000, 0x12000) and [0x20000, 0x21000) => [0x2000, 0x3000) + // [0x12000, 0x13000) => [0x3000, 0x4000) + // [0x13000, 0x14000) and [0x30000, 0x31000) => [0x4000, 0x5000) + // [0x14000, 0x15000) => [0x5000, 0x6000) + t.Log(&set) + + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000) + if len(unmapped) != 0 { + t.Errorf("RemoveMapping: got %+v, wanted []", unmapped) + } + + // Mappings: + // [0x11000, 0x13000) => [0x2000, 0x4000) + // [0x13000, 0x14000) and [0x30000, 0x31000) => [0x4000, 0x5000) + // [0x14000, 0x15000) => [0x5000, 0x6000) + t.Log(&set) + + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x11000, 0x15000}, 0x2000) + if got, want := unmapped, []MappableRange{{0x2000, 0x4000}, {0x5000, 0x6000}}; !reflect.DeepEqual(got, want) { + t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) + } + + // Mappings: + // [0x30000, 0x31000) => [0x4000, 0x5000) + t.Log(&set) + + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x30000, 0x31000}, 0x4000) + if got, want := unmapped, []MappableRange{{0x4000, 0x5000}}; !reflect.DeepEqual(got, want) { + t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) + } +} + +func TestInvalidateWholeMapping(t *testing.T) { + set := MappingSet{} + ms := &testMappingSpace{} + + set.AddMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0) + // Mappings: + // [0x10000, 0x11000) => [0, 0x1000) + t.Log(&set) + set.Invalidate(MappableRange{0, 0x1000}, InvalidateOpts{}) + 
if got, want := ms.inv, []usermem.AddrRange{{0x10000, 0x11000}}; !reflect.DeepEqual(got, want) { + t.Errorf("Invalidate: got %+v, wanted %+v", got, want) + } +} + +func TestInvalidatePartialMapping(t *testing.T) { + set := MappingSet{} + ms := &testMappingSpace{} + + set.AddMapping(ms, usermem.AddrRange{0x10000, 0x13000}, 0) + // Mappings: + // [0x10000, 0x13000) => [0, 0x3000) + t.Log(&set) + set.Invalidate(MappableRange{0x1000, 0x2000}, InvalidateOpts{}) + if got, want := ms.inv, []usermem.AddrRange{{0x11000, 0x12000}}; !reflect.DeepEqual(got, want) { + t.Errorf("Invalidate: got %+v, wanted %+v", got, want) + } +} + +func TestInvalidateMultipleMappings(t *testing.T) { + set := MappingSet{} + ms := &testMappingSpace{} + + set.AddMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0) + set.AddMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000) + // Mappings: + // [0x10000, 0x11000) => [0, 0x1000) + // [0x12000, 0x13000) => [0x2000, 0x3000) + t.Log(&set) + set.Invalidate(MappableRange{0, 0x3000}, InvalidateOpts{}) + if got, want := ms.inv, []usermem.AddrRange{{0x10000, 0x11000}, {0x20000, 0x21000}}; !reflect.DeepEqual(got, want) { + t.Errorf("Invalidate: got %+v, wanted %+v", got, want) + } +} + +func TestInvalidateOverlappingMappings(t *testing.T) { + set := MappingSet{} + ms1 := &testMappingSpace{} + ms2 := &testMappingSpace{} + + set.AddMapping(ms1, usermem.AddrRange{0x10000, 0x12000}, 0) + set.AddMapping(ms2, usermem.AddrRange{0x20000, 0x22000}, 0x1000) + // Mappings: + // ms1:[0x10000, 0x12000) => [0, 0x2000) + // ms2:[0x11000, 0x13000) => [0x1000, 0x3000) + t.Log(&set) + set.Invalidate(MappableRange{0x1000, 0x2000}, InvalidateOpts{}) + if got, want := ms1.inv, []usermem.AddrRange{{0x11000, 0x12000}}; !reflect.DeepEqual(got, want) { + t.Errorf("Invalidate: ms1: got %+v, wanted %+v", got, want) + } + if got, want := ms2.inv, []usermem.AddrRange{{0x20000, 0x21000}}; !reflect.DeepEqual(got, want) { + t.Errorf("Invalidate: ms1: got %+v, wanted %+v", got, want) + 
} +} diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go new file mode 100644 index 000000000..14fed55bc --- /dev/null +++ b/pkg/sentry/memmap/memmap.go @@ -0,0 +1,297 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memmap defines semantics for memory mappings. +package memmap + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Mappable represents a memory-mappable object, a mutable mapping from uint64 +// offsets to (platform.File, uint64 File offset) pairs. +// +// See mm/mm.go for Mappable's place in the lock order. +// +// Preconditions: For all Mappable methods, usermem.AddrRanges and +// MappableRanges must be non-empty (Length() != 0), and usermem.Addrs and +// Mappable offsets must be page-aligned. +type Mappable interface { + // AddMapping notifies the Mappable of a mapping from addresses ar in ms to + // offsets [offset, offset+ar.Length()) in this Mappable. + // + // Preconditions: offset+ar.Length() does not overflow. + AddMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64) error + + // RemoveMapping notifies the Mappable of the removal of a mapping from + // addresses ar in ms to offsets [offset, offset+ar.Length()) in this + // Mappable. 
+ // + // Preconditions: offset+ar.Length() does not overflow. The removed mapping + // must exist. + RemoveMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64) + + // CopyMapping notifies the Mappable of an attempt to copy a mapping in ms + // from srcAR to dstAR. For most Mappables, this is equivalent to + // AddMapping. + // + // CopyMapping is only called when a mapping is copied within a given + // MappingSpace; it is analogous to Linux's vm_operations_struct::mremap. + // + // Preconditions: offset+dstAR.Length() does not overflow. The mapping at + // srcAR must exist. + CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error + + // Translate returns the Mappable's current mappings for at least the range + // of offsets specified by required, and at most the range of offsets + // specified by optional. at is the set of access types that may be + // performed using the returned Translations. If not all required offsets + // are translated, it returns a non-nil error explaining why. Returned + // translations, and any mappings returned by platform.File.MapInternal for + // translated platform.Files, are valid until invalidated by a call back to + // MappingSpace.Invalidate or until the caller removes its mapping of the + // translated range. + // + // Preconditions: required.Length() > 0. optional.IsSupersetOf(required). + // required and optional must be page-aligned. The caller must have + // established a mapping for all of the queried offsets via a previous call + // to AddMapping. The caller is responsible for ensuring that calls to + // Translate synchronize with invalidation. + // + // Postconditions: See CheckTranslateResult. + Translate(ctx context.Context, required, optional MappableRange, at usermem.AccessType) ([]Translation, error) + + // InvalidateUnsavable requests that the Mappable invalidate Translations + // that cannot be preserved across save/restore. 
+ // + // Invariant: InvalidateUnsavable never races with concurrent calls to any + // other Mappable methods. + InvalidateUnsavable(ctx context.Context) error +} + +// Translations are returned by Mappable.Translate. +type Translation struct { + // Source is the translated range in the Mappable. + Source MappableRange + + // File is the mapped file. When the Translation is invalidated, pages + // mapped by File.MapInto must be unmapped, and pages mapped by + // File.MapInternal become invalid. + File platform.File + + // Offset is the offset into File at which this Translation begins. + Offset uint64 +} + +// CheckTranslateResult returns an error if (ts, terr) does not satisfy all +// postconditions for Mappable.Translate(required, optional). +// +// Preconditions: As for Mappable.Translate. +func CheckTranslateResult(required, optional MappableRange, ts []Translation, terr error) error { + // Verify that the inputs to Mappable.Translate were valid. + if !required.WellFormed() || required.Length() <= 0 { + panic(fmt.Sprintf("invalid required range: %v", required)) + } + if !usermem.Addr(required.Start).IsPageAligned() || !usermem.Addr(required.End).IsPageAligned() { + panic(fmt.Sprintf("unaligned required range: %v", required)) + } + if !optional.IsSupersetOf(required) { + panic(fmt.Sprintf("optional range %v is not a superset of required range %v", optional, required)) + } + if !usermem.Addr(optional.Start).IsPageAligned() || !usermem.Addr(optional.End).IsPageAligned() { + panic(fmt.Sprintf("unaligned optional range: %v", optional)) + } + + // The first Translation must include required.Start. 
+ if len(ts) != 0 && !ts[0].Source.Contains(required.Start) { + return fmt.Errorf("first Translation %+v does not cover start of required range %v", ts[0], required) + } + for i, t := range ts { + if !t.Source.WellFormed() || t.Source.Length() <= 0 { + return fmt.Errorf("Translation %+v has invalid Source", t) + } + if !usermem.Addr(t.Source.Start).IsPageAligned() || !usermem.Addr(t.Source.End).IsPageAligned() { + return fmt.Errorf("Translation %+v has unaligned Source", t) + } + if t.File == nil { + return fmt.Errorf("Translation %+v has nil File", t) + } + if !usermem.Addr(t.Offset).IsPageAligned() { + return fmt.Errorf("Translation %+v has unaligned Offset", t) + } + // Translations must be contiguous and in increasing order of + // Translation.Source. + if i > 0 && ts[i-1].Source.End != t.Source.Start { + return fmt.Errorf("Translations %+v and %+v are not contiguous", ts[i-1], t) + } + // At least part of each Translation must be required. + if t.Source.Intersect(required).Length() == 0 { + return fmt.Errorf("Translation %+v lies entirely outside required range %v", t, required) + } + // Translations must be constrained to the optional range. + if !optional.IsSupersetOf(t.Source) { + return fmt.Errorf("Translation %+v lies outside optional range %v", t, optional) + } + } + // If the set of Translations does not cover the entire required range, + // Translate must return a non-nil error explaining why. + if terr == nil { + if len(ts) == 0 { + return fmt.Errorf("no Translations and no error") + } + if t := ts[len(ts)-1]; !t.Source.Contains(required.End - 1) { + return fmt.Errorf("last Translation %+v does not reach end of required range %v, but Translate returned no error", t, required) + } + } + return nil +} + +// BusError may be returned by implementations of Mappable.Translate for errors +// that should result in SIGBUS delivery if they cause application page fault +// handling to fail. +type BusError struct { + // Err is the original error. 
+ Err error +} + +// Error implements error.Error. +func (b *BusError) Error() string { + return fmt.Sprintf("BusError: %v", b.Err.Error()) +} + +// MappableRange represents a range of uint64 offsets into a Mappable. +// +// type MappableRange + +// String implements fmt.Stringer.String. +func (mr MappableRange) String() string { + return fmt.Sprintf("[%#x, %#x)", mr.Start, mr.End) +} + +// MappingSpace represents a mutable mapping from usermem.Addrs to (Mappable, +// uint64 offset) pairs. +type MappingSpace interface { + // Invalidate is called to notify the MappingSpace that values returned by + // previous calls to Mappable.Translate for offsets mapped by addresses in + // ar are no longer valid. + // + // Invalidate must not take any locks preceding mm.MemoryManager.activeMu + // in the lock order. + // + // Preconditions: ar.Length() != 0. ar must be page-aligned. + Invalidate(ar usermem.AddrRange, opts InvalidateOpts) +} + +// InvalidateOpts holds options to MappingSpace.Invalidate. +type InvalidateOpts struct { + // InvalidatePrivate is true if private pages in the invalidated region + // should also be discarded, causing their data to be lost. + InvalidatePrivate bool +} + +// MappingIdentity controls the lifetime of a Mappable, and provides +// information about the Mappable for /proc/[pid]/maps. It is distinct from +// Mappable because all Mappables that are coherent must compare equal to +// support the implementation of shared futexes, but different +// MappingIdentities may represent the same Mappable, in the same way that +// multiple fs.Files may represent the same fs.Inode. (This similarity is not +// coincidental; fs.File implements MappingIdentity, and some +// fs.InodeOperations implement Mappable.) +type MappingIdentity interface { + // MappingIdentity is reference-counted. + refs.RefCounter + + // MappedName returns the application-visible name shown in + // /proc/[pid]/maps. 
+ MappedName(ctx context.Context) string + + // DeviceID returns the device number shown in /proc/[pid]/maps. + DeviceID() uint64 + + // InodeID returns the inode number shown in /proc/[pid]/maps. + InodeID() uint64 + + // Msync has the same semantics as fs.FileOperations.Fsync(ctx, + // int64(mr.Start), int64(mr.End-1), fs.SyncData). + // (fs.FileOperations.Fsync() takes an inclusive end, but mr.End is + // exclusive, hence mr.End-1.) It is defined rather than Fsync so that + // implementors don't need to depend on the fs package for fs.SyncType. + Msync(ctx context.Context, mr MappableRange) error +} + +// MMapOpts specifies a request to create a memory mapping. +type MMapOpts struct { + // Length is the length of the mapping. + Length uint64 + + // MappingIdentity controls the lifetime of Mappable, and provides + // properties of the mapping shown in /proc/[pid]/maps. If MMapOpts is used + // to successfully create a memory mapping, a reference is taken on + // MappingIdentity. + MappingIdentity MappingIdentity + + // Mappable is the Mappable to be mapped. If Mappable is nil, the mapping + // is anonymous. If Mappable is not nil, it must remain valid as long as a + // reference is held on MappingIdentity. + Mappable Mappable + + // Offset is the offset into Mappable to map. If Mappable is nil, Offset is + // ignored. + Offset uint64 + + // Addr is the suggested address for the mapping. + Addr usermem.Addr + + // Fixed specifies whether this is a fixed mapping (it must be located at + // Addr). + Fixed bool + + // Unmap specifies whether existing mappings in the range being mapped may + // be replaced. If Unmap is true, Fixed must be true. + Unmap bool + + // Perms is the set of permissions to the applied to this mapping. + Perms usermem.AccessType + + // MaxPerms limits the set of permissions that may ever apply to this + // mapping. If Mappable is not nil, all memmap.Translations returned by + // Mappable.Translate must support all accesses in MaxPerms. 
+ // + // Preconditions: MaxAccessType should be an effective AccessType, as + // access cannot be limited beyond effective AccessTypes. + MaxPerms usermem.AccessType + + // Private is true if writes to the mapping should be propagated to a copy + // that is exclusive to the MemoryManager. + Private bool + + // GrowsDown is true if the mapping should be automatically expanded + // downward on guard page faults. + GrowsDown bool + + // Precommit is true if the platform should eagerly commit resources to the + // mapping (see platform.AddressSpace.MapFile). + Precommit bool + + // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is + // empty, MappingIdentity.MappedName() will be used instead. + // + // TODO: Replace entirely with MappingIdentity? + Hint string +} diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD new file mode 100644 index 000000000..a387a0c9f --- /dev/null +++ b/pkg/sentry/memutil/BUILD @@ -0,0 +1,14 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "memutil", + srcs = [ + "memutil.go", + "memutil_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memutil", + visibility = ["//pkg/sentry:internal"], + deps = ["@org_golang_x_sys//unix:go_default_library"], +) diff --git a/pkg/sentry/memutil/memutil.go b/pkg/sentry/memutil/memutil.go new file mode 100644 index 000000000..4f245cf3c --- /dev/null +++ b/pkg/sentry/memutil/memutil.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memutil contains the utility functions for memory operations. +package memutil diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go new file mode 100644 index 000000000..32c27eb2f --- /dev/null +++ b/pkg/sentry/memutil/memutil_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memutil + +import ( + "syscall" + "unsafe" + + "golang.org/x/sys/unix" +) + +// CreateMemFD creates a memfd file and returns the fd. 
+func CreateMemFD(name string, flags int) (fd int, err error) { + p, err := syscall.BytePtrFromString(name) + if err != nil { + return -1, err + } + r0, _, e0 := syscall.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) + if e0 != 0 { + return -1, e0 + } + return int(r0), nil +} diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD new file mode 100644 index 000000000..39bde2be3 --- /dev/null +++ b/pkg/sentry/mm/BUILD @@ -0,0 +1,155 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "mm_state", + srcs = [ + "aio_context.go", + "aio_context_state.go", + "file_refcount_set.go", + "io_list.go", + "mm.go", + "pma_set.go", + "save_restore.go", + "special_mappable.go", + "vma_set.go", + ], + out = "mm_state.go", + package = "mm", +) + +go_template_instance( + name = "file_refcount_set", + out = "file_refcount_set.go", + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "mm", + prefix = "fileRefcount", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "int32", + "Functions": "fileRefcountSetFunctions", + }, +) + +go_template_instance( + name = "vma_set", + out = "vma_set.go", + consts = { + "minDegree": "8", + }, + imports = { + "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + }, + package = "mm", + prefix = "vma", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "vma", + "Functions": "vmaSetFunctions", + }, +) + +go_template_instance( + name = "pma_set", + out = "pma_set.go", + consts = { + "minDegree": "8", + }, + imports = { + "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + }, + package = "mm", + prefix = "pma", + template 
= "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "pma", + "Functions": "pmaSetFunctions", + }, +) + +go_template_instance( + name = "io_list", + out = "io_list.go", + package = "mm", + prefix = "io", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*ioResult", + }, +) + +go_library( + name = "mm", + srcs = [ + "address_space.go", + "aio_context.go", + "aio_context_state.go", + "debug.go", + "file_refcount_set.go", + "io.go", + "io_list.go", + "lifecycle.go", + "metadata.go", + "mm.go", + "mm_state.go", + "pma.go", + "pma_set.go", + "proc_pid_maps.go", + "save_restore.go", + "special_mappable.go", + "syscalls.go", + "vma.go", + "vma_set.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/mm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/atomicbitops", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip/buffer", + ], +) + +go_test( + name = "mm_test", + size = "small", + srcs = ["mm_test.go"], + embed = [":mm"], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/usermem", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md new file mode 100644 index 000000000..067733475 --- /dev/null +++ b/pkg/sentry/mm/README.md @@ -0,0 +1,279 @@ +This package provides an emulation of Linux semantics for application virtual +memory mappings. 
+ +For completeness, this document also describes aspects of the memory management +subsystem defined outside this package. + +# Background + +We begin by describing semantics for virtual memory in Linux. + +A virtual address space is defined as a collection of mappings from virtual +addresses to physical memory. However, userspace applications do not configure +mappings to physical memory directly. Instead, applications configure memory +mappings from virtual addresses to offsets into a file using the `mmap` system +call.[^mmap-anon] For example, a call to: + + mmap( + /* addr = */ 0x400000, + /* length = */ 0x1000, + PROT_READ | PROT_WRITE, + MAP_SHARED, + /* fd = */ 3, + /* offset = */ 0); + +creates a mapping of length 0x1000 bytes, starting at virtual address (VA) +0x400000, to offset 0 in the file represented by file descriptor (FD) 3. Within +the Linux kernel, virtual memory mappings are represented by *virtual memory +areas* (VMAs). Supposing that FD 3 represents file /tmp/foo, the state of the +virtual memory subsystem after the `mmap` call may be depicted as: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + +Establishing a virtual memory area does not necessarily establish a mapping to a +physical address, because Linux has not necessarily provisioned physical memory +to store the file's contents. Thus, if the application attempts to read the +contents of VA 0x400000, it may incur a *page fault*, a CPU exception that +forces the kernel to create such a mapping to service the read. + +For a file, doing so consists of several logical phases: + +1. The kernel allocates physical memory to store the contents of the required + part of the file, and copies file contents to the allocated memory. 
Supposing + that the kernel chooses the physical memory at physical address (PA) + 0x2fb000, the resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + + (In Linux the state of the mapping from file offset to physical memory is + stored in `struct address_space`, but to avoid confusion with other notions + of address space we will refer to this system as filemap, named after Linux + kernel source file `mm/filemap.c`.) + +2. The kernel stores the effective mapping from virtual to physical address in a + *page table entry* (PTE) in the application's *page tables*, which are used + by the CPU's virtual memory hardware to perform address translation. The + resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x2fb000 + + The PTE is required for the application to actually use the contents of the + mapped file as virtual memory. However, the PTE is derived from the VMA and + filemap state, both of which are independently mutable, such that mutations + to either will affect the PTE. For example: + + - The application may remove the VMA using the `munmap` system call. This + breaks the mapping from VA:0x400000 to /tmp/foo:0x0, and consequently the + mapping from VA:0x400000 to PA:0x2fb000. However, it does not necessarily + break the mapping from /tmp/foo:0x0 to PA:0x2fb000, so a future mapping of + the same file offset may reuse this physical memory. + + - The application may invalidate the file's contents by passing a length of 0 + to the `ftruncate` system call. This breaks the mapping from /tmp/foo:0x0 + to PA:0x2fb000, and consequently the mapping from VA:0x400000 to + PA:0x2fb000. 
However, it does not break the mapping from VA:0x400000 to + /tmp/foo:0x0, so future changes to the file's contents may again be made + visible at VA:0x400000 after another page fault results in the allocation + of a new physical address. + + Note that, in order to correctly break the mapping from VA:0x400000 to + PA:0x2fb000 in the latter case, filemap must also store a *reverse mapping* + from /tmp/foo:0x0 to VA:0x400000 so that it can locate and remove the PTE. + +[^mmap-anon]: Memory mappings to non-files are discussed in later sections. + +## Private Mappings + +The preceding example considered VMAs created using the `MAP_SHARED` flag, which +means that PTEs derived from the mapping should always use physical memory that +represents the current state of the mapped file.[^mmap-dev-zero] Applications +can alternatively pass the `MAP_PRIVATE` flag to create a *private mapping*. +Private mappings are *copy-on-write*. + +Suppose that the application instead created a private mapping in the previous +example. In Linux, the state of the system after a read page fault would be: + + VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x2fb000 (read-only) + +Now suppose the application attempts to write to VA:0x400000. For a shared +mapping, the write would be propagated to PA:0x2fb000, and the kernel would be +responsible for ensuring that the write is later propagated to the mapped file. +For a private mapping, the write incurs another page fault since the PTE is +marked read-only. In response, the kernel allocates physical memory to store the +mapping's *private copy* of the file's contents, copies file contents to the +allocated memory, and changes the PTE to map to the private copy. 
Supposing that +the kernel chooses the physical memory at physical address (PA) 0x5ea000, the +resulting state of the system is: + + VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Filemap: /tmp/foo:0x0 -> PA:0x2fb000 + PTE: VA:0x400000 -----------------> PA:0x5ea000 + +Note that the filemap mapping from /tmp/foo:0x0 to PA:0x2fb000 may still exist, +but is now irrelevant to this mapping. + +[^mmap-dev-zero]: Modulo files with special mmap semantics such as `/dev/zero`. + +## Anonymous Mappings + +Instead of passing a file to the `mmap` system call, applications can instead +request an *anonymous* mapping by passing the `MAP_ANONYMOUS` flag. +Semantically, an anonymous mapping is essentially a mapping to an ephemeral file +initially filled with zero bytes. Practically speaking, this is how shared +anonymous mappings are implemented, but private anonymous mappings do not result +in the creation of an ephemeral file; since there would be no way to modify the +contents of the underlying file through a private mapping, all private anonymous +mappings use a single shared page filled with zero bytes until copy-on-write +occurs. + +# Virtual Memory in the Sentry + +The sentry implements application virtual memory atop a host kernel, introducing +an additional level of indirection to the above. + +Consider the same scenario as in the previous section. Since the sentry handles +application system calls, the effect of an application `mmap` system call is to +create a VMA in the sentry (as opposed to the host kernel): + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + +When the application first incurs a page fault on this address, the host kernel +delivers information about the page fault to the sentry in a platform-dependent +manner, and the sentry handles the fault: + +1. The sentry allocates memory to store the contents of the required part of the + file, and copies file contents to the allocated memory. 
However, since the + sentry is implemented atop a host kernel, it does not configure mappings to + physical memory directly. Instead, mappable "memory" in the sentry is + represented by a host file descriptor and offset, since (as noted in + "Background") this is the memory mapping primitive provided by the host + kernel. In general, memory is allocated from a temporary host file using the + `filemem` package. Supposing that the sentry allocates offset 0x3000 from + host file "memory-file", the resulting state is: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + +2. The sentry stores the effective mapping from virtual address to host file in + a host VMA by invoking the `mmap` system call: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 + +3. The sentry returns control to the application, which immediately incurs the + page fault again.[^mmap-populate] However, since a host VMA now exists for + the faulting virtual address, the host kernel now handles the page fault as + described in "Background": + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 + Host filemap: host:memory-file:0x3000 -> PA:0x2fb000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 + +Thus, from an implementation standpoint, host VMAs serve the same purpose in the +sentry that PTEs do in Linux. As in Linux, sentry VMA and filemap state is +independently mutable, and the desired state of host VMAs is derived from that +state. + +[^mmap-populate]: The sentry could force the host kernel to establish PTEs when + it creates the host VMA by passing the `MAP_POPULATE` flag to + the `mmap` system call, but usually does not. 
This is because, + to reduce the number of page faults that require handling by + the sentry and (correspondingly) the number of host `mmap` + system calls, the sentry usually creates host VMAs that are + much larger than the single faulting page. + +## Private Mappings + +The sentry implements private mappings consistently with Linux. Before +copy-on-write, the private mapping example given in the Background results in: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 (read-only) + Host filemap: host:memory-file:0x3000 -> PA:0x2fb000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x2fb000 (read-only) + +When the application attempts to write to this address, the host kernel delivers +information about the resulting page fault to the sentry. Analogous to Linux, +the sentry allocates memory to store the mapping's private copy of the file's +contents, copies file contents to the allocated memory, and changes the host VMA +to map to the private copy. Supposing that the sentry chooses the offset 0x4000 +in host file `memory-file` to store the private copy, the state of the system +after copy-on-write is: + + Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 (private) + Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 + Host VMA: VA:0x400000 -----------------> host:memory-file:0x4000 + Host filemap: host:memory-file:0x4000 -> PA:0x5ea000 + Host PTE: VA:0x400000 --------------------------------------------> PA:0x5ea000 + +However, this highlights an important difference between Linux and the sentry. +In Linux, page tables are concrete (architecture-dependent) data structures +owned by the kernel. Conversely, the sentry has the ability to create and +destroy host VMAs using host system calls, but it does not have direct access to +their state. 
Thus, as written, if the application invokes the `munmap` system +call to remove the sentry VMA, it is non-trivial for the sentry to determine +that it should deallocate `host:memory-file:0x4000`. This implies that the +sentry must retain information about the host VMAs that it has created. + +## Anonymous Mappings + +The sentry implements anonymous mappings consistently with Linux, except that +there is no shared zero page. + +# Implementation Constructs + +In Linux: + +- A virtual address space is represented by `struct mm_struct`. + +- VMAs are represented by `struct vm_area_struct`, stored in `struct + mm_struct::mmap`. + +- Mappings from file offsets to physical memory are stored in `struct + address_space`. + +- Reverse mappings from file offsets to virtual mappings are stored in `struct + address_space::i_mmap`. + +- Physical memory pages are represented by a pointer to `struct page` or an + index called a *page frame number* (PFN), represented by `pfn_t`. + +- PTEs are represented by architecture-dependent type `pte_t`, stored in a table + hierarchy rooted at `struct mm_struct::pgd`. + +In the sentry: + +- A virtual address space is represented by type [`mm.MemoryManager`][mm]. + +- Sentry VMAs are represented by type [`mm.vma`][mm], stored in + `mm.MemoryManager.vmas`. + +- Mappings from sentry file offsets to host file offsets are abstracted through + interface method [`memmap.Mappable.Translate`][memmap]. + +- Reverse mappings from sentry file offsets to virtual mappings are abstracted + through interface methods [`memmap.Mappable.AddMapping` and + `memmap.Mappable.RemoveMapping`][memmap]. + +- Host files that may be mapped into host VMAs are represented by type + [`platform.File`][platform]. + +- Host VMAs are represented in the sentry by type [`mm.pma`][mm] ("platform + mapping area"), stored in `mm.MemoryManager.pmas`. 
+ +- Creation and destruction of host VMAs is abstracted through interface methods + [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. + +[filemem]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/filemem/filemem.go +[memmap]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/memmap/memmap.go +[mm]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/mm/mm.go +[platform]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/platform.go diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go new file mode 100644 index 000000000..4dd67b1ea --- /dev/null +++ b/pkg/sentry/mm/address_space.go @@ -0,0 +1,223 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// AddressSpace returns the platform.AddressSpace bound to mm. +// +// Preconditions: The caller must have called mm.Activate(). +func (mm *MemoryManager) AddressSpace() platform.AddressSpace { + if atomic.LoadInt32(&mm.active) == 0 { + panic("trying to use inactive address space?") + } + return mm.as +} + +// Activate ensures this MemoryManager has a platform.AddressSpace. +// +// The caller must not hold any locks when calling Activate. 
+// +// When this MemoryManager is no longer needed by a task, it should call +// Deactivate to release the reference. +func (mm *MemoryManager) Activate() error { + // Fast path: the MemoryManager already has an active + // platform.AddressSpace, and we just need to indicate that we need it too. + if atomicbitops.IncUnlessZeroInt32(&mm.active) { + return nil + } + + for { + // Slow path: may need to synchronize with other goroutines changing + // mm.active to or from zero. + mm.activeMu.Lock() + // Inline Unlock instead of using a defer for performance since this + // method is commonly in the hot-path. + + // Check if we raced with another goroutine performing activation. + if atomic.LoadInt32(&mm.active) > 0 { + // This can't race; Deactivate can't decrease mm.active from 1 to 0 + // without holding activeMu. + atomic.AddInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Do we have a context? If so, then we never unmapped it. This can + // only be the case if !mm.p.CooperativelySchedulesAddressSpace(). + if mm.as != nil { + atomic.StoreInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Get a new address space. We must force unmapping by passing nil to + // NewAddressSpace if requested. (As in the nil interface object, not a + // typed nil.) + mappingsID := (interface{})(mm) + if mm.unmapAllOnActivate { + mappingsID = nil + } + as, c, err := mm.p.NewAddressSpace(mappingsID) + if err != nil { + mm.activeMu.Unlock() + return err + } + if as == nil { + // AddressSpace is unavailable, we must wait. + // + // activeMu must not be held while waiting, as the user + // of the address space we are waiting on may attempt + // to take activeMu. + // + // Don't call UninterruptibleSleepStart to register the + // wait to allow the watchdog stuck task to trigger in + // case a process is starved waiting for the address + // space. + mm.activeMu.Unlock() + <-c + continue + } + + // Okay, we could restore all mappings at this point. 
+ // But forget that. Let's just let them fault in. + mm.as = as + + // Unmapping is done, if necessary. + mm.unmapAllOnActivate = false + + // Now that m.as has been assigned, we can set m.active to a non-zero value + // to enable the fast path. + atomic.StoreInt32(&mm.active, 1) + + mm.activeMu.Unlock() + return nil + } +} + +// Deactivate releases a release to the MemoryManager. +func (mm *MemoryManager) Deactivate() error { + // Fast path: this is not the last goroutine to deactivate the + // MemoryManager. + if atomicbitops.DecUnlessOneInt32(&mm.active) { + return nil + } + + mm.activeMu.Lock() + // Same as Activate. + + // Still active? + if atomic.AddInt32(&mm.active, -1) > 0 { + mm.activeMu.Unlock() + return nil + } + + // Can we hold on to the address space? + if !mm.p.CooperativelySchedulesAddressSpace() { + mm.activeMu.Unlock() + return nil + } + + // Release the address space. + if err := mm.as.Release(); err != nil { + atomic.StoreInt32(&mm.active, 1) + mm.activeMu.Unlock() + return err + } + + // Lost it. + mm.as = nil + mm.activeMu.Unlock() + return nil +} + +// mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings +// for all addresses in ar should be precommitted. +// +// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. +// ar must be page-aligned. pseg.Range().Contains(ar.Start). +func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { + // By default, map entire pmas at a time, under the assumption that there + // is no cost to mapping more of a pma than necessary. + mapAR := usermem.AddrRange{0, ^usermem.Addr(usermem.PageSize - 1)} + if precommit { + // When explicitly precommitting, only map ar, since overmapping may + // incur unexpected resource usage. + mapAR = ar + } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 { + // Limit the range we map to ar, aligned to mapUnit. 
+ mapMask := usermem.Addr(mapUnit - 1) + mapAR.Start = ar.Start &^ mapMask + // If rounding ar.End up overflows, just keep the existing mapAR.End. + if end := (ar.End + mapMask) &^ mapMask; end >= ar.End { + mapAR.End = end + } + } + if checkInvariants { + if !mapAR.IsSupersetOf(ar) { + panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar)) + } + } + + for { + pma := pseg.ValuePtr() + pmaAR := pseg.Range() + pmaMapAR := pmaAR.Intersect(mapAR) + perms := pma.vmaEffectivePerms + if pma.needCOW { + perms.Write = false + } + if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { + return err + } + // Since this checks ar.End and not mapAR.End, we will never map a pma + // that is not required. + if ar.End <= pmaAR.End { + return nil + } + pseg = pseg.NextSegment() + } +} + +// unmapASLocked removes all AddressSpace mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) unmapASLocked(ar usermem.AddrRange) { + if mm.as == nil { + // No AddressSpace? Force all mappings to be unmapped on the next + // Activate. + mm.unmapAllOnActivate = true + return + } + + // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be + // passed ranges that include addresses that can't be mapped by the + // application. + ar = ar.Intersect(mm.applicationAddrRange()) + + // Note that this AddressSpace may or may not be active. If the + // platform does not require cooperative sharing of AddressSpaces, they + // are retained between Deactivate/Activate calls. Despite not being + // active, it is still valid to perform operations on these address + // spaces. + mm.as.Unmap(ar.Start, uint64(ar.Length())) +} diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go new file mode 100644 index 000000000..992bde5a5 --- /dev/null +++ b/pkg/sentry/mm/aio_context.go @@ -0,0 +1,377 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// aioManager creates and manages asynchronous I/O contexts. +type aioManager struct { + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // aioContexts is the set of asynchronous I/O contexts. + contexts map[uint64]*AIOContext +} + +func (a *aioManager) destroy() { + a.mu.Lock() + defer a.mu.Unlock() + + for _, ctx := range a.contexts { + ctx.destroy() + } +} + +// newAIOContext creates a new context for asynchronous I/O. +// +// Returns false if 'id' is currently in use. +func (a *aioManager) newAIOContext(events uint32, id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + + if _, ok := a.contexts[id]; ok { + return false + } + + a.contexts[id] = &AIOContext{ + done: make(chan struct{}, 1), + maxOutstanding: events, + } + return true +} + +// destroyAIOContext destroys an asynchronous I/O context. +// +// False is returned if the context does not exist. 
+func (a *aioManager) destroyAIOContext(id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + if !ok { + return false + } + delete(a.contexts, id) + ctx.destroy() + return true +} + +// lookupAIOContext looks up the given context. +// +// Returns false if context does not exist. +func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + return ctx, ok +} + +// ioResult is a completed I/O operation. +type ioResult struct { + data interface{} + ioEntry +} + +// AIOContext is a single asynchronous I/O context. +type AIOContext struct { + // done is the notification channel used for all requests. + done chan struct{} `state:"nosave"` + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // results is the set of completed requests. + results ioList + + // maxOutstanding is the maximum number of outstanding entries; this value + // is immutable. + maxOutstanding uint32 + + // outstanding is the number of requests outstanding; this will effectively + // be the number of entries in the result list or that are expected to be + // added to the result list. + outstanding uint32 + + // dead is set when the context is destroyed. + dead bool `state:"zerovalue"` +} + +// destroy marks the context dead. +func (ctx *AIOContext) destroy() { + ctx.mu.Lock() + defer ctx.mu.Unlock() + ctx.dead = true + if ctx.outstanding == 0 { + close(ctx.done) + } +} + +// Prepare reserves space for a new request, returning true if available. +// Returns false if the context is busy. +func (ctx *AIOContext) Prepare() bool { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding >= ctx.maxOutstanding { + return false + } + ctx.outstanding++ + return true +} + +// PopRequest pops a completed request if available, this function does not do +// any blocking. Returns false if no request is available. 
+func (ctx *AIOContext) PopRequest() (interface{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Is there anything ready? + if e := ctx.results.Front(); e != nil { + ctx.results.Remove(e) + ctx.outstanding-- + if ctx.outstanding == 0 && ctx.dead { + close(ctx.done) + } + return e.data, true + } + return nil, false +} + +// FinishRequest finishes a pending request. It queues up the data +// and notifies listeners. +func (ctx *AIOContext) FinishRequest(data interface{}) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Push to the list and notify opportunistically. The channel notify + // here is guaranteed to be safe because outstanding must be non-zero. + // The done channel is only closed when outstanding reaches zero. + ctx.results.PushBack(&ioResult{data: data}) + + select { + case ctx.done <- struct{}{}: + default: + } +} + +// WaitChannel returns a channel that is notified when an AIO request is +// completed. +// +// The boolean return value indicates whether or not the context is active. +func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding == 0 && ctx.dead { + return nil, false + } + return ctx.done, true +} + +// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO +// ring buffers. +type aioMappable struct { + refs.AtomicRefCount + + p platform.Platform + fr platform.FileRange +} + +var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) + +func newAIOMappable(p platform.Platform) (*aioMappable, error) { + fr, err := p.Memory().Allocate(aioRingBufferSize, usage.Anonymous) + if err != nil { + return nil, err + } + return &aioMappable{p: p, fr: fr}, nil +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *aioMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.p.Memory().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. 
+func (m *aioMappable) MappedName(ctx context.Context) string { + return "[aio]" +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *aioMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *aioMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. +func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: aio_ring_fops.fsync == NULL + return syserror.EINVAL +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (m *aioMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (m *aioMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + // Require that the mapping correspond to a live AIOContext. Compare + // Linux's fs/aio.c:aio_ring_mremap(). + mm, ok := ms.(*MemoryManager) + if !ok { + return syserror.EINVAL + } + am := &mm.aioManager + am.mu.Lock() + defer am.mu.Unlock() + oldID := uint64(srcAR.Start) + aioCtx, ok := am.contexts[oldID] + if !ok { + return syserror.EINVAL + } + aioCtx.mu.Lock() + defer aioCtx.mu.Unlock() + if aioCtx.dead { + return syserror.EINVAL + } + // Use the new ID for the AIOContext. 
+ am.contexts[uint64(dstAR.Start)] = aioCtx + delete(am.contexts, oldID) + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.p.Memory(), + Offset: m.fr.Start + source.Start, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// NewAIOContext creates a new context for asynchronous I/O. +// +// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc(). +func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) { + // libaio get_ioevents() expects context "handle" to be a valid address. + // libaio peeks inside looking for a magic number. This function allocates + // a page per context and keeps it set to zeroes to ensure it will not + // match AIO_RING_MAGIC and make libaio happy. + m, err := newAIOMappable(mm.p) + if err != nil { + return 0, err + } + defer m.DecRef() + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: aioRingBufferSize, + MappingIdentity: m, + Mappable: m, + // TODO: Linux does "do_mmap_pgoff(..., PROT_READ | + // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this + // mapping read-only? + Perms: usermem.Read, + MaxPerms: usermem.Read, + }) + if err != nil { + return 0, err + } + id := uint64(addr) + if !mm.aioManager.newAIOContext(events, id) { + mm.MUnmap(ctx, addr, aioRingBufferSize) + return 0, syserror.EINVAL + } + return id, nil +} + +// DestroyAIOContext destroys an asynchronous I/O context. 
It returns false if +// the context does not exist. +func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool { + if _, ok := mm.LookupAIOContext(ctx, id); !ok { + return false + } + + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. + // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize) + + return mm.aioManager.destroyAIOContext(id) +} + +// LookupAIOContext looks up the given context. It returns false if the context +// does not exist. +func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) { + aioCtx, ok := mm.aioManager.lookupAIOContext(id) + if !ok { + return nil, false + } + + // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes + // from id). + var buf [4]byte + _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{}) + if err != nil { + return nil, false + } + + return aioCtx, true +} diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go new file mode 100644 index 000000000..1a5e56f8e --- /dev/null +++ b/pkg/sentry/mm/aio_context_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +// afterLoad is invoked by stateify. +func (a *AIOContext) afterLoad() { + a.done = make(chan struct{}, 1) +} diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go new file mode 100644 index 000000000..56d0490f0 --- /dev/null +++ b/pkg/sentry/mm/debug.go @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the mm package. This is normally disabled since MM is a + // significant hot path in general, and some such checks (notably + // memmap.CheckTranslateResult) are very expensive. + checkInvariants = false + + // If logIOErrors is true, log I/O errors that originate from MM before + // converting them to EFAULT. + logIOErrors = false +) + +// String implements fmt.Stringer.String. +func (mm *MemoryManager) String() string { + return mm.DebugString(context.Background()) +} + +// DebugString returns a string containing information about mm for debugging. 
+func (mm *MemoryManager) DebugString(ctx context.Context) string { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.debugStringLocked(ctx) +} + +// Preconditions: mm.mappingMu and mm.activeMu must be locked. +func (mm *MemoryManager) debugStringLocked(ctx context.Context) string { + var b bytes.Buffer + b.WriteString("VMAs:\n") + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + b.Write(mm.vmaMapsEntryLocked(ctx, vseg)) + } + b.WriteString("PMAs:\n") + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + b.Write(pseg.debugStringEntryLocked()) + } + return string(b.Bytes()) +} + +// Preconditions: mm.activeMu must be locked. +func (pseg pmaIterator) debugStringEntryLocked() []byte { + var b bytes.Buffer + + fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End()) + + pma := pseg.ValuePtr() + if pma.vmaEffectivePerms.Read { + b.WriteByte('r') + } else { + b.WriteByte('-') + } + if pma.vmaEffectivePerms.Write { + if pma.needCOW { + b.WriteByte('c') + } else { + b.WriteByte('w') + } + } else { + b.WriteByte('-') + } + if pma.vmaEffectivePerms.Execute { + b.WriteByte('x') + } else { + b.WriteByte('-') + } + if pma.private { + b.WriteByte('p') + } else { + b.WriteByte('s') + } + + fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file) + return b.Bytes() +} diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go new file mode 100644 index 000000000..cac81a59d --- /dev/null +++ b/pkg/sentry/mm/io.go @@ -0,0 +1,604 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// There are two supported ways to copy data to/from application virtual +// memory: +// +// 1. Internally-mapped copying: Determine the platform.File that backs the +// copied-to/from virtual address, obtain a mapping of its pages, and read or +// write to the mapping. +// +// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is +// true, AddressSpace permissions are applicable, and an AddressSpace is +// available, copy directly through the AddressSpace, handling faults as +// needed. +// +// (Given that internally-mapped copying requires that backing memory is always +// implemented using a host file descriptor, we could also preadv/pwritev to it +// instead. But this would incur a host syscall for each use of the mapped +// page, whereas mmap is a one-time cost.) +// +// The fixed overhead of internally-mapped copying is expected to be higher +// than that of AddressSpace copying since the former always needs to translate +// addresses, whereas the latter only needs to do so when faults occur. 
+// However, the throughput of internally-mapped copying is expected to be +// somewhat higher than that of AddressSpace copying due to the high cost of +// page faults and because implementations of the latter usually rely on +// safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace +// copying (when available) for smaller copies, and switch to internally-mapped +// copying once a size threshold is exceeded. +const ( + // copyMapMinBytes is the size threshold for switching to internally-mapped + // copying in CopyOut, CopyIn, and ZeroOut. + copyMapMinBytes = 32 << 10 // 32 KB + + // rwMapMinBytes is the size threshold for switching to internally-mapped + // copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes + // since AddressSpace copying in this case requires additional buffering; + // see CopyOutFrom for details. + rwMapMinBytes = 512 +) + +// checkIORange is similar to usermem.Addr.ToRange, but applies bounds checks +// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok(). +// +// Preconditions: length >= 0. +func (mm *MemoryManager) checkIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) { + // Note that access_ok() constrains end even if length == 0. + ar, ok := addr.ToRange(uint64(length)) + return ar, (ok && ar.End <= mm.layout.MaxAddr) +} + +// checkIOVec applies bound checks consistent with Linux's +// arch/x86/include/asm/uaccess.h:access_ok() to ars. +func (mm *MemoryManager) checkIOVec(ars usermem.AddrRangeSeq) bool { + for !ars.IsEmpty() { + ar := ars.Head() + if _, ok := mm.checkIORange(ar.Start, int64(ar.Length())); !ok { + return false + } + ars = ars.Tail() + } + return true +} + +func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool { + return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive +} + +// translateIOError converts errors to EFAULT, as is usually reported for all +// I/O errors originating from MM in Linux. 
+func translateIOError(ctx context.Context, err error) error { + if err == nil { + return nil + } + if logIOErrors { + ctx.Debugf("MM I/O error: %v", err) + } + return syserror.EFAULT +} + +// CopyOut implements usermem.IO.CopyOut. +func (mm *MemoryManager) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.checkIORange(addr, int64(len(src))) + if !ok { + return 0, syserror.EFAULT + } + + if len(src) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(src) < copyMapMinBytes { + return mm.asCopyOut(ctx, addr, src) + } + + // Go through internal mappings. + n64, err := mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyOut(ctx context.Context, addr usermem.Addr, src []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyOut(addr+usermem.Addr(done), src[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(src))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (mm *MemoryManager) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.checkIORange(addr, int64(len(dst))) + if !ok { + return 0, syserror.EFAULT + } + + if len(dst) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes { + return mm.asCopyIn(ctx, addr, dst) + } + + // Go through internal mappings. 
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyIn(ctx context.Context, addr usermem.Addr, dst []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyIn(addr+usermem.Addr(done), dst[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(dst))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (mm *MemoryManager) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + ar, ok := mm.checkIORange(addr, toZero) + if !ok { + return 0, syserror.EFAULT + } + + if toZero == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && toZero < copyMapMinBytes { + return mm.asZeroOut(ctx, addr, toZero) + } + + // Go through internal mappings. 
+ return mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) { + n, err := safemem.ZeroSeq(dsts) + return n, translateIOError(ctx, err) + }) +} + +func (mm *MemoryManager) asZeroOut(ctx context.Context, addr usermem.Addr, toZero int64) (int64, error) { + var done int64 + for { + n, err := mm.as.ZeroOut(addr+usermem.Addr(done), uintptr(toZero-done)) + done += int64(n) + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(toZero)) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. +func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + // We have to introduce a buffered copy, instead of just passing a + // safemem.BlockSeq representing addresses in the AddressSpace to src. + // This is because usermem.IO.CopyOutFrom() guarantees that it calls + // src.ReadToBlocks() at most once, which is incompatible with handling + // faults between calls. In the future, this is probably best resolved + // by introducing a CopyOutFrom variant or option that allows it to + // call src.ReadToBlocks() any number of times. + // + // This issue applies to CopyInTo as well. 
+ buf := make([]byte, int(ars.NumBytes())) + bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))) + var done int64 + for done < int64(bufN) { + ar := ars.Head() + cplen := int64(ar.Length()) + if cplen > int64(bufN)-done { + cplen = int64(bufN) - done + } + n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)]) + done += int64(n) + if err != nil { + return done, err + } + ars = ars.Tail() + } + // Do not convert errors returned by src to EFAULT. + return done, bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Write, opts.IgnorePermissions, src.ReadToBlocks) +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (mm *MemoryManager) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + buf := make([]byte, int(ars.NumBytes())) + var done int + var bufErr error + for !ars.IsEmpty() { + ar := ars.Head() + var n int + n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())]) + done += n + if bufErr != nil { + break + } + ars = ars.Tail() + } + n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done]))) + if err != nil { + return int64(n), err + } + // Do not convert errors returned by dst to EFAULT. + return int64(n), bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Read, opts.IgnorePermissions, dst.WriteFromBlocks) +} + +// SwapUint32 implements usermem.IO.SwapUint32. 
+func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.checkIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + old, err := mm.as.SwapUint32(addr, new) + if err == nil { + return old, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var old uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + old, err = safemem.SwapUint32(im, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + return 4, nil + }) + return old, err +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.checkIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + prev, err := mm.as.CompareAndSwapUint32(addr, old, new) + if err == nil { + return prev, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. 
+ var prev uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + prev, err = safemem.CompareAndSwapUint32(im, old, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + return 4, nil + }) + return prev, err +} + +// handleASIOFault handles a page fault at address addr for an AddressSpaceIO +// operation spanning ioar. +// +// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr). +func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error { + // Try to map all remaining pages in the I/O operation. This RoundUp can't + // overflow because otherwise it would have been caught by checkIORange. + end, _ := ioar.End.RoundUp() + ar := usermem.AddrRange{addr.RoundDown(), end} + + // Don't bother trying existingPMAsLocked; in most cases, if we did have + // existing pmas, we wouldn't have faulted. + + // Ensure that we have usable vmas. Here and below, only return early if we + // can't map the first (faulting) page; failures to map later pages are + // silently ignored. This maximizes partial success. + mm.mappingMu.RLock() + vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return translateIOError(ctx, err) + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. 
+ mm.activeMu.Lock() + pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ + breakCOW: at.Write, + }) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return translateIOError(ctx, err) + } + ar.End = pendaddr + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + err = mm.mapASLocked(pseg, ar, false) + mm.activeMu.RUnlock() + return translateIOError(ctx, err) +} + +// withInternalMappings ensures that pmas exist for all addresses in ar, +// support access of type (at, ignorePermissions), and have internal mappings +// cached. It then calls f with mm.activeMu locked for reading, passing +// internal mappings for the subrange of ar for which this property holds. +// +// withInternalMappings takes a function returning uint64 since many safemem +// functions have this property, but returns an int64 since this is usually +// more useful for usermem.IO methods. +// +// Preconditions: 0 < ar.Length() <= math.MaxInt64. +func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + po := pmaOpts{ + breakCOW: at.Write, + } + + // If pmas are already available, we can do IO without touching mm.vmas or + // mm.mappingMu. + mm.activeMu.RLock() + if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, po, true /* needInternalMappings */); pseg.Ok() { + n, err := f(mm.internalMappingsLocked(pseg, ar)) + mm.activeMu.RUnlock() + // Do not convert errors returned by f to EFAULT. + return int64(n), err + } + mm.activeMu.RUnlock() + + // Ensure that we have usable vmas. 
+ mm.mappingMu.RLock() + vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return 0, translateIOError(ctx, verr) + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, po) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return 0, translateIOError(ctx, perr) + } + ar.End = pendaddr + } + imend, imerr := mm.getPMAInternalMappingsLocked(pseg, ar) + mm.activeMu.DowngradeLock() + if imendaddr := imend.Start(); imendaddr < ar.End { + if imendaddr <= ar.Start { + mm.activeMu.RUnlock() + return 0, translateIOError(ctx, imerr) + } + ar.End = imendaddr + } + + // Do I/O. + un, err := f(mm.internalMappingsLocked(pseg, ar)) + mm.activeMu.RUnlock() + n := int64(un) + + // Return the first error in order of progress through ar. + if err != nil { + // Do not convert errors returned by f to EFAULT. + return n, err + } + if imerr != nil { + return n, translateIOError(ctx, imerr) + } + if perr != nil { + return n, translateIOError(ctx, perr) + } + return n, translateIOError(ctx, verr) +} + +// withVecInternalMappings ensures that pmas exist for all addresses in ars, +// support access of type (at, ignorePermissions), and have internal mappings +// cached. It then calls f with mm.activeMu locked for reading, passing +// internal mappings for the subset of ars for which this property holds. +// +// Preconditions: !ars.IsEmpty(). 
+func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + // withInternalMappings is faster than withVecInternalMappings because of + // iterator plumbing (this isn't generally practical in the vector case due + // to iterator invalidation between AddrRanges). Use it if possible. + if ars.NumRanges() == 1 { + return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f) + } + + po := pmaOpts{ + breakCOW: at.Write, + } + + // If pmas are already available, we can do IO without touching mm.vmas or + // mm.mappingMu. + mm.activeMu.RLock() + if mm.existingVecPMAsLocked(ars, at, ignorePermissions, po, true /* needInternalMappings */) { + n, err := f(mm.vecInternalMappingsLocked(ars)) + mm.activeMu.RUnlock() + // Do not convert errors returned by f to EFAULT. + return int64(n), err + } + mm.activeMu.RUnlock() + + // Ensure that we have usable vmas. + mm.mappingMu.RLock() + vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions) + if vars.NumBytes() == 0 { + mm.mappingMu.RUnlock() + return 0, translateIOError(ctx, verr) + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pars, perr := mm.getVecPMAsLocked(ctx, vars, po) + mm.mappingMu.RUnlock() + if pars.NumBytes() == 0 { + mm.activeMu.Unlock() + return 0, translateIOError(ctx, perr) + } + imars, imerr := mm.getVecPMAInternalMappingsLocked(pars) + mm.activeMu.DowngradeLock() + if imars.NumBytes() == 0 { + mm.activeMu.RUnlock() + return 0, translateIOError(ctx, imerr) + } + + // Do I/O. + un, err := f(mm.vecInternalMappingsLocked(imars)) + mm.activeMu.RUnlock() + n := int64(un) + + // Return the first error in order of progress through ars. + if err != nil { + // Do not convert errors from f to EFAULT. 
+ return n, err + } + if imerr != nil { + return n, translateIOError(ctx, imerr) + } + if perr != nil { + return n, translateIOError(ctx, perr) + } + return n, translateIOError(ctx, verr) +} + +// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to +// at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to +// truncate usermem.AddrRangeSeq when errors occur. +// +// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End. +func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq { + ar := arsit.Head() + if end <= ar.Start { + return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes()) + } + return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start)) +} diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go new file mode 100644 index 000000000..de7f29b04 --- /dev/null +++ b/pkg/sentry/mm/lifecycle.go @@ -0,0 +1,218 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NewMemoryManager returns a new MemoryManager with no mappings and 1 user. +func NewMemoryManager(p platform.Platform) *MemoryManager { + return &MemoryManager{ + p: p, + haveASIO: p.SupportsAddressSpaceIO(), + privateRefs: &privateRefs{}, + users: 1, + auxv: arch.Auxv{}, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + } +} + +// SetMmapLayout initializes mm's layout from the given arch.Context. +// +// Preconditions: mm contains no mappings and is not used concurrently. +func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) { + layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r) + if err != nil { + return arch.MmapLayout{}, err + } + mm.layout = layout + return layout, nil +} + +// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or +// clone() (without CLONE_VM). +func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm2 := &MemoryManager{ + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + usageAS: mm.usageAS, + brk: mm.brk, + captureInvalidations: true, + argv: mm.argv, + envv: mm.envv, + auxv: append(arch.Auxv(nil), mm.auxv...), + // IncRef'd below, once we know that there isn't an error. + executable: mm.executable, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + } + + // Copy vmas. 
+ dstvgap := mm2.vmas.FirstGap() + for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { + vma := srcvseg.ValuePtr() + vmaAR := srcvseg.Range() + // Inform the Mappable, if any, of the new mapping. + if vma.mappable != nil { + if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off); err != nil { + mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange()) + return nil, err + } + } + if vma.id != nil { + vma.id.IncRef() + } + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() + // We don't need to update mm2.usageAS since we copied it from mm + // above. + } + + // Copy pmas. We have to lock mm.activeMu for writing to make existing + // private pmas copy-on-write. We also have to lock mm2.activeMu since + // after copying vmas above, memmap.Mappables may call mm2.Invalidate. We + // only copy private pmas, since in the common case where fork(2) is + // immediately followed by execve(2), copying non-private pmas that can be + // regenerated by calling memmap.Mappable.Translate is a waste of time. + // (Linux does the same; compare kernel/fork.c:dup_mmap() => + // mm/memory.c:copy_page_range().) + mm2.activeMu.Lock() + defer mm2.activeMu.Unlock() + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + dstpgap := mm2.pmas.FirstGap() + var unmapAR usermem.AddrRange + for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() { + pma := srcpseg.ValuePtr() + if !pma.private { + continue + } + if !pma.needCOW { + pma.needCOW = true + if pma.vmaEffectivePerms.Write { + // We don't want to unmap the whole address space, even though + // doing so would reduce calls to unmapASLocked(), because mm + // will most likely continue to be used after the fork, so + // unmapping pmas unnecessarily will result in extra page + // faults. But we do want to merge consecutive AddrRanges + // across pma boundaries. 
+ if unmapAR.End == srcpseg.Start() { + unmapAR.End = srcpseg.End() + } else { + if unmapAR.Length() != 0 { + mm.unmapASLocked(unmapAR) + } + unmapAR = srcpseg.Range() + } + } + } + fr := srcpseg.fileRange() + mm2.incPrivateRef(fr) + srcpseg.ValuePtr().file.IncRef(fr) + addrRange := srcpseg.Range() + mm2.addRSSLocked(addrRange) + dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap() + } + if unmapAR.Length() != 0 { + mm.unmapASLocked(unmapAR) + } + + // Between when we call memmap.Mappable.AddMapping while copying vmas and + // when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are + // ineffective because the pmas they invalidate haven't yet been copied, + // possibly allowing mm2 to get invalidated translations: + // + // Invalidating Mappable mm.Fork + // --------------------- ------- + // + // mm2.Invalidate() + // mm.activeMu.Lock() + // mm.Invalidate() /* blocks */ + // mm2.activeMu.Lock() + // (mm copies invalidated pma to mm2) + // + // This would technically be both safe (since we only copy private pmas, + // which will still hold a reference on their memory) and consistent with + // Linux, but we avoid it anyway by setting mm2.captureInvalidations during + // construction, causing calls to mm2.Invalidate() to be captured in + // mm2.capturedInvalidations, to be replayed after pmas are copied - i.e. + // here. + mm2.captureInvalidations = false + for _, invArgs := range mm2.capturedInvalidations { + mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true) + } + mm2.capturedInvalidations = nil + + if mm2.executable != nil { + mm2.executable.IncRef() + } + return mm2, nil +} + +// IncUsers increments mm's user count and returns true. If the user count is +// already 0, IncUsers does nothing and returns false. +func (mm *MemoryManager) IncUsers() bool { + return atomicbitops.IncUnlessZeroInt32(&mm.users) +} + +// DecUsers decrements mm's user count. If the user count reaches 0, all +// mappings in mm are unmapped. 
+func (mm *MemoryManager) DecUsers(ctx context.Context) { + if users := atomic.AddInt32(&mm.users, -1); users > 0 { + return + } else if users < 0 { + panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users)) + } + + mm.aioManager.destroy() + + mm.metadataMu.Lock() + exe := mm.executable + mm.executable = nil + mm.metadataMu.Unlock() + if exe != nil { + exe.DecRef() + } + + mm.activeMu.Lock() + // Sanity check. + if atomic.LoadInt32(&mm.active) != 0 { + panic("active address space lost?") + } + // Make sure the AddressSpace is returned. + if mm.as != nil { + mm.as.Release() + mm.as = nil + } + mm.activeMu.Unlock() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + mm.unmapLocked(ctx, mm.applicationAddrRange()) +} diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go new file mode 100644 index 000000000..32d5e2ff6 --- /dev/null +++ b/pkg/sentry/mm/metadata.go @@ -0,0 +1,139 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// ArgvStart returns the start of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvEnd. 
+func (mm *MemoryManager) ArgvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.Start +} + +// SetArgvStart sets the start of the application argument vector. +func (mm *MemoryManager) SetArgvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.Start = a +} + +// ArgvEnd returns the end of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvStart. +func (mm *MemoryManager) ArgvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.End +} + +// SetArgvEnd sets the end of the application argument vector. +func (mm *MemoryManager) SetArgvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.End = a +} + +// EnvvStart returns the start of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvEnd. +func (mm *MemoryManager) EnvvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.Start +} + +// SetEnvvStart sets the start of the application environment vector. +func (mm *MemoryManager) SetEnvvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.Start = a +} + +// EnvvEnd returns the end of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvStart. +func (mm *MemoryManager) EnvvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.End +} + +// SetEnvvEnd sets the end of the application environment vector. +func (mm *MemoryManager) SetEnvvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.End = a +} + +// Auxv returns the current map of auxiliary vectors. +func (mm *MemoryManager) Auxv() arch.Auxv { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return append(arch.Auxv(nil), mm.auxv...) 
+} + +// SetAuxv sets the entire map of auxiliary vectors. +func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.auxv = append(arch.Auxv(nil), auxv...) +} + +// Executable returns the executable, if available. +// +// An additional reference will be taken in the case of a non-nil executable, +// which must be released by the caller. +func (mm *MemoryManager) Executable() *fs.Dirent { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + + if mm.executable == nil { + return nil + } + + mm.executable.IncRef() + return mm.executable +} + +// SetExecutable sets the executable. +// +// This takes a reference on d. +func (mm *MemoryManager) SetExecutable(d *fs.Dirent) { + mm.metadataMu.Lock() + + // Grab a new reference. + d.IncRef() + + // Set the executable. + orig := mm.executable + mm.executable = d + + mm.metadataMu.Unlock() + + // Release the old reference. + // + // Do this without holding the lock, since it may wind up doing some + // I/O to sync the dirent, etc. + if orig != nil { + orig.DecRef() + } +} diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go new file mode 100644 index 000000000..ce8097b7f --- /dev/null +++ b/pkg/sentry/mm/mm.go @@ -0,0 +1,417 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mm provides a memory management subsystem. See README.md for a +// detailed overview. 
+// +// Lock order: +// +// fs locks, except for memmap.Mappable locks +// mm.MemoryManager.metadataMu +// mm.MemoryManager.mappingMu +// Locks taken by memmap.Mappable methods other than Translate +// mm.MemoryManager.activeMu +// Locks taken by memmap.Mappable.Translate +// mm.privateRefs.mu +// platform.File locks +// mm.aioManager.mu +// mm.AIOContext.mu +// +// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in +// multiple mm.MemoryManagers, as it does so in a well-defined order (forked +// child first). +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + ssync "gvisor.googlesource.com/gvisor/pkg/sync" +) + +// MemoryManager implements a virtual address space. +type MemoryManager struct { + // p is the platform. + // + // p is immutable. + p platform.Platform + + // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from + // eliminating an indirect call in the hot I/O path, this makes + // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined. + // + // haveASIO is immutable. + haveASIO bool `state:"nosave"` + + // layout is the memory layout. + // + // layout is set by the binary loader before the MemoryManager can be used. + layout arch.MmapLayout + + // privateRefs stores reference counts for private memory (memory whose + // ownership is shared by one or more pmas instead of being owned by a + // memmap.Mappable). + // + // NOTE: This should be replaced using refcounts on + // platform.File. + // + // privateRefs is immutable. + privateRefs *privateRefs + + // users is the number of dependences on the mappings in the MemoryManager. 
+ // When the number of references in users reaches zero, all mappings are + // unmapped. + // + // users is accessed using atomic memory operations. + users int32 + + // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. + mappingMu ssync.DowngradableRWMutex `state:"nosave"` + + // vmas stores virtual memory areas. Since vmas are stored by value, + // clients should usually use vmaIterator.ValuePtr() instead of + // vmaIterator.Value() to get a pointer to the vma rather than a copy. + // + // Invariants: vmas are always page-aligned. + // + // vmas is protected by mappingMu. + vmas vmaSet + + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. + usageAS uint64 + + // brk is the mm's brk, which is manipulated using the brk(2) system call. + // The brk is initially set up by the loader which maps an executable + // binary into the mm. + // + // brk is protected by mappingMu. + brk usermem.AddrRange + + // activeMu is loosely analogous to Linux's struct + // mm_struct::page_table_lock. + activeMu ssync.DowngradableRWMutex `state:"nosave"` + + // pmas stores platform mapping areas used to implement vmas. Since pmas + // are stored by value, clients should usually use pmaIterator.ValuePtr() + // instead of pmaIterator.Value() to get a pointer to the pma rather than + // a copy. + // + // Inserting or removing segments from pmas should happen along with a + // call to mm.insertRSS or mm.removeRSS. + // + // Invariants: pmas are always page-aligned. If a pma exists for a given + // address, a vma must also exist for that address. + // + // pmas is protected by activeMu. + pmas pmaSet + + // curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is + // reported as the MemoryManager's RSS. + // + // maxRSS should be modified only via insertRSS and removeRSS, not + // directly. + // + // maxRSS is protected by activeMu. 
+ curRSS uint64 + + // maxRSS is the maximum resident set size in bytes of a MemoryManager. + // It is tracked as the application adds and removes mappings to pmas. + // + // maxRSS should be modified only via insertRSS, not directly. + // + // maxRSS is protected by activeMu. + maxRSS uint64 + + // as is the platform.AddressSpace that pmas are mapped into. active is the + // number of contexts that require as to be non-nil; if active == 0, as may + // be nil. + // + // as is protected by activeMu. active is manipulated with atomic memory + // operations; transitions to and from zero are additionally protected by + // activeMu. (This is because such transitions may need to be atomic with + // changes to as.) + as platform.AddressSpace `state:"nosave"` + active int32 `state:"zerovalue"` + + // unmapAllOnActivate indicates that the next Activate call should activate + // an empty AddressSpace. + // + // This is used to ensure that an AddressSpace cached in + // NewAddressSpace is not used after some change in the MemoryManager + // or VMAs has made that AddressSpace stale. + // + // unmapAllOnActivate is protected by activeMu. It must only be set when + // there is no active or cached AddressSpace. If as != nil, then + // invalidations should be propagated immediately. + unmapAllOnActivate bool `state:"nosave"` + + // If captureInvalidations is true, calls to MM.Invalidate() are recorded + // in capturedInvalidations rather than being applied immediately to pmas. + // This is to avoid a race condition in MM.Fork(); see that function for + // details. + // + // Both captureInvalidations and capturedInvalidations are protected by + // activeMu. Neither need to be saved since captureInvalidations is only + // enabled during MM.Fork(), during which saving can't occur. + captureInvalidations bool `state:"zerovalue"` + capturedInvalidations []invalidateArgs `state:"nosave"` + + metadataMu sync.Mutex `state:"nosave"` + + // argv is the application argv. 
This is set up by the loader and may be + // modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No + // requirements apply to argv; we do not require that argv.WellFormed(). + // + // argv is protected by metadataMu. + argv usermem.AddrRange + + // envv is the application envv. This is set up by the loader and may be + // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No + // requirements apply to envv; we do not require that envv.WellFormed(). + // + // envv is protected by metadataMu. + envv usermem.AddrRange + + // auxv is the ELF's auxiliary vector. + // + // auxv is protected by metadataMu. + auxv arch.Auxv + + // executable is the executable for this MemoryManager. If executable + // is not nil, it holds a reference on the Dirent. + // + // executable is protected by metadataMu. + executable *fs.Dirent + + // aioManager keeps track of AIOContexts used for async IOs. AIOManager + // must be cloned when CLONE_VM is used. + aioManager aioManager +} + +// vma represents a virtual memory area. +type vma struct { + // mappable is the virtual memory object mapped by this vma. If mappable is + // nil, the vma represents a private anonymous mapping. + mappable memmap.Mappable + + // off is the offset into mappable at which this vma begins. If mappable is + // nil, off is meaningless. + off uint64 + + // To speedup VMA save/restore, we group and save the following booleans + // as a single integer. + + // realPerms are the memory permissions on this vma, as defined by the + // application. + realPerms usermem.AccessType `state:".(int)"` + + // effectivePerms are the memory permissions on this vma which are + // actually used to control access. + // + // Invariant: effectivePerms == realPerms.Effective(). + effectivePerms usermem.AccessType `state:"manual"` + + // maxPerms limits the set of permissions that may ever apply to this + // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions + // is true (e.g. ptrace(PTRACE_POKEDATA)). 
+ // + // Invariant: maxPerms == maxPerms.Effective(). + maxPerms usermem.AccessType `state:"manual"` + + // private is true if this is a MAP_PRIVATE mapping, such that writes to + // the mapping are propagated to a copy. + private bool `state:"manual"` + + // growsDown is true if the mapping may be automatically extended downward + // under certain conditions. If growsDown is true, mappable must be nil. + // + // There is currently no corresponding growsUp flag; in Linux, the only + // architectures that can have VM_GROWSUP mappings are ia64, parisc, and + // metag, none of which we currently support. + growsDown bool `state:"manual"` + + // If id is not nil, it controls the lifecycle of mappable and provides vma + // metadata shown in /proc/[pid]/maps, and the vma holds a reference. + id memmap.MappingIdentity + + // If hint is non-empty, it is a description of the vma printed in + // /proc/[pid]/maps. hint takes priority over id.MappedName(). + hint string +} + +const ( + vmaRealPermsRead = 1 << iota + vmaRealPermsWrite + vmaRealPermsExecute + vmaEffectivePermsRead + vmaEffectivePermsWrite + vmaEffectivePermsExecute + vmaMaxPermsRead + vmaMaxPermsWrite + vmaMaxPermsExecute + vmaPrivate + vmaGrowsDown +) + +func (v *vma) saveRealPerms() int { + var b int + if v.realPerms.Read { + b |= vmaRealPermsRead + } + if v.realPerms.Write { + b |= vmaRealPermsWrite + } + if v.realPerms.Execute { + b |= vmaRealPermsExecute + } + if v.effectivePerms.Read { + b |= vmaEffectivePermsRead + } + if v.effectivePerms.Write { + b |= vmaEffectivePermsWrite + } + if v.effectivePerms.Execute { + b |= vmaEffectivePermsExecute + } + if v.maxPerms.Read { + b |= vmaMaxPermsRead + } + if v.maxPerms.Write { + b |= vmaMaxPermsWrite + } + if v.maxPerms.Execute { + b |= vmaMaxPermsExecute + } + if v.private { + b |= vmaPrivate + } + if v.growsDown { + b |= vmaGrowsDown + } + return b +} + +func (v *vma) loadRealPerms(b int) { + if b&vmaRealPermsRead > 0 { + v.realPerms.Read = true + } + if 
b&vmaRealPermsWrite > 0 { + v.realPerms.Write = true + } + if b&vmaRealPermsExecute > 0 { + v.realPerms.Execute = true + } + if b&vmaEffectivePermsRead > 0 { + v.effectivePerms.Read = true + } + if b&vmaEffectivePermsWrite > 0 { + v.effectivePerms.Write = true + } + if b&vmaEffectivePermsExecute > 0 { + v.effectivePerms.Execute = true + } + if b&vmaMaxPermsRead > 0 { + v.maxPerms.Read = true + } + if b&vmaMaxPermsWrite > 0 { + v.maxPerms.Write = true + } + if b&vmaMaxPermsExecute > 0 { + v.maxPerms.Execute = true + } + if b&vmaPrivate > 0 { + v.private = true + } + if b&vmaGrowsDown > 0 { + v.growsDown = true + } +} + +// pma represents a platform mapping area. +type pma struct { + // file is the file mapped by this pma. Only pmas for which file == + // platform.Platform.Memory() may be saved. pmas hold a reference to the + // corresponding file range while they exist. + file platform.File `state:"nosave"` + + // off is the offset into file at which this pma begins. + off uint64 + + // vmaEffectivePerms and vmaMaxPerms are duplicated from the + // corresponding vma so that the IO implementation can avoid iterating + // mm.vmas when pmas already exist. + vmaEffectivePerms usermem.AccessType + vmaMaxPerms usermem.AccessType + + // needCOW is true if writes to the mapping must be propagated to a copy. + needCOW bool + + // private is true if this pma represents private memory. + // + // If private is true, file must be platform.Platform.Memory(), the pma + // holds a reference on the mapped memory that is tracked in privateRefs, + // and calls to Invalidate for which + // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. + // + // If private is false, this pma caches a translation from the + // corresponding vma's memmap.Mappable.Translate. + private bool + + // If internalMappings is not empty, it is the cached return value of + // file.MapInternal for the platform.FileRange mapped by this pma. 
+ internalMappings safemem.BlockSeq `state:"nosave"` +} + +type privateRefs struct { + mu sync.Mutex `state:"nosave"` + + // refs maps offsets into Platform.Memory() to the number of pmas (or, + // equivalently, MemoryManagers) that share ownership of the memory at that + // offset. + refs fileRefcountSet +} + +type invalidateArgs struct { + ar usermem.AddrRange + opts memmap.InvalidateOpts +} + +// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet. +type fileRefcountSetFunctions struct{} + +func (fileRefcountSetFunctions) MinKey() uint64 { + return 0 +} + +func (fileRefcountSetFunctions) MaxKey() uint64 { + return ^uint64(0) +} + +func (fileRefcountSetFunctions) ClearValue(_ *int32) { +} + +func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { + return rc1, rc1 == rc2 +} + +func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { + return rc, rc +} diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go new file mode 100644 index 000000000..b47aa7263 --- /dev/null +++ b/pkg/sentry/mm/mm_test.go @@ -0,0 +1,174 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mm + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func testMemoryManager(ctx context.Context) *MemoryManager { + p := platform.FromContext(ctx) + mm := NewMemoryManager(p) + mm.layout = arch.MmapLayout{ + MinAddr: p.MinUserAddress(), + MaxAddr: p.MaxUserAddress(), + BottomUpBase: p.MinUserAddress(), + TopDownBase: p.MaxUserAddress(), + } + return mm +} + +func (mm *MemoryManager) realUsageAS() uint64 { + return uint64(mm.vmas.Span()) +} + +func TestUsageASUpdates(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: 2 * usermem.PageSize, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + realUsage := mm.realUsageAS() + if mm.usageAS != realUsage { + t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage) + } + + mm.MUnmap(ctx, addr, usermem.PageSize) + realUsage = mm.realUsageAS() + if mm.usageAS != realUsage { + t.Fatalf("usageAS believes %v bytes are mapped; %v bytes are actually mapped", mm.usageAS, realUsage) + } +} + +func TestBrkDataLimitUpdates(t *testing.T) { + limitSet := limits.NewLimitSet() + limitSet.Set(limits.Data, limits.Limit{}) // zero RLIMIT_DATA + + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + // Try to extend the brk by one page and expect doing so to fail. 
+ oldBrk, _ := mm.Brk(ctx, 0) + if newBrk, _ := mm.Brk(ctx, oldBrk+usermem.PageSize); newBrk != oldBrk { + t.Errorf("brk() increased data segment above RLIMIT_DATA (old brk = %#x, new brk = %#x", oldBrk, newBrk) + } +} + +// TestIOAfterUnmap ensures that IO fails after unmap. +func TestIOAfterUnmap(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: usermem.PageSize, + Private: true, + Perms: usermem.Read, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + + // IO works before munmap. + b := make([]byte, 1) + n, err := mm.CopyIn(ctx, addr, b, usermem.IOOpts{}) + if err != nil { + t.Errorf("CopyIn got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyIn got %d want 1", n) + } + + err = mm.MUnmap(ctx, addr, usermem.PageSize) + if err != nil { + t.Fatalf("MUnmap got err %v want nil", err) + } + + n, err = mm.CopyIn(ctx, addr, b, usermem.IOOpts{}) + if err != syserror.EFAULT { + t.Errorf("CopyIn got err %v want EFAULT", err) + } + if n != 0 { + t.Errorf("CopyIn got %d want 0", n) + } +} + +// TestIOAfterMProtect tests IO interaction with mprotect permissions. +func TestIOAfterMProtect(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: usermem.PageSize, + Private: true, + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + + // Writing works before mprotect. 
+ b := make([]byte, 1) + n, err := mm.CopyOut(ctx, addr, b, usermem.IOOpts{}) + if err != nil { + t.Errorf("CopyOut got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyOut got %d want 1", n) + } + + err = mm.MProtect(addr, usermem.PageSize, usermem.Read, false) + if err != nil { + t.Errorf("MProtect got err %v want nil", err) + } + + // Without IgnorePermissions, CopyOut should no longer succeed. + n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{}) + if err != syserror.EFAULT { + t.Errorf("CopyOut got err %v want EFAULT", err) + } + if n != 0 { + t.Errorf("CopyOut got %d want 0", n) + } + + // With IgnorePermissions, CopyOut should succeed despite mprotect. + n, err = mm.CopyOut(ctx, addr, b, usermem.IOOpts{ + IgnorePermissions: true, + }) + if err != nil { + t.Errorf("CopyOut got err %v want nil", err) + } + if n != 1 { + t.Errorf("CopyOut got %d want 1", n) + } +} diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go new file mode 100644 index 000000000..35e873762 --- /dev/null +++ b/pkg/sentry/mm/pma.go @@ -0,0 +1,928 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type pmaOpts struct { + // If breakCOW is true, pmas must not be copy-on-write. + breakCOW bool +} + +// existingPMAsLocked checks that pmas exist for all addresses in ar, and +// support access of type (at, ignorePermissions). If so, it returns an +// iterator to the pma containing ar.Start. Otherwise it returns a terminal +// iterator. +// +// Preconditions: mm.activeMu must be locked. ar.Length() != 0. +func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, opts pmaOpts, needInternalMappings bool) pmaIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + first := mm.pmas.FindSegment(ar.Start) + pseg := first + for pseg.Ok() { + pma := pseg.ValuePtr() + perms := pma.vmaEffectivePerms + if ignorePermissions { + perms = pma.vmaMaxPerms + } + if !perms.SupersetOf(at) { + // These are the vma's permissions, so the caller will get an error + // when they try to get new pmas. + return pmaIterator{} + } + if opts.breakCOW && pma.needCOW { + return pmaIterator{} + } + if needInternalMappings && pma.internalMappings.IsEmpty() { + return pmaIterator{} + } + + if ar.End <= pseg.End() { + return first + } + pseg, _ = pseg.NextNonEmpty() + } + + // Ran out of pmas before reaching ar.End. + return pmaIterator{} +} + +// existingVecPMAsLocked returns true if pmas exist for all addresses in ars, +// and support access of type (at, ignorePermissions). 
+// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, opts pmaOpts, needInternalMappings bool) bool { + for ; !ars.IsEmpty(); ars = ars.Tail() { + if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, opts, needInternalMappings).Ok() { + return false + } + } + return true +} + +// getPMAsLocked ensures that pmas exist for all addresses in ar, subject to +// opts. It returns: +// +// - An iterator to the pma containing ar.Start. If no pma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last pma containing an address in ar. If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist +// for all addresses in ar. +func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, opts pmaOpts) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Ok() { + panic("terminal vma iterator") + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + pstart, pend, perr := mm.ensurePMAsLocked(ctx, vseg, ar) + if pend.Start() <= ar.Start { + return pmaIterator{}, pend, perr + } + // ensurePMAsLocked may not have pstart due to iterator invalidation. 
We + // need it, either to return it immediately or to pass to + // breakCopyOnWriteLocked. + if !pstart.Ok() { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + + var cowerr error + if opts.breakCOW { + var invalidated bool + pend, invalidated, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) + if pend.Start() <= ar.Start { + return pmaIterator{}, pend, cowerr + } + if invalidated { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + } + + if cowerr != nil { + return pstart, pend, cowerr + } + if perr != nil { + return pstart, pend, perr + } + return pstart, pend, alignerr +} + +// getVecPMAsLocked ensures that pmas exist for all addresses in ars. It +// returns the subset of ars for which pmas exist. If this is not equal to ars, +// it returns a non-nil error explaining why. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. vmas must exist for all addresses in ars. +func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, opts pmaOpts) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + + // Page-align ar so that all AddrRanges are aligned. 
+ end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + pstart, pend, perr := mm.ensurePMAsLocked(ctx, mm.vmas.FindSegment(ar.Start), ar) + if pend.Start() <= ar.Start { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr + } + + var cowerr error + if opts.breakCOW { + if !pstart.Ok() { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + pend, _, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) + } + + if cowerr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), cowerr + } + if perr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr + } + if alignerr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr + } + } + + return ars, nil +} + +// ensurePMAsLocked ensures that pmas exist for all addresses in ar. It returns: +// +// - An iterator to the pma containing ar.Start, on a best-effort basis (that +// is, the returned iterator may be terminal, even if such a pma exists). +// Returning this iterator on a best-effort basis allows callers that require +// it to use it when it's cheaply available, while also avoiding the overhead +// of retrieving it when it's not. +// +// - An iterator to the gap after the last pma containing an address in ar. If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. ar.Length() != 0. ar must be page-aligned. +// vseg.Range().Contains(ar.Start). vmas must exist for all addresses in ar. 
+func (mm *MemoryManager) ensurePMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + pstart, pgap := mm.pmas.Find(ar.Start) + if pstart.Ok() { + pgap = pstart.NextGap() + } + for pgap.Start() < ar.End { + if pgap.Range().Length() == 0 { + pgap = pgap.NextGap() + continue + } + // A single pgap might be spanned by multiple vmas. Insert pmas to + // cover the first (vma, pgap) pair. + pgapAR := pgap.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(pgapAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", pgapAR.Start)) + } + if pgapAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", pgapAR.Start, vseg.Start())) + } + } + var err error + pgap, err = mm.insertPMAsLocked(ctx, vseg, pgap, ar) + // insertPMAsLocked most likely invalidated iterators, so pstart is now + // unknown. + pstart = pmaIterator{} + if err != nil { + return pstart, pgap, err + } + } + return pstart, pgap, nil +} + +const ( + // When memory is allocated for a private pma, align the allocated address + // range to a privateAllocUnit boundary when possible. Larger values of + // privateAllocUnit may reduce page faults by allowing fewer, larger pmas + // to be mapped, but may result in larger amounts of wasted memory in the + // presence of fragmentation. privateAllocUnit must be a power-of-2 + // multiple of usermem.PageSize. 
+	privateAllocUnit = usermem.HugePageSize
+
+	privateAllocMask = privateAllocUnit - 1
+)
+
+func privateAligned(ar usermem.AddrRange) usermem.AddrRange {
+	aligned := usermem.AddrRange{ar.Start &^ privateAllocMask, ar.End}
+	if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
+		aligned.End = end
+	}
+	if checkInvariants {
+		if !aligned.IsSupersetOf(ar) {
+			panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
+		}
+	}
+	return aligned
+}
+
+// insertPMAsLocked inserts pmas into pgap corresponding to the vma iterated by
+// vseg, spanning at least ar. It returns:
+//
+// - An iterator to the gap after the last pma containing an address in ar. If
+// pmas exist for no addresses in ar, the iterator is to a gap that begins
+// before ar.Start.
+//
+// - An error that is non-nil if pmas exist for only a subset of ar.
+//
+// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for
+// writing. vseg.Range().Intersect(pgap.Range()).Intersect(ar).Length() != 0.
+// ar must be page-aligned.
+func (mm *MemoryManager) insertPMAsLocked(ctx context.Context, vseg vmaIterator, pgap pmaGapIterator, ar usermem.AddrRange) (pmaGapIterator, error) {
+	optAR := vseg.Range().Intersect(pgap.Range())
+	if checkInvariants {
+		if optAR.Length() <= 0 {
+			panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
+		}
+		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+			panic(fmt.Sprintf("invalid ar %v", ar))
+		}
+	}
+	vma := vseg.ValuePtr()
+
+	// Private anonymous mappings get pmas by allocating.
+	if vma.mappable == nil {
+		// Limit the range we allocate to ar, aligned to privateAllocUnit.
+		maskAR := privateAligned(ar)
+		allocAR := optAR.Intersect(maskAR)
+		mem := mm.p.Memory()
+		fr, err := mem.Allocate(uint64(allocAR.Length()), usage.Anonymous)
+		if err != nil {
+			return pgap, err
+		}
+		mm.incPrivateRef(fr)
+
+		if checkInvariants {
+			if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
+				panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
+			}
+		}
+
+		mm.addRSSLocked(allocAR)
+		mem.IncRef(fr)
+
+		return mm.pmas.Insert(pgap, allocAR, pma{
+			file:              mem,
+			off:               fr.Start,
+			vmaEffectivePerms: vma.effectivePerms,
+			vmaMaxPerms:       vma.maxPerms,
+			private:           true,
+			// Since we just allocated this memory and have the only reference,
+			// the new pma does not need copy-on-write.
+		}).NextGap(), nil
+	}
+
+	// Other mappings get pmas by translating. Limit the required range
+	// to ar.
+	optMR := vseg.mappableRangeOf(optAR)
+	reqAR := optAR.Intersect(ar)
+	reqMR := vseg.mappableRangeOf(reqAR)
+	perms := vma.maxPerms
+	if vma.private {
+		perms.Write = false
+	}
+	ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
+	if checkInvariants {
+		if err := memmap.CheckTranslateResult(reqMR, optMR, ts, err); err != nil {
+			panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v): %v", vma.mappable, reqMR, optMR, err))
+		}
+	}
+
+	// Install a pma for each Translation.
+	for _, t := range ts {
+		// This is valid because memmap.Mappable.Translate is required to
+		// return Translations in increasing Translation.Source order.
+		addrRange := vseg.addrRangeOf(t.Source)
+		mm.addRSSLocked(addrRange)
+		pseg := mm.pmas.Insert(pgap, addrRange, pma{
+			file:              t.File,
+			off:               t.Offset,
+			vmaEffectivePerms: vma.effectivePerms,
+			vmaMaxPerms:       vma.maxPerms,
+			needCOW:           vma.private,
+		})
+		// The new pseg may have been merged with existing segments, only take a
+		// ref on the inserted range.
+		t.File.IncRef(pseg.fileRangeOf(addrRange))
+		pgap = pseg.NextGap()
+	}
+
+	// Even if Translate returned an error, if we got to ar.End,
+	// insertPMAsLocked succeeded.
+	if ar.End <= pgap.Start() {
+		return pgap, nil
+	}
+	return pgap, err
+}
+
+// breakCopyOnWriteLocked ensures that pmas in ar are not copy-on-write. It
+// returns:
+//
+// - An iterator to the gap after the last non-COW pma containing an address in
+// ar. If non-COW pmas exist for no addresses in ar, the iterator is to a gap
+// that begins before ar.Start.
+//
+// - A boolean that is true if iterators into mm.pmas may have been
+// invalidated.
+//
+// - An error that is non-nil if non-COW pmas exist for only a subset of ar.
+//
+// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar
+// must be page-aligned. pseg.Range().Contains(ar.Start). pmas must exist for
+// all addresses in ar.
+func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, bool, error) {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+		if !pseg.Range().Contains(ar.Start) {
+			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
+		}
+	}
+
+	// Limit the range we copy to ar, aligned to privateAllocUnit.
+	maskAR := privateAligned(ar)
+	var invalidatedIterators, didUnmapAS bool
+	mem := mm.p.Memory()
+	for {
+		if mm.isPMACopyOnWriteLocked(pseg) {
+			// Determine the range to copy.
+			copyAR := pseg.Range().Intersect(maskAR)
+
+			// Get internal mappings from the pma to copy from.
+			if err := pseg.getInternalMappingsLocked(); err != nil {
+				return pseg.PrevGap(), invalidatedIterators, err
+			}
+
+			// Copy contents.
+			fr, err := platform.AllocateAndFill(mem, uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)})
+			if _, ok := err.(safecopy.BusError); ok {
+				// If we got SIGBUS during the copy, deliver SIGBUS to
+				// userspace (instead of SIGSEGV) if we're breaking
+				// copy-on-write due to application page fault.
+				err = &memmap.BusError{err}
+			}
+			if fr.Length() == 0 {
+				return pseg.PrevGap(), invalidatedIterators, err
+			}
+			mm.incPrivateRef(fr)
+			mem.IncRef(fr)
+
+			// Unmap all of maskAR, not just copyAR, to minimize host syscalls.
+			// AddressSpace mappings must be removed before mm.decPrivateRef().
+			if !didUnmapAS {
+				mm.unmapASLocked(maskAR)
+				didUnmapAS = true
+			}
+
+			// Replace the pma with a copy in the part of the address range
+			// where copying was successful.
+			copyAR.End = copyAR.Start + usermem.Addr(fr.Length())
+			if copyAR != pseg.Range() {
+				pseg = mm.pmas.Isolate(pseg, copyAR)
+				invalidatedIterators = true
+			}
+			pma := pseg.ValuePtr()
+			if pma.private {
+				mm.decPrivateRef(pseg.fileRange())
+			}
+			pma.file.DecRef(pseg.fileRange())
+
+			pma.file = mem
+			pma.off = fr.Start
+			pma.private = true
+			pma.needCOW = false
+			pma.internalMappings = safemem.BlockSeq{}
+
+			// Try to merge pma with its neighbors.
+			if prev := pseg.PrevSegment(); prev.Ok() {
+				if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
+					pseg = merged
+					invalidatedIterators = true
+				}
+			}
+			if next := pseg.NextSegment(); next.Ok() {
+				if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
+					pseg = merged
+					invalidatedIterators = true
+				}
+			}
+
+			// If an error occurred after ar.End, breakCopyOnWriteLocked still
+			// did its job, so discard the error.
+			if err != nil && pseg.End() < ar.End {
+				return pseg.NextGap(), invalidatedIterators, err
+			}
+		}
+		// This checks against ar.End, not maskAR.End, so we will never break
+		// COW on a pma that does not intersect ar.
+		if ar.End <= pseg.End() {
+			return pseg.NextGap(), invalidatedIterators, nil
+		}
+		pseg = pseg.NextSegment()
+	}
+}
+
+// Preconditions: mm.activeMu must be locked for writing.
+func (mm *MemoryManager) isPMACopyOnWriteLocked(pseg pmaIterator) bool {
+	pma := pseg.ValuePtr()
+	if !pma.needCOW {
+		return false
+	}
+	if !pma.private {
+		return true
+	}
+	// If we have the only reference on private memory to be copied, just take
+	// ownership of it instead of copying. If we do hold the only reference,
+	// additional references can only be taken by mm.Fork(), which is excluded
+	// by mm.activeMu, so this isn't racy.
+	mm.privateRefs.mu.Lock()
+	defer mm.privateRefs.mu.Unlock()
+	fr := pseg.fileRange()
+	// This check relies on mm.privateRefs.refs being kept fully merged.
+	rseg := mm.privateRefs.refs.FindSegment(fr.Start)
+	if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() {
+		pma.needCOW = false
+		return false
+	}
+	return true
+}
+
+// Invalidate implements memmap.MappingSpace.Invalidate.
+func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+	}
+
+	mm.activeMu.Lock()
+	defer mm.activeMu.Unlock()
+	if mm.captureInvalidations {
+		mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
+		return
+	}
+	mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
+}
+
+// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
+// addresses in ar.
+//
+// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar
+// must be page-aligned.
+func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+	}
+
+	var didUnmapAS bool
+	pseg := mm.pmas.LowerBoundSegment(ar.Start)
+	for pseg.Ok() && pseg.Start() < ar.End {
+		pma := pseg.ValuePtr()
+		if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
+			pseg = mm.pmas.Isolate(pseg, ar)
+			pma = pseg.ValuePtr()
+			if !didUnmapAS {
+				// Unmap all of ar, not just pseg.Range(), to minimize host
+				// syscalls. AddressSpace mappings must be removed before
+				// mm.decPrivateRef().
+				mm.unmapASLocked(ar)
+				didUnmapAS = true
+			}
+			if pma.private {
+				mm.decPrivateRef(pseg.fileRange())
+			}
+			mm.removeRSSLocked(pseg.Range())
+			pma.file.DecRef(pseg.fileRange())
+			pseg = mm.pmas.Remove(pseg).NextSegment()
+		} else {
+			pseg = pseg.NextSegment()
+		}
+	}
+}
+
+// movePMAsLocked moves all pmas in oldAR to newAR.
+//
+// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0.
+// oldAR.Length() == newAR.Length(). !oldAR.Overlaps(newAR).
+// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned.
+func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) {
+	if checkInvariants {
+		if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() {
+			panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
+		}
+		if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() {
+			panic(fmt.Sprintf("invalid newAR: %v", newAR))
+		}
+		if oldAR.Length() != newAR.Length() {
+			panic(fmt.Sprintf("old and new address ranges have different lengths: %v, %v", oldAR, newAR))
+		}
+		if oldAR.Overlaps(newAR) {
+			panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
+		}
+		// mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
+	}
+
+	type movedPMA struct {
+		oldAR usermem.AddrRange
+		pma   pma
+	}
+	var movedPMAs []movedPMA
+	pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
+	for pseg.Ok() && pseg.Start() < oldAR.End {
+		pseg = mm.pmas.Isolate(pseg, oldAR)
+		movedPMAs = append(movedPMAs, movedPMA{
+			oldAR: pseg.Range(),
+			pma:   pseg.Value(),
+		})
+		mm.removeRSSLocked(pseg.Range())
+		pseg = mm.pmas.Remove(pseg).NextSegment()
+	}
+
+	off := newAR.Start - oldAR.Start
+	pgap := mm.pmas.FindGap(newAR.Start)
+	for i := range movedPMAs {
+		mpma := &movedPMAs[i]
+		pmaNewAR := usermem.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
+		mm.addRSSLocked(pmaNewAR)
+		pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
+	}
+
+	mm.unmapASLocked(oldAR)
+}
+
+// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have
+// cached internal mappings. It returns:
+//
+// - An iterator to the gap after the last pma with internal mappings
+// containing an address in ar. If internal mappings exist for no addresses in
+// ar, the iterator is to a gap that begins before ar.Start.
+//
+// - An error that is non-nil if internal mappings exist for only a subset of
+// ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar.
+// ar.Length() != 0.
+//
+// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
+// into mm.pmas.
+func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+		if !pseg.Range().Contains(ar.Start) {
+			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
+		}
+	}
+
+	for {
+		if err := pseg.getInternalMappingsLocked(); err != nil {
+			return pseg.PrevGap(), err
+		}
+		if ar.End <= pseg.End() {
+			return pseg.NextGap(), nil
+		}
+		pseg, _ = pseg.NextNonEmpty()
+	}
+}
+
+// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars
+// have cached internal mappings. It returns the subset of ars for which
+// internal mappings exist. If this is not equal to ars, it returns a non-nil
+// error explaining why.
+//
+// Preconditions: mm.activeMu must be locked for writing. pmas must exist for
+// all addresses in ar.
+//
+// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
+// into mm.pmas.
+func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSeq) (usermem.AddrRangeSeq, error) {
+	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+		ar := arsit.Head()
+		if ar.Length() == 0 {
+			continue
+		}
+		if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil {
+			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err
+		}
+	}
+	return ars, nil
+}
+
+// internalMappingsLocked returns internal mappings for addresses in ar.
+//
+// Preconditions: mm.activeMu must be locked. Internal mappings must have been
+// previously established for all addresses in ar. ar.Length() != 0.
+// pseg.Range().Contains(ar.Start).
+func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+		if !pseg.Range().Contains(ar.Start) {
+			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
+		}
+	}
+
+	if ar.End <= pseg.End() {
+		// Since only one pma is involved, we can use pma.internalMappings
+		// directly, avoiding a slice allocation.
+		offset := uint64(ar.Start - pseg.Start())
+		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
+	}
+
+	var ims []safemem.Block
+	for {
+		pr := pseg.Range().Intersect(ar)
+		for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
+			ims = append(ims, pims.Head())
+		}
+		if ar.End <= pseg.End() {
+			break
+		}
+		pseg = pseg.NextSegment()
+	}
+	return safemem.BlockSeqFromSlice(ims)
+}
+
+// vecInternalMappingsLocked returns internal mappings for addresses in ars.
+//
+// Preconditions: mm.activeMu must be locked. Internal mappings must have been
+// previously established for all addresses in ars.
+func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq {
+	var ims []safemem.Block
+	for ; !ars.IsEmpty(); ars = ars.Tail() {
+		ar := ars.Head()
+		if ar.Length() == 0 {
+			continue
+		}
+		for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
+			ims = append(ims, pims.Head())
+		}
+	}
+	return safemem.BlockSeqFromSlice(ims)
+}
+
+// incPrivateRef acquires a reference on private pages in fr.
+func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) {
+	mm.privateRefs.mu.Lock()
+	defer mm.privateRefs.mu.Unlock()
+	refSet := &mm.privateRefs.refs
+	seg, gap := refSet.Find(fr.Start)
+	for {
+		switch {
+		case seg.Ok() && seg.Start() < fr.End:
+			seg = refSet.Isolate(seg, fr)
+			seg.SetValue(seg.Value() + 1)
+			seg, gap = seg.NextNonEmpty()
+		case gap.Ok() && gap.Start() < fr.End:
+			seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty()
+		default:
+			refSet.MergeAdjacent(fr)
+			return
+		}
+	}
+}
+
+// decPrivateRef releases a reference on private pages in fr.
+func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) {
+	var freed []platform.FileRange
+
+	mm.privateRefs.mu.Lock()
+	refSet := &mm.privateRefs.refs
+	seg := refSet.LowerBoundSegment(fr.Start)
+	for seg.Ok() && seg.Start() < fr.End {
+		seg = refSet.Isolate(seg, fr)
+		if old := seg.Value(); old == 1 {
+			freed = append(freed, seg.Range())
+			seg = refSet.Remove(seg).NextSegment()
+		} else {
+			seg.SetValue(old - 1)
+			seg = seg.NextSegment()
+		}
+	}
+	refSet.MergeAdjacent(fr)
+	mm.privateRefs.mu.Unlock()
+
+	mem := mm.p.Memory()
+	for _, fr := range freed {
+		mem.DecRef(fr)
+	}
+}
+
+// addRSSLocked updates the current and maximum resident set size of a
+// MemoryManager to reflect the insertion of a pma at ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (mm *MemoryManager) addRSSLocked(ar usermem.AddrRange) {
+	mm.curRSS += uint64(ar.Length())
+	if mm.curRSS > mm.maxRSS {
+		mm.maxRSS = mm.curRSS
+	}
+}
+
+// removeRSSLocked updates the current resident set size of a MemoryManager to
+// reflect the removal of a pma at ar.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (mm *MemoryManager) removeRSSLocked(ar usermem.AddrRange) {
+	mm.curRSS -= uint64(ar.Length())
+}
+
+// pmaSetFunctions implements segment.Functions for pmaSet.
+type pmaSetFunctions struct{}
+
+func (pmaSetFunctions) MinKey() usermem.Addr {
+	return 0
+}
+
+func (pmaSetFunctions) MaxKey() usermem.Addr {
+	return ^usermem.Addr(0)
+}
+
+func (pmaSetFunctions) ClearValue(pma *pma) {
+	pma.file = nil
+	pma.internalMappings = safemem.BlockSeq{}
+}
+
+func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRange, pma2 pma) (pma, bool) {
+	if pma1.file != pma2.file ||
+		pma1.off+uint64(ar1.Length()) != pma2.off ||
+		pma1.vmaEffectivePerms != pma2.vmaEffectivePerms ||
+		pma1.vmaMaxPerms != pma2.vmaMaxPerms ||
+		pma1.needCOW != pma2.needCOW ||
+		pma1.private != pma2.private {
+		return pma{}, false
+	}
+
+	// Discard internal mappings instead of trying to merge them, since merging
+	// them requires an allocation and getting them again from the
+	// platform.File might not.
+	pma1.internalMappings = safemem.BlockSeq{}
+	return pma1, true
+}
+
+func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (pma, pma) {
+	newlen1 := uint64(split - ar.Start)
+	p2 := p
+	p2.off += newlen1
+	if !p.internalMappings.IsEmpty() {
+		p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
+		p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
+	}
+	return p, p2
+}
+
+// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
+// so by scanning linearly backward from pgap.
+//
+// Preconditions: mm.activeMu must be locked. addr <= pgap.Start().
+func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator {
+	if checkInvariants {
+		if !pgap.Ok() {
+			panic("terminal pma iterator")
+		}
+		if addr > pgap.Start() {
+			panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
+		}
+	}
+	// Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
+	// which is the case if findOrSeekPrevUpperBoundPMA is called to find the
+	// start of a range containing only a single PMA.
+	if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
+		return pseg
+	}
+	return mm.pmas.UpperBoundSegment(addr)
+}
+
+// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
+// non-empty.
+//
+// Preconditions: mm.activeMu must be locked for writing.
+func (pseg pmaIterator) getInternalMappingsLocked() error {
+	pma := pseg.ValuePtr()
+	if pma.internalMappings.IsEmpty() {
+		// Internal mappings are used for ignorePermissions accesses,
+		// so we need to use vma.maxPerms instead of
+		// vma.effectivePerms. However, we will never execute
+		// application code through an internal mapping, and we don't
+		// actually need a writable mapping if copy-on-write is in
+		// effect. (But get a writable mapping anyway if the pma is
+		// private, so that if breakCopyOnWriteLocked =>
+		// isPMACopyOnWriteLocked takes ownership of the pma instead of
+		// copying, it doesn't need to get a new mapping.)
+		perms := pma.vmaMaxPerms
+		perms.Execute = false
+		if pma.needCOW && !pma.private {
+			perms.Write = false
+		}
+		ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
+		if err != nil {
+			return err
+		}
+		pma.internalMappings = ims
+	}
+	return nil
+}
+
+func (pseg pmaIterator) fileRange() platform.FileRange {
+	return pseg.fileRangeOf(pseg.Range())
+}
+
+// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0.
+func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange {
+	if checkInvariants {
+		if !pseg.Ok() {
+			panic("terminal pma iterator")
+		}
+		if !ar.WellFormed() || ar.Length() <= 0 {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+		if !pseg.Range().IsSupersetOf(ar) {
+			panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
+		}
+	}
+
+	pma := pseg.ValuePtr()
+	pstart := pseg.Start()
+	return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
+}
diff --git a/pkg/sentry/mm/proc_pid_maps.go b/pkg/sentry/mm/proc_pid_maps.go
new file mode 100644
index 000000000..5840b257c
--- /dev/null
+++ b/pkg/sentry/mm/proc_pid_maps.go
@@ -0,0 +1,105 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+	"bytes"
+	"fmt"
+	"strings"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+	// devMinorBits is the number of minor bits in a device number. Linux:
+	// include/linux/kdev_t.h:MINORBITS
+	devMinorBits = 20
+)
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (mm *MemoryManager) NeedsUpdate(generation int64) bool {
+	return true
+}
+
+// ReadSeqFileData is called by fs/proc.mapsData.ReadSeqFileData.
+func (mm *MemoryManager) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	var data []seqfile.SeqData
+	var start usermem.Addr
+	if handle != nil {
+		start = *handle.(*usermem.Addr)
+	}
+	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+		// FIXME: If we use a usermem.Addr for the handle, we get
+		// "panic: autosave error: type usermem.Addr is not registered".
+		vmaAddr := vseg.End()
+		data = append(data, seqfile.SeqData{
+			Buf:    mm.vmaMapsEntryLocked(ctx, vseg),
+			Handle: &vmaAddr,
+		})
+	}
+	return data, 1
+}
+
+// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by
+// vseg, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
+	vma := vseg.ValuePtr()
+	private := "p"
+	if !vma.private {
+		private = "s"
+	}
+
+	var dev, ino uint64
+	if vma.id != nil {
+		dev = vma.id.DeviceID()
+		ino = vma.id.InodeID()
+	}
+	devMajor := uint32(dev >> devMinorBits)
+	devMinor := uint32(dev & ((1 << devMinorBits) - 1))
+
+	var b bytes.Buffer
+	// Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() =>
+	// stack_guard_page_start().
+	fmt.Fprintf(&b, "%08x-%08x %s%s %08x %02x:%02x %d ",
+		vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino)
+
+	// Figure out our filename or hint.
+	var s string
+	if vma.hint != "" {
+		s = vma.hint
+	} else if vma.id != nil {
+		// FIXME: We are holding mm.mappingMu here, which is
+		// consistent with Linux's holding mmap_sem in
+		// fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path().
+		// However, it's not clear that fs.File.MappedName() is actually
+		// consistent with this lock order.
+		s = vma.id.MappedName(ctx)
+	}
+	if s != "" {
+		// Per linux, we pad until the 74th character.
+		if pad := 73 - b.Len(); pad > 0 {
+			b.WriteString(strings.Repeat(" ", pad))
+		}
+		b.WriteString(s)
+	}
+	b.WriteString("\n")
+	return b.Bytes()
+}
diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go
new file mode 100644
index 000000000..36fed8f1c
--- /dev/null
+++ b/pkg/sentry/mm/save_restore.go
@@ -0,0 +1,57 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+	"fmt"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all
+// Mappables mapped by mm.
+func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+		if vma := vseg.ValuePtr(); vma.mappable != nil {
+			if err := vma.mappable.InvalidateUnsavable(ctx); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// beforeSave is invoked by stateify.
+func (mm *MemoryManager) beforeSave() {
+	mem := mm.p.Memory()
+	for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
+		if pma := pseg.ValuePtr(); pma.file != mem {
+			// InvalidateUnsavable should have caused all such pmas to be
+			// invalidated.
+			panic(fmt.Sprintf("Can't save pma %#v with non-Memory file of type %T:\n%s", pseg.Range(), pma.file, mm))
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (mm *MemoryManager) afterLoad() {
+	mm.haveASIO = mm.p.SupportsAddressSpaceIO()
+	mem := mm.p.Memory()
+	for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
+		pseg.ValuePtr().file = mem
+	}
+}
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
new file mode 100644
index 000000000..9d3614034
--- /dev/null
+++ b/pkg/sentry/mm/special_mappable.go
@@ -0,0 +1,147 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/refs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with
+// semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except
+// that SpecialMappable takes ownership of the memory that it represents
+// (_install_special_mapping() does not.)
+type SpecialMappable struct {
+	refs.AtomicRefCount
+
+	p    platform.Platform
+	fr   platform.FileRange
+	name string
+}
+
+// NewSpecialMappable returns a SpecialMappable that owns fr, which represents
+// offsets in p.Memory() that contain the SpecialMappable's data. The
+// SpecialMappable will use the given name in /proc/[pid]/maps.
+//
+// Preconditions: fr.Length() != 0.
+func NewSpecialMappable(name string, p platform.Platform, fr platform.FileRange) *SpecialMappable {
+	return &SpecialMappable{p: p, fr: fr, name: name}
+}
+
+// DecRef implements refs.RefCounter.DecRef.
+func (m *SpecialMappable) DecRef() {
+	m.AtomicRefCount.DecRefWithDestructor(func() {
+		m.p.Memory().DecRef(m.fr)
+	})
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (m *SpecialMappable) MappedName(ctx context.Context) string {
+	return m.name
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (m *SpecialMappable) DeviceID() uint64 {
+	return 0
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (m *SpecialMappable) InodeID() uint64 {
+	return 0
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
+	// Linux: vm_file is NULL, causing msync to skip it entirely.
+	return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (m *SpecialMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error {
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (m *SpecialMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) {
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (m *SpecialMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error {
+	return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	var err error
+	if required.End > m.fr.Length() {
+		err = &memmap.BusError{syserror.EFAULT}
+	}
+	if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
+		return []memmap.Translation{
+			{
+				Source: source,
+				File:   m.p.Memory(),
+				Offset: m.fr.Start + source.Start,
+			},
+		}, err
+	}
+	return nil, err
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error {
+	// Since data is stored in platform.Platform.Memory(), the contents of
+	// which are preserved across save/restore, we don't need to do anything.
+	return nil
+}
+
+// Platform returns the Platform whose Memory stores the SpecialMappable's
+// contents.
+func (m *SpecialMappable) Platform() platform.Platform {
+	return m.p
+}
+
+// FileRange returns the offsets into Platform().Memory() that store the
+// SpecialMappable's contents.
+func (m *SpecialMappable) FileRange() platform.FileRange {
+	return m.fr
+}
+
+// Length returns the length of the SpecialMappable.
+func (m *SpecialMappable) Length() uint64 {
+	return m.fr.Length()
+}
+
+// NewSharedAnonMappable returns a SpecialMappable that implements the
+// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero.
+//
+// TODO: The use of SpecialMappable is a lazy code reuse hack. Linux
+// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should
+// do the same to get non-zero device and inode IDs.
+func NewSharedAnonMappable(length uint64, p platform.Platform) (*SpecialMappable, error) {
+	if length == 0 || length != uint64(usermem.Addr(length).RoundDown()) {
+		return nil, syserror.EINVAL
+	}
+	fr, err := p.Memory().Allocate(length, usage.Anonymous)
+	if err != nil {
+		return nil, err
+	}
+	return NewSpecialMappable("/dev/zero (deleted)", p, fr), nil
+}
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
new file mode 100644
index 000000000..0730be65b
--- /dev/null
+++ b/pkg/sentry/mm/syscalls.go
@@ -0,0 +1,794 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+	"fmt"
+	mrand "math/rand"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// HandleUserFault handles an application page fault. sp is the faulting
+// application thread's stack pointer.
+//
+// Preconditions: mm.as != nil.
+func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error {
+	ar, ok := addr.RoundDown().ToRange(usermem.PageSize)
+	if !ok {
+		return syserror.EFAULT
+	}
+
+	// Don't bother trying existingPMAsLocked; in most cases, if we did have
+	// existing pmas, we wouldn't have faulted.
+
+	// Ensure that we have a usable vma. Here and below, since we are only
+	// asking for a single page, there is no possibility of partial success,
+	// and any error is immediately fatal.
+	mm.mappingMu.RLock()
+	vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
+	if err != nil {
+		mm.mappingMu.RUnlock()
+		return err
+	}
+
+	// Ensure that we have a usable pma.
+	mm.activeMu.Lock()
+	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{
+		breakCOW: at.Write,
+	})
+	mm.mappingMu.RUnlock()
+	if err != nil {
+		mm.activeMu.Unlock()
+		return err
+	}
+
+	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+	// anymore.
+	mm.activeMu.DowngradeLock()
+
+	// Map the faulted page into the active AddressSpace.
+	err = mm.mapASLocked(pseg, ar, false)
+	mm.activeMu.RUnlock()
+	return err
+}
+
+// MMap establishes a memory mapping.
+func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) {
+	if opts.Length == 0 {
+		return 0, syserror.EINVAL
+	}
+	length, ok := usermem.Addr(opts.Length).RoundUp()
+	if !ok {
+		return 0, syserror.ENOMEM
+	}
+	opts.Length = uint64(length)
+
+	if opts.Mappable != nil {
+		// Offset must be aligned.
+		if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) {
+			return 0, syserror.EINVAL
+		}
+		// Offset + length must not overflow.
+		if end := opts.Offset + opts.Length; end < opts.Offset {
+			return 0, syserror.ENOMEM
+		}
+	} else {
+		opts.Offset = 0
+		if !opts.Private {
+			if opts.MappingIdentity != nil {
+				return 0, syserror.EINVAL
+			}
+			m, err := NewSharedAnonMappable(opts.Length, platform.FromContext(ctx))
+			if err != nil {
+				return 0, err
+			}
+			opts.MappingIdentity = m
+			opts.Mappable = m
+		}
+	}
+
+	if opts.Addr.RoundDown() != opts.Addr {
+		// MAP_FIXED requires addr to be page-aligned; non-fixed mappings
+		// don't.
+		if opts.Fixed {
+			return 0, syserror.EINVAL
+		}
+		opts.Addr = opts.Addr.RoundDown()
+	}
+
+	if !opts.MaxPerms.SupersetOf(opts.Perms) {
+		return 0, syserror.EACCES
+	}
+	if opts.Unmap && !opts.Fixed {
+		return 0, syserror.EINVAL
+	}
+	if opts.GrowsDown && opts.Mappable != nil {
+		return 0, syserror.EINVAL
+	}
+
+	// Get the new vma.
+	mm.mappingMu.Lock()
+	vseg, ar, err := mm.createVMALocked(ctx, opts)
+	if err != nil {
+		mm.mappingMu.Unlock()
+		return 0, err
+	}
+
+	switch {
+	case opts.Precommit:
+		// Get pmas and map with precommit as requested.
+		mm.populateAndUnlock(ctx, vseg, ar, true)
+
+	case opts.Mappable == nil && length <= privateAllocUnit:
+		// NOTE: Get pmas and map eagerly in the hope
+		// that doing so will save on future page faults. We only do this for
+		// anonymous mappings, since otherwise the cost of
+		// memmap.Mappable.Translate is unknown; and only for small mappings,
+		// to avoid needing to allocate large amounts of memory that we may
+		// subsequently need to checkpoint.
+		mm.populateAndUnlock(ctx, vseg, ar, false)
+
+	default:
+		mm.mappingMu.Unlock()
+	}
+
+	return ar.Start, nil
+}
+
+// Preconditions: mm.mappingMu must be locked for writing.
+//
+// Postconditions: mm.mappingMu will be unlocked.
+func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+	if !vseg.ValuePtr().effectivePerms.Any() {
+		// Linux doesn't populate inaccessible pages. See
+		// mm/gup.c:populate_vma_page_range.
+		mm.mappingMu.Unlock()
+		return
+	}
+
+	mm.activeMu.Lock()
+
+	// Even if we get a new pma, we can't actually map it if we don't have an
+	// AddressSpace.
+	if mm.as == nil {
+		mm.activeMu.Unlock()
+		mm.mappingMu.Unlock()
+		return
+	}
+
+	// Ensure that we have usable pmas.
+	mm.mappingMu.DowngradeLock()
+	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{})
+	mm.mappingMu.RUnlock()
+	if err != nil {
+		// mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
+		// mm/gup.c:mm_populate(). If it matters, we'll get it again when
+		// userspace actually tries to use the failing page.
+		mm.activeMu.Unlock()
+		return
+	}
+
+	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+	// anymore.
+	mm.activeMu.DowngradeLock()
+
+	// As above, errors are silently ignored.
+	mm.mapASLocked(pseg, ar, precommit)
+	mm.activeMu.RUnlock()
+}
+
+// MapStack allocates the initial process stack.
+func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
+	// maxStackSize is the maximum supported process stack size in bytes.
+	//
+	// This limit exists because stack growing isn't implemented, so the entire
+	// process stack must be mapped up-front.
+	const maxStackSize = 128 << 20
+
+	stackSize := limits.FromContext(ctx).Get(limits.Stack)
+	r, ok := usermem.Addr(stackSize.Cur).RoundUp()
+	sz := uint64(r)
+	if !ok {
+		// RLIM_INFINITY rounds up to 0.
+		sz = linux.DefaultStackSoftLimit
+	} else if sz > maxStackSize {
+		ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
+		sz = maxStackSize
+	} else if sz == 0 {
+		return usermem.AddrRange{}, syserror.ENOMEM
+	}
+	szaddr := usermem.Addr(sz)
+	ctx.Debugf("Allocating stack with size of %v bytes", sz)
+
+	// Determine the stack's desired location. Unlike Linux, address
+	// randomization can't be disabled.
+	stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
+	if stackEnd < szaddr {
+		return usermem.AddrRange{}, syserror.ENOMEM
+	}
+	stackStart := stackEnd - szaddr
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	_, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+		Length:    sz,
+		Addr:      stackStart,
+		Perms:     usermem.ReadWrite,
+		MaxPerms:  usermem.AnyAccess,
+		Private:   true,
+		GrowsDown: true,
+		Hint:      "[stack]",
+	})
+	return ar, err
+}
+
+// MUnmap implements the semantics of Linux's munmap(2).
+func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error {
+	if addr != addr.RoundDown() {
+		return syserror.EINVAL
+	}
+	if length == 0 {
+		return syserror.EINVAL
+	}
+	la, ok := usermem.Addr(length).RoundUp()
+	if !ok {
+		return syserror.EINVAL
+	}
+	ar, ok := addr.ToRange(uint64(la))
+	if !ok {
+		return syserror.EINVAL
+	}
+
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	mm.unmapLocked(ctx, ar)
+	return nil
+}
+
+// MRemapOpts specifies options to MRemap.
+type MRemapOpts struct {
+	// Move controls whether MRemap moves the remapped mapping to a new address.
+	Move MRemapMoveMode
+
+	// NewAddr is the new address for the remapping. NewAddr is ignored unless
+	// Move is MMRemapMustMove.
+	NewAddr usermem.Addr
+}
+
+// MRemapMoveMode controls MRemap's moving behavior.
+type MRemapMoveMode int
+
+const (
+	// MRemapNoMove prevents MRemap from moving the remapped mapping.
+	MRemapNoMove MRemapMoveMode = iota
+
+	// MRemapMayMove allows MRemap to move the remapped mapping.
+	MRemapMayMove
+
+	// MRemapMustMove requires MRemap to move the remapped mapping to
+	// MRemapOpts.NewAddr, replacing any existing mappings in the remapped
+	// range.
+	MRemapMustMove
+)
+
+// MRemap implements the semantics of Linux's mremap(2).
+func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) { + // "Note that old_address has to be page aligned." - mremap(2) + if oldAddr.RoundDown() != oldAddr { + return 0, syserror.EINVAL + } + + // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a + // valid size. However, new_size can't be 0 after rounding. + oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp() + oldSize = uint64(oldSizeAddr) + newSizeAddr, ok := usermem.Addr(newSize).RoundUp() + if !ok || newSizeAddr == 0 { + return 0, syserror.EINVAL + } + newSize = uint64(newSizeAddr) + + oldEnd, ok := oldAddr.AddLength(oldSize) + if !ok { + return 0, syserror.EINVAL + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + // All cases require that a vma exists at oldAddr. + vseg := mm.vmas.FindSegment(oldAddr) + if !vseg.Ok() { + return 0, syserror.EFAULT + } + + if opts.Move != MRemapMustMove { + // Handle noops and in-place shrinking. These cases don't care if + // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all + // (aside from oldAddr). + if newSize <= oldSize { + if newSize < oldSize { + // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't + // either. + newEnd := oldAddr + usermem.Addr(newSize) + mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd}) + } + return oldAddr, nil + } + + // Handle in-place growing. + + // Check that oldEnd maps to the same vma as oldAddr. + if vseg.End() < oldEnd { + return 0, syserror.EFAULT + } + // "Grow" the existing vma by creating a new mergeable one. 
+ vma := vseg.ValuePtr() + var newOffset uint64 + if vma.mappable != nil { + newOffset = vseg.mappableRange().End + } + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: newSize - oldSize, + MappingIdentity: vma.id, + Mappable: vma.mappable, + Offset: newOffset, + Addr: oldEnd, + Fixed: true, + Perms: vma.realPerms, + MaxPerms: vma.maxPerms, + Private: vma.private, + GrowsDown: vma.growsDown, + Hint: vma.hint, + }) + if err == nil { + return oldAddr, nil + } + // In-place growth failed. In the MRemapMayMove case, fall through to + // moving below. + if opts.Move == MRemapNoMove { + return 0, err + } + } + + // Handle moving, which is the only remaining case. + + // Find a destination for the move. + var newAR usermem.AddrRange + switch opts.Move { + case MRemapMayMove: + newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{}) + if err != nil { + return 0, err + } + newAR, _ = newAddr.ToRange(newSize) + + case MRemapMustMove: + newAddr := opts.NewAddr + if newAddr.RoundDown() != newAddr { + return 0, syserror.EINVAL + } + var ok bool + newAR, ok = newAddr.ToRange(newSize) + if !ok { + return 0, syserror.EINVAL + } + if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) { + return 0, syserror.EINVAL + } + + // Unmap any mappings at the destination. + mm.unmapLocked(ctx, newAR) + + // If the sizes specify shrinking, unmap everything between the new and + // old sizes at the source. + if newSize < oldSize { + oldNewEnd := oldAddr + usermem.Addr(newSize) + mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd}) + oldEnd = oldNewEnd + } + + // unmapLocked may have invalidated vseg; look it up again. + vseg = mm.vmas.FindSegment(oldAddr) + } + + oldAR := usermem.AddrRange{oldAddr, oldEnd} + + // In the MRemapMustMove case, these checks happen after unmapping: + // mm/mremap.c:mremap_to() => do_munmap(), vma_to_resize(). + + // Check that oldEnd maps to the same vma as oldAddr. 
+ if vseg.End() < oldEnd { + return 0, syserror.EFAULT + } + + // Check against RLIMIT_AS. + newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return 0, syserror.ENOMEM + } + + if vma := vseg.ValuePtr(); vma.mappable != nil { + // Check that offset+length does not overflow. + if vma.off+uint64(newAR.Length()) < vma.off { + return 0, syserror.EINVAL + } + // Inform the Mappable, if any, of the copied mapping. + if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start)); err != nil { + return 0, err + } + } + + // Remove the existing vma before inserting the new one to minimize + // iterator invalidation. We do this directly (instead of calling + // removeVMAsLocked) because: + // + // 1. We can't drop the reference on vma.id, which will be transferred to + // the new vma. + // + // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at + // oldAR, so calling RemoveMapping could cause us to miss an invalidation + // overlapping oldAR. + // + // Call vseg.Value() (rather than vseg.ValuePtr()) first to make a copy of + // the vma. + vseg = mm.vmas.Isolate(vseg, oldAR) + vma := vseg.Value() + mm.vmas.Remove(vseg) + + // Insert the new vma, transferring the reference on vma.id. + mm.vmas.Add(newAR, vma) + + // Move pmas. This is technically optional for non-private pmas, which + // could just go through memmap.Mappable.Translate again, but it's required + // for private pmas. + mm.activeMu.Lock() + mm.movePMAsLocked(oldAR, newAR) + mm.activeMu.Unlock() + + // Now that pmas have been moved to newAR, we can notify vma.mappable that + // oldAR is no longer mapped. + if vma.mappable != nil { + vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off) + } + + return newAR.Start, nil +} + +// MProtect implements the semantics of Linux's mprotect(2). 
+func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error { + if addr.RoundDown() != addr { + return syserror.EINVAL + } + if length == 0 { + return nil + } + rlength, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(rlength)) + if !ok { + return syserror.ENOMEM + } + effectivePerms := realPerms.Effective() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // Non-growsDown mprotect requires that all of ar is mapped, and stops at + // the first non-empty gap. growsDown mprotect requires that the first vma + // be growsDown, but does not require it to extend all the way to ar.Start; + // vmas after the first must be contiguous but need not be growsDown, like + // the non-growsDown case. + vseg := mm.vmas.LowerBoundSegment(ar.Start) + if !vseg.Ok() { + return syserror.ENOMEM + } + if growsDown { + if !vseg.ValuePtr().growsDown { + return syserror.EINVAL + } + if ar.End <= vseg.Start() { + return syserror.ENOMEM + } + ar.Start = vseg.Start() + } else { + if ar.Start < vseg.Start() { + return syserror.ENOMEM + } + } + + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + defer func() { + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + mm.pmas.MergeRange(ar) + mm.pmas.MergeAdjacent(ar) + }() + pseg := mm.pmas.LowerBoundSegment(ar.Start) + var didUnmapAS bool + for { + // Check for permission validity before splitting vmas, for consistency + // with Linux. + if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) { + return syserror.EACCES + } + vseg = mm.vmas.Isolate(vseg, ar) + + // Update vma permissions. + vma := vseg.ValuePtr() + vma.realPerms = realPerms + vma.effectivePerms = effectivePerms + + // Propagate vma permission changes to pmas. 
+ for pseg.Ok() && pseg.Start() < vseg.End() { + if pseg.Range().Overlaps(vseg.Range()) { + pseg = mm.pmas.Isolate(pseg, vseg.Range()) + if !effectivePerms.SupersetOf(pseg.ValuePtr().vmaEffectivePerms) && !didUnmapAS { + // Unmap all of ar, not just vseg.Range(), to minimize host + // syscalls. + mm.unmapASLocked(ar) + didUnmapAS = true + } + pseg.ValuePtr().vmaEffectivePerms = effectivePerms + } + pseg = pseg.NextSegment() + } + + // Continue to the next vma. + if ar.End <= vseg.End() { + return nil + } + vseg, _ = vseg.NextNonEmpty() + if !vseg.Ok() { + return syserror.ENOMEM + } + } +} + +// BrkSetup sets mm's brk address to addr and its brk size to 0. +func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // Unmap the existing brk. + if mm.brk.Length() != 0 { + mm.unmapLocked(ctx, mm.brk) + } + mm.brk = usermem.AddrRange{addr, addr} +} + +// Brk implements the semantics of Linux's brk(2), except that it returns an +// error on failure. +func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + if addr < mm.brk.Start { + return mm.brk.End, syserror.EINVAL + } + + // TODO: This enforces RLIMIT_DATA, but is slightly more + // permissive than the usual data limit. In particular, this only + // limits the size of the heap; a true RLIMIT_DATA limits the size of + // heap + data + bss. The segment sizes need to be plumbed from the + // loader package to fully enforce RLIMIT_DATA. 
+ if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + return mm.brk.End, syserror.ENOMEM + } + + oldbrkpg, _ := mm.brk.End.RoundUp() + newbrkpg, ok := addr.RoundUp() + if !ok { + return mm.brk.End, syserror.EFAULT + } + + switch { + case newbrkpg < oldbrkpg: + mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + + case oldbrkpg < newbrkpg: + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: uint64(newbrkpg - oldbrkpg), + Addr: oldbrkpg, + Fixed: true, + // Compare Linux's + // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS. + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + Private: true, + Hint: "[heap]", + }) + if err != nil { + return mm.brk.End, err + } + } + + mm.brk.End = addr + return addr, nil +} + +// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). +func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + + // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range() + // is analogous to our mm.invalidateLocked(ar, true, true). We inline this + // here, with the special case that we synchronously decommit + // uniquely-owned (non-copy-on-write) pages for private anonymous vma, + // which is the common case for MADV_DONTNEED. Invalidating these pmas, and + // allowing them to be reallocated when touched again, increases pma + // fragmentation, which may significantly reduce performance for + // non-vectored I/O implementations. Also, decommitting synchronously + // ensures that Decommit immediately reduces host memory usage. 
+ var didUnmapAS bool + pseg := mm.pmas.LowerBoundSegment(ar.Start) + vseg := mm.vmas.LowerBoundSegment(ar.Start) + mem := mm.p.Memory() + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(psegAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) + } + if psegAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) + } + } + if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. + } + } + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + + pseg = mm.pmas.Remove(pseg).NextSegment() + } + + // "If there are some parts of the specified address space that are not + // mapped, the Linux version of madvise() ignores them and applies the call + // to the rest (but returns ENOMEM from the system call, as it should)." - + // madvise(2) + if mm.vmas.SpanRange(ar) != ar.Length() { + return syserror.ENOMEM + } + return nil +} + +// Sync implements the semantics of Linux's msync(MS_SYNC). +func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) + if !ok { + return syserror.ENOMEM + } + + mm.mappingMu.RLock() + // Can't defer mm.mappingMu.RUnlock(); see below. 
+ vseg := mm.vmas.LowerBoundSegment(ar.Start) + if !vseg.Ok() { + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + var unmapped bool + lastEnd := ar.Start + for { + if !vseg.Ok() { + mm.mappingMu.RUnlock() + unmapped = true + break + } + if lastEnd < vseg.Start() { + unmapped = true + } + lastEnd = vseg.End() + vma := vseg.ValuePtr() + // It's only possible to have dirtied the Mappable through a shared + // mapping. Don't check if the mapping is writable, because mprotect + // may have changed this, and also because Linux doesn't. + if id := vma.id; id != nil && vma.mappable != nil && !vma.private { + // We can't call memmap.MappingIdentity.Msync while holding + // mm.mappingMu since it may take fs locks that precede it in the + // lock order. + id.IncRef() + mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) + mm.mappingMu.RUnlock() + err := id.Msync(ctx, mr) + id.DecRef() + if err != nil { + return err + } + if lastEnd >= ar.End { + break + } + mm.mappingMu.RLock() + vseg = mm.vmas.LowerBoundSegment(lastEnd) + } else { + if lastEnd >= ar.End { + mm.mappingMu.RUnlock() + break + } + vseg = vseg.NextSegment() + } + } + + if unmapped { + return syserror.ENOMEM + } + return nil +} + +// VirtualMemorySize returns the combined length in bytes of all mappings in +// mm. +func (mm *MemoryManager) VirtualMemorySize() uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return uint64(mm.usageAS) +} + +// VirtualMemorySizeRange returns the combined length in bytes of all mappings +// in ar in mm. +func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return uint64(mm.vmas.SpanRange(ar)) +} + +// ResidentSetSize returns the value advertised as mm's RSS in bytes. +func (mm *MemoryManager) ResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return uint64(mm.curRSS) +} + +// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes. 
+func (mm *MemoryManager) MaxResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return uint64(mm.maxRSS) +} diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go new file mode 100644 index 000000000..b6af48cb7 --- /dev/null +++ b/pkg/sentry/mm/vma.go @@ -0,0 +1,476 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Preconditions: mm.mappingMu must be locked for writing. opts must be valid +// as defined by the checks in MMap. +func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) { + if opts.MaxPerms != opts.MaxPerms.Effective() { + panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) + } + + // Find a usable range. + addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ + Addr: opts.Addr, + Fixed: opts.Fixed, + Unmap: opts.Unmap, + }) + if err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + ar, _ := addr.ToRange(opts.Length) + + // Check against RLIMIT_AS. 
+ newUsageAS := mm.usageAS + opts.Length + if opts.Unmap { + newUsageAS -= uint64(mm.vmas.SpanRange(ar)) + } + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM + } + + // Remove overwritten mappings. This ordering is consistent with Linux: + // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), + // file->f_op->mmap(). + var vgap vmaGapIterator + if opts.Unmap { + vgap = mm.unmapLocked(ctx, ar) + } else { + vgap = mm.vmas.FindGap(ar.Start) + } + + // Inform the Mappable, if any, of the new mapping. + if opts.Mappable != nil { + if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset); err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + } + + // Take a reference on opts.MappingIdentity before inserting the vma since + // vma merging can drop the reference. + if opts.MappingIdentity != nil { + opts.MappingIdentity.IncRef() + } + + // Finally insert the vma. + vseg := mm.vmas.Insert(vgap, ar, vma{ + mappable: opts.Mappable, + off: opts.Offset, + realPerms: opts.Perms, + effectivePerms: opts.Perms.Effective(), + maxPerms: opts.MaxPerms, + private: opts.Private, + growsDown: opts.GrowsDown, + id: opts.MappingIdentity, + hint: opts.Hint, + }) + mm.usageAS += opts.Length + + return vseg, ar, nil +} + +type findAvailableOpts struct { + // Addr is a suggested address. Addr must be page-aligned. + Addr usermem.Addr + + // Fixed is true if only the suggested address is acceptable. + Fixed bool + + // Unmap is true if existing vmas and guard pages may exist in the returned + // range. + Unmap bool +} + +// findAvailableLocked finds an allocatable range. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) { + // Does the provided suggestion work? 
+ if ar, ok := opts.Addr.ToRange(length); ok { + if mm.applicationAddrRange().IsSupersetOf(ar) { + if opts.Unmap { + return ar.Start, nil + } + // Check for the presence of an existing vma or guard page. + if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) { + return ar.Start, nil + } + } + } + + // Fixed mappings accept only the requested address. + if opts.Fixed { + return 0, syserror.ENOMEM + } + + // Prefer hugepage alignment if a hugepage or more is requested. + alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + if mm.layout.DefaultDirection == arch.MmapBottomUp { + return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) + } + return mm.findHighestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase}) +} + +func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange { + return usermem.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr} +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextGap() { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift up to match the alignment? + if offset := uint64(gr.Start) % alignment; offset != 0 { + if uint64(gr.Length()) >= length+alignment-offset { + // Yes, we're aligned. + return gr.Start + usermem.Addr(alignment-offset), nil + } + } + + // Either aligned perfectly, or can't align it. + return gr.Start, nil + } + } + return 0, syserror.ENOMEM +} + +// Preconditions: mm.mappingMu must be locked. 
+func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevGap() { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift down to match the alignment? + start := gr.End - usermem.Addr(length) + if offset := uint64(start) % alignment; offset != 0 { + if gr.Start <= start-usermem.Addr(offset) { + // Yes, we're aligned. + return start - usermem.Addr(offset), nil + } + } + + // Either aligned perfectly, or can't align it. + return start, nil + } + } + return 0, syserror.ENOMEM +} + +// getVMAsLocked ensures that vmas exist for all addresses in ar, and support +// access of type (at, ignorePermissions). It returns: +// +// - An iterator to the vma containing ar.Start. If no vma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last vma containing an address in ar. If +// vmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if vmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked for reading; it may be +// temporarily unlocked. ar.Length() != 0. +func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if + // !vbegin.Ok(). + vbegin, vgap := mm.vmas.Find(ar.Start) + if !vbegin.Ok() { + vbegin = vgap.NextSegment() + // vseg.Ok() is checked before entering the following loop. 
+ } else { + vgap = vbegin.PrevGap() + } + + addr := ar.Start + vseg := vbegin + for vseg.Ok() { + // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End(). + vma := vseg.ValuePtr() + if addr < vseg.Start() { + // TODO: Implement vma.growsDown here. + return vbegin, vgap, syserror.EFAULT + } + + perms := vma.effectivePerms + if ignorePermissions { + perms = vma.maxPerms + } + if !perms.SupersetOf(at) { + return vbegin, vgap, syserror.EPERM + } + + addr = vseg.End() + vgap = vseg.NextGap() + if addr >= ar.End { + return vbegin, vgap, nil + } + vseg = vgap.NextSegment() + } + + // Ran out of vmas before ar.End. + return vbegin, vgap, syserror.EFAULT +} + +// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and +// support access of type (at, ignorePermissions). It returns the subset of +// ars for which vmas exist. If this is not equal to ars, it returns a non-nil +// error explaining why. +// +// Preconditions: mm.mappingMu must be locked for reading; it may be +// temporarily unlocked. +// +// Postconditions: ars is not mutated. +func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil { + return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err + } + } + return ars, nil +} + +// vma extension will not shrink the number of unmapped bytes between the start +// of a growsDown vma and the end of its predecessor non-growsDown vma below +// guardBytes. +// +// guardBytes is equivalent to Linux's stack_guard_gap after upstream +// 1be7107fbe18 "mm: larger stack guard gap, between vmas". +const guardBytes = 256 * usermem.PageSize + +// unmapLocked unmaps all addresses in ar and returns the resulting gap in +// mm.vmas. 
+// +// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. +// ar must be page-aligned. +func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // AddressSpace mappings and pmas must be invalidated before + // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping(). + mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true}) + return mm.removeVMAsLocked(ctx, ar) +} + +// removeVMAsLocked removes vmas for addresses in ar and returns the resulting +// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients +// must do so before calling removeVMAsLocked. +// +// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. +func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + vseg, vgap := mm.vmas.Find(ar.Start) + if vgap.Ok() { + vseg = vgap.NextSegment() + } + for vseg.Ok() && vseg.Start() < ar.End { + vseg = mm.vmas.Isolate(vseg, ar) + vmaAR := vseg.Range() + vma := vseg.ValuePtr() + if vma.mappable != nil { + vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off) + } + if vma.id != nil { + vma.id.DecRef() + } + mm.usageAS -= uint64(vmaAR.Length()) + vgap = mm.vmas.Remove(vseg) + vseg = vgap.NextSegment() + } + return vgap +} + +// vmaSetFunctions implements segment.Functions for vmaSet. 
+type vmaSetFunctions struct{} + +func (vmaSetFunctions) MinKey() usermem.Addr { + return 0 +} + +func (vmaSetFunctions) MaxKey() usermem.Addr { + return ^usermem.Addr(0) +} + +func (vmaSetFunctions) ClearValue(vma *vma) { + vma.mappable = nil + vma.id = nil + vma.hint = "" +} + +func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRange, vma2 vma) (vma, bool) { + if vma1.mappable != vma2.mappable || + (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) || + vma1.realPerms != vma2.realPerms || + vma1.maxPerms != vma2.maxPerms || + vma1.private != vma2.private || + vma1.growsDown != vma2.growsDown || + vma1.id != vma2.id || + vma1.hint != vma2.hint { + return vma{}, false + } + + if vma2.id != nil { + vma2.id.DecRef() + } + return vma1, true +} + +func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (vma, vma) { + v2 := v + if v2.mappable != nil { + v2.off += uint64(split - ar.Start) + } + if v2.id != nil { + v2.id.IncRef() + } + return v, v2 +} + +// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr). +func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("Mappable offset is meaningless for anonymous vma") + } + if !vseg.Range().Contains(addr) { + panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return vma.off + uint64(addr-vstart) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +func (vseg vmaIterator) mappableRange() memmap.MappableRange { + return vseg.mappableRangeOf(vseg.Range()) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.Range().IsSupersetOf(ar). ar.Length() != 0. 
+func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0. +func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !mr.WellFormed() || mr.Length() <= 0 { + panic(fmt.Sprintf("invalid mr: %v", mr)) + } + if !vseg.mappableRange().IsSupersetOf(mr) { + panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return usermem.AddrRange{vstart + usermem.Addr(mr.Start-vma.off), vstart + usermem.Addr(mr.End-vma.off)} +} + +// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by +// scanning linearly forward from vseg. +// +// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start(). 
+func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if addr < vseg.Start() { + panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start())) + } + } + for vseg.Ok() && addr >= vseg.End() { + vseg = vseg.NextSegment() + } + return vseg +} + +// availableRange returns the subset of vgap.Range() in which new vmas may be +// created without MMapOpts.Unmap == true. +func (vgap vmaGapIterator) availableRange() usermem.AddrRange { + ar := vgap.Range() + next := vgap.NextSegment() + if !next.Ok() || !next.ValuePtr().growsDown { + return ar + } + // Exclude guard pages. + if ar.Length() < guardBytes { + return usermem.AddrRange{ar.Start, ar.Start} + } + ar.End -= guardBytes + return ar +} diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD new file mode 100644 index 000000000..d5be81f8d --- /dev/null +++ b/pkg/sentry/platform/BUILD @@ -0,0 +1,51 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "platform_state", + srcs = [ + "file_range.go", + ], + out = "platform_state.go", + package = "platform", +) + +go_template_instance( + name = "file_range", + out = "file_range.go", + package = "platform", + prefix = "File", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_library( + name = "platform", + srcs = [ + "context.go", + "file_range.go", + "mmap_min_addr.go", + "platform.go", + "platform_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/atomicbitops", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", 
+ "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go new file mode 100644 index 000000000..0d200a5e2 --- /dev/null +++ b/pkg/sentry/platform/context.go @@ -0,0 +1,36 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package platform + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the auth package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxPlatform is a Context.Value key for a Platform. + CtxPlatform contextID = iota +) + +// FromContext returns the Platform that is used to execute ctx's application +// code, or nil if no such Platform exists. 
+func FromContext(ctx context.Context) Platform { + if v := ctx.Value(CtxPlatform); v != nil { + return v.(Platform) + } + return nil +} diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD new file mode 100644 index 000000000..3c4d5b0b6 --- /dev/null +++ b/pkg/sentry/platform/filemem/BUILD @@ -0,0 +1,69 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "filemem_autogen_state", + srcs = [ + "filemem.go", + "filemem_state.go", + "usage_set.go", + ], + out = "filemem_autogen_state.go", + package = "filemem", +) + +go_template_instance( + name = "usage_set", + out = "usage_set.go", + consts = { + "minDegree": "10", + }, + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "filemem", + prefix = "usage", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "usageInfo", + "Functions": "usageSetFunctions", + }, +) + +go_library( + name = "filemem", + srcs = [ + "filemem.go", + "filemem_autogen_state.go", + "filemem_state.go", + "filemem_unsafe.go", + "usage_set.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/memutil", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) + +go_test( + name = "filemem_test", + size = "small", + srcs = ["filemem_test.go"], + embed = [":filemem"], + deps = ["//pkg/sentry/usermem"], +) diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go new file mode 100644 index 000000000..d79c3c7f1 --- /dev/null +++ 
b/pkg/sentry/platform/filemem/filemem.go @@ -0,0 +1,838 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filemem provides a reusable implementation of platform.Memory. +// +// It enables memory to be sourced from a memfd file. +// +// Lock order: +// +// filemem.FileMem.mu +// filemem.FileMem.mappingsMu +package filemem + +import ( + "fmt" + "math" + "os" + "sync" + "sync/atomic" + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// FileMem is a platform.Memory that allocates from a host file that it owns. +type FileMem struct { + // Filemem models the backing file as follows: + // + // Each page in the file can be committed or uncommitted. A page is + // committed if the host kernel is spending resources to store its contents + // and uncommitted otherwise. This definition includes pages that the host + // kernel has swapped; this is intentional, to ensure that accounting does + // not change even if host kernel swapping behavior changes, and that + // memory used by pseudo-swap mechanisms like zswap is still accounted. 
+	//
+	// The initial contents of uncommitted pages are implicitly zero bytes. A
+	// read or write to the contents of an uncommitted page causes it to be
+	// committed. This is the only event that can cause an uncommitted page to
+	// be committed.
+	//
+	// fallocate(FALLOC_FL_PUNCH_HOLE) (FileMem.Decommit) causes committed
+	// pages to be uncommitted. This is the only event that can cause a
+	// committed page to be uncommitted.
+	//
+	// Filemem's accounting is based on identifying the set of committed pages.
+	// Since filemem does not have direct access to the MMU, tracking reads and
+	// writes to uncommitted pages to detect commitment would introduce
+	// additional page faults, which would be prohibitively expensive. Instead,
+	// filemem queries the host kernel to determine which pages are committed.
+
+	// file is the backing memory file. The file pointer is immutable.
+	file *os.File
+
+	mu sync.Mutex
+
+	// usage maps each page in the file to metadata for that page. Pages for
+	// which no segment exists in usage are both unallocated (not in use) and
+	// uncommitted.
+	//
+	// Since usage stores usageInfo objects by value, clients should usually
+	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
+	// pointer to the usageInfo rather than a copy.
+	//
+	// usage must be kept maximally merged (that is, there should never be two
+	// adjacent segments with the same values). At least markReclaimed depends
+	// on this property.
+	//
+	// usage is protected by mu.
+	usage usageSet
+
+	// The UpdateUsage function scans all segments with knownCommitted set
+	// to false, sees which pages are committed and creates corresponding
+	// segments with knownCommitted set to true.
+	//
+	// In order to avoid unnecessary scans, usageExpected tracks the total
+	// file blocks expected. This is used to elide the scan when this
+	// matches the underlying file blocks.
+	//
+	// To track swapped pages, usageSwapped tracks the discrepancy between
+	// what is observed in core and what is reported by the file. When
+	// usageSwapped is non-zero, a sweep will be performed at least every
+	// second. The start of the last sweep is recorded in usageLast.
+	//
+	// All usage attributes are protected by mu.
+	usageExpected uint64
+	usageSwapped  uint64
+	usageLast     time.Time
+
+	// fileSize is the size of the backing memory file in bytes. fileSize is
+	// always a power-of-two multiple of chunkSize.
+	//
+	// fileSize is protected by mu.
+	fileSize int64
+
+	// destroyed is set by Destroy to instruct the reclaimer goroutine to
+	// release resources and exit. destroyed is protected by mu.
+	destroyed bool
+
+	// reclaimable is true if usage may contain reclaimable pages. reclaimable
+	// is protected by mu.
+	reclaimable bool
+
+	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
+	// transitions from false to true.
+	reclaimCond sync.Cond
+
+	// Filemem pages are mapped into the local address space on the granularity
+	// of large pieces called chunks. mappings is a []uintptr that stores, for
+	// each chunk, the start address of a mapping of that chunk in the current
+	// process' address space, or 0 if no such mapping exists. Once a chunk is
+	// mapped, it is never remapped or unmapped until the filemem is destroyed.
+	//
+	// Mutating the mappings slice or its contents requires both holding
+	// mappingsMu and using atomic memory operations. (The slice is mutated
+	// whenever the file is expanded. Per the above, the only permitted
+	// mutation of the slice's contents is the assignment of a mapping to a
+	// chunk that was previously unmapped.) Reading the slice or its contents
+	// only requires *either* holding mappingsMu or using atomic memory
+	// operations. This allows FileMem.AccessPhysical to avoid locking in the
+	// common case where chunk mappings already exist.
+ + mappingsMu sync.Mutex + mappings atomic.Value +} + +// usage tracks usage information. +type usageInfo struct { + // kind is the usage kind. + kind usage.MemoryKind + + // knownCommitted indicates whether this region is known to be + // committed. If this is false, then the region may or may not have + // been touched. If it is true however, then mincore (below) has + // indicated that the page is present at least once. + knownCommitted bool + + refs uint64 +} + +func (u *usageInfo) incRef() { + u.refs++ +} + +func (u *usageInfo) decRef() { + if u.refs == 0 { + panic("DecRef at 0 refs!") + } + u.refs-- +} + +const ( + chunkShift = 24 + chunkSize = 1 << chunkShift // 16 MB + chunkMask = chunkSize - 1 + + initialSize = chunkSize +) + +// newFromFile creates a FileMem backed by the given file. +func newFromFile(file *os.File) (*FileMem, error) { + if err := file.Truncate(initialSize); err != nil { + return nil, err + } + f := &FileMem{ + fileSize: initialSize, + file: file, + } + f.reclaimCond.L = &f.mu + f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + go f.runReclaim() // S/R-SAFE: f.mu + + // The Linux kernel contains an optional feature called "Integrity + // Measurement Architecture" (IMA). If IMA is enabled, it will checksum + // binaries the first time they are mapped PROT_EXEC. This is bad news for + // executable pages mapped from FileMem, which can grow to terabytes in + // (sparse) size. If IMA attempts to checksum a file that large, it will + // allocate all of the sparse pages and quickly exhaust all memory. + // + // Work around IMA by immediately creating a temporary PROT_EXEC mapping, + // while FileMem is still small. IMA will ignore any future mappings. + m, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + usermem.PageSize, + syscall.PROT_EXEC, + syscall.MAP_SHARED, + f.file.Fd(), + 0) + if errno != 0 { + // This isn't fatal to filemem (IMA may not even be in use). Log the + // error, but don't return it. 
+ log.Warningf("Failed to pre-map FileMem PROT_EXEC: %v", errno) + } else { + syscall.Syscall( + syscall.SYS_MUNMAP, + m, + usermem.PageSize, + 0) + } + + return f, nil +} + +// New creates a FileMem backed by a memfd file. +func New(name string) (*FileMem, error) { + fd, err := memutil.CreateMemFD(name, 0) + if err != nil { + return nil, err + } + return newFromFile(os.NewFile(uintptr(fd), name)) +} + +// Destroy implements platform.Memory.Destroy. +func (f *FileMem) Destroy() { + f.mu.Lock() + defer f.mu.Unlock() + f.destroyed = true + f.reclaimCond.Signal() +} + +// Allocate implements platform.Memory.Allocate. +func (f *FileMem) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { + if length == 0 || length%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid allocation length: %#x", length)) + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Align hugepage-and-larger allocations on hugepage boundaries to try + // to take advantage of hugetmpfs. + alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + start := findUnallocatedRange(&f.usage, length, alignment) + end := start + length + // File offsets are int64s. Since length must be strictly positive, end + // cannot legitimately be 0. + if end < start || int64(end) <= 0 { + return platform.FileRange{}, syserror.ENOMEM + } + + // Expand the file if needed. Double the file size on each expansion; + // uncommitted pages have effectively no cost. + fileSize := f.fileSize + for int64(end) > fileSize { + if fileSize >= 2*fileSize { + // fileSize overflow. 
+ return platform.FileRange{}, syserror.ENOMEM + } + fileSize *= 2 + } + if fileSize > f.fileSize { + if err := f.file.Truncate(fileSize); err != nil { + return platform.FileRange{}, err + } + f.fileSize = fileSize + f.mappingsMu.Lock() + oldMappings := f.mappings.Load().([]uintptr) + newMappings := make([]uintptr, fileSize>>chunkShift) + copy(newMappings, oldMappings) + f.mappings.Store(newMappings) + f.mappingsMu.Unlock() + } + + // Mark selected pages as in use. + fr := platform.FileRange{start, end} + if !f.usage.Add(fr, usageInfo{ + kind: kind, + refs: 1, + }) { + panic(fmt.Sprintf("allocating %v: failed to insert into f.usage:\n%v", fr, &f.usage)) + } + return fr, nil +} + +func findUnallocatedRange(usage *usageSet, length, alignment uint64) uint64 { + alignMask := alignment - 1 + var start uint64 + for seg := usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + r := seg.Range() + if start >= r.End { + // start was rounded up to an alignment boundary from the end + // of a previous segment. + continue + } + // This segment represents allocated or reclaimable pages; only the + // range from start to the segment's beginning is allocatable, and the + // next allocatable range begins after the segment. + if r.Start > start && r.Start-start >= length { + break + } + start = (r.End + alignMask) &^ alignMask + } + return start +} + +// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. +const ( + _FALLOC_FL_KEEP_SIZE = 1 + _FALLOC_FL_PUNCH_HOLE = 2 +) + +// Decommit implements platform.Memory.Decommit. +func (f *FileMem) Decommit(fr platform.FileRange) error { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + // "After a successful call, subsequent reads from this range will + // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with + // FALLOC_FL_KEEP_SIZE in mode ..." 
- fallocate(2) + err := syscall.Fallocate( + int(f.file.Fd()), + _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, + int64(fr.Start), + int64(fr.Length())) + if err != nil { + return err + } + f.markDecommitted(fr) + return nil +} + +func (f *FileMem) markDecommitted(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + // Since we're changing the knownCommitted attribute, we need to merge + // across the entire range to ensure that the usage tree is minimal. + gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { + val := seg.ValuePtr() + if val.knownCommitted { + // Drop the usageExpected appropriately. + amount := seg.Range().Length() + usage.MemoryAccounting.Dec(amount, val.kind) + f.usageExpected -= amount + val.knownCommitted = false + } + }) + if gap.Ok() { + panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) + } + f.usage.MergeRange(fr) +} + +// runReclaim implements the reclaimer goroutine, which continuously decommits +// reclaimable frames in order to reduce memory usage. +func (f *FileMem) runReclaim() { + for { + fr, ok := f.findReclaimable() + if !ok { + break + } + + if err := f.Decommit(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the frames manually. This won't reduce memory usage, but at + // least ensures that the frames will be zero when reallocated. + f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) + // Pretend the frames were decommitted even though they weren't, + // since the memory accounting implementation has no idea how to + // deal with this. + f.markDecommitted(fr) + } + f.markReclaimed(fr) + } + // We only get here if findReclaimable finds f.destroyed set and returns + // false. 
+ f.mu.Lock() + defer f.mu.Unlock() + if !f.destroyed { + panic("findReclaimable broke out of reclaim loop, but f.destroyed is no longer set") + } + f.file.Close() + // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd + // that has possibly been reassigned. + f.file = nil + mappings := f.mappings.Load().([]uintptr) + for i, m := range mappings { + if m != 0 { + _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) + if errno != 0 { + log.Warningf("Failed to unmap mapping %#x for filemem chunk %d: %v", m, i, errno) + } + } + } + // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) + f.mappings.Store([]uintptr{}) +} + +func (f *FileMem) findReclaimable() (platform.FileRange, bool) { + f.mu.Lock() + defer f.mu.Unlock() + for { + for { + if f.destroyed { + return platform.FileRange{}, false + } + if f.reclaimable { + break + } + f.reclaimCond.Wait() + } + // Allocate returns the first usable range in offset order and is + // currently a linear scan, so reclaiming from the beginning of the + // file minimizes the expected latency of Allocate. + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if seg.ValuePtr().refs == 0 { + return seg.Range(), true + } + } + f.reclaimable = false + } +} + +func (f *FileMem) markReclaimed(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + seg := f.usage.FindSegment(fr.Start) + // All of fr should be mapped to a single uncommitted reclaimable segment + // accounted to System. 
+ if !seg.Ok() { + panic(fmt.Sprintf("Reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) + } + if !seg.Range().IsSupersetOf(fr) { + panic(fmt.Sprintf("Reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) + } + if got, want := seg.Value(), (usageInfo{ + kind: usage.System, + knownCommitted: false, + refs: 0, + }); got != want { + panic(fmt.Sprintf("Reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) + } + // Deallocate reclaimed pages. Even though all of seg is reclaimable, the + // caller of markReclaimed may not have decommitted it, so we can only mark + // fr as reclaimed. + f.usage.Remove(f.usage.Isolate(seg, fr)) +} + +// MapInto implements platform.File.MapInto. +func (f *FileMem) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + return as.MapFile(addr, int(f.file.Fd()), fr, at, precommit) +} + +// MapInternal implements platform.File.MapInternal. +func (f *FileMem) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + if !fr.WellFormed() || fr.Length() == 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + if at.Execute { + return safemem.BlockSeq{}, syserror.EACCES + } + + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + if chunks == 1 { + // Avoid an unnecessary slice allocation. 
+ var seq safemem.BlockSeq + err := f.forEachMappingSlice(fr, func(bs []byte) { + seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingSlice(fr, func(bs []byte) { + blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) + }) + return safemem.BlockSeqFromSlice(blocks), err +} + +// IncRef implements platform.File.IncRef. +func (f *FileMem) IncRef(fr platform.FileRange) { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + f.mu.Lock() + defer f.mu.Unlock() + + gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { + seg.ValuePtr().incRef() + }) + if gap.Ok() { + panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) + } +} + +// DecRef implements platform.File.DecRef. +func (f *FileMem) DecRef(fr platform.FileRange) { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + var freed bool + + f.mu.Lock() + defer f.mu.Unlock() + + for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() { + seg = f.usage.Isolate(seg, fr) + val := seg.ValuePtr() + val.decRef() + if val.refs == 0 { + freed = true + // Reclassify memory as System, until it's freed by the reclaim + // goroutine. + if val.knownCommitted { + usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind) + } + val.kind = usage.System + } + } + f.usage.MergeAdjacent(fr) + + if freed { + f.reclaimable = true + f.reclaimCond.Signal() + } +} + +// Flush implements platform.Mappable.Flush. +func (f *FileMem) Flush(ctx context.Context) error { + return nil +} + +// forEachMappingSlice invokes fn on a sequence of byte slices that +// collectively map all bytes in fr. 
+func (f *FileMem) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { + mappings := f.mappings.Load().([]uintptr) + for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + chunk := int(chunkStart >> chunkShift) + m := atomic.LoadUintptr(&mappings[chunk]) + if m == 0 { + var err error + mappings, m, err = f.getChunkMapping(chunk) + if err != nil { + return err + } + } + startOff := uint64(0) + if chunkStart < fr.Start { + startOff = fr.Start - chunkStart + } + endOff := uint64(chunkSize) + if chunkStart+chunkSize > fr.End { + endOff = fr.End - chunkStart + } + fn(unsafeSlice(m, chunkSize)[startOff:endOff]) + } + return nil +} + +func (f *FileMem) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + // Another thread may have replaced f.mappings altogether due to file + // expansion. + mappings := f.mappings.Load().([]uintptr) + // Another thread may have already mapped the chunk. + if m := mappings[chunk]; m != 0 { + return mappings, m, nil + } + m, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + chunkSize, + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED, + f.file.Fd(), + uintptr(chunk<>chunkShift) + f.mappings.Store(newMappings) + if err := state.Load(r, &f.usage, nil); err != nil { + return err + } + + // Load committed pages. + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if !seg.Value().knownCommitted { + continue + } + // Verify header. + length, object, err := state.ReadHeader(r) + if err != nil { + return err + } + if object { + // Not expected. + return fmt.Errorf("unexpected object") + } + if expected := uint64(seg.Range().Length()); length != expected { + // Size mismatch. + return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length) + } + // Read data. 
+ var ioErr error + err = f.forEachMappingSlice(seg.Range(), func(s []byte) { + if ioErr != nil { + return + } + _, ioErr = io.ReadFull(r, s) + }) + if ioErr != nil { + return ioErr + } + if err != nil { + return err + } + + // Update accounting for restored pages. We need to do this here since + // these segments are marked as "known committed", and will be skipped + // over on accounting scans. + usage.MemoryAccounting.Inc(seg.End()-seg.Start(), seg.Value().kind) + } + + return nil +} diff --git a/pkg/sentry/platform/filemem/filemem_test.go b/pkg/sentry/platform/filemem/filemem_test.go new file mode 100644 index 000000000..46ffcf116 --- /dev/null +++ b/pkg/sentry/platform/filemem/filemem_test.go @@ -0,0 +1,122 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package filemem + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + page = usermem.PageSize + hugepage = usermem.HugePageSize +) + +func TestFindUnallocatedRange(t *testing.T) { + for _, test := range []struct { + desc string + usage *usageSegmentDataSlices + length uint64 + alignment uint64 + start uint64 + }{ + { + desc: "Initial allocation succeeds", + usage: &usageSegmentDataSlices{}, + length: page, + alignment: page, + start: 0, + }, + { + desc: "Allocation begins at start of file", + usage: &usageSegmentDataSlices{ + Start: []uint64{page}, + End: []uint64{2 * page}, + Values: []usageInfo{{refs: 1}}, + }, + length: page, + alignment: page, + start: 0, + }, + { + desc: "In-use frames are not allocatable", + usage: &usageSegmentDataSlices{ + Start: []uint64{0, page}, + End: []uint64{page, 2 * page}, + Values: []usageInfo{{refs: 1}, {refs: 2}}, + }, + length: page, + alignment: page, + start: 2 * page, + }, + { + desc: "Reclaimable frames are not allocatable", + usage: &usageSegmentDataSlices{ + Start: []uint64{0, page, 2 * page}, + End: []uint64{page, 2 * page, 3 * page}, + Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}}, + }, + length: page, + alignment: page, + start: 3 * page, + }, + { + desc: "Gaps between in-use frames are allocatable", + usage: &usageSegmentDataSlices{ + Start: []uint64{0, 2 * page}, + End: []uint64{page, 3 * page}, + Values: []usageInfo{{refs: 1}, {refs: 1}}, + }, + length: page, + alignment: page, + start: page, + }, + { + desc: "Inadequately-sized gaps are rejected", + usage: &usageSegmentDataSlices{ + Start: []uint64{0, 2 * page}, + End: []uint64{page, 3 * page}, + Values: []usageInfo{{refs: 1}, {refs: 1}}, + }, + length: 2 * page, + alignment: page, + start: 3 * page, + }, + { + desc: "Hugepage alignment is honored", + usage: &usageSegmentDataSlices{ + Start: []uint64{0, hugepage + page}, + // Hugepage-sized gap here that shouldn't be allocated from + // since it's incorrectly 
aligned. + End: []uint64{page, hugepage + 2*page}, + Values: []usageInfo{{refs: 1}, {refs: 1}}, + }, + length: hugepage, + alignment: hugepage, + start: 2 * hugepage, + }, + } { + t.Run(test.desc, func(t *testing.T) { + var usage usageSet + if err := usage.ImportSortedSlices(test.usage); err != nil { + t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err) + } + if got, want := findUnallocatedRange(&usage, test.length, test.alignment), test.start; got != want { + t.Errorf("findUnallocatedRange(%v, %d, %d): got %d, wanted %d", test.usage, test.length, test.alignment, got, want) + } + }) + } +} diff --git a/pkg/sentry/platform/filemem/filemem_unsafe.go b/pkg/sentry/platform/filemem/filemem_unsafe.go new file mode 100644 index 000000000..a23b9825a --- /dev/null +++ b/pkg/sentry/platform/filemem/filemem_unsafe.go @@ -0,0 +1,40 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package filemem + +import ( + "reflect" + "syscall" + "unsafe" +) + +func unsafeSlice(addr uintptr, length int) (slice []byte) { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + sh.Data = addr + sh.Len = length + sh.Cap = length + return +} + +func mincore(s []byte, buf []byte) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_MINCORE, + uintptr(unsafe.Pointer(&s[0])), + uintptr(len(s)), + uintptr(unsafe.Pointer(&buf[0]))); errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD new file mode 100644 index 000000000..33dde2a31 --- /dev/null +++ b/pkg/sentry/platform/interrupt/BUILD @@ -0,0 +1,19 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "interrupt", + srcs = [ + "interrupt.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt", + visibility = ["//pkg/sentry:internal"], +) + +go_test( + name = "interrupt_test", + size = "small", + srcs = ["interrupt_test.go"], + embed = [":interrupt"], +) diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go new file mode 100644 index 000000000..ca4f42087 --- /dev/null +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -0,0 +1,96 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package interrupt provides an interrupt helper. +package interrupt + +import ( + "fmt" + "sync" +) + +// Receiver receives interrupt notifications from a Forwarder. +type Receiver interface { + // NotifyInterrupt is called when the Receiver receives an interrupt. + NotifyInterrupt() +} + +// Forwarder is a helper for delivering delayed signal interruptions. +// +// This helps platform implementations with Interrupt semantics. +type Forwarder struct { + // mu protects the below. + mu sync.Mutex + + // dst is the function to be called when NotifyInterrupt() is called. If + // dst is nil, pending will be set instead, causing the next call to + // Enable() to return false. + dst Receiver + pending bool +} + +// Enable attempts to enable interrupt forwarding to r. If f has already +// received an interrupt, Enable does nothing and returns false. Otherwise, +// future calls to f.NotifyInterrupt() cause r.NotifyInterrupt() to be called, +// and Enable returns true. +// +// Usage: +// +// if !f.Enable(r) { +// // There was an interrupt. +// return +// } +// defer f.Disable() +// +// Preconditions: r must not be nil. f must not already be forwarding +// interrupts to a Receiver. +func (f *Forwarder) Enable(r Receiver) bool { + if r == nil { + panic("nil Receiver") + } + f.mu.Lock() + if f.dst != nil { + f.mu.Unlock() + panic(fmt.Sprintf("already forwarding interrupts to %+v", f.dst)) + } + if f.pending { + f.pending = false + f.mu.Unlock() + return false + } + f.dst = r + f.mu.Unlock() + return true +} + +// Disable stops interrupt forwarding. If interrupt forwarding is already +// disabled, Disable is a no-op. +func (f *Forwarder) Disable() { + f.mu.Lock() + f.dst = nil + f.mu.Unlock() +} + +// NotifyInterrupt implements Receiver.NotifyInterrupt. If interrupt forwarding +// is enabled, the configured Receiver will be notified. Otherwise the +// interrupt will be delivered to the next call to Enable. 
+func (f *Forwarder) NotifyInterrupt() { + f.mu.Lock() + if f.dst != nil { + f.dst.NotifyInterrupt() + } else { + f.pending = true + } + f.mu.Unlock() +} diff --git a/pkg/sentry/platform/interrupt/interrupt_test.go b/pkg/sentry/platform/interrupt/interrupt_test.go new file mode 100644 index 000000000..7c49eeea6 --- /dev/null +++ b/pkg/sentry/platform/interrupt/interrupt_test.go @@ -0,0 +1,99 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package interrupt + +import ( + "testing" +) + +type countingReceiver struct { + interrupts int +} + +// NotifyInterrupt implements Receiver.NotifyInterrupt. +func (r *countingReceiver) NotifyInterrupt() { + r.interrupts++ +} + +func TestSingleInterruptBeforeEnable(t *testing.T) { + var ( + f Forwarder + r countingReceiver + ) + f.NotifyInterrupt() + // The interrupt should cause the first Enable to fail. + if f.Enable(&r) { + f.Disable() + t.Fatalf("Enable: got true, wanted false") + } + // The failing Enable "acknowledges" the interrupt, allowing future Enables + // to succeed. + if !f.Enable(&r) { + t.Fatalf("Enable: got false, wanted true") + } + f.Disable() +} + +func TestMultipleInterruptsBeforeEnable(t *testing.T) { + var ( + f Forwarder + r countingReceiver + ) + f.NotifyInterrupt() + f.NotifyInterrupt() + // The interrupts should cause the first Enable to fail. 
+ if f.Enable(&r) { + f.Disable() + t.Fatalf("Enable: got true, wanted false") + } + // Interrupts are deduplicated while the Forwarder is disabled, so the + // failing Enable "acknowledges" all interrupts, allowing future Enables to + // succeed. + if !f.Enable(&r) { + t.Fatalf("Enable: got false, wanted true") + } + f.Disable() +} + +func TestSingleInterruptAfterEnable(t *testing.T) { + var ( + f Forwarder + r countingReceiver + ) + if !f.Enable(&r) { + t.Fatalf("Enable: got false, wanted true") + } + defer f.Disable() + f.NotifyInterrupt() + if r.interrupts != 1 { + t.Errorf("interrupts: got %d, wanted 1", r.interrupts) + } +} + +func TestMultipleInterruptsAfterEnable(t *testing.T) { + var ( + f Forwarder + r countingReceiver + ) + if !f.Enable(&r) { + t.Fatalf("Enable: got false, wanted true") + } + defer f.Disable() + f.NotifyInterrupt() + f.NotifyInterrupt() + if r.interrupts != 2 { + t.Errorf("interrupts: got %d, wanted 2", r.interrupts) + } +} diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD new file mode 100644 index 000000000..d902e344a --- /dev/null +++ b/pkg/sentry/platform/kvm/BUILD @@ -0,0 +1,90 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "host_map_set", + out = "host_map_set.go", + consts = { + "minDegree": "15", + }, + imports = { + "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + }, + package = "kvm", + prefix = "hostMap", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "uintptr", + "Functions": "hostMapSetFunctions", + }, +) + +go_library( + name = "kvm", + srcs = [ + "address_space.go", + "bluepill.go", + "bluepill_amd64.go", + "bluepill_amd64.s", + "bluepill_amd64_unsafe.go", + "bluepill_fault.go", + "bluepill_unsafe.go", + "context.go", + "host_map.go", + 
"host_map_set.go", + "kvm.go", + "kvm_amd64.go", + "kvm_amd64_unsafe.go", + "kvm_const.go", + "machine.go", + "machine_amd64.go", + "machine_amd64_unsafe.go", + "machine_unsafe.go", + "physical_map.go", + "virtual_map.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/cpuid", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/platform", + "//pkg/sentry/platform/filemem", + "//pkg/sentry/platform/interrupt", + "//pkg/sentry/platform/procid", + "//pkg/sentry/platform/ring0", + "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/time", + "//pkg/sentry/usermem", + "//pkg/tmutex", + ], +) + +go_test( + name = "kvm_test", + size = "small", + srcs = [ + "kvm_test.go", + "virtual_map_test.go", + ], + embed = [":kvm"], + tags = [ + "nogotsan", + "requires-kvm", + ], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm/testutil", + "//pkg/sentry/platform/ring0", + "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go new file mode 100644 index 000000000..791f038b0 --- /dev/null +++ b/pkg/sentry/platform/kvm/address_space.go @@ -0,0 +1,207 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kvm + +import ( + "reflect" + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// addressSpace is a wrapper for PageTables. +type addressSpace struct { + platform.NoAddressSpaceIO + + // filemem is the memory instance. + filemem *filemem.FileMem + + // machine is the underlying machine. + machine *machine + + // pageTables are for this particular address space. + pageTables *pagetables.PageTables + + // dirtySet is the set of dirty vCPUs. + // + // The key is the vCPU, the value is a shared uint32 pointer that + // indicates whether or not the context is clean. A zero here indicates + // that the context should be cleaned prior to re-entry. + dirtySet sync.Map + + // files contains files mapped in the host address space. + files hostMap +} + +// Invalidate interrupts all dirty contexts. +func (as *addressSpace) Invalidate() { + as.dirtySet.Range(func(key, value interface{}) bool { + c := key.(*vCPU) + v := value.(*uint32) + atomic.StoreUint32(v, 0) // Invalidation required. + c.Bounce() // Force a kernel transition. + return true // Keep iterating. + }) +} + +// Touch adds the given vCPU to the dirty list. +func (as *addressSpace) Touch(c *vCPU) *uint32 { + value, ok := as.dirtySet.Load(c) + if !ok { + value, _ = as.dirtySet.LoadOrStore(c, new(uint32)) + } + return value.(*uint32) +} + +func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { + for m.length > 0 { + physical, length, ok := TranslateToPhysical(m.addr) + if !ok { + panic("unable to translate segment") + } + if length > m.length { + length = m.length + } + + // Ensure that this map has physical mappings. 
If the page does + // not have physical mappings, the KVM module may inject + // spurious exceptions when emulation fails (i.e. it tries to + // emulate because the RIP is pointed at those pages). + as.machine.mapPhysical(physical, length) + + // Install the page table mappings. Note that the ordering is + // important; if the pagetable mappings were installed before + // ensuring the physical pages were available, then some other + // thread could theoretically access them. + prev := as.pageTables.Map(addr, length, true /* user */, at, physical) + inv = inv || prev + m.addr += length + m.length -= length + addr += usermem.Addr(length) + } + + return inv +} + +func (as *addressSpace) mapHostFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType) error { + // Create custom host mappings. + ms, err := as.files.CreateMappings(usermem.AddrRange{ + Start: addr, + End: addr + usermem.Addr(fr.End-fr.Start), + }, at, fd, fr.Start) + if err != nil { + return err + } + + inv := false + for _, m := range ms { + // The host mapped slices are guaranteed to be aligned. + inv = inv || as.mapHost(addr, m, at) + addr += usermem.Addr(m.length) + } + if inv { + as.Invalidate() + } + + return nil +} + +func (as *addressSpace) mapFilemem(addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + // TODO: Lock order at the platform level is not sufficiently + // well-defined to guarantee that the caller (FileMem.MapInto) is not + // holding any locks that FileMem.MapInternal may take. + + // Retrieve mappings for the underlying filemem. Note that the + // permissions here are largely irrelevant, since it corresponds to + // physical memory for the guest. We enforce the given access type + // below, in the guest page tables. + bs, err := as.filemem.MapInternal(fr, usermem.AccessType{ + Read: true, + Write: true, + }) + if err != nil { + return err + } + + // Save the original range for invalidation. 
+ orig := usermem.AddrRange{ + Start: addr, + End: addr + usermem.Addr(fr.End-fr.Start), + } + + inv := false + for !bs.IsEmpty() { + b := bs.Head() + bs = bs.Tail() + // Since fr was page-aligned, b should also be page-aligned. We do the + // lookup in our host page tables for this translation. + s := b.ToSlice() + if precommit { + for i := 0; i < len(s); i += usermem.PageSize { + _ = s[i] // Touch to commit. + } + } + inv = inv || as.mapHost(addr, hostMapEntry{ + addr: reflect.ValueOf(&s[0]).Pointer(), + length: uintptr(len(s)), + }, at) + addr += usermem.Addr(len(s)) + } + if inv { + as.Invalidate() + as.files.DeleteMapping(orig) + } + + return nil +} + +// MapFile implements platform.AddressSpace.MapFile. +func (as *addressSpace) MapFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + // Create an appropriate mapping. If this is filemem, we don't create + // custom mappings for each in-application mapping. For files however, + // we create distinct mappings for each address space. Unfortunately, + // there's not a better way to manage this here. The file underlying + // this fd can change at any time, so we can't actually index the file + // and share between address spaces. Oh well. It's all referring to the + // same physical pages, hopefully we don't run out of address space. + if fd != int(as.filemem.File().Fd()) { + // N.B. precommit is ignored for host files. + return as.mapHostFile(addr, fd, fr, at) + } + + return as.mapFilemem(addr, fr, at, precommit) +} + +// Unmap unmaps the given range by calling pagetables.PageTables.Unmap. +func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { + if prev := as.pageTables.Unmap(addr, uintptr(length)); prev { + as.Invalidate() + as.files.DeleteMapping(usermem.AddrRange{ + Start: addr, + End: addr + usermem.Addr(length), + }) + } +} + +// Release releases the page tables. 
+func (as *addressSpace) Release() error { + as.Unmap(0, ^uint64(0)) + as.pageTables.Release() + return nil +} diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go new file mode 100644 index 000000000..ecc33d7dd --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -0,0 +1,41 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "reflect" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" +) + +// bluepill enters guest mode. +func bluepill(*vCPU) + +// sighandler is the signal entry point. +func sighandler() + +// savedHandler is a pointer to the previous handler. +// +// This is called by bluepillHandler. +var savedHandler uintptr + +func init() { + // Install the handler. + if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err)) + } +} diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go new file mode 100644 index 000000000..a2baefb7d --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -0,0 +1,143 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" +) + +var ( + // bounceSignal is the signal used for bouncing KVM. + // + // We use SIGCHLD because it is not masked by the runtime, and + // it will be ignored properly by other parts of the kernel. + bounceSignal = syscall.SIGCHLD + + // bounceSignalMask has only bounceSignal set. + bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) + + // bounce is the interrupt vector used to return to the kernel. + bounce = uint32(ring0.VirtualizationException) +) + +// redpill on amd64 invokes a syscall with -1. +// +//go:nosplit +func redpill() { + syscall.RawSyscall(^uintptr(0), 0, 0, 0) +} + +// bluepillArchEnter is called during bluepillEnter. 
+// +//go:nosplit +func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { + c = vCPUPtr(uintptr(context.Rax)) + regs := c.CPU.Registers() + regs.R8 = context.R8 + regs.R9 = context.R9 + regs.R10 = context.R10 + regs.R11 = context.R11 + regs.R12 = context.R12 + regs.R13 = context.R13 + regs.R14 = context.R14 + regs.R15 = context.R15 + regs.Rdi = context.Rdi + regs.Rsi = context.Rsi + regs.Rbp = context.Rbp + regs.Rbx = context.Rbx + regs.Rdx = context.Rdx + regs.Rax = context.Rax + regs.Rcx = context.Rcx + regs.Rsp = context.Rsp + regs.Rip = context.Rip + regs.Eflags = context.Eflags + regs.Eflags &^= uint64(ring0.KernelFlagsClear) + regs.Eflags |= ring0.KernelFlagsSet + regs.Cs = uint64(ring0.Kcode) + regs.Ds = uint64(ring0.Udata) + regs.Es = uint64(ring0.Udata) + regs.Fs = uint64(ring0.Udata) + regs.Ss = uint64(ring0.Kdata) + + // ring0 uses GS exclusively, so we use GS_base to store the location + // of the floating point address. + // + // The address will be restored directly after running the VCPU, and + // will be saved again prior to halting. We rely on the fact that the + // SaveFloatingPoint/LoadFloatingPoint functions use the most + // efficient mechanism available (including compression) so the state + // size is guaranteed to be less than what's pointed to here. + regs.Gs_base = uint64(context.Fpstate) + return +} + +// bluepillSyscall handles kernel syscalls. +// +//go:nosplit +func bluepillSyscall() { + regs := ring0.Current().Registers() + if regs.Rax != ^uint64(0) { + regs.Rip -= 2 // Rewind. + } + ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) + ring0.Halt() + ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) +} + +// bluepillException handles kernel exceptions. 
+// +//go:nosplit +func bluepillException(vector ring0.Vector) { + regs := ring0.Current().Registers() + if vector == ring0.Vector(bounce) { + // These should not interrupt kernel execution; point the Rip + // to zero to ensure that we get a reasonable panic when we + // attempt to return. + regs.Rip = 0 + } + ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) + ring0.Halt() + ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) +} + +// bluepillArchExit is called by bluepillHandler on VM exit (KVM_EXIT_HLT), copying vCPU registers back into the signal context. +// +//go:nosplit +func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { + regs := c.CPU.Registers() + context.R8 = regs.R8 + context.R9 = regs.R9 + context.R10 = regs.R10 + context.R11 = regs.R11 + context.R12 = regs.R12 + context.R13 = regs.R13 + context.R14 = regs.R14 + context.R15 = regs.R15 + context.Rdi = regs.Rdi + context.Rsi = regs.Rsi + context.Rbp = regs.Rbp + context.Rbx = regs.Rbx + context.Rdx = regs.Rdx + context.Rax = regs.Rax + context.Rcx = regs.Rcx + context.Rsp = regs.Rsp + context.Rip = regs.Rip + context.Eflags = regs.Eflags +} diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s new file mode 100644 index 000000000..0881bd5f5 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -0,0 +1,87 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// VCPU_CPU is the location of the CPU in the vCPU struct. 
+// +// This is guaranteed to be zero. +#define VCPU_CPU 0x0 + +// CPU_SELF is the self reference in ring0's percpu. +// +// This is guaranteed to be zero. +#define CPU_SELF 0x0 + +// Context offsets. +// +// Only limited use of the context is done in the assembly stub below, most is +// done in the Go handlers. However, the RIP must be examined. +#define CONTEXT_RAX 0x90 +#define CONTEXT_RIP 0xa8 +#define CONTEXT_FP 0xe0 + +// CLI is the literal byte for the disable interrupts instruction. +// +// This is checked as the source of the fault. +#define CLI $0xfa + +// See bluepill.go. +TEXT ·bluepill(SB),NOSPLIT,$0 +begin: + MOVQ vcpu+0(FP), AX + LEAQ VCPU_CPU(AX), BX + BYTE CLI; +check_vcpu: + MOVQ CPU_SELF(GS), CX + CMPQ BX, CX + JE right_vCPU +wrong_vcpu: + CALL ·redpill(SB) + JMP begin +right_vCPU: + RET + +// sighandler: see bluepill.go for documentation. +// +// The arguments are the following: +// +// DI - The signal number. +// SI - Pointer to siginfo_t structure. +// DX - Pointer to ucontext structure. +// +TEXT ·sighandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel. + MOVQ $0x80, CX + CMPL CX, 0x8(SI) + JNE fallback + + // Check if RIP is disable interrupts. + MOVQ CONTEXT_RIP(DX), CX + CMPQ CX, $0x0 + JE fallback + CMPB 0(CX), CLI + JNE fallback + + // Call the bluepillHandler. + PUSHQ DX // First argument (context). + CALL ·bluepillHandler(SB) // Call the handler. + POPQ DX // Discard the argument. + RET + +fallback: + // Jump to the previous signal handler. + XORQ CX, CX + MOVQ ·savedHandler(SB), AX + JMP AX diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go new file mode 100644 index 000000000..61ca61dcb --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +// bluepillArchContext returns the arch-specific context. +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { + return &((*arch.UContext64)(context).MContext) +} diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go new file mode 100644 index 000000000..7c8c7bc37 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -0,0 +1,127 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // faultBlockSize is the size used for servicing memory faults. + // + // This should be large enough to avoid frequent faults and avoid using + // all available KVM slots (~512), but small enough that KVM does not + // complain about slot sizes (~4GB). See handleBluepillFault for how + // this block is used. 
+ faultBlockSize = 2 << 30 + + // faultBlockMask is the mask for the fault blocks. + // + // This must be typed to avoid overflow complaints (ugh). + faultBlockMask = ^uintptr(faultBlockSize - 1) +) + +// yield yields the CPU. +// +//go:nosplit +func yield() { + syscall.RawSyscall(syscall.SYS_SCHED_YIELD, 0, 0, 0) +} + +// calculateBluepillFault calculates the fault address range. +// +//go:nosplit +func calculateBluepillFault(m *machine, physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) { + alignedPhysical := physical &^ uintptr(usermem.PageSize-1) + for _, pr := range physicalRegions { + end := pr.physical + pr.length + if physical < pr.physical || physical >= end { + continue + } + + // Adjust the block to match our size. + physicalStart = alignedPhysical & faultBlockMask + if physicalStart < pr.physical { + // Bound the starting point to the start of the region. + physicalStart = pr.physical + } + virtualStart = pr.virtual + (physicalStart - pr.physical) + physicalEnd := physicalStart + faultBlockSize + if physicalEnd > end { + physicalEnd = end + } + length = physicalEnd - physicalStart + return virtualStart, physicalStart, length, true + } + + return 0, 0, 0, false +} + +// handleBluepillFault handles a physical fault. +// +// The corresponding virtual address is returned. This may throw on error. +// +//go:nosplit +func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) { + // Paging fault: we need to map the underlying physical pages for this + // fault. This all has to be done in this function because we're in a + // signal handler context. (We can't call any functions that might + // split the stack.) + virtualStart, physicalStart, length, ok := calculateBluepillFault(m, physical) + if !ok { + return 0, false + } + + // Set the KVM slot. + // + // First, we need to acquire the exclusive right to set a slot. See + // machine.nextSlot for information about the protocol. 
+ slot := atomic.SwapUint32(&m.nextSlot, ^uint32(0)) + for slot == ^uint32(0) { + yield() // Race with another call. + slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0)) + } + errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart) + if errno == 0 { + // Successfully added region; we can increment nextSlot and + // allow another set to proceed here. + atomic.StoreUint32(&m.nextSlot, slot+1) + return virtualStart + (physical - physicalStart), true + } + + // Release our slot (still available). + atomic.StoreUint32(&m.nextSlot, slot) + + switch errno { + case syscall.EEXIST: + // The region already exists. It's possible that we raced with + // another vCPU here. We just revert nextSlot and return true, + // because this must have been satisfied by some other vCPU. + return virtualStart + (physical - physicalStart), true + case syscall.EINVAL: + throw("set memory region failed; out of slots") + case syscall.ENOMEM: + throw("set memory region failed: out of memory") + case syscall.EFAULT: + throw("set memory region failed: invalid physical range") + default: + throw("set memory region failed: unknown reason") + } + + panic("unreachable") +} diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go new file mode 100644 index 000000000..85703ff18 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package kvm

import (
	"sync/atomic"
	"syscall"
	"unsafe"
)

// throw is linked to runtime.throw, which reports a fatal error; being a
// runtime primitive it is callable from //go:nosplit code on the signal stack.
//
//go:linkname throw runtime.throw
func throw(string)

// vCPUPtr returns a CPU for the given address.
//
//go:nosplit
func vCPUPtr(addr uintptr) *vCPU {
	return (*vCPU)(unsafe.Pointer(addr))
}

// bytePtr returns a bytePtr for the given address.
//
//go:nosplit
func bytePtr(addr uintptr) *byte {
	return (*byte)(unsafe.Pointer(addr))
}

// bluepillHandler is called from the signal stub.
//
// The world may be stopped while this is executing, and it executes on the
// signal stack. It should only execute raw system calls and functions that are
// explicitly marked go:nosplit.
//
// It drives the KVM_RUN loop for the vCPU attached to this thread, handling
// KVM exits inline until the guest halts (_KVM_EXIT_HLT), at which point it
// returns to the stub.
//
//go:nosplit
func bluepillHandler(context unsafe.Pointer) {
	// Sanitize the registers; interrupts must always be disabled.
	c := bluepillArchEnter(bluepillArchContext(context))

	// Increment the number of switches.
	atomic.AddUint32(&c.switches, 1)

	// Store vCPUGuest.
	//
	// This is fine even if we're not in guest mode yet. In this signal
	// handler, we'll already have all the relevant signals blocked, so an
	// interrupt is only deliverable when we actually execute the KVM_RUN.
	//
	// The state will be returned to vCPUReady by Phase2.
	if state := atomic.SwapUintptr(&c.state, vCPUGuest); state != vCPUReady {
		throw("vCPU not in ready state")
	}

	for {
		// Enter the guest; KVM_RUN returns EINTR if a host signal
		// arrived while the vCPU was (or was about to be) running.
		_, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0)
		if errno == syscall.EINTR {
			// First, we process whatever pending signal
			// interrupted KVM. Since we're in a signal handler
			// currently, all signals are masked and the signal
			// must have been delivered directly to this thread.
			sig, _, errno := syscall.RawSyscall6(
				syscall.SYS_RT_SIGTIMEDWAIT,
				uintptr(unsafe.Pointer(&bounceSignalMask)),
				0, // siginfo.
				0, // timeout.
				8, // sigset size.
				0, 0)
			if errno != 0 {
				throw("error waiting for pending signal")
			}
			if sig != uintptr(bounceSignal) {
				throw("unexpected signal")
			}

			// Check whether the current state of the vCPU is ready
			// for interrupt injection. Because we don't have a
			// PIC, we can't inject an interrupt while they are
			// masked. We need to request a window if it's not
			// ready.
			if c.runData.readyForInterruptInjection == 0 {
				c.runData.requestInterruptWindow = 1
				continue // Rerun vCPU.
			} else {
				// Force injection below; the vCPU is ready.
				c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
			}
		} else if errno != 0 {
			throw("run failed")
		}

		switch c.runData.exitReason {
		case _KVM_EXIT_EXCEPTION:
			throw("exception")
		case _KVM_EXIT_IO:
			throw("I/O")
		case _KVM_EXIT_INTERNAL_ERROR:
			throw("internal error")
		case _KVM_EXIT_HYPERCALL:
			throw("hypercall")
		case _KVM_EXIT_DEBUG:
			throw("debug")
		case _KVM_EXIT_HLT:
			// Guest halted: this is the normal exit path back to
			// host mode. Copy out registers.
			bluepillArchExit(c, bluepillArchContext(context))

			// Notify any waiters.
			switch state := atomic.SwapUintptr(&c.state, vCPUReady); state {
			case vCPUGuest:
			case vCPUWaiter:
				c.notify() // Safe from handler.
			default:
				throw("invalid state")
			}
			return
		case _KVM_EXIT_MMIO:
			// Increment the fault count.
			atomic.AddUint32(&c.faults, 1)

			// For MMIO, the physical address is the first data item.
			virtual, ok := handleBluepillFault(c.machine, uintptr(c.runData.data[0]))
			if !ok {
				throw("physical address not valid")
			}

			// We now need to fill in the data appropriately. KVM
			// expects us to provide the result of the given MMIO
			// operation in the runData struct. This is safe
			// because, if a fault occurs here, the same fault
			// would have occurred in guest mode. The kernel should
			// not create invalid page table mappings.
			data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1]))
			// Length is the low 32 bits of data[2]; the write flag
			// is carried in bits 32-39 of the same word.
			length := (uintptr)((uint32)(c.runData.data[2]))
			write := (uint8)((c.runData.data[2] >> 32 & 0xff)) != 0
			for i := uintptr(0); i < length; i++ {
				b := bytePtr(uintptr(virtual) + i)
				if write {
					// Write to the given address.
					*b = data[i]
				} else {
					// Read from the given address.
					data[i] = *b
				}
			}
		case _KVM_EXIT_IRQ_WINDOW_OPEN:
			// Interrupt: we must have requested an interrupt
			// window; set the interrupt line.
			if _, _, errno := syscall.RawSyscall(
				syscall.SYS_IOCTL,
				uintptr(c.fd),
				_KVM_INTERRUPT,
				uintptr(unsafe.Pointer(&bounce))); errno != 0 {
				throw("interrupt injection failed")
			}
			// Clear previous injection request.
			c.runData.requestInterruptWindow = 0
		case _KVM_EXIT_SHUTDOWN:
			throw("shutdown")
		case _KVM_EXIT_FAIL_ENTRY:
			throw("entry failed")
		default:
			throw("unknown failure")
		}
	}
}

diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
new file mode 100644
index 000000000..fd04a2c47
--- /dev/null
+++ b/pkg/sentry/platform/kvm/context.go
@@ -0,0 +1,81 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +package kvm + +import ( + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// context is an implementation of the platform context. +// +// This is a thin wrapper around the machine. +type context struct { + // machine is the parent machine, and is immutable. + machine *machine + + // interrupt is the interrupt context. + interrupt interrupt.Forwarder +} + +// Switch runs the provided context in the given address space. +func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { + // Extract data. + localAS := as.(*addressSpace) + regs := &ac.StateData().Regs + fp := (*byte)(ac.FloatingPointData()) + + // Grab a vCPU. + cpu, err := c.machine.Get() + if err != nil { + return nil, usermem.NoAccess, err + } + + // Enable interrupts (i.e. calls to vCPU.Notify). + if !c.interrupt.Enable(cpu) { + c.machine.Put(cpu) // Already preempted. + return nil, usermem.NoAccess, platform.ErrContextInterrupt + } + + // Mark the address space as dirty. + flags := ring0.Flags(0) + dirty := localAS.Touch(cpu) + if v := atomic.SwapUint32(dirty, 1); v == 0 { + flags |= ring0.FlagFlush + } + if ac.FullRestore() { + flags |= ring0.FlagFull + } + + // Take the blue pill. + si, at, err := cpu.SwitchToUser(regs, fp, localAS.pageTables, flags) + + // Release resources. + c.machine.Put(cpu) + + // All done. + c.interrupt.Disable() + return si, at, err +} + +// Interrupt interrupts the running context. 
func (c *context) Interrupt() {
	c.interrupt.NotifyInterrupt()
}

diff --git a/pkg/sentry/platform/kvm/host_map.go b/pkg/sentry/platform/kvm/host_map.go
new file mode 100644
index 000000000..357f8c92e
--- /dev/null
+++ b/pkg/sentry/platform/kvm/host_map.go
@@ -0,0 +1,168 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"fmt"
	"sync"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// hostMap tracks the set of host mmap regions backing guest address ranges.
type hostMap struct {
	// mu protects below.
	mu sync.RWMutex

	// set contains host mappings.
	set hostMapSet
}

// hostMapEntry is a single host mapping: a host virtual address and a length
// in bytes.
type hostMapEntry struct {
	addr   uintptr
	length uintptr
}

// forEachEntry invokes fn for each existing segment overlapping r, clipping
// each segment to r. The offset passed to fn is the segment's start relative
// to r.Start.
//
// NOTE(review): when a segment begins before r.Start, length is clipped but
// addr is not advanced by the clipped amount; this appears to assume segments
// never straddle r.Start — confirm against the insertion logic below.
func (hm *hostMap) forEachEntry(r usermem.AddrRange, fn func(offset uint64, m hostMapEntry)) {
	for seg := hm.set.FindSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
		length := uintptr(seg.Range().Length())
		segOffset := uint64(0) // Adjusted below.
		if seg.End() > r.End {
			length -= uintptr(seg.End() - r.End)
		}
		if seg.Start() < r.Start {
			length -= uintptr(r.Start - seg.Start())
		} else {
			segOffset = uint64(seg.Start() - r.Start)
		}
		fn(segOffset, hostMapEntry{
			addr:   seg.Value(),
			length: length,
		})
	}
}

// createMappings maps the file region [offset, offset+r.Length()) of fd over
// the address range r: existing host mappings are re-mapped in place
// (MAP_FIXED) with the new protection, and gaps are filled with fresh
// MAP_SHARED mappings recorded in the set. Callers must hold hm.mu.
func (hm *hostMap) createMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) {
	// Replace any existing mappings.
	hm.forEachEntry(r, func(segOffset uint64, m hostMapEntry) {
		_, _, errno := syscall.RawSyscall6(
			syscall.SYS_MMAP,
			m.addr,
			m.length,
			uintptr(at.Prot()),
			syscall.MAP_FIXED|syscall.MAP_SHARED,
			uintptr(fd),
			uintptr(offset+segOffset))
		if errno != 0 && err == nil {
			// Record only the first failure; remaining entries are
			// still visited.
			err = errno
		}
	})
	if err != nil {
		return nil, err
	}

	// Add in necessary new mappings.
	for gap := hm.set.FindGap(r.Start); gap.Ok() && gap.Start() < r.End; {
		length := uintptr(gap.Range().Length())
		gapOffset := uint64(0) // Adjusted below.
		if gap.End() > r.End {
			length -= uintptr(gap.End() - r.End)
		}
		if gap.Start() < r.Start {
			length -= uintptr(r.Start - gap.Start())
		} else {
			gapOffset = uint64(gap.Start() - r.Start)
		}

		// Map the host file memory; the kernel chooses the address.
		hostAddr, _, errno := syscall.RawSyscall6(
			syscall.SYS_MMAP,
			0,
			length,
			uintptr(at.Prot()),
			syscall.MAP_SHARED,
			uintptr(fd),
			uintptr(offset+gapOffset))
		if errno != 0 {
			return nil, errno
		}

		// Insert into the host set and move to the next gap.
		gap = hm.set.Insert(gap, gap.Range().Intersect(r), hostAddr).NextGap()
	}

	// Collect all slices.
	hm.forEachEntry(r, func(_ uint64, m hostMapEntry) {
		ms = append(ms, m)
	})

	return ms, nil
}

// CreateMappings creates a new set of host mapping entries.
func (hm *hostMap) CreateMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) {
	hm.mu.Lock()
	ms, err = hm.createMappings(r, at, fd, offset)
	hm.mu.Unlock()
	return
}

// deleteMapping unmaps r and drops it from the set. Callers must hold hm.mu.
func (hm *hostMap) deleteMapping(r usermem.AddrRange) {
	// Remove all the existing mappings.
	hm.forEachEntry(r, func(_ uint64, m hostMapEntry) {
		_, _, errno := syscall.RawSyscall(
			syscall.SYS_MUNMAP,
			m.addr,
			m.length,
			0)
		if errno != 0 {
			// Should never happen.
			panic(fmt.Sprintf("unmap error: %v", errno))
		}
	})

	// Knock the range out.
	hm.set.RemoveRange(r)
}

// DeleteMapping deletes the given range.
func (hm *hostMap) DeleteMapping(r usermem.AddrRange) {
	hm.mu.Lock()
	hm.deleteMapping(r)
	hm.mu.Unlock()
}

// hostMapSetFunctions is used in the implementation of mapSet.
type hostMapSetFunctions struct{}

func (hostMapSetFunctions) MinKey() usermem.Addr   { return 0 }
func (hostMapSetFunctions) MaxKey() usermem.Addr   { return ^usermem.Addr(0) }
func (hostMapSetFunctions) ClearValue(val *uintptr) { *val = 0 }

// Merge combines two segments when they are contiguous in both the guest key
// space and the host value space.
func (hostMapSetFunctions) Merge(r1 usermem.AddrRange, addr1 uintptr, r2 usermem.AddrRange, addr2 uintptr) (uintptr, bool) {
	if addr1+uintptr(r1.Length()) != addr2 {
		return 0, false
	}

	// Since the two regions are contiguous in both the key space and the
	// value space, we can just store a single segment with the first host
	// virtual address; the logic above operates based on the size of the
	// segments.
	return addr1, true
}

// Split divides a segment's host address at the given key-space split point.
func (hostMapSetFunctions) Split(r usermem.AddrRange, hostAddr uintptr, split usermem.Addr) (uintptr, uintptr) {
	return hostAddr, hostAddr + uintptr(split-r.Start)
}

diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
new file mode 100644
index 000000000..31928c9f0
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -0,0 +1,149 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kvm provides a kvm-based implementation of the platform interface.
+package kvm + +import ( + "fmt" + "runtime" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// KVM represents a lightweight VM context. +type KVM struct { + platform.NoCPUPreemptionDetection + + // filemem is our memory source. + *filemem.FileMem + + // machine is the backing VM. + machine *machine +} + +var ( + globalOnce sync.Once + globalErr error +) + +// New returns a new KVM-based implementation of the platform interface. +func New() (*KVM, error) { + // Allocate physical memory for the vCPUs. + fm, err := filemem.New("kvm-memory") + if err != nil { + return nil, err + } + + // Try opening KVM. + fd, err := syscall.Open("/dev/kvm", syscall.O_RDWR, 0) + if err != nil { + return nil, fmt.Errorf("opening /dev/kvm: %v", err) + } + defer syscall.Close(fd) + + // Ensure global initialization is done. + globalOnce.Do(func() { + physicalInit() + globalErr = updateSystemValues(fd) + ring0.Init(cpuid.HostFeatureSet()) + }) + if globalErr != nil { + return nil, err + } + + // Create a new VM fd. + vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_CREATE_VM, 0) + if errno != 0 { + return nil, fmt.Errorf("creating VM: %v", errno) + } + + // Create a VM context. + machine, err := newMachine(int(vm), runtime.NumCPU()) + if err != nil { + return nil, err + } + + // All set. + return &KVM{ + FileMem: fm, + machine: machine, + }, nil +} + +// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. +func (*KVM) SupportsAddressSpaceIO() bool { + return false +} + +// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. 
+func (*KVM) CooperativelySchedulesAddressSpace() bool { + return false +} + +// MapUnit implements platform.Platform.MapUnit. +func (*KVM) MapUnit() uint64 { + // We greedily creates PTEs in MapFile, so extremely large mappings can + // be expensive. Not _that_ expensive since we allow super pages, but + // even though can get out of hand if you're creating multi-terabyte + // mappings. For this reason, we limit mappings to an arbitrary 16MB. + return 16 << 20 +} + +// MinUserAddress returns the lowest available address. +func (*KVM) MinUserAddress() usermem.Addr { + return usermem.PageSize +} + +// MaxUserAddress returns the first address that may not be used. +func (*KVM) MaxUserAddress() usermem.Addr { + return usermem.Addr(ring0.MaximumUserAddress) +} + +// NewAddressSpace returns a new pagetable root. +func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { + // Allocate page tables and install system mappings. + pageTables := k.machine.kernel.PageTables.New() + applyPhysicalRegions(func(pr physicalRegion) bool { + // Map the kernel in the upper half. + kernelVirtual := usermem.Addr(ring0.KernelStartAddress | pr.virtual) + pageTables.Map(kernelVirtual, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + return true // Keep iterating. + }) + + // Return the new address space. + return &addressSpace{ + filemem: k.FileMem, + machine: k.machine, + pageTables: pageTables, + }, nil, nil +} + +// NewContext returns an interruptible context. +func (k *KVM) NewContext() platform.Context { + return &context{ + machine: k.machine, + } +} + +// Memory returns the platform memory used to do allocations. +func (k *KVM) Memory() platform.Memory { + return k.FileMem +} diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go new file mode 100644 index 000000000..3d56ed895 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -0,0 +1,213 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" +) + +// userMemoryRegion is a region of physical memory. +// +// This mirrors kvm_memory_region. +type userMemoryRegion struct { + slot uint32 + flags uint32 + guestPhysAddr uint64 + memorySize uint64 + userspaceAddr uint64 +} + +// userRegs represents KVM user registers. +// +// This mirrors kvm_regs. +type userRegs struct { + RAX uint64 + RBX uint64 + RCX uint64 + RDX uint64 + RSI uint64 + RDI uint64 + RSP uint64 + RBP uint64 + R8 uint64 + R9 uint64 + R10 uint64 + R11 uint64 + R12 uint64 + R13 uint64 + R14 uint64 + R15 uint64 + RIP uint64 + RFLAGS uint64 +} + +// systemRegs represents KVM system registers. +// +// This mirrors kvm_sregs. +type systemRegs struct { + CS segment + DS segment + ES segment + FS segment + GS segment + SS segment + TR segment + LDT segment + GDT descriptor + IDT descriptor + CR0 uint64 + CR2 uint64 + CR3 uint64 + CR4 uint64 + CR8 uint64 + EFER uint64 + apicBase uint64 + interruptBitmap [(_KVM_NR_INTERRUPTS + 63) / 64]uint64 +} + +// segment is the expanded form of a segment register. +// +// This mirrors kvm_segment. +type segment struct { + base uint64 + limit uint32 + selector uint16 + typ uint8 + present uint8 + DPL uint8 + DB uint8 + S uint8 + L uint8 + G uint8 + AVL uint8 + unusable uint8 + _ uint8 +} + +// Clear clears the segment and marks it unusable. 
+func (s *segment) Clear() { + *s = segment{unusable: 1} +} + +// selector is a segment selector. +type selector uint16 + +// tobool is a simple helper. +func tobool(x ring0.SegmentDescriptorFlags) uint8 { + if x != 0 { + return 1 + } + return 0 +} + +// Load loads the segment described by d into the segment s. +// +// The argument sel is recorded as the segment selector index. +func (s *segment) Load(d *ring0.SegmentDescriptor, sel ring0.Selector) { + flag := d.Flags() + if flag&ring0.SegmentDescriptorPresent == 0 { + s.Clear() + return + } + s.base = uint64(d.Base()) + s.limit = d.Limit() + s.typ = uint8((flag>>8)&0xF) | 1 + s.S = tobool(flag & ring0.SegmentDescriptorSystem) + s.DPL = uint8(d.DPL()) + s.present = tobool(flag & ring0.SegmentDescriptorPresent) + s.AVL = tobool(flag & ring0.SegmentDescriptorAVL) + s.L = tobool(flag & ring0.SegmentDescriptorLong) + s.DB = tobool(flag & ring0.SegmentDescriptorDB) + s.G = tobool(flag & ring0.SegmentDescriptorG) + if s.L != 0 { + s.limit = 0xffffffff + } + s.unusable = 0 + s.selector = uint16(sel) +} + +// descriptor describes a region of physical memory. +// +// It corresponds to the pseudo-descriptor used in the x86 LGDT and LIDT +// instructions, and mirrors kvm_dtable. +type descriptor struct { + base uint64 + limit uint16 + _ [3]uint16 +} + +// modelControlRegister is an MSR entry. +// +// This mirrors kvm_msr_entry. +type modelControlRegister struct { + index uint32 + _ uint32 + data uint64 +} + +// modelControlRegisers is a collection of MSRs. +// +// This mirrors kvm_msrs. +type modelControlRegisters struct { + nmsrs uint32 + _ uint32 + entries [16]modelControlRegister +} + +// runData is the run structure. This may be mapped for synchronous register +// access (although that doesn't appear to be supported by my kernel at least). +// +// This mirrors kvm_run. 
+type runData struct { + requestInterruptWindow uint8 + _ [7]uint8 + + exitReason uint32 + readyForInterruptInjection uint8 + ifFlag uint8 + _ [2]uint8 + + cr8 uint64 + apicBase uint64 + + // This is the union data for exits. Interpretation depends entirely on + // the exitReason above (see vCPU code for more information). + data [32]uint64 +} + +// cpuidEntry is a single CPUID entry. +// +// This mirrors kvm_cpuid_entry2. +type cpuidEntry struct { + function uint32 + index uint32 + flags uint32 + eax uint32 + ebx uint32 + ecx uint32 + edx uint32 + _ [3]uint32 +} + +// cpuidEntries is a collection of CPUID entries. +// +// This mirrors kvm_cpuid2. +type cpuidEntries struct { + nr uint32 + _ uint32 + entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry +} diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go new file mode 100644 index 000000000..389412d87 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -0,0 +1,93 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "fmt" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +var ( + runDataSize int + hasGuestPCID bool + hasGuestINVPCID bool + pagetablesOpts pagetables.Opts + cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES} +) + +func updateSystemValues(fd int) error { + // Extract the mmap size. 
+ sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0) + if errno != 0 { + return fmt.Errorf("getting VCPU mmap size: %v", errno) + } + + // Save the data. + runDataSize = int(sz) + + // Must do the dance to figure out the number of entries. + _, _, errno = syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(fd), + _KVM_GET_SUPPORTED_CPUID, + uintptr(unsafe.Pointer(&cpuidSupported))) + if errno != 0 && errno != syscall.ENOMEM { + // Some other error occurred. + return fmt.Errorf("getting supported CPUID: %v", errno) + } + + // The number should now be correct. + _, _, errno = syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(fd), + _KVM_GET_SUPPORTED_CPUID, + uintptr(unsafe.Pointer(&cpuidSupported))) + if errno != 0 { + // Didn't work with the right number. + return fmt.Errorf("getting supported CPUID (2nd attempt): %v", errno) + } + + // Calculate whether guestPCID is supported. + // + // FIXME: These should go through the much more pleasant + // cpuid package interfaces, once a way to accept raw kvm CPUID entries + // is plumbed (or some rough equivalent). + for i := 0; i < int(cpuidSupported.nr); i++ { + entry := cpuidSupported.entries[i] + if entry.function == 1 && entry.index == 0 && entry.ecx&(1<<17) != 0 { + hasGuestPCID = true // Found matching PCID in guest feature set. + } + if entry.function == 7 && entry.index == 0 && entry.ebx&(1<<10) != 0 { + hasGuestINVPCID = true // Found matching INVPCID in guest feature set. + } + } + + // A basic sanity check: ensure that we don't attempt to + // invpcid if guest PCIDs are not supported; it's not clear + // what the semantics of this would be (or why some CPU or + // hypervisor would export this particular combination). + hasGuestINVPCID = hasGuestPCID && hasGuestINVPCID + + // Set the pagetables to use PCID if it's available. + pagetablesOpts.EnablePCID = hasGuestPCID + + // Success. 
+ return nil +} diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go new file mode 100644 index 000000000..0ec6a4a00 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +// KVM ioctls. +// +// Only the ioctls we need in Go appear here; some additional ioctls are used +// within the assembly stubs (KVM_INTERRUPT, etc.). +const ( + _KVM_CREATE_VM = 0xae01 + _KVM_GET_VCPU_MMAP_SIZE = 0xae04 + _KVM_CREATE_VCPU = 0xae41 + _KVM_SET_TSS_ADDR = 0xae47 + _KVM_RUN = 0xae80 + _KVM_INTERRUPT = 0x4004ae86 + _KVM_SET_MSRS = 0x4008ae89 + _KVM_SET_USER_MEMORY_REGION = 0x4020ae46 + _KVM_SET_REGS = 0x4090ae82 + _KVM_SET_SREGS = 0x4138ae84 + _KVM_GET_SUPPORTED_CPUID = 0xc008ae05 + _KVM_SET_CPUID2 = 0x4008ae90 + _KVM_SET_SIGNAL_MASK = 0x4004ae8b +) + +// KVM exit reasons. +const ( + _KVM_EXIT_EXCEPTION = 0x1 + _KVM_EXIT_IO = 0x2 + _KVM_EXIT_HYPERCALL = 0x3 + _KVM_EXIT_DEBUG = 0x4 + _KVM_EXIT_HLT = 0x5 + _KVM_EXIT_MMIO = 0x6 + _KVM_EXIT_IRQ_WINDOW_OPEN = 0x7 + _KVM_EXIT_SHUTDOWN = 0x8 + _KVM_EXIT_FAIL_ENTRY = 0x9 + _KVM_EXIT_INTERNAL_ERROR = 0x11 +) + +// KVM limits. 
+const ( + _KVM_NR_VCPUS = 0x100 + _KVM_NR_INTERRUPTS = 0x100 + _KVM_NR_CPUID_ENTRIES = 0x100 +) diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go new file mode 100644 index 000000000..61cfdd8fd --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -0,0 +1,415 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "math/rand" + "reflect" + "syscall" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm/testutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +var dummyFPState = (*byte)(arch.NewFloatingPointData()) + +type testHarness interface { + Errorf(format string, args ...interface{}) + Fatalf(format string, args ...interface{}) +} + +func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) { + // Create the machine. + k, err := New() + if err != nil { + t.Fatalf("error creating KVM instance: %v", err) + } + defer k.machine.Destroy() + defer k.FileMem.Destroy() + + // Call additional setup. + if setup != nil { + setup(k) + } + + var c *vCPU // For recovery. 
+ defer func() { + redpill() + if c != nil { + k.machine.Put(c) + } + }() + for { + c, err = k.machine.Get() + if err != nil { + t.Fatalf("error getting vCPU: %v", err) + } + if !fn(c) { + break + } + + // We put the vCPU here and clear the value so that the + // deferred recovery will not re-put it above. + k.machine.Put(c) + c = nil + } +} + +func bluepillTest(t testHarness, fn func(*vCPU)) { + kvmTest(t, nil, func(c *vCPU) bool { + bluepill(c) + fn(c) + return false + }) +} + +func TestKernelSyscall(t *testing.T) { + bluepillTest(t, func(c *vCPU) { + redpill() // Leave guest mode. + if got := c.State(); got != vCPUReady { + t.Errorf("vCPU not in ready state: got %v", got) + } + }) +} + +func hostFault() { + defer func() { + recover() + }() + var foo *int + *foo = 0 +} + +func TestKernelFault(t *testing.T) { + hostFault() // Ensure recovery works. + bluepillTest(t, func(c *vCPU) { + hostFault() + if got := c.State(); got != vCPUReady { + t.Errorf("vCPU not in ready state: got %v", got) + } + }) +} + +func TestKernelFloatingPoint(t *testing.T) { + bluepillTest(t, func(c *vCPU) { + if !testutil.FloatingPointWorks() { + t.Errorf("floating point does not work, and it should!") + } + }) +} + +func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *syscall.PtraceRegs, *pagetables.PageTables) bool) { + // Initialize registers & page tables. + var ( + regs syscall.PtraceRegs + pt *pagetables.PageTables + ) + testutil.SetTestTarget(®s, target) + defer func() { + if pt != nil { + pt.Release() + } + }() + + kvmTest(t, func(k *KVM) { + // Create new page tables. + as, _, err := k.NewAddressSpace(nil /* invalidator */) + if err != nil { + t.Fatalf("can't create new address space: %v", err) + } + pt = as.(*addressSpace).pageTables + + if useHostMappings { + // Apply the physical mappings to these page tables. + // (This is normally dangerous, since they point to + // physical pages that may not exist. 
This shouldn't be + // done for regular user code, but is fine for test + // purposes.) + applyPhysicalRegions(func(pr physicalRegion) bool { + pt.Map(usermem.Addr(pr.virtual), pr.length, true /* user */, usermem.AnyAccess, pr.physical) + return true // Keep iterating. + }) + } + }, func(c *vCPU) bool { + // Invoke the function with the extra data. + return fn(c, ®s, pt) + }) +} + +func TestApplicationSyscall(t *testing.T) { + applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != nil { + t.Errorf("application syscall with full restore failed: %v", err) + } + return false + }) + applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + t.Errorf("application syscall with partial restore failed: %v", err) + } + return false + }) +} + +func TestApplicationFault(t *testing.T) { + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, nil) // Cause fault. + if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + t.Errorf("application fault with full restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) + } + return false + }) + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, nil) // Cause fault. 
+ if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + t.Errorf("application fault with partial restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) + } + return false + }) +} + +func TestRegistersSyscall(t *testing.T) { + applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTestRegs(regs) // Fill values for all registers. + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + t.Errorf("application register check with partial restore got unexpected error: %v", err) + } + if err := testutil.CheckTestRegs(regs, false); err != nil { + t.Errorf("application register check with partial restore failed: %v", err) + } + return false + }) +} + +func TestRegistersFault(t *testing.T) { + applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTestRegs(regs) // Fill values for all registers. 
+ if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { + t.Errorf("application register check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestRegs(regs, true); err != nil { + t.Errorf("application register check with full restore failed: %v", err) + } + return false + }) +} + +func TestSegments(t *testing.T) { + applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTestSegments(regs) + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != nil { + t.Errorf("application segment check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestSegments(regs); err != nil { + t.Errorf("application segment check with full restore failed: %v", err) + } + return false + }) +} + +func TestBounce(t *testing.T) { + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + go func() { + time.Sleep(time.Millisecond) + c.Bounce() + }() + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + return false + }) + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + go func() { + time.Sleep(time.Millisecond) + c.Bounce() + }() + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextInterrupt { + t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + return false + }) +} + +func TestBounceStress(t *testing.T) { + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + randomSleep := func() { + // O(hundreds of 
microseconds) is appropriate to ensure + // different overlaps and different schedules. + if n := rand.Intn(1000); n > 100 { + time.Sleep(time.Duration(n) * time.Microsecond) + } + } + for i := 0; i < 1000; i++ { + // Start an asynchronously executing goroutine that + // calls Bounce at pseudo-random point in time. + // This should wind up calling Bounce when the + // kernel is in various stages of the switch. + go func() { + randomSleep() + c.Bounce() + }() + randomSleep() + // Execute the switch. + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + // Simulate work. + c.Unlock() + randomSleep() + c.Lock() + } + return false + }) +} + +func TestInvalidate(t *testing.T) { + var data uintptr // Used below. + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, &data) // Read legitimate value. + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + t.Errorf("application partial restore: got %v, wanted nil", err) + } + // Unmap the page containing data & invalidate. + pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize) + c.Invalidate() // Ensure invalidation. + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) + } + return false + }) +} + +// IsFault returns true iff the given signal represents a fault. 
+func IsFault(err error, si *arch.SignalInfo) bool {
+	return err == platform.ErrContextSignal && si.Signo == int32(syscall.SIGSEGV)
+}
+
+func TestEmptyAddressSpace(t *testing.T) {
+	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+		if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); !IsFault(err, si) {
+			t.Errorf("first fault with partial restore failed got %v", err)
+			t.Logf("registers: %#v", &regs)
+		}
+		return false
+	})
+	applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+		if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); !IsFault(err, si) {
+			t.Errorf("first fault with full restore failed got %v", err)
+			t.Logf("registers: %#v", &regs)
+		}
+		return false
+	})
+}
+
+func TestWrongVCPU(t *testing.T) {
+	kvmTest(t, nil, func(c1 *vCPU) bool {
+		kvmTest(t, nil, func(c2 *vCPU) bool {
+			// Basic test, one then the other.
+			bluepill(c1)
+			bluepill(c2)
+			if c2.switches == 0 {
+				// Don't allow the test to proceed if this fails.
+				t.Fatalf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+			}
+
+			// Alternate vCPUs; we expect to need to trigger the
+			// wrong vCPU path on each switch.
+			for i := 0; i < 100; i++ {
+				bluepill(c1)
+				bluepill(c2)
+			}
+			if count := c1.switches; count < 90 {
+				t.Errorf("wrong vCPU#1 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+			}
+			if count := c2.switches; count < 90 {
+				t.Errorf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+			}
+			return false
+		})
+		return false
+	})
+	kvmTest(t, nil, func(c1 *vCPU) bool {
+		kvmTest(t, nil, func(c2 *vCPU) bool {
+			bluepill(c1)
+			bluepill(c2)
+			return false
+		})
+		return false
+	})
+}
+
+func BenchmarkApplicationSyscall(b *testing.B) {
+	var (
+		i int // Iteration includes machine.Get() / machine.Put().
+		a int // Count for ErrContextInterrupt.
+ ) + applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if err == platform.ErrContextInterrupt { + a++ + return true // Ignore. + } + b.Fatalf("benchmark failed: %v", err) + } + i++ + return i < b.N + }) + if a != 0 { + b.Logf("ErrContextInterrupt occurred %d times (in %d iterations).", a, a+i) + } +} + +func BenchmarkKernelSyscall(b *testing.B) { + // Note that the target passed here is irrelevant, we never execute SwitchToUser. + applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + // iteration does not include machine.Get() / machine.Put(). + for i := 0; i < b.N; i++ { + testutil.Getpid() + } + return false + }) +} + +func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { + // see BenchmarkApplicationSyscall. + var ( + i int + a int + ) + applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if err == platform.ErrContextInterrupt { + a++ + return true // Ignore. + } + b.Fatalf("benchmark failed: %v", err) + } + // This will intentionally cause the world switch. By executing + // a host syscall here, we force the transition between guest + // and host mode. + testutil.Getpid() + i++ + return i < b.N + }) + if a != 0 { + b.Logf("EAGAIN occurred %d times (in %d iterations).", a, a+i) + } +} diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go new file mode 100644 index 000000000..a5be0cee3 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine.go @@ -0,0 +1,412 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "runtime" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/tmutex" +) + +// machine contains state associated with the VM as a whole. +type machine struct { + // fd is the vm fd. + fd int + + // nextSlot is the next slot for setMemoryRegion. + // + // This must be accessed atomically. If nextSlot is ^uint32(0), then + // slots are currently being updated, and the caller should retry. + nextSlot uint32 + + // kernel is the set of global structures. + kernel *ring0.Kernel + + // mappingCache is used for mapPhysical. + mappingCache sync.Map + + // mu protects vCPUs. + mu sync.Mutex + + // vCPUs are the machine vCPUs. + // + // This is eventually keyed by system TID, but is initially indexed by + // the negative vCPU id. This is merely an optimization, so while + // collisions here are not possible, it wouldn't matter anyways. + vCPUs map[uint64]*vCPU +} + +const ( + // vCPUReady is the lock value for an available vCPU. + // + // Legal transitions: vCPUGuest (bluepill). + vCPUReady uintptr = iota + + // vCPUGuest indicates the vCPU is in guest mode. + // + // Legal transition: vCPUReady (bluepill), vCPUWaiter (wait). + vCPUGuest + + // vCPUWaiter indicates that the vCPU should be released. + // + // Legal transition: vCPUReady (bluepill). 
+ vCPUWaiter +) + +// vCPU is a single KVM vCPU. +type vCPU struct { + // CPU is the kernel CPU data. + // + // This must be the first element of this structure, it is referenced + // by the bluepill code (see bluepill_amd64.s). + ring0.CPU + + // fd is the vCPU fd. + fd int + + // tid is the last set tid. + tid uint64 + + // switches is a count of world switches (informational only). + switches uint32 + + // faults is a count of world faults (informational only). + faults uint32 + + // state is the vCPU state; all are described above. + state uintptr + + // runData for this vCPU. + runData *runData + + // machine associated with this vCPU. + machine *machine + + // mu applies across get/put; it does not protect the above. + mu tmutex.Mutex +} + +// newMachine returns a new VM context. +func newMachine(vm int, vCPUs int) (*machine, error) { + // Create the machine. + m := &machine{ + fd: vm, + vCPUs: make(map[uint64]*vCPU), + } + if vCPUs > _KVM_NR_VCPUS { + // Hard cap at KVM's limit. + vCPUs = _KVM_NR_VCPUS + } + if n := 2 * runtime.NumCPU(); vCPUs > n { + // Cap at twice the number of physical cores. Otherwise we're + // just wasting memory and thrashing. (There may be scheduling + // issues when you've got > n active threads.) + vCPUs = n + } + m.kernel = ring0.New(ring0.KernelOpts{ + PageTables: pagetables.New(m, pagetablesOpts), + }) + + // Initialize architecture state. + if err := m.initArchState(vCPUs); err != nil { + m.Destroy() + return nil, err + } + + // Create all the vCPUs. + for id := 0; id < vCPUs; id++ { + // Create the vCPU. + fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id)) + if errno != 0 { + m.Destroy() + return nil, fmt.Errorf("error creating VCPU: %v", errno) + } + c := &vCPU{ + fd: int(fd), + machine: m, + } + c.mu.Init() + c.CPU.Init(m.kernel) + c.CPU.KernelSyscall = bluepillSyscall + c.CPU.KernelException = bluepillException + m.vCPUs[uint64(-id)] = c // See above. 
+ + // Ensure the signal mask is correct. + if err := c.setSignalMask(); err != nil { + m.Destroy() + return nil, err + } + + // Initialize architecture state. + if err := c.initArchState(); err != nil { + m.Destroy() + return nil, err + } + + // Map the run data. + runData, err := mapRunData(int(fd)) + if err != nil { + m.Destroy() + return nil, err + } + c.runData = runData + } + + // Apply the physical mappings. Note that these mappings may point to + // guest physical addresses that are not actually available. These + // physical pages are mapped on demand, see kernel_unsafe.go. + applyPhysicalRegions(func(pr physicalRegion) bool { + // Map everything in the lower half. + m.kernel.PageTables.Map(usermem.Addr(pr.virtual), pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + // And keep everything in the upper half. + kernelAddr := usermem.Addr(ring0.KernelStartAddress | pr.virtual) + m.kernel.PageTables.Map(kernelAddr, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + return true // Keep iterating. + }) + + // Ensure that the currently mapped virtual regions are actually + // available in the VM. Note that this doesn't guarantee no future + // faults, however it should guarantee that everything is available to + // ensure successful vCPU entry. + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + return // skip region. + } + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { + physical, length, ok := TranslateToPhysical(virtual) + if !ok { + // This must be an invalid region that was + // knocked out by creation of the physical map. + return + } + if virtual+length > vr.virtual+vr.length { + // Cap the length to the end of the area. + length = vr.virtual + vr.length - virtual + } + + // Ensure the physical range is mapped. + m.mapPhysical(physical, length) + virtual += length + } + }) + + // Ensure the machine is cleaned up properly. 
+ runtime.SetFinalizer(m, (*machine).Destroy) + return m, nil +} + +// mapPhysical checks for the mapping of a physical range, and installs one if +// not available. This attempts to be efficient for calls in the hot path. +// +// This panics on error. +func (m *machine) mapPhysical(physical, length uintptr) { + for end := physical + length; physical < end; { + _, physicalStart, length, ok := calculateBluepillFault(m, physical) + if !ok { + // Should never happen. + panic("mapPhysical on unknown physical address") + } + + if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok { + // Not present in the cache; requires setting the slot. + if _, ok := handleBluepillFault(m, physical); !ok { + panic("handleBluepillFault failed") + } + } + + // Move to the next chunk. + physical = physicalStart + length + } +} + +// Destroy frees associated resources. +// +// Destroy should only be called once all active users of the machine are gone. +// The machine object should not be used after calling Destroy. +// +// Precondition: all vCPUs must be returned to the machine. +func (m *machine) Destroy() { + runtime.SetFinalizer(m, nil) + + // Destroy vCPUs. + for _, c := range m.vCPUs { + // Ensure the vCPU is not still running in guest mode. This is + // possible iff teardown has been done by other threads, and + // somehow a single thread has not executed any system calls. + c.wait() + + // Teardown the vCPU itself. + switch state := c.State(); state { + case vCPUReady: + // Note that the runData may not be mapped if an error + // occurs during the middle of initialization. + if c.runData != nil { + if err := unmapRunData(c.runData); err != nil { + panic(fmt.Sprintf("error unmapping rundata: %v", err)) + } + } + if err := syscall.Close(int(c.fd)); err != nil { + panic(fmt.Sprintf("error closing vCPU fd: %v", err)) + } + case vCPUGuest, vCPUWaiter: + // Should never happen; waited above. 
+ panic("vCPU disposed in guest state") + default: + // Should never happen; not a valid state. + panic(fmt.Sprintf("vCPU in invalid state: %v", state)) + } + } + + // Release host mappings. + if m.kernel.PageTables != nil { + m.kernel.PageTables.Release() + } + + // vCPUs are gone: teardown machine state. + if err := syscall.Close(m.fd); err != nil { + panic(fmt.Sprintf("error closing VM fd: %v", err)) + } +} + +// Get gets an available vCPU. +func (m *machine) Get() (*vCPU, error) { + runtime.LockOSThread() + tid := procid.Current() + m.mu.Lock() + + for { + // Check for an exact match. + if c := m.vCPUs[tid]; c != nil && c.mu.TryLock() { + m.mu.Unlock() + return c, nil + } + + // Scan for an available vCPU. + for origTID, c := range m.vCPUs { + if c.LockInState(vCPUReady) { + delete(m.vCPUs, origTID) + m.vCPUs[tid] = c + m.mu.Unlock() + + // We need to reload thread-local segments as + // we have origTID != tid and the vCPU state + // may be stale. + c.loadSegments() + atomic.StoreUint64(&c.tid, tid) + return c, nil + } + } + + // Everything is busy executing user code (locked). + // + // We hold the pool lock here, so we should be able to kick something + // out of kernel mode and have it bounce into host mode when it tries + // to grab the vCPU again. + for _, c := range m.vCPUs { + if c.State() != vCPUWaiter { + c.Bounce() + } + } + + // Give other threads an opportunity to run. + yield() + } +} + +// Put puts the current vCPU. +func (m *machine) Put(c *vCPU) { + c.Unlock() + runtime.UnlockOSThread() +} + +// State returns the current state. +func (c *vCPU) State() uintptr { + return atomic.LoadUintptr(&c.state) +} + +// Lock locks the vCPU. +func (c *vCPU) Lock() { + c.mu.Lock() +} + +// Invalidate invalidates caches. +func (c *vCPU) Invalidate() { +} + +// LockInState locks the vCPU if it is in the given state and TryLock succeeds. 
+func (c *vCPU) LockInState(state uintptr) bool { + if c.State() == state && c.mu.TryLock() { + if c.State() != state { + c.mu.Unlock() + return false + } + return true + } + return false +} + +// Unlock unlocks the given vCPU. +func (c *vCPU) Unlock() { + // Ensure we're out of guest mode, if necessary. + if c.State() == vCPUWaiter { + redpill() // Force guest mode exit. + } + c.mu.Unlock() +} + +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. +func (c *vCPU) NotifyInterrupt() { + c.Bounce() +} + +// pid is used below in bounce. +var pid = syscall.Getpid() + +// Bounce ensures that the vCPU bounces back to the kernel. +// +// In practice, this means returning EAGAIN from running user code. The vCPU +// will be unlocked and relock, and the kernel is guaranteed to check for +// interrupt notifications (e.g. injected via Notify) and invalidations. +func (c *vCPU) Bounce() { + for { + if c.mu.TryLock() { + // We know that the vCPU must be in the kernel already, + // because the lock was not acquired. We specifically + // don't want to call bounce in this case, because it's + // not necessary to knock the vCPU out of guest mode. + c.mu.Unlock() + return + } + + if state := c.State(); state == vCPUGuest || state == vCPUWaiter { + // We know that the vCPU was in guest mode, so a single signal + // interruption will guarantee that a transition takes place. + syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal) + return + } + + // Someone holds the lock, but the vCPU is not yet transitioned + // into guest mode. It's in the critical section; give it time. + yield() + } +} diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go new file mode 100644 index 000000000..dfa691e88 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -0,0 +1,168 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "fmt" + "reflect" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// initArchState initializes architecture-specific state. +func (m *machine) initArchState(vCPUs int) error { + // Set the legacy TSS address. This address is covered by the reserved + // range (up to 4GB). In fact, this is a main reason it exists. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_TSS_ADDR, + uintptr(reservedMemory-(3*usermem.PageSize))); errno != 0 { + return errno + } + return nil +} + +// initArchState initializes architecture-specific state. +func (c *vCPU) initArchState() error { + var ( + kernelSystemRegs systemRegs + kernelUserRegs userRegs + ) + + // Set base control registers. + kernelSystemRegs.CR0 = c.CR0() + kernelSystemRegs.CR4 = c.CR4() + kernelSystemRegs.EFER = c.EFER() + + // Set the IDT & GDT in the registers. 
+ kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT() + kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT() + kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode) + kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata) + kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata) + tssBase, tssLimit, tss := c.TSS() + kernelSystemRegs.TR.Load(tss, ring0.Tss) + kernelSystemRegs.TR.base = tssBase + kernelSystemRegs.TR.limit = uint32(tssLimit) + + // Point to kernel page tables. + kernelSystemRegs.CR3 = c.machine.kernel.PageTables.FlushCR3() + + // Set the CPUID; this is required before setting system registers, + // since KVM will reject several CR4 bits if the CPUID does not + // indicate the support is available. + if err := c.setCPUID(); err != nil { + return err + } + + // Set the entrypoint for the kernel. + kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) + kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) + kernelUserRegs.RFLAGS = ring0.KernelFlagsSet + + // Set the system registers. + if err := c.setSystemRegisters(&kernelSystemRegs); err != nil { + return err + } + + // Set the user registers. + if err := c.setUserRegisters(&kernelUserRegs); err != nil { + return err + } + + // Set the time offset to the host native time. + return c.setSystemTime() +} + +// SwitchToUser unpacks architectural-details. +func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags ring0.Flags) (*arch.SignalInfo, usermem.AccessType, error) { + // See below. + var vector ring0.Vector + + // Past this point, stack growth can cause system calls (and a break + // from guest mode). 
So we need to ensure that between the bluepill
+	// call here and the switch call immediately below, no additional
+	// allocations occur.
+	entersyscall()
+	bluepill(c)
+	vector = c.CPU.SwitchToUser(regs, fpState, pt, flags)
+	exitsyscall()
+
+	// Free and clear.
+	switch vector {
+	case ring0.Debug, ring0.Breakpoint:
+		info := &arch.SignalInfo{Signo: int32(syscall.SIGTRAP)}
+		return info, usermem.AccessType{}, platform.ErrContextSignal
+
+	case ring0.PageFault:
+		bluepill(c) // Probably no-op, but may not be.
+		faultAddr := ring0.ReadCR2()
+		code, user := c.ErrorCode()
+		if !user {
+			// The last fault serviced by this CPU was not a user
+			// fault, so we can't reliably trust the faultAddr or
+			// the code provided here. We need to re-execute.
+			return nil, usermem.NoAccess, platform.ErrContextInterrupt
+		}
+		info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)}
+		info.SetAddr(uint64(faultAddr))
+		accessType := usermem.AccessType{
+			Read:    code&(1<<1) == 0,
+			Write:   code&(1<<1) != 0,
+			Execute: code&(1<<4) != 0,
+		}
+		return info, accessType, platform.ErrContextSignal
+
+	case ring0.GeneralProtectionFault:
+		if !ring0.IsCanonical(regs.Rip) {
+			// If the RIP is non-canonical, it's a SEGV.
+			info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)}
+			return info, usermem.AccessType{}, platform.ErrContextSignal
+		}
+		// Otherwise, we deliver a SIGBUS.
+		info := &arch.SignalInfo{Signo: int32(syscall.SIGBUS)}
+		return info, usermem.AccessType{}, platform.ErrContextSignal
+
+	case ring0.InvalidOpcode:
+		info := &arch.SignalInfo{Signo: int32(syscall.SIGILL)}
+		return info, usermem.AccessType{}, platform.ErrContextSignal
+
+	case ring0.X87FloatingPointException:
+		info := &arch.SignalInfo{Signo: int32(syscall.SIGFPE)}
+		return info, usermem.AccessType{}, platform.ErrContextSignal
+
+	case ring0.Vector(bounce):
+		redpill() // Bail and reacquire.
+		return nil, usermem.NoAccess, platform.ErrContextInterrupt
+
+	case ring0.Syscall, ring0.SyscallInt80:
+		// System call executed.
+ return nil, usermem.NoAccess, nil + + default: + panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) + } +} diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go new file mode 100644 index 000000000..c2bcb3a47 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -0,0 +1,156 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "fmt" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// setMemoryRegion initializes a region. +// +// This may be called from bluepillHandler, and therefore returns an errno +// directly (instead of wrapping in an error) to avoid allocations. +// +//go:nosplit +func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno { + userRegion := userMemoryRegion{ + slot: uint32(slot), + flags: 0, + guestPhysAddr: uint64(physical), + memorySize: uint64(length), + userspaceAddr: uint64(virtual), + } + + // Set the region. + _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_USER_MEMORY_REGION, + uintptr(unsafe.Pointer(&userRegion))) + return errno +} + +// loadSegments copies the current segments. +// +// This may be called from within the signal context and throws on error. 
+// +//go:nosplit +func (c *vCPU) loadSegments() { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_ARCH_PRCTL, + linux.ARCH_GET_FS, + uintptr(unsafe.Pointer(&c.CPU.Registers().Fs_base)), + 0); errno != 0 { + throw("getting FS segment") + } + if _, _, errno := syscall.RawSyscall( + syscall.SYS_ARCH_PRCTL, + linux.ARCH_GET_GS, + uintptr(unsafe.Pointer(&c.CPU.Registers().Gs_base)), + 0); errno != 0 { + throw("getting GS segment") + } +} + +// setUserRegisters sets user registers in the vCPU. +func (c *vCPU) setUserRegisters(uregs *userRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) + } + return nil +} + +// setSystemRegisters sets system registers. +func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return fmt.Errorf("error setting system registers: %v", errno) + } + return nil +} + +// setCPUID sets the CPUID to be used by the guest. +func (c *vCPU) setCPUID() error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_CPUID2, + uintptr(unsafe.Pointer(&cpuidSupported))); errno != 0 { + return fmt.Errorf("error setting CPUID: %v", errno) + } + return nil +} + +// setSystemTime sets the TSC for the vCPU. +// +// FIXME: This introduces a slight TSC offset between host and +// guest, which may vary per vCPU. 
+func (c *vCPU) setSystemTime() error {
+	const _MSR_IA32_TSC = 0x00000010
+	registers := modelControlRegisters{
+		nmsrs: 1,
+	}
+	registers.entries[0] = modelControlRegister{
+		index: _MSR_IA32_TSC,
+		data:  uint64(time.Rdtsc()),
+	}
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_MSRS,
+		uintptr(unsafe.Pointer(&registers))); errno != 0 {
+		return fmt.Errorf("error setting system time: %v", errno)
+	}
+	return nil
+}
+
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+	// The layout of this structure implies that it will not necessarily be
+	// the same layout chosen by the Go compiler. It gets fudged here.
+	var data struct {
+		length uint32
+		mask1  uint32
+		mask2  uint32
+		_      uint32
+	}
+	data.length = 8 // Fixed sigset size.
+	data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+	data.mask2 = ^uint32(bounceSignalMask >> 32)
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SIGNAL_MASK,
+		uintptr(unsafe.Pointer(&data))); errno != 0 {
+		return fmt.Errorf("error setting signal mask: %v", errno)
+	}
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
new file mode 100644
index 000000000..da67e23f6
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -0,0 +1,112 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+	"fmt"
+	"sync/atomic"
+	"syscall"
+	"unsafe"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+//go:linkname entersyscall runtime.entersyscall
+func entersyscall()
+
+//go:linkname exitsyscall runtime.exitsyscall
+func exitsyscall()
+
+// TranslateToPhysical implements pagetables.Translater.TranslateToPhysical.
+func (m *machine) TranslateToPhysical(ptes *pagetables.PTEs) uintptr {
+	// The length doesn't matter because all these translations require
+	// only a single page, which is guaranteed to be satisfied.
+	physical, _, ok := TranslateToPhysical(uintptr(unsafe.Pointer(ptes)))
+	if !ok {
+		panic("unable to translate pagetables.Node to physical address")
+	}
+	return physical
+}
+
+// mapRunData maps the vCPU run data.
+func mapRunData(fd int) (*runData, error) {
+	r, _, errno := syscall.RawSyscall6(
+		syscall.SYS_MMAP,
+		0,
+		uintptr(runDataSize),
+		syscall.PROT_READ|syscall.PROT_WRITE,
+		syscall.MAP_SHARED,
+		uintptr(fd),
+		0)
+	if errno != 0 {
+		return nil, fmt.Errorf("error mapping runData: %v", errno)
+	}
+	return (*runData)(unsafe.Pointer(r)), nil
+}
+
+// unmapRunData unmaps the vCPU run data.
+func unmapRunData(r *runData) error {
+	if _, _, errno := syscall.RawSyscall(
+		syscall.SYS_MUNMAP,
+		uintptr(unsafe.Pointer(r)),
+		uintptr(runDataSize),
+		0); errno != 0 {
+		return fmt.Errorf("error unmapping runData: %v", errno)
+	}
+	return nil
+}
+
+// notify notifies that the vCPU has returned to host mode.
+//
+// This may be called by a signal handler and therefore throws on error.
+//
+//go:nosplit
+func (c *vCPU) notify() {
+	_, _, errno := syscall.RawSyscall6(
+		syscall.SYS_FUTEX,
+		uintptr(unsafe.Pointer(&c.state)),
+		linux.FUTEX_WAKE,
+		^uintptr(0), // Number of waiters.
+ 0, 0, 0) + if errno != 0 { + throw("futex wake error") + } +} + +// wait waits for the vCPU to return to host mode. +// +// This panics on error. +func (c *vCPU) wait() { + if !atomic.CompareAndSwapUintptr(&c.state, vCPUGuest, vCPUWaiter) { + return // Nothing to wait for. + } + for { + _, _, errno := syscall.Syscall6( + syscall.SYS_FUTEX, + uintptr(unsafe.Pointer(&c.state)), + linux.FUTEX_WAIT, + uintptr(vCPUWaiter), // Expected value. + 0, 0, 0) + if errno == syscall.EINTR { + continue + } else if errno == syscall.EAGAIN { + break + } else if errno != 0 { + panic("futex wait error") + } + break + } +} diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go new file mode 100644 index 000000000..5d55c9486 --- /dev/null +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -0,0 +1,221 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "sort" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // reservedMemory is a chunk of physical memory reserved starting at + // physical address zero. There are some special pages in this region, + // so we just call the whole thing off. + // + // Other architectures may define this to be zero. 
+	reservedMemory = 0x100000000
+)
+
+type region struct {
+	virtual uintptr
+	length  uintptr
+}
+
+type physicalRegion struct {
+	region
+	physical uintptr
+}
+
+// physicalRegions contains a list of available physical regions.
+//
+// The physical value used in physicalRegions is a number indicating the
+// physical offset, aligned appropriately and starting above reservedMemory.
+var physicalRegions []physicalRegion
+
+// fillAddressSpace fills the host address space with PROT_NONE mappings so
+// that the remaining (unmapped) portion of the host address space is exactly
+// as large as the physical address space.
+//
+// The excluded regions are returned.
+func fillAddressSpace() (excludedRegions []region) {
+	// We can cut vSize in half, because the kernel will be using the top
+	// half and we ignore it while constructing mappings. It's as if we've
+	// already excluded half the possible addresses.
+	vSize := uintptr(1) << ring0.VirtualAddressBits()
+	vSize = vSize >> 1
+
+	// We exclude reservedMemory below from our physical memory size, so it
+	// needs to be dropped here as well. Otherwise, we could end up with
+	// physical addresses that are beyond what is mapped.
+	pSize := uintptr(1) << ring0.PhysicalAddressBits()
+	pSize -= reservedMemory
+
+	// Sanity check.
+	if vSize < pSize {
+		panic(fmt.Sprintf("vSize (%x) < pSize (%x)", vSize, pSize))
+	}
+
+	// Add specifically excluded regions; see excludeVirtualRegion.
+	applyVirtualRegions(func(vr virtualRegion) {
+		if excludeVirtualRegion(vr) {
+			excludedRegions = append(excludedRegions, vr.region)
+			vSize -= vr.length
+			log.Infof("excluded: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length)
+		}
+	})
+
+	// Calculate the required space and fill it.
+	//
+	// Note carefully that we add faultBlockSize to required up front, and
+	// on each iteration of the loop below (i.e. each new physical region
+	// we define), we add faultBlockSize again. This is done because the
+	// computation of physical regions will ensure proper alignments with
+	// faultBlockSize, potentially causing up to faultBlockSize bytes in
+	// internal fragmentation for each physical region. So we need to
+	// account for this properly during allocation.
+	requiredAddr, ok := usermem.Addr(vSize - pSize + faultBlockSize).RoundUp()
+	if !ok {
+		panic(fmt.Sprintf(
+			"overflow for vSize (%x) - pSize (%x) + faultBlockSize (%x)",
+			vSize, pSize, faultBlockSize))
+	}
+	required := uintptr(requiredAddr)
+	current := required // Attempted mmap size.
+	for filled := uintptr(0); filled < required && current > 0; {
+		addr, _, errno := syscall.RawSyscall6(
+			syscall.SYS_MMAP,
+			0, // Suggested address.
+			current,
+			syscall.PROT_NONE,
+			syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE|syscall.MAP_NORESERVE,
+			0, 0)
+		if errno != 0 {
+			// Attempt half the size; overflow not possible.
+			currentAddr, _ := usermem.Addr(current >> 1).RoundUp()
+			current = uintptr(currentAddr)
+			continue
+		}
+		// We filled a block.
+		filled += current
+		excludedRegions = append(excludedRegions, region{
+			virtual: addr,
+			length:  current,
+		})
+		// See comment above.
+		if filled != required {
+			required += faultBlockSize
+		}
+	}
+	if current == 0 {
+		panic("filling address space failed")
+	}
+	sort.Slice(excludedRegions, func(i, j int) bool {
+		return excludedRegions[i].virtual < excludedRegions[j].virtual
+	})
+	for _, r := range excludedRegions {
+		log.Infof("region: virtual [%x,%x)", r.virtual, r.virtual+r.length)
+	}
+	return excludedRegions
+}
+
+// computePhysicalRegions computes physical regions.
+func computePhysicalRegions(excludedRegions []region) (physicalRegions []physicalRegion) { // NOTE(review): the named result shadows the package-level physicalRegions var; physicalInit assigns the result back.
+	physical := uintptr(reservedMemory) // Physical offsets are handed out starting above reservedMemory.
+	addValidRegion := func(virtual, length uintptr) {
+		if length == 0 {
+			return
+		}
+		if virtual == 0 {
+			// Skip the zero page; it is never a usable mapping target.
+			virtual += usermem.PageSize
+			length -= usermem.PageSize
+		}
+		if end := virtual + length; end > ring0.MaximumUserAddress {
+			// Clamp the region to the maximum user address.
+			length -= (end - ring0.MaximumUserAddress)
+		}
+		if length == 0 {
+			return
+		}
+		// Round physical up to the same alignment as the virtual
+		// address (with respect to faultBlockSize).
+		if offset := virtual &^ faultBlockMask; physical&^faultBlockMask != offset {
+			if newPhysical := (physical & faultBlockMask) + offset; newPhysical > physical {
+				physical = newPhysical // Round up by only a little bit.
+			} else {
+				physical = ((physical + faultBlockSize) & faultBlockMask) + offset
+			}
+		}
+		physicalRegions = append(physicalRegions, physicalRegion{
+			region: region{
+				virtual: virtual,
+				length:  length,
+			},
+			physical: physical,
+		})
+		physical += length
+	}
+	lastExcludedEnd := uintptr(0)
+	for _, r := range excludedRegions {
+		// Add the valid gap preceding each excluded region.
+		addValidRegion(lastExcludedEnd, r.virtual-lastExcludedEnd)
+		lastExcludedEnd = r.virtual + r.length
+	}
+	addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd)
+
+	// Dump all physical regions.
+	for _, r := range physicalRegions {
+		log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)",
+			r.virtual, r.virtual+r.length, r.physical, r.physical+r.length)
+	}
+	return physicalRegions
+}
+
+// physicalInit initializes physical address mappings.
+func physicalInit() {
+	physicalRegions = computePhysicalRegions(fillAddressSpace())
+}
+
+// applyPhysicalRegions applies the given function on physical regions.
+//
+// Iteration continues as long as true is returned. The return value is the
+// return from the last call to fn, or true if there are no entries.
+//
+// Precondition: physicalInit must have been called.
+func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool { + for _, pr := range physicalRegions { + if !fn(pr) { + return false + } + } + return true +} + +// TranslateToPhysical translates the given virtual address. +// +// Precondition: physicalInit must have been called. +func TranslateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) { + ok = !applyPhysicalRegions(func(pr physicalRegion) bool { + if pr.virtual <= virtual && virtual < pr.virtual+pr.length { + physical = pr.physical + (virtual - pr.virtual) + length = pr.length - (virtual - pr.virtual) + return false + } + return true + }) + return +} diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD new file mode 100644 index 000000000..8533a8d89 --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -0,0 +1,15 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "testutil", + testonly = 1, + srcs = [ + "testutil.go", + "testutil_amd64.go", + "testutil_amd64.s", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm/testutil", + visibility = ["//pkg/sentry/platform/kvm:__pkg__"], +) diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go new file mode 100644 index 000000000..8a614e25d --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil provides common assembly stubs for testing.
+package testutil
+
+import (
+	"fmt"
+	"strings"
+)
+
+// The following stubs are implemented in testutil_amd64.s.
+
+// Getpid executes a trivial system call.
+func Getpid()
+
+// Touch touches the value in the first register.
+func Touch()
+
+// SyscallLoop executes a syscall and loops.
+func SyscallLoop()
+
+// SpinLoop spins on the CPU.
+func SpinLoop()
+
+// HaltLoop immediately halts and loops.
+func HaltLoop()
+
+// TwiddleRegsFault twiddles registers then faults.
+func TwiddleRegsFault()
+
+// TwiddleRegsSyscall twiddles registers then executes a syscall.
+func TwiddleRegsSyscall()
+
+// TwiddleSegments reads segments into known registers.
+func TwiddleSegments()
+
+// FloatingPointWorks is a floating point test.
+//
+// It returns true or false.
+func FloatingPointWorks() bool
+
+// RegisterMismatchError is used for checking registers.
+type RegisterMismatchError []string
+
+// Error returns a human-readable error.
+func (r RegisterMismatchError) Error() string {
+	return strings.Join([]string(r), ";")
+}
+
+// addRegisterMismatch allows simple chaining of register mismatches.
+func addRegisterMismatch(err error, reg string, got, expected interface{}) error {
+	errStr := fmt.Sprintf("%s got %08x, expected %08x", reg, got, expected)
+	switch r := err.(type) {
+	case nil:
+		// Return a new register mismatch.
+		return RegisterMismatchError{errStr}
+	case RegisterMismatchError:
+		// Append the error.
+		r = append(r, errStr)
+		return r
+	default:
+		// Leave as is.
+		return err
+	}
+}
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
new file mode 100644
index 000000000..39286a0af
--- /dev/null
+++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
@@ -0,0 +1,135 @@
+// Copyright 2018 Google Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package testutil + +import ( + "reflect" + "syscall" +) + +// SetTestTarget sets the rip appropriately. +func SetTestTarget(regs *syscall.PtraceRegs, fn func()) { + regs.Rip = uint64(reflect.ValueOf(fn).Pointer()) +} + +// SetTouchTarget sets rax appropriately. +func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) { + if target != nil { + regs.Rax = uint64(reflect.ValueOf(target).Pointer()) + } else { + regs.Rax = 0 + } +} + +// RewindSyscall rewinds a syscall RIP. +func RewindSyscall(regs *syscall.PtraceRegs) { + regs.Rip -= 2 +} + +// SetTestRegs initializes registers to known values. +func SetTestRegs(regs *syscall.PtraceRegs) { + regs.R15 = 0x15 + regs.R14 = 0x14 + regs.R13 = 0x13 + regs.R12 = 0x12 + regs.Rbp = 0xb9 + regs.Rbx = 0xb4 + regs.R11 = 0x11 + regs.R10 = 0x10 + regs.R9 = 0x09 + regs.R8 = 0x08 + regs.Rax = 0x44 + regs.Rcx = 0xc4 + regs.Rdx = 0xd4 + regs.Rsi = 0x51 + regs.Rdi = 0xd1 + regs.Rsp = 0x59 +} + +// CheckTestRegs checks that registers were twiddled per TwiddleRegs. 
+func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) { + if need := ^uint64(0x15); regs.R15 != need { + err = addRegisterMismatch(err, "R15", regs.R15, need) + } + if need := ^uint64(0x14); regs.R14 != need { + err = addRegisterMismatch(err, "R14", regs.R14, need) + } + if need := ^uint64(0x13); regs.R13 != need { + err = addRegisterMismatch(err, "R13", regs.R13, need) + } + if need := ^uint64(0x12); regs.R12 != need { + err = addRegisterMismatch(err, "R12", regs.R12, need) + } + if need := ^uint64(0xb9); regs.Rbp != need { + err = addRegisterMismatch(err, "Rbp", regs.Rbp, need) + } + if need := ^uint64(0xb4); regs.Rbx != need { + err = addRegisterMismatch(err, "Rbx", regs.Rbx, need) + } + if need := ^uint64(0x10); regs.R10 != need { + err = addRegisterMismatch(err, "R10", regs.R10, need) + } + if need := ^uint64(0x09); regs.R9 != need { + err = addRegisterMismatch(err, "R9", regs.R9, need) + } + if need := ^uint64(0x08); regs.R8 != need { + err = addRegisterMismatch(err, "R8", regs.R8, need) + } + if need := ^uint64(0x44); regs.Rax != need { + err = addRegisterMismatch(err, "Rax", regs.Rax, need) + } + if need := ^uint64(0xd4); regs.Rdx != need { + err = addRegisterMismatch(err, "Rdx", regs.Rdx, need) + } + if need := ^uint64(0x51); regs.Rsi != need { + err = addRegisterMismatch(err, "Rsi", regs.Rsi, need) + } + if need := ^uint64(0xd1); regs.Rdi != need { + err = addRegisterMismatch(err, "Rdi", regs.Rdi, need) + } + if need := ^uint64(0x59); regs.Rsp != need { + err = addRegisterMismatch(err, "Rsp", regs.Rsp, need) + } + // Rcx & R11 are ignored if !full is set. + if need := ^uint64(0x11); full && regs.R11 != need { + err = addRegisterMismatch(err, "R11", regs.R11, need) + } + if need := ^uint64(0xc4); full && regs.Rcx != need { + err = addRegisterMismatch(err, "Rcx", regs.Rcx, need) + } + return +} + +var fsData uint64 = 0x55 +var gsData uint64 = 0x85 + +// SetTestSegments initializes segments to known values. 
+func SetTestSegments(regs *syscall.PtraceRegs) { + regs.Fs_base = uint64(reflect.ValueOf(&fsData).Pointer()) + regs.Gs_base = uint64(reflect.ValueOf(&gsData).Pointer()) +} + +// CheckTestSegments checks that registers were twiddled per TwiddleSegments. +func CheckTestSegments(regs *syscall.PtraceRegs) (err error) { + if regs.Rax != fsData { + err = addRegisterMismatch(err, "Rax", regs.Rax, fsData) + } + if regs.Rbx != gsData { + err = addRegisterMismatch(err, "Rbx", regs.Rcx, gsData) + } + return +} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s new file mode 100644 index 000000000..3b5ad8817 --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +// test_util_amd64.s provides AMD64 test functions. 
+ +#include "funcdata.h" +#include "textflag.h" + +TEXT ·Getpid(SB),NOSPLIT,$0 + NO_LOCAL_POINTERS + MOVQ $39, AX // getpid + SYSCALL + RET + +TEXT ·Touch(SB),NOSPLIT,$0 +start: + MOVQ 0(AX), BX // deref AX + MOVQ $39, AX // getpid + SYSCALL + JMP start + +TEXT ·HaltLoop(SB),NOSPLIT,$0 +start: + HLT + JMP start + +TEXT ·SyscallLoop(SB),NOSPLIT,$0 +start: + SYSCALL + JMP start + +TEXT ·SpinLoop(SB),NOSPLIT,$0 +start: + JMP start + +TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 + NO_LOCAL_POINTERS + MOVQ $1, AX + MOVQ AX, X0 + MOVQ $39, AX // getpid + SYSCALL + MOVQ X0, AX + CMPQ AX, $1 + SETEQ ret+0(FP) + RET + +#define TWIDDLE_REGS() \ + NOTQ R15; \ + NOTQ R14; \ + NOTQ R13; \ + NOTQ R12; \ + NOTQ BP; \ + NOTQ BX; \ + NOTQ R11; \ + NOTQ R10; \ + NOTQ R9; \ + NOTQ R8; \ + NOTQ AX; \ + NOTQ CX; \ + NOTQ DX; \ + NOTQ SI; \ + NOTQ DI; \ + NOTQ SP; + +TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 + TWIDDLE_REGS() + SYSCALL + RET // never reached + +TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 + TWIDDLE_REGS() + JMP AX // must fault + RET // never reached + +#define READ_FS() BYTE $0x64; BYTE $0x48; BYTE $0x8b; BYTE $0x00; +#define READ_GS() BYTE $0x65; BYTE $0x48; BYTE $0x8b; BYTE $0x00; + +TEXT ·TwiddleSegments(SB),NOSPLIT,$0 + MOVQ $0x0, AX + READ_GS() + MOVQ AX, BX + MOVQ $0x0, AX + READ_FS() + SYSCALL + RET // never reached diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go new file mode 100644 index 000000000..0d3fbe043 --- /dev/null +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -0,0 +1,113 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "bufio" + "fmt" + "io" + "os" + "regexp" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type virtualRegion struct { + region + accessType usermem.AccessType + shared bool + offset uintptr + filename string +} + +// mapsLine matches a single line from /proc/PID/maps. +var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)") + +// excludeRegion returns true if these regions should be excluded from the +// physical map. Virtual regions need to be excluded if get_user_pages will +// fail on those addresses, preventing KVM from satisfying EPT faults. +// +// This includes the VVAR page because the VVAR page may be mapped as I/O +// memory. And the VDSO page is knocked out because the VVAR page is not even +// recorded in /proc/self/maps on older kernels; knocking out the VDSO page +// prevents code in the VDSO from accessing the VVAR address. +// +// This is called by the physical map functions, not applyVirtualRegions. +func excludeVirtualRegion(r virtualRegion) bool { + return r.filename == "[vvar]" || r.filename == "[vdso]" +} + +// applyVirtualRegions parses the process maps file. +// +// Unlike mappedRegions, these are not consistent over time. +func applyVirtualRegions(fn func(vr virtualRegion)) error { + // Open /proc/self/maps. + f, err := os.Open("/proc/self/maps") + if err != nil { + return err + } + defer f.Close() + + // Parse all entries. 
+ r := bufio.NewReader(f) + for { + b, err := r.ReadBytes('\n') + if b != nil && len(b) > 0 { + m := mapsLine.FindSubmatch(b) + if m == nil { + // This should not happen: kernel bug? + return fmt.Errorf("badly formed line: %v", string(b)) + } + start, err := strconv.ParseUint(string(m[1]), 16, 64) + if err != nil { + return fmt.Errorf("bad start address: %v", string(b)) + } + end, err := strconv.ParseUint(string(m[2]), 16, 64) + if err != nil { + return fmt.Errorf("bad end address: %v", string(b)) + } + read := m[3][0] == 'r' + write := m[3][1] == 'w' + execute := m[3][2] == 'x' + shared := m[3][3] == 's' + offset, err := strconv.ParseUint(string(m[4]), 16, 64) + if err != nil { + return fmt.Errorf("bad offset: %v", string(b)) + } + fn(virtualRegion{ + region: region{ + virtual: uintptr(start), + length: uintptr(end - start), + }, + accessType: usermem.AccessType{ + Read: read, + Write: write, + Execute: execute, + }, + shared: shared, + offset: uintptr(offset), + filename: string(m[5]), + }) + } + if err != nil && err == io.EOF { + break + } else if err != nil { + return err + } + } + + return nil +} diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go new file mode 100644 index 000000000..31e5b0e61 --- /dev/null +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -0,0 +1,78 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kvm + +import ( + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type checker struct { + ok bool +} + +func (c *checker) Contains(addr uintptr) func(virtualRegion) { + c.ok = false // Reset for below calls. + return func(vr virtualRegion) { + if vr.virtual <= addr && addr < vr.virtual+vr.length { + c.ok = true + } + } +} + +func TestParseMaps(t *testing.T) { + c := new(checker) + + // Simple test. + if err := applyVirtualRegions(c.Contains(0)); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // MMap a new page. + addr, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, 0, usermem.PageSize, + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE, 0, 0) + if errno != 0 { + t.Fatalf("unexpected map error: %v", errno) + } + + // Re-parse maps. + if err := applyVirtualRegions(c.Contains(addr)); err != nil { + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) + t.Fatalf("unexpected error: %v", err) + } + + // Assert that it now does contain the region. + if !c.ok { + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) + t.Fatalf("updated map does not contain 0x%08x, expected true", addr) + } + + // Unmap the region. + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) + + // Re-parse maps. + if err := applyVirtualRegions(c.Contains(addr)); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Assert that it once again does _not_ contain the region. + if c.ok { + t.Fatalf("final map does contain 0x%08x, expected false", addr) + } +} diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go new file mode 100644 index 000000000..6398e5e01 --- /dev/null +++ b/pkg/sentry/platform/mmap_min_addr.go @@ -0,0 +1,60 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package platform
+
+import (
+	"fmt"
+	"io/ioutil"
+	"strconv"
+	"strings"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// systemMMapMinAddrSource is the source file.
+const systemMMapMinAddrSource = "/proc/sys/vm/mmap_min_addr"
+
+// systemMMapMinAddr is the system's minimum map address, read once at init.
+var systemMMapMinAddr uint64
+
+// SystemMMapMinAddr returns the minimum system address.
+func SystemMMapMinAddr() usermem.Addr {
+	return usermem.Addr(systemMMapMinAddr)
+}
+
+// MMapMinAddr is a size zero struct that implements MinUserAddress based on
+// the system minimum address. It is suitable for embedding in platforms that
+// rely on the system mmap, and thus require the system minimum.
+type MMapMinAddr struct {
+}
+
+// MinUserAddress implements platform.MinUserAddress.
+func (*MMapMinAddr) MinUserAddress() usermem.Addr {
+	return SystemMMapMinAddr()
+}
+
+func init() {
+	// Open the source file.
+	b, err := ioutil.ReadFile(systemMMapMinAddrSource)
+	if err != nil {
+		panic(fmt.Sprintf("couldn't open %s: %v", systemMMapMinAddrSource, err))
+	}
+
+	// Parse the result.
+	systemMMapMinAddr, err = strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
+	if err != nil {
+		panic(fmt.Sprintf("couldn't parse %s from %s: %v", string(b), systemMMapMinAddrSource, err))
+	}
+}
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
new file mode 100644
index 000000000..6219dada7
--- /dev/null
+++ b/pkg/sentry/platform/platform.go
@@ -0,0 +1,428 @@
+// Copyright 2018 Google Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package platform provides a Platform abstraction. +// +// See Platform for more information. +package platform + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Platform provides abstractions for execution contexts (Context) and memory +// management (Memory, AddressSpace). +type Platform interface { + // SupportsAddressSpaceIO returns true if AddressSpaces returned by this + // Platform support AddressSpaceIO methods. + // + // The value returned by SupportsAddressSpaceIO is guaranteed to remain + // unchanged over the lifetime of the Platform. + SupportsAddressSpaceIO() bool + + // CooperativelySchedulesAddressSpace returns true if the Platform has a + // limited number of AddressSpaces, such that mm.MemoryManager.Deactivate + // should call AddressSpace.Release when there are no goroutines that + // require the mm.MemoryManager to have an active AddressSpace. + // + // The value returned by CooperativelySchedulesAddressSpace is guaranteed + // to remain unchanged over the lifetime of the Platform. 
+ CooperativelySchedulesAddressSpace() bool + + // DetectsCPUPreemption returns true if Contexts returned by the Platform + // can reliably return ErrContextCPUPreempted. + DetectsCPUPreemption() bool + + // MapUnit returns the alignment used for optional mappings into this + // platform's AddressSpaces. Higher values indicate lower per-page + // costs for AddressSpace.MapInto. As a special case, a MapUnit of 0 + // indicates that the cost of AddressSpace.MapInto is effectively + // independent of the number of pages mapped. If MapUnit is non-zero, + // it must be a power-of-2 multiple of usermem.PageSize. + MapUnit() uint64 + + // MinUserAddress returns the minimum mappable address on this + // platform. + MinUserAddress() usermem.Addr + + // MaxUserAddress returns the maximum mappable address on this + // platform. + MaxUserAddress() usermem.Addr + + // NewAddressSpace returns a new memory context for this platform. + // + // If mappingsID is not nil, the platform may assume that (1) all calls + // to NewAddressSpace with the same mappingsID represent the same + // (mutable) set of mappings, and (2) the set of mappings has not + // changed since the last time AddressSpace.Release was called on an + // AddressSpace returned by a call to NewAddressSpace with the same + // mappingsID. + // + // If a new AddressSpace cannot be created immediately, a nil + // AddressSpace is returned, along with channel that is closed when + // the caller should retry a call to NewAddressSpace. + // + // In general, this blocking behavior only occurs when + // CooperativelySchedulesAddressSpace (above) returns false. + NewAddressSpace(mappingsID interface{}) (AddressSpace, <-chan struct{}, error) + + // NewContext returns a new execution context. + NewContext() Context + + // Memory returns memory for allocations. 
+	Memory() Memory
+
+	// PreemptAllCPUs causes all concurrent calls to Context.Switch(), as well
+	// as the first following call to Context.Switch() for each Context, to
+	// return ErrContextCPUPreempted.
+	//
+	// PreemptAllCPUs is only supported if DetectsCPUPreemption() == true.
+	// Platforms for which this does not hold may panic if PreemptAllCPUs is
+	// called.
+	PreemptAllCPUs() error
+}
+
+// NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and
+// dependent methods for Platforms that do not support this feature.
+type NoCPUPreemptionDetection struct{}
+
+// DetectsCPUPreemption implements Platform.DetectsCPUPreemption.
+func (NoCPUPreemptionDetection) DetectsCPUPreemption() bool {
+	return false
+}
+
+// PreemptAllCPUs implements Platform.PreemptAllCPUs.
+func (NoCPUPreemptionDetection) PreemptAllCPUs() error {
+	panic("This platform does not support CPU preemption detection")
+}
+
+// Context represents the execution context for a single thread.
+type Context interface {
+	// Switch resumes execution of the thread specified by the arch.Context
+	// in the provided address space. This call will block while the thread
+	// is executing.
+	//
+	// If cpu is non-negative, and it is not the number of the CPU that the
+	// thread executes on, Context should return ErrContextCPUPreempted. cpu
+	// can only be non-negative if Platform.DetectsCPUPreemption() is true;
+	// Contexts from Platforms for which this does not hold may ignore cpu, or
+	// panic if cpu is non-negative.
+	//
+	// Switch may return one of the following special errors:
+	//
+	// - nil: The Context invoked a system call.
+	//
+	// - ErrContextSignal: The Context was interrupted by a signal. The
+	// returned *arch.SignalInfo contains information about the signal. If
+	// arch.SignalInfo.Signo == SIGSEGV, the returned usermem.AccessType
+	// contains the access type of the triggering fault.
+ // + // - ErrContextInterrupt: The Context was interrupted by a call to + // Interrupt(). Switch() may return ErrContextInterrupt spuriously. In + // particular, most implementations of Interrupt() will cause the first + // following call to Switch() to return ErrContextInterrupt if there is no + // concurrent call to Switch(). + // + // - ErrContextCPUPreempted: See the definition of that error for details. + Switch(as AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) + + // Interrupt interrupts a concurrent call to Switch(), causing it to return + // ErrContextInterrupt. + Interrupt() +} + +var ( + // ErrContextSignal is returned by Context.Switch() to indicate that the + // Context was interrupted by a signal. + ErrContextSignal = fmt.Errorf("interrupted by signal") + + // ErrContextInterrupt is returned by Context.Switch() to indicate that the + // Context was interrupted by a call to Context.Interrupt(). + ErrContextInterrupt = fmt.Errorf("interrupted by platform.Context.Interrupt()") + + // ErrContextCPUPreempted is returned by Context.Switch() to indicate that + // one of the following occurred: + // + // - The CPU executing the Context is not the CPU passed to + // Context.Switch(). + // + // - The CPU executing the Context may have executed another Context since + // the last time it executed this one; or the CPU has previously executed + // another Context, and has never executed this one. + // + // - Platform.PreemptAllCPUs() was called since the last return from + // Context.Switch(). + ErrContextCPUPreempted = fmt.Errorf("interrupted by CPU preemption") +) + +// SignalInterrupt is a signal reserved for use by implementations of +// Context.Interrupt(). The sentry guarantees that it will ignore delivery of +// this signal both to Contexts and to the sentry itself, under the assumption +// that they originate from races with Context.Interrupt(). 
+// +// NOTE: The Go runtime only guarantees that a small subset +// of signals will be always be unblocked on all threads, one of which +// is SIGCHLD. +const SignalInterrupt = linux.SIGCHLD + +// AddressSpace represents a virtual address space in which a Context can +// execute. +type AddressSpace interface { + // MapFile creates a shared mapping of offsets in fr, from the file + // with file descriptor fd, at address addr. Any existing overlapping + // mappings are silently replaced. + // + // If precommit is true, host memory should be committed to the mapping + // when MapFile returns when possible. The precommit flag is advisory + // and implementations may choose to ignore it. + // + // Preconditions: addr and fr must be page-aligned. length > 0. + // at.Any() == true. + MapFile(addr usermem.Addr, fd int, fr FileRange, at usermem.AccessType, precommit bool) error + + // Unmap unmaps the given range. + // + // Preconditions: addr is page-aligned. length > 0. + Unmap(addr usermem.Addr, length uint64) + + // Release releases this address space. After releasing, a new AddressSpace + // must be acquired via platform.NewAddressSpace(). + Release() error + + // AddressSpaceIO methods are supported iff the associated platform's + // Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this + // does not hold may panic if AddressSpaceIO methods are invoked. + AddressSpaceIO +} + +// AddressSpaceIO supports IO through the memory mappings installed in an +// AddressSpace. +// +// AddressSpaceIO implementors are responsible for ensuring that address ranges +// are application-mappable. +type AddressSpaceIO interface { + // CopyOut copies len(src) bytes from src to the memory mapped at addr. It + // returns the number of bytes copied. If the number of bytes copied is < + // len(src), it returns a non-nil error explaining why. + CopyOut(addr usermem.Addr, src []byte) (int, error) + + // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. 
+ // It returns the number of bytes copied. If the number of bytes copied is + // < len(dst), it returns a non-nil error explaining why. + CopyIn(addr usermem.Addr, dst []byte) (int, error) + + // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number + // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a + // non-nil error explaining why. + ZeroOut(addr usermem.Addr, toZero uintptr) (uintptr, error) + + // SwapUint32 atomically sets the uint32 value at addr to new and returns + // the previous value. + // + // Preconditions: addr must be aligned to a 4-byte boundary. + SwapUint32(addr usermem.Addr, new uint32) (uint32, error) + + // CompareAndSwapUint32 atomically compares the uint32 value at addr to + // old; if they are equal, the value in memory is replaced by new. In + // either case, the previous value stored in memory is returned. + // + // Preconditions: addr must be aligned to a 4-byte boundary. + CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) +} + +// NoAddressSpaceIO implements AddressSpaceIO methods by panicing. +type NoAddressSpaceIO struct{} + +// CopyOut implements AddressSpaceIO.CopyOut. +func (NoAddressSpaceIO) CopyOut(addr usermem.Addr, src []byte) (int, error) { + panic("This platform does not support AddressSpaceIO") +} + +// CopyIn implements AddressSpaceIO.CopyIn. +func (NoAddressSpaceIO) CopyIn(addr usermem.Addr, dst []byte) (int, error) { + panic("This platform does not support AddressSpaceIO") +} + +// ZeroOut implements AddressSpaceIO.ZeroOut. +func (NoAddressSpaceIO) ZeroOut(addr usermem.Addr, toZero uintptr) (uintptr, error) { + panic("This platform does not support AddressSpaceIO") +} + +// SwapUint32 implements AddressSpaceIO.SwapUint32. +func (NoAddressSpaceIO) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { + panic("This platform does not support AddressSpaceIO") +} + +// CompareAndSwapUint32 implements AddressSpaceIO.CompareAndSwapUint32. 
+func (NoAddressSpaceIO) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { + panic("This platform does not support AddressSpaceIO") +} + +// SegmentationFault is an error returned by AddressSpaceIO methods when IO +// fails due to access of an unmapped page, or a mapped page with insufficient +// permissions. +type SegmentationFault struct { + // Addr is the address at which the fault occurred. + Addr usermem.Addr +} + +// Error implements error.Error. +func (f SegmentationFault) Error() string { + return fmt.Sprintf("segmentation fault at %#x", f.Addr) +} + +// File represents a host file that may be mapped into an AddressSpace. +type File interface { + // MapInto maps fr into as, starting at addr, for accesses of type at. + // + // If precommit is true, the platform should eagerly commit resources (e.g. + // physical memory) to the mapping. The precommit flag is advisory and + // implementations may choose to ignore it. + // + // Note that there is no File.Unmap; clients should use as.Unmap directly. + // + // Preconditions: fr.Start and fr.End must be page-aligned. + // fr.Length() > 0. at.Any() == true. Implementors may define + // additional requirements. + MapInto(as AddressSpace, addr usermem.Addr, fr FileRange, at usermem.AccessType, precommit bool) error + + // MapInternal returns a mapping of the given file offsets in the invoking + // process' address space for reading and writing. The lifetime of the + // returned mapping is implementation-defined. + // + // Note that fr.Start and fr.End need not be page-aligned. + // + // Preconditions: fr.Length() > 0. Implementors may define additional + // requirements. + MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) + + // IncRef signals that a region in the file is actively referenced through a + // memory map. Implementors must ensure that the contents of a referenced + // region remain consistent. 
Specifically, mappings returned by MapInternal + // must refer to the same underlying contents. If the implementor also + // implements the Memory interface, the file range must not be reused in a + // different allocation while it has active references. + // + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > 0. + IncRef(fr FileRange) + + // DecRef reduces the frame ref count on the range specified by fr. + // + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. DecRef()s on a region must match earlier IncRef()s. + DecRef(fr FileRange) +} + +// FileRange represents a range of uint64 offsets into a File. +// +// type FileRange + +// String implements fmt.Stringer.String. +func (fr FileRange) String() string { + return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) +} + +// Memory represents an allocatable File that may be mapped into any +// AddressSpace associated with the same Platform. +type Memory interface { + // Memory implements File methods with the following properties: + // + // - Pages mapped by MapInto must be allocated, and must be unmapped from + // all AddressSpaces before they are freed. + // + // - Pages mapped by MapInternal must be allocated. Returned mappings are + // guaranteed to be valid until the mapped pages are freed. + File + + // Allocate returns a range of pages of the given length, owned by the + // caller and with the given accounting kind. Allocated memory initially has + // a single reference and will automatically be freed when no references to + // them remain. See File.IncRef and File.DecRef. + // + // Preconditions: length must be page-aligned and non-zero. + Allocate(length uint64, kind usage.MemoryKind) (FileRange, error) + + // Decommit releases resources associated with maintaining the contents of + // the given frames. If Decommit succeeds, future accesses of the + // decommitted frames will read zeroes. + // + // Preconditions: fr.Length() > 0. 
+ Decommit(fr FileRange) error + + // UpdateUsage updates the memory usage statistics. This must be called + // before the relevant memory statistics in usage.MemoryAccounting can + // be considered accurate. + UpdateUsage() error + + // TotalUsage returns an aggregate usage for all memory statistics + // except Mapped (which is external to the Memory implementation). This + // is generally much cheaper than UpdateUsage, but will not provide a + // fine-grained breakdown. + TotalUsage() (uint64, error) + + // TotalSize returns the current maximum size of the Memory in bytes. The + // value returned by TotalSize is permitted to change. + TotalSize() uint64 + + // Destroy releases all resources associated with the Memory. + // + // Preconditions: There are no remaining uses of any of the freed memory's + // frames. + // + // Postconditions: None of the Memory's methods may be called after Destroy. + Destroy() + + // SaveTo saves the memory state to the given stream, which will + // generally be a statefile. + SaveTo(w io.Writer) error + + // LoadFrom loads the memory state from the given stream, which will + // generally be a statefile. + LoadFrom(r io.Reader) error +} + +// AllocateAndFill allocates memory of the given kind from mem and fills it by +// calling r.ReadToBlocks() repeatedly until either length bytes are read or a +// non-nil error is returned. It returns the memory filled by r, truncated down +// to the nearest page. If this is shorter than length bytes due to an error +// returned by r.ReadToBlocks(), it returns that error. +// +// Preconditions: length > 0. length must be page-aligned. 
+func AllocateAndFill(mem Memory, length uint64, kind usage.MemoryKind, r safemem.Reader) (FileRange, error) { + fr, err := mem.Allocate(length, kind) + if err != nil { + return FileRange{}, err + } + dsts, err := mem.MapInternal(fr, usermem.Write) + if err != nil { + mem.DecRef(fr) + return FileRange{}, err + } + n, err := safemem.ReadFullToBlocks(r, dsts) + un := uint64(usermem.Addr(n).RoundDown()) + if un < length { + // Free unused memory and update fr to contain only the memory that is + // still allocated. + mem.DecRef(FileRange{fr.Start + un, fr.End}) + fr.End = fr.Start + un + } + return fr, err +} diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/sentry/platform/procid/BUILD new file mode 100644 index 000000000..5db4f6261 --- /dev/null +++ b/pkg/sentry/platform/procid/BUILD @@ -0,0 +1,32 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "procid", + srcs = [ + "procid.go", + "procid_amd64.s", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid", + visibility = ["//pkg/sentry:internal"], +) + +go_test( + name = "procid_test", + size = "small", + srcs = [ + "procid_test.go", + ], + embed = [":procid"], +) + +go_test( + name = "procid_net_test", + size = "small", + srcs = [ + "procid_net_test.go", + "procid_test.go", + ], + embed = [":procid"], +) diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/sentry/platform/procid/procid.go new file mode 100644 index 000000000..5f861908f --- /dev/null +++ b/pkg/sentry/platform/procid/procid.go @@ -0,0 +1,21 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package procid provides a way to get the current system thread identifier. +package procid + +// Current returns the current system thread identifier. +// +// Precondition: This should only be called with the runtime OS thread locked. +func Current() uint64 diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s new file mode 100644 index 000000000..ead4e3d91 --- /dev/null +++ b/pkg/sentry/platform/procid/procid_amd64.s @@ -0,0 +1,30 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 +// +build go1.8 +// +build !go1.11 + +#include "textflag.h" + +TEXT ·Current(SB),NOSPLIT,$0-8 + // The offset specified here is the m_procid offset for Go1.8+. + // Changes to this offset should be caught by the tests, and major + // version changes require an explicit tag change above. 
+ MOVQ TLS, AX + MOVQ 0(AX)(TLS*1), AX + MOVQ 48(AX), AX // g_m (may change in future versions) + MOVQ 72(AX), AX // m_procid (may change in future versions) + MOVQ AX, ret+0(FP) + RET diff --git a/pkg/sentry/platform/procid/procid_net_test.go b/pkg/sentry/platform/procid/procid_net_test.go new file mode 100644 index 000000000..2d1605a08 --- /dev/null +++ b/pkg/sentry/platform/procid/procid_net_test.go @@ -0,0 +1,21 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package procid + +// This file is just to force the inclusion of the "net" package, which will +// make the test binary a cgo one. +import ( + _ "net" +) diff --git a/pkg/sentry/platform/procid/procid_test.go b/pkg/sentry/platform/procid/procid_test.go new file mode 100644 index 000000000..5e44da36f --- /dev/null +++ b/pkg/sentry/platform/procid/procid_test.go @@ -0,0 +1,85 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package procid + +import ( + "os" + "runtime" + "sync" + "syscall" + "testing" +) + +// runOnMain is used to send functions to run on the main (initial) thread. +var runOnMain = make(chan func(), 10) + +func checkProcid(t *testing.T, start *sync.WaitGroup, done *sync.WaitGroup) { + defer done.Done() + + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + start.Done() + start.Wait() + + procID := Current() + tid := syscall.Gettid() + + if procID != uint64(tid) { + t.Logf("Bad procid: expected %v, got %v", tid, procID) + t.Fail() + } +} + +func TestProcidInitialized(t *testing.T) { + var start sync.WaitGroup + var done sync.WaitGroup + + count := 100 + start.Add(count + 1) + done.Add(count + 1) + + // Run the check on the main thread. + // + // When cgo is not included, the only case when procid isn't initialized + // is in the main (initial) thread, so we have to test this case + // specifically. + runOnMain <- func() { + checkProcid(t, &start, &done) + } + + // Run the check on a number of different threads. + for i := 0; i < count; i++ { + go checkProcid(t, &start, &done) + } + + done.Wait() +} + +func TestMain(m *testing.M) { + // Make sure we remain at the main (initial) thread. + runtime.LockOSThread() + + // Start running tests in a different goroutine. + go func() { + os.Exit(m.Run()) + }() + + // Execute any functions that have been sent for execution in the main + // thread. 
+ for f := range runOnMain { + f() + } +} diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD new file mode 100644 index 000000000..16b0b3c69 --- /dev/null +++ b/pkg/sentry/platform/ptrace/BUILD @@ -0,0 +1,31 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "ptrace", + srcs = [ + "ptrace.go", + "ptrace_unsafe.go", + "stub_amd64.s", + "stub_unsafe.go", + "subprocess.go", + "subprocess_amd64.go", + "subprocess_linux.go", + "subprocess_linux_amd64_unsafe.go", + "subprocess_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace", + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/arch", + "//pkg/sentry/platform", + "//pkg/sentry/platform/filemem", + "//pkg/sentry/platform/interrupt", + "//pkg/sentry/platform/procid", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/usermem", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go new file mode 100644 index 000000000..05f8b1d05 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -0,0 +1,242 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ptrace provides a ptrace-based implementation of the platform +// interface. 
This is useful for development and testing purposes primarily, +// and runs on stock kernels without special permissions. +// +// In a nutshell, it works as follows: +// +// The creation of a new address space creates a new child process with a +// single thread which is traced by a single goroutine. +// +// A context is just a collection of temporary variables. Calling Switch on a +// context does the following: +// +// Locks the runtime thread. +// +// Looks up a traced subprocess thread for the current runtime thread. If +// none exists, the dedicated goroutine is asked to create a new stopped +// thread in the subprocess. This stopped subprocess thread is then traced +// by the current thread and this information is stored for subsequent +// switches. +// +// The context is then bound with information about the subprocess thread +// so that the context may be appropriately interrupted via a signal. +// +// The requested operation is performed in the traced subprocess thread +// (e.g. set registers, execute, return). +// +// FIXME: This package is currently sloppy with cleanup. +// +// Lock order: +// +// subprocess.mu +// context.mu +package ptrace + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +var ( + // stubStart is the link address for our stub, and determines the + // maximum user address. This is valid only after a call to stubInit. + // + // We attempt to link the stub here, and adjust downward as needed. + stubStart uintptr = 0x7fffffff0000 + + // stubEnd is the first byte past the end of the stub, as with + // stubStart this is valid only after a call to stubInit. 
+ stubEnd uintptr + + // stubInitialized controls one-time stub initialization. + stubInitialized sync.Once +) + +type context struct { + // signalInfo is the signal info, if and when a signal is received. + signalInfo arch.SignalInfo + + // interrupt is the interrupt context. + interrupt interrupt.Forwarder + + // mu protects the following fields. + mu sync.Mutex + + // If lastFaultSP is non-nil, the last context switch was due to a fault + // received while executing lastFaultSP. Only context.Switch may set + // lastFaultSP to a non-nil value. + lastFaultSP *subprocess + + // lastFaultAddr is the last faulting address; this is only meaningful if + // lastFaultSP is non-nil. + lastFaultAddr usermem.Addr + + // lastFaultIP is the address of the last faulting instruction; + // this is also only meaningful if lastFaultSP is non-nil. + lastFaultIP usermem.Addr +} + +// Switch runs the provided context in the given address space. +func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) { + s := as.(*subprocess) + isSyscall := s.switchToApp(c, ac) + + var faultSP *subprocess + var faultAddr usermem.Addr + var faultIP usermem.Addr + if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV { + faultSP = s + faultAddr = usermem.Addr(c.signalInfo.Addr()) + faultIP = usermem.Addr(ac.IP()) + } + + // Update the context to reflect the outcome of this context switch. + c.mu.Lock() + lastFaultSP := c.lastFaultSP + lastFaultAddr := c.lastFaultAddr + lastFaultIP := c.lastFaultIP + // At this point, c may not yet be in s.contexts, so c.lastFaultSP won't be + // updated by s.Unmap(). This is fine; we only need to synchronize with + // calls to s.Unmap() that occur after the handling of this fault. + c.lastFaultSP = faultSP + c.lastFaultAddr = faultAddr + c.lastFaultIP = faultIP + c.mu.Unlock() + + // Update subprocesses to reflect the outcome of this context switch. 
+ if lastFaultSP != faultSP { + if lastFaultSP != nil { + lastFaultSP.mu.Lock() + delete(lastFaultSP.contexts, c) + lastFaultSP.mu.Unlock() + } + if faultSP != nil { + faultSP.mu.Lock() + faultSP.contexts[c] = struct{}{} + faultSP.mu.Unlock() + } + } + + if isSyscall { + return nil, usermem.NoAccess, nil + } + if faultSP == nil { + // Non-fault signal. + return &c.signalInfo, usermem.NoAccess, platform.ErrContextSignal + } + + // Got a page fault. Ideally, we'd get real fault type here, but ptrace + // doesn't expose this information. Instead, we use a simple heuristic: + // + // It was an instruction fault iff the faulting addr == instruction + // pointer. + // + // It was a write fault if the fault is immediately repeated. + at := usermem.Read + if faultAddr == faultIP { + at.Execute = true + } + if lastFaultSP == faultSP && + lastFaultAddr == faultAddr && + lastFaultIP == faultIP { + at.Write = true + } + return &c.signalInfo, at, platform.ErrContextSignal +} + +// Interrupt interrupts the running guest application associated with this context. +func (c *context) Interrupt() { + c.interrupt.NotifyInterrupt() +} + +// PTrace represents a collection of ptrace subprocesses. +type PTrace struct { + platform.MMapMinAddr + platform.NoCPUPreemptionDetection + *filemem.FileMem +} + +// New returns a new ptrace-based implementation of the platform interface. +func New() (*PTrace, error) { + stubInitialized.Do(func() { + // Initialize the stub. + stubInit() + + // Create the master process for the global pool. This must be + // done before initializing any other processes. + master, err := newSubprocess(createStub) + if err != nil { + // Should never happen. + panic("unable to initialize ptrace master: " + err.Error()) + } + + // Set the master on the globalPool. 
+ globalPool.master = master + }) + + fm, err := filemem.New("ptrace-memory") + if err != nil { + return nil, err + } + + return &PTrace{FileMem: fm}, nil +} + +// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. +func (*PTrace) SupportsAddressSpaceIO() bool { + return false +} + +// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. +func (*PTrace) CooperativelySchedulesAddressSpace() bool { + return false +} + +// MapUnit implements platform.Platform.MapUnit. +func (*PTrace) MapUnit() uint64 { + // The host kernel manages page tables and arbitrary-sized mappings + // have effectively the same cost. + return 0 +} + +// MaxUserAddress returns the first address that may not be used by user +// applications. +func (*PTrace) MaxUserAddress() usermem.Addr { + return usermem.Addr(stubStart) +} + +// NewAddressSpace returns a new subprocess. +func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { + as, err := newSubprocess(globalPool.master.createStub) + return as, nil, err +} + +// NewContext returns an interruptible context. +func (*PTrace) NewContext() platform.Context { + return &context{} +} + +// Memory returns the platform memory used to do allocations. +func (p *PTrace) Memory() platform.Memory { + return p.FileMem +} diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go new file mode 100644 index 000000000..b55b2795a --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -0,0 +1,166 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// GETREGSET/SETREGSET register set types. +// +// See include/uapi/linux/elf.h. +const ( + // _NT_PRFPREG is for x86 floating-point state without using xsave. + _NT_PRFPREG = 0x2 + + // _NT_X86_XSTATE is for x86 extended state using xsave. + _NT_X86_XSTATE = 0x202 +) + +// fpRegSet returns the GETREGSET/SETREGSET register set type to be used. +func fpRegSet(useXsave bool) uintptr { + if useXsave { + return _NT_X86_XSTATE + } + return _NT_PRFPREG +} + +// getRegs sets the regular register set. +func (t *thread) getRegs(regs *syscall.PtraceRegs) error { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_GETREGS, + uintptr(t.tid), + 0, + uintptr(unsafe.Pointer(regs)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// setRegs sets the regular register set. +func (t *thread) setRegs(regs *syscall.PtraceRegs) error { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_SETREGS, + uintptr(t.tid), + 0, + uintptr(unsafe.Pointer(regs)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// getFPRegs gets the floating-point data via the GETREGSET ptrace syscall. 
+func (t *thread) getFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error { + iovec := syscall.Iovec{ + Base: (*byte)(fpState), + Len: fpLen, + } + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_GETREGSET, + uintptr(t.tid), + fpRegSet(useXsave), + uintptr(unsafe.Pointer(&iovec)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// setFPRegs sets the floating-point data via the SETREGSET ptrace syscall. +func (t *thread) setFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error { + iovec := syscall.Iovec{ + Base: (*byte)(fpState), + Len: fpLen, + } + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_SETREGSET, + uintptr(t.tid), + fpRegSet(useXsave), + uintptr(unsafe.Pointer(&iovec)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// getSignalInfo retrieves information about the signal that caused the stop. +func (t *thread) getSignalInfo(si *arch.SignalInfo) error { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_GETSIGINFO, + uintptr(t.tid), + 0, + uintptr(unsafe.Pointer(si)), + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +// clone creates a new thread from this one. +// +// The returned thread will be stopped and available for any system thread to +// call attach on it. +// +// Precondition: the OS thread must be locked and own t. +func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) { + r, ok := usermem.Addr(initRegs.Rsp).RoundUp() + if !ok { + return nil, syscall.EINVAL + } + rval, err := t.syscallIgnoreInterrupt( + initRegs, + syscall.SYS_CLONE, + arch.SyscallArgument{Value: uintptr( + syscall.CLONE_FILES | + syscall.CLONE_FS | + syscall.CLONE_SIGHAND | + syscall.CLONE_THREAD | + syscall.CLONE_PTRACE | + syscall.CLONE_VM)}, + // The stack pointer is just made up, but we have it be + // something sensible so the kernel doesn't think we're + // up to no good. Which we are. 
+ arch.SyscallArgument{Value: uintptr(r)}, + arch.SyscallArgument{}, + arch.SyscallArgument{}, + // We use these registers initially, but really they + // could be anything. We're going to stop immediately. + arch.SyscallArgument{Value: uintptr(unsafe.Pointer(initRegs))}) + if err != nil { + return nil, err + } + + return &thread{ + tgid: t.tgid, + tid: int32(rval), + cpu: ^uint32(0), + }, nil +} diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s new file mode 100644 index 000000000..9bf87b6f6 --- /dev/null +++ b/pkg/sentry/platform/ptrace/stub_amd64.s @@ -0,0 +1,114 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +#define SYS_GETPID 39 +#define SYS_EXIT 60 +#define SYS_KILL 62 +#define SYS_GETPPID 110 +#define SYS_PRCTL 157 + +#define SIGKILL 9 +#define SIGSTOP 19 + +#define PR_SET_PDEATHSIG 1 + +// stub bootstraps the child and sends itself SIGSTOP to wait for attach. +// +// R15 contains the expected PPID. R15 is used instead of a more typical DI +// since syscalls will clobber DI and createStub wants to pass a new PPID to +// grandchildren. +// +// This should not be used outside the context of a new ptrace child (as the +// function is otherwise a bunch of nonsense). +TEXT ·stub(SB),NOSPLIT,$0 +begin: + // N.B. This loop only executes in the context of a single-threaded + // fork child. 
+ + MOVQ $SYS_PRCTL, AX + MOVQ $PR_SET_PDEATHSIG, DI + MOVQ $SIGKILL, SI + SYSCALL + + CMPQ AX, $0 + JNE error + + // If the parent already died before we called PR_SET_PDEATHSIG then + // we'll have an unexpected PPID. + MOVQ $SYS_GETPPID, AX + SYSCALL + + CMPQ AX, $0 + JL error + + CMPQ AX, R15 + JNE parent_dead + + MOVQ $SYS_GETPID, AX + SYSCALL + + CMPQ AX, $0 + JL error + + // SIGSTOP to wait for attach. + // + // The SYSCALL instruction will be used for future syscall injection by + // thread.syscall. + MOVQ AX, DI + MOVQ $SYS_KILL, AX + MOVQ $SIGSTOP, SI + SYSCALL + + // The tracer may "detach" and/or allow code execution here in three cases: + // + // 1. New (traced) stub threads are explicitly detached by the + // goroutine in newSubprocess. However, they are detached while in + // group-stop, so they do not execute code here. + // + // 2. If a tracer thread exits, it implicitly detaches from the stub, + // potentially allowing code execution here. However, the Go runtime + // never exits individual threads, so this case never occurs. + // + // 3. subprocess.createStub clones a new stub process that is untraced, + // thus executing this code. We setup the PDEATHSIG before SIGSTOPing + // ourselves for attach by the tracer. + // + // R15 has been updated with the expected PPID. + JMP begin + +error: + // Exit with -errno. + MOVQ AX, DI + NEGQ DI + MOVQ $SYS_EXIT, AX + SYSCALL + HLT + +parent_dead: + MOVQ $SYS_EXIT, AX + MOVQ $1, DI + SYSCALL + HLT + +// stubCall calls the stub function at the given address with the given PPID. +// +// This is a distinct function because stub, above, may be mapped at any +// arbitrary location, and stub has a specific binary API (see above). 
+TEXT ·stubCall(SB),NOSPLIT,$0-16 + MOVQ addr+0(FP), AX + MOVQ pid+8(FP), R15 + JMP AX diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go new file mode 100644 index 000000000..c868a2d68 --- /dev/null +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "reflect" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// stub is defined in arch-specific assembly. +func stub() + +// stubCall calls the stub at the given address with the given pid. +func stubCall(addr, pid uintptr) + +// unsafeSlice returns a slice for the given address and length. +func unsafeSlice(addr uintptr, length int) (slice []byte) { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + sh.Data = addr + sh.Len = length + sh.Cap = length + return +} + +// stubInit initializes the stub. +func stubInit() { + // Grab the existing stub. + stubBegin := reflect.ValueOf(stub).Pointer() + stubLen := int(safecopy.FindEndAddress(stubBegin) - stubBegin) + stubSlice := unsafeSlice(stubBegin, stubLen) + mapLen := uintptr(stubLen) + if offset := mapLen % usermem.PageSize; offset != 0 { + mapLen += usermem.PageSize - offset + } + + for stubStart > 0 { + // Map the target address for the stub. 
+ // + // We don't use FIXED here because we don't want to unmap + // something that may have been there already. We just walk + // down the address space until we find a place where the stub + // can be placed. + addr, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, + stubStart, + mapLen, + syscall.PROT_WRITE|syscall.PROT_READ, + syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS, + 0 /* fd */, 0 /* offset */) + if addr != stubStart || errno != 0 { + if addr != 0 { + // Unmap the region we've mapped accidentally. + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, mapLen, 0) + } + + // Attempt to begin at a lower address. + stubStart -= uintptr(usermem.PageSize) + continue + } + + // Copy the stub to the address. + targetSlice := unsafeSlice(addr, stubLen) + copy(targetSlice, stubSlice) + + // Make the stub executable. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_MPROTECT, + stubStart, + mapLen, + syscall.PROT_EXEC|syscall.PROT_READ); errno != 0 { + panic("mprotect failed: " + errno.Error()) + } + + // Set the end. + stubEnd = stubStart + mapLen + return + } + + // This will happen only if we exhaust the entire address + // space, and it will take a long, long time. + panic("failed to map stub") +} diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go new file mode 100644 index 000000000..0d6a38f15 --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -0,0 +1,559 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + "fmt" + "os" + "runtime" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// globalPool exists to solve two distinct problems: +// +// 1) Subprocesses can't always be killed properly (see Release). +// +// 2) Any seccomp filters that have been installed will apply to subprocesses +// created here. Therefore we use the intermediary (master), which is created +// on initialization of the platform. +var globalPool struct { + mu sync.Mutex + master *subprocess + available []*subprocess +} + +// thread is a traced thread; it is a thread identifier. +// +// This is a convenience type for defining ptrace operations. +type thread struct { + tgid int32 + tid int32 + cpu uint32 +} + +// threadPool is a collection of threads. +type threadPool struct { + // mu protects below. + mu sync.Mutex + + // threads is the collection of threads. + // + // This map is indexed by system TID (the calling thread); which will + // be the tracer for the given *thread, and therefore capable of using + // relevant ptrace calls. + threads map[int32]*thread +} + +// lookupOrCreate looks up a given thread or creates one. +// +// newThread will generally be subprocess.newThread. +// +// Precondition: the runtime OS thread must be locked. +func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread) *thread { + tp.mu.Lock() + t, ok := tp.threads[currentTID] + if !ok { + // Before creating a new thread, see if we can find a thread + // whose system tid has disappeared. + // + // TODO: Other parts of this package depend on + // threads never exiting. + for origTID, t := range tp.threads { + // Signal zero is an easy existence check. 
+ if err := syscall.Tgkill(syscall.Getpid(), int(origTID), 0); err != nil { + // This thread has been abandoned; reuse it. + delete(tp.threads, origTID) + tp.threads[currentTID] = t + tp.mu.Unlock() + return t + } + } + + // Create a new thread. + t = newThread() + tp.threads[currentTID] = t + } + tp.mu.Unlock() + return t +} + +// subprocess is a collection of threads being traced. +type subprocess struct { + platform.NoAddressSpaceIO + + // initRegs are the initial registers for the first thread. + // + // These are used for the register set for system calls. + initRegs syscall.PtraceRegs + + // requests is used to signal creation of new threads. + requests chan chan *thread + + // sysemuThreads are reserved for emulation. + sysemuThreads threadPool + + // syscallThreads are reserved for syscalls (except clone, which is + // handled in the dedicated goroutine corresponding to requests above). + syscallThreads threadPool + + // mu protects the following fields. + mu sync.Mutex + + // contexts is the set of contexts for which it's possible that + // context.lastFaultSP == this subprocess. + contexts map[*context]struct{} +} + +// newSubprocess returns a useable subprocess. +// +// This will either be a newly created subprocess, or one from the global pool. +// The create function will be called in the latter case, which is guaranteed +// to happen with the runtime thread locked. +func newSubprocess(create func() (*thread, error)) (*subprocess, error) { + // See Release. + globalPool.mu.Lock() + if len(globalPool.available) > 0 { + sp := globalPool.available[len(globalPool.available)-1] + globalPool.available = globalPool.available[:len(globalPool.available)-1] + globalPool.mu.Unlock() + return sp, nil + } + globalPool.mu.Unlock() + + // The following goroutine is responsible for creating the first traced + // thread, and responding to requests to make additional threads in the + // traced process. 
The process will be killed and reaped when the + // request channel is closed, which happens in Release below. + var initRegs syscall.PtraceRegs + errChan := make(chan error) + requests := make(chan chan *thread) + go func() { // S/R-SAFE: Platform-related. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Initialize the first thread. + firstThread, err := create() + if err != nil { + errChan <- err + return + } + + // Grab registers. + // + // Note that we adjust the current register RIP value to be + // just before the current system call executed. This depends + // on the definition of the stub itself. + if err := firstThread.getRegs(&initRegs); err != nil { + panic(fmt.Sprintf("ptrace get regs failed: %v", err)) + } + initRegs.Rip -= initRegsRipAdjustment + + // Ready to handle requests. + errChan <- nil + + // Wait for requests to create threads. + for r := range requests { + t, err := firstThread.clone(&initRegs) + if err != nil { + // Should not happen: not recoverable. + panic(fmt.Sprintf("error initializing first thread: %v", err)) + } + + // Since the new thread was created with + // clone(CLONE_PTRACE), it will begin execution with + // SIGSTOP pending and with this thread as its tracer. + // (Hopefully nobody tgkilled it with a signal < + // SIGSTOP before the SIGSTOP was delivered, in which + // case that signal would be delivered before SIGSTOP.) + if sig := t.wait(); sig != syscall.SIGSTOP { + panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig)) + } + + // Detach the thread without suppressing the SIGSTOP, + // causing it to enter group-stop. + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { + panic(fmt.Sprintf("can't detach new clone: %v", errno)) + } + + // Return the thread. + r <- t + } + + // Requests should never be closed. + panic("unreachable") + }() + + // Wait until error or readiness. 
+ if err := <-errChan; err != nil { + return nil, err + } + + // Ready. + sp := &subprocess{ + initRegs: initRegs, + requests: requests, + sysemuThreads: threadPool{ + threads: make(map[int32]*thread), + }, + syscallThreads: threadPool{ + threads: make(map[int32]*thread), + }, + contexts: make(map[*context]struct{}), + } + + sp.unmap() + return sp, nil +} + +// unmap unmaps non-stub regions of the process. +// +// This will panic on failure (which should never happen). +func (s *subprocess) unmap() { + s.Unmap(0, uint64(stubStart)) + if maximumUserAddress != stubEnd { + s.Unmap(usermem.Addr(stubEnd), uint64(maximumUserAddress-stubEnd)) + } +} + +// Release kills the subprocess. +// +// Just kidding! We can't safely co-ordinate the detaching of all the +// tracees (since the tracers are random runtime threads, and the process +// won't exit until tracers have been notifier). +// +// Therefore we simply unmap everything in the subprocess and return it to the +// globalPool. This has the added benefit of reducing creation time for new +// subprocesses. +func (s *subprocess) Release() error { + go func() { // S/R-SAFE: Platform. + s.unmap() + globalPool.mu.Lock() + globalPool.available = append(globalPool.available, s) + globalPool.mu.Unlock() + }() + return nil +} + +// newThread creates a new traced thread. +// +// Precondition: the OS thread must be locked. +func (s *subprocess) newThread() *thread { + // Ask the first thread to create a new one. + r := make(chan *thread) + s.requests <- r + t := <-r + + // Attach the subprocess to this one. + t.attach() + + // Return the new thread, which is now bound. + return t +} + +// attach attachs to the thread. 
+func (t *thread) attach() { + if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_ATTACH, uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("unable to attach: %v", errno)) + } + + // PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already + // stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of + // newSubprocess), so we always expect to see signal-delivery-stop with + // SIGSTOP. + if sig := t.wait(); sig != syscall.SIGSTOP { + panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig)) + } + + // Initialize options. + t.init() +} + +// wait waits for a stop event. +func (t *thread) wait() syscall.Signal { + var status syscall.WaitStatus + + for { + r, err := syscall.Wait4(int(t.tid), &status, syscall.WALL|syscall.WUNTRACED, nil) + if err == syscall.EINTR || err == syscall.EAGAIN { + // Wait was interrupted; wait again. + continue + } else if err != nil { + panic(fmt.Sprintf("ptrace wait failed: %v", err)) + } + if int(r) != int(t.tid) { + panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid)) + } + if !status.Stopped() { + panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) + } + if status.StopSignal() == 0 { + continue // Spurious stop. + } + return status.StopSignal() + } +} + +// init initializes trace options. +func (t *thread) init() { + // Set our TRACESYSGOOD option to differeniate real SIGTRAP. + _, _, errno := syscall.RawSyscall6( + syscall.SYS_PTRACE, + syscall.PTRACE_SETOPTIONS, + uintptr(t.tid), + 0, + syscall.PTRACE_O_TRACESYSGOOD, + 0, 0) + if errno != 0 { + panic(fmt.Sprintf("ptrace set options failed: %v", errno)) + } +} + +// syscall executes a system call cycle in the traced context. +// +// This is _not_ for use by application system calls, rather it is for use when +// a system call must be injected into the remote context (e.g. mmap, munmap). +// Note that clones are handled separately. 
+func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
+	// Set registers.
+	if err := t.setRegs(regs); err != nil {
+		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+	}
+
+	for {
+		// Execute the syscall instruction.
+		if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 {
+			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+		}
+
+		// With TRACESYSGOOD set (see init), syscall stops are reported
+		// as SIGTRAP|0x80; any other stop is a stray signal and we
+		// simply resume until we reach syscall-enter-stop.
+		if sig := t.wait(); sig == (0x80 | syscall.SIGTRAP) {
+			// Reached syscall-enter-stop.
+			break
+		}
+	}
+
+	// Complete the actual system call.
+	if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 {
+		panic(fmt.Sprintf("ptrace syscall-exit failed: %v", errno))
+	}
+
+	// Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
+	// between syscall-enter-stop and syscall-exit-stop; it happens *after*
+	// syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
+	if sig := t.wait(); sig != (0x80 | syscall.SIGTRAP) {
+		panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
+	}
+
+	// Grab registers.
+	if err := t.getRegs(regs); err != nil {
+		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
+	}
+
+	return syscallReturnValue(regs)
+}
+
+// syscallIgnoreInterrupt ignores interrupts on the system call thread and
+// restarts the syscall if the kernel indicates that should happen (one of
+// the ERESTART* errnos that are normally hidden from userspace but visible
+// to ptrace syscall-exit tracing).
+func (t *thread) syscallIgnoreInterrupt(
+	initRegs *syscall.PtraceRegs,
+	sysno uintptr,
+	args ...arch.SyscallArgument) (uintptr, error) {
+	for {
+		regs := createSyscallRegs(initRegs, sysno, args...)
+		rval, err := t.syscall(&regs)
+		switch err {
+		case ERESTARTSYS, ERESTARTNOINTR, ERESTARTNOHAND:
+			// Kernel requested a restart; re-issue the call with
+			// fresh registers.
+			continue
+		default:
+			return rval, err
+		}
+	}
+}
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+func (t *thread) NotifyInterrupt() { + syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(platform.SignalInterrupt)) +} + +// switchToApp is called from the main SwitchToApp entrypoint. +// +// This function returns true on a system call, false on a signal. +func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { + regs := &ac.StateData().Regs + s.resetSysemuRegs(regs) + + // Extract floating point state. + fpState := ac.FloatingPointData() + fpLen, _ := ac.FeatureSet().ExtendedStateSize() + useXsave := ac.FeatureSet().UseXsave() + + // Lock the thread for ptrace operations. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Grab our thread from the pool. + currentTID := int32(procid.Current()) + t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread) + + // Check for interrupts, and ensure that future interrupts will signal t. + if !c.interrupt.Enable(t) { + // Pending interrupt; simulate. + c.signalInfo = arch.SignalInfo{Signo: int32(platform.SignalInterrupt)} + return false + } + defer c.interrupt.Disable() + + // Ensure that the CPU set is bound appropriately; this makes the + // emulation below several times faster, presumably by avoiding + // interprocessor wakeups and by simplifying the schedule. + t.bind() + + // Set registers. + if err := t.setRegs(regs); err != nil { + panic(fmt.Sprintf("ptrace set regs failed: %v", err)) + } + if err := t.setFPRegs(fpState, uint64(fpLen), useXsave); err != nil { + panic(fmt.Sprintf("ptrace set fpregs failed: %v", err)) + } + + for { + // Start running until the next system call. 
+ if isSingleStepping(regs) { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_PTRACE, + syscall.PTRACE_SYSEMU_SINGLESTEP, + uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) + } + } else { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_PTRACE, + syscall.PTRACE_SYSEMU, + uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) + } + } + + // Wait for the syscall-enter stop. + sig := t.wait() + + // Refresh all registers. + if err := t.getRegs(regs); err != nil { + panic(fmt.Sprintf("ptrace get regs failed: %v", err)) + } + if err := t.getFPRegs(fpState, uint64(fpLen), useXsave); err != nil { + panic(fmt.Sprintf("ptrace get fpregs failed: %v", err)) + } + + // Is it a system call? + if sig == (0x80 | syscall.SIGTRAP) { + // Ensure registers are sane. + updateSyscallRegs(regs) + return true + } + + if sig == syscall.SIGSTOP { + // SIGSTOP was delivered to another thread in the same thread + // group, which initiated another group stop. Just ignore it. + continue + } + + // Grab signal information. + if err := t.getSignalInfo(&c.signalInfo); err != nil { + // Should never happen. + panic(fmt.Sprintf("ptrace get signal info failed: %v", err)) + } + + // We have a signal. We verify however, that the signal was + // either delivered from the kernel or from this process. We + // don't respect other signals. + if c.signalInfo.Code > 0 { + return false // kernel. + } else if c.signalInfo.Code <= 0 && c.signalInfo.Pid() == int32(os.Getpid()) { + return false // this process. + } + } +} + +// syscall executes the given system call without handling interruptions. +func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { + // Grab a thread. 
+ runtime.LockOSThread() + defer runtime.UnlockOSThread() + currentTID := int32(procid.Current()) + t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) + + return t.syscallIgnoreInterrupt(&s.initRegs, sysno, args...) +} + +// MapFile implements platform.AddressSpace.MapFile. +func (s *subprocess) MapFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + var flags int + if precommit { + flags |= syscall.MAP_POPULATE + } + _, err := s.syscall( + syscall.SYS_MMAP, + arch.SyscallArgument{Value: uintptr(addr)}, + arch.SyscallArgument{Value: uintptr(fr.Length())}, + arch.SyscallArgument{Value: uintptr(at.Prot())}, + arch.SyscallArgument{Value: uintptr(flags | syscall.MAP_SHARED | syscall.MAP_FIXED)}, + arch.SyscallArgument{Value: uintptr(fd)}, + arch.SyscallArgument{Value: uintptr(fr.Start)}) + return err +} + +// Unmap implements platform.AddressSpace.Unmap. +func (s *subprocess) Unmap(addr usermem.Addr, length uint64) { + ar, ok := addr.ToRange(length) + if !ok { + panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length)) + } + s.mu.Lock() + for c := range s.contexts { + c.mu.Lock() + if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) { + // Forget the last fault so that if c faults again, the fault isn't + // incorrectly reported as a write fault. If this is being called + // due to munmap() of the corresponding vma, handling of the second + // fault will fail anyway. + c.lastFaultSP = nil + delete(s.contexts, c) + } + c.mu.Unlock() + } + s.mu.Unlock() + _, err := s.syscall( + syscall.SYS_MUNMAP, + arch.SyscallArgument{Value: uintptr(addr)}, + arch.SyscallArgument{Value: uintptr(length)}) + if err != nil { + // We never expect this to happen. 
+ panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err)) + } +} diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go new file mode 100644 index 000000000..8211215df --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -0,0 +1,104 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ptrace + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + // maximumUserAddress is the largest possible user address. + maximumUserAddress = 0x7ffffffff000 + + // initRegsRipAdjustment is the size of the syscall instruction. + initRegsRipAdjustment = 2 +) + +// Linux kernel errnos which "should never be seen by user programs", but will +// be revealed to ptrace syscall exit tracing. +// +// These constants are used in subprocess.go. +const ( + ERESTARTSYS = syscall.Errno(512) + ERESTARTNOINTR = syscall.Errno(513) + ERESTARTNOHAND = syscall.Errno(514) +) + +// resetSysemuRegs sets up emulation registers. +// +// This should be called prior to calling sysemu. +func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) { + regs.Cs = s.initRegs.Cs + regs.Ss = s.initRegs.Ss + regs.Ds = s.initRegs.Ds + regs.Es = s.initRegs.Es + regs.Fs = s.initRegs.Fs + regs.Gs = s.initRegs.Gs +} + +// createSyscallRegs sets up syscall registers. 
+// +// This should be called to generate registers for a system call. +func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { + // Copy initial registers (RIP, segments, etc.). + regs := *initRegs + + // Set our syscall number. + regs.Rax = uint64(sysno) + if len(args) >= 1 { + regs.Rdi = args[0].Uint64() + } + if len(args) >= 2 { + regs.Rsi = args[1].Uint64() + } + if len(args) >= 3 { + regs.Rdx = args[2].Uint64() + } + if len(args) >= 4 { + regs.R10 = args[3].Uint64() + } + if len(args) >= 5 { + regs.R8 = args[4].Uint64() + } + if len(args) >= 6 { + regs.R9 = args[5].Uint64() + } + + return regs +} + +// isSingleStepping determines if the registers indicate single-stepping. +func isSingleStepping(regs *syscall.PtraceRegs) bool { + return (regs.Eflags & arch.X86TrapFlag) != 0 +} + +// updateSyscallRegs updates registers after finishing sysemu. +func updateSyscallRegs(regs *syscall.PtraceRegs) { + // Ptrace puts -ENOSYS in rax on syscall-enter-stop. + regs.Rax = regs.Orig_rax +} + +// syscallReturnValue extracts a sensible return from registers. +func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { + rval := int64(regs.Rax) + if rval < 0 { + return 0, syscall.Errno(-rval) + } + return uintptr(rval), nil +} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go new file mode 100644 index 000000000..227dd4882 --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -0,0 +1,146 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package ptrace
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+)
+
+// createStub creates a fresh stub process.
+//
+// Precondition: the runtime OS thread must be locked.
+func createStub() (*thread, error) {
+	// Declare all variables up front in order to ensure that there's no
+	// need for allocations between beforeFork & afterFork.
+	var (
+		pid   uintptr
+		ppid  uintptr
+		errno syscall.Errno
+	)
+
+	// Remember the current ppid for the pdeathsig race.
+	ppid, _, _ = syscall.RawSyscall(syscall.SYS_GETPID, 0, 0, 0)
+
+	// Among other things, beforeFork masks all signals.
+	beforeFork()
+	pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
+	if errno != 0 {
+		afterFork()
+		return nil, errno
+	}
+
+	// Is this the parent?
+	if pid != 0 {
+		// Among other things, restore signal mask.
+		afterFork()
+
+		// Initialize the first thread.
+		t := &thread{
+			tgid: int32(pid),
+			tid:  int32(pid),
+			cpu:  ^uint32(0),
+		}
+		// The child SIGSTOPs itself (via the stub); wait for that stop
+		// before attaching.
+		if sig := t.wait(); sig != syscall.SIGSTOP {
+			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
+		}
+		t.attach()
+
+		return t, nil
+	}
+
+	// afterForkInChild resets all signals to their default dispositions
+	// and restores the signal mask to its pre-fork state.
+	afterForkInChild()
+
+	// Explicitly unmask all signals to ensure that the tracer can see
+	// them.
+	errno = unmaskAllSignals()
+	if errno != 0 {
+		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+	}
+
+	// Call the stub; should not return.
+	stubCall(stubStart, ppid)
+	panic("unreachable")
+}
+
+// createStub creates a stub process as a child of an existing subprocess.
+//
+// Precondition: the runtime OS thread must be locked.
+func (s *subprocess) createStub() (*thread, error) {
+	// There's no need to lock the runtime thread here, as this can only be
+	// called from a context that is already locked.
+	currentTID := int32(procid.Current())
+	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
+
+	// Pass the expected PPID to the child via R15.
+	regs := s.initRegs
+	regs.R15 = uint64(t.tgid)
+
+	// Call fork in a subprocess.
+	//
+	// The new child must set up PDEATHSIG to ensure it dies if this
+	// process dies. Since this process could die at any time, this cannot
+	// be done via instrumentation from here.
+	//
+	// Instead, we create the child untraced, which will do the PDEATHSIG
+	// setup and then SIGSTOP itself for our attach below.
+	pid, err := t.syscallIgnoreInterrupt(
+		&regs,
+		syscall.SYS_CLONE,
+		arch.SyscallArgument{Value: uintptr(syscall.SIGCHLD | syscall.CLONE_FILES)},
+		arch.SyscallArgument{Value: 0},
+		arch.SyscallArgument{Value: 0},
+		arch.SyscallArgument{Value: 0},
+		arch.SyscallArgument{Value: 0},
+		arch.SyscallArgument{Value: 0})
+	if err != nil {
+		return nil, err
+	}
+
+	// Wait for child to enter group-stop, so we don't stop its
+	// bootstrapping work with t.attach below.
+	//
+	// We unfortunately don't have a handy part of memory to write the wait
+	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
+	// If the child actually exited, the attach below will fail.
+ _, err = t.syscallIgnoreInterrupt( + &s.initRegs, + syscall.SYS_WAIT4, + arch.SyscallArgument{Value: uintptr(pid)}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: syscall.WUNTRACED}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: 0}, + arch.SyscallArgument{Value: 0}) + if err != nil { + return nil, err + } + + childT := &thread{ + tgid: int32(pid), + tid: int32(pid), + cpu: ^uint32(0), + } + childT.attach() + + return childT, nil +} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go new file mode 100644 index 000000000..697431472 --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go @@ -0,0 +1,109 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 linux + +package ptrace + +import ( + "sync" + "sync/atomic" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// maskPool contains reusable CPU masks for setting affinity. Unfortunately, +// runtime.NumCPU doesn't actually record the number of CPUs on the system, it +// just records the number of CPUs available in the scheduler affinity set at +// startup. This may a) change over time and b) gives a number far lower than +// the maximum indexable CPU. To prevent lots of allocation in the hot path, we +// use a pool to store large masks that we can reuse during bind. 
+var maskPool = sync.Pool{ + New: func() interface{} { + const maxCPUs = 1024 // Not a hard limit; see below. + return make([]uintptr, maxCPUs/64) + }, +} + +// unmaskAllSignals unmasks all signals on the current thread. +// +//go:nosplit +func unmaskAllSignals() syscall.Errno { + var set linux.SignalSet + _, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) + return errno +} + +// getCPU gets the current CPU. +// +// Precondition: the current runtime thread should be locked. +func getCPU() (uint32, error) { + var cpu uintptr + if _, _, errno := syscall.RawSyscall( + unix.SYS_GETCPU, + uintptr(unsafe.Pointer(&cpu)), + 0, 0); errno != 0 { + return 0, errno + } + return uint32(cpu), nil +} + +// setCPU sets the CPU affinity. +func (t *thread) setCPU(cpu uint32) error { + mask := maskPool.Get().([]uintptr) + n := int(cpu / 64) + v := uintptr(1 << uintptr(cpu%64)) + if n >= len(mask) { + // See maskPool note above. We've actually exceeded the number + // of available cores. Grow the mask and return it. + mask = make([]uintptr, n+1) + } + mask[n] |= v + if _, _, errno := syscall.RawSyscall( + unix.SYS_SCHED_SETAFFINITY, + uintptr(t.tid), + uintptr(len(mask)*8), + uintptr(unsafe.Pointer(&mask[0]))); errno != 0 { + return errno + } + mask[n] &^= v + maskPool.Put(mask) + return nil +} + +// bind attempts to ensure that the thread is on the same CPU as the current +// thread. This provides no guarantees as it is fundamentally a racy operation: +// CPU sets may change and we may be rescheduled in the middle of this +// operation. As a result, no failures are reported. +// +// Precondition: the current runtime thread should be locked. 
+func (t *thread) bind() { + currentCPU, err := getCPU() + if err != nil { + return + } + if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU { + // Set the affinity on the thread and save the CPU for next + // round; we don't expect CPUs to bounce around too frequently. + // + // (It's worth noting that we could move CPUs between this point + // and when the tracee finishes executing. But that would be + // roughly the status quo anyways -- we're just maximizing our + // chances of colocation, not guaranteeing it.) + t.setCPU(currentCPU) + } +} diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go new file mode 100644 index 000000000..fe41641d3 --- /dev/null +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ptrace + +import ( + _ "unsafe" // required for go:linkname. 
+) + +//go:linkname beforeFork syscall.runtime_BeforeFork +func beforeFork() + +//go:linkname afterFork syscall.runtime_AfterFork +func afterFork() + +//go:linkname afterForkInChild syscall.runtime_AfterForkInChild +func afterForkInChild() diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD new file mode 100644 index 000000000..2df232a64 --- /dev/null +++ b/pkg/sentry/platform/ring0/BUILD @@ -0,0 +1,52 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") + +go_template( + name = "defs", + srcs = [ + "defs.go", + "defs_amd64.go", + "offsets_amd64.go", + "x86.go", + ], + visibility = [":__subpackages__"], +) + +go_template_instance( + name = "defs_impl", + out = "defs_impl.go", + package = "ring0", + template = ":defs", +) + +genrule( + name = "entry_impl_amd64", + srcs = ["entry_amd64.s"], + outs = ["entry_impl_amd64.s"], + cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", + tools = ["//pkg/sentry/platform/ring0/gen_offsets"], +) + +go_library( + name = "ring0", + srcs = [ + "defs_impl.go", + "entry_amd64.go", + "entry_impl_amd64.s", + "kernel.go", + "kernel_amd64.go", + "kernel_unsafe.go", + "lib_amd64.go", + "lib_amd64.s", + "ring0.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/cpuid", + "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go new file mode 100644 index 000000000..9d947b73d --- /dev/null +++ b/pkg/sentry/platform/ring0/defs.go @@ -0,0 +1,93 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ring0 + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +var ( + // UserspaceSize is the total size of userspace. + UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) + + // MaximumUserAddress is the largest possible user address. + MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) + + // KernelStartAddress is the starting kernel address. + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) +) + +// Kernel is a global kernel object. +// +// This contains global state, shared by multiple CPUs. +type Kernel struct { + KernelArchState +} + +// CPU is the per-CPU struct. +type CPU struct { + // self is a self reference. + // + // This is always guaranteed to be at offset zero. + self *CPU + + // kernel is reference to the kernel that this CPU was initialized + // with. This reference is kept for garbage collection purposes: CPU + // registers may refer to objects within the Kernel object that cannot + // be safely freed. + kernel *Kernel + + // CPUArchState is architecture-specific state. + CPUArchState + + // registers is a set of registers; these may be used on kernel system + // calls and exceptions via the Registers function. + registers syscall.PtraceRegs + + // KernelException handles an exception during kernel execution. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. 
+ // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelException func(Vector) + + // KernelSyscall is called for kernel system calls. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelSyscall func() +} + +// Registers returns a modifiable-copy of the kernel registers. +// +// This is explicitly safe to call during KernelException and KernelSyscall. +// +//go:nosplit +func (c *CPU) Registers() *syscall.PtraceRegs { + return &c.registers +} diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go new file mode 100644 index 000000000..bb3420125 --- /dev/null +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -0,0 +1,113 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +// Segment indices and Selectors. +const ( + // Index into GDT array. + _ = iota // Null descriptor first. + _ // Reserved (Linux is kernel 32). 
+ segKcode // Kernel code (64-bit). + segKdata // Kernel data. + segUcode32 // User code (32-bit). + segUdata // User data. + segUcode64 // User code (64-bit). + segTss // Task segment descriptor. + segTssHi // Upper bits for TSS. + segLast // Last segment (terminal, not included). +) + +// Selectors. +const ( + Kcode Selector = segKcode << 3 + Kdata Selector = segKdata << 3 + Ucode32 Selector = (segUcode32 << 3) | 3 + Udata Selector = (segUdata << 3) | 3 + Ucode64 Selector = (segUcode64 << 3) | 3 + Tss Selector = segTss << 3 +) + +// Standard segments. +var ( + UserCodeSegment32 SegmentDescriptor + UserDataSegment SegmentDescriptor + UserCodeSegment64 SegmentDescriptor + KernelCodeSegment SegmentDescriptor + KernelDataSegment SegmentDescriptor +) + +// KernelOpts has initialization options for the kernel. +type KernelOpts struct { + // PageTables are the kernel pagetables; this must be provided. + PageTables *pagetables.PageTables +} + +// KernelArchState contains architecture-specific state. +type KernelArchState struct { + KernelOpts + + // globalIDT is our set of interrupt gates. + globalIDT idt64 +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { + // stack is the stack used for interrupts on this CPU. + stack [256]byte + + // errorCode is the error code from the last exception. + errorCode uintptr + + // errorType indicates the type of error code here, it is always set + // along with the errorCode value above. + // + // It will either by 1, which indicates a user error, or 0 indicating a + // kernel error. If the error code below returns false (kernel error), + // then it cannot provide relevant information about the last + // exception. + errorType uintptr + + // gdt is the CPU's descriptor table. + gdt descriptorTable + + // tss is the CPU's task state. + tss TaskState64 +} + +// ErrorCode returns the last error code. 
+// +// The returned boolean indicates whether the error code corresponds to the +// last user error or not. If it does not, then fault information must be +// ignored. This is generally the result of a kernel fault while servicing a +// user fault. +// +//go:nosplit +func (c *CPU) ErrorCode() (value uintptr, user bool) { + return c.errorCode, c.errorType != 0 +} + +func init() { + KernelCodeSegment.setCode64(0, 0, 0) + KernelDataSegment.setData(0, 0xffffffff, 0) + UserCodeSegment32.setCode64(0, 0, 3) + UserDataSegment.setData(0, 0xffffffff, 3) + UserCodeSegment64.setCode64(0, 0, 3) +} diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go new file mode 100644 index 000000000..a3e992e0d --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -0,0 +1,128 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "syscall" +) + +// This is an assembly function. +// +// The sysenter function is invoked in two situations: +// +// (1) The guest kernel has executed a system call. +// (2) The guest application has executed a system call. +// +// The interrupt flag is examined to determine whether the system call was +// executed from kernel mode or not and the appropriate stub is called. +func sysenter() + +// swapgs swaps the current GS value. +// +// This must be called prior to sysret/iret. 
+func swapgs() + +// sysret returns to userspace from a system call. +// +// The return code is the vector that interrupted execution. +// +// See stubs.go for a note regarding the frame size of this function. +func sysret(*CPU, *syscall.PtraceRegs) Vector + +// "iret is the cadillac of CPL switching." +// +// -- Neel Natu +// +// iret is nearly identical to sysret, except an iret is used to fully restore +// all user state. This must be called in cases where all registers need to be +// restored. +func iret(*CPU, *syscall.PtraceRegs) Vector + +// exception is the generic exception entry. +// +// This is called by the individual stub definitions. +func exception() + +// resume is a stub that restores the CPU kernel registers. +// +// This is used when processing kernel exceptions and syscalls. +func resume() + +// Start is the CPU entrypoint. +// +// The following start conditions must be satisfied: +// +// * AX should contain the CPU pointer. +// * c.GDT() should be loaded as the GDT. +// * c.IDT() should be loaded as the IDT. +// * c.CR0() should be the current CR0 value. +// * c.CR3() should be set to the kernel PageTables. +// * c.CR4() should be the current CR4 value. +// * c.EFER() should be the current EFER value. +// +// The CPU state will be set to c.Registers(). +func Start() + +// Exception stubs. +func divideByZero() +func debug() +func nmi() +func breakpoint() +func overflow() +func boundRangeExceeded() +func invalidOpcode() +func deviceNotAvailable() +func doubleFault() +func coprocessorSegmentOverrun() +func invalidTSS() +func segmentNotPresent() +func stackSegmentFault() +func generalProtectionFault() +func pageFault() +func x87FloatingPointException() +func alignmentCheck() +func machineCheck() +func simdFloatingPointException() +func virtualizationException() +func securityException() +func syscallInt80() + +// Exception handler index. 
+var handlers = map[Vector]func(){ + DivideByZero: divideByZero, + Debug: debug, + NMI: nmi, + Breakpoint: breakpoint, + Overflow: overflow, + BoundRangeExceeded: boundRangeExceeded, + InvalidOpcode: invalidOpcode, + DeviceNotAvailable: deviceNotAvailable, + DoubleFault: doubleFault, + CoprocessorSegmentOverrun: coprocessorSegmentOverrun, + InvalidTSS: invalidTSS, + SegmentNotPresent: segmentNotPresent, + StackSegmentFault: stackSegmentFault, + GeneralProtectionFault: generalProtectionFault, + PageFault: pageFault, + X87FloatingPointException: x87FloatingPointException, + AlignmentCheck: alignmentCheck, + MachineCheck: machineCheck, + SIMDFloatingPointException: simdFloatingPointException, + VirtualizationException: virtualizationException, + SecurityException: securityException, + SyscallInt80: syscallInt80, +} diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s new file mode 100644 index 000000000..e8638133b --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -0,0 +1,334 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +// NB: Offsets are programatically generated (see BUILD). +// +// This file is concatenated with the definitions. + +// Saves a register set. +// +// This is a macro because it may need to executed in contents where a stack is +// not available for calls. 
+// +// The following registers are not saved: AX, SP, IP, FLAGS, all segments. +#define REGISTERS_SAVE(reg, offset) \ + MOVQ R15, offset+PTRACE_R15(reg); \ + MOVQ R14, offset+PTRACE_R14(reg); \ + MOVQ R13, offset+PTRACE_R13(reg); \ + MOVQ R12, offset+PTRACE_R12(reg); \ + MOVQ BP, offset+PTRACE_RBP(reg); \ + MOVQ BX, offset+PTRACE_RBX(reg); \ + MOVQ CX, offset+PTRACE_RCX(reg); \ + MOVQ DX, offset+PTRACE_RDX(reg); \ + MOVQ R11, offset+PTRACE_R11(reg); \ + MOVQ R10, offset+PTRACE_R10(reg); \ + MOVQ R9, offset+PTRACE_R9(reg); \ + MOVQ R8, offset+PTRACE_R8(reg); \ + MOVQ SI, offset+PTRACE_RSI(reg); \ + MOVQ DI, offset+PTRACE_RDI(reg); + +// Loads a register set. +// +// This is a macro because it may need to executed in contents where a stack is +// not available for calls. +// +// The following registers are not loaded: AX, SP, IP, FLAGS, all segments. +#define REGISTERS_LOAD(reg, offset) \ + MOVQ offset+PTRACE_R15(reg), R15; \ + MOVQ offset+PTRACE_R14(reg), R14; \ + MOVQ offset+PTRACE_R13(reg), R13; \ + MOVQ offset+PTRACE_R12(reg), R12; \ + MOVQ offset+PTRACE_RBP(reg), BP; \ + MOVQ offset+PTRACE_RBX(reg), BX; \ + MOVQ offset+PTRACE_RCX(reg), CX; \ + MOVQ offset+PTRACE_RDX(reg), DX; \ + MOVQ offset+PTRACE_R11(reg), R11; \ + MOVQ offset+PTRACE_R10(reg), R10; \ + MOVQ offset+PTRACE_R9(reg), R9; \ + MOVQ offset+PTRACE_R8(reg), R8; \ + MOVQ offset+PTRACE_RSI(reg), SI; \ + MOVQ offset+PTRACE_RDI(reg), DI; + +// SWAP_GS swaps the kernel GS (CPU). +#define SWAP_GS() \ + BYTE $0x0F; BYTE $0x01; BYTE $0xf8; + +// IRET returns from an interrupt frame. +#define IRET() \ + BYTE $0x48; BYTE $0xcf; + +// SYSRET64 executes the sysret instruction. +#define SYSRET64() \ + BYTE $0x48; BYTE $0x0f; BYTE $0x07; + +// LOAD_KERNEL_ADDRESS loads a kernel address. +#define LOAD_KERNEL_ADDRESS(from, to) \ + MOVQ from, to; \ + ORQ ·KernelStartAddress(SB), to; + +// LOAD_KERNEL_STACK loads the kernel stack. 
+#define LOAD_KERNEL_STACK(from) \ + LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \ + LEAQ CPU_STACK_TOP(SP), SP; + +// See kernel.go. +TEXT ·Halt(SB),NOSPLIT,$0 + HLT + RET + +// See kernel.go. +TEXT ·Current(SB),NOSPLIT,$0-8 + MOVQ CPU_SELF(GS), AX + MOVQ AX, ret+0(FP) + RET + +// See entry_amd64.go. +TEXT ·swapgs(SB),NOSPLIT,$0 + SWAP_GS() + RET + +// See entry_amd64.go. +TEXT ·sysret(SB),NOSPLIT,$0-24 + // Save original state. + LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) + LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) + MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) + + // Restore user register state. + REGISTERS_LOAD(AX, 0) + MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET. + MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET. + MOVQ PTRACE_RSP(AX), SP // Restore the stack directly. + MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + SYSRET64() + +// See entry_amd64.go. +TEXT ·iret(SB),NOSPLIT,$0-24 + // Save original state. + LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) + LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) + MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) + + // Build an IRET frame & restore state. + LOAD_KERNEL_STACK(BX) + MOVQ PTRACE_SS(AX), BX; PUSHQ BX + MOVQ PTRACE_RSP(AX), CX; PUSHQ CX + MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX + MOVQ PTRACE_CS(AX), DI; PUSHQ DI + MOVQ PTRACE_RIP(AX), SI; PUSHQ SI + REGISTERS_LOAD(AX, 0) // Restore most registers. + MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + IRET() + +// See entry_amd64.go. +TEXT ·resume(SB),NOSPLIT,$0 + // See iret, above. + MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX + MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX + MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI + MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI + REGISTERS_LOAD(GS, CPU_REGISTERS) + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX + IRET() + +// See entry_amd64.go. 
+TEXT ·Start(SB),NOSPLIT,$0 + LOAD_KERNEL_STACK(AX) // Set the stack. + PUSHQ $0x0 // Previous frame pointer. + MOVQ SP, BP // Set frame pointer. + PUSHQ AX // First argument (CPU). + CALL ·start(SB) // Call Go hook. + JMP ·resume(SB) // Restore to registers. + +// See entry_amd64.go. +TEXT ·sysenter(SB),NOSPLIT,$0 + // Interrupts are always disabled while we're executing in kernel mode + // and always enabled while executing in user mode. Therefore, we can + // reliably look at the flags in R11 to determine where this syscall + // was from. + TESTL $_RFLAGS_IF, R11 + JZ kernel + +user: + SWAP_GS() + XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks. + XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). + REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. + MOVQ BX, PTRACE_RAX(AX) // Save everything else. + MOVQ BX, PTRACE_ORIGRAX(AX) + MOVQ CX, PTRACE_RIP(AX) + MOVQ R11, PTRACE_FLAGS(AX) + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX) + MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. + MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + + // Return to the kernel, where the frame is: + // + // vector (sp+24) + // regs (sp+16) + // cpu (sp+8) + // vcpu.Switch (sp+0) + // + MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. + MOVQ $Syscall, 24(SP) // Output vector. + RET + +kernel: + // We can't restore the original stack, but we can access the registers + // in the CPU state directly. No need for temporary juggling. + MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) + REGISTERS_SAVE(GS, CPU_REGISTERS) + MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS) + MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS) + MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. + MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + + // Load the function stored in KernelSyscall. 
+ // + // Note that this function needs to be executed on the stack in case + // the runtime decides to make use of the redzone (grumble). This also + // protects against any functions that might not be go:nosplit, since + // this will cause a failure immediately. + LOAD_KERNEL_STACK(GS) + MOVQ CPU_KERNEL_SYSCALL(GS), DX // Function data. + MOVQ 0(DX), AX // Function pointer. + PUSHQ BP // Push the frame pointer. + MOVQ SP, BP // Set frame pointer value. + CALL *AX // Call the function. + POPQ BP // Restore the frame pointer. + JMP ·resume(SB) + +// exception is a generic exception handler. +// +// There are two cases handled: +// +// 1) An exception in kernel mode: this results in saving the state at the time +// of the exception and calling the defined hook. +// +// 2) An exception in guest mode: the original kernel frame is restored, and +// the vector & error codes are pushed as return values. +// +// See below for the stubs that call exception. +TEXT ·exception(SB),NOSPLIT,$0 + // Determine whether the exception occurred in kernel mode or user + // mode, based on the flags. We expect the following stack: + // + // SS (sp+48) + // SP (sp+40) + // FLAGS (sp+32) + // CS (sp+24) + // IP (sp+16) + // ERROR_CODE (sp+8) + // VECTOR (sp+0) + // + TESTL $_RFLAGS_IF, 32(SP) + JZ kernel + +user: + SWAP_GS() + XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). + REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. + MOVQ BX, PTRACE_RAX(AX) // Save everything else. + MOVQ BX, PTRACE_ORIGRAX(AX) + MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) + MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX) + MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX) + MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX) + MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) + + // Copy out and return. + MOVQ 0(SP), BX // Load vector. + MOVQ 8(SP), CX // Load error code. + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version). 
+ MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. + MOVQ CX, CPU_ERROR_CODE(GS) // Set error code. + MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + MOVQ BX, 24(SP) // Output vector. + RET + +kernel: + // As per above, we can save directly. + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) + MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) + REGISTERS_SAVE(GS, CPU_REGISTERS) + MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS) + MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS) + MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS) + + // Set the error code and adjust the stack. + MOVQ 8(SP), AX // Load the error code. + MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU. + MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ 0(SP), BX // BX contains the vector. + ADDQ $48, SP // Drop the exception frame. + + // Load the function stored in KernelException. + // + // See note above re: the kernel stack. + LOAD_KERNEL_STACK(GS) + MOVQ CPU_KERNEL_EXCEPTION(GS), DX // Function data. + MOVQ 0(DX), AX // Function pointer. + PUSHQ BP // Push the frame pointer. + MOVQ SP, BP // Set frame pointer value. + PUSHQ BX // First argument (vector). + CALL *AX // Call the function. + POPQ BX // Discard the argument. + POPQ BP // Restore the frame pointer. 
+ JMP ·resume(SB) + +#define EXCEPTION_WITH_ERROR(value, symbol) \ +TEXT symbol,NOSPLIT,$0; \ + PUSHQ $value; \ + JMP ·exception(SB); + +#define EXCEPTION_WITHOUT_ERROR(value, symbol) \ +TEXT symbol,NOSPLIT,$0; \ + PUSHQ $0x0; \ + PUSHQ $value; \ + JMP ·exception(SB); + +EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB)) +EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB)) +EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB)) +EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB)) +EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB)) +EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB)) +EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB)) +EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB)) +EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB)) +EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB)) +EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB)) +EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB)) +EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB)) +EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB)) +EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB)) +EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB)) +EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB)) +EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB)) +EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB)) +EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB)) +EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB)) +EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB)) diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD new file mode 100644 index 000000000..3bce56985 --- /dev/null +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -0,0 +1,25 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_binary") 
+load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "defs_impl", + out = "defs_impl.go", + package = "main", + template = "//pkg/sentry/platform/ring0:defs", +) + +go_binary( + name = "gen_offsets", + srcs = [ + "defs_impl.go", + "main.go", + ], + visibility = ["//pkg/sentry/platform/ring0:__pkg__"], + deps = [ + "//pkg/cpuid", + "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go new file mode 100644 index 000000000..ffa7eaf77 --- /dev/null +++ b/pkg/sentry/platform/ring0/gen_offsets/main.go @@ -0,0 +1,24 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary gen_offsets is a helper for generating offset headers. +package main + +import ( + "os" +) + +func main() { + Emit(os.Stdout) +} diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go new file mode 100644 index 000000000..b0471ab9a --- /dev/null +++ b/pkg/sentry/platform/ring0/kernel.go @@ -0,0 +1,71 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ring0 + +// New creates a new kernel. +// +// N.B. that constraints on KernelOpts must be satisfied. +// +// Init must have been called. +func New(opts KernelOpts) *Kernel { + k := new(Kernel) + k.init(opts) + return k +} + +// NewCPU creates a new CPU associated with this Kernel. +// +// Note that execution of the new CPU must begin at Start, with constraints as +// documented. Initialization is not completed by this method alone. +// +// See also Init. +func (k *Kernel) NewCPU() *CPU { + c := new(CPU) + c.Init(k) + return c +} + +// Halt halts execution. +func Halt() + +// Current returns the current CPU. +// +// Its use is only legal in the KernelSyscall and KernelException contexts, +// which must all be guarded go:nosplit. +func Current() *CPU + +// defaultSyscall is the default syscall hook. +// +//go:nosplit +func defaultSyscall() { Halt() } + +// defaultException is the default exception hook. +// +//go:nosplit +func defaultException(Vector) { Halt() } + +// Init allows the initialization of a CPU from a kernel without allocation. +// The same constraints as NewCPU apply. +// +// Init allows embedding in other objects. +func (c *CPU) Init(k *Kernel) { + c.self = c // Set self reference. + c.kernel = k // Set kernel reference. + c.init() // Perform architectural init. + + // Defaults. 
+ c.KernelSyscall = defaultSyscall + c.KernelException = defaultException +} diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go new file mode 100644 index 000000000..c82613a9c --- /dev/null +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -0,0 +1,280 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "encoding/binary" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +const ( + // KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _RFLAGS_RESERVED + + // UserFlagsSet are always set in userspace. + UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF + + // KernelFlagsClear should always be clear in the kernel. + KernelFlagsClear = _RFLAGS_IF | _RFLAGS_NT | _RFLAGS_IOPL + + // UserFlagsClear are always cleared in userspace. + UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL +) + +// init initializes architecture-specific state. +func (k *Kernel) init(opts KernelOpts) { + // Save the root page tables. + k.PageTables = opts.PageTables + + // Setup the IDT, which is uniform. + for v, handler := range handlers { + // Note that we set all traps to use the interrupt stack, this + // is defined below when setting up the TSS. + k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), 0 /* dpl */, 1 /* ist */) + } +} + +// init initializes architecture-specific state. 
+func (c *CPU) init() { + // Null segment. + c.gdt[0].setNull() + + // Kernel & user segments. + c.gdt[segKcode] = KernelCodeSegment + c.gdt[segKdata] = KernelDataSegment + c.gdt[segUcode32] = UserCodeSegment32 + c.gdt[segUdata] = UserDataSegment + c.gdt[segUcode64] = UserCodeSegment64 + + // The task segment, this spans two entries. + tssBase, tssLimit, _ := c.TSS() + c.gdt[segTss].set( + uint32(tssBase), + uint32(tssLimit), + 0, // Privilege level zero. + SegmentDescriptorPresent| + SegmentDescriptorAccess| + SegmentDescriptorWrite| + SegmentDescriptorExecute) + c.gdt[segTssHi].setHi(uint32((tssBase) >> 32)) + + // Set the kernel stack pointer in the TSS (virtual address). + stackAddr := c.StackTop() + c.tss.rsp0Lo = uint32(stackAddr) + c.tss.rsp0Hi = uint32(stackAddr >> 32) + c.tss.ist1Lo = uint32(stackAddr) + c.tss.ist1Hi = uint32(stackAddr >> 32) + + // Permanently set the kernel segments. + c.registers.Cs = uint64(Kcode) + c.registers.Ds = uint64(Kdata) + c.registers.Es = uint64(Kdata) + c.registers.Ss = uint64(Kdata) + c.registers.Fs = uint64(Kdata) + c.registers.Gs = uint64(Kdata) +} + +// StackTop returns the kernel's stack address. +// +//go:nosplit +func (c *CPU) StackTop() uint64 { + return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) +} + +// IDT returns the CPU's IDT base and limit. +// +//go:nosplit +func (c *CPU) IDT() (uint64, uint16) { + return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1) +} + +// GDT returns the CPU's GDT base and limit. +// +//go:nosplit +func (c *CPU) GDT() (uint64, uint16) { + return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1) +} + +// TSS returns the CPU's TSS base, limit and value. +// +//go:nosplit +func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { + return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss] +} + +// CR0 returns the CPU's CR0 value. 
+//
+//go:nosplit
+func (c *CPU) CR0() uint64 {
+	return _CR0_PE | _CR0_PG | _CR0_ET
+}
+
+// CR4 returns the CPU's CR4 value.
+//
+//go:nosplit
+func (c *CPU) CR4() uint64 {
+	cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT)
+	if hasPCID {
+		cr4 |= _CR4_PCIDE
+	}
+	if hasXSAVE {
+		cr4 |= _CR4_OSXSAVE
+	}
+	if hasSMEP {
+		cr4 |= _CR4_SMEP
+	}
+	if hasFSGSBASE {
+		cr4 |= _CR4_FSGSBASE
+	}
+	return cr4
+}
+
+// EFER returns the CPU's EFER value.
+//
+//go:nosplit
+func (c *CPU) EFER() uint64 {
+	return _EFER_LME | _EFER_SCE | _EFER_NX
+}
+
+// IsCanonical indicates whether addr is canonical per the amd64 spec.
+//
+//go:nosplit
+func IsCanonical(addr uint64) bool {
+	return addr <= 0x00007fffffffffff || addr > 0xffff800000000000
+}
+
+// Flags contains flags related to switch.
+type Flags uintptr
+
+const (
+	// FlagFull indicates that a full restore should be done, not a fast
+	// restore (on the syscall return path.)
+	FlagFull = 1 << iota
+
+	// FlagFlush indicates that a full TLB flush is required.
+	FlagFlush
+)
+
+// SwitchToUser performs either a sysret or an iret.
+//
+// The return value is the vector that interrupted execution.
+//
+// This function will not split the stack. Callers will probably want to call
+// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to
+// calling this function.
+//
+// When this is done, this region is quite sensitive to things like system
+// calls. After calling entersyscall, any memory used must have been allocated
+// and no function calls without go:nosplit are permitted. Any calls made here
+// are protected appropriately (e.g. IsCanonical and CR3).
+//
+// Also note that this function transitively depends on the compiler generating
+// code that uses IP-relative addressing inside of absolute addresses. That's
+// the case for amd64, but may not be the case for other architectures.
+// +//go:nosplit +func (c *CPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags Flags) (vector Vector) { + // Check for canonical addresses. + if !IsCanonical(regs.Rip) || !IsCanonical(regs.Rsp) || !IsCanonical(regs.Fs_base) || !IsCanonical(regs.Gs_base) { + return GeneralProtectionFault + } + + var ( + userCR3 uint64 + kernelCR3 uint64 + ) + + // Sanitize registers. + if flags&FlagFlush != 0 { + userCR3 = pt.FlushCR3() + } else { + userCR3 = pt.CR3() + } + regs.Eflags &= ^uint64(UserFlagsClear) + regs.Eflags |= UserFlagsSet + regs.Cs = uint64(Ucode64) // Required for iret. + regs.Ss = uint64(Udata) // Ditto. + kernelCR3 = c.kernel.PageTables.CR3() + + // Perform the switch. + swapgs() // GS will be swapped on return. + wrfs(uintptr(regs.Fs_base)) // Set application FS. + wrgs(uintptr(regs.Gs_base)) // Set application GS. + LoadFloatingPoint(fpState) // Copy in floating point. + jumpToKernel() // Switch to upper half. + writeCR3(uintptr(userCR3)) // Change to user address space. + if flags&FlagFull != 0 { + vector = iret(c, regs) + } else { + vector = sysret(c, regs) + } + writeCR3(uintptr(kernelCR3)) // Return to kernel address space. + jumpToUser() // Return to lower half. + SaveFloatingPoint(fpState) // Copy out floating point. + wrfs(uintptr(c.registers.Fs_base)) // Restore kernel FS. + return +} + +// start is the CPU entrypoint. +// +// This is called from the Start asm stub (see entry_amd64.go); on return the +// registers in c.registers will be restored (not segments). +// +//go:nosplit +func start(c *CPU) { + // Save per-cpu & FS segment. + wrgs(kernelAddr(c)) + wrfs(uintptr(c.Registers().Fs_base)) + + // Initialize floating point. + // + // Note that on skylake, the valid XCR0 mask reported seems to be 0xff. 
+	// This breaks down as:
+	//
+	// bit0   - x87
+	// bit1   - SSE
+	// bit2   - AVX
+	// bit3-4 - MPX
+	// bit5-7 - AVX512
+	//
+	// For some reason, enabling MPX & AVX512 on platforms that report them
+	// seems to cause a general protection fault. (Maybe there are some
+	// virtualization issues and these aren't exported to the guest cpuid.)
+	// This needs further investigation, but we can limit the floating
+	// point operations to x87, SSE & AVX for now.
+	fninit()
+	xsetbv(0, validXCR0Mask&0x7)
+
+	// Set the syscall target.
+	wrmsr(_MSR_LSTAR, kernelFunc(sysenter))
+	wrmsr(_MSR_SYSCALL_MASK, _RFLAGS_STEP|_RFLAGS_IF|_RFLAGS_DF|_RFLAGS_IOPL|_RFLAGS_AC|_RFLAGS_NT)
+
+	// NOTE: This depends on having the 64-bit segments immediately
+	// following the 32-bit user segments. This is simply the way the
+	// sysret instruction is designed to work (it assumes they follow).
+	wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
+	wrmsr(_MSR_CSTAR, kernelFunc(sysenter))
+}
+
+// ReadCR2 reads the current CR2 value.
+//
+//go:nosplit
+func ReadCR2() uintptr {
+	return readCR2()
+}
diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go
new file mode 100644
index 000000000..cfb3ad853
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package ring0 + +import ( + "unsafe" +) + +// eface mirrors runtime.eface. +type eface struct { + typ uintptr + data unsafe.Pointer +} + +// kernelAddr returns the kernel virtual address for the given object. +// +//go:nosplit +func kernelAddr(obj interface{}) uintptr { + e := (*eface)(unsafe.Pointer(&obj)) + return KernelStartAddress | uintptr(e.data) +} + +// kernelFunc returns the address of the given function. +// +//go:nosplit +func kernelFunc(fn func()) uintptr { + fnptr := (**uintptr)(unsafe.Pointer(&fn)) + return KernelStartAddress | **fnptr +} diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go new file mode 100644 index 000000000..f1ed5bfb4 --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -0,0 +1,128 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "gvisor.googlesource.com/gvisor/pkg/cpuid" +) + +// LoadFloatingPoint loads floating point state by the most efficient mechanism +// available (set by Init). +var LoadFloatingPoint func(*byte) + +// SaveFloatingPoint saves floating point state by the most efficient mechanism +// available (set by Init). +var SaveFloatingPoint func(*byte) + +// fxrstor uses fxrstor64 to load floating point state. +func fxrstor(*byte) + +// xrstor uses xrstor to load floating point state. +func xrstor(*byte) + +// fxsave uses fxsave64 to save floating point state. 
+func fxsave(*byte) + +// xsave uses xsave to save floating point state. +func xsave(*byte) + +// xsaveopt uses xsaveopt to save floating point state. +func xsaveopt(*byte) + +// wrfs sets the FS address (set by init). +var wrfs func(addr uintptr) + +// wrfsbase writes to the FS base address. +func wrfsbase(addr uintptr) + +// wrfsmsr writes to the FS_BASE MSR. +func wrfsmsr(addr uintptr) + +// wrgs sets the GS address (set by init). +var wrgs func(addr uintptr) + +// wrgsbase writes to the GS base address. +func wrgsbase(addr uintptr) + +// wrgsmsr writes to the GS_BASE MSR. +func wrgsmsr(addr uintptr) + +// writeCR3 writes the CR3 value. +func writeCR3(phys uintptr) + +// readCR2 reads the current CR2 value. +func readCR2() uintptr + +// jumpToKernel jumps to the kernel version of the current RIP. +func jumpToKernel() + +// jumpToUser jumps to the user version of the current RIP. +func jumpToUser() + +// fninit initializes the floating point unit. +func fninit() + +// xsetbv writes to an extended control register. +func xsetbv(reg, value uintptr) + +// xgetbv reads an extended control register. +func xgetbv(reg uintptr) uintptr + +// wrmsr writes to the given MSR. +func wrmsr(reg, value uintptr) + +// rdmsr reads the given MSR. +func rdmsr(reg uintptr) uintptr + +// Mostly-constants set by Init. +var ( + hasSMEP bool + hasPCID bool + hasXSAVEOPT bool + hasXSAVE bool + hasFSGSBASE bool + validXCR0Mask uintptr +) + +// Init sets function pointers based on architectural features. +// +// This must be called prior to using ring0. 
+func Init(featureSet *cpuid.FeatureSet) { + hasSMEP = featureSet.HasFeature(cpuid.X86FeatureSMEP) + hasPCID = featureSet.HasFeature(cpuid.X86FeaturePCID) + hasXSAVEOPT = featureSet.UseXsaveopt() + hasXSAVE = featureSet.UseXsave() + hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase) + validXCR0Mask = uintptr(featureSet.ValidXCR0Mask()) + if hasXSAVEOPT { + SaveFloatingPoint = xsaveopt + LoadFloatingPoint = xrstor + } else if hasXSAVE { + SaveFloatingPoint = xsave + LoadFloatingPoint = xrstor + } else { + SaveFloatingPoint = fxsave + LoadFloatingPoint = fxrstor + } + if hasFSGSBASE { + wrfs = wrfsbase + wrgs = wrgsbase + } else { + wrfs = wrfsmsr + wrgs = wrgsmsr + } +} diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s new file mode 100644 index 000000000..6f143ea5a --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -0,0 +1,247 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +// fxrstor loads floating point state. +// +// The code corresponds to: +// +// fxrstor64 (%rbx) +// +TEXT ·fxrstor(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), BX + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x0b; + RET + +// xrstor loads floating point state. 
+// +// The code corresponds to: +// +// xrstor (%rdi) +// +TEXT ·xrstor(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f; + RET + +// fxsave saves floating point state. +// +// The code corresponds to: +// +// fxsave64 (%rbx) +// +TEXT ·fxsave(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), BX + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x03; + RET + +// xsave saves floating point state. +// +// The code corresponds to: +// +// xsave (%rdi) +// +TEXT ·xsave(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; + RET + +// xsaveopt saves floating point state. +// +// The code corresponds to: +// +// xsaveopt (%rdi) +// +TEXT ·xsaveopt(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; + RET + +// wrfsbase writes to the FS base. +// +// The code corresponds to: +// +// wrfsbase %rax +// +TEXT ·wrfsbase(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd0; + RET + +// wrfsmsr writes to the FSBASE MSR. +// +// The code corresponds to: +// +// wrmsr (writes EDX:EAX to the MSR in ECX) +// +TEXT ·wrfsmsr(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + MOVQ AX, DX + SHRQ $32, DX + MOVQ $0xc0000100, CX // MSR_FS_BASE + BYTE $0x0f; BYTE $0x30; + RET + +// wrgsbase writes to the GS base. +// +// The code corresponds to: +// +// wrgsbase %rax +// +TEXT ·wrgsbase(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd8; + RET + +// wrgsmsr writes to the GSBASE MSR. +// +// See wrfsmsr. +TEXT ·wrgsmsr(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + MOVQ AX, DX + SHRQ $32, DX + MOVQ $0xc0000101, CX // MSR_GS_BASE + BYTE $0x0f; BYTE $0x30; // WRMSR + RET + +// jumpToUser changes execution to the user address. 
+// +// This works by changing the return value to the user version. +TEXT ·jumpToUser(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + MOVQ ·KernelStartAddress(SB), BX + NOTQ BX + ANDQ BX, SP // Switch the stack. + ANDQ BX, BP // Switch the frame pointer. + ANDQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + +// jumpToKernel changes execution to the kernel address space. +// +// This works by changing the return value to the kernel version. +TEXT ·jumpToKernel(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + MOVQ ·KernelStartAddress(SB), BX + ORQ BX, SP // Switch the stack. + ORQ BX, BP // Switch the frame pointer. + ORQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + +// writeCR3 writes the given CR3 value. +// +// The code corresponds to: +// +// mov %rax, %cr3 +// +TEXT ·writeCR3(SB),NOSPLIT,$0-8 + MOVQ cr3+0(FP), AX + BYTE $0x0f; BYTE $0x22; BYTE $0xd8; + RET + +// readCR3 reads the current CR3 value. +// +// The code corresponds to: +// +// mov %cr3, %rax +// +TEXT ·readCR3(SB),NOSPLIT,$0-8 + BYTE $0x0f; BYTE $0x20; BYTE $0xd8; + MOVQ AX, ret+0(FP) + RET + +// readCR2 reads the current CR2 value. +// +// The code corresponds to: +// +// mov %cr2, %rax +// +TEXT ·readCR2(SB),NOSPLIT,$0-8 + BYTE $0x0f; BYTE $0x20; BYTE $0xd0; + MOVQ AX, ret+0(FP) + RET + +// fninit initializes the floating point unit. +// +// The code corresponds to: +// +// fninit +TEXT ·fninit(SB),NOSPLIT,$0 + BYTE $0xdb; BYTE $0xe3; + RET + +// xsetbv writes to an extended control register. +// +// The code corresponds to: +// +// xsetbv +// +TEXT ·xsetbv(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + MOVL value+8(FP), AX + MOVL value+12(FP), DX + BYTE $0x0f; BYTE $0x01; BYTE $0xd1; + RET + +// xgetbv reads an extended control register. +// +// The code corresponds to: +// +// xgetbv +// +TEXT ·xgetbv(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0; + MOVL AX, ret+8(FP) + MOVL DX, ret+12(FP) + RET + +// wrmsr writes to a control register. 
+// +// The code corresponds to: +// +// wrmsr +// +TEXT ·wrmsr(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + MOVL value+8(FP), AX + MOVL value+12(FP), DX + BYTE $0x0f; BYTE $0x30; + RET + +// rdmsr reads a control register. +// +// The code corresponds to: +// +// rdmsr +// +TEXT ·rdmsr(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + BYTE $0x0f; BYTE $0x32; + MOVL AX, ret+8(FP) + MOVL DX, ret+12(FP) + RET diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go new file mode 100644 index 000000000..9acd442ba --- /dev/null +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -0,0 +1,93 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "fmt" + "io" + "reflect" + "syscall" +) + +// Emit prints architecture-specific offsets. 
+func Emit(w io.Writer) { + fmt.Fprintf(w, "// Automatically generated, do not edit.\n") + + c := &CPU{} + fmt.Fprintf(w, "\n// CPU offsets.\n") + fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) + fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_KERNEL_EXCEPTION 0x%02x\n", reflect.ValueOf(&c.KernelException).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_KERNEL_SYSCALL 0x%02x\n", reflect.ValueOf(&c.KernelSyscall).Pointer()-reflect.ValueOf(c).Pointer()) + + fmt.Fprintf(w, "\n// Bits.\n") + fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + + fmt.Fprintf(w, "\n// Vectors.\n") + fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero) + fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug) + fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI) + fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint) + fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow) + fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded) + fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode) + fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable) + fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault) + fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun) + fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS) + fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent) + fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", 
StackSegmentFault) + fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault) + fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) + fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException) + fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck) + fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck) + fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException) + fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) + fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException) + fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80) + fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + + p := &syscall.PtraceRegs{} + fmt.Fprintf(w, "\n// Ptrace registers.\n") + fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", 
reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) +} diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD new file mode 100644 index 000000000..c0c481ab3 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -0,0 +1,32 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "pagetables", + srcs = [ + "pagetables.go", + "pagetables_amd64.go", + "pagetables_unsafe.go", + "pagetables_x86.go", + "pcids_x86.go", + ], + importpath = 
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables", + visibility = [ + "//pkg/sentry/platform/kvm:__subpackages__", + "//pkg/sentry/platform/ring0:__subpackages__", + ], + deps = ["//pkg/sentry/usermem"], +) + +go_test( + name = "pagetables_test", + size = "small", + srcs = [ + "pagetables_test.go", + "pagetables_x86_test.go", + "pcids_x86_test.go", + ], + embed = [":pagetables"], + deps = ["//pkg/sentry/usermem"], +) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go new file mode 100644 index 000000000..3cbf0bfa5 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -0,0 +1,193 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pagetables provides a generic implementation of pagetables. +package pagetables + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Node is a single node within a set of page tables. +type Node struct { + // unalignedData has unaligned data. Unfortunately, we can't really + // rely on the allocator to give us what we want here. So we just throw + // it at the wall and use the portion that matches. Gross. This may be + // changed in the future to use a different allocation mechanism. + // + // Access must happen via functions found in pagetables_unsafe.go. 
+ unalignedData [(2 * usermem.PageSize) - 1]byte + + // physical is the translated address of these entries. + // + // This is filled in at creation time. + physical uintptr +} + +// PageTables is a set of page tables. +type PageTables struct { + mu sync.Mutex + + // root is the pagetable root. + root *Node + + // translater is the translater passed at creation. + translater Translater + + // archPageTables includes architecture-specific features. + archPageTables + + // allNodes is a set of nodes indexed by translater address. + allNodes map[uintptr]*Node +} + +// Translater translates to guest physical addresses. +type Translater interface { + // TranslateToPhysical translates the given pointer object into a + // "physical" address. We do not require that it translates back, the + // reverse mapping is maintained internally. + TranslateToPhysical(*PTEs) uintptr +} + +// New returns new PageTables. +func New(t Translater, opts Opts) *PageTables { + p := &PageTables{ + translater: t, + allNodes: make(map[uintptr]*Node), + } + p.root = p.allocNode() + p.init(opts) + return p +} + +// New returns a new set of PageTables derived from the given one. +// +// This function should always be preferred to New if there are existing +// pagetables, as this function preserves architectural constraints relevant to +// managing multiple sets of pagetables. +func (p *PageTables) New() *PageTables { + np := &PageTables{ + translater: p.translater, + allNodes: make(map[uintptr]*Node), + } + np.root = np.allocNode() + np.initFrom(&p.archPageTables) + return np +} + +// setPageTable sets the given index as a page table. +func (p *PageTables) setPageTable(n *Node, index int, child *Node) { + phys := p.translater.TranslateToPhysical(child.PTEs()) + p.allNodes[phys] = child + pte := &n.PTEs()[index] + pte.setPageTable(phys) +} + +// clearPageTable clears the given entry. 
+func (p *PageTables) clearPageTable(n *Node, index int) { + pte := &n.PTEs()[index] + physical := pte.Address() + pte.Clear() + delete(p.allNodes, physical) +} + +// getPageTable returns the page table entry. +func (p *PageTables) getPageTable(n *Node, index int) *Node { + pte := &n.PTEs()[index] + physical := pte.Address() + child := p.allNodes[physical] + return child +} + +// Map installs a mapping with the given physical address. +// +// True is returned iff there was a previous mapping in the range. +// +// Precondition: addr & length must be aligned, their sum must not overflow. +func (p *PageTables) Map(addr usermem.Addr, length uintptr, user bool, at usermem.AccessType, physical uintptr) bool { + if at == usermem.NoAccess { + return p.Unmap(addr, length) + } + prev := false + p.mu.Lock() + end, ok := addr.AddLength(uint64(length)) + if !ok { + panic("pagetables.Map: overflow") + } + p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) { + p := physical + (s - uintptr(addr)) + prev = prev || (pte.Valid() && (p != pte.Address() || at.Write != pte.Writeable() || at.Execute != pte.Executable())) + if p&align != 0 { + // We will install entries at a smaller granulaity if + // we don't install a valid entry here, however we must + // zap any existing entry to ensure this happens. + pte.Clear() + return + } + pte.Set(p, at.Write, at.Execute, user) + }) + p.mu.Unlock() + return prev +} + +// Unmap unmaps the given range. +// +// True is returned iff there was a previous mapping in the range. +func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { + p.mu.Lock() + count := 0 + p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) { + pte.Clear() + count++ + }) + p.mu.Unlock() + return count > 0 +} + +// Release releases this address space. +// +// This must be called to release the PCID. +func (p *PageTables) Release() { + // Clear all pages. 
+ p.Unmap(0, ^uintptr(0)) + p.release() +} + +// Lookup returns the physical address for the given virtual address. +func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType usermem.AccessType) { + mask := uintptr(usermem.PageSize - 1) + off := uintptr(addr) & mask + addr = addr &^ usermem.Addr(mask) + p.iterateRange(uintptr(addr), uintptr(addr+usermem.PageSize), false, func(s, e uintptr, pte *PTE, align uintptr) { + if !pte.Valid() { + return + } + physical = pte.Address() + (s - uintptr(addr)) + off + accessType = usermem.AccessType{ + Read: true, + Write: pte.Writeable(), + Execute: pte.Executable(), + } + }) + return physical, accessType +} + +// allocNode allocates a new page. +func (p *PageTables) allocNode() *Node { + n := new(Node) + n.physical = p.translater.TranslateToPhysical(n.PTEs()) + return n +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go new file mode 100644 index 000000000..b89665c96 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -0,0 +1,397 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package pagetables + +import ( + "fmt" + "sync/atomic" +) + +// Address constraints. +// +// The lowerTop and upperBottom currently apply to four-level pagetables; +// additional refactoring would be necessary to support five-level pagetables. 
+const ( + lowerTop = 0x00007fffffffffff + upperBottom = 0xffff800000000000 + + pteShift = 12 + pmdShift = 21 + pudShift = 30 + pgdShift = 39 + + pteMask = 0x1ff << pteShift + pmdMask = 0x1ff << pmdShift + pudMask = 0x1ff << pudShift + pgdMask = 0x1ff << pgdShift + + pteSize = 1 << pteShift + pmdSize = 1 << pmdShift + pudSize = 1 << pudShift + pgdSize = 1 << pgdShift +) + +// Bits in page table entries. +const ( + present = 0x001 + writable = 0x002 + user = 0x004 + writeThrough = 0x008 + cacheDisable = 0x010 + accessed = 0x020 + dirty = 0x040 + super = 0x080 + executeDisable = 1 << 63 +) + +// PTE is a page table entry. +type PTE uint64 + +// Clear clears this PTE, including super page information. +func (p *PTE) Clear() { + atomic.StoreUint64((*uint64)(p), 0) +} + +// Valid returns true iff this entry is valid. +func (p *PTE) Valid() bool { + return atomic.LoadUint64((*uint64)(p))&present != 0 +} + +// Writeable returns true iff the page is writable. +func (p *PTE) Writeable() bool { + return atomic.LoadUint64((*uint64)(p))&writable != 0 +} + +// User returns true iff the page is user-accessible. +func (p *PTE) User() bool { + return atomic.LoadUint64((*uint64)(p))&user != 0 +} + +// Executable returns true iff the page is executable. +func (p *PTE) Executable() bool { + return atomic.LoadUint64((*uint64)(p))&executeDisable == 0 +} + +// SetSuper sets this page as a super page. +// +// The page must not be valid or a panic will result. +func (p *PTE) SetSuper() { + if p.Valid() { + // This is not allowed. + panic("SetSuper called on valid page!") + } + atomic.StoreUint64((*uint64)(p), super) +} + +// IsSuper returns true iff this page is a super page. +func (p *PTE) IsSuper() bool { + return atomic.LoadUint64((*uint64)(p))&super != 0 +} + +// Set sets this PTE value. 
+func (p *PTE) Set(addr uintptr, write, execute bool, userAccessible bool) { + v := uint64(addr)&^uint64(0xfff) | present | accessed + if userAccessible { + v |= user + } + if !execute { + v |= executeDisable + } + if write { + v |= writable | dirty + } + if p.IsSuper() { + v |= super + } + atomic.StoreUint64((*uint64)(p), v) +} + +// setPageTable sets this PTE value and forces the write bit and super bit to +// be cleared. This is used explicitly for breaking super pages. +func (p *PTE) setPageTable(addr uintptr) { + v := uint64(addr)&^uint64(0xfff) | present | user | writable | accessed | dirty + atomic.StoreUint64((*uint64)(p), v) +} + +// Address extracts the address. This should only be used if Valid returns true. +func (p *PTE) Address() uintptr { + return uintptr(atomic.LoadUint64((*uint64)(p)) & ^uint64(executeDisable|0xfff)) +} + +// entriesPerPage is the number of PTEs per page. +const entriesPerPage = 512 + +// PTEs is a collection of entries. +type PTEs [entriesPerPage]PTE + +// next returns the next address quantized by the given size. +func next(start uint64, size uint64) uint64 { + start &= ^(size - 1) + start += size + return start +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If alloc is set, then Set _must_ be called on all given PTEs. The exception +// is super pages. If a valid super page cannot be installed, then the walk +// will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if alloc set, then no gaps will be present. However, if alloc is +// not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: startAddr and endAddr must be page-aligned. 
+// +// Precondition: startStart must be less than endAddr. +// +// Precondition: If alloc is set, then startAddr and endAddr should not span +// non-canonical ranges. If they do, a panic will result. +func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) { + start := uint64(startAddr) + end := uint64(endAddr) + if start%pteSize != 0 { + panic(fmt.Sprintf("unaligned start: %v", start)) + } + if start > end { + panic(fmt.Sprintf("start > end (%v > %v))", start, end)) + } + + // Deal with cases where we traverse the "gap". + // + // These are all explicitly disallowed if alloc is set, and we must + // traverse an entry for each address explicitly. + switch { + case start < lowerTop && end > lowerTop && end < upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(startAddr, lowerTop, false, fn) + return + case start < lowerTop && end > lowerTop: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(startAddr, lowerTop, false, fn) + p.iterateRange(upperBottom, endAddr, false, fn) + return + case start > lowerTop && end < upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + return + case start > lowerTop && start < upperBottom && end > upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(upperBottom, endAddr, false, fn) + return + } + + for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + pgdEntry := &p.root.PTEs()[pgdIndex] + if !pgdEntry.Valid() { + if !alloc { + // Skip over this entry. + start = next(start, pgdSize) + continue + } + + // Allocate a new pgd. + p.setPageTable(p.root, pgdIndex, p.allocNode()) + } + + // Map the next level. 
+ pudNode := p.getPageTable(p.root, pgdIndex) + clearPUDEntries := 0 + + for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + pudEntry := &(pudNode.PTEs()[pudIndex]) + if !pudEntry.Valid() { + if !alloc { + // Skip over this entry. + clearPUDEntries++ + start = next(start, pudSize) + continue + } + + // This level has 1-GB super pages. Is this + // entire region contained in a single PUD + // entry? If so, we can skip allocating a new + // page for the pmd. + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = next(start, pudSize) + continue + } + } + + // Allocate a new pud. + p.setPageTable(pudNode, pudIndex, p.allocNode()) + + } else if pudEntry.IsSuper() { + // Does this page need to be split? + if start&(pudSize-1) != 0 || end < next(start, pudSize) { + currentAddr := uint64(pudEntry.Address()) + writeable := pudEntry.Writeable() + executable := pudEntry.Executable() + user := pudEntry.User() + + // Install the relevant entries. + pmdNode := p.allocNode() + pmdEntries := pmdNode.PTEs() + for index := 0; index < entriesPerPage; index++ { + pmdEntry := &pmdEntries[index] + pmdEntry.SetSuper() + pmdEntry.Set(uintptr(currentAddr), writeable, executable, user) + currentAddr += pmdSize + } + + // Reset to point to the new page. + p.setPageTable(pudNode, pudIndex, pmdNode) + } else { + // A super page to be checked directly. + fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) + + // Might have been cleared. + if !pudEntry.Valid() { + clearPUDEntries++ + } + + // Note that the super page was changed. + start = next(start, pudSize) + continue + } + } + + // Map the next level, since this is valid. 
+ pmdNode := p.getPageTable(pudNode, pudIndex) + clearPMDEntries := 0 + + for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + pmdEntry := &pmdNode.PTEs()[pmdIndex] + if !pmdEntry.Valid() { + if !alloc { + // Skip over this entry. + clearPMDEntries++ + start = next(start, pmdSize) + continue + } + + // This level has 2-MB huge pages. If this + // region is contined in a single PMD entry? + // As above, we can skip allocating a new page. + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = next(start, pmdSize) + continue + } + } + + // Allocate a new pmd. + p.setPageTable(pmdNode, pmdIndex, p.allocNode()) + + } else if pmdEntry.IsSuper() { + // Does this page need to be split? + if start&(pmdSize-1) != 0 || end < next(start, pmdSize) { + currentAddr := uint64(pmdEntry.Address()) + writeable := pmdEntry.Writeable() + executable := pmdEntry.Executable() + user := pmdEntry.User() + + // Install the relevant entries. + pteNode := p.allocNode() + pteEntries := pteNode.PTEs() + for index := 0; index < entriesPerPage; index++ { + pteEntry := &pteEntries[index] + pteEntry.Set(uintptr(currentAddr), writeable, executable, user) + currentAddr += pteSize + } + + // Reset to point to the new page. + p.setPageTable(pmdNode, pmdIndex, pteNode) + } else { + // A huge page to be checked directly. + fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) + + // Might have been cleared. + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + // Note that the huge page was changed. + start = next(start, pmdSize) + continue + } + } + + // Map the next level, since this is valid. 
+ pteNode := p.getPageTable(pmdNode, pmdIndex) + clearPTEEntries := 0 + + for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + pteEntry := &pteNode.PTEs()[pteIndex] + if !pteEntry.Valid() && !alloc { + clearPTEEntries++ + start += pteSize + continue + } + + // At this point, we are guaranteed that start%pteSize == 0. + fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if alloc { + panic("PTE not set after iteration with alloc=true!") + } + clearPTEEntries++ + } + + // Note that the pte was changed. + start += pteSize + continue + } + + // Check if we no longer need this page. + if clearPTEEntries == entriesPerPage { + p.clearPageTable(pmdNode, pmdIndex) + clearPMDEntries++ + } + } + + // Check if we no longer need this page. + if clearPMDEntries == entriesPerPage { + p.clearPageTable(pudNode, pudIndex) + clearPUDEntries++ + } + } + + // Check if we no longer need this page. + if clearPUDEntries == entriesPerPage { + p.clearPageTable(p.root, pgdIndex) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go new file mode 100644 index 000000000..9cbc0e3b0 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -0,0 +1,161 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package pagetables + +import ( + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type reflectTranslater struct{} + +func (r reflectTranslater) TranslateToPhysical(ptes *PTEs) uintptr { + return reflect.ValueOf(ptes).Pointer() +} + +type mapping struct { + start uintptr + length uintptr + addr uintptr + writeable bool +} + +func checkMappings(t *testing.T, pt *PageTables, m []mapping) { + var ( + current int + found []mapping + failed string + ) + + // Iterate over all the mappings. + pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) { + found = append(found, mapping{ + start: s, + length: e - s, + addr: pte.Address(), + writeable: pte.Writeable(), + }) + if failed != "" { + // Don't keep looking for errors. + return + } + + if current >= len(m) { + failed = "more mappings than expected" + } else if m[current].start != s { + failed = "start didn't match expected" + } else if m[current].length != (e - s) { + failed = "end didn't match expected" + } else if m[current].addr != pte.Address() { + failed = "address didn't match expected" + } else if m[current].writeable != pte.Writeable() { + failed = "writeable didn't match" + } + current++ + }) + + // Were we expected additional mappings? + if failed == "" && current != len(m) { + failed = "insufficient mappings found" + } + + // Emit a meaningful error message on failure. + if failed != "" { + t.Errorf("%s; got %#v, wanted %#v", failed, found, m) + } +} + +func TestAllocFree(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + pt.Release() +} + +func TestUnmap(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map and unmap one entry. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Unmap(0x400000, pteSize) + + checkMappings(t, pt, nil) + pt.Release() +} + +func TestReadOnly(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map one entry. 
+ pt.Map(0x400000, pteSize, true, usermem.Read, pteSize*42) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, false}, + }) + pt.Release() +} + +func TestReadWrite(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map one entry. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + }) + pt.Release() +} + +func TestSerialEntries(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map two sequential entries. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x401000, pteSize, true, usermem.ReadWrite, pteSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + {0x401000, pteSize, pteSize * 47, true}, + }) + pt.Release() +} + +func TestSpanningEntries(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Span a pgd with two pages. + pt.Map(0x00007efffffff000, 2*pteSize, true, usermem.Read, pteSize*42) + + checkMappings(t, pt, []mapping{ + {0x00007efffffff000, pteSize, pteSize * 42, false}, + {0x00007f0000000000, pteSize, pteSize * 43, false}, + }) + pt.Release() +} + +func TestSparseEntries(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map two entries in different pgds. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x00007f0000000000, pteSize, true, usermem.Read, pteSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + {0x00007f0000000000, pteSize, pteSize * 47, false}, + }) + pt.Release() +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go new file mode 100644 index 000000000..a2b44fb79 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go @@ -0,0 +1,31 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// PTEs returns aligned PTE entries. +func (n *Node) PTEs() *PTEs { + addr := uintptr(unsafe.Pointer(&n.unalignedData[0])) + offset := addr & (usermem.PageSize - 1) + if offset != 0 { + offset = usermem.PageSize - offset + } + return (*PTEs)(unsafe.Pointer(addr + offset)) +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go new file mode 100644 index 000000000..dac66373f --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +// Opts are pagetable options. +type Opts struct { + EnablePCID bool +} + +// archPageTables has x86-specific features. 
+type archPageTables struct { + // pcids is the PCID database. + pcids *PCIDs + + // pcid is the globally unique identifier, or zero if none were + // available or pcids is nil. + pcid uint16 +} + +// init initializes arch-specific features. +func (a *archPageTables) init(opts Opts) { + if opts.EnablePCID { + a.pcids = NewPCIDs() + a.pcid = a.pcids.allocate() + } +} + +// initFrom initializes arch-specific features from an existing entry.' +func (a *archPageTables) initFrom(other *archPageTables) { + a.pcids = other.pcids // Refer to the same PCID database. + if a.pcids != nil { + a.pcid = a.pcids.allocate() + } +} + +// release is called from Release. +func (a *archPageTables) release() { + // Return the PCID. + if a.pcids != nil { + a.pcids.free(a.pcid) + } +} + +// CR3 returns the CR3 value for these tables. +// +// This may be called in interrupt contexts. +// +//go:nosplit +func (p *PageTables) CR3() uint64 { + // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1). + const noFlushBit uint64 = 0x8000000000000000 + if p.pcid != 0 { + return noFlushBit | uint64(p.root.physical) | uint64(p.pcid) + } + return uint64(p.root.physical) +} + +// FlushCR3 returns the CR3 value that flushes the TLB. +// +// This may be called in interrupt contexts. +// +//go:nosplit +func (p *PageTables) FlushCR3() uint64 { + return uint64(p.root.physical) | uint64(p.pcid) +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go new file mode 100644 index 000000000..1fc403c48 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func Test2MAnd4K(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a small page and a huge page. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x00007f0000000000, 1<<21, true, usermem.Read, pmdSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + {0x00007f0000000000, pmdSize, pmdSize * 47, false}, + }) + pt.Release() +} + +func Test1GAnd4K(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a small page and a super page. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + {0x00007f0000000000, pudSize, pudSize * 47, false}, + }) + pt.Release() +} + +func TestSplit1GPage(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a super page and knock out the middle. + pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*42) + pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize)) + + checkMappings(t, pt, []mapping{ + {0x00007f0000000000, pteSize, pudSize * 42, false}, + {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, false}, + }) + pt.Release() +} + +func TestSplit2MPage(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a huge page and knock out the middle. 
+ pt.Map(0x00007f0000000000, pmdSize, true, usermem.Read, pmdSize*42) + pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize)) + + checkMappings(t, pt, []mapping{ + {0x00007f0000000000, pteSize, pmdSize * 42, false}, + {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, false}, + }) + pt.Release() +} diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go new file mode 100644 index 000000000..509e8c0d9 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -0,0 +1,74 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +import ( + "sync" +) + +// maxPCID is the maximum allowed PCID. +const maxPCID = 4095 + +// PCIDs is a simple PCID database. +type PCIDs struct { + mu sync.Mutex + + // last is the last fresh PCID given out (not including the available + // pool). If last >= maxPCID, then the only PCIDs available in the + // available pool below. + last uint16 + + // available are PCIDs that have been freed. + available map[uint16]struct{} +} + +// NewPCIDs returns a new PCID set. +func NewPCIDs() *PCIDs { + return &PCIDs{ + available: make(map[uint16]struct{}), + } +} + +// allocate returns an unused PCID, or zero if all are taken. 
+func (p *PCIDs) allocate() uint16 { + p.mu.Lock() + defer p.mu.Unlock() + if len(p.available) > 0 { + for id := range p.available { + delete(p.available, id) + return id + } + } + if id := p.last + 1; id <= maxPCID { + p.last = id + return id + } + // Nothing available. + return 0 +} + +// free returns a PCID to the pool. +// +// It is safe to call free with a zero pcid. That is, you may always call free +// with anything returned by allocate. +func (p *PCIDs) free(id uint16) { + p.mu.Lock() + defer p.mu.Unlock() + if id != 0 { + p.available[id] = struct{}{} + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go new file mode 100644 index 000000000..0b555cd76 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go @@ -0,0 +1,65 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// +build i386 amd64 + +package pagetables + +import ( + "testing" +) + +func TestMaxPCID(t *testing.T) { + p := NewPCIDs() + for i := 0; i < maxPCID; i++ { + if id := p.allocate(); id != uint16(i+1) { + t.Errorf("got %d, expected %d", id, i+1) + } + } + if id := p.allocate(); id != 0 { + if id != 0 { + t.Errorf("got %d, expected 0", id) + } + } +} + +func TestFirstPCID(t *testing.T) { + p := NewPCIDs() + if id := p.allocate(); id != 1 { + t.Errorf("got %d, expected 1", id) + } +} + +func TestFreePCID(t *testing.T) { + p := NewPCIDs() + p.free(0) + if id := p.allocate(); id != 1 { + t.Errorf("got %d, expected 1 (not zero)", id) + } +} + +func TestReusePCID(t *testing.T) { + p := NewPCIDs() + id := p.allocate() + if id != 1 { + t.Errorf("got %d, expected 1", id) + } + p.free(id) + if id := p.allocate(); id != 1 { + t.Errorf("got %d, expected 1", id) + } + if id := p.allocate(); id != 2 { + t.Errorf("got %d, expected 2", id) + } +} diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go new file mode 100644 index 000000000..4991031c5 --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ring0 provides basic operating system-level stubs. 
+package ring0 diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go new file mode 100644 index 000000000..e16f6c599 --- /dev/null +++ b/pkg/sentry/platform/ring0/x86.go @@ -0,0 +1,242 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package ring0 + +import ( + "gvisor.googlesource.com/gvisor/pkg/cpuid" +) + +// Useful bits. +const ( + _CR0_PE = 1 << 0 + _CR0_ET = 1 << 4 + _CR0_PG = 1 << 31 + + _CR4_PSE = 1 << 4 + _CR4_PAE = 1 << 5 + _CR4_PGE = 1 << 7 + _CR4_OSFXSR = 1 << 9 + _CR4_OSXMMEXCPT = 1 << 10 + _CR4_FSGSBASE = 1 << 16 + _CR4_PCIDE = 1 << 17 + _CR4_OSXSAVE = 1 << 18 + _CR4_SMEP = 1 << 20 + + _RFLAGS_AC = 1 << 18 + _RFLAGS_NT = 1 << 14 + _RFLAGS_IOPL = 3 << 12 + _RFLAGS_DF = 1 << 10 + _RFLAGS_IF = 1 << 9 + _RFLAGS_STEP = 1 << 8 + _RFLAGS_RESERVED = 1 << 1 + + _EFER_SCE = 0x001 + _EFER_LME = 0x100 + _EFER_NX = 0x800 + + _MSR_STAR = 0xc0000081 + _MSR_LSTAR = 0xc0000082 + _MSR_CSTAR = 0xc0000083 + _MSR_SYSCALL_MASK = 0xc0000084 +) + +// Vector is an exception vector. +type Vector uintptr + +// Exception vectors. 
+const ( + DivideByZero Vector = iota + Debug + NMI + Breakpoint + Overflow + BoundRangeExceeded + InvalidOpcode + DeviceNotAvailable + DoubleFault + CoprocessorSegmentOverrun + InvalidTSS + SegmentNotPresent + StackSegmentFault + GeneralProtectionFault + PageFault + _ + X87FloatingPointException + AlignmentCheck + MachineCheck + SIMDFloatingPointException + VirtualizationException + SecurityException = 0x1e + SyscallInt80 = 0x80 + _NR_INTERRUPTS = SyscallInt80 + 1 +) + +// System call vectors. +const ( + Syscall Vector = _NR_INTERRUPTS +) + +// VirtualAddressBits returns the number bits available for virtual addresses. +// +// Note that sign-extension semantics apply to the highest order bit. +// +// FIXME: This should use the cpuid passed to Init. +func VirtualAddressBits() uint32 { + ax, _, _, _ := cpuid.HostID(0x80000008, 0) + return (ax >> 8) & 0xff +} + +// PhysicalAddressBits returns the number of bits available for physical addresses. +// +// FIXME: This should use the cpuid passed to Init. +func PhysicalAddressBits() uint32 { + ax, _, _, _ := cpuid.HostID(0x80000008, 0) + return ax & 0xff +} + +// Selector is a segment Selector. +type Selector uint16 + +// SegmentDescriptor is a segment descriptor. +type SegmentDescriptor struct { + bits [2]uint32 +} + +// descriptorTable is a collection of descriptors. +type descriptorTable [32]SegmentDescriptor + +// SegmentDescriptorFlags are typed flags within a descriptor. +type SegmentDescriptorFlags uint32 + +// SegmentDescriptorFlag declarations. +const ( + SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set). + SegmentDescriptorWrite = 1 << 9 // Write permission. + SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used. + SegmentDescriptorExecute = 1 << 11 // Execute permission. + SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data. + SegmentDescriptorPresent = 1 << 15 // Present. + SegmentDescriptorAVL = 1 << 20 // Available. 
+ SegmentDescriptorLong = 1 << 21 // Long mode. + SegmentDescriptorDB = 1 << 22 // 16 or 32-bit. + SegmentDescriptorG = 1 << 23 // Granularity: page or byte. +) + +// Base returns the descriptor's base linear address. +func (d *SegmentDescriptor) Base() uint32 { + return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16 +} + +// Limit returns the descriptor size. +func (d *SegmentDescriptor) Limit() uint32 { + l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000 + if d.bits[1]&uint32(SegmentDescriptorG) != 0 { + l <<= 12 + l |= 0xFFF + } + return l +} + +// Flags returns descriptor flags. +func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags { + return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00) +} + +// DPL returns the descriptor privilege level. +func (d *SegmentDescriptor) DPL() int { + return int((d.bits[1] >> 13) & 3) +} + +func (d *SegmentDescriptor) setNull() { + d.bits[0] = 0 + d.bits[1] = 0 +} + +func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) { + flags |= SegmentDescriptorPresent + if limit>>12 != 0 { + limit >>= 12 + flags |= SegmentDescriptorG + } + d.bits[0] = base<<16 | limit&0xFFFF + d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13 +} + +func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorDB| + SegmentDescriptorExecute| + SegmentDescriptorSystem) +} + +func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorG| + SegmentDescriptorLong| + SegmentDescriptorExecute| + SegmentDescriptorSystem) +} + +func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorWrite| + SegmentDescriptorSystem) +} + +// setHi is only used for the TSS segment, which is magically 64-bits. 
+func (d *SegmentDescriptor) setHi(base uint32) { + d.bits[0] = base + d.bits[1] = 0 +} + +// Gate64 is a 64-bit task, trap, or interrupt gate. +type Gate64 struct { + bits [4]uint32 +} + +// idt64 is a 64-bit interrupt descriptor table. +type idt64 [_NR_INTERRUPTS]Gate64 + +func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) { + g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF + g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7 + g.bits[2] = uint32(rip >> 32) +} + +func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) { + g.setInterrupt(cs, rip, dpl, ist) + g.bits[1] |= 1 << 8 +} + +// TaskState64 is a 64-bit task state structure. +type TaskState64 struct { + _ uint32 + rsp0Lo, rsp0Hi uint32 + rsp1Lo, rsp1Hi uint32 + rsp2Lo, rsp2Hi uint32 + _ [2]uint32 + ist1Lo, ist1Hi uint32 + ist2Lo, ist2Hi uint32 + ist3Lo, ist3Hi uint32 + ist4Lo, ist4Hi uint32 + ist5Lo, ist5Hi uint32 + ist6Lo, ist6Hi uint32 + ist7Lo, ist7Hi uint32 + _ [2]uint32 + _ uint16 + ioPerm uint16 +} diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD new file mode 100644 index 000000000..8b9f29403 --- /dev/null +++ b/pkg/sentry/platform/safecopy/BUILD @@ -0,0 +1,28 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "safecopy", + srcs = [ + "atomic_amd64.s", + "memclr_amd64.s", + "memcpy_amd64.s", + "safecopy.go", + "safecopy_unsafe.go", + "sighandler_amd64.s", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/syserror", + ], +) + +go_test( + name = "safecopy_test", + srcs = [ + "safecopy_test.go", + ], + embed = [":safecopy"], +) diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s new file mode 100644 index 000000000..69947dec3 --- /dev/null +++ 
b/pkg/sentry/platform/safecopy/atomic_amd64.s @@ -0,0 +1,108 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// handleSwapUint32Fault returns the value stored in DI. Control is transferred +// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as swapUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVL DI, sig+20(FP) + RET + +// swapUint32 atomically stores new into *addr and returns (the previous *addr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) +TEXT ·swapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint32Fault will store a different value in this address. + MOVL $0, sig+20(FP) + + MOVQ addr+0(FP), DI + MOVL new+8(FP), AX + XCHGL AX, 0(DI) + MOVL AX, old+16(FP) + RET + +// handleSwapUint64Fault returns the value stored in DI. 
Control is transferred +// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as swapUint64 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 + MOVL DI, sig+24(FP) + RET + +// swapUint64 atomically stores new into *addr and returns (the previous *addr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: addr must be aligned to a 8-byte boundary. +// +//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) +TEXT ·swapUint64(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint64Fault will store a different value in this address. + MOVL $0, sig+24(FP) + + MOVQ addr+0(FP), DI + MOVQ new+8(FP), AX + XCHGQ AX, 0(DI) + MOVQ AX, old+16(FP) + RET + +// handleCompareAndSwapUint32Fault returns the value stored in DI. Control is +// transferred to it when swapUint64 below receives SIGSEGV or SIGBUS, with the +// signal number stored in DI. +// +// It must have the same frame configuration as compareAndSwapUint32 so that it +// can undo any potential call frame set up by the assembler. +TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVL DI, sig+20(FP) + RET + +// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns +// (the value previously stored at addr, 0). If a SIGSEGV or SIGBUS signal is +// received during the operation, the value of prev is unspecified, and sig is +// the number of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. 
+// +//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) +TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, this is + // the value the caller will see; if a signal is received, + // handleCompareAndSwapUint32Fault will store a different value in this + // address. + MOVL $0, sig+20(FP) + + MOVQ addr+0(FP), DI + MOVL old+8(FP), AX + MOVL new+12(FP), DX + LOCK + CMPXCHGL DX, 0(DI) + MOVL AX, prev+16(FP) + RET diff --git a/pkg/sentry/platform/safecopy/memclr_amd64.s b/pkg/sentry/platform/safecopy/memclr_amd64.s new file mode 100644 index 000000000..7d1019f60 --- /dev/null +++ b/pkg/sentry/platform/safecopy/memclr_amd64.s @@ -0,0 +1,157 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// handleMemclrFault returns (the value stored in AX, the value stored in DI). +// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, +// with the faulting address stored in AX and the signal number stored in DI. +// +// It must have the same frame configuration as memclr so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemclrFault(SB), NOSPLIT, $0-28 + MOVQ AX, addr+16(FP) + MOVL DI, sig+24(FP) + RET + +// memclr sets the n bytes following ptr to zeroes. 
If a SIGSEGV or SIGBUS +// signal is received during the write, it returns the address that caused the +// fault and the number of the signal that was received. Otherwise, it returns +// an unspecified address and a signal number of 0. +// +// Data is written in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully written. +// +// The code is derived from runtime.memclrNoHeapPointers. +// +// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memclr(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemclrFault will store a different value in this address. + MOVL $0, sig+24(FP) + + MOVQ ptr+0(FP), DI + MOVQ n+8(FP), BX + XORQ AX, AX + + // MOVOU seems always faster than REP STOSQ. +tail: + TESTQ BX, BX + JEQ _0 + CMPQ BX, $2 + JBE _1or2 + CMPQ BX, $4 + JBE _3or4 + CMPQ BX, $8 + JB _5through7 + JE _8 + CMPQ BX, $16 + JBE _9through16 + PXOR X0, X0 + CMPQ BX, $32 + JBE _17through32 + CMPQ BX, $64 + JBE _33through64 + CMPQ BX, $128 + JBE _65through128 + CMPQ BX, $256 + JBE _129through256 + // TODO: use branch table and BSR to make this just a single dispatch + // TODO: for really big clears, use MOVNTDQ, even without AVX2. 
+ +loop: + MOVOU X0, 0(DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, 128(DI) + MOVOU X0, 144(DI) + MOVOU X0, 160(DI) + MOVOU X0, 176(DI) + MOVOU X0, 192(DI) + MOVOU X0, 208(DI) + MOVOU X0, 224(DI) + MOVOU X0, 240(DI) + SUBQ $256, BX + ADDQ $256, DI + CMPQ BX, $256 + JAE loop + JMP tail + +_1or2: + MOVB AX, (DI) + MOVB AX, -1(DI)(BX*1) + RET +_0: + RET +_3or4: + MOVW AX, (DI) + MOVW AX, -2(DI)(BX*1) + RET +_5through7: + MOVL AX, (DI) + MOVL AX, -4(DI)(BX*1) + RET +_8: + // We need a separate case for 8 to make sure we clear pointers atomically. + MOVQ AX, (DI) + RET +_9through16: + MOVQ AX, (DI) + MOVQ AX, -8(DI)(BX*1) + RET +_17through32: + MOVOU X0, (DI) + MOVOU X0, -16(DI)(BX*1) + RET +_33through64: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +_65through128: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +_129through256: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, -128(DI)(BX*1) + MOVOU X0, -112(DI)(BX*1) + MOVOU X0, -96(DI)(BX*1) + MOVOU X0, -80(DI)(BX*1) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET diff --git a/pkg/sentry/platform/safecopy/memcpy_amd64.s b/pkg/sentry/platform/safecopy/memcpy_amd64.s new file mode 100644 index 000000000..96ef2eefc --- /dev/null +++ b/pkg/sentry/platform/safecopy/memcpy_amd64.s @@ -0,0 +1,242 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// handleMemcpyFault returns (the value stored in AX, the value stored in DI). +// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, +// with the faulting address stored in AX and the signal number stored in DI. +// +// It must have the same frame configuration as memcpy so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 + MOVQ AX, addr+24(FP) + MOVL DI, sig+32(FP) + RET + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. +// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +// The code is derived from the forward copying part of runtime.memmove. +// +// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memcpy(SB), NOSPLIT, $0-36 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemcpyFault will store a different value in this address. + MOVL $0, sig+32(FP) + + MOVQ to+0(FP), DI + MOVQ from+8(FP), SI + MOVQ n+16(FP), BX + + // REP instructions have a high startup cost, so we handle small sizes + // with some straightline code. 
The REP MOVSQ instruction is really fast + // for large sizes. The cutover is approximately 2K. +tail: + // move_129through256 or smaller work whether or not the source and the + // destination memory regions overlap because they load all data into + // registers before writing it back. move_256through2048 on the other + // hand can be used only when the memory regions don't overlap or the copy + // direction is forward. + TESTQ BX, BX + JEQ move_0 + CMPQ BX, $2 + JBE move_1or2 + CMPQ BX, $4 + JBE move_3or4 + CMPQ BX, $8 + JB move_5through7 + JE move_8 + CMPQ BX, $16 + JBE move_9through16 + CMPQ BX, $32 + JBE move_17through32 + CMPQ BX, $64 + JBE move_33through64 + CMPQ BX, $128 + JBE move_65through128 + CMPQ BX, $256 + JBE move_129through256 + // TODO: use branch table and BSR to make this just a single dispatch + +/* + * forward copy loop + */ + CMPQ BX, $2048 + JLS move_256through2048 + + // Check alignment + MOVL SI, AX + ORL DI, AX + TESTL $7, AX + JEQ fwdBy8 + + // Do 1 byte at a time + MOVQ BX, CX + REP; MOVSB + RET + +fwdBy8: + // Do 8 bytes at a time + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + REP; MOVSQ + JMP tail + +move_1or2: + MOVB (SI), AX + MOVB AX, (DI) + MOVB -1(SI)(BX*1), CX + MOVB CX, -1(DI)(BX*1) + RET +move_0: + RET +move_3or4: + MOVW (SI), AX + MOVW AX, (DI) + MOVW -2(SI)(BX*1), CX + MOVW CX, -2(DI)(BX*1) + RET +move_5through7: + MOVL (SI), AX + MOVL AX, (DI) + MOVL -4(SI)(BX*1), CX + MOVL CX, -4(DI)(BX*1) + RET +move_8: + // We need a separate case for 8 to make sure we write pointers atomically. 
+ MOVQ (SI), AX + MOVQ AX, (DI) + RET +move_9through16: + MOVQ (SI), AX + MOVQ AX, (DI) + MOVQ -8(SI)(BX*1), CX + MOVQ CX, -8(DI)(BX*1) + RET +move_17through32: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU -16(SI)(BX*1), X1 + MOVOU X1, -16(DI)(BX*1) + RET +move_33through64: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU -32(SI)(BX*1), X2 + MOVOU X2, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X3 + MOVOU X3, -16(DI)(BX*1) + RET +move_65through128: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU -64(SI)(BX*1), X4 + MOVOU X4, -64(DI)(BX*1) + MOVOU -48(SI)(BX*1), X5 + MOVOU X5, -48(DI)(BX*1) + MOVOU -32(SI)(BX*1), X6 + MOVOU X6, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X7 + MOVOU X7, -16(DI)(BX*1) + RET +move_129through256: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU 64(SI), X4 + MOVOU X4, 64(DI) + MOVOU 80(SI), X5 + MOVOU X5, 80(DI) + MOVOU 96(SI), X6 + MOVOU X6, 96(DI) + MOVOU 112(SI), X7 + MOVOU X7, 112(DI) + MOVOU -128(SI)(BX*1), X8 + MOVOU X8, -128(DI)(BX*1) + MOVOU -112(SI)(BX*1), X9 + MOVOU X9, -112(DI)(BX*1) + MOVOU -96(SI)(BX*1), X10 + MOVOU X10, -96(DI)(BX*1) + MOVOU -80(SI)(BX*1), X11 + MOVOU X11, -80(DI)(BX*1) + MOVOU -64(SI)(BX*1), X12 + MOVOU X12, -64(DI)(BX*1) + MOVOU -48(SI)(BX*1), X13 + MOVOU X13, -48(DI)(BX*1) + MOVOU -32(SI)(BX*1), X14 + MOVOU X14, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X15 + MOVOU X15, -16(DI)(BX*1) + RET +move_256through2048: + SUBQ $256, BX + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU 64(SI), X4 + MOVOU X4, 64(DI) + MOVOU 80(SI), X5 + MOVOU X5, 80(DI) + MOVOU 96(SI), X6 + MOVOU X6, 96(DI) + MOVOU 112(SI), X7 + MOVOU X7, 112(DI) + MOVOU 128(SI), X8 + MOVOU X8, 128(DI) + MOVOU 144(SI), X9 + 
MOVOU X9, 144(DI) + MOVOU 160(SI), X10 + MOVOU X10, 160(DI) + MOVOU 176(SI), X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X13 + MOVOU X13, 208(DI) + MOVOU 224(SI), X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X15 + MOVOU X15, 240(DI) + CMPQ BX, $256 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + JGE move_256through2048 + JMP tail diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go new file mode 100644 index 000000000..90a2aad7b --- /dev/null +++ b/pkg/sentry/platform/safecopy/safecopy.go @@ -0,0 +1,140 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package safecopy provides an efficient implementation of functions to access +// memory that may result in SIGSEGV or SIGBUS being sent to the accessor. +package safecopy + +import ( + "fmt" + "reflect" + "runtime" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SegvError is returned when a safecopy function receives SIGSEGV. +type SegvError struct { + // Addr is the address at which the SIGSEGV occurred. + Addr uintptr +} + +// Error implements error.Error. +func (e SegvError) Error() string { + return fmt.Sprintf("SIGSEGV at %#x", e.Addr) +} + +// BusError is returned when a safecopy function receives SIGBUS. +type BusError struct { + // Addr is the address at which the SIGBUS occurred. + Addr uintptr +} + +// Error implements error.Error. 
+func (e BusError) Error() string { + return fmt.Sprintf("SIGBUS at %#x", e.Addr) +} + +// AlignmentError is returned when a safecopy function is passed an address +// that does not meet alignment requirements. +type AlignmentError struct { + // Addr is the invalid address. + Addr uintptr + + // Alignment is the required alignment. + Alignment uintptr +} + +// Error implements error.Error. +func (e AlignmentError) Error() string { + return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment) +} + +var ( + // The begin and end addresses below are for the functions that are + // checked by the signal handler. + memcpyBegin uintptr + memcpyEnd uintptr + memclrBegin uintptr + memclrEnd uintptr + swapUint32Begin uintptr + swapUint32End uintptr + swapUint64Begin uintptr + swapUint64End uintptr + compareAndSwapUint32Begin uintptr + compareAndSwapUint32End uintptr + + // savedSigSegVHandler is a pointer to the SIGSEGV handler that was + // configured before we replaced it with our own. We still call into it + // when we get a SIGSEGV that is not interesting to us. + savedSigSegVHandler uintptr + + // same a above, but for SIGBUS signals. + savedSigBusHandler uintptr +) + +// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS +// signals. +func signalHandler() + +// FindEndAddress returns the end address (one byte beyond the last) of the +// function that contains the specified address (begin). +func FindEndAddress(begin uintptr) uintptr { + f := runtime.FuncForPC(begin) + if f != nil { + for p := begin; ; p++ { + g := runtime.FuncForPC(p) + if f != g { + return p + } + } + } + return begin +} + +// initializeAddresses initializes the addresses used by the signal handler. +func initializeAddresses() { + // The following functions are written in assembly language, so they won't + // be inlined by the existing compiler/linker. Tests will fail if this + // assumption is violated. 
+ memcpyBegin = reflect.ValueOf(memcpy).Pointer() + memcpyEnd = FindEndAddress(memcpyBegin) + memclrBegin = reflect.ValueOf(memclr).Pointer() + memclrEnd = FindEndAddress(memclrBegin) + swapUint32Begin = reflect.ValueOf(swapUint32).Pointer() + swapUint32End = FindEndAddress(swapUint32Begin) + swapUint64Begin = reflect.ValueOf(swapUint64).Pointer() + swapUint64End = FindEndAddress(swapUint64Begin) + compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer() + compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin) +} + +func init() { + initializeAddresses() + if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err)) + } + if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err)) + } + syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) { + switch e.(type) { + case SegvError, BusError, AlignmentError: + return syscall.EFAULT, true + default: + return 0, false + } + }) +} diff --git a/pkg/sentry/platform/safecopy/safecopy_test.go b/pkg/sentry/platform/safecopy/safecopy_test.go new file mode 100644 index 000000000..67df36121 --- /dev/null +++ b/pkg/sentry/platform/safecopy/safecopy_test.go @@ -0,0 +1,617 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package safecopy + +import ( + "bytes" + "fmt" + "io/ioutil" + "math/rand" + "os" + "runtime/debug" + "syscall" + "testing" + "unsafe" +) + +// Size of a page in bytes. Cloned from usermem.PageSize to avoid a circular +// dependency. +const pageSize = 4096 + +func initRandom(b []byte) { + for i := range b { + b[i] = byte(rand.Intn(256)) + } +} + +func randBuf(size int) []byte { + b := make([]byte, size) + initRandom(b) + return b +} + +func TestCopyInSuccess(t *testing.T) { + // Test that CopyIn does not return an error when all pages are accessible. + const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := CopyIn(b, unsafe.Pointer(&a[0])) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestCopyOutSuccess(t *testing.T) { + // Test that CopyOut does not return an error when all pages are + // accessible. + const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := CopyOut(unsafe.Pointer(&b[0]), a) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestCopySuccess(t *testing.T) { + // Test that Copy does not return an error when all pages are accessible. 
+ const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := Copy(unsafe.Pointer(&b[0]), unsafe.Pointer(&a[0]), bufLen) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestZeroOutSuccess(t *testing.T) { + // Test that ZeroOut does not return an error when all pages are + // accessible. + const bufLen = 8192 + a := make([]byte, bufLen) + b := randBuf(bufLen) + + n, err := ZeroOut(unsafe.Pointer(&b[0]), bufLen) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestSwapUint32Success(t *testing.T) { + // Test that SwapUint32 does not return an error when the page is + // accessible. + before := uint32(rand.Int31()) + after := uint32(rand.Int31()) + val := before + + old, err := SwapUint32(unsafe.Pointer(&val), after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if val != after { + t.Errorf("Unexpected new value: got %v, want %v", val, after) + } +} + +func TestSwapUint32AlignmentError(t *testing.T) { + // Test that SwapUint32 returns an AlignmentError when passed an unaligned + // address. + data := new(struct{ val uint64 }) + addr := uintptr(unsafe.Pointer(&data.val)) + 1 + want := AlignmentError{Addr: addr, Alignment: 4} + if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +func TestSwapUint64Success(t *testing.T) { + // Test that SwapUint64 does not return an error when the page is + // accessible. 
+ before := uint64(rand.Int63()) + after := uint64(rand.Int63()) + // "The first word in ... an allocated struct or slice can be relied upon + // to be 64-bit aligned." - sync/atomic docs + data := new(struct{ val uint64 }) + data.val = before + + old, err := SwapUint64(unsafe.Pointer(&data.val), after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if data.val != after { + t.Errorf("Unexpected new value: got %v, want %v", data.val, after) + } +} + +func TestSwapUint64AlignmentError(t *testing.T) { + // Test that SwapUint64 returns an AlignmentError when passed an unaligned + // address. + data := new(struct{ val1, val2 uint64 }) + addr := uintptr(unsafe.Pointer(&data.val1)) + 1 + want := AlignmentError{Addr: addr, Alignment: 8} + if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +func TestCompareAndSwapUint32Success(t *testing.T) { + // Test that CompareAndSwapUint32 does not return an error when the page is + // accessible. + before := uint32(rand.Int31()) + after := uint32(rand.Int31()) + val := before + + old, err := CompareAndSwapUint32(unsafe.Pointer(&val), before, after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if val != after { + t.Errorf("Unexpected new value: got %v, want %v", val, after) + } +} + +func TestCompareAndSwapUint32AlignmentError(t *testing.T) { + // Test that CompareAndSwapUint32 returns an AlignmentError when passed an + // unaligned address. 
+ data := new(struct{ val uint64 }) + addr := uintptr(unsafe.Pointer(&data.val)) + 1 + want := AlignmentError{Addr: addr, Alignment: 4} + if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +// withSegvErrorTestMapping calls fn with a two-page mapping. The first page +// contains random data, and the second page generates SIGSEGV when accessed. +func withSegvErrorTestMapping(t *testing.T, fn func(m []byte)) { + mapping, err := syscall.Mmap(-1, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + } + defer syscall.Munmap(mapping) + if err := syscall.Mprotect(mapping[pageSize:], syscall.PROT_NONE); err != nil { + t.Fatalf("Mprotect failed: %v", err) + } + initRandom(mapping[:pageSize]) + + fn(mapping) +} + +// withBusErrorTestMapping calls fn with a two-page mapping. The first page +// contains random data, and the second page generates SIGBUS when accessed. +func withBusErrorTestMapping(t *testing.T, fn func(m []byte)) { + f, err := ioutil.TempFile("", "sigbus_test") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + defer f.Close() + if err := f.Truncate(pageSize); err != nil { + t.Fatalf("Truncate failed: %v", err) + } + mapping, err := syscall.Mmap(int(f.Fd()), 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + } + defer syscall.Munmap(mapping) + initRandom(mapping[:pageSize]) + + fn(mapping) +} + +func TestCopyInSegvError(t *testing.T) { + // Test that CopyIn returns a SegvError when reaching a page that signals + // SIGSEGV. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := CopyIn(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyInBusError(t *testing.T) { + // Test that CopyIn returns a BusError when reaching a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := CopyIn(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyOutSegvError(t *testing.T) { + // Test that CopyOut returns a SegvError when reaching a page that signals + // SIGSEGV. 
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) {
+			withSegvErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				src := randBuf(pageSize)
+				n, err := CopyOut(dst, src)
+				if n != bytesBeforeFault {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (SegvError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopyOutBusError(t *testing.T) {
+	// Test that CopyOut returns a BusError when reaching a page that signals
+	// SIGBUS.
+	for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ {
+		// Subtest name says SIGBUS: this test maps a file hole, not a
+		// PROT_NONE page, so the fault delivered is SIGBUS.
+		t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) {
+			withBusErrorTestMapping(t, func(mapping []byte) {
+				secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize
+				dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault))
+				src := randBuf(pageSize)
+				n, err := CopyOut(dst, src)
+				if n != bytesBeforeFault {
+					t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault)
+				}
+				if want := (BusError{secondPage}); err != want {
+					t.Errorf("Unexpected error: got %v, want %v", err, want)
+				}
+				if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) {
+					t.Errorf("Buffers are not equal when they should be: %v %v", got, want)
+				}
+			})
+		})
+	}
+}
+
+func TestCopySourceSegvError(t *testing.T) {
+	// Test that Copy returns a SegvError when copying from a page that signals
+	// SIGSEGV.
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopySourceBusError(t *testing.T) { + // Test that Copy returns a BusError when copying from a page that signals + // SIGBUS. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyDestinationSegvError(t *testing.T) { + // Test that Copy returns a SegvError when copying to a page that signals + // SIGSEGV. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyDestinationBusError(t *testing.T) { + // Test that Copy returns a BusError when copying to a page that signals + // SIGBUS. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestZeroOutSegvError(t *testing.T) { + // Test that ZeroOut returns a SegvError when reaching a page that signals + // SIGSEGV. 
+ for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + n, err := ZeroOut(dst, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { + t.Errorf("Non-zero bytes in written part of mapping: %v", got) + } + }) + }) + } +} + +func TestZeroOutBusError(t *testing.T) { + // Test that ZeroOut returns a BusError when reaching a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + n, err := ZeroOut(dst, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { + t.Errorf("Non-zero bytes in written part of mapping: %v", got) + } + }) + }) + } +} + +func TestSwapUint32SegvError(t *testing.T) { + // Test that SwapUint32 returns a SegvError when reaching a page that + // signals SIGSEGV. 
+ withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint32(unsafe.Pointer(secondPage), 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint32BusError(t *testing.T) { + // Test that SwapUint32 returns a BusError when reaching a page that + // signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint32(unsafe.Pointer(secondPage), 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint64SegvError(t *testing.T) { + // Test that SwapUint64 returns a SegvError when reaching a page that + // signals SIGSEGV. + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint64(unsafe.Pointer(secondPage), 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint64BusError(t *testing.T) { + // Test that SwapUint64 returns a BusError when reaching a page that + // signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint64(unsafe.Pointer(secondPage), 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestCompareAndSwapUint32SegvError(t *testing.T) { + // Test that CompareAndSwapUint32 returns a SegvError when reaching a page + // that signals SIGSEGV. 
+ withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestCompareAndSwapUint32BusError(t *testing.T) { + // Test that CompareAndSwapUint32 returns a BusError when reaching a page + // that signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func testCopy(dst, src []byte) (panicked bool) { + defer func() { + if r := recover(); r != nil { + panicked = true + } + }() + debug.SetPanicOnFault(true) + copy(dst, src) + return +} + +func TestSegVOnMemmove(t *testing.T) { + // Test that SIGSEGVs received by runtime.memmove when *not* doing + // CopyIn or CopyOut work gets propagated to the runtime. + const bufLen = pageSize + a, err := syscall.Mmap(-1, 0, bufLen, syscall.PROT_NONE, syscall.MAP_ANON|syscall.MAP_PRIVATE) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + + } + defer syscall.Munmap(a) + b := randBuf(bufLen) + + if !testCopy(b, a) { + t.Fatalf("testCopy didn't panic when it should have") + } + + if !testCopy(a, b) { + t.Fatalf("testCopy didn't panic when it should have") + } +} + +func TestSigbusOnMemmove(t *testing.T) { + // Test that SIGBUS received by runtime.memmove when *not* doing + // CopyIn or CopyOut work gets propagated to the runtime. 
+ const bufLen = pageSize + f, err := ioutil.TempFile("", "sigbus_test") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + os.Remove(f.Name()) + defer f.Close() + + a, err := syscall.Mmap(int(f.Fd()), 0, bufLen, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + + } + defer syscall.Munmap(a) + b := randBuf(bufLen) + + if !testCopy(b, a) { + t.Fatalf("testCopy didn't panic when it should have") + } + + if !testCopy(a, b) { + t.Fatalf("testCopy didn't panic when it should have") + } +} diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go new file mode 100644 index 000000000..72f243f8d --- /dev/null +++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go @@ -0,0 +1,315 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safecopy + +import ( + "fmt" + "syscall" + "unsafe" +) + +// maxRegisterSize is the maximum register size used in memcpy and memclr. It +// is used to decide by how much to rewind the copy (for memcpy) or zeroing +// (for memclr) before proceeding. +const maxRegisterSize = 16 + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. 
+// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +//go:noescape +func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) + +// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS +// signal is received during the write, it returns the address that caused the +// fault and the number of the signal that was received. Otherwise, it returns +// an unspecified address and a signal number of 0. +// +// Data is written in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully written. +// +//go:noescape +func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) + +// swapUint32 atomically stores new into *ptr and returns (the previous *ptr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) + +// swapUint64 atomically stores new into *ptr and returns (the previous *ptr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: ptr must be aligned to a 8-byte boundary. +// +//go:noescape +func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) + +// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns +// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is +// received during the operation, the value of prev is unspecified, and sig is +// the number of the signal that was received. 
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
+
+// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
+// copied and an error if SIGSEGV or SIGBUS is received while reading from src.
+func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
+	toCopy := uintptr(len(dst))
+	if len(dst) == 0 {
+		return 0, nil
+	}
+
+	fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy)
+	if sig == 0 {
+		return len(dst), nil
+	}
+
+	// Sanity check: the fault must lie inside [src, src+toCopy). A fault is
+	// outside the range if it is below the start OR at/after the end, so the
+	// conditions are joined with ||, not && (which could never be true).
+	if faultN, srcN := uintptr(fault), uintptr(src); faultN < srcN || faultN >= srcN+toCopy {
+		panic(fmt.Sprintf("CopyIn faulted at %#x, which is outside source [%#x, %#x)", faultN, srcN, srcN+toCopy))
+	}
+
+	// memcpy might have ended the copy up to maxRegisterSize bytes before
+	// fault, if an instruction caused a memory access that straddled two
+	// pages, and the second one faulted. Try to copy up to the fault.
+	faultN, srcN := uintptr(fault), uintptr(src)
+	var done int
+	if faultN-srcN > maxRegisterSize {
+		done = int(faultN - srcN - maxRegisterSize)
+	}
+	n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done)))
+	done += n
+	if err != nil {
+		return done, err
+	}
+	return done, errorFromFaultSignal(fault, sig)
+}
+
+// CopyOut copies len(src) bytes from src to dst. It returns the number of
+// bytes done and an error if SIGSEGV or SIGBUS is received while writing to
+// dst.
+func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
+	toCopy := uintptr(len(src))
+	if toCopy == 0 {
+		return 0, nil
+	}
+
+	fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy)
+	if sig == 0 {
+		return len(src), nil
+	}
+
+	// Sanity check: the fault must lie inside [dst, dst+toCopy). A fault is
+	// outside the range if it is below the start OR at/after the end, so the
+	// conditions are joined with ||, not && (which could never be true).
+	if faultN, dstN := uintptr(fault), uintptr(dst); faultN < dstN || faultN >= dstN+toCopy {
+		panic(fmt.Sprintf("CopyOut faulted at %#x, which is outside destination [%#x, %#x)", faultN, dstN, dstN+toCopy))
+	}
+
+	// memcpy might have ended the copy up to maxRegisterSize bytes before
+	// fault, if an instruction caused a memory access that straddled two
+	// pages, and the second one faulted. Try to copy up to the fault.
+	faultN, dstN := uintptr(fault), uintptr(dst)
+	var done int
+	if faultN-dstN > maxRegisterSize {
+		done = int(faultN - dstN - maxRegisterSize)
+	}
+	n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)])
+	done += n
+	if err != nil {
+		return done, err
+	}
+	return done, errorFromFaultSignal(fault, sig)
+}
+
+// Copy copies toCopy bytes from src to dst. It returns the number of bytes
+// copied and an error if SIGSEGV or SIGBUS is received while reading from src
+// or writing to dst.
+//
+// Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap,
+// the resulting contents of dst are unspecified.
+func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
+	if toCopy == 0 {
+		return 0, nil
+	}
+
+	fault, sig := memcpy(dst, src, toCopy)
+	if sig == 0 {
+		return toCopy, nil
+	}
+
+	// Did the fault occur while reading from src or writing to dst?
+	faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst)
+	faultAfterSrc := ^uintptr(0)
+	if faultN >= srcN {
+		faultAfterSrc = faultN - srcN
+	}
+	faultAfterDst := ^uintptr(0)
+	if faultN >= dstN {
+		faultAfterDst = faultN - dstN
+	}
+	if faultAfterSrc >= toCopy && faultAfterDst >= toCopy {
+		panic(fmt.Sprintf("Copy faulted at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", faultN, srcN, srcN+toCopy, dstN, dstN+toCopy))
+	}
+	faultedAfter := faultAfterSrc
+	if faultedAfter > faultAfterDst {
+		faultedAfter = faultAfterDst
+	}
+
+	// memcpy might have ended the copy up to maxRegisterSize bytes before
+	// fault, if an instruction caused a memory access that straddled two
+	// pages, and the second one faulted. Try to copy up to the fault.
+	var done uintptr
+	if faultedAfter > maxRegisterSize {
+		done = faultedAfter - maxRegisterSize
+	}
+	n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done)
+	done += n
+	if err != nil {
+		return done, err
+	}
+	return done, errorFromFaultSignal(fault, sig)
+}
+
+// ZeroOut writes toZero zero bytes to dst. It returns the number of bytes
+// written and an error if SIGSEGV or SIGBUS is received while writing to dst.
+func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) {
+	if toZero == 0 {
+		return 0, nil
+	}
+
+	fault, sig := memclr(dst, toZero)
+	if sig == 0 {
+		return toZero, nil
+	}
+
+	// Sanity check: the fault must lie inside [dst, dst+toZero). A fault is
+	// outside the range if it is below the start OR at/after the end, so the
+	// conditions are joined with ||, not && (which could never be true).
+	if faultN, dstN := uintptr(fault), uintptr(dst); faultN < dstN || faultN >= dstN+toZero {
+		panic(fmt.Sprintf("ZeroOut faulted at %#x, which is outside destination [%#x, %#x)", faultN, dstN, dstN+toZero))
+	}
+
+	// memclr might have ended the write up to maxRegisterSize bytes before
+	// fault, if an instruction caused a memory access that straddled two
+	// pages, and the second one faulted. Try to write up to the fault.
+ faultN, dstN := uintptr(fault), uintptr(dst) + var done uintptr + if faultN-dstN > maxRegisterSize { + done = faultN - dstN - maxRegisterSize + } + n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns +// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is +// not aligned to a 4-byte boundary. +func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + old, sig := swapUint32(ptr, new) + return old, errorFromFaultSignal(ptr, sig) +} + +// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns +// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is +// not aligned to an 8-byte boundary. +func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) { + if addr := uintptr(ptr); addr&7 != 0 { + return 0, AlignmentError{addr, 8} + } + old, sig := swapUint64(ptr, new) + return old, errorFromFaultSignal(ptr, sig) +} + +// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32, +// except that it returns an error if SIGSEGV or SIGBUS is received while +// accessing ptr, or if ptr is not aligned to a 4-byte boundary. 
+func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + prev, sig := compareAndSwapUint32(ptr, old, new) + return prev, errorFromFaultSignal(ptr, sig) +} + +func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { + switch sig { + case 0: + return nil + case int32(syscall.SIGSEGV): + return SegvError{uintptr(addr)} + case int32(syscall.SIGBUS): + return BusError{uintptr(addr)} + default: + panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr)) + } +} + +// ReplaceSignalHandler replaces the existing signal handler for the provided +// signal with the one that handles faults in safecopy-protected functions. +// +// It stores the value of the previously set handler in previous. +// +// This function will be called on initialization in order to install safecopy +// handlers for appropriate signals. These handlers will call the previous +// handler however, and if this is function is being used externally then the +// same courtesy is expected. +func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error { + var sa struct { + handler uintptr + flags uint64 + restorer uintptr + mask uint64 + } + const maskLen = 8 + + // Get the existing signal handler information, and save the current + // handler. Once we replace it, we will use this pointer to fall back to + // it when we receive other signals. + if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 { + return e + } + + // Fail if there isn't a previous handler. + if sa.handler == 0 { + return fmt.Errorf("previous handler for signal %x isn't set", sig) + } + + *previous = sa.handler + + // Install our own handler. 
+ sa.handler = handler + if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { + return e + } + + return nil +} diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s new file mode 100644 index 000000000..a65cb0c26 --- /dev/null +++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s @@ -0,0 +1,124 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// The signals handled by sigHandler. +#define SIGBUS 7 +#define SIGSEGV 11 + +// Offsets to the registers in context->uc_mcontext.gregs[]. +#define REG_RDI 0x68 +#define REG_RAX 0x90 +#define REG_IP 0xa8 + +// Offset to the si_addr field of siginfo. +#define SI_CODE 0x08 +#define SI_ADDR 0x10 + +// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must +// not be set up as a handler to any other signals. +// +// If the instruction causing the signal is within a safecopy-protected +// function, the signal is handled such that execution resumes in the +// appropriate fault handling stub with AX containing the faulting address and +// DI containing the signal number. Otherwise control is transferred to the +// previously configured signal handler (savedSigSegvHandler or +// savedSigBusHandler). 
+// +// This function cannot be written in go because it runs whenever a signal is +// received by the thread (preempting whatever was running), which includes when +// garbage collector has stopped or isn't expecting any interactions (like +// barriers). +// +// The arguments are the following: +// DI - The signal number. +// SI - Pointer to siginfo_t structure. +// DX - Pointer to ucontext structure. +TEXT ·signalHandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel. + MOVQ $0x0, CX + CMPL CX, SI_CODE(SI) + JGE original_handler + + // Check if RIP is within the area we care about. + MOVQ REG_IP(DX), CX + CMPQ CX, ·memcpyBegin(SB) + JB not_memcpy + CMPQ CX, ·memcpyEnd(SB) + JAE not_memcpy + + // Modify the context such that execution will resume in the fault + // handler. + LEAQ handleMemcpyFault(SB), CX + JMP handle_fault + +not_memcpy: + CMPQ CX, ·memclrBegin(SB) + JB not_memclr + CMPQ CX, ·memclrEnd(SB) + JAE not_memclr + + LEAQ handleMemclrFault(SB), CX + JMP handle_fault + +not_memclr: + CMPQ CX, ·swapUint32Begin(SB) + JB not_swapuint32 + CMPQ CX, ·swapUint32End(SB) + JAE not_swapuint32 + + LEAQ handleSwapUint32Fault(SB), CX + JMP handle_fault + +not_swapuint32: + CMPQ CX, ·swapUint64Begin(SB) + JB not_swapuint64 + CMPQ CX, ·swapUint64End(SB) + JAE not_swapuint64 + + LEAQ handleSwapUint64Fault(SB), CX + JMP handle_fault + +not_swapuint64: + CMPQ CX, ·compareAndSwapUint32Begin(SB) + JB not_casuint32 + CMPQ CX, ·compareAndSwapUint32End(SB) + JAE not_casuint32 + + LEAQ handleCompareAndSwapUint32Fault(SB), CX + JMP handle_fault + +not_casuint32: +original_handler: + // Jump to the previous signal handler, which is likely the golang one. + XORQ CX, CX + MOVQ ·savedSigBusHandler(SB), AX + CMPL DI, $SIGSEGV + CMOVQEQ ·savedSigSegVHandler(SB), AX + JMP AX + +handle_fault: + // Entered with the address of the fault handler in RCX; store it in + // RIP. + MOVQ CX, REG_IP(DX) + + // Store the faulting address in RAX. 
+ MOVQ SI_ADDR(SI), CX + MOVQ CX, REG_RAX(DX) + + // Store the signal number in EDI. + MOVL DI, REG_RDI(DX) + + RET diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD new file mode 100644 index 000000000..dc4cfce41 --- /dev/null +++ b/pkg/sentry/safemem/BUILD @@ -0,0 +1,28 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "safemem", + srcs = [ + "block_unsafe.go", + "io.go", + "safemem.go", + "seq_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/safemem", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/platform/safecopy", + ], +) + +go_test( + name = "safemem_test", + size = "small", + srcs = [ + "io_test.go", + "seq_test.go", + ], + embed = [":safemem"], +) diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go new file mode 100644 index 000000000..0b58f6497 --- /dev/null +++ b/pkg/sentry/safemem/block_unsafe.go @@ -0,0 +1,269 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "fmt" + "reflect" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" +) + +// A Block is a range of contiguous bytes, similar to []byte but with the +// following differences: +// +// - The memory represented by a Block may require the use of safecopy to +// access. 
+// +// - Block does not carry a capacity and cannot be expanded. +// +// Blocks are immutable and may be copied by value. The zero value of Block +// represents an empty range, analogous to a nil []byte. +type Block struct { + // [start, start+length) is the represented memory. + // + // start is an unsafe.Pointer to ensure that Block prevents the represented + // memory from being garbage-collected. + start unsafe.Pointer + length int + + // needSafecopy is true if accessing the represented memory requires the + // use of safecopy. + needSafecopy bool +} + +// BlockFromSafeSlice returns a Block equivalent to slice, which is safe to +// access without safecopy. +func BlockFromSafeSlice(slice []byte) Block { + return blockFromSlice(slice, false) +} + +// BlockFromUnsafeSlice returns a Block equivalent to bs, which is not safe to +// access without safecopy. +func BlockFromUnsafeSlice(slice []byte) Block { + return blockFromSlice(slice, true) +} + +func blockFromSlice(slice []byte, needSafecopy bool) Block { + if len(slice) == 0 { + return Block{} + } + return Block{ + start: unsafe.Pointer(&slice[0]), + length: len(slice), + needSafecopy: needSafecopy, + } +} + +// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+len), which is +// safe to access without safecopy. +// +// Preconditions: ptr+len does not overflow. +func BlockFromSafePointer(ptr unsafe.Pointer, len int) Block { + return blockFromPointer(ptr, len, false) +} + +// BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which +// is not safe to access without safecopy. +// +// Preconditions: ptr+len does not overflow. 
+func BlockFromUnsafePointer(ptr unsafe.Pointer, len int) Block { + return blockFromPointer(ptr, len, true) +} + +func blockFromPointer(ptr unsafe.Pointer, len int, needSafecopy bool) Block { + if uptr := uintptr(ptr); uptr+uintptr(len) < uptr { + panic(fmt.Sprintf("ptr %#x + len %#x overflows", ptr, len)) + } + return Block{ + start: ptr, + length: len, + needSafecopy: needSafecopy, + } +} + +// DropFirst returns a Block equivalent to b, but with the first n bytes +// omitted. It is analogous to the [n:] operation on a slice, except that if n +// > b.Len(), DropFirst returns an empty Block instead of panicking. +// +// Preconditions: n >= 0. +func (b Block) DropFirst(n int) Block { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return b.DropFirst64(uint64(n)) +} + +// DropFirst64 is equivalent to DropFirst but takes a uint64. +func (b Block) DropFirst64(n uint64) Block { + if n >= uint64(b.length) { + return Block{} + } + return Block{ + start: unsafe.Pointer(uintptr(b.start) + uintptr(n)), + length: b.length - int(n), + needSafecopy: b.needSafecopy, + } +} + +// TakeFirst returns a Block equivalent to the first n bytes of b. It is +// analogous to the [:n] operation on a slice, except that if n > b.Len(), +// TakeFirst returns a copy of b instead of panicking. +// +// Preconditions: n >= 0. +func (b Block) TakeFirst(n int) Block { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return b.TakeFirst64(uint64(n)) +} + +// TakeFirst64 is equivalent to TakeFirst but takes a uint64. +func (b Block) TakeFirst64(n uint64) Block { + if n == 0 { + return Block{} + } + if n >= uint64(b.length) { + return b + } + return Block{ + start: b.start, + length: int(n), + needSafecopy: b.needSafecopy, + } +} + +// ToSlice returns a []byte equivalent to b. 
+func (b Block) ToSlice() []byte { + var bs []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) + hdr.Data = uintptr(b.start) + hdr.Len = b.length + hdr.Cap = b.length + return bs +} + +// Addr returns b's start address as a uintptr. It returns uintptr instead of +// unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers +// without importing the unsafe package explicitly. +// +// Note that a uintptr is not recognized as a pointer by the garbage collector, +// such that if there are no uses of b after a call to b.Addr() and the address +// is to Go-managed memory, the returned uintptr does not prevent garbage +// collection of the pointee. +func (b Block) Addr() uintptr { + return uintptr(b.start) +} + +// Len returns b's length in bytes. +func (b Block) Len() int { + return b.length +} + +// NeedSafecopy returns true if accessing b.ToSlice() requires the use of safecopy. +func (b Block) NeedSafecopy() bool { + return b.needSafecopy +} + +// String implements fmt.Stringer.String. +func (b Block) String() string { + if uintptr(b.start) == 0 && b.length == 0 { + return "" + } + var suffix string + if b.needSafecopy { + suffix = "*" + } + return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix) +} + +// Copy copies src.Len() or dst.Len() bytes, whichever is less, from src +// to dst and returns the number of bytes copied. +// +// If src and dst overlap, the data stored in dst is unspecified. 
+func Copy(dst, src Block) (int, error) { + if !dst.needSafecopy && !src.needSafecopy { + return copy(dst.ToSlice(), src.ToSlice()), nil + } + + n := dst.length + if n > src.length { + n = src.length + } + if n == 0 { + return 0, nil + } + + switch { + case dst.needSafecopy && !src.needSafecopy: + return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice()) + case !dst.needSafecopy && src.needSafecopy: + return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start) + case dst.needSafecopy && src.needSafecopy: + n64, err := safecopy.Copy(dst.start, src.start, uintptr(n)) + return int(n64), err + default: + panic("unreachable") + } +} + +// Zero sets all bytes in dst to 0 and returns the number of bytes zeroed. +func Zero(dst Block) (int, error) { + if !dst.needSafecopy { + bs := dst.ToSlice() + for i := range bs { + bs[i] = 0 + } + return len(bs), nil + } + + n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length)) + return int(n64), err +} + +// Safecopy atomics are no slower than non-safecopy atomics, so use the former +// even when !b.needSafecopy to get consistent alignment checking. + +// SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b. +// +// Preconditions: b.Len() >= 4. +func SwapUint32(b Block, new uint32) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.SwapUint32(b.start, new) +} + +// SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b. +// +// Preconditions: b.Len() >= 8. +func SwapUint64(b Block, new uint64) (uint64, error) { + if b.length < 8 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.SwapUint64(b.start, new) +} + +// CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4 +// bytes of b. +// +// Preconditions: b.Len() >= 4. 
+func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.CompareAndSwapUint32(b.start, old, new) +} diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go new file mode 100644 index 000000000..fd917648b --- /dev/null +++ b/pkg/sentry/safemem/io.go @@ -0,0 +1,339 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "errors" + "io" + "math" +) + +// ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write +// beyond the end of the BlockSeq. +var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq") + +// Reader represents a streaming byte source like io.Reader. +type Reader interface { + // ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the + // number of bytes read. It may return a partial read without an error + // (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a + // full read with an error (i.e. (dsts.NumBytes(), err) where err != nil); + // note that this differs from io.Reader.Read (in particular, io.EOF should + // not be returned if ReadToBlocks successfully reads dsts.NumBytes() + // bytes.) + ReadToBlocks(dsts BlockSeq) (uint64, error) +} + +// Writer represents a streaming byte sink like io.Writer. 
+type Writer interface { + // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns + // the number of bytes written. It may return a partial write without an + // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not + // return a full write with an error (i.e. srcs.NumBytes(), err) where err + // != nil). + WriteFromBlocks(srcs BlockSeq) (uint64, error) +} + +// ReadFullToBlocks repeatedly invokes r.ReadToBlocks until dsts.NumBytes() +// bytes have been read or ReadToBlocks returns an error. +func ReadFullToBlocks(r Reader, dsts BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() { + n, err := r.ReadToBlocks(dsts) + done += n + if err != nil { + return done, err + } + dsts = dsts.DropFirst64(n) + } + return done, nil +} + +// WriteFullFromBlocks repeatedly invokes w.WriteFromBlocks until +// srcs.NumBytes() bytes have been written or WriteFromBlocks returns an error. +func WriteFullFromBlocks(w Writer, srcs BlockSeq) (uint64, error) { + var done uint64 + for !srcs.IsEmpty() { + n, err := w.WriteFromBlocks(srcs) + done += n + if err != nil { + return done, err + } + srcs = srcs.DropFirst64(n) + } + return done, nil +} + +// BlockSeqReader implements Reader by reading from a BlockSeq. +type BlockSeqReader struct { + Blocks BlockSeq +} + +// ReadToBlocks implements Reader.ReadToBlocks. +func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { + n, err := CopySeq(dsts, r.Blocks) + r.Blocks = r.Blocks.DropFirst64(n) + if err != nil { + return n, err + } + if n < dsts.NumBytes() { + return n, io.EOF + } + return n, nil +} + +// BlockSeqWriter implements Writer by writing to a BlockSeq. +type BlockSeqWriter struct { + Blocks BlockSeq +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. 
+func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + n, err := CopySeq(w.Blocks, srcs) + w.Blocks = w.Blocks.DropFirst64(n) + if err != nil { + return n, err + } + if n < srcs.NumBytes() { + return n, ErrEndOfBlockSeq + } + return n, nil +} + +// ReaderFunc implements Reader for a function with the semantics of +// Reader.ReadToBlocks. +type ReaderFunc func(dsts BlockSeq) (uint64, error) + +// ReadToBlocks implements Reader.ReadToBlocks. +func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { + return f(dsts) +} + +// WriterFunc implements Writer for a function with the semantics of +// Writer.WriteFromBlocks. +type WriterFunc func(srcs BlockSeq) (uint64, error) + +// WriteFromBlocks implements Writer.WriteFromBlocks. +func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + return f(srcs) +} + +// ToIOReader implements io.Reader for a (safemem.)Reader. +// +// ToIOReader will return a successful partial read iff Reader.ReadToBlocks does +// so. +type ToIOReader struct { + Reader Reader +} + +// Read implements io.Reader.Read. +func (r ToIOReader) Read(dst []byte) (int, error) { + n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst))) + return int(n), err +} + +// ToIOWriter implements io.Writer for a (safemem.)Writer. +type ToIOWriter struct { + Writer Writer +} + +// Write implements io.Writer.Write. +func (w ToIOWriter) Write(src []byte) (int, error) { + // io.Writer does not permit partial writes. + n, err := WriteFullFromBlocks(w.Writer, BlockSeqOf(BlockFromSafeSlice(src))) + return int(n), err +} + +// FromIOReader implements Reader for an io.Reader by repeatedly invoking +// io.Reader.Read until it returns an error or partial read. +// +// FromIOReader will return a successful partial read iff Reader.Read does so. +type FromIOReader struct { + Reader io.Reader +} + +// ReadToBlocks implements Reader.ReadToBlocks. 
+func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !dsts.IsEmpty() { + dst := dsts.Head() + var n int + var err error + n, buf, err = r.readToBlock(dst, buf) + done += uint64(n) + if n != dst.Len() { + return done, err + } + dsts = dsts.Tail() + if err != nil { + if dsts.IsEmpty() && err == io.EOF { + return done, nil + } + return done, err + } + } + return done, nil +} + +func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { + // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !dst.NeedSafecopy() { + n, err := r.Reader.Read(dst.ToSlice()) + return n, buf, err + } + if len(buf) < dst.Len() { + buf = make([]byte, dst.Len()) + } + rn, rerr := r.Reader.Read(buf[:dst.Len()]) + wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) + if wberr != nil { + return wbn, buf, wberr + } + return wbn, buf, rerr +} + +// FromIOWriter implements Writer for an io.Writer by repeatedly invoking +// io.Writer.Write until it returns an error or partial write. +// +// FromIOWriter will tolerate implementations of io.Writer.Write that return +// partial writes with a nil error in contravention of io.Writer's +// requirements, since Writer is permitted to do so. FromIOWriter will return a +// successful partial write iff Writer.Write does so. +type FromIOWriter struct { + Writer io.Writer +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. 
+func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !srcs.IsEmpty() { + src := srcs.Head() + var n int + var err error + n, buf, err = w.writeFromBlock(src, buf) + done += uint64(n) + if n != src.Len() || err != nil { + return done, err + } + srcs = srcs.Tail() + } + return done, nil +} + +func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) { + // io.Writer isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !src.NeedSafecopy() { + n, err := w.Writer.Write(src.ToSlice()) + return n, buf, err + } + if len(buf) < src.Len() { + buf = make([]byte, src.Len()) + } + bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src) + wn, werr := w.Writer.Write(buf[:bufn]) + if werr != nil { + return wn, buf, werr + } + return wn, buf, buferr +} + +// FromVecReaderFunc implements Reader for a function that reads data into a +// [][]byte and returns the number of bytes read as an int64. +type FromVecReaderFunc struct { + ReadVec func(dsts [][]byte) (int64, error) +} + +// ReadToBlocks implements Reader.ReadToBlocks. +// +// ReadToBlocks calls r.ReadVec at most once. +func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + // Ensure that we don't pass a [][]byte with a total length > MaxInt64. + dsts = dsts.TakeFirst64(uint64(math.MaxInt64)) + dstSlices := make([][]byte, 0, dsts.NumBlocks()) + // Buffer Blocks that require safecopy. 
+ for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() { + dst := tmp.Head() + if dst.NeedSafecopy() { + dstSlices = append(dstSlices, make([]byte, dst.Len())) + } else { + dstSlices = append(dstSlices, dst.ToSlice()) + } + } + rn, rerr := r.ReadVec(dstSlices) + dsts = dsts.TakeFirst64(uint64(rn)) + var done uint64 + var i int + for !dsts.IsEmpty() { + dst := dsts.Head() + if dst.NeedSafecopy() { + n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i])) + done += uint64(n) + if err != nil { + return done, err + } + } else { + done += uint64(dst.Len()) + } + dsts = dsts.Tail() + i++ + } + return done, rerr +} + +// FromVecWriterFunc implements Writer for a function that writes data from a +// [][]byte and returns the number of bytes written. +type FromVecWriterFunc struct { + WriteVec func(srcs [][]byte) (int64, error) +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. +// +// WriteFromBlocks calls w.WriteVec at most once. +func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + // Ensure that we don't pass a [][]byte with a total length > MaxInt64. + srcs = srcs.TakeFirst64(uint64(math.MaxInt64)) + srcSlices := make([][]byte, 0, srcs.NumBlocks()) + // Buffer Blocks that require safecopy. + var buferr error + for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() { + src := tmp.Head() + if src.NeedSafecopy() { + slice := make([]byte, src.Len()) + n, err := Copy(BlockFromSafeSlice(slice), src) + srcSlices = append(srcSlices, slice[:n]) + if err != nil { + buferr = err + break + } + } else { + srcSlices = append(srcSlices, src.ToSlice()) + } + } + n, err := w.WriteVec(srcSlices) + if err != nil { + return uint64(n), err + } + return uint64(n), buferr +} diff --git a/pkg/sentry/safemem/io_test.go b/pkg/sentry/safemem/io_test.go new file mode 100644 index 000000000..edac4c1d7 --- /dev/null +++ b/pkg/sentry/safemem/io_test.go @@ -0,0 +1,199 @@ +// Copyright 2018 Google Inc. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package safemem

import (
	"bytes"
	"io"
	"testing"
)

// makeBlocks wraps each of the given byte slices in a Block via
// BlockFromSafeSlice.
func makeBlocks(slices ...[]byte) []Block {
	blocks := make([]Block, 0, len(slices))
	for _, s := range slices {
		blocks = append(blocks, BlockFromSafeSlice(s))
	}
	return blocks
}

func TestFromIOReaderFullRead(t *testing.T) {
	r := FromIOReader{bytes.NewBufferString("foobar")}
	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
	if wantN := uint64(6); n != wantN || err != nil {
		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
	}
	for i, want := range [][]byte{[]byte("foo"), []byte("bar")} {
		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
		}
	}
}

// eofHidingReader suppresses io.EOF from the wrapped Reader so that callers
// observe short reads with a nil error instead.
type eofHidingReader struct {
	Reader io.Reader
}

func (r eofHidingReader) Read(dst []byte) (int, error) {
	n, err := r.Reader.Read(dst)
	if err == io.EOF {
		return n, nil
	}
	return n, err
}

func TestFromIOReaderPartialRead(t *testing.T) {
	r := FromIOReader{eofHidingReader{bytes.NewBufferString("foob")}}
	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
	// FromIOReader should stop after the eofHidingReader returns (1, nil)
	// for a 3-byte read.
	if wantN := uint64(4); n != wantN || err != nil {
		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
	}
	for i, want := range [][]byte{[]byte("foo"), []byte("b\x00\x00")} {
		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
		}
	}
}

// singleByteReader forwards at most one byte per Read call to the wrapped
// Reader, forcing short reads.
type singleByteReader struct {
	Reader io.Reader
}

func (r singleByteReader) Read(dst []byte) (int, error) {
	if len(dst) == 0 {
		return r.Reader.Read(dst)
	}
	return r.Reader.Read(dst[:1])
}

func TestSingleByteReader(t *testing.T) {
	r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}}
	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
	n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts))
	// FromIOReader should stop after the singleByteReader returns (1, nil)
	// for a 3-byte read.
	if wantN := uint64(1); n != wantN || err != nil {
		t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
	}
	for i, want := range [][]byte{[]byte("f\x00\x00"), []byte("\x00\x00\x00")} {
		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
		}
	}
}

func TestReadFullToBlocks(t *testing.T) {
	r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}}
	dsts := makeBlocks(make([]byte, 3), make([]byte, 3))
	n, err := ReadFullToBlocks(r, BlockSeqFromSlice(dsts))
	// ReadFullToBlocks should call into FromIOReader => singleByteReader
	// repeatedly until dsts is exhausted.
	if wantN := uint64(6); n != wantN || err != nil {
		t.Errorf("ReadFullToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
	}
	for i, want := range [][]byte{[]byte("foo"), []byte("bar")} {
		if got := dsts[i].ToSlice(); !bytes.Equal(got, want) {
			t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want)
		}
	}
}

func TestFromIOWriterFullWrite(t *testing.T) {
	srcs := makeBlocks([]byte("foo"), []byte("bar"))
	var dst bytes.Buffer
	w := FromIOWriter{&dst}
	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
	if wantN := uint64(6); n != wantN || err != nil {
		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
	}
	if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) {
		t.Errorf("dst: got %q, wanted %q", got, want)
	}
}

// limitedWriter accepts at most Limit total bytes, truncating writes that
// would exceed the limit (so the final write before the limit is short).
type limitedWriter struct {
	Writer io.Writer
	Done   int
	Limit  int
}

func (w *limitedWriter) Write(src []byte) (int, error) {
	count := len(src)
	if count > (w.Limit - w.Done) {
		count = w.Limit - w.Done
	}
	n, err := w.Writer.Write(src[:count])
	w.Done += n
	return n, err
}

func TestFromIOWriterPartialWrite(t *testing.T) {
	srcs := makeBlocks([]byte("foo"), []byte("bar"))
	var dst bytes.Buffer
	w := FromIOWriter{&limitedWriter{&dst, 0, 4}}
	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
	// FromIOWriter should stop after the limitedWriter returns (1, nil) for a
	// 3-byte write.
	if wantN := uint64(4); n != wantN || err != nil {
		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
	}
	if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) {
		t.Errorf("dst: got %q, wanted %q", got, want)
	}
}

// singleByteWriter forwards at most one byte per Write call to the wrapped
// Writer, forcing short writes.
type singleByteWriter struct {
	Writer io.Writer
}

func (w singleByteWriter) Write(src []byte) (int, error) {
	if len(src) == 0 {
		return w.Writer.Write(src)
	}
	return w.Writer.Write(src[:1])
}

func TestSingleByteWriter(t *testing.T) {
	srcs := makeBlocks([]byte("foo"), []byte("bar"))
	var dst bytes.Buffer
	w := FromIOWriter{singleByteWriter{&dst}}
	n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs))
	// FromIOWriter should stop after the singleByteWriter returns (1, nil)
	// for a 3-byte write.
	if wantN := uint64(1); n != wantN || err != nil {
		t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
	}
	if got, want := dst.Bytes(), []byte("f"); !bytes.Equal(got, want) {
		t.Errorf("dst: got %q, wanted %q", got, want)
	}
}

func TestWriteFullToBlocks(t *testing.T) {
	srcs := makeBlocks([]byte("foo"), []byte("bar"))
	var dst bytes.Buffer
	w := FromIOWriter{singleByteWriter{&dst}}
	n, err := WriteFullFromBlocks(w, BlockSeqFromSlice(srcs))
	// WriteFullFromBlocks should call into FromIOWriter => singleByteWriter
	// repeatedly until srcs is exhausted.
	if wantN := uint64(6); n != wantN || err != nil {
		t.Errorf("WriteFullFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN)
	}
	if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) {
		t.Errorf("dst: got %q, wanted %q", got, want)
	}
}
diff --git a/pkg/sentry/safemem/safemem.go b/pkg/sentry/safemem/safemem.go
new file mode 100644
index 000000000..2f8002004
--- /dev/null
+++ b/pkg/sentry/safemem/safemem.go
@@ -0,0 +1,16 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package safemem provides the Block and BlockSeq types.
package safemem
diff --git a/pkg/sentry/safemem/seq_test.go b/pkg/sentry/safemem/seq_test.go
new file mode 100644
index 000000000..3e83b3851
--- /dev/null
+++ b/pkg/sentry/safemem/seq_test.go
@@ -0,0 +1,196 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package safemem

import (
	"bytes"
	"reflect"
	"testing"
)

// blockSeqTest describes a BlockSeq built from pieces, optionally windowed by
// an offset (DropFirst64) and/or a limit (TakeFirst64), and the string its
// contents should equal.
type blockSeqTest struct {
	desc string

	pieces     []string
	haveOffset bool
	offset     uint64
	haveLimit  bool
	limit      uint64

	want string
}

// NonEmptyByteSlices returns the expected non-empty per-Block contents of the
// windowed sequence, applying offset and limit piece by piece.
func (t blockSeqTest) NonEmptyByteSlices() [][]byte {
	// t is a value, so we can mutate it freely.
	slices := make([][]byte, 0, len(t.pieces))
	for _, str := range t.pieces {
		if t.haveOffset {
			strOff := t.offset
			if strOff > uint64(len(str)) {
				strOff = uint64(len(str))
			}
			str = str[strOff:]
			t.offset -= strOff
		}
		if t.haveLimit {
			strLim := t.limit
			if strLim > uint64(len(str)) {
				strLim = uint64(len(str))
			}
			str = str[:strLim]
			t.limit -= strLim
		}
		if len(str) != 0 {
			slices = append(slices, []byte(str))
		}
	}
	return slices
}

// BlockSeq constructs the BlockSeq under test from the pieces, applying the
// optional offset and limit.
func (t blockSeqTest) BlockSeq() BlockSeq {
	blocks := make([]Block, 0, len(t.pieces))
	for _, str := range t.pieces {
		blocks = append(blocks, BlockFromSafeSlice([]byte(str)))
	}
	bs := BlockSeqFromSlice(blocks)
	if t.haveOffset {
		bs = bs.DropFirst64(t.offset)
	}
	if t.haveLimit {
		bs = bs.TakeFirst64(t.limit)
	}
	return bs
}

var blockSeqTests = []blockSeqTest{
	{
		desc: "Empty sequence",
	},
	{
		desc:   "Sequence of length 1",
		pieces: []string{"foobar"},
		want:   "foobar",
	},
	{
		desc:   "Sequence of length 2",
		pieces: []string{"foo", "bar"},
		want:   "foobar",
	},
	{
		desc:   "Empty Blocks",
		pieces: []string{"", "foo", "", "", "bar", ""},
		want:   "foobar",
	},
	{
		desc:       "Sequence with non-zero offset",
		pieces:     []string{"foo", "bar"},
		haveOffset: true,
		offset:     2,
		want:       "obar",
	},
	{
		desc:      "Sequence with non-maximal limit",
		pieces:    []string{"foo", "bar"},
		haveLimit: true,
		limit:     5,
		want:      "fooba",
	},
	{
		desc:       "Sequence with offset and limit",
		pieces:     []string{"foo", "bar"},
		haveOffset: true,
		offset:     2,
		haveLimit:  true,
		limit:      3,
		want:       "oba",
	},
}

func TestBlockSeqNumBytes(t *testing.T) {
	for _, test := range blockSeqTests {
		t.Run(test.desc, func(t *testing.T) {
			if got, want := test.BlockSeq().NumBytes(), uint64(len(test.want)); got != want {
				t.Errorf("NumBytes: got %d, wanted %d", got, want)
			}
		})
	}
}

func TestBlockSeqIterBlocks(t *testing.T) {
	// Tests BlockSeq iteration using Head/Tail.
	for _, test := range blockSeqTests {
		t.Run(test.desc, func(t *testing.T) {
			srcs := test.BlockSeq()
			// "Note that a non-nil empty slice and a nil slice ... are not
			// deeply equal." - reflect
			slices := make([][]byte, 0, 0)
			for !srcs.IsEmpty() {
				src := srcs.Head()
				slices = append(slices, src.ToSlice())
				nextSrcs := srcs.Tail()
				if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-uint64(src.Len()); got != want {
					t.Fatalf("%v.Tail(): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want)
				}
				srcs = nextSrcs
			}
			if wantSlices := test.NonEmptyByteSlices(); !reflect.DeepEqual(slices, wantSlices) {
				t.Errorf("Accumulated slices: got %v, wanted %v", slices, wantSlices)
			}
		})
	}
}

func TestBlockSeqIterBytes(t *testing.T) {
	// Tests BlockSeq iteration using Head/DropFirst.
	for _, test := range blockSeqTests {
		t.Run(test.desc, func(t *testing.T) {
			srcs := test.BlockSeq()
			var dst bytes.Buffer
			for !srcs.IsEmpty() {
				src := srcs.Head()
				var b [1]byte
				n, err := Copy(BlockFromSafeSlice(b[:]), src)
				if n != 1 || err != nil {
					t.Fatalf("Copy: got (%v, %v), wanted (1, nil)", n, err)
				}
				dst.WriteByte(b[0])
				nextSrcs := srcs.DropFirst(1)
				if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-1; got != want {
					t.Fatalf("%v.DropFirst(1): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want)
				}
				srcs = nextSrcs
			}
			if got := string(dst.Bytes()); got != test.want {
				t.Errorf("Copied string: got %q, wanted %q", got, test.want)
			}
		})
	}
}

func TestBlockSeqDropBeyondLimit(t *testing.T) {
	blocks := []Block{BlockFromSafeSlice([]byte("123")), BlockFromSafeSlice([]byte("4"))}
	bs := BlockSeqFromSlice(blocks)
	if got, want := bs.NumBytes(), uint64(4); got != want {
		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
	}
	bs = bs.TakeFirst(1)
	if got, want := bs.NumBytes(), uint64(1); got != want {
		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
	}
	// Dropping more bytes than the limit allows must yield an empty sequence,
	// not underflow.
	bs = bs.DropFirst(2)
	if got, want := bs.NumBytes(), uint64(0); got != want {
		t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want)
	}
}
diff --git a/pkg/sentry/safemem/seq_unsafe.go b/pkg/sentry/safemem/seq_unsafe.go
new file mode 100644
index 000000000..e0d29a0b3
--- /dev/null
+++ b/pkg/sentry/safemem/seq_unsafe.go
@@ -0,0 +1,299 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package safemem

import (
	"bytes"
	"fmt"
	"reflect"
	"unsafe"
)

// A BlockSeq represents a sequence of Blocks, each of which has non-zero
// length.
//
// BlockSeqs are immutable and may be copied by value. The zero value of
// BlockSeq represents an empty sequence.
type BlockSeq struct {
	// If length is 0, then the BlockSeq is empty. Invariants: data == 0;
	// offset == 0; limit == 0.
	//
	// If length is -1, then the BlockSeq represents the single Block{data,
	// limit, false}. Invariants: offset == 0; limit > 0; limit does not
	// overflow the range of an int.
	//
	// If length is -2, then the BlockSeq represents the single Block{data,
	// limit, true}. Invariants: offset == 0; limit > 0; limit does not
	// overflow the range of an int.
	//
	// Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks
	// in the array of Blocks starting at address `data`, starting at `offset`
	// bytes into the first Block and limited to the following `limit` bytes.
	// Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <=
	// the combined length of all Blocks in the array; the first Block in the
	// array has non-zero length.
	//
	// length is never 1; sequences consisting of a single Block are always
	// stored inline (with length < 0).
	data   unsafe.Pointer
	length int
	offset int
	limit  uint64
}

// BlockSeqOf returns a BlockSeq representing the single Block b.
func BlockSeqOf(b Block) BlockSeq {
	bs := BlockSeq{
		data:   b.start,
		length: -1,
		limit:  uint64(b.length),
	}
	// length -2 encodes the needSafecopy flag for the inline single-Block
	// representation (see the BlockSeq field comments).
	if b.needSafecopy {
		bs.length = -2
	}
	return bs
}

// BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice.
// If slice contains Blocks with zero length, BlockSeq will skip them during
// iteration.
//
// Whether the returned BlockSeq shares memory with slice is unspecified;
// clients should avoid mutating slices passed to BlockSeqFromSlice.
//
// Preconditions: The combined length of all Blocks in slice <= math.MaxUint64.
func BlockSeqFromSlice(slice []Block) BlockSeq {
	slice = skipEmpty(slice)
	var limit uint64
	for _, b := range slice {
		// Detect uint64 overflow of the combined length.
		sum := limit + uint64(b.Len())
		if sum < limit {
			panic("BlockSeq length overflows uint64")
		}
		limit = sum
	}
	return blockSeqFromSliceLimited(slice, limit)
}

// blockSeqFromSliceLimited constructs a BlockSeq over slice with the given
// byte limit, using the inline representation for 0- and 1-Block slices.
//
// Preconditions: The combined length of all Blocks in slice <= limit. If
// len(slice) != 0, the first Block in slice has non-zero length, and limit >
// 0.
func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq {
	switch len(slice) {
	case 0:
		return BlockSeq{}
	case 1:
		return BlockSeqOf(slice[0].TakeFirst64(limit))
	default:
		return BlockSeq{
			data:   unsafe.Pointer(&slice[0]),
			length: len(slice),
			limit:  limit,
		}
	}
}

// skipEmpty returns the suffix of slice beginning at its first Block with
// non-zero length, or nil if there is none.
func skipEmpty(slice []Block) []Block {
	for i, b := range slice {
		if b.Len() != 0 {
			return slice[i:]
		}
	}
	return nil
}

// IsEmpty returns true if bs contains no Blocks.
//
// Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0).
// (Of these, prefer to use bs.IsEmpty().)
func (bs BlockSeq) IsEmpty() bool {
	return bs.length == 0
}

// NumBlocks returns the number of Blocks in bs.
func (bs BlockSeq) NumBlocks() int {
	// In general, we have to count: if bs represents a windowed slice then the
	// slice may contain Blocks with zero length, and bs.length may be larger
	// than the actual number of Blocks due to bs.limit.
	var n int
	for !bs.IsEmpty() {
		n++
		bs = bs.Tail()
	}
	return n
}

// NumBytes returns the sum of Block.Len() for all Blocks in bs.
func (bs BlockSeq) NumBytes() uint64 {
	return bs.limit
}

// Head returns the first Block in bs.
//
// Preconditions: !bs.IsEmpty().
func (bs BlockSeq) Head() Block {
	if bs.length == 0 {
		panic("empty BlockSeq")
	}
	// Negative length means the inline single-Block representation.
	if bs.length < 0 {
		return bs.internalBlock()
	}
	// Apply the window (offset, limit) to the first Block of the array.
	return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit)
}

// internalBlock reconstructs the single Block stored inline in bs.
//
// Preconditions: bs.length < 0.
func (bs BlockSeq) internalBlock() Block {
	return Block{
		start:        bs.data,
		length:       int(bs.limit),
		needSafecopy: bs.length == -2,
	}
}

// Tail returns a BlockSeq consisting of all Blocks in bs after the first.
//
// Preconditions: !bs.IsEmpty().
func (bs BlockSeq) Tail() BlockSeq {
	if bs.length == 0 {
		panic("empty BlockSeq")
	}
	// An inline single-Block sequence has an empty tail.
	if bs.length < 0 {
		return BlockSeq{}
	}
	head := (*Block)(bs.data).DropFirst(bs.offset)
	headLen := uint64(head.Len())
	if headLen >= bs.limit {
		// The head Block exhausts the limit, so the tail is empty.
		return BlockSeq{}
	}
	// Rebuild a []Block view over the underlying array to slice off the head.
	var extSlice []Block
	extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice))
	extSliceHdr.Data = uintptr(bs.data)
	extSliceHdr.Len = bs.length
	extSliceHdr.Cap = bs.length
	tailSlice := skipEmpty(extSlice[1:])
	tailLimit := bs.limit - headLen
	return blockSeqFromSliceLimited(tailSlice, tailLimit)
}

// DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes
// omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq.
//
// Preconditions: n >= 0.
func (bs BlockSeq) DropFirst(n int) BlockSeq {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	return bs.DropFirst64(uint64(n))
}

// DropFirst64 is equivalent to DropFirst but takes an uint64.
func (bs BlockSeq) DropFirst64(n uint64) BlockSeq {
	// Dropping at least the limit empties the sequence (and avoids underflow
	// below).
	if n >= bs.limit {
		return BlockSeq{}
	}
	for {
		// Calling bs.Head() here is surprisingly expensive, so inline getting
		// the head's length.
		var headLen uint64
		if bs.length < 0 {
			headLen = bs.limit
		} else {
			headLen = uint64((*Block)(bs.data).Len() - bs.offset)
		}
		if n < headLen {
			// Dropping ends partway through the head Block.
			if bs.length < 0 {
				return BlockSeqOf(bs.internalBlock().DropFirst64(n))
			}
			bs.offset += int(n)
			bs.limit -= n
			return bs
		}
		n -= headLen
		bs = bs.Tail()
	}
}

// TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n >
// bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs.
//
// Preconditions: n >= 0.
func (bs BlockSeq) TakeFirst(n int) BlockSeq {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	return bs.TakeFirst64(uint64(n))
}

// TakeFirst64 is equivalent to TakeFirst but takes a uint64.
func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq {
	if n == 0 {
		return BlockSeq{}
	}
	// Only shrink the limit; taking more than is available is a no-op.
	if bs.limit > n {
		bs.limit = n
	}
	return bs
}

// String implements fmt.Stringer.String.
func (bs BlockSeq) String() string {
	var buf bytes.Buffer
	buf.WriteByte('[')
	var sep string
	for !bs.IsEmpty() {
		buf.WriteString(sep)
		sep = " "
		buf.WriteString(bs.Head().String())
		bs = bs.Tail()
	}
	buf.WriteByte(']')
	return buf.String()
}

// CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less,
// from srcs to dsts and returns the number of bytes copied.
//
// If srcs and dsts overlap, the data stored in dsts is unspecified.
func CopySeq(dsts, srcs BlockSeq) (uint64, error) {
	var done uint64
	for !dsts.IsEmpty() && !srcs.IsEmpty() {
		dst := dsts.Head()
		src := srcs.Head()
		n, err := Copy(dst, src)
		done += uint64(n)
		if err != nil {
			return done, err
		}
		// Advance both sequences by the number of bytes actually copied;
		// Copy stops at the shorter of dst and src.
		dsts = dsts.DropFirst(n)
		srcs = srcs.DropFirst(n)
	}
	return done, nil
}

// ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed.
func ZeroSeq(dsts BlockSeq) (uint64, error) {
	var done uint64
	for !dsts.IsEmpty() {
		n, err := Zero(dsts.Head())
		done += uint64(n)
		if err != nil {
			return done, err
		}
		dsts = dsts.DropFirst(n)
	}
	return done, nil
}
diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD
new file mode 100644
index 000000000..daaad7c90
--- /dev/null
+++ b/pkg/sentry/sighandling/BUILD
@@ -0,0 +1,18 @@
package(licenses = ["notice"])  # Apache 2.0

load("@io_bazel_rules_go//go:def.bzl", "go_library")

go_library(
    name = "sighandling",
    srcs = [
        "sighandling.go",
        "sighandling_unsafe.go",
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling",
    visibility = ["//pkg/sentry:internal"],
    deps = [
        "//pkg/abi/linux",
        "//pkg/sentry/arch",
        "//pkg/sentry/kernel",
    ],
)
diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go
new file mode 100644
index 000000000..1a94b535b
--- /dev/null
+++ b/pkg/sentry/sighandling/sighandling.go
@@ -0,0 +1,116 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package sighandling contains helpers for handling signals to applications.
package sighandling

import (
	"os"
	"os/signal"
	"reflect"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
)

// numSignals is the number of normal (non-realtime) signals on Linux.
const numSignals = 32

// forwardSignals listens for incoming signals and delivers them to k. It stops
// when the stop channel is closed.
func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, stop chan struct{}) {
	// Build a select case. Case 0 is the stop channel; case N is the channel
	// for signal N (see StartForwarding's registration order).
	sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}}
	for _, sigchan := range sigchans {
		sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)})
	}

	for {
		// Wait for a notification.
		index, _, ok := reflect.Select(sc)

		// Was it the stop channel?
		if index == 0 {
			if !ok {
				break
			}
			continue
		}

		// How about a different close?
		if !ok {
			panic("signal channel closed unexpectedly")
		}

		// Otherwise, it was a signal on channel N. Index 0 represents the stop
		// channel, so index N represents the channel for signal N.
		if !k.SendExternalSignal(&arch.SignalInfo{Signo: int32(index)}, "sentry") {
			// Kernel is not ready to receive signals.
			//
			// Kill ourselves if this signal would have killed the
			// process before StartForwarding was called. i.e., all
			// _SigKill signals; see Go
			// src/runtime/sigtab_linux_generic.go.
			//
			// Otherwise ignore the signal.
			//
			// TODO: Convert Go's runtime.raise from
			// tkill to tgkill so StartForwarding doesn't need to
			// be called until after filter installation.
			switch linux.Signal(index) {
			case linux.SIGHUP, linux.SIGINT, linux.SIGTERM:
				dieFromSignal(linux.Signal(index))
			}
		}
	}

	// Close all individual channels.
	for _, sigchan := range sigchans {
		signal.Stop(sigchan)
		close(sigchan)
	}
}

// StartForwarding ensures that synchronous signals are forwarded to k and
// returns a callback that stops signal forwarding.
func StartForwarding(k *kernel.Kernel) func() {
	stop := make(chan struct{})

	// Register individual channels. One channel per standard signal is
	// required as os.Notify() is non-blocking and may drop signals. To avoid
	// this, standard signals have to be queued separately. Channel size 1 is
	// enough for standard signals as their semantics allow de-duplication.
	//
	// External real-time signals are not supported. We rely on the go-runtime
	// for their handling.
	var sigchans []chan os.Signal
	for sig := 1; sig <= numSignals+1; sig++ {
		sigchan := make(chan os.Signal, 1)
		sigchans = append(sigchans, sigchan)

		// SignalPanic is handled by Run.
		if linux.Signal(sig) == kernel.SignalPanic {
			continue
		}

		signal.Notify(sigchan, syscall.Signal(sig))
	}
	// Start up our listener.
	go forwardSignals(k, sigchans, stop) // S/R-SAFE: synchronized by Kernel.extMu

	// NOTE(review): the returned callback only closes the stop channel; it
	// does not wait for the forwardSignals goroutine to drain and exit, so
	// forwarding may continue briefly after the callback returns — TODO
	// confirm whether callers require synchronous teardown.
	return func() { close(stop) }
}
diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go
new file mode 100644
index 000000000..a455b919f
--- /dev/null
+++ b/pkg/sentry/sighandling/sighandling_unsafe.go
@@ -0,0 +1,74 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sighandling

import (
	"fmt"
	"runtime"
	"syscall"
	"unsafe"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
)

// sigaction mirrors the kernel's struct sigaction as passed to rt_sigaction.
//
// TODO: Move to pkg/abi/linux along with definitions in
// pkg/sentry/arch.
type sigaction struct {
	handler  uintptr
	flags    uint64
	restorer uintptr
	mask     uint64
}

// IgnoreChildStop sets the SA_NOCLDSTOP flag, causing child processes to not
// generate SIGCHLD when they stop.
func IgnoreChildStop() error {
	var sa sigaction

	// Get the existing signal handler information, and set the flag.
	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(syscall.SIGCHLD), 0, uintptr(unsafe.Pointer(&sa)), linux.SignalSetSize, 0, 0); e != 0 {
		return e
	}
	sa.flags |= linux.SA_NOCLDSTOP
	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(syscall.SIGCHLD), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 {
		return e
	}

	return nil
}

// dieFromSignal kills the current process with sig.
//
// Preconditions: The default action of sig is termination.
func dieFromSignal(sig linux.Signal) {
	// Pin this goroutine to its OS thread so the tgkill below targets the
	// thread whose signal mask we just unblocked.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Restore the default (terminating) disposition for sig.
	sa := sigaction{handler: linux.SIG_DFL}
	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 {
		panic(fmt.Sprintf("rt_sigaction failed: %v", e))
	}

	// Ensure sig is not blocked on this thread.
	set := linux.MakeSignalSet(sig)
	if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0); e != 0 {
		panic(fmt.Sprintf("rt_sigprocmask failed: %v", e))
	}

	// Deliver sig to this thread; with the default disposition restored this
	// terminates the process and does not return.
	if err := syscall.Tgkill(syscall.Getpid(), syscall.Gettid(), syscall.Signal(sig)); err != nil {
		panic(fmt.Sprintf("tgkill failed: %v", err))
	}

	panic("failed to die")
}
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
new file mode 100644
index 000000000..87e32df37
--- /dev/null
+++ b/pkg/sentry/socket/BUILD
@@ -0,0 +1,37 @@
package(licenses = ["notice"])  # Apache 2.0

load("@io_bazel_rules_go//go:def.bzl", "go_library")
load("//tools/go_stateify:defs.bzl", "go_stateify")

go_stateify(
    name = "socket_state",
    srcs = [
        "socket.go",
    ],
    out = "socket_state_autogen.go",
    package = "socket",
)

go_library(
    name = "socket",
    srcs = [
        "socket.go",
        "socket_state_autogen.go",
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket",
    visibility = ["//pkg/sentry:internal"],
    deps = [
        "//pkg/abi/linux",
        "//pkg/sentry/context",
        "//pkg/sentry/device",
        "//pkg/sentry/fs",
        "//pkg/sentry/fs/fsutil",
        "//pkg/sentry/kernel",
        "//pkg/sentry/kernel/kdefs",
        "//pkg/sentry/kernel/time",
        "//pkg/sentry/usermem",
        "//pkg/state",
        "//pkg/syserr",
        "//pkg/tcpip/transport/unix",
    ],
)
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
new file mode 100644
index 000000000..25de2f655
--- /dev/null
+++ b/pkg/sentry/socket/control/BUILD
@@ -0,0 +1,39 @@
package(licenses = ["notice"])  # Apache 2.0
load("@io_bazel_rules_go//go:def.bzl", "go_library")
load("//tools/go_stateify:defs.bzl", "go_stateify")

# Generates save/restore (stateify) support for control.go.
go_stateify(
    name = "control_state",
    srcs = [
        "control.go",
    ],
    out = "control_state.go",
    imports = [
        "gvisor.googlesource.com/gvisor/pkg/sentry/fs",
    ],
    package = "control",
)

go_library(
    name = "control",
    srcs = [
        "control.go",
        "control_state.go",
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control",
    visibility = ["//pkg/sentry:internal"],
    deps = [
        "//pkg/abi/linux",
        "//pkg/binary",
        "//pkg/sentry/context",
        "//pkg/sentry/fs",
        "//pkg/sentry/kernel",
        "//pkg/sentry/kernel/auth",
        "//pkg/sentry/kernel/kdefs",
        "//pkg/sentry/usermem",
        "//pkg/state",
        "//pkg/syserror",
        "//pkg/tcpip/transport/unix",
    ],
)
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
new file mode 100644
index 000000000..cb34cbc85
--- /dev/null
+++ b/pkg/sentry/socket/control/control.go
@@ -0,0 +1,370 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package control provides internal representations of socket control
// messages.
package control

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/binary"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
)

// maxInt is the largest value representable by the platform's int.
const maxInt = int(^uint(0) >> 1)

// SCMCredentials represents a SCM_CREDENTIALS socket control message.
type SCMCredentials interface {
	unix.CredentialsControlMessage

	// Credentials returns properly namespaced values for the sender's pid, uid
	// and gid.
	Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID)
}

// SCMRights represents a SCM_RIGHTS socket control message.
type SCMRights interface {
	unix.RightsControlMessage

	// Files returns up to max RightsFiles.
	Files(ctx context.Context, max int) RightsFiles
}

// RightsFiles represents a SCM_RIGHTS socket control message. A reference is
// maintained for each fs.File and is release either when an FD is created or
// when the Release method is called.
type RightsFiles []*fs.File

// NewSCMRights creates a new SCM_RIGHTS socket control message representation
// using local sentry FDs.
func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) {
	files := make(RightsFiles, 0, len(fds))
	for _, fd := range fds {
		file, _ := t.FDMap().GetDescriptor(kdefs.FD(fd))
		if file == nil {
			// Drop the references already taken before failing.
			files.Release()
			return nil, syserror.EBADF
		}
		files = append(files, file)
	}
	return &files, nil
}

// Files implements SCMRights.Files.
+func (fs *RightsFiles) Files(ctx context.Context, max int) RightsFiles { + n := max + if l := len(*fs); n > l { + n = l + } + rf := (*fs)[:n] + *fs = (*fs)[n:] + return rf +} + +// Clone implements unix.RightsControlMessage.Clone. +func (fs *RightsFiles) Clone() unix.RightsControlMessage { + nfs := append(RightsFiles(nil), *fs...) + for _, nf := range nfs { + nf.IncRef() + } + return &nfs +} + +// Release implements unix.RightsControlMessage.Release. +func (fs *RightsFiles) Release() { + for _, f := range *fs { + f.DecRef() + } + *fs = nil +} + +// rightsFDs gets up to the specified maximum number of FDs. +func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) []int32 { + files := rights.Files(t, max) + fds := make([]int32, 0, len(files)) + for i := 0; i < max && len(files) > 0; i++ { + fd, err := t.FDMap().NewFDFrom(0, files[0], kernel.FDFlags{cloexec}, t.ThreadGroup().Limits()) + files[0].DecRef() + files = files[1:] + if err != nil { + t.Warningf("Error inserting FD: %v", err) + // This is what Linux does. + break + } + + fds = append(fds, int32(fd)) + } + return fds +} + +// PackRights packs as many FDs as will fit into the unused capacity of buf. +func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte) []byte { + maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 + // Linux does not return any FDs if none fit. + if maxFDs <= 0 { + return buf + } + fds := rightsFDs(t, rights, cloexec, maxFDs) + align := t.Arch().Width() + return putCmsg(buf, linux.SCM_RIGHTS, align, fds) +} + +// scmCredentials represents an SCM_CREDENTIALS socket control message. +type scmCredentials struct { + t *kernel.Task + kuid auth.KUID + kgid auth.KGID +} + +// NewSCMCredentials creates a new SCM_CREDENTIALS socket control message +// representation. 
+func NewSCMCredentials(t *kernel.Task, cred linux.ControlMessageCredentials) (SCMCredentials, error) { + tcred := t.Credentials() + kuid, err := tcred.UseUID(auth.UID(cred.UID)) + if err != nil { + return nil, err + } + kgid, err := tcred.UseGID(auth.GID(cred.GID)) + if err != nil { + return nil, err + } + if kernel.ThreadID(cred.PID) != t.ThreadGroup().ID() && !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.PIDNamespace().UserNamespace()) { + return nil, syserror.EPERM + } + return &scmCredentials{t, kuid, kgid}, nil +} + +// Equals implements unix.CredentialsControlMessage.Equals. +func (c *scmCredentials) Equals(oc unix.CredentialsControlMessage) bool { + if oc, _ := oc.(*scmCredentials); oc != nil && *c == *oc { + return true + } + return false +} + +func putUint64(buf []byte, n uint64) []byte { + usermem.ByteOrder.PutUint64(buf[len(buf):len(buf)+8], n) + return buf[:len(buf)+8] +} + +func putUint32(buf []byte, n uint32) []byte { + usermem.ByteOrder.PutUint32(buf[len(buf):len(buf)+4], n) + return buf[:len(buf)+4] +} + +// putCmsg writes a control message header and as much data as will fit into +// the unused capacity of a buffer. +func putCmsg(buf []byte, msgType uint32, align uint, data []int32) []byte { + space := AlignDown(cap(buf)-len(buf), 4) + + // We can't write to space that doesn't exist, so if we are going to align + // the available space, we must align down. + // + // align must be >= 4 and each data int32 is 4 bytes. The length of the + // header is already aligned, so if we align to the width of the data there + // are two cases: + // 1. The aligned length is less than the length of the header. The + // unaligned length was also less than the length of the header, so we + // can't write anything. + // 2. The aligned length is greater than or equal to the length of the + // header. We can write the header plus zero or more datas. We can't write + // a partial int32, so the length of the message will be + // min(aligned length, header + datas). 
+ if space < linux.SizeOfControlMessageHeader { + return buf + } + + length := 4*len(data) + linux.SizeOfControlMessageHeader + if length > space { + length = space + } + buf = putUint64(buf, uint64(length)) + buf = putUint32(buf, linux.SOL_SOCKET) + buf = putUint32(buf, msgType) + for _, d := range data { + if len(buf)+4 > cap(buf) { + break + } + buf = putUint32(buf, uint32(d)) + } + return alignSlice(buf, align) +} + +// Credentials implements SCMCredentials.Credentials. +func (c *scmCredentials) Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { + // "When a process's user and group IDs are passed over a UNIX domain + // socket to a process in a different user namespace (see the description + // of SCM_CREDENTIALS in unix(7)), they are translated into the + // corresponding values as per the receiving process's user and group ID + // mappings." - user_namespaces(7) + pid := t.PIDNamespace().IDOfTask(c.t) + uid := c.kuid.In(t.UserNamespace()).OrOverflow() + gid := c.kgid.In(t.UserNamespace()).OrOverflow() + + return pid, uid, gid +} + +// PackCredentials packs the credentials in the control message (or default +// credentials if none) into a buffer. +func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte) []byte { + align := t.Arch().Width() + + // Default credentials if none are available. + pid := kernel.ThreadID(0) + uid := auth.UID(auth.NobodyKUID) + gid := auth.GID(auth.NobodyKGID) + + if creds != nil { + pid, uid, gid = creds.Credentials(t) + } + c := []int32{int32(pid), int32(uid), int32(gid)} + return putCmsg(buf, linux.SCM_CREDENTIALS, align, c) +} + +// AlignUp rounds a length up to an alignment. align must be a power of 2. +func AlignUp(length int, align uint) int { + return (length + int(align) - 1) & ^(int(align) - 1) +} + +// AlignDown rounds a length down to an alignment. align must be a power of 2. 
+func AlignDown(length int, align uint) int { + return length & ^(int(align) - 1) +} + +// alignSlice extends a slice's length (up to the capacity) to align it. +func alignSlice(buf []byte, align uint) []byte { + aligned := AlignUp(len(buf), align) + if aligned > cap(buf) { + // Linux allows unaligned data if there isn't room for alignment. + // Since there isn't room for alignment, there isn't room for any + // additional messages either. + return buf + } + return buf[:aligned] +} + +// Parse parses a raw socket control message into portable objects. +func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.ControlMessages, error) { + var ( + fds linux.ControlMessageRights + + haveCreds bool + creds linux.ControlMessageCredentials + ) + + for i := 0; i < len(buf); { + if i+linux.SizeOfControlMessageHeader > len(buf) { + return unix.ControlMessages{}, syserror.EINVAL + } + + var h linux.ControlMessageHeader + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h) + + if h.Length < uint64(linux.SizeOfControlMessageHeader) { + return unix.ControlMessages{}, syserror.EINVAL + } + if h.Length > uint64(len(buf)-i) { + return unix.ControlMessages{}, syserror.EINVAL + } + if h.Level != linux.SOL_SOCKET { + return unix.ControlMessages{}, syserror.EINVAL + } + + i += linux.SizeOfControlMessageHeader + length := int(h.Length) - linux.SizeOfControlMessageHeader + + // The use of t.Arch().Width() is analogous to Linux's use of + // sizeof(long) in CMSG_ALIGN. 
+ width := t.Arch().Width() + + switch h.Type { + case linux.SCM_RIGHTS: + rightsSize := AlignDown(length, linux.SizeOfControlMessageRight) + numRights := rightsSize / linux.SizeOfControlMessageRight + + if len(fds)+numRights > linux.SCM_MAX_FD { + return unix.ControlMessages{}, syserror.EINVAL + } + + for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { + fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight]))) + } + + i += AlignUp(length, width) + + case linux.SCM_CREDENTIALS: + if length < linux.SizeOfControlMessageCredentials { + return unix.ControlMessages{}, syserror.EINVAL + } + + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds) + haveCreds = true + i += AlignUp(length, width) + + default: + // Unknown message type. + return unix.ControlMessages{}, syserror.EINVAL + } + } + + var credentials SCMCredentials + if haveCreds { + var err error + if credentials, err = NewSCMCredentials(t, creds); err != nil { + return unix.ControlMessages{}, err + } + } else { + credentials = makeCreds(t, socketOrEndpoint) + } + + var rights SCMRights + if len(fds) > 0 { + var err error + if rights, err = NewSCMRights(t, fds); err != nil { + return unix.ControlMessages{}, err + } + } + + if credentials == nil && rights == nil { + return unix.ControlMessages{}, nil + } + + return unix.ControlMessages{Credentials: credentials, Rights: rights}, nil +} + +func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { + if t == nil || socketOrEndpoint == nil { + return nil + } + if cr, ok := socketOrEndpoint.(unix.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) { + tcred := t.Credentials() + return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} + } + return nil +} + +// New creates default control messages if needed. 
+func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) unix.ControlMessages { + return unix.ControlMessages{ + Credentials: makeCreds(t, socketOrEndpoint), + Rights: rights, + } +} diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD new file mode 100644 index 000000000..0e463a92a --- /dev/null +++ b/pkg/sentry/socket/epsocket/BUILD @@ -0,0 +1,61 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "epsocket_state", + srcs = [ + "epsocket.go", + "save_restore.go", + "stack.go", + ], + out = "epsocket_state.go", + package = "epsocket", +) + +go_library( + name = "epsocket", + srcs = [ + "device.go", + "epsocket.go", + "epsocket_state.go", + "provider.go", + "save_restore.go", + "stack.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket", + visibility = [ + "//pkg/sentry:internal", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/safemem", + "//pkg/sentry/socket", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/epsocket/device.go b/pkg/sentry/socket/epsocket/device.go new file mode 100644 index 000000000..17f2c9559 --- /dev/null +++ b/pkg/sentry/socket/epsocket/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epsocket + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// epsocketDevice is the endpoint socket virtual device. +var epsocketDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go new file mode 100644 index 000000000..3fc3ea58f --- /dev/null +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -0,0 +1,1230 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package epsocket provides an implementation of the socket.Socket interface +// that is backed by a tcpip.Endpoint. +// +// It does not depend on any particular endpoint implementation, and thus can +// be used to expose certain endpoints to the sentry while leaving others out, +// for example, TCP endpoints and Unix-domain endpoints. 
+// +// Lock ordering: netstack => mm: ioSequencePayload copies user memory inside +// tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during +// this operation. +package epsocket + +import ( + "bytes" + "math" + "strings" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const sizeOfInt32 int = 4 + +// ntohs converts a 16-bit number from network byte order to host byte order. It +// assumes that the host is little endian. +func ntohs(v uint16) uint16 { + return v<<8 | v>>8 +} + +// htons converts a 16-bit number from host byte order to network byte order. It +// assumes that the host is little endian. +func htons(v uint16) uint16 { + return ntohs(v) +} + +// commonEndpoint represents the intersection of a tcpip.Endpoint and a +// unix.Endpoint. +type commonEndpoint interface { + // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress and + // unix.Endpoint.GetLocalAddress. 
+ GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + + // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress and + // unix.Endpoint.GetRemoteAddress. + GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) + + // Readiness implements tcpip.Endpoint.Readiness and + // unix.Endpoint.Readiness. + Readiness(mask waiter.EventMask) waiter.EventMask + + // SetSockOpt implements tcpip.Endpoint.SetSockOpt and + // unix.Endpoint.SetSockOpt. + SetSockOpt(interface{}) *tcpip.Error + + // GetSockOpt implements tcpip.Endpoint.GetSockOpt and + // unix.Endpoint.GetSockOpt. + GetSockOpt(interface{}) *tcpip.Error +} + +// SocketOperations encapsulates all the state needed to represent a network stack +// endpoint in the kernel context. +type SocketOperations struct { + socket.ReceiveTimeout + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + *waiter.Queue + + family int + stack inet.Stack + Endpoint tcpip.Endpoint + skType unix.SockType + + // readMu protects access to readView, control, and sender. + readMu sync.Mutex `state:"nosave"` + readView buffer.View + sender tcpip.FullAddress +} + +// New creates a new endpoint socket. +func New(t *kernel.Task, family int, skType unix.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) *fs.File { + dirent := socket.NewDirent(t, epsocketDevice) + return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true}, &SocketOperations{ + Queue: queue, + family: family, + stack: t.NetworkContext(), + Endpoint: endpoint, + skType: skType, + }) +} + +var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{})) +var sockAddrInet6Size = int(binary.Size(linux.SockAddrInet6{})) + +// GetAddress reads an sockaddr struct from the given address and converts it +// to the FullAddress format. It supports AF_UNIX, AF_INET and AF_INET6 +// addresses. 
+func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { + // Make sure we have at least 2 bytes for the address family. + if len(addr) < 2 { + return tcpip.FullAddress{}, syserr.ErrInvalidArgument + } + + family := usermem.ByteOrder.Uint16(addr) + if family != uint16(sfamily) { + return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported + } + + // Get the rest of the fields based on the address family. + switch family { + case linux.AF_UNIX: + path := addr[2:] + // Drop the terminating NUL (if one exists) and everything after it. + // Skip the first byte, which is NUL for abstract paths. + if len(path) > 1 { + if n := bytes.IndexByte(path[1:], 0); n >= 0 { + path = path[:n+1] + } + } + return tcpip.FullAddress{ + Addr: tcpip.Address(path), + }, nil + + case linux.AF_INET: + var a linux.SockAddrInet + if len(addr) < sockAddrInetSize { + return tcpip.FullAddress{}, syserr.ErrBadAddress + } + binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a) + + out := tcpip.FullAddress{ + Addr: tcpip.Address(a.Addr[:]), + Port: ntohs(a.Port), + } + if out.Addr == "\x00\x00\x00\x00" { + out.Addr = "" + } + return out, nil + + case linux.AF_INET6: + var a linux.SockAddrInet6 + if len(addr) < sockAddrInet6Size { + return tcpip.FullAddress{}, syserr.ErrBadAddress + } + binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a) + + out := tcpip.FullAddress{ + Addr: tcpip.Address(a.Addr[:]), + Port: ntohs(a.Port), + } + if isLinkLocal(out.Addr) { + out.NIC = tcpip.NICID(a.Scope_id) + } + if out.Addr == tcpip.Address(strings.Repeat("\x00", 16)) { + out.Addr = "" + } + return out, nil + + default: + return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported + } +} + +func (s *SocketOperations) isPacketBased() bool { + return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM +} + +// fetchReadView updates the readView field of the socket if it's currently +// empty. 
It assumes that the socket is locked. +func (s *SocketOperations) fetchReadView() *syserr.Error { + if len(s.readView) > 0 { + return nil + } + + s.readView = nil + s.sender = tcpip.FullAddress{} + + v, err := s.Endpoint.Read(&s.sender) + if err != nil { + return syserr.TranslateNetstackError(err) + } + + s.readView = v + + return nil +} + +// Release implements fs.FileOperations.Release. +func (s *SocketOperations) Release() { + s.Endpoint.Close() +} + +// Read implements fs.FileOperations.Read. +func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + n, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + if err == syserr.ErrWouldBlock { + return int64(n), syserror.ErrWouldBlock + } + if err != nil { + return 0, err.ToError() + } + return int64(n), nil +} + +// ioSequencePayload implements tcpip.Payload. It copies user memory bytes on demand +// based on the requested size. +type ioSequencePayload struct { + ctx context.Context + src usermem.IOSequence +} + +// Get implements tcpip.Payload. +func (i *ioSequencePayload) Get(size int) ([]byte, *tcpip.Error) { + if size > i.Size() { + size = i.Size() + } + v := buffer.NewView(size) + if _, err := i.src.CopyIn(i.ctx, v); err != nil { + return nil, tcpip.ErrBadAddress + } + return v, nil +} + +// Size implements tcpip.Payload. +func (i *ioSequencePayload) Size() int { + return int(i.src.NumBytes()) +} + +// Write implements fs.FileOperations.Write. +func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + f := &ioSequencePayload{ctx: ctx, src: src} + n, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + if err == tcpip.ErrWouldBlock { + return int64(n), syserror.ErrWouldBlock + } + return int64(n), syserr.TranslateNetstackError(err).ToError() +} + +// Readiness returns a mask of ready events for socket s. 
+func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + r := s.Endpoint.Readiness(mask) + + // Check our cached value iff the caller asked for readability and the + // endpoint itself is currently not readable. + if (mask & ^r & waiter.EventIn) != 0 { + s.readMu.Lock() + if len(s.readView) > 0 { + r |= waiter.EventIn + } + s.readMu.Unlock() + } + + return r +} + +// Connect implements the linux syscall connect(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + addr, err := GetAddress(s.family, sockaddr) + if err != nil { + return err + } + + // Always return right away in the non-blocking case. + if !blocking { + return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) + } + + // Register for notification when the endpoint becomes writable, then + // initiate the connection. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + + if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting { + return syserr.TranslateNetstackError(err) + } + + // It's pending, so we have to wait for a notification, and fetch the + // result once the wait completes. + if err := t.Block(ch); err != nil { + return syserr.FromError(err) + } + + // Call Connect() again after blocking to find connect's result. + return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) +} + +// Bind implements the linux syscall bind(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + addr, err := GetAddress(s.family, sockaddr) + if err != nil { + return err + } + + // Issue the bind request to the endpoint. + return syserr.TranslateNetstackError(s.Endpoint.Bind(addr, nil)) +} + +// Listen implements the linux syscall listen(2) for sockets backed by +// tcpip.Endpoint. 
+func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { + return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog)) +} + +// blockingAccept implements a blocking version of accept(2), that is, if no +// connections are ready to be accepted, it will block until one becomes ready. +func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { + // Register for notifications. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + // Try to accept the connection again; if it fails, then wait until we + // get a notification. + for { + if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock { + return ep, wq, syserr.TranslateNetstackError(err) + } + + if err := t.Block(ch); err != nil { + return nil, nil, syserr.FromError(err) + } + } +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, wq, err := s.Endpoint.Accept() + if err != nil { + if err != tcpip.ErrWouldBlock || !blocking { + return 0, nil, 0, syserr.TranslateNetstackError(err) + } + + var err *syserr.Error + ep, wq, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns := New(t, s.family, s.skType, wq, ep) + defer ns.DecRef() + + if flags&linux.SOCK_NONBLOCK != 0 { + flags := ns.Flags() + flags.NonBlocking = true + ns.SetFlags(flags.Settable()) + } + + var addr interface{} + var addrLen uint32 + if peerRequested { + // Get address of the peer and write it to peer slice. 
+ var err *syserr.Error + addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fdFlags := kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + } + fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + + return fd, addr, addrLen, syserr.FromError(e) +} + +// ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags. +func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { + var f tcpip.ShutdownFlags + switch how { + case linux.SHUT_RD: + f = tcpip.ShutdownRead + case linux.SHUT_WR: + f = tcpip.ShutdownWrite + case linux.SHUT_RDWR: + f = tcpip.ShutdownRead | tcpip.ShutdownWrite + default: + return 0, syserr.ErrInvalidArgument + } + return f, nil +} + +// Shutdown implements the linux syscall shutdown(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { + f, err := ConvertShutdown(how) + if err != nil { + return err + } + + // Issue shutdown request. + return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f)) +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) { + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) +} + +// GetSockOpt can be used to implement the linux syscall getsockopt(2) for +// sockets backed by a commonEndpoint. 
+func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType unix.SockType, level, name, outLen int) (interface{}, *syserr.Error) { + switch level { + case syscall.SOL_SOCKET: + switch name { + case linux.SO_TYPE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + return int32(skType), nil + + case linux.SO_ERROR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + // Get the last error and convert it. + err := ep.GetSockOpt(tcpip.ErrorOption{}) + if err == nil { + return int32(0), nil + } + return int32(syserr.ToLinux(syserr.TranslateNetstackError(err)).Number()), nil + + case linux.SO_PEERCRED: + if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { + return nil, syserr.ErrInvalidArgument + } + + tcred := t.Credentials() + return syscall.Ucred{ + Pid: int32(t.ThreadGroup().ID()), + Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), + Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), + }, nil + + case linux.SO_PASSCRED: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.PasscredOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.SO_SNDBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var size tcpip.SendBufferSizeOption + if err := ep.GetSockOpt(&size); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if size > math.MaxInt32 { + size = math.MaxInt32 + } + + return int32(size), nil + + case linux.SO_RCVBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var size tcpip.ReceiveBufferSizeOption + if err := ep.GetSockOpt(&size); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if size > math.MaxInt32 { + size = math.MaxInt32 + } + + return int32(size), nil + + case linux.SO_REUSEADDR: + if outLen < sizeOfInt32 { + return 
nil, syserr.ErrInvalidArgument + } + + var v tcpip.ReuseAddressOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.SO_KEEPALIVE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + return int32(0), nil + + case linux.SO_LINGER: + if outLen < syscall.SizeofLinger { + return nil, syserr.ErrInvalidArgument + } + return syscall.Linger{}, nil + + case linux.SO_RCVTIMEO: + if outLen < linux.SizeOfTimeval { + return nil, syserr.ErrInvalidArgument + } + + return linux.NsecToTimeval(s.RecvTimeout()), nil + } + + case syscall.SOL_TCP: + switch name { + case syscall.TCP_NODELAY: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.NoDelayOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case syscall.TCP_INFO: + var v tcpip.TCPInfoOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + // TODO: Translate fields once they are added to + // tcpip.TCPInfoOption. + info := linux.TCPInfo{} + + // Linux truncates the output binary to outLen. + ib := binary.Marshal(nil, usermem.ByteOrder, &info) + if len(ib) > outLen { + ib = ib[:outLen] + } + + return ib, nil + } + + case syscall.SOL_IPV6: + switch name { + case syscall.IPV6_V6ONLY: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.V6OnlyOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + } + } + + return nil, syserr.ErrProtocolNotAvailable +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// tcpip.Endpoint. 
func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
}

// SetSockOpt can be used to implement the linux syscall setsockopt(2) for
// sockets backed by a commonEndpoint.
//
// Options recognized here are translated into their tcpip option types and
// forwarded to the endpoint; anything else falls through to the network
// stack at the bottom of the function.
func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
	switch level {
	case syscall.SOL_SOCKET:
		switch name {
		case linux.SO_SNDBUF:
			if len(optVal) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}

			v := usermem.ByteOrder.Uint32(optVal)
			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.SendBufferSizeOption(v)))

		case linux.SO_RCVBUF:
			if len(optVal) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}

			v := usermem.ByteOrder.Uint32(optVal)
			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(v)))

		case linux.SO_REUSEADDR:
			if len(optVal) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}

			v := usermem.ByteOrder.Uint32(optVal)
			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v)))

		case linux.SO_PASSCRED:
			if len(optVal) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}

			v := usermem.ByteOrder.Uint32(optVal)
			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v)))

		case linux.SO_RCVTIMEO:
			if len(optVal) < linux.SizeOfTimeval {
				return syserr.ErrInvalidArgument
			}

			// The receive timeout is stored on the socket itself (and
			// consumed by RecvMsg's deadline handling), not handed to
			// the network stack.
			var v linux.Timeval
			binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v)
			s.SetRecvTimeout(v.ToNsecCapped())
			return nil
		}

	case syscall.SOL_TCP:
		switch name {
		case syscall.TCP_NODELAY:
			if len(optVal) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}

			v := usermem.ByteOrder.Uint32(optVal)
			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.NoDelayOption(v)))
		}
	case syscall.SOL_IPV6:
		switch name {
		case syscall.IPV6_V6ONLY:
			if len(optVal) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}

			v := usermem.ByteOrder.Uint32(optVal)
			return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v)))
		}
	}

	// FIXME: Disallow IP-level multicast group options by
	// default. These will need to be supported by appropriately plumbing
	// the level through to the network stack (if at all). However, we
	// still allow setting TTL, and multicast-enable/disable type options.
	if level == 0 {
		const (
			_IP_ADD_MEMBERSHIP = 35
			_MCAST_JOIN_GROUP  = 42
		)
		if name == _IP_ADD_MEMBERSHIP || name == _MCAST_JOIN_GROUP {
			return syserr.ErrInvalidArgument
		}
	}

	// Default to the old behavior; hand off to network stack.
	return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{}))
}

// isLinkLocal determines if the given IPv6 address is link-local. This is the
// case when it has the fe80::/10 prefix. This check is used to determine when
// the NICID is relevant for a given IPv6 address.
func isLinkLocal(addr tcpip.Address) bool {
	return len(addr) >= 2 && addr[0] == 0xfe && addr[1]&0xc0 == 0x80
}

// ConvertAddress converts the given address to a native format.
//
// It returns the native sockaddr struct and the length callers should report
// for it (the second return value), or (nil, 0) for an unknown family.
func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
	switch family {
	case linux.AF_UNIX:
		var out linux.SockAddrUnix
		out.Family = linux.AF_UNIX
		for i := 0; i < len([]byte(addr.Addr)); i++ {
			out.Path[i] = int8(addr.Addr[i])
		}
		// Linux just returns the header for empty addresses.
		if len(addr.Addr) == 0 {
			return out, 2
		}
		// Linux returns the used length of the address struct (including the
		// null terminator) for filesystem paths. The Family field is 2 bytes.
		// It is sometimes allowed to exclude the null terminator if the
		// address length is the max. Abstract paths always return the full
		// length.
		if out.Path[0] == 0 || len([]byte(addr.Addr)) == len(out.Path) {
			return out, uint32(binary.Size(out))
		}
		// 2 bytes for Family, 1 byte for the null terminator.
		return out, uint32(3 + len(addr.Addr))
	case linux.AF_INET:
		var out linux.SockAddrInet
		copy(out.Addr[:], addr.Addr)
		out.Family = linux.AF_INET
		out.Port = htons(addr.Port)
		return out, uint32(binary.Size(out))
	case linux.AF_INET6:
		var out linux.SockAddrInet6
		if len(addr.Addr) == 4 {
			// Copy address in v4-mapped format (::ffff:a.b.c.d).
			copy(out.Addr[12:], addr.Addr)
			out.Addr[10] = 0xff
			out.Addr[11] = 0xff
		} else {
			copy(out.Addr[:], addr.Addr)
		}
		out.Family = linux.AF_INET6
		out.Port = htons(addr.Port)
		// The scope ID is only meaningful for link-local addresses.
		if isLinkLocal(addr.Addr) {
			out.Scope_id = uint32(addr.NIC)
		}
		return out, uint32(binary.Size(out))
	default:
		return nil, 0
	}
}

// GetSockName implements the linux syscall getsockname(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
	addr, err := s.Endpoint.GetLocalAddress()
	if err != nil {
		return nil, 0, syserr.TranslateNetstackError(err)
	}

	a, l := ConvertAddress(s.family, addr)
	return a, l, nil
}

// GetPeerName implements the linux syscall getpeername(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
	addr, err := s.Endpoint.GetRemoteAddress()
	if err != nil {
		return nil, 0, syserr.TranslateNetstackError(err)
	}

	a, l := ConvertAddress(s.family, addr)
	return a, l, nil
}

// coalescingRead is the fast path for non-blocking, non-peek, stream-based
// case. It coalesces as many packets as possible before returning to the
// caller.
//
// If discard is true, data is consumed and dropped rather than copied out
// (used for MSG_TRUNC on stream sockets).
//
// Precondition: the caller must hold s.readMu (see nonBlockingRead).
func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) {
	var err *syserr.Error
	var copied int

	// Copy as many views as possible into the user-provided buffer.
	for dst.NumBytes() != 0 {
		err = s.fetchReadView()
		if err != nil {
			break
		}

		var n int
		var e error
		if discard {
			n = len(s.readView)
			if int64(n) > dst.NumBytes() {
				n = int(dst.NumBytes())
			}
		} else {
			n, e = dst.CopyOut(ctx, s.readView)
		}
		copied += n
		s.readView.TrimFront(n)
		dst = dst.DropFirst(n)
		if e != nil {
			err = syserr.FromError(e)
			break
		}
	}

	// If we managed to copy something, we must deliver it.
	if copied > 0 {
		return copied, nil
	}

	return 0, err
}

// nonBlockingRead issues a non-blocking read.
//
// It returns the number of bytes read (or, with trunc, the full message
// length), and the sender address with its length when senderRequested is
// set on a packet-based socket.
func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, interface{}, uint32, *syserr.Error) {
	isPacket := s.isPacketBased()

	// Fast path for regular reads from stream (e.g., TCP) endpoints. Note
	// that senderRequested is ignored for stream sockets.
	if !peek && !isPacket {
		// TCP sockets discard the data if MSG_TRUNC is set.
		//
		// This behavior is documented in man 7 tcp:
		// Since version 2.4, Linux supports the use of MSG_TRUNC in the flags
		// argument of recv(2) (and recvmsg(2)). This flag causes the received
		// bytes of data to be discarded, rather than passed back in a
		// caller-supplied buffer.
		s.readMu.Lock()
		n, err := s.coalescingRead(ctx, dst, trunc)
		s.readMu.Unlock()
		return n, nil, 0, err
	}

	s.readMu.Lock()
	defer s.readMu.Unlock()

	if err := s.fetchReadView(); err != nil {
		return 0, nil, 0, err
	}

	if !isPacket && peek && trunc {
		// MSG_TRUNC with MSG_PEEK on a TCP socket returns the
		// amount that could be read, capped at the buffer size.
		var rql tcpip.ReceiveQueueSizeOption
		if err := s.Endpoint.GetSockOpt(&rql); err != nil {
			return 0, nil, 0, syserr.TranslateNetstackError(err)
		}
		available := len(s.readView) + int(rql)
		bufLen := int(dst.NumBytes())
		if available < bufLen {
			return available, nil, 0, nil
		}
		return bufLen, nil, 0, nil
	}

	n, err := dst.CopyOut(ctx, s.readView)
	var addr interface{}
	var addrLen uint32
	if isPacket && senderRequested {
		addr, addrLen = ConvertAddress(s.family, s.sender)
	}

	if peek {
		if l := len(s.readView); trunc && l > n {
			// isPacket must be true.
			return l, addr, addrLen, syserr.FromError(err)
		}

		if isPacket || err != nil {
			return int(n), addr, addrLen, syserr.FromError(err)
		}

		// We need to peek beyond the first message.
		dst = dst.DropFirst(n)
		num, err := dst.CopyOutFrom(ctx, safemem.FromVecReaderFunc{func(dsts [][]byte) (int64, error) {
			n, err := s.Endpoint.Peek(dsts)
			if err != nil {
				return int64(n), syserr.TranslateNetstackError(err).ToError()
			}
			return int64(n), nil
		}})
		n += int(num)
		if err == syserror.ErrWouldBlock && n > 0 {
			// We got some data, so no need to return an error.
			err = nil
		}
		return int(n), nil, 0, syserr.FromError(err)
	}

	var msgLen int
	if isPacket {
		// A packet is consumed whole regardless of how much was copied.
		msgLen = len(s.readView)
		s.readView = nil
	} else {
		msgLen = int(n)
		s.readView.TrimFront(int(n))
	}

	if trunc {
		return msgLen, addr, addrLen, syserr.FromError(err)
	}

	return int(n), addr, addrLen, syserr.FromError(err)
}

// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) {
	trunc := flags&linux.MSG_TRUNC != 0

	peek := flags&linux.MSG_PEEK != 0
	if senderRequested && !s.isPacketBased() {
		// Stream sockets ignore the sender address.
		senderRequested = false
	}
	n, senderAddr, senderAddrLen, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
	// Only block if the first attempt would have blocked and the caller
	// did not ask for a non-blocking receive.
	if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
		return
	}

	// We'll have to block. Register for notifications and keep trying to
	// send all the data.
	e, ch := waiter.NewChannelEntry(nil)
	s.EventRegister(&e, waiter.EventIn)
	defer s.EventUnregister(&e)

	for {
		n, senderAddr, senderAddrLen, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
		if err != syserr.ErrWouldBlock {
			return
		}

		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
			if err == syserror.ETIMEDOUT {
				// The SO_RCVTIMEO deadline expired; Linux reports
				// this as EAGAIN.
				return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain
			}
			return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err)
		}
	}
}

// SendMsg implements the linux syscall sendmsg(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) {
	// Reject control messages.
	if !controlMessages.Empty() {
		return 0, syserr.ErrInvalidArgument
	}

	var addr *tcpip.FullAddress
	if len(to) > 0 {
		addrBuf, err := GetAddress(s.family, to)
		if err != nil {
			return 0, err
		}

		addr = &addrBuf
	}

	v := buffer.NewView(int(src.NumBytes()))

	// Copy all the data into the buffer.
	if _, err := src.CopyIn(t, v); err != nil {
		return 0, syserr.FromError(err)
	}

	opts := tcpip.WriteOptions{
		To:          addr,
		More:        flags&linux.MSG_MORE != 0,
		EndOfRecord: flags&linux.MSG_EOR != 0,
	}

	n, err := s.Endpoint.Write(tcpip.SlicePayload(v), opts)
	if err != tcpip.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
		return int(n), syserr.TranslateNetstackError(err)
	}

	// We'll have to block. Register for notification and keep trying to
	// send all the data.
	e, ch := waiter.NewChannelEntry(nil)
	s.EventRegister(&e, waiter.EventOut)
	defer s.EventUnregister(&e)

	// Keep writing whatever remains after each partial write until the
	// buffer is drained or a hard error occurs.
	v.TrimFront(int(n))
	total := n
	for {
		n, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts)
		v.TrimFront(int(n))
		total += n
		if err != tcpip.ErrWouldBlock {
			return int(total), syserr.TranslateNetstackError(err)
		}

		if err := t.Block(ch); err != nil {
			return int(total), syserr.FromError(err)
		}
	}
}

// interfaceIoctl implements interface requests.
//
// Only the SIOCGIF* "get" requests are handled; several are stubbed with
// TODOs below. Unknown requests return EINVAL.
func (s *SocketOperations) interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error {
	var (
		iface inet.Interface
		index int32
		found bool
	)

	// Find the relevant device. Note that index is the key of the matching
	// entry in the Interfaces map, not a positional counter.
	for index, iface = range s.stack.Interfaces() {
		if iface.Name == ifr.Name() {
			found = true
			break
		}
	}
	if !found {
		return syserr.ErrNoDevice
	}

	switch arg {
	case syscall.SIOCGIFINDEX:
		// Copy out the index to the data.
		usermem.ByteOrder.PutUint32(ifr.Data[:], uint32(index))

	case syscall.SIOCGIFHWADDR:
		// Copy the hardware address out.
		ifr.Data[0] = 6 // IEEE802.2 arp type.
		ifr.Data[1] = 0
		n := copy(ifr.Data[2:], iface.Addr)
		for i := 2 + n; i < len(ifr.Data); i++ {
			ifr.Data[i] = 0 // Clear padding.
		}
		usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(n))

	case syscall.SIOCGIFFLAGS:
		// TODO: Implement. For now, return only that the
		// device is up so that ifconfig prints it.
		usermem.ByteOrder.PutUint16(ifr.Data[:2], linux.IFF_UP)

	case syscall.SIOCGIFADDR:
		// Copy the IPv4 address out.
		for _, addr := range s.stack.InterfaceAddrs()[index] {
			// This ioctl is only compatible with AF_INET addresses.
			if addr.Family != linux.AF_INET {
				continue
			}
			copy(ifr.Data[4:8], addr.Addr)
			break
		}

	case syscall.SIOCGIFMETRIC:
		// Gets the metric of the device. As per netdevice(7), this
		// always just sets ifr_metric to 0.
		usermem.ByteOrder.PutUint32(ifr.Data[:4], 0)
	case syscall.SIOCGIFMTU:
		// Gets the MTU of the device.
		// TODO: Implement.

	case syscall.SIOCGIFMAP:
		// Gets the hardware parameters of the device.
		// TODO: Implement.

	case syscall.SIOCGIFTXQLEN:
		// Gets the transmit queue length of the device.
		// TODO: Implement.

	case syscall.SIOCGIFDSTADDR:
		// Gets the destination address of a point-to-point device.
		// TODO: Implement.

	case syscall.SIOCGIFBRDADDR:
		// Gets the broadcast address of a device.
		// TODO: Implement.

	case syscall.SIOCGIFNETMASK:
		// Gets the network mask of a device.
		// TODO: Implement.

	default:
		// Not a valid call.
		return syserr.ErrInvalidArgument
	}

	return nil
}

// Ioctl implements fs.FileOperations.Ioctl.
func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	switch arg := int(args[1].Int()); arg {
	case syscall.SIOCGIFFLAGS,
		syscall.SIOCGIFADDR,
		syscall.SIOCGIFBRDADDR,
		syscall.SIOCGIFDSTADDR,
		syscall.SIOCGIFHWADDR,
		syscall.SIOCGIFINDEX,
		syscall.SIOCGIFMAP,
		syscall.SIOCGIFMETRIC,
		syscall.SIOCGIFMTU,
		syscall.SIOCGIFNETMASK,
		syscall.SIOCGIFTXQLEN:

		// Copy the ifreq in, let interfaceIoctl fill it, then copy it
		// back out to userspace.
		var ifr linux.IFReq
		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
			AddressSpaceActive: true,
		}); err != nil {
			return 0, err
		}
		if err := s.interfaceIoctl(ctx, io, arg, &ifr); err != nil {
			return 0, err.ToError()
		}
		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{
			AddressSpaceActive: true,
		})
		return 0, err

	case syscall.SIOCGIFCONF:
		// Return a list of interface addresses or the buffer size
		// necessary to hold the list.
		var ifc linux.IFConf
		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{
			AddressSpaceActive: true,
		}); err != nil {
			return 0, err
		}

		if err := s.ifconfIoctl(ctx, io, &ifc); err != nil {
			return 0, err
		}

		// Copy the (possibly updated) ifconf header back out; the
		// ifreq entries themselves were written by ifconfIoctl.
		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{
			AddressSpaceActive: true,
		})

		return 0, err
	}

	// Everything else is delegated to the endpoint-level handler.
	return Ioctl(ctx, s.Endpoint, io, args)
}

// ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
func (s *SocketOperations) ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error {
	// If Ptr is NULL, return the necessary buffer size via Len.
	// Otherwise, write up to Len bytes starting at Ptr containing ifreq
	// structs.
	if ifc.Ptr == 0 {
		ifc.Len = int32(len(s.stack.Interfaces())) * int32(linux.SizeOfIFReq)
		return nil
	}

	max := ifc.Len
	ifc.Len = 0
	for key, ifaceAddrs := range s.stack.InterfaceAddrs() {
		iface := s.stack.Interfaces()[key]
		for _, ifaceAddr := range ifaceAddrs {
			// Don't write past the end of the buffer.
			//
			// NOTE(review): this break only exits the inner
			// per-address loop; the outer loop keeps iterating
			// interfaces once the buffer is full. Harmless
			// (nothing more is written), but could return early.
			if ifc.Len+int32(linux.SizeOfIFReq) > max {
				break
			}
			if ifaceAddr.Family != linux.AF_INET {
				continue
			}

			// Populate ifr.ifr_addr.
			ifr := linux.IFReq{}
			ifr.SetName(iface.Name)
			usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family))
			usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0)
			copy(ifr.Data[4:8], ifaceAddr.Addr[:4])

			// Copy the ifr to userspace.
			dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
			ifc.Len += int32(linux.SizeOfIFReq)
			if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{
				AddressSpaceActive: true,
			}); err != nil {
				return err
			}
		}
	}
	return nil
}

// Ioctl implements fs.FileOperations.Ioctl for sockets backed by a
// commonEndpoint.
func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	// Switch on ioctl request.
	switch int(args[1].Int()) {
	case linux.TIOCINQ:
		// Bytes available to read.
		var v tcpip.ReceiveQueueSizeOption
		if err := ep.GetSockOpt(&v); err != nil {
			return 0, syserr.TranslateNetstackError(err).ToError()
		}

		// Clamp to what fits in the int32 the caller expects.
		if v > math.MaxInt32 {
			v = math.MaxInt32
		}
		// Copy result to user-space.
		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
			AddressSpaceActive: true,
		})
		return 0, err

	case linux.TIOCOUTQ:
		// Bytes queued for send.
		var v tcpip.SendQueueSizeOption
		if err := ep.GetSockOpt(&v); err != nil {
			return 0, syserr.TranslateNetstackError(err).ToError()
		}

		if v > math.MaxInt32 {
			v = math.MaxInt32
		}

		// Copy result to user-space.
		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
			AddressSpaceActive: true,
		})
		return 0, err
	}

	return 0, syserror.ENOTTY
}
diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go
new file mode 100644
index 000000000..5616435b3
--- /dev/null
+++ b/pkg/sentry/socket/epsocket/provider.go
@@ -0,0 +1,113 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package epsocket

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
	"gvisor.googlesource.com/gvisor/pkg/syserr"
	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
)

// provider is an inet socket provider.
type provider struct {
	family   int
	netProto tcpip.NetworkProtocolNumber
}

// GetTransportProtocol figures out transport protocol. Currently only TCP and
// UDP are supported.
func GetTransportProtocol(stype unix.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) {
	switch stype {
	case linux.SOCK_STREAM:
		// protocol 0 means "the default for this type" (TCP).
		if protocol != 0 && protocol != syscall.IPPROTO_TCP {
			return 0, syserr.ErrInvalidArgument
		}
		return tcp.ProtocolNumber, nil

	case linux.SOCK_DGRAM:
		if protocol != 0 && protocol != syscall.IPPROTO_UDP {
			return 0, syserr.ErrInvalidArgument
		}
		return udp.ProtocolNumber, nil

	default:
		return 0, syserr.ErrInvalidArgument
	}
}

// Socket creates a new socket object for the AF_INET or AF_INET6 family.
func (p *provider) Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) {
	// Fail right away if we don't have a stack.
	stack := t.NetworkContext()
	if stack == nil {
		// Don't propagate an error here. Instead, allow the socket
		// code to continue searching for another provider.
		return nil, nil
	}
	eps, ok := stack.(*Stack)
	if !ok {
		// The task's network context is not a netstack stack; let
		// another provider (e.g. hostinet) handle it.
		return nil, nil
	}

	// Figure out the transport protocol.
	transProto, err := GetTransportProtocol(stype, protocol)
	if err != nil {
		return nil, err
	}

	// Create the endpoint.
	wq := &waiter.Queue{}
	ep, e := eps.Stack.NewEndpoint(transProto, p.netProto, wq)
	if e != nil {
		return nil, syserr.TranslateNetstackError(e)
	}

	return New(t, p.family, stype, wq, ep), nil
}

// Pair just returns nil sockets (not supported).
func (*provider) Pair(*kernel.Task, unix.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
	return nil, nil, nil
}

// init registers socket providers for AF_INET and AF_INET6.
func init() {
	// Providers backed by netstack.
	p := []provider{
		{
			family:   linux.AF_INET,
			netProto: ipv4.ProtocolNumber,
		},
		{
			family:   linux.AF_INET6,
			netProto: ipv6.ProtocolNumber,
		},
	}

	for i := range p {
		socket.RegisterProvider(p[i].family, &p[i])
	}
}
diff --git a/pkg/sentry/socket/epsocket/save_restore.go b/pkg/sentry/socket/epsocket/save_restore.go
new file mode 100644
index 000000000..2613f90de
--- /dev/null
+++ b/pkg/sentry/socket/epsocket/save_restore.go
@@ -0,0 +1,27 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package epsocket

import (
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
)

// afterLoad is invoked by stateify.
//
// The netstack stack is marked state:"manual" and cannot be serialized, so
// restore requires one to be provided out-of-band via stack.StackFromEnv.
func (s *Stack) afterLoad() {
	s.Stack = stack.StackFromEnv // FIXME
	if s.Stack == nil {
		panic("can't restore without netstack/tcpip/stack.Stack")
	}
}
diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go
new file mode 100644
index 000000000..ec1d96ccb
--- /dev/null
+++ b/pkg/sentry/socket/epsocket/stack.go
@@ -0,0 +1,132 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package epsocket

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/log"
	"gvisor.googlesource.com/gvisor/pkg/sentry/inet"
	"gvisor.googlesource.com/gvisor/pkg/syserr"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
)

// Stack implements inet.Stack for netstack/tcpip/stack.Stack.
type Stack struct {
	// Stack is not savable; see afterLoad in save_restore.go.
	Stack *stack.Stack `state:"manual"`
}

// SupportsIPv6 implements Stack.SupportsIPv6.
func (s *Stack) SupportsIPv6() bool {
	return s.Stack.CheckNetworkProtocol(ipv6.ProtocolNumber)
}

// Interfaces implements inet.Stack.Interfaces.
func (s *Stack) Interfaces() map[int32]inet.Interface {
	is := make(map[int32]inet.Interface)
	for id, ni := range s.Stack.NICInfo() {
		is[int32(id)] = inet.Interface{
			Name: ni.Name,
			Addr: []byte(ni.LinkAddress),
			// TODO: Other fields.
		}
	}
	return is
}

// InterfaceAddrs implements inet.Stack.InterfaceAddrs.
func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
	nicAddrs := make(map[int32][]inet.InterfaceAddr)
	for id, ni := range s.Stack.NICInfo() {
		var addrs []inet.InterfaceAddr
		for _, a := range ni.ProtocolAddresses {
			var family uint8
			switch a.Protocol {
			case ipv4.ProtocolNumber:
				family = linux.AF_INET
			case ipv6.ProtocolNumber:
				family = linux.AF_INET6
			default:
				log.Warningf("Unknown network protocol in %+v", a)
				continue
			}

			addrs = append(addrs, inet.InterfaceAddr{
				Family:    family,
				PrefixLen: uint8(len(a.Address) * 8),
				Addr:      []byte(a.Address),
				// TODO: Other fields.
			})
		}
		nicAddrs[int32(id)] = addrs
	}
	return nicAddrs
}

// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize.
func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) {
	var rs tcp.ReceiveBufferSizeOption
	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs)
	return inet.TCPBufferSize{
		Min:     rs.Min,
		Default: rs.Default,
		Max:     rs.Max,
	}, syserr.TranslateNetstackError(err).ToError()
}

// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize.
func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error {
	rs := tcp.ReceiveBufferSizeOption{
		Min:     size.Min,
		Default: size.Default,
		Max:     size.Max,
	}
	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, rs)).ToError()
}

// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize.
func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) {
	var ss tcp.SendBufferSizeOption
	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss)
	return inet.TCPBufferSize{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}, syserr.TranslateNetstackError(err).ToError()
}

// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize.
func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error {
	ss := tcp.SendBufferSizeOption{
		Min:     size.Min,
		Default: size.Default,
		Max:     size.Max,
	}
	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, ss)).ToError()
}

// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled.
func (s *Stack) TCPSACKEnabled() (bool, error) {
	var sack tcp.SACKEnabled
	err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack)
	return bool(sack), syserr.TranslateNetstackError(err).ToError()
}

// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled.
func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
	return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError()
}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
new file mode 100644
index 000000000..60ec265ba
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -0,0 +1,53 @@
package(licenses = ["notice"])  # Apache 2.0

load("@io_bazel_rules_go//go:def.bzl", "go_library")
load("//tools/go_stateify:defs.bzl", "go_stateify")

go_stateify(
    name = "hostinet_state",
    srcs = [
        "save_restore.go",
        "socket.go",
        "stack.go",
    ],
    out = "hostinet_autogen_state.go",
    package = "hostinet",
)

go_library(
    name = "hostinet",
    srcs = [
        "device.go",
        "hostinet.go",
        "hostinet_autogen_state.go",
        "save_restore.go",
        "socket.go",
        "socket_unsafe.go",
        "stack.go",
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet",
    visibility = ["//pkg/sentry:internal"],
    deps = [
        "//pkg/abi/linux",
        "//pkg/binary",
        "//pkg/log",
        "//pkg/sentry/arch",
        "//pkg/sentry/context",
        "//pkg/sentry/device",
        "//pkg/sentry/fs",
        "//pkg/sentry/fs/fsutil",
        "//pkg/sentry/inet",
        "//pkg/sentry/kernel",
        "//pkg/sentry/kernel/kdefs",
        "//pkg/sentry/kernel/time",
        "//pkg/sentry/safemem",
        "//pkg/sentry/socket",
        "//pkg/sentry/usermem",
        "//pkg/state",
        "//pkg/syserr",
        "//pkg/syserror",
        "//pkg/tcpip/transport/unix",
        "//pkg/waiter",
        "//pkg/waiter/fdnotifier",
    ],
)
diff --git a/pkg/sentry/socket/hostinet/device.go b/pkg/sentry/socket/hostinet/device.go
new file mode 100644
index 000000000..a9a673316
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/device.go
@@ -0,0 +1,19 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostinet

import "gvisor.googlesource.com/gvisor/pkg/sentry/device"

// socketDevice is the device backing dirents for hostinet socket files.
var socketDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/socket/hostinet/hostinet.go b/pkg/sentry/socket/hostinet/hostinet.go
new file mode 100644
index 000000000..67c6c8066
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/hostinet.go
@@ -0,0 +1,17 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hostinet implements AF_INET and AF_INET6 sockets using the host's
// network stack.
package hostinet
diff --git a/pkg/sentry/socket/hostinet/save_restore.go b/pkg/sentry/socket/hostinet/save_restore.go
new file mode 100644
index 000000000..0821a794a
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/save_restore.go
@@ -0,0 +1,20 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostinet

// beforeSave is invoked by stateify.
//
// A host socket wraps a live host file descriptor, which cannot be
// serialized, so saving is unconditionally rejected.
func (*socketOperations) beforeSave() {
	panic("host.socketOperations is not savable")
}
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
new file mode 100644
index 000000000..defa3db2c
--- /dev/null
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -0,0 +1,562 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostinet

import (
	"fmt"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserr"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
	"gvisor.googlesource.com/gvisor/pkg/waiter"
	"gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier"
)

const (
	sizeofInt32 = 4

	// sizeofSockaddr is the size in bytes of the largest sockaddr type
	// supported by this package.
	sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in)
)

// socketOperations implements fs.FileOperations and socket.Socket for a socket
// implemented using a host socket.
type socketOperations struct {
	socket.ReceiveTimeout
	fsutil.PipeSeek      `state:"nosave"`
	fsutil.NotDirReaddir `state:"nosave"`
	fsutil.NoFsync       `state:"nosave"`
	fsutil.NoopFlush     `state:"nosave"`
	fsutil.NoMMap        `state:"nosave"`

	fd    int // must be O_NONBLOCK
	queue waiter.Queue
}

// newSocketFile wraps the host fd in a sentry file. The fd must already be
// non-blocking (blocking is implemented via the sentry's event machinery).
func newSocketFile(ctx context.Context, fd int, nonblock bool) (*fs.File, *syserr.Error) {
	s := &socketOperations{fd: fd}
	if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
		return nil, syserr.FromError(err)
	}
	dirent := socket.NewDirent(ctx, socketDevice)
	return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true}, s), nil
}

// Release implements fs.FileOperations.Release.
func (s *socketOperations) Release() {
	// Stop polling the fd before closing it.
	fdnotifier.RemoveFD(int32(s.fd))
	syscall.Close(s.fd)
}

// Readiness implements waiter.Waitable.Readiness.
func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
	return fdnotifier.NonBlockingPoll(int32(s.fd), mask)
}

// EventRegister implements waiter.Waitable.EventRegister.
func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	s.queue.EventRegister(e, mask)
	fdnotifier.UpdateFD(int32(s.fd))
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (s *socketOperations) EventUnregister(e *waiter.Entry) {
	s.queue.EventUnregister(e)
	fdnotifier.UpdateFD(int32(s.fd))
}

// Read implements fs.FileOperations.Read.
func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
	n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
		// Refuse to do anything if any part of dst.Addrs was unusable.
		if uint64(dst.NumBytes()) != dsts.NumBytes() {
			return 0, nil
		}
		if dsts.IsEmpty() {
			return 0, nil
		}
		if dsts.NumBlocks() == 1 {
			// Skip allocating []syscall.Iovec.
			n, err := syscall.Read(s.fd, dsts.Head().ToSlice())
			if err != nil {
				return 0, translateIOSyscallError(err)
			}
			return uint64(n), nil
		}
		return readv(s.fd, iovecsFromBlockSeq(dsts))
	}))
	return int64(n), err
}

// Write implements fs.FileOperations.Write.
func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
	n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) {
		// Refuse to do anything if any part of src.Addrs was unusable.
		if uint64(src.NumBytes()) != srcs.NumBytes() {
			return 0, nil
		}
		if srcs.IsEmpty() {
			return 0, nil
		}
		if srcs.NumBlocks() == 1 {
			// Skip allocating []syscall.Iovec.
			n, err := syscall.Write(s.fd, srcs.Head().ToSlice())
			if err != nil {
				return 0, translateIOSyscallError(err)
			}
			return uint64(n), nil
		}
		return writev(s.fd, iovecsFromBlockSeq(srcs))
	}))
	return int64(n), err
}

// Connect implements socket.Socket.Connect.
func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
	if len(sockaddr) > sizeofSockaddr {
		sockaddr = sockaddr[:sizeofSockaddr]
	}

	_, _, errno := syscall.Syscall(syscall.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr)))

	if errno == 0 {
		return nil
	}
	if errno != syscall.EINPROGRESS || !blocking {
		return syserr.FromError(translateIOSyscallError(errno))
	}

	// "EINPROGRESS: The socket is nonblocking and the connection cannot be
	// completed immediately. It is possible to select(2) or poll(2) for
	// completion by selecting the socket for writing. After select(2)
	// indicates writability, use getsockopt(2) to read the SO_ERROR option at
	// level SOL-SOCKET to determine whether connect() completed successfully
	// (SO_ERROR is zero) or unsuccessfully (SO_ERROR is one of the usual error
	// codes listed here, explaining the reason for the failure)." - connect(2)
	e, ch := waiter.NewChannelEntry(nil)
	s.EventRegister(&e, waiter.EventOut)
	defer s.EventUnregister(&e)
	if s.Readiness(waiter.EventOut)&waiter.EventOut == 0 {
		if err := t.Block(ch); err != nil {
			return syserr.FromError(err)
		}
	}
	val, err := syscall.GetsockoptInt(s.fd, syscall.SOL_SOCKET, syscall.SO_ERROR)
	if err != nil {
		return syserr.FromError(err)
	}
	if val != 0 {
		return syserr.FromError(syscall.Errno(uintptr(val)))
	}
	return nil
}

// Accept implements socket.Socket.Accept.
func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) {
	var peerAddr []byte
	var peerAddrlen uint32
	var peerAddrPtr *byte
	var peerAddrlenPtr *uint32
	if peerRequested {
		peerAddr = make([]byte, sizeofSockaddr)
		peerAddrlen = uint32(len(peerAddr))
		peerAddrPtr = &peerAddr[0]
		peerAddrlenPtr = &peerAddrlen
	}

	// Conservatively ignore all flags specified by the application and add
	// SOCK_NONBLOCK since socketOperations requires it.
	fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK)
	if blocking {
		var ch chan struct{}
		// Retry until accept4 succeeds or fails with something other
		// than "would block". The waiter entry is registered lazily on
		// the first would-block result and unregistered on return.
		for syscallErr == syserror.ErrWouldBlock {
			if ch != nil {
				if syscallErr = t.Block(ch); syscallErr != nil {
					break
				}
			} else {
				var e waiter.Entry
				e, ch = waiter.NewChannelEntry(nil)
				s.EventRegister(&e, waiter.EventIn)
				defer s.EventUnregister(&e)
			}
			fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK)
		}
	}

	if peerRequested {
		// Trim to the length the host actually wrote.
		peerAddr = peerAddr[:peerAddrlen]
	}
	if syscallErr != nil {
		return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr)
	}

	f, err := newSocketFile(t, fd, flags&syscall.SOCK_NONBLOCK != 0)
	if err != nil {
		syscall.Close(fd)
		return 0, nil, 0, err
	}
	defer f.DecRef()

	fdFlags := kernel.FDFlags{
		CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
	}
	kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits())
	return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr)
}

// Bind implements socket.Socket.Bind.
+func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + if len(sockaddr) > sizeofSockaddr { + sockaddr = sockaddr[:sizeofSockaddr] + } + + _, _, errno := syscall.Syscall(syscall.SYS_BIND, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) + if errno != 0 { + return syserr.FromError(errno) + } + return nil +} + +// Listen implements socket.Socket.Listen. +func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { + return syserr.FromError(syscall.Listen(s.fd, backlog)) +} + +// Shutdown implements socket.Socket.Shutdown. +func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { + switch how { + case syscall.SHUT_RD, syscall.SHUT_WR, syscall.SHUT_RDWR: + return syserr.FromError(syscall.Shutdown(s.fd, how)) + default: + return syserr.ErrInvalidArgument + } +} + +// GetSockOpt implements socket.Socket.GetSockOpt. +func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) { + if outLen < 0 { + return nil, syserr.ErrInvalidArgument + } + + // Whitelist options and constrain option length. 
+ var optlen int + switch level { + case syscall.SOL_IPV6: + switch name { + case syscall.IPV6_V6ONLY: + optlen = sizeofInt32 + } + case syscall.SOL_SOCKET: + switch name { + case syscall.SO_ERROR, syscall.SO_KEEPALIVE, syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR, syscall.SO_TYPE: + optlen = sizeofInt32 + case syscall.SO_LINGER: + optlen = syscall.SizeofLinger + } + case syscall.SOL_TCP: + switch name { + case syscall.TCP_NODELAY: + optlen = sizeofInt32 + case syscall.TCP_INFO: + optlen = int(linux.SizeOfTCPInfo) + } + } + if optlen == 0 { + return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT + } + if outLen < optlen { + return nil, syserr.ErrInvalidArgument + } + + opt, err := getsockopt(s.fd, level, name, optlen) + if err != nil { + return nil, syserr.FromError(err) + } + return opt, nil +} + +// SetSockOpt implements socket.Socket.SetSockOpt. +func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { + // Whitelist options and constrain option length. + var optlen int + switch level { + case syscall.SOL_IPV6: + switch name { + case syscall.IPV6_V6ONLY: + optlen = sizeofInt32 + } + case syscall.SOL_SOCKET: + switch name { + case syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR: + optlen = sizeofInt32 + } + case syscall.SOL_TCP: + switch name { + case syscall.TCP_NODELAY: + optlen = sizeofInt32 + } + } + if optlen == 0 { + // Pretend to accept socket options we don't understand. This seems + // dangerous, but it's what netstack does... + return nil + } + if len(opt) < optlen { + return syserr.ErrInvalidArgument + } + opt = opt[:optlen] + + _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(s.fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(len(opt)), 0) + if errno != 0 { + return syserr.FromError(errno) + } + return nil +} + +// RecvMsg implements socket.Socket.RecvMsg. 
+func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { + // Whitelist flags. + // + // FIXME: We can't support MSG_ERRQUEUE because it uses ancillary + // messages that netstack/tcpip/transport/unix doesn't understand. Kill the + // Socket interface's dependence on netstack. + if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { + return 0, nil, 0, unix.ControlMessages{}, syserr.ErrInvalidArgument + } + + var senderAddr []byte + if senderRequested { + senderAddr = make([]byte, sizeofSockaddr) + } + + recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of dst.Addrs was unusable. + if uint64(dst.NumBytes()) != dsts.NumBytes() { + return 0, nil + } + if dsts.IsEmpty() { + return 0, nil + } + + // We always do a non-blocking recv*(). + sysflags := flags | syscall.MSG_DONTWAIT + + if dsts.NumBlocks() == 1 { + // Skip allocating []syscall.Iovec. + return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddr) + } + + iovs := iovecsFromBlockSeq(dsts) + msg := syscall.Msghdr{ + Iov: &iovs[0], + Iovlen: uint64(len(iovs)), + } + if len(senderAddr) != 0 { + msg.Name = &senderAddr[0] + msg.Namelen = uint32(len(senderAddr)) + } + n, err := recvmsg(s.fd, &msg, sysflags) + if err != nil { + return 0, err + } + senderAddr = senderAddr[:msg.Namelen] + return n, nil + }) + + var ch chan struct{} + n, err := dst.CopyOutFrom(t, recvmsgToBlocks) + if flags&syscall.MSG_DONTWAIT == 0 { + for err == syserror.ErrWouldBlock { + // We only expect blocking to come from the actual syscall, in which + // case it can't have returned any data. 
+ if n != 0 { + panic(fmt.Sprintf("CopyOutFrom: got (%d, %v), wanted (0, %v)", n, err, err)) + } + if ch != nil { + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + break + } + } else { + var e waiter.Entry + e, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + } + n, err = dst.CopyOutFrom(t, recvmsgToBlocks) + } + } + + return int(n), senderAddr, uint32(len(senderAddr)), unix.ControlMessages{}, syserr.FromError(err) +} + +// SendMsg implements socket.Socket.SendMsg. +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { + // Whitelist flags. + if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { + return 0, syserr.ErrInvalidArgument + } + + sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of src.Addrs was unusable. + if uint64(src.NumBytes()) != srcs.NumBytes() { + return 0, nil + } + if srcs.IsEmpty() { + return 0, nil + } + + // We always do a non-blocking send*(). + sysflags := flags | syscall.MSG_DONTWAIT + + if srcs.NumBlocks() == 1 { + // Skip allocating []syscall.Iovec. 
+ src := srcs.Head() + n, _, errno := syscall.Syscall6(syscall.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil + } + + iovs := iovecsFromBlockSeq(srcs) + msg := syscall.Msghdr{ + Iov: &iovs[0], + Iovlen: uint64(len(iovs)), + } + if len(to) != 0 { + msg.Name = &to[0] + msg.Namelen = uint32(len(to)) + } + return sendmsg(s.fd, &msg, sysflags) + }) + + var ch chan struct{} + n, err := src.CopyInTo(t, sendmsgFromBlocks) + if flags&syscall.MSG_DONTWAIT == 0 { + for err == syserror.ErrWouldBlock { + // We only expect blocking to come from the actual syscall, in which + // case it can't have returned any data. + if n != 0 { + panic(fmt.Sprintf("CopyInTo: got (%d, %v), wanted (0, %v)", n, err, err)) + } + if ch != nil { + if err = t.Block(ch); err != nil { + break + } + } else { + var e waiter.Entry + e, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + } + n, err = src.CopyInTo(t, sendmsgFromBlocks) + } + } + + return int(n), syserr.FromError(err) +} + +func iovecsFromBlockSeq(bs safemem.BlockSeq) []syscall.Iovec { + iovs := make([]syscall.Iovec, 0, bs.NumBlocks()) + for ; !bs.IsEmpty(); bs = bs.Tail() { + b := bs.Head() + iovs = append(iovs, syscall.Iovec{ + Base: &b.ToSlice()[0], + Len: uint64(b.Len()), + }) + // We don't need to care about b.NeedSafecopy(), because the host + // kernel will handle such address ranges just fine (by returning + // EFAULT). + } + return iovs +} + +func translateIOSyscallError(err error) error { + if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +type socketProvider struct { + family int +} + +// Socket implements socket.Provider.Socket. 
+func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protocol int) (*fs.File, *syserr.Error) { + // Check that we are using the host network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + if _, ok := stack.(*Stack); !ok { + return nil, nil + } + + // Only accept TCP and UDP. + stype := int(stypeflags) & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 + // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. + fd, err := syscall.Socket(p.family, stype|syscall.SOCK_NONBLOCK, 0) + if err != nil { + return nil, syserr.FromError(err) + } + return newSocketFile(t, fd, stypeflags&syscall.SOCK_NONBLOCK != 0) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProvider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. + return nil, nil, nil +} + +func init() { + for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { + socket.RegisterProvider(family, &socketProvider{family}) + } +} diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go new file mode 100644 index 000000000..f8bb75636 --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -0,0 +1,138 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func firstBytePtr(bs []byte) unsafe.Pointer { + if bs == nil { + return nil + } + return unsafe.Pointer(&bs[0]) +} + +// Preconditions: len(dsts) != 0. +func readv(fd int, dsts []syscall.Iovec) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&dsts[0])), uintptr(len(dsts))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +// Preconditions: len(srcs) != 0. +func writev(fd int, srcs []syscall.Iovec) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&srcs[0])), uintptr(len(srcs))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +// Ioctl implements fs.FileOperations.Ioctl. 
+func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := uintptr(args[1].Int()); cmd { + case syscall.TIOCINQ, syscall.TIOCOUTQ: + var val int32 + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s.fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { + return 0, translateIOSyscallError(errno) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], uint32(val)) + _, err := io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func accept4(fd int, addr *byte, addrlen *uint32, flags int) (int, error) { + afd, _, errno := syscall.Syscall6(syscall.SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(addr)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return int(afd), nil +} + +func getsockopt(fd int, level, name int, optlen int) ([]byte, error) { + opt := make([]byte, optlen) + optlen32 := int32(len(opt)) + _, _, errno := syscall.Syscall6(syscall.SYS_GETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(unsafe.Pointer(&optlen32)), 0) + if errno != 0 { + return nil, errno + } + return opt[:optlen32], nil +} + +// GetSockName implements socket.Socket.GetSockName. +func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + addr := make([]byte, sizeofSockaddr) + addrlen := uint32(len(addr)) + _, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) + if errno != 0 { + return nil, 0, syserr.FromError(errno) + } + return addr[:addrlen], addrlen, nil +} + +// GetPeerName implements socket.Socket.GetPeerName. 
+func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + addr := make([]byte, sizeofSockaddr) + addrlen := uint32(len(addr)) + _, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) + if errno != 0 { + return nil, 0, syserr.FromError(errno) + } + return addr[:addrlen], addrlen, nil +} + +func recvfrom(fd int, dst []byte, flags int, from *[]byte) (uint64, error) { + fromLen := uint32(len(*from)) + n, _, errno := syscall.Syscall6(syscall.SYS_RECVFROM, uintptr(fd), uintptr(firstBytePtr(dst)), uintptr(len(dst)), uintptr(flags), uintptr(firstBytePtr(*from)), uintptr(unsafe.Pointer(&fromLen))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + *from = (*from)[:fromLen] + return uint64(n), nil +} + +func recvmsg(fd int, msg *syscall.Msghdr, flags int) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +func sendmsg(fd int, msg *syscall.Msghdr, flags int) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go new file mode 100644 index 000000000..44c3b9a3f --- /dev/null +++ b/pkg/sentry/socket/hostinet/stack.go @@ -0,0 +1,244 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "fmt" + "io/ioutil" + "os" + "strings" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +var defaultRecvBufSize = inet.TCPBufferSize{ + Min: 4096, + Default: 87380, + Max: 6291456, +} + +var defaultSendBufSize = inet.TCPBufferSize{ + Min: 4096, + Default: 16384, + Max: 4194304, +} + +// Stack implements inet.Stack for host sockets. +type Stack struct { + // Stack is immutable. + interfaces map[int32]inet.Interface + interfaceAddrs map[int32][]inet.InterfaceAddr + supportsIPv6 bool + tcpRecvBufSize inet.TCPBufferSize + tcpSendBufSize inet.TCPBufferSize + tcpSACKEnabled bool +} + +// NewStack returns an empty Stack containing no configuration. +func NewStack() *Stack { + return &Stack{ + interfaces: make(map[int32]inet.Interface), + interfaceAddrs: make(map[int32][]inet.InterfaceAddr), + } +} + +// Configure sets up the stack using the current state of the host network. 
+func (s *Stack) Configure() error { + if err := addHostInterfaces(s); err != nil { + return err + } + + if _, err := os.Stat("/proc/net/if_inet6"); err == nil { + s.supportsIPv6 = true + } + + s.tcpRecvBufSize = defaultRecvBufSize + if tcpRMem, err := readTCPBufferSizeFile("/proc/sys/net/ipv4/tcp_rmem"); err == nil { + s.tcpRecvBufSize = tcpRMem + } else { + log.Warningf("Failed to read TCP receive buffer size, using default values") + } + + s.tcpSendBufSize = defaultSendBufSize + if tcpWMem, err := readTCPBufferSizeFile("/proc/sys/net/ipv4/tcp_wmem"); err == nil { + s.tcpSendBufSize = tcpWMem + } else { + log.Warningf("Failed to read TCP send buffer size, using default values") + } + + s.tcpSACKEnabled = false + if sack, err := ioutil.ReadFile("/proc/sys/net/ipv4/tcp_sack"); err == nil { + s.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" + } else { + log.Warningf("Failed to read if TCP SACK if enabled, setting to false") + } + + return nil +} + +// ExtractHostInterfaces will populate an interface map and +// interfaceAddrs map with the results of the equivalent +// netlink messages. +func ExtractHostInterfaces(links []syscall.NetlinkMessage, addrs []syscall.NetlinkMessage, interfaces map[int32]inet.Interface, interfaceAddrs map[int32][]inet.InterfaceAddr) error { + for _, link := range links { + if link.Header.Type != syscall.RTM_NEWLINK { + continue + } + if len(link.Data) < syscall.SizeofIfInfomsg { + return fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid data length (%d bytes, expected at least %d bytes)", len(link.Data), syscall.SizeofIfInfomsg) + } + var ifinfo syscall.IfInfomsg + binary.Unmarshal(link.Data[:syscall.SizeofIfInfomsg], usermem.ByteOrder, &ifinfo) + inetIF := inet.Interface{ + DeviceType: ifinfo.Type, + Flags: ifinfo.Flags, + } + // Not clearly documented: syscall.ParseNetlinkRouteAttr will check the + // syscall.NetlinkMessage.Header.Type and skip the struct ifinfomsg + // accordingly. 
+ attrs, err := syscall.ParseNetlinkRouteAttr(&link) + if err != nil { + return fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid rtattrs: %v", err) + } + for _, attr := range attrs { + switch attr.Attr.Type { + case syscall.IFLA_ADDRESS: + inetIF.Addr = attr.Value + case syscall.IFLA_IFNAME: + inetIF.Name = string(attr.Value[:len(attr.Value)-1]) + } + } + interfaces[ifinfo.Index] = inetIF + } + + for _, addr := range addrs { + if addr.Header.Type != syscall.RTM_NEWADDR { + continue + } + if len(addr.Data) < syscall.SizeofIfAddrmsg { + return fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid data length (%d bytes, expected at least %d bytes)", len(addr.Data), syscall.SizeofIfAddrmsg) + } + var ifaddr syscall.IfAddrmsg + binary.Unmarshal(addr.Data[:syscall.SizeofIfAddrmsg], usermem.ByteOrder, &ifaddr) + inetAddr := inet.InterfaceAddr{ + Family: ifaddr.Family, + PrefixLen: ifaddr.Prefixlen, + Flags: ifaddr.Flags, + } + attrs, err := syscall.ParseNetlinkRouteAttr(&addr) + if err != nil { + return fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid rtattrs: %v", err) + } + for _, attr := range attrs { + switch attr.Attr.Type { + case syscall.IFA_ADDRESS: + inetAddr.Addr = attr.Value + } + } + interfaceAddrs[int32(ifaddr.Index)] = append(interfaceAddrs[int32(ifaddr.Index)], inetAddr) + } + + return nil +} + +func addHostInterfaces(s *Stack) error { + links, err := doNetlinkRouteRequest(syscall.RTM_GETLINK) + if err != nil { + return fmt.Errorf("RTM_GETLINK failed: %v", err) + } + + addrs, err := doNetlinkRouteRequest(syscall.RTM_GETADDR) + if err != nil { + return fmt.Errorf("RTM_GETADDR failed: %v", err) + } + + return ExtractHostInterfaces(links, addrs, s.interfaces, s.interfaceAddrs) +} + +func doNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) { + data, err := syscall.NetlinkRIB(req, syscall.AF_UNSPEC) + if err != nil { + return nil, err + } + return syscall.ParseNetlinkMessage(data) +} + +func 
readTCPBufferSizeFile(filename string) (inet.TCPBufferSize, error) { + contents, err := ioutil.ReadFile(filename) + if err != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to read %s: %v", filename, err) + } + ioseq := usermem.BytesIOSequence(contents) + fields := make([]int32, 3) + if n, err := usermem.CopyInt32StringsInVec(context.Background(), ioseq.IO, ioseq.Addrs, fields, ioseq.Opts); n != ioseq.NumBytes() || err != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to parse %s (%q): got %v after %d/%d bytes", filename, contents, err, n, ioseq.NumBytes()) + } + return inet.TCPBufferSize{ + Min: int(fields[0]), + Default: int(fields[1]), + Max: int(fields[2]), + }, nil +} + +// Interfaces implements inet.Stack.Interfaces. +func (s *Stack) Interfaces() map[int32]inet.Interface { + return s.interfaces +} + +// InterfaceAddrs implements inet.Stack.InterfaceAddrs. +func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { + return s.interfaceAddrs +} + +// SupportsIPv6 implements inet.Stack.SupportsIPv6. +func (s *Stack) SupportsIPv6() bool { + return s.supportsIPv6 +} + +// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. +func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { + return s.tcpRecvBufSize, nil +} + +// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. +func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { + return syserror.EACCES +} + +// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. +func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { + return s.tcpSendBufSize, nil +} + +// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. +func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { + return syserror.EACCES +} + +// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. 
+func (s *Stack) TCPSACKEnabled() (bool, error) { + return s.tcpSACKEnabled, nil +} + +// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. +func (s *Stack) SetTCPSACKEnabled(enabled bool) error { + return syserror.EACCES +} diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD new file mode 100644 index 000000000..9df3ab17c --- /dev/null +++ b/pkg/sentry/socket/netlink/BUILD @@ -0,0 +1,47 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "netlink_state", + srcs = [ + "socket.go", + ], + out = "netlink_state.go", + package = "netlink", +) + +go_library( + name = "netlink", + srcs = [ + "message.go", + "netlink_state.go", + "provider.go", + "socket.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket", + "//pkg/sentry/socket/netlink/port", + "//pkg/sentry/socket/unix", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go new file mode 100644 index 000000000..b902d7ec9 --- /dev/null +++ b/pkg/sentry/socket/netlink/message.go @@ -0,0 +1,159 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "fmt" + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// alignUp rounds a length up to an alignment. +// +// Preconditions: align is a power of two. +func alignUp(length int, align uint) int { + return (length + int(align) - 1) &^ (int(align) - 1) +} + +// Message contains a complete serialized netlink message. +type Message struct { + buf []byte +} + +// NewMessage creates a new Message containing the passed header. +// +// The header length will be updated by Finalize. +func NewMessage(hdr linux.NetlinkMessageHeader) *Message { + return &Message{ + buf: binary.Marshal(nil, usermem.ByteOrder, hdr), + } +} + +// Finalize returns the []byte containing the entire message, with the total +// length set in the message header. The Message must not be modified after +// calling Finalize. +func (m *Message) Finalize() []byte { + // Update length, which is the first 4 bytes of the header. + usermem.ByteOrder.PutUint32(m.buf, uint32(len(m.buf))) + + // Align the message. Note that the message length in the header (set + // above) is the useful length of the message, not the total aligned + // length. See net/netlink/af_netlink.c:__nlmsg_put. + aligned := alignUp(len(m.buf), linux.NLMSG_ALIGNTO) + m.putZeros(aligned - len(m.buf)) + return m.buf +} + +// putZeros adds n zeros to the message. 
+func (m *Message) putZeros(n int) { + for n > 0 { + m.buf = append(m.buf, 0) + n-- + } +} + +// Put serializes v into the message. +func (m *Message) Put(v interface{}) { + m.buf = binary.Marshal(m.buf, usermem.ByteOrder, v) +} + +// PutAttr adds v to the message as a netlink attribute. +// +// Preconditions: The serialized attribute (linux.NetlinkAttrHeaderSize + +// binary.Size(v) fits in math.MaxUint16 bytes. +func (m *Message) PutAttr(atype uint16, v interface{}) { + l := linux.NetlinkAttrHeaderSize + int(binary.Size(v)) + if l > math.MaxUint16 { + panic(fmt.Sprintf("attribute too large: %d", l)) + } + + m.Put(linux.NetlinkAttrHeader{ + Type: atype, + Length: uint16(l), + }) + m.Put(v) + + // Align the attribute. + aligned := alignUp(l, linux.NLA_ALIGNTO) + m.putZeros(aligned - l) +} + +// PutAttrString adds s to the message as a netlink attribute. +func (m *Message) PutAttrString(atype uint16, s string) { + l := linux.NetlinkAttrHeaderSize + len(s) + 1 + m.Put(linux.NetlinkAttrHeader{ + Type: atype, + Length: uint16(l), + }) + + // String + NUL-termination. + m.Put([]byte(s)) + m.putZeros(1) + + // Align the attribute. + aligned := alignUp(l, linux.NLA_ALIGNTO) + m.putZeros(aligned - l) +} + +// MessageSet contains a series of netlink messages. +type MessageSet struct { + // Multi indicates that this a multi-part message, to be terminated by + // NLMSG_DONE. NLMSG_DONE is sent even if the set contains only one + // Message. + // + // If Multi is set, all added messages will have NLM_F_MULTI set. + Multi bool + + // PortID is the destination port for all messages. + PortID int32 + + // Seq is the sequence counter for all messages in the set. + Seq uint32 + + // Messages contains the messages in the set. + Messages []*Message +} + +// NewMessageSet creates a new MessageSet. +// +// portID is the destination port to set as PortID in all messages. +// +// seq is the sequence counter to set as seq in all messages in the set. 
+func NewMessageSet(portID int32, seq uint32) *MessageSet { + return &MessageSet{ + PortID: portID, + Seq: seq, + } +} + +// AddMessage adds a new message to the set and returns it for further +// additions. +// +// The passed header will have Seq, PortID and the multi flag set +// automatically. +func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message { + hdr.Seq = ms.Seq + hdr.PortID = uint32(ms.PortID) + if ms.Multi { + hdr.Flags |= linux.NLM_F_MULTI + } + + m := NewMessage(hdr) + ms.Messages = append(ms.Messages, m) + return m +} diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD new file mode 100644 index 000000000..7340b95c9 --- /dev/null +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -0,0 +1,28 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "port_state", + srcs = ["port.go"], + out = "port_state.go", + package = "port", +) + +go_library( + name = "port", + srcs = [ + "port.go", + "port_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port", + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/state"], +) + +go_test( + name = "port_test", + srcs = ["port_test.go"], + embed = [":port"], +) diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go new file mode 100644 index 000000000..4ccf0b84c --- /dev/null +++ b/pkg/sentry/socket/netlink/port/port.go @@ -0,0 +1,114 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package port provides port ID allocation for netlink sockets.
//
// A netlink port is any int32 value. Positive ports are typically equivalent
// to the PID of the binding process. If that port is unavailable, negative
// ports are searched to find a free port that will not conflict with other
// PIDS.
package port

import (
	"fmt"
	"math"
	"math/rand"
	"sync"
)

// maxPorts is a sanity limit on the maximum number of ports to allocate per
// protocol.
const maxPorts = 10000

// Manager allocates netlink port IDs.
type Manager struct {
	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// ports contains a map of allocated ports for each protocol.
	ports map[int]map[int32]struct{}
}

// New creates a new Manager with no ports allocated.
func New() *Manager {
	return &Manager{
		ports: make(map[int]map[int32]struct{}),
	}
}

// Allocate reserves a new port ID for protocol. hint will be taken if
// available.
//
// Returns the allocated port and true on success, or 0 and false if the
// per-protocol limit (maxPorts) has been reached.
func (m *Manager) Allocate(protocol int, hint int32) (int32, bool) {
	m.mu.Lock()
	defer m.mu.Unlock()

	proto, ok := m.ports[protocol]
	if !ok {
		// Lazily create the per-protocol allocation set.
		proto = make(map[int32]struct{})
		// Port 0 is reserved for the kernel.
		proto[0] = struct{}{}
		m.ports[protocol] = proto
	}

	if len(proto) >= maxPorts {
		return 0, false
	}

	if _, ok := proto[hint]; !ok {
		// Hint is available, reserve it.
		proto[hint] = struct{}{}
		return hint, true
	}

	// Search for any free port in [math.MinInt32, -4096). The positive
	// port space is left open for pid-based allocations. This behavior is
	// consistent with Linux.
	//
	// Start at a random point in the range so concurrent sockets don't
	// all probe the same sequence.
	start := int32(math.MinInt32 + rand.Int63n(math.MaxInt32-4096+1))
	curr := start
	for {
		if _, ok := proto[curr]; !ok {
			proto[curr] = struct{}{}
			return curr, true
		}

		curr--
		// Decrementing past MinInt32 wraps to a positive value, which
		// is outside the search range; restart the scan at -4097, the
		// highest port in the range.
		if curr >= -4096 {
			curr = -4097
		}
		if curr == start {
			// Nothing found. We should always find a free port
			// because maxPorts < -4096 - MinInt32.
			panic(fmt.Sprintf("No free port found in %+v", proto))
		}
	}
}

// Release frees the specified port for protocol.
//
// Preconditions: port is already allocated.
func (m *Manager) Release(protocol int, port int32) {
	m.mu.Lock()
	defer m.mu.Unlock()

	proto, ok := m.ports[protocol]
	if !ok {
		panic(fmt.Sprintf("Released port %d for protocol %d which has no allocations", port, protocol))
	}

	if _, ok := proto[port]; !ok {
		panic(fmt.Sprintf("Released port %d for protocol %d is not allocated", port, protocol))
	}

	delete(proto, port)
}
diff --git a/pkg/sentry/socket/netlink/port/port_test.go b/pkg/sentry/socket/netlink/port/port_test.go
new file mode 100644
index 000000000..34565e2f9
--- /dev/null
+++ b/pkg/sentry/socket/netlink/port/port_test.go
@@ -0,0 +1,82 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package port

import (
	"testing"
)

func TestAllocateHint(t *testing.T) {
	m := New()

	// We can get the hint port.
+ p, ok := m.Allocate(0, 1) + if !ok { + t.Errorf("m.Allocate got !ok want ok") + } + if p != 1 { + t.Errorf("m.Allocate(0, 1) got %d want 1", p) + } + + // Hint is taken. + p, ok = m.Allocate(0, 1) + if !ok { + t.Errorf("m.Allocate got !ok want ok") + } + if p == 1 { + t.Errorf("m.Allocate(0, 1) got 1 want anything else") + } + + // Hint is available for a different protocol. + p, ok = m.Allocate(1, 1) + if !ok { + t.Errorf("m.Allocate got !ok want ok") + } + if p != 1 { + t.Errorf("m.Allocate(1, 1) got %d want 1", p) + } + + m.Release(0, 1) + + // Hint is available again after release. + p, ok = m.Allocate(0, 1) + if !ok { + t.Errorf("m.Allocate got !ok want ok") + } + if p != 1 { + t.Errorf("m.Allocate(0, 1) got %d want 1", p) + } +} + +func TestAllocateExhausted(t *testing.T) { + m := New() + + // Fill all ports (0 is already reserved). + for i := int32(1); i < maxPorts; i++ { + p, ok := m.Allocate(0, i) + if !ok { + t.Fatalf("m.Allocate got !ok want ok") + } + if p != i { + t.Fatalf("m.Allocate(0, %d) got %d want %d", i, p, i) + } + } + + // Now no more can be allocated. + p, ok := m.Allocate(0, 1) + if ok { + t.Errorf("m.Allocate got %d, ok want !ok", p) + } +} diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go new file mode 100644 index 000000000..36800da4d --- /dev/null +++ b/pkg/sentry/socket/netlink/provider.go @@ -0,0 +1,104 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package netlink

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
	"gvisor.googlesource.com/gvisor/pkg/syserr"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
)

// Protocol is the implementation of a netlink socket protocol.
type Protocol interface {
	// Protocol returns the Linux netlink protocol value.
	Protocol() int

	// ProcessMessage processes a single message from userspace.
	//
	// If err == nil, any messages added to ms will be sent back to the
	// other end of the socket. Setting ms.Multi will cause an NLMSG_DONE
	// message to be sent even if ms contains no messages.
	ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error
}

// Provider is a function that creates a new Protocol for a specific netlink
// protocol.
//
// Note that this is distinct from socket.Provider, which is used for all
// socket families.
type Provider func(t *kernel.Task) (Protocol, *syserr.Error)

// protocols holds a map of all known address protocols and their provider.
//
// The map is not synchronized; see the precondition on RegisterProvider.
var protocols = make(map[int]Provider)

// RegisterProvider registers the provider of a given address protocol so that
// netlink sockets of that type can be created via socket(2).
//
// Preconditions: May only be called before any netlink sockets are created.
func RegisterProvider(protocol int, provider Provider) {
	if p, ok := protocols[protocol]; ok {
		// Double registration is a programmer error.
		panic(fmt.Sprintf("Netlink protocol %d already provided by %+v", protocol, p))
	}

	protocols[protocol] = provider
}

// socketProvider implements socket.Provider.
type socketProvider struct {
}

// Socket implements socket.Provider.Socket.
func (*socketProvider) Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) {
	// Netlink sockets must be specified as datagram or raw, but they
	// behave the same regardless of type.
	if stype != unix.SockDgram && stype != unix.SockRaw {
		return nil, syserr.ErrSocketNotSupported
	}

	// Look up the protocol implementation registered for this netlink
	// protocol number.
	provider, ok := protocols[protocol]
	if !ok {
		return nil, syserr.ErrProtocolNotSupported
	}

	p, err := provider(t)
	if err != nil {
		return nil, err
	}

	s, err := NewSocket(t, p)
	if err != nil {
		return nil, err
	}

	// Wrap the socket in a file so it can be installed in the task's FD
	// table.
	d := socket.NewDirent(t, netlinkSocketDevice)
	return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true}, s), nil
}

// Pair implements socket.Provider.Pair by returning an error.
func (*socketProvider) Pair(*kernel.Task, unix.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
	// Netlink sockets never support creating socket pairs.
	return nil, nil, syserr.ErrNotSupported
}

// init registers the socket provider.
func init() {
	socket.RegisterProvider(linux.AF_NETLINK, &socketProvider{})
}
diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD
new file mode 100644
index 000000000..ff3f7b7a4
--- /dev/null
+++ b/pkg/sentry/socket/netlink/route/BUILD
@@ -0,0 +1,33 @@
package(licenses = ["notice"])  # Apache 2.0

load("@io_bazel_rules_go//go:def.bzl", "go_library")
load("//tools/go_stateify:defs.bzl", "go_stateify")

go_stateify(
    name = "route_state",
    srcs = ["protocol.go"],
    out = "route_state.go",
    package = "route",
)

go_library(
    name = "route",
    srcs = [
        "protocol.go",
        "route_state.go",
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route",
    visibility = ["//pkg/sentry:internal"],
    deps = [
        "//pkg/abi/linux",
        "//pkg/sentry/context",
        "//pkg/sentry/fs",
        "//pkg/sentry/inet",
        "//pkg/sentry/kernel",
        "//pkg/sentry/kernel/auth",
        "//pkg/sentry/socket/netlink",
        "//pkg/sentry/usermem",
"//pkg/state", + "//pkg/syserr", + ], +) diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go new file mode 100644 index 000000000..d611519d4 --- /dev/null +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -0,0 +1,189 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package route provides a NETLINK_ROUTE socket protocol. +package route + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink" + "gvisor.googlesource.com/gvisor/pkg/syserr" +) + +// commandKind describes the operational class of a message type. +// +// The route message types use the lower 2 bits of the type to describe class +// of command. +type commandKind int + +const ( + kindNew commandKind = 0x0 + kindDel = 0x1 + kindGet = 0x2 + kindSet = 0x3 +) + +func typeKind(typ uint16) commandKind { + return commandKind(typ & 0x3) +} + +// Protocol implements netlink.Protocol. +type Protocol struct { + // stack is the network stack that this provider describes. + // + // May be nil. + stack inet.Stack +} + +var _ netlink.Protocol = (*Protocol)(nil) + +// NewProtocol creates a NETLINK_ROUTE netlink.Protocol. 
+func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { + return &Protocol{ + stack: t.NetworkContext(), + }, nil +} + +// Protocol implements netlink.Protocol.Protocol. +func (p *Protocol) Protocol() int { + return linux.NETLINK_ROUTE +} + +// dumpLinks handles RTM_GETLINK + NLM_F_DUMP requests. +func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { + // NLM_F_DUMP + RTM_GETLINK messages are supposed to include an + // ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some + // userspace applications (including glibc) still include rtgenmsg. + // Linux has a workaround based on the total message length. + // + // We don't bother to check for either, since we don't support any + // extra attributes that may be included anyways. + // + // The message may also contain netlink attribute IFLA_EXT_MASK, which + // we don't support. + + // The RTM_GETLINK dump response is a set of messages each containing + // an InterfaceInfoMessage followed by a set of netlink attributes. + + // We always send back an NLMSG_DONE. + ms.Multi = true + + if p.stack == nil { + // No network devices. + return nil + } + + for id, i := range p.stack.Interfaces() { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWLINK, + }) + + m.Put(linux.InterfaceInfoMessage{ + Family: linux.AF_UNSPEC, + Type: i.DeviceType, + Index: id, + Flags: i.Flags, + }) + + m.PutAttrString(linux.IFLA_IFNAME, i.Name) + + // TODO: There are many more attributes, such as + // MAC address. + } + + return nil +} + +// dumpAddrs handles RTM_GETADDR + NLM_F_DUMP requests. +func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { + // RTM_GETADDR dump requests need not contain anything more than the + // netlink header and 1 byte protocol family common to all + // NETLINK_ROUTE requests. 
+ // + // TODO: Filter output by passed protocol family. + + // The RTM_GETADDR dump response is a set of RTM_NEWADDR messages each + // containing an InterfaceAddrMessage followed by a set of netlink + // attributes. + + // We always send back an NLMSG_DONE. + ms.Multi = true + + if p.stack == nil { + // No network devices. + return nil + } + + for id, as := range p.stack.InterfaceAddrs() { + for _, a := range as { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWADDR, + }) + + m.Put(linux.InterfaceAddrMessage{ + Family: a.Family, + PrefixLen: a.PrefixLen, + Index: uint32(id), + }) + + m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr)) + + // TODO: There are many more attributes. + } + } + + return nil +} + +// ProcessMessage implements netlink.Protocol.ProcessMessage. +func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { + // All messages start with a 1 byte protocol family. + if len(data) < 1 { + // Linux ignores messages missing the protocol family. See + // net/core/rtnetlink.c:rtnetlink_rcv_msg. + return nil + } + + // Non-GET message types require CAP_NET_ADMIN. + if typeKind(hdr.Type) != kindGet { + creds := auth.CredentialsFromContext(ctx) + if !creds.HasCapability(linux.CAP_NET_ADMIN) { + return syserr.ErrPermissionDenied + } + } + + // TODO: Only the dump variant of the types below are + // supported. + if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP { + return syserr.ErrNotSupported + } + + switch hdr.Type { + case linux.RTM_GETLINK: + return p.dumpLinks(ctx, hdr, data, ms) + case linux.RTM_GETADDR: + return p.dumpAddrs(ctx, hdr, data, ms) + default: + return syserr.ErrNotSupported + } +} + +// init registers the NETLINK_ROUTE provider. 
+func init() { + netlink.RegisterProvider(linux.NETLINK_ROUTE, NewProtocol) +} diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go new file mode 100644 index 000000000..2d0e59ceb --- /dev/null +++ b/pkg/sentry/socket/netlink/socket.go @@ -0,0 +1,517 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package netlink provides core functionality for netlink sockets. +package netlink + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" + sunix "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + 
"gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// defaultSendBufferSize is the default size for the send buffer. +const defaultSendBufferSize = 16 * 1024 + +// netlinkSocketDevice is the netlink socket virtual device. +var netlinkSocketDevice = device.NewAnonDevice() + +// Socket is the base socket type for netlink sockets. +// +// This implementation only supports userspace sending and receiving messages +// to/from the kernel. +// +// Socket implements socket.Socket. +type Socket struct { + socket.ReceiveTimeout + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + + // ports provides netlink port allocation. + ports *port.Manager + + // protocol is the netlink protocol implementation. + protocol Protocol + + // ep is a datagram unix endpoint used to buffer messages sent from the + // kernel to userspace. RecvMsg reads messages from this endpoint. + ep unix.Endpoint + + // connection is the kernel's connection to ep, used to write messages + // sent to userspace. + connection unix.ConnectedEndpoint + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // bound indicates that portid is valid. + bound bool + + // portID is the port ID allocated for this socket. + portID int32 + + // sendBufferSize is the send buffer "size". We don't actually have a + // fixed buffer but only consume this many bytes. + sendBufferSize uint64 +} + +var _ socket.Socket = (*Socket)(nil) + +// NewSocket creates a new Socket. +func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) { + // Datagram endpoint used to buffer kernel -> user messages. + ep := unix.NewConnectionless() + + // Bind the endpoint for good measure so we can connect to it. The + // bound address will never be exposed. 
+ if terr := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); terr != nil { + ep.Close() + return nil, syserr.TranslateNetstackError(terr) + } + + // Create a connection from which the kernel can write messages. + connection, terr := ep.(unix.BoundEndpoint).UnidirectionalConnect() + if terr != nil { + ep.Close() + return nil, syserr.TranslateNetstackError(terr) + } + + return &Socket{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, nil +} + +// Release implements fs.FileOperations.Release. +func (s *Socket) Release() { + s.connection.Release() + s.ep.Close() + + if s.bound { + s.ports.Release(s.protocol.Protocol(), s.portID) + } +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { + // ep holds messages to be read and thus handles EventIn readiness. + ready := s.ep.Readiness(mask) + + if mask&waiter.EventOut == waiter.EventOut { + // sendMsg handles messages synchronously and is thus always + // ready for writing. + ready |= waiter.EventOut + } + + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *Socket) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.ep.EventRegister(e, mask) + // Writable readiness never changes, so no registration is needed. +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *Socket) EventUnregister(e *waiter.Entry) { + s.ep.EventUnregister(e) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (s *Socket) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // TODO: no ioctls supported. + return 0, syserror.ENOTTY +} + +// ExtractSockAddr extracts the SockAddrNetlink from b. 
+func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) { + if len(b) < linux.SockAddrNetlinkSize { + return nil, syserr.ErrBadAddress + } + + var sa linux.SockAddrNetlink + binary.Unmarshal(b[:linux.SockAddrNetlinkSize], usermem.ByteOrder, &sa) + + if sa.Family != linux.AF_NETLINK { + return nil, syserr.ErrInvalidArgument + } + + return &sa, nil +} + +// bindPort binds this socket to a port, preferring 'port' if it is available. +// +// port of 0 defaults to the ThreadGroup ID. +// +// Preconditions: mu is held. +func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { + if s.bound { + // Re-binding is only allowed if the port doesn't change. + if port != s.portID { + return syserr.ErrInvalidArgument + } + + return nil + } + + if port == 0 { + port = int32(t.ThreadGroup().ID()) + } + port, ok := s.ports.Allocate(s.protocol.Protocol(), port) + if !ok { + return syserr.ErrBusy + } + + s.portID = port + s.bound = true + return nil +} + +// Bind implements socket.Socket.Bind. +func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + a, err := ExtractSockAddr(sockaddr) + if err != nil { + return err + } + + // No support for multicast groups yet. + if a.Groups != 0 { + return syserr.ErrPermissionDenied + } + + s.mu.Lock() + defer s.mu.Unlock() + + return s.bindPort(t, int32(a.PortID)) +} + +// Connect implements socket.Socket.Connect. +func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + a, err := ExtractSockAddr(sockaddr) + if err != nil { + return err + } + + // No support for multicast groups yet. + if a.Groups != 0 { + return syserr.ErrPermissionDenied + } + + s.mu.Lock() + defer s.mu.Unlock() + + if a.PortID == 0 { + // Netlink sockets default to connected to the kernel, but + // connecting anyways automatically binds if not already bound. + if !s.bound { + // Pass port 0 to get an auto-selected port ID. 
+ return s.bindPort(t, 0) + } + return nil + } + + // We don't support non-kernel destination ports. Linux returns EPERM + // if applications attempt to do this without NL_CFG_F_NONROOT_SEND, so + // we emulate that. + return syserr.ErrPermissionDenied +} + +// Accept implements socket.Socket.Accept. +func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { + // Netlink sockets never support accept. + return 0, nil, 0, syserr.ErrNotSupported +} + +// Listen implements socket.Socket.Listen. +func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error { + // Netlink sockets never support listen. + return syserr.ErrNotSupported +} + +// Shutdown implements socket.Socket.Shutdown. +func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { + // Netlink sockets never support shutdown. + return syserr.ErrNotSupported +} + +// GetSockOpt implements socket.Socket.GetSockOpt. +func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) { + // TODO: no sockopts supported. + return nil, syserr.ErrProtocolNotAvailable +} + +// SetSockOpt implements socket.Socket.SetSockOpt. +func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { + // TODO: no sockopts supported. + return syserr.ErrProtocolNotAvailable +} + +// GetSockName implements socket.Socket.GetSockName. +func (s *Socket) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + s.mu.Lock() + defer s.mu.Unlock() + + sa := linux.SockAddrNetlink{ + Family: linux.AF_NETLINK, + PortID: uint32(s.portID), + } + return sa, uint32(binary.Size(sa)), nil +} + +// GetPeerName implements socket.Socket.GetPeerName. +func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + sa := linux.SockAddrNetlink{ + Family: linux.AF_NETLINK, + // TODO: Support non-kernel peers. For now the peer + // must be the kernel. 
+ PortID: 0, + } + return sa, uint32(binary.Size(sa)), nil +} + +// RecvMsg implements socket.Socket.RecvMsg. +func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { + from := linux.SockAddrNetlink{ + Family: linux.AF_NETLINK, + PortID: 0, + } + fromLen := uint32(binary.Size(from)) + + trunc := flags&linux.MSG_TRUNC != 0 + + r := sunix.EndpointReader{ + Endpoint: s.ep, + Peek: flags&linux.MSG_PEEK != 0, + } + + if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + if trunc { + n = int64(r.MsgSize) + } + return int(n), from, fromLen, unix.ControlMessages{}, syserr.FromError(err) + } + + // We'll have to block. Register for notification and keep trying to + // receive all the data. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock { + if trunc { + n = int64(r.MsgSize) + } + return int(n), from, fromLen, unix.ControlMessages{}, syserr.FromError(err) + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + } + return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + } + } +} + +// Read implements fs.FileOperations.Read. +func (s *Socket) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &sunix.EndpointReader{ + Endpoint: s.ep, + }) +} + +// sendResponse sends the response messages in ms back to userspace. +func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { + // Linux combines multiple netlink messages into a single datagram. 
+ bufs := make([][]byte, 0, len(ms.Messages)) + for _, m := range ms.Messages { + bufs = append(bufs, m.Finalize()) + } + + if len(bufs) > 0 { + // RecvMsg never receives the address, so we don't need to send + // one. + _, notify, terr := s.connection.Send(bufs, unix.ControlMessages{}, tcpip.FullAddress{}) + // If the buffer is full, we simply drop messages, just like + // Linux. + if terr != nil && terr != tcpip.ErrWouldBlock { + return syserr.TranslateNetstackError(terr) + } + if notify { + s.connection.SendNotify() + } + } + + // N.B. multi-part messages should still send NLMSG_DONE even if + // MessageSet contains no messages. + // + // N.B. NLMSG_DONE is always sent in a different datagram. See + // net/netlink/af_netlink.c:netlink_dump. + if ms.Multi { + m := NewMessage(linux.NetlinkMessageHeader{ + Type: linux.NLMSG_DONE, + Flags: linux.NLM_F_MULTI, + Seq: ms.Seq, + PortID: uint32(ms.PortID), + }) + + _, notify, terr := s.connection.Send([][]byte{m.Finalize()}, unix.ControlMessages{}, tcpip.FullAddress{}) + if terr != nil && terr != tcpip.ErrWouldBlock { + return syserr.TranslateNetstackError(terr) + } + if notify { + s.connection.SendNotify() + } + } + + return nil +} + +// processMessages handles each message in buf, passing it to the protocol +// handler for final handling. +func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { + for len(buf) > 0 { + if len(buf) < linux.NetlinkMessageHeaderSize { + // Linux ignores messages that are too short. See + // net/netlink/af_netlink.c:netlink_rcv_skb. + break + } + + var hdr linux.NetlinkMessageHeader + binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr) + + if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) { + // Linux ignores malformed messages. See + // net/netlink/af_netlink.c:netlink_rcv_skb. + break + } + + // Data from this message. 
+ data := buf[linux.NetlinkMessageHeaderSize:hdr.Length] + + // Advance to the next message. + next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO) + if next >= len(buf)-1 { + next = len(buf) - 1 + } + buf = buf[next:] + + // Ignore control messages. + if hdr.Type < linux.NLMSG_MIN_TYPE { + continue + } + + // TODO: ACKs not supported yet. + if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { + return syserr.ErrNotSupported + } + + ms := NewMessageSet(s.portID, hdr.Seq) + if err := s.protocol.ProcessMessage(ctx, hdr, data, ms); err != nil { + return err + } + + if err := s.sendResponse(ctx, ms); err != nil { + return err + } + } + + return nil +} + +// sendMsg is the core of message send, used for SendMsg and Write. +func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { + dstPort := int32(0) + + if len(to) != 0 { + a, err := ExtractSockAddr(to) + if err != nil { + return 0, err + } + + // No support for multicast groups yet. + if a.Groups != 0 { + return 0, syserr.ErrPermissionDenied + } + + dstPort = int32(a.PortID) + } + + if dstPort != 0 { + // Non-kernel destinations not supported yet. Treat as if + // NL_CFG_F_NONROOT_SEND is not set. + return 0, syserr.ErrPermissionDenied + } + + s.mu.Lock() + defer s.mu.Unlock() + + // For simplicity, and consistency with Linux, we copy in the entire + // message up front. + if uint64(src.NumBytes()) > s.sendBufferSize { + return 0, syserr.ErrMessageTooLong + } + + buf := make([]byte, src.NumBytes()) + n, err := src.CopyIn(ctx, buf) + if err != nil { + // Don't partially consume messages. + return 0, syserr.FromError(err) + } + + if err := s.processMessages(ctx, buf); err != nil { + return 0, err + } + + return n, nil +} + +// SendMsg implements socket.Socket.SendMsg. 
+func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { + return s.sendMsg(t, src, to, flags, controlMessages) +} + +// Write implements fs.FileOperations.Write. +func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := s.sendMsg(ctx, src, nil, 0, unix.ControlMessages{}) + return int64(n), err.ToError() +} diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD new file mode 100644 index 000000000..b0351b363 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -0,0 +1,59 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "rpcinet", + srcs = [ + "device.go", + "rpcinet.go", + "socket.go", + "stack.go", + "stack_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet", + visibility = ["//pkg/sentry:internal"], + deps = [ + ":syscall_rpc_go_proto", + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/socket", + "//pkg/sentry/socket/hostinet", + "//pkg/sentry/socket/rpcinet/conn", + "//pkg/sentry/socket/rpcinet/notifier", + "//pkg/sentry/usermem", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip/buffer", + "//pkg/tcpip/transport/unix", + "//pkg/unet", + "//pkg/waiter", + ], +) + +proto_library( + name = "syscall_rpc_proto", + srcs = ["syscall_rpc.proto"], + visibility = [ + "//visibility:public", + ], +) + +go_proto_library( + name = "syscall_rpc_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto", + proto = ":syscall_rpc_proto", + 
visibility = [ + "//visibility:public", + ], +) diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD new file mode 100644 index 000000000..4923dee4b --- /dev/null +++ b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -0,0 +1,17 @@ +package(licenses = ["notice"]) # BSD + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "conn", + srcs = ["conn.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/binary", + "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", + "//pkg/syserr", + "//pkg/unet", + "@com_github_golang_protobuf//proto:go_default_library", + ], +) diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go new file mode 100644 index 000000000..ea6ec87ed --- /dev/null +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -0,0 +1,167 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package conn is an RPC connection to a syscall RPC server. 
+package conn + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + + "github.com/golang/protobuf/proto" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/unet" + + pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" +) + +type request struct { + response []byte + ready chan struct{} + ignoreResult bool +} + +// RPCConnection represents a single RPC connection to a syscall gofer. +type RPCConnection struct { + // reqID is the ID of the last request and must be accessed atomically. + reqID uint64 + + sendMu sync.Mutex + socket *unet.Socket + + reqMu sync.Mutex + requests map[uint64]request +} + +// NewRPCConnection initializes a RPC connection to a socket gofer. +func NewRPCConnection(s *unet.Socket) *RPCConnection { + conn := &RPCConnection{socket: s, requests: map[uint64]request{}} + go func() { // S/R-FIXME + var nums [16]byte + for { + for n := 0; n < len(nums); { + nn, err := conn.socket.Read(nums[n:]) + if err != nil { + panic(fmt.Sprint("error reading length from socket rpc gofer: ", err)) + } + n += nn + } + + b := make([]byte, binary.LittleEndian.Uint64(nums[:8])) + id := binary.LittleEndian.Uint64(nums[8:]) + + for n := 0; n < len(b); { + nn, err := conn.socket.Read(b[n:]) + if err != nil { + panic(fmt.Sprint("error reading request from socket rpc gofer: ", err)) + } + n += nn + } + + conn.reqMu.Lock() + r := conn.requests[id] + if r.ignoreResult { + delete(conn.requests, id) + } else { + r.response = b + conn.requests[id] = r + } + conn.reqMu.Unlock() + close(r.ready) + } + }() + return conn +} + +// NewRequest makes a request to the RPC gofer and returns the request ID and a +// channel which will be closed once the request completes. 
+func (c *RPCConnection) NewRequest(req pb.SyscallRequest, ignoreResult bool) (uint64, chan struct{}) { + b, err := proto.Marshal(&req) + if err != nil { + panic(fmt.Sprint("invalid proto: ", err)) + } + + id := atomic.AddUint64(&c.reqID, 1) + ch := make(chan struct{}) + + c.reqMu.Lock() + c.requests[id] = request{ready: ch, ignoreResult: ignoreResult} + c.reqMu.Unlock() + + c.sendMu.Lock() + defer c.sendMu.Unlock() + + var nums [16]byte + binary.LittleEndian.PutUint64(nums[:8], uint64(len(b))) + binary.LittleEndian.PutUint64(nums[8:], id) + for n := 0; n < len(nums); { + nn, err := c.socket.Write(nums[n:]) + if err != nil { + panic(fmt.Sprint("error writing length and ID to socket gofer: ", err)) + } + n += nn + } + + for n := 0; n < len(b); { + nn, err := c.socket.Write(b[n:]) + if err != nil { + panic(fmt.Sprint("error writing request to socket gofer: ", err)) + } + n += nn + } + + return id, ch +} + +// RPCReadFile will execute the ReadFile helper RPC method which avoids the +// common pattern of open(2), read(2), close(2) by doing all three operations +// as a single RPC. It will read the entire file or return EFBIG if the file +// was too large. +func (c *RPCConnection) RPCReadFile(path string) ([]byte, *syserr.Error) { + req := &pb.SyscallRequest_ReadFile{&pb.ReadFileRequest{ + Path: path, + }} + + id, ch := c.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-ch + + res := c.Request(id).Result.(*pb.SyscallResponse_ReadFile).ReadFile.Result + if e, ok := res.(*pb.ReadFileResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.ReadFileResponse_Data).Data, nil +} + +// Request retrieves the request corresponding to the given request ID. +// +// The channel returned by NewRequest must have been closed before Request can +// be called. This will happen automatically, do not manually close the +// channel. 
+func (c *RPCConnection) Request(id uint64) pb.SyscallResponse { + c.reqMu.Lock() + r := c.requests[id] + delete(c.requests, id) + c.reqMu.Unlock() + + var resp pb.SyscallResponse + if err := proto.Unmarshal(r.response, &resp); err != nil { + panic(fmt.Sprint("invalid proto: ", err)) + } + + return resp +} diff --git a/pkg/sentry/socket/rpcinet/device.go b/pkg/sentry/socket/rpcinet/device.go new file mode 100644 index 000000000..f7b63436e --- /dev/null +++ b/pkg/sentry/socket/rpcinet/device.go @@ -0,0 +1,19 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package rpcinet + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +var socketDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD new file mode 100644 index 000000000..6f3b06a05 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -0,0 +1,15 @@ +package(licenses = ["notice"]) # BSD + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "notifier", + srcs = ["notifier.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", + "//pkg/sentry/socket/rpcinet/conn", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go new file mode 100644 index 000000000..f88a908ed --- /dev/null +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -0,0 +1,230 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package notifier implements an FD notifier implementation over RPC. 
+package notifier + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +type fdInfo struct { + queue *waiter.Queue + waiting bool +} + +// Notifier holds all the state necessary to issue notifications when IO events +// occur in the observed FDs. +type Notifier struct { + // rpcConn is the connection that is used for sending RPCs. + rpcConn *conn.RPCConnection + + // epFD is the epoll file descriptor used to register for io + // notifications. + epFD uint32 + + // mu protects fdMap. + mu sync.Mutex + + // fdMap maps file descriptors to their notification queues and waiting + // status. + fdMap map[uint32]*fdInfo +} + +// NewRPCNotifier creates a new notifier object. +func NewRPCNotifier(cn *conn.RPCConnection) (*Notifier, error) { + id, c := cn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCreate1{&pb.EpollCreate1Request{}}}, false /* ignoreResult */) + <-c + + res := cn.Request(id).Result.(*pb.SyscallResponse_EpollCreate1).EpollCreate1.Result + if e, ok := res.(*pb.EpollCreate1Response_ErrorNumber); ok { + return nil, syscall.Errno(e.ErrorNumber) + } + + w := &Notifier{ + rpcConn: cn, + epFD: res.(*pb.EpollCreate1Response_Fd).Fd, + fdMap: make(map[uint32]*fdInfo), + } + + go w.waitAndNotify() // S/R-FIXME + + return w, nil +} + +// waitFD waits on mask for fd. The fdMap mutex must be hold. 
+func (n *Notifier) waitFD(fd uint32, fi *fdInfo, mask waiter.EventMask) error { + if !fi.waiting && mask == 0 { + return nil + } + + e := pb.EpollEvent{ + Events: uint32(mask) | -syscall.EPOLLET, + Fd: fd, + } + + switch { + case !fi.waiting && mask != 0: + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_ADD, Fd: fd, Event: &e}}}, false /* ignoreResult */) + <-c + + e := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollCtl).EpollCtl.ErrorNumber + if e != 0 { + return syscall.Errno(e) + } + + fi.waiting = true + case fi.waiting && mask == 0: + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_DEL, Fd: fd}}}, false /* ignoreResult */) + <-c + n.rpcConn.Request(id) + + fi.waiting = false + case fi.waiting && mask != 0: + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollCtl{&pb.EpollCtlRequest{Epfd: n.epFD, Op: syscall.EPOLL_CTL_MOD, Fd: fd, Event: &e}}}, false /* ignoreResult */) + <-c + + e := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollCtl).EpollCtl.ErrorNumber + if e != 0 { + return syscall.Errno(e) + } + } + + return nil +} + +// addFD adds an FD to the list of FDs observed by n. +func (n *Notifier) addFD(fd uint32, queue *waiter.Queue) { + n.mu.Lock() + defer n.mu.Unlock() + + // Panic if we're already notifying on this FD. + if _, ok := n.fdMap[fd]; ok { + panic(fmt.Sprintf("File descriptor %d added twice", fd)) + } + + // We have nothing to wait for at the moment. Just add it to the map. + n.fdMap[fd] = &fdInfo{queue: queue} +} + +// updateFD updates the set of events the FD needs to be notified on. 
+func (n *Notifier) updateFD(fd uint32) error { + n.mu.Lock() + defer n.mu.Unlock() + + if fi, ok := n.fdMap[fd]; ok { + return n.waitFD(fd, fi, fi.queue.Events()) + } + + return nil +} + +// RemoveFD removes an FD from the list of FDs observed by n. +func (n *Notifier) removeFD(fd uint32) { + n.mu.Lock() + defer n.mu.Unlock() + + // Remove from map, then from epoll object. + n.waitFD(fd, n.fdMap[fd], 0) + delete(n.fdMap, fd) +} + +// hasFD returns true if the FD is in the list of observed FDs. +func (n *Notifier) hasFD(fd uint32) bool { + n.mu.Lock() + defer n.mu.Unlock() + + _, ok := n.fdMap[fd] + return ok +} + +// waitAndNotify loops waiting for io event notifications from the epoll +// object. Once notifications arrive, they are dispatched to the +// registered queue. +func (n *Notifier) waitAndNotify() error { + for { + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_EpollWait{&pb.EpollWaitRequest{Fd: n.epFD, NumEvents: 100, Msec: -1}}}, false /* ignoreResult */) + <-c + + res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollWait).EpollWait.Result + if e, ok := res.(*pb.EpollWaitResponse_ErrorNumber); ok { + err := syscall.Errno(e.ErrorNumber) + // NOTE: I don't think epoll_wait can return EAGAIN but I'm being + // conseratively careful here since exiting the notification thread + // would be really bad. + if err == syscall.EINTR || err == syscall.EAGAIN { + continue + } + return err + } + + n.mu.Lock() + for _, e := range res.(*pb.EpollWaitResponse_Events).Events.Events { + if fi, ok := n.fdMap[e.Fd]; ok { + fi.queue.Notify(waiter.EventMask(e.Events)) + } + } + n.mu.Unlock() + } +} + +// AddFD adds an FD to the list of observed FDs. +func (n *Notifier) AddFD(fd uint32, queue *waiter.Queue) error { + n.addFD(fd, queue) + return nil +} + +// UpdateFD updates the set of events the FD needs to be notified on. 
+func (n *Notifier) UpdateFD(fd uint32) error { + return n.updateFD(fd) +} + +// RemoveFD removes an FD from the list of observed FDs. +func (n *Notifier) RemoveFD(fd uint32) { + n.removeFD(fd) +} + +// HasFD returns true if the FD is in the list of observed FDs. +// +// This should only be used by tests to assert that FDs are correctly +// registered. +func (n *Notifier) HasFD(fd uint32) bool { + return n.hasFD(fd) +} + +// NonBlockingPoll polls the given fd in non-blocking fashion. It is used just +// to query the FD's current state; this method will block on the RPC response +// although the syscall is non-blocking. +func (n *Notifier) NonBlockingPoll(fd uint32, mask waiter.EventMask) waiter.EventMask { + for { + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Poll{&pb.PollRequest{Fd: fd, Events: uint32(mask)}}}, false /* ignoreResult */) + <-c + + res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_Poll).Poll.Result + if e, ok := res.(*pb.PollResponse_ErrorNumber); ok { + if syscall.Errno(e.ErrorNumber) == syscall.EINTR { + continue + } + return mask + } + + return waiter.EventMask(res.(*pb.PollResponse_Events).Events) + } +} diff --git a/pkg/sentry/socket/rpcinet/rpcinet.go b/pkg/sentry/socket/rpcinet/rpcinet.go new file mode 100644 index 000000000..10b0dedc2 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/rpcinet.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package rpcinet implements sockets using an RPC for each syscall. +package rpcinet diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go new file mode 100644 index 000000000..574d99ba5 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -0,0 +1,567 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package rpcinet + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// socketOperations implements 
fs.FileOperations and socket.Socket for a socket +// implemented using a host socket. +type socketOperations struct { + socket.ReceiveTimeout + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + + fd uint32 // must be O_NONBLOCK + wq *waiter.Queue + rpcConn *conn.RPCConnection + notifier *notifier.Notifier +} + +// Verify that we actually implement socket.Socket. +var _ = socket.Socket(&socketOperations{}) + +// New creates a new RPC socket. +func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, protocol int) (*fs.File, *syserr.Error) { + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Socket).Socket.Result + if e, ok := res.(*pb.SocketResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + fd := res.(*pb.SocketResponse_Fd).Fd + + var wq waiter.Queue + stack.notifier.AddFD(fd, &wq) + + dirent := socket.NewDirent(ctx, socketDevice) + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{ + wq: &wq, + fd: fd, + rpcConn: stack.rpcConn, + notifier: stack.notifier, + }), nil +} + +func isBlockingErrno(err error) bool { + return err == syscall.EAGAIN || err == syscall.EWOULDBLOCK +} + +func translateIOSyscallError(err error) error { + if isBlockingErrno(err) { + return syserror.ErrWouldBlock + } + return err +} + +// Release implements fs.FileOperations.Release. +func (s *socketOperations) Release() { + s.notifier.RemoveFD(s.fd) + + // We always need to close the FD. 
+ _, _ = s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Close{&pb.CloseRequest{Fd: s.fd}}}, true /* ignoreResult */) +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.notifier.NonBlockingPoll(s.fd, mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.wq.EventRegister(e, mask) + s.notifier.UpdateFD(s.fd) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketOperations) EventUnregister(e *waiter.Entry) { + s.wq.EventUnregister(e) + s.notifier.UpdateFD(s.fd) +} + +func rpcRead(t *kernel.Task, req *pb.SyscallRequest_Read) (*pb.ReadResponse_Data, *syserr.Error) { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Read).Read.Result + if e, ok := res.(*pb.ReadResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.ReadResponse_Data), nil +} + +// Read implements fs.FileOperations.Read. +func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + req := &pb.SyscallRequest_Read{&pb.ReadRequest{ + Fd: s.fd, + Length: uint32(dst.NumBytes()), + }} + + res, se := rpcRead(ctx.(*kernel.Task), req) + if se == nil { + n, e := dst.CopyOut(ctx, res.Data) + return int64(n), e + } + if se != syserr.ErrWouldBlock { + return 0, se.ToError() + } + + // We'll have to block. Register for notifications and read again when ready. 
+ e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + res, se := rpcRead(ctx.(*kernel.Task), req) + if se == nil { + n, e := dst.CopyOut(ctx, res.Data) + return int64(n), e + } + if se != syserr.ErrWouldBlock { + return 0, se.ToError() + } + + if err := ctx.(*kernel.Task).Block(ch); err != nil { + return 0, err + } + } +} + +func rpcWrite(t *kernel.Task, req *pb.SyscallRequest_Write) (uint32, *syserr.Error) { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Write).Write.Result + if e, ok := res.(*pb.WriteResponse_ErrorNumber); ok { + return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.WriteResponse_Length).Length, nil +} + +// Write implements fs.FileOperations.Write. +func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + t := ctx.(*kernel.Task) + v := buffer.NewView(int(src.NumBytes())) + + // Copy all the data into the buffer. + if _, err := src.CopyIn(t, v); err != nil { + return 0, err + } + + n, err := rpcWrite(t, &pb.SyscallRequest_Write{&pb.WriteRequest{Fd: s.fd, Data: v}}) + return int64(n), err.ToError() +} + +func rpcConnect(t *kernel.Task, fd uint32, sockaddr []byte) *syserr.Error { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Connect{&pb.ConnectRequest{Fd: uint32(fd), Address: sockaddr}}}, false /* ignoreResult */) + <-c + + if e := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Connect).Connect.ErrorNumber; e != 0 { + return syserr.FromHost(syscall.Errno(e)) + } + return nil +} + +// Connect implements socket.Socket.Connect. 
+func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + if !blocking { + return rpcConnect(t, s.fd, sockaddr) + } + + // Register for notification when the endpoint becomes writable, then + // initiate the connection. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + + if err := rpcConnect(t, s.fd, sockaddr); err != syserr.ErrConnectStarted && err != syserr.ErrAlreadyConnecting { + return err + } + + // It's pending, so we have to wait for a notification, and fetch the + // result once the wait completes. + if err := t.Block(ch); err != nil { + return syserr.FromError(err) + } + + // Call Connect() again after blocking to find connect's result. + return rpcConnect(t, s.fd, sockaddr) +} + +func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultPayload, *syserr.Error) { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Accept{&pb.AcceptRequest{Fd: fd, Peer: peer, Flags: syscall.SOCK_NONBLOCK}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Accept).Accept.Result + if e, ok := res.(*pb.AcceptResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + return res.(*pb.AcceptResponse_Payload).Payload, nil +} + +// Accept implements socket.Socket.Accept. +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { + payload, se := rpcAccept(t, s.fd, peerRequested) + + // Check if we need to block. + if blocking && se == syserr.ErrWouldBlock { + // Register for notifications. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + // Try to accept the connection again; if it fails, then wait until we + // get a notification. 
+		for {
+			if payload, se = rpcAccept(t, s.fd, peerRequested); se != syserr.ErrWouldBlock {
+				break
+			}
+
+			if err := t.Block(ch); err != nil {
+				return 0, nil, 0, syserr.FromError(err)
+			}
+		}
+	}
+
+	// Handle any error from accept.
+	if se != nil {
+		return 0, nil, 0, se
+	}
+
+	var wq waiter.Queue
+	s.notifier.AddFD(payload.Fd, &wq)
+
+	dirent := socket.NewDirent(t, socketDevice)
+	file := fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonBlocking: flags&linux.SOCK_NONBLOCK != 0}, &socketOperations{
+		wq: &wq,
+		fd: payload.Fd,
+		rpcConn: s.rpcConn, notifier: s.notifier,
+	})
+
+	fdFlags := kernel.FDFlags{
+		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
+	}
+	fd, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, 0, syserr.FromError(err)
+	}
+
+	return fd, payload.Address.Address, payload.Address.Length, nil
+}
+
+// Bind implements socket.Socket.Bind.
+func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
+	stack := t.NetworkContext().(*Stack)
+	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Bind{&pb.BindRequest{Fd: s.fd, Address: sockaddr}}}, false /* ignoreResult */)
+	<-c
+
+	if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 {
+		return syserr.FromHost(syscall.Errno(e))
+	}
+	return nil
+}
+
+// Listen implements socket.Socket.Listen.
+func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error {
+	stack := t.NetworkContext().(*Stack)
+	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Listen{&pb.ListenRequest{Fd: s.fd, Backlog: int64(backlog)}}}, false /* ignoreResult */)
+	<-c
+
+	if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Listen).Listen.ErrorNumber; e != 0 {
+		return syserr.FromHost(syscall.Errno(e))
+	}
+	return nil
+}
+
+// Shutdown implements socket.Socket.Shutdown. 
+func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
+	stack := t.NetworkContext().(*Stack)
+	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Shutdown{&pb.ShutdownRequest{Fd: s.fd, How: int64(how)}}}, false /* ignoreResult */)
+	<-c
+
+	if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Shutdown).Shutdown.ErrorNumber; e != 0 {
+		return syserr.FromHost(syscall.Errno(e))
+	}
+	return nil
+}
+
+// GetSockOpt implements socket.Socket.GetSockOpt.
+func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) {
+	stack := t.NetworkContext().(*Stack)
+	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockOpt{&pb.GetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Length: uint32(outLen)}}}, false /* ignoreResult */)
+	<-c
+
+	res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetSockOpt).GetSockOpt.Result
+	if e, ok := res.(*pb.GetSockOptResponse_ErrorNumber); ok {
+		return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber))
+	}
+
+	return res.(*pb.GetSockOptResponse_Opt).Opt, nil
+}
+
+// SetSockOpt implements socket.Socket.SetSockOpt.
+func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
+	stack := t.NetworkContext().(*Stack)
+	id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_SetSockOpt{&pb.SetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Opt: opt}}}, false /* ignoreResult */)
+	<-c
+
+	if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_SetSockOpt).SetSockOpt.ErrorNumber; e != 0 {
+		return syserr.FromHost(syscall.Errno(e))
+	}
+	return nil
+}
+
+// GetPeerName implements socket.Socket.GetPeerName. 
+func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetPeerName{&pb.GetPeerNameRequest{Fd: s.fd}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetPeerName).GetPeerName.Result + if e, ok := res.(*pb.GetPeerNameResponse_ErrorNumber); ok { + return nil, 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + addr := res.(*pb.GetPeerNameResponse_Address).Address + return addr.Address, addr.Length, nil +} + +// GetSockName implements socket.Socket.GetSockName. +func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + stack := t.NetworkContext().(*Stack) + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockName{&pb.GetSockNameRequest{Fd: s.fd}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_GetSockName).GetSockName.Result + if e, ok := res.(*pb.GetSockNameResponse_ErrorNumber); ok { + return nil, 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + addr := res.(*pb.GetSockNameResponse_Address).Address + return addr.Address, addr.Length, nil +} + +// Ioctl implements fs.FileOperations.Ioctl. 
+func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +func rpcRecvMsg(t *kernel.Task, req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Recvmsg).Recvmsg.Result + if e, ok := res.(*pb.RecvmsgResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.RecvmsgResponse_Payload).Payload, nil +} + +// RecvMsg implements socket.Socket.RecvMsg. +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { + req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ + Fd: s.fd, + Length: uint32(dst.NumBytes()), + Sender: senderRequested, + Trunc: flags&linux.MSG_TRUNC != 0, + Peek: flags&linux.MSG_PEEK != 0, + }} + + res, err := rpcRecvMsg(t, req) + if err == nil { + n, e := dst.CopyOut(t, res.Data) + return int(n), res.Address.GetAddress(), res.Address.GetLength(), unix.ControlMessages{}, syserr.FromError(e) + } + if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + return 0, nil, 0, unix.ControlMessages{}, err + } + + // We'll have to block. Register for notifications and keep trying to + // send all the data. 
+ e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + res, err := rpcRecvMsg(t, req) + if err == nil { + n, e := dst.CopyOut(t, res.Data) + return int(n), res.Address.GetAddress(), res.Address.GetLength(), unix.ControlMessages{}, syserr.FromError(e) + } + if err != syserr.ErrWouldBlock { + return 0, nil, 0, unix.ControlMessages{}, err + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + } + return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + } + } +} + +func rpcSendMsg(t *kernel.Task, req *pb.SyscallRequest_Sendmsg) (uint32, *syserr.Error) { + s := t.NetworkContext().(*Stack) + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Sendmsg).Sendmsg.Result + if e, ok := res.(*pb.SendmsgResponse_ErrorNumber); ok { + return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.SendmsgResponse_Length).Length, nil +} + +// SendMsg implements socket.Socket.SendMsg. +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { + // Whitelist flags. + if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { + return 0, syserr.ErrInvalidArgument + } + + // Reject control messages. + if !controlMessages.Empty() { + return 0, syserr.ErrInvalidArgument + } + + v := buffer.NewView(int(src.NumBytes())) + + // Copy all the data into the buffer. + if _, err := src.CopyIn(t, v); err != nil { + return 0, syserr.FromError(err) + } + + // TODO: this needs to change to map directly to a SendMsg syscall + // in the RPC. 
+ req := &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ + Fd: uint32(s.fd), + Data: v, + Address: to, + More: flags&linux.MSG_MORE != 0, + EndOfRecord: flags&linux.MSG_EOR != 0, + }} + + n, err := rpcSendMsg(t, req) + if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + return int(n), err + } + + // We'll have to block. Register for notification and keep trying to + // send all the data. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + + for { + n, err := rpcSendMsg(t, req) + if err != syserr.ErrWouldBlock { + return int(n), err + } + + if err := t.Block(ch); err != nil { + return 0, syserr.FromError(err) + } + } +} + +type socketProvider struct { + family int +} + +// Socket implements socket.Provider.Socket. +func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protocol int) (*fs.File, *syserr.Error) { + // Check that we are using the RPC network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + + s, ok := stack.(*Stack) + if !ok { + return nil, nil + } + + // Only accept TCP and UDP. + // + // Try to restrict the flags we will accept to minimize backwards + // incompatability with netstack. + stype := int(stypeflags) & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + return newSocketFile(t, s, p.family, stype, 0) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProvider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. 
+ return nil, nil, nil +} + +func init() { + for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { + socket.RegisterProvider(family, &socketProvider{family}) + } +} diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go new file mode 100644 index 000000000..503e0e932 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package rpcinet + +import ( + "fmt" + "strings" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +// Stack implements inet.Stack for RPC backed sockets. +type Stack struct { + // We intentionally do not allow these values to be changed to remain + // consistent with the other networking stacks. 
+ interfaces map[int32]inet.Interface + interfaceAddrs map[int32][]inet.InterfaceAddr + supportsIPv6 bool + tcpRecvBufSize inet.TCPBufferSize + tcpSendBufSize inet.TCPBufferSize + tcpSACKEnabled bool + rpcConn *conn.RPCConnection + notifier *notifier.Notifier +} + +func readTCPBufferSizeFile(conn *conn.RPCConnection, filename string) (inet.TCPBufferSize, error) { + contents, se := conn.RPCReadFile(filename) + if se != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to read %s: %v", filename, se) + } + ioseq := usermem.BytesIOSequence(contents) + fields := make([]int32, 3) + if n, err := usermem.CopyInt32StringsInVec(context.Background(), ioseq.IO, ioseq.Addrs, fields, ioseq.Opts); n != ioseq.NumBytes() || err != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to parse %s (%q): got %v after %d/%d bytes", filename, contents, err, n, ioseq.NumBytes()) + } + return inet.TCPBufferSize{ + Min: int(fields[0]), + Default: int(fields[1]), + Max: int(fields[2]), + }, nil +} + +// NewStack returns a Stack containing the current state of the host network +// stack. +func NewStack(fd int32) (*Stack, error) { + sock, err := unet.NewSocket(int(fd)) + if err != nil { + return nil, err + } + + stack := &Stack{ + interfaces: make(map[int32]inet.Interface), + interfaceAddrs: make(map[int32][]inet.InterfaceAddr), + rpcConn: conn.NewRPCConnection(sock), + } + + var e error + stack.notifier, e = notifier.NewRPCNotifier(stack.rpcConn) + if e != nil { + return nil, e + } + + // Load the configuration values from procfs. 
+ tcpRMem, e := readTCPBufferSizeFile(stack.rpcConn, "/proc/sys/net/ipv4/tcp_rmem") + if e != nil { + return nil, e + } + stack.tcpRecvBufSize = tcpRMem + + tcpWMem, e := readTCPBufferSizeFile(stack.rpcConn, "/proc/sys/net/ipv4/tcp_wmem") + if e != nil { + return nil, e + } + stack.tcpSendBufSize = tcpWMem + + ipv6, se := stack.rpcConn.RPCReadFile("/proc/net/if_inet6") + if len(string(ipv6)) > 0 { + stack.supportsIPv6 = true + } + + sackFile := "/proc/sys/net/ipv4/tcp_sack" + sack, se := stack.rpcConn.RPCReadFile(sackFile) + if se != nil { + return nil, fmt.Errorf("failed to read %s: %v", sackFile, se) + } + stack.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" + + links, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETLINK) + if err != nil { + return nil, fmt.Errorf("RTM_GETLINK failed: %v", err) + } + + addrs, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETADDR) + if err != nil { + return nil, fmt.Errorf("RTM_GETADDR failed: %v", err) + } + + e = hostinet.ExtractHostInterfaces(links, addrs, stack.interfaces, stack.interfaceAddrs) + if e != nil { + return nil, e + } + + return stack, nil +} + +// Interfaces implements inet.Stack.Interfaces. +func (s *Stack) Interfaces() map[int32]inet.Interface { + return s.interfaces +} + +// InterfaceAddrs implements inet.Stack.InterfaceAddrs. +func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { + return s.interfaceAddrs +} + +// SupportsIPv6 implements inet.Stack.SupportsIPv6. +func (s *Stack) SupportsIPv6() bool { + return s.supportsIPv6 +} + +// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. +func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { + return s.tcpRecvBufSize, nil +} + +// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. +func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { + // To keep all the supported stacks consistent we don't allow changing this + // value even though it would be possible via an RPC. 
+ return syserror.EACCES +} + +// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. +func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { + return s.tcpSendBufSize, nil +} + +// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. +func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { + // To keep all the supported stacks consistent we don't allow changing this + // value even though it would be possible via an RPC. + return syserror.EACCES +} + +// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. +func (s *Stack) TCPSACKEnabled() (bool, error) { + return s.tcpSACKEnabled, nil +} + +// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. +func (s *Stack) SetTCPSACKEnabled(enabled bool) error { + // To keep all the supported stacks consistent we don't allow changing this + // value even though it would be possible via an RPC. + return syserror.EACCES +} diff --git a/pkg/sentry/socket/rpcinet/stack_unsafe.go b/pkg/sentry/socket/rpcinet/stack_unsafe.go new file mode 100644 index 000000000..9a896c623 --- /dev/null +++ b/pkg/sentry/socket/rpcinet/stack_unsafe.go @@ -0,0 +1,193 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package rpcinet + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" +) + +// NewNetlinkRouteRequest builds a netlink message for getting the RIB, +// the routing information base. +func newNetlinkRouteRequest(proto, seq, family int) []byte { + rr := &syscall.NetlinkRouteRequest{} + rr.Header.Len = uint32(syscall.NLMSG_HDRLEN + syscall.SizeofRtGenmsg) + rr.Header.Type = uint16(proto) + rr.Header.Flags = syscall.NLM_F_DUMP | syscall.NLM_F_REQUEST + rr.Header.Seq = uint32(seq) + rr.Data.Family = uint8(family) + return netlinkRRtoWireFormat(rr) +} + +func netlinkRRtoWireFormat(rr *syscall.NetlinkRouteRequest) []byte { + b := make([]byte, rr.Header.Len) + *(*uint32)(unsafe.Pointer(&b[0:4][0])) = rr.Header.Len + *(*uint16)(unsafe.Pointer(&b[4:6][0])) = rr.Header.Type + *(*uint16)(unsafe.Pointer(&b[6:8][0])) = rr.Header.Flags + *(*uint32)(unsafe.Pointer(&b[8:12][0])) = rr.Header.Seq + *(*uint32)(unsafe.Pointer(&b[12:16][0])) = rr.Header.Pid + b[16] = byte(rr.Data.Family) + return b +} + +func (s *Stack) getNetlinkFd() (uint32, *syserr.Error) { + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(syscall.AF_NETLINK), Type: int64(syscall.SOCK_RAW | syscall.SOCK_NONBLOCK), Protocol: int64(syscall.NETLINK_ROUTE)}}}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Socket).Socket.Result + if e, ok := res.(*pb.SocketResponse_ErrorNumber); ok { + return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + return res.(*pb.SocketResponse_Fd).Fd, nil +} + +func (s *Stack) bindNetlinkFd(fd uint32, sockaddr []byte) *syserr.Error { + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: 
&pb.SyscallRequest_Bind{&pb.BindRequest{Fd: fd, Address: sockaddr}}}, false /* ignoreResult */) + <-c + + if e := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 { + return syserr.FromHost(syscall.Errno(e)) + } + return nil +} + +func (s *Stack) closeNetlinkFd(fd uint32) { + _, _ = s.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Close{&pb.CloseRequest{Fd: fd}}}, true /* ignoreResult */) +} + +func (s *Stack) rpcSendMsg(req *pb.SyscallRequest_Sendmsg) (uint32, *syserr.Error) { + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Sendmsg).Sendmsg.Result + if e, ok := res.(*pb.SendmsgResponse_ErrorNumber); ok { + return 0, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.SendmsgResponse_Length).Length, nil +} + +func (s *Stack) sendMsg(fd uint32, buf []byte, to []byte, flags int) (int, *syserr.Error) { + // Whitelist flags. 
+ if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { + return 0, syserr.ErrInvalidArgument + } + + req := &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ + Fd: fd, + Data: buf, + Address: to, + More: flags&linux.MSG_MORE != 0, + EndOfRecord: flags&linux.MSG_EOR != 0, + }} + + n, err := s.rpcSendMsg(req) + return int(n), err +} + +func (s *Stack) rpcRecvMsg(req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) { + id, c := s.rpcConn.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-c + + res := s.rpcConn.Request(id).Result.(*pb.SyscallResponse_Recvmsg).Recvmsg.Result + if e, ok := res.(*pb.RecvmsgResponse_ErrorNumber); ok { + return nil, syserr.FromHost(syscall.Errno(e.ErrorNumber)) + } + + return res.(*pb.RecvmsgResponse_Payload).Payload, nil +} + +func (s *Stack) recvMsg(fd, l, flags uint32) ([]byte, *syserr.Error) { + req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ + Fd: fd, + Length: l, + Sender: false, + Trunc: flags&linux.MSG_TRUNC != 0, + Peek: flags&linux.MSG_PEEK != 0, + }} + + res, err := s.rpcRecvMsg(req) + if err != nil { + return nil, err + } + return res.Data, nil +} + +func (s *Stack) netlinkRequest(proto, family int) ([]byte, error) { + fd, err := s.getNetlinkFd() + if err != nil { + return nil, err.ToError() + } + defer s.closeNetlinkFd(fd) + + lsa := syscall.SockaddrNetlink{Family: syscall.AF_NETLINK} + b := binary.Marshal(nil, usermem.ByteOrder, &lsa) + if err := s.bindNetlinkFd(fd, b); err != nil { + return nil, err.ToError() + } + + wb := newNetlinkRouteRequest(proto, 1, family) + _, err = s.sendMsg(fd, wb, b, 0) + if err != nil { + return nil, err.ToError() + } + + var tab []byte +done: + for { + rb, err := s.recvMsg(fd, uint32(syscall.Getpagesize()), 0) + nr := len(rb) + if err != nil { + return nil, err.ToError() + } + + if nr < syscall.NLMSG_HDRLEN { + return nil, syserr.ErrInvalidArgument.ToError() + } + + tab = append(tab, 
rb...) + msgs, e := syscall.ParseNetlinkMessage(rb) + if e != nil { + return nil, e + } + + for _, m := range msgs { + if m.Header.Type == syscall.NLMSG_DONE { + break done + } + if m.Header.Type == syscall.NLMSG_ERROR { + return nil, syserr.ErrInvalidArgument.ToError() + } + } + } + + return tab, nil +} + +// DoNetlinkRouteRequest returns routing information base, also known as RIB, +// which consists of network facility information, states and parameters. +func (s *Stack) DoNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) { + data, err := s.netlinkRequest(req, syscall.AF_UNSPEC) + if err != nil { + return nil, err + } + return syscall.ParseNetlinkMessage(data) +} diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto new file mode 100644 index 000000000..b845b1bce --- /dev/null +++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto @@ -0,0 +1,351 @@ +syntax = "proto3"; + +// package syscall_rpc is a set of networking related system calls that can be +// forwarded to a socket gofer. +// +// TODO: Document individual RPCs. 
+package syscall_rpc; + +message SendmsgRequest { + uint32 fd = 1; + bytes data = 2; + bytes address = 3; + bool more = 4; + bool end_of_record = 5; +} + +message SendmsgResponse { + oneof result { + uint32 error_number = 1; + uint32 length = 2; + } +} + +message IOCtlRequest { + uint32 fd = 1; + uint32 cmd = 2; + uint64 arg = 3; +} + +message IOCtlResponse { + oneof result { + uint32 error_number = 1; + uint64 value = 2; + } +} + +message RecvmsgRequest { + uint32 fd = 1; + uint32 length = 2; + bool sender = 3; + bool peek = 4; + bool trunc = 5; +} + +message OpenRequest { + bytes path = 1; + uint32 flags = 2; + uint32 mode = 3; +} + +message OpenResponse { + oneof result { + uint32 error_number = 1; + uint32 fd = 2; + } +} + +message ReadRequest { + uint32 fd = 1; + uint32 length = 2; +} + +message ReadResponse { + oneof result { + uint32 error_number = 1; + bytes data = 2; + } +} + +message ReadFileRequest { + string path = 1; +} + +message ReadFileResponse { + oneof result { + uint32 error_number = 1; + bytes data = 2; + } +} + +message WriteRequest { + uint32 fd = 1; + bytes data = 2; +} + +message WriteResponse { + oneof result { + uint32 error_number = 1; + uint32 length = 2; + } +} + +message WriteFileRequest { + string path = 1; + bytes content = 2; +} + +message WriteFileResponse { + uint32 error_number = 1; + uint32 written = 2; +} + +message AddressResponse { + bytes address = 1; + uint32 length = 2; +} + +message RecvmsgResponse { + message ResultPayload { + bytes data = 1; + AddressResponse address = 2; + uint32 length = 3; + } + oneof result { + uint32 error_number = 1; + ResultPayload payload = 2; + } +} + +message BindRequest { + uint32 fd = 1; + bytes address = 2; +} + +message BindResponse { + uint32 error_number = 1; +} + +message AcceptRequest { + uint32 fd = 1; + bool peer = 2; + int64 flags = 3; +} + +message AcceptResponse { + message ResultPayload { + uint32 fd = 1; + AddressResponse address = 2; + } + oneof result { + uint32 error_number = 
1; + ResultPayload payload = 2; + } +} + +message ConnectRequest { + uint32 fd = 1; + bytes address = 2; +} + +message ConnectResponse { + uint32 error_number = 1; +} + +message ListenRequest { + uint32 fd = 1; + int64 backlog = 2; +} + +message ListenResponse { + uint32 error_number = 1; +} + +message ShutdownRequest { + uint32 fd = 1; + int64 how = 2; +} + +message ShutdownResponse { + uint32 error_number = 1; +} + +message CloseRequest { + uint32 fd = 1; +} + +message CloseResponse { + uint32 error_number = 1; +} + +message GetSockOptRequest { + uint32 fd = 1; + int64 level = 2; + int64 name = 3; + uint32 length = 4; +} + +message GetSockOptResponse { + oneof result { + uint32 error_number = 1; + bytes opt = 2; + } +} + +message SetSockOptRequest { + uint32 fd = 1; + int64 level = 2; + int64 name = 3; + bytes opt = 4; +} + +message SetSockOptResponse { + uint32 error_number = 1; +} + +message GetSockNameRequest { + uint32 fd = 1; +} + +message GetSockNameResponse { + oneof result { + uint32 error_number = 1; + AddressResponse address = 2; + } +} + +message GetPeerNameRequest { + uint32 fd = 1; +} + +message GetPeerNameResponse { + oneof result { + uint32 error_number = 1; + AddressResponse address = 2; + } +} + +message SocketRequest { + int64 family = 1; + int64 type = 2; + int64 protocol = 3; +} + +message SocketResponse { + oneof result { + uint32 error_number = 1; + uint32 fd = 2; + } +} + +message EpollWaitRequest { + uint32 fd = 1; + uint32 num_events = 2; + sint64 msec = 3; +} + +message EpollEvent { + uint32 fd = 1; + uint32 events = 2; +} + +message EpollEvents { + repeated EpollEvent events = 1; +} + +message EpollWaitResponse { + oneof result { + uint32 error_number = 1; + EpollEvents events = 2; + } +} + +message EpollCtlRequest { + uint32 epfd = 1; + int64 op = 2; + uint32 fd = 3; + EpollEvent event = 4; +} + +message EpollCtlResponse { + uint32 error_number = 1; +} + +message EpollCreate1Request { + int64 flag = 1; +} + +message 
EpollCreate1Response { + oneof result { + uint32 error_number = 1; + uint32 fd = 2; + } +} + +message PollRequest { + uint32 fd = 1; + uint32 events = 2; +} + +message PollResponse { + oneof result { + uint32 error_number = 1; + uint32 events = 2; + } +} + +message SyscallRequest { + oneof args { + SocketRequest socket = 1; + SendmsgRequest sendmsg = 2; + RecvmsgRequest recvmsg = 3; + BindRequest bind = 4; + AcceptRequest accept = 5; + ConnectRequest connect = 6; + ListenRequest listen = 7; + ShutdownRequest shutdown = 8; + CloseRequest close = 9; + GetSockOptRequest get_sock_opt = 10; + SetSockOptRequest set_sock_opt = 11; + GetSockNameRequest get_sock_name = 12; + GetPeerNameRequest get_peer_name = 13; + EpollWaitRequest epoll_wait = 14; + EpollCtlRequest epoll_ctl = 15; + EpollCreate1Request epoll_create1 = 16; + PollRequest poll = 17; + ReadRequest read = 18; + WriteRequest write = 19; + OpenRequest open = 20; + IOCtlRequest ioctl = 21; + WriteFileRequest write_file = 22; + ReadFileRequest read_file = 23; + } +} + +message SyscallResponse { + oneof result { + SocketResponse socket = 1; + SendmsgResponse sendmsg = 2; + RecvmsgResponse recvmsg = 3; + BindResponse bind = 4; + AcceptResponse accept = 5; + ConnectResponse connect = 6; + ListenResponse listen = 7; + ShutdownResponse shutdown = 8; + CloseResponse close = 9; + GetSockOptResponse get_sock_opt = 10; + SetSockOptResponse set_sock_opt = 11; + GetSockNameResponse get_sock_name = 12; + GetPeerNameResponse get_peer_name = 13; + EpollWaitResponse epoll_wait = 14; + EpollCtlResponse epoll_ctl = 15; + EpollCreate1Response epoll_create1 = 16; + PollResponse poll = 17; + ReadResponse read = 18; + WriteResponse write = 19; + OpenResponse open = 20; + IOCtlResponse ioctl = 21; + WriteFileResponse write_file = 22; + ReadFileResponse read_file = 23; + } +} diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go new file mode 100644 index 000000000..be3026bfa --- /dev/null +++ 
b/pkg/sentry/socket/socket.go @@ -0,0 +1,205 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package socket provides the interfaces that need to be provided by socket +// implementations and providers, as well as per family demultiplexing of socket +// creation. +package socket + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// Socket is the interface containing socket syscalls used by the syscall layer +// to redirect them to the appropriate implementation. +type Socket interface { + fs.FileOperations + + // Connect implements the connect(2) linux syscall. + Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error + + // Accept implements the accept4(2) linux syscall. + // Returns fd, real peer address length and error. Real peer address + // length is only set if len(peer) > 0. 
+ Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) + + // Bind implements the bind(2) linux syscall. + Bind(t *kernel.Task, sockaddr []byte) *syserr.Error + + // Listen implements the listen(2) linux syscall. + Listen(t *kernel.Task, backlog int) *syserr.Error + + // Shutdown implements the shutdown(2) linux syscall. + Shutdown(t *kernel.Task, how int) *syserr.Error + + // GetSockOpt implements the getsockopt(2) linux syscall. + GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) + + // SetSockOpt implements the setsockopt(2) linux syscall. + SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error + + // GetSockName implements the getsockname(2) linux syscall. + // + // addrLen is the address length to be returned to the application, not + // necessarily the actual length of the address. + GetSockName(t *kernel.Task) (addr interface{}, addrLen uint32, err *syserr.Error) + + // GetPeerName implements the getpeername(2) linux syscall. + // + // addrLen is the address length to be returned to the application, not + // necessarily the actual length of the address. + GetPeerName(t *kernel.Task) (addr interface{}, addrLen uint32, err *syserr.Error) + + // RecvMsg implements the recvmsg(2) linux syscall. + // + // senderAddrLen is the address length to be returned to the application, + // not necessarily the actual length of the address. + RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) + + // SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take + // ownership of the ControlMessage on error. 
+ SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (n int, err *syserr.Error) + + // SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means + // no timeout. + SetRecvTimeout(nanoseconds int64) + + // RecvTimeout gets the current timeout (in ns) for recv operations. Zero + // means no timeout. + RecvTimeout() int64 +} + +// Provider is the interface implemented by providers of sockets for specific +// address families (e.g., AF_INET). +type Provider interface { + // Socket creates a new socket. + // + // If a nil Socket _and_ a nil error is returned, it means that the + // protocol is not supported. A non-nil error should only be returned + // if the protocol is supported, but an error occurs during creation. + Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) + + // Pair creates a pair of connected sockets. + // + // See Socket for error information. + Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) +} + +// families holds a map of all known address families and their providers. +var families = make(map[int][]Provider) + +// RegisterProvider registers the provider of a given address family so that +// sockets of that type can be created via socket() and/or socketpair() +// syscalls. +func RegisterProvider(family int, provider Provider) { + families[family] = append(families[family], provider) +} + +// New creates a new socket with the given family, type and protocol. +func New(t *kernel.Task, family int, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) { + for _, p := range families[family] { + s, err := p.Socket(t, stype, protocol) + if err != nil { + return nil, err + } + if s != nil { + return s, nil + } + } + + return nil, syserr.ErrAddressFamilyNotSupported +} + +// Pair creates a new connected socket pair with the given family, type and +// protocol. 
+func Pair(t *kernel.Task, family int, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + providers, ok := families[family] + if !ok { + return nil, nil, syserr.ErrAddressFamilyNotSupported + } + + for _, p := range providers { + s, t, err := p.Pair(t, stype, protocol) + if err != nil { + return nil, nil, err + } + if s != nil && t != nil { + return s, t, nil + } + } + + return nil, nil, syserr.ErrSocketNotSupported +} + +// NewDirent returns a sockfs fs.Dirent that resides on device d. +func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { + ino := d.NextIno() + // There is no real filesystem backing this pipe, so we pass in a nil + // Filesystem. + inode := fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ + FSType: linux.SOCKFS_MAGIC, + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.Socket, + DeviceID: d.DeviceID(), + InodeID: ino, + BlockSize: usermem.PageSize, + }) + + // Dirent name matches net/socket.c:sockfs_dname. + return fs.NewDirent(inode, fmt.Sprintf("socket:[%d]", ino)) +} + +// ReceiveTimeout stores a timeout for receive calls. +// +// It is meant to be embedded into Socket implementations to help satisfy the +// interface. +// +// Care must be taken when copying ReceiveTimeout as it contains atomic +// variables. +type ReceiveTimeout struct { + // ns is length of the timeout in nanoseconds. + // + // ns must be accessed atomically. + ns int64 +} + +// SetRecvTimeout implements Socket.SetRecvTimeout. +func (rt *ReceiveTimeout) SetRecvTimeout(nanoseconds int64) { + atomic.StoreInt64(&rt.ns, nanoseconds) +} + +// RecvTimeout implements Socket.RecvTimeout. 
+func (rt *ReceiveTimeout) RecvTimeout() int64 { + return atomic.LoadInt64(&rt.ns) +} diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD new file mode 100644 index 000000000..1ec6eb7ed --- /dev/null +++ b/pkg/sentry/socket/unix/BUILD @@ -0,0 +1,48 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "unix_state", + srcs = [ + "unix.go", + ], + out = "unix_state.go", + package = "unix", +) + +go_library( + name = "unix", + srcs = [ + "device.go", + "io.go", + "unix.go", + "unix_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/safemem", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/epsocket", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/unix/device.go b/pkg/sentry/socket/unix/device.go new file mode 100644 index 000000000..e8bcc7a9f --- /dev/null +++ b/pkg/sentry/socket/unix/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package unix + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// unixSocketDevice is the unix socket virtual device. +var unixSocketDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go new file mode 100644 index 000000000..0ca2e35d0 --- /dev/null +++ b/pkg/sentry/socket/unix/io.go @@ -0,0 +1,88 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package unix + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// EndpointWriter implements safemem.Writer that writes to a unix.Endpoint. +// +// EndpointWriter is not thread-safe. +type EndpointWriter struct { + // Endpoint is the unix.Endpoint to write to. + Endpoint unix.Endpoint + + // Control is the control messages to send. + Control unix.ControlMessages + + // To is the endpoint to send to. May be nil. + To unix.BoundEndpoint +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. 
+func (w *EndpointWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + return safemem.FromVecWriterFunc{func(bufs [][]byte) (int64, error) { + n, err := w.Endpoint.SendMsg(bufs, w.Control, w.To) + if err != nil { + return int64(n), syserr.TranslateNetstackError(err).ToError() + } + return int64(n), nil + }}.WriteFromBlocks(srcs) +} + +// EndpointReader implements safemem.Reader that reads from a unix.Endpoint. +// +// EndpointReader is not thread-safe. +type EndpointReader struct { + // Endpoint is the unix.Endpoint to read from. + Endpoint unix.Endpoint + + // Creds indicates if credential control messages are requested. + Creds bool + + // NumRights is the number of SCM_RIGHTS FDs requested. + NumRights uintptr + + // Peek indicates that the data should not be consumed from the + // endpoint. + Peek bool + + // MsgSize is the size of the message that was read from. For stream + // sockets, it is the amount read. + MsgSize uintptr + + // From, if not nil, will be set with the address read from. + From *tcpip.FullAddress + + // Control contains the received control messages. + Control unix.ControlMessages +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) { + n, ms, c, err := r.Endpoint.RecvMsg(bufs, r.Creds, r.NumRights, r.Peek, r.From) + r.Control = c + r.MsgSize = ms + if err != nil { + return int64(n), syserr.TranslateNetstackError(err).ToError() + } + return int64(n), nil + }}.ReadToBlocks(dsts) +} diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go new file mode 100644 index 000000000..a4b414851 --- /dev/null +++ b/pkg/sentry/socket/unix/unix.go @@ -0,0 +1,571 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package unix provides an implementation of the socket.Socket interface for +// the AF_UNIX protocol family. +package unix + +import ( + "strings" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// SocketOperations is a Unix socket. It is similar to an epsocket, except it is backed +// by a unix.Endpoint instead of a tcpip.Endpoint. 
+type SocketOperations struct { + refs.AtomicRefCount + socket.ReceiveTimeout + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + ep unix.Endpoint +} + +// New creates a new unix socket. +func New(ctx context.Context, endpoint unix.Endpoint) *fs.File { + dirent := socket.NewDirent(ctx, unixSocketDevice) + return NewWithDirent(ctx, dirent, endpoint, fs.FileFlags{Read: true, Write: true}) +} + +// NewWithDirent creates a new unix socket using an existing dirent. +func NewWithDirent(ctx context.Context, d *fs.Dirent, ep unix.Endpoint, flags fs.FileFlags) *fs.File { + return fs.NewFile(ctx, d, flags, &SocketOperations{ + ep: ep, + }) +} + +// DecRef implements RefCounter.DecRef. +func (s *SocketOperations) DecRef() { + s.DecRefWithDestructor(func() { + s.ep.Close() + }) +} + +// Release implements fs.FileOperations.Release. +func (s *SocketOperations) Release() { + // Release only decrements a reference on s because s may be referenced in + // the abstract socket namespace. + s.DecRef() +} + +// Endpoint extracts the unix.Endpoint. +func (s *SocketOperations) Endpoint() unix.Endpoint { + return s.ep +} + +// extractPath extracts and validates the address. +func extractPath(sockaddr []byte) (string, *syserr.Error) { + addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr) + if err != nil { + return "", err + } + + // The address is trimmed by GetAddress. + p := string(addr.Addr) + if p == "" { + // Not allowed. + return "", syserr.ErrInvalidArgument + } + if p[len(p)-1] == '/' { + // Weird, they tried to bind '/a/b/c/'? + return "", syserr.ErrIsDir + } + + return p, nil +} + +// GetPeerName implements the linux syscall getpeername(2) for sockets backed by +// a unix.Endpoint. 
+func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + addr, err := s.ep.GetRemoteAddress() + if err != nil { + return nil, 0, syserr.TranslateNetstackError(err) + } + + a, l := epsocket.ConvertAddress(linux.AF_UNIX, addr) + return a, l, nil +} + +// GetSockName implements the linux syscall getsockname(2) for sockets backed by +// a unix.Endpoint. +func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + addr, err := s.ep.GetLocalAddress() + if err != nil { + return nil, 0, syserr.TranslateNetstackError(err) + } + + a, l := epsocket.ConvertAddress(linux.AF_UNIX, addr) + return a, l, nil +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return epsocket.Ioctl(ctx, s.ep, io, args) +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// a unix.Endpoint. +func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) { + return epsocket.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) +} + +// Listen implements the linux syscall listen(2) for sockets backed by +// a unix.Endpoint. +func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { + return syserr.TranslateNetstackError(s.ep.Listen(backlog)) +} + +// blockingAccept implements a blocking version of accept(2), that is, if no +// connections are ready to be accept, it will block until one becomes ready. +func (s *SocketOperations) blockingAccept(t *kernel.Task) (unix.Endpoint, *syserr.Error) { + // Register for notifications. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + // Try to accept the connection; if it fails, then wait until we get a + // notification. 
+ for { + if ep, err := s.ep.Accept(); err != tcpip.ErrWouldBlock { + return ep, syserr.TranslateNetstackError(err) + } + + if err := t.Block(ch); err != nil { + return nil, syserr.FromError(err) + } + } +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// a unix.Endpoint. +func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, err := s.ep.Accept() + if err != nil { + if err != tcpip.ErrWouldBlock || !blocking { + return 0, nil, 0, syserr.TranslateNetstackError(err) + } + + var err *syserr.Error + ep, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns := New(t, ep) + defer ns.DecRef() + + if flags&linux.SOCK_NONBLOCK != 0 { + flags := ns.Flags() + flags.NonBlocking = true + ns.SetFlags(flags.Settable()) + } + + var addr interface{} + var addrLen uint32 + if peerRequested { + // Get address of the peer. + var err *syserr.Error + addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fdFlags := kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + } + fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + if e != nil { + return 0, nil, 0, syserr.FromError(e) + } + + return fd, addr, addrLen, nil +} + +// Bind implements the linux syscall bind(2) for unix sockets. +func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + p, e := extractPath(sockaddr) + if e != nil { + return e + } + + bep, ok := s.ep.(unix.BoundEndpoint) + if !ok { + // This socket can't be bound. + return syserr.ErrInvalidArgument + } + + return syserr.TranslateNetstackError(s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *tcpip.Error { + // Is it abstract? 
+ if p[0] == 0 { + if t.IsNetworkNamespaced() { + return tcpip.ErrInvalidEndpointState + } + if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + // tcpip.ErrPortInUse corresponds to EADDRINUSE. + return tcpip.ErrPortInUse + } + } else { + // The parent and name. + var d *fs.Dirent + var name string + + cwd := t.FSContext().WorkingDirectory() + defer cwd.DecRef() + + // Is there no slash at all? + if !strings.Contains(p, "/") { + d = cwd + name = p + } else { + root := t.FSContext().RootDirectory() + defer root.DecRef() + // Find the last path component, we know that something follows + // that final slash, otherwise extractPath() would have failed. + lastSlash := strings.LastIndex(p, "/") + subPath := p[:lastSlash] + if subPath == "" { + // Fix up subpath in case file is in root. + subPath = "/" + } + var err error + d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, fs.DefaultTraversalLimit) + if err != nil { + // No path available. + return tcpip.ErrNoSuchFile + } + defer d.DecRef() + name = p[lastSlash+1:] + } + + // Create the socket. + if err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}); err != nil { + return tcpip.ErrPortInUse + } + } + + return nil + })) +} + +// extractEndpoint retrieves the unix.BoundEndpoint associated with a Unix +// socket path. The Release must be called on the unix.BoundEndpoint when the +// caller is done with it. +func extractEndpoint(t *kernel.Task, sockaddr []byte) (unix.BoundEndpoint, *syserr.Error) { + path, err := extractPath(sockaddr) + if err != nil { + return nil, err + } + + // Is it abstract? + if path[0] == 0 { + if t.IsNetworkNamespaced() { + return nil, syserr.ErrInvalidArgument + } + + ep := t.AbstractSockets().BoundEndpoint(path[1:]) + if ep == nil { + // No socket found. + return nil, syserr.ErrConnectionRefused + } + + return ep, nil + } + + // Find the node in the filesystem. 
+ root := t.FSContext().RootDirectory() + cwd := t.FSContext().WorkingDirectory() + d, e := t.MountNamespace().FindInode(t, root, cwd, path, fs.DefaultTraversalLimit) + cwd.DecRef() + root.DecRef() + if e != nil { + return nil, syserr.FromError(e) + } + + // Extract the endpoint if one is there. + ep := d.Inode.BoundEndpoint(path) + d.DecRef() + if ep == nil { + // No socket! + return nil, syserr.ErrConnectionRefused + } + + return ep, nil +} + +// Connect implements the linux syscall connect(2) for unix sockets. +func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + ep, err := extractEndpoint(t, sockaddr) + if err != nil { + return err + } + defer ep.Release() + + // Connect the server endpoint. + return syserr.TranslateNetstackError(s.ep.Connect(ep)) +} + +// Write implements fs.FileOperations.Write. +func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + t := kernel.TaskFromContext(ctx) + ctrl := control.New(t, s.ep, nil) + + if src.NumBytes() == 0 { + nInt, tcpipError := s.ep.SendMsg([][]byte{}, ctrl, nil) + return int64(nInt), syserr.TranslateNetstackError(tcpipError).ToError() + } + + return src.CopyInTo(ctx, &EndpointWriter{ + Endpoint: s.ep, + Control: ctrl, + To: nil, + }) +} + +// SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by +// a unix.Endpoint. +func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { + w := EndpointWriter{ + Endpoint: s.ep, + Control: controlMessages, + To: nil, + } + if len(to) > 0 { + ep, err := extractEndpoint(t, to) + if err != nil { + return 0, err + } + defer ep.Release() + w.To = ep + } + + if n, err := src.CopyInTo(t, &w); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + return int(n), syserr.FromError(err) + } + + // We'll have to block. 
Register for notification and keep trying to + // send all the data. + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + + for { + if n, err := src.CopyInTo(t, &w); err != syserror.ErrWouldBlock { + return int(n), syserr.FromError(err) + } + + if err := t.Block(ch); err != nil { + return 0, syserr.FromError(err) + } + } +} + +// Passcred implements unix.Credentialer.Passcred. +func (s *SocketOperations) Passcred() bool { + return s.ep.Passcred() +} + +// ConnectedPasscred implements unix.Credentialer.ConnectedPasscred. +func (s *SocketOperations) ConnectedPasscred() bool { + return s.ep.ConnectedPasscred() +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.ep.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.ep.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketOperations) EventUnregister(e *waiter.Entry) { + s.ep.EventUnregister(e) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// a unix.Endpoint. +func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + return epsocket.SetSockOpt(t, s, s.ep, level, name, optVal) +} + +// Shutdown implements the linux syscall shutdown(2) for sockets backed by +// a unix.Endpoint. +func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { + f, err := epsocket.ConvertShutdown(how) + if err != nil { + return err + } + + // Issue shutdown request. + return syserr.TranslateNetstackError(s.ep.Shutdown(f)) +} + +// Read implements fs.FileOperations.Read. 
+func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &EndpointReader{ + Endpoint: s.ep, + NumRights: 0, + Peek: false, + From: nil, + }) +} + +// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by +// a unix.Endpoint. +func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) { + trunc := flags&linux.MSG_TRUNC != 0 + peek := flags&linux.MSG_PEEK != 0 + + // Calculate the number of FDs for which we have space and if we are + // requesting credentials. + var wantCreds bool + rightsLen := int(controlDataLen) - syscall.SizeofCmsghdr + if s.Passcred() { + // Credentials take priority if they are enabled and there is space. + wantCreds = rightsLen > 0 + credLen := syscall.CmsgSpace(syscall.SizeofUcred) + rightsLen -= credLen + } + // FDs are 32 bit (4 byte) ints. + numRights := rightsLen / 4 + if numRights < 0 { + numRights = 0 + } + + r := EndpointReader{ + Endpoint: s.ep, + Creds: wantCreds, + NumRights: uintptr(numRights), + Peek: peek, + } + if senderRequested { + r.From = &tcpip.FullAddress{} + } + if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + var from interface{} + var fromLen uint32 + if r.From != nil { + from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) + } + if trunc { + n = int64(r.MsgSize) + } + return int(n), from, fromLen, r.Control, syserr.FromError(err) + } + + // We'll have to block. Register for notification and keep trying to + // receive all the data. 
+ e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + + for { + if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock { + var from interface{} + var fromLen uint32 + if r.From != nil { + from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) + } + if trunc { + n = int64(r.MsgSize) + } + return int(n), from, fromLen, r.Control, syserr.FromError(err) + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + } + return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + } + } +} + +// provider is a unix domain socket provider. +type provider struct{} + +// Socket returns a new unix domain socket. +func (*provider) Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) { + // Check arguments. + if protocol != 0 { + return nil, syserr.ErrInvalidArgument + } + + // Create the endpoint and socket. + var ep unix.Endpoint + switch stype { + case linux.SOCK_DGRAM: + ep = unix.NewConnectionless() + case linux.SOCK_STREAM, linux.SOCK_SEQPACKET: + ep = unix.NewConnectioned(stype, t.Kernel()) + default: + return nil, syserr.ErrInvalidArgument + } + + return New(t, ep), nil +} + +// Pair creates a new pair of AF_UNIX connected sockets. +func (*provider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + // Check arguments. + if protocol != 0 { + return nil, nil, syserr.ErrInvalidArgument + } + + switch stype { + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: + default: + return nil, nil, syserr.ErrInvalidArgument + } + + // Create the endpoints and sockets. 
+ ep1, ep2 := unix.NewPair(stype, t.Kernel()) + s1 := New(t, ep1) + s2 := New(t, ep2) + + return s1, s2, nil +} + +func init() { + socket.RegisterProvider(linux.AF_UNIX, &provider{}) +} diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD new file mode 100644 index 000000000..7148df395 --- /dev/null +++ b/pkg/sentry/state/BUILD @@ -0,0 +1,21 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "state", + srcs = [ + "state.go", + "state_metadata.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/state", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/platform", + "//pkg/sentry/watchdog", + "//pkg/state/statefile", + ], +) diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go new file mode 100644 index 000000000..5bec4e018 --- /dev/null +++ b/pkg/sentry/state/state.go @@ -0,0 +1,113 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package state provides high-level state wrappers. 
+package state + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/state/statefile" +) + +// ErrStateFile is returned when the state file cannot be opened. +type ErrStateFile struct { + err error +} + +// Error implements error.Error(). +func (e ErrStateFile) Error() string { + return fmt.Sprintf("failed to open statefile: %v", e.err) +} + +// SaveOpts contains save-related options. +type SaveOpts struct { + // Destination is the save target. + Destination io.Writer + + // Key is used for state integrity check. + Key []byte + + // Metadata is save metadata. + Metadata map[string]string + + // CompressionLevel is the compression level to use. + // + // See statefile.NewWriter for details. + CompressionLevel int + + // Callback is called prior to unpause, with any save error. + Callback func(err error) +} + +// Save saves the system state. +func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { + log.Infof("Sandbox save started, pausing all tasks.") + k.Pause() + defer k.Unpause() + defer log.Infof("Tasks resumed after save.") + + w.Stop() + defer w.Start() + + // Supplement the metadata. + if opts.Metadata == nil { + opts.Metadata = make(map[string]string) + } + addSaveMetadata(opts.Metadata) + + // Open the statefile. + wc, err := statefile.NewWriter(opts.Destination, opts.Key, opts.Metadata, opts.CompressionLevel) + if err != nil { + err = ErrStateFile{err} + } else { + // Save the kernel. + err = k.SaveTo(wc) + if closeErr := wc.Close(); err == nil && closeErr != nil { + err = closeErr + } + if err != nil { + err = ErrStateFile{err} + } + } + opts.Callback(err) + return err +} + +// LoadOpts contains load-related options. 
+type LoadOpts struct { + // Source is the load source. + Source io.Reader + + // Key is used for state integrity check. + Key []byte +} + +// Load loads the given kernel, setting the provided platform and stack. +func (opts LoadOpts) Load(k *kernel.Kernel, p platform.Platform, n inet.Stack) error { + // Open the file. + r, _, err := statefile.NewReader(opts.Source, opts.Key) + if err != nil { + return ErrStateFile{err} + } + + // Restore the Kernel object graph. + return k.LoadFrom(r, p, n) +} diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go new file mode 100644 index 000000000..ac374f428 --- /dev/null +++ b/pkg/sentry/state/state_metadata.go @@ -0,0 +1,29 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package state + +import ( + "fmt" + "time" +) + +// The save metadata keys for timestamp. 
+const ( + metadataTimestamp = "timestamp" +) + +func addSaveMetadata(m map[string]string) { + m[metadataTimestamp] = fmt.Sprintf("%v", time.Now()) +} diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD new file mode 100644 index 000000000..c5946a564 --- /dev/null +++ b/pkg/sentry/strace/BUILD @@ -0,0 +1,48 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "strace", + srcs = [ + "clone.go", + "futex.go", + "linux64.go", + "open.go", + "ptrace.go", + "socket.go", + "strace.go", + "syscalls.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/strace", + visibility = ["//:sandbox"], + deps = [ + ":strace_go_proto", + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/bits", + "//pkg/eventchannel", + "//pkg/sentry/arch", + "//pkg/sentry/kernel", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/epsocket", + "//pkg/sentry/socket/netlink", + "//pkg/sentry/syscalls/linux", + "//pkg/sentry/usermem", + ], +) + +proto_library( + name = "strace_proto", + srcs = ["strace.proto"], + visibility = ["//visibility:public"], +) + +go_proto_library( + name = "strace_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/strace/strace_go_proto", + proto = ":strace_proto", + visibility = ["//visibility:public"], +) diff --git a/pkg/sentry/strace/clone.go b/pkg/sentry/strace/clone.go new file mode 100644 index 000000000..b82ca1ad1 --- /dev/null +++ b/pkg/sentry/strace/clone.go @@ -0,0 +1,113 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi" +) + +// CloneFlagSet is the set of clone(2) flags. +var CloneFlagSet = abi.FlagSet{ + { + Flag: syscall.CLONE_VM, + Name: "CLONE_VM", + }, + { + Flag: syscall.CLONE_FS, + Name: "CLONE_FS", + }, + { + Flag: syscall.CLONE_FILES, + Name: "CLONE_FILES", + }, + { + Flag: syscall.CLONE_SIGHAND, + Name: "CLONE_SIGHAND", + }, + { + Flag: syscall.CLONE_PTRACE, + Name: "CLONE_PTRACE", + }, + { + Flag: syscall.CLONE_VFORK, + Name: "CLONE_VFORK", + }, + { + Flag: syscall.CLONE_PARENT, + Name: "CLONE_PARENT", + }, + { + Flag: syscall.CLONE_THREAD, + Name: "CLONE_THREAD", + }, + { + Flag: syscall.CLONE_NEWNS, + Name: "CLONE_NEWNS", + }, + { + Flag: syscall.CLONE_SYSVSEM, + Name: "CLONE_SYSVSEM", + }, + { + Flag: syscall.CLONE_SETTLS, + Name: "CLONE_SETTLS", + }, + { + Flag: syscall.CLONE_PARENT_SETTID, + Name: "CLONE_PARENT_SETTID", + }, + { + Flag: syscall.CLONE_CHILD_CLEARTID, + Name: "CLONE_CHILD_CLEARTID", + }, + { + Flag: syscall.CLONE_DETACHED, + Name: "CLONE_DETACHED", + }, + { + Flag: syscall.CLONE_UNTRACED, + Name: "CLONE_UNTRACED", + }, + { + Flag: syscall.CLONE_CHILD_SETTID, + Name: "CLONE_CHILD_SETTID", + }, + { + Flag: syscall.CLONE_NEWUTS, + Name: "CLONE_NEWUTS", + }, + { + Flag: syscall.CLONE_NEWIPC, + Name: "CLONE_NEWIPC", + }, + { + Flag: syscall.CLONE_NEWUSER, + Name: "CLONE_NEWUSER", + }, + { + Flag: syscall.CLONE_NEWPID, + Name: "CLONE_NEWPID", + }, + { + Flag: syscall.CLONE_NEWNET, + Name: "CLONE_NEWNET", + }, + { + Flag: syscall.CLONE_IO, + Name: 
"CLONE_IO", + }, +} diff --git a/pkg/sentry/strace/futex.go b/pkg/sentry/strace/futex.go new file mode 100644 index 000000000..3da108cb7 --- /dev/null +++ b/pkg/sentry/strace/futex.go @@ -0,0 +1,91 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// FutexCmd are the possible futex(2) commands. +var FutexCmd = abi.ValueSet{ + { + Value: linux.FUTEX_WAIT, + Name: "FUTEX_WAIT", + }, + { + Value: linux.FUTEX_WAKE, + Name: "FUTEX_WAKE", + }, + { + Value: linux.FUTEX_FD, + Name: "FUTEX_FD", + }, + { + Value: linux.FUTEX_REQUEUE, + Name: "FUTEX_REQUEUE", + }, + { + Value: linux.FUTEX_CMP_REQUEUE, + Name: "FUTEX_CMP_REQUEUE", + }, + { + Value: linux.FUTEX_WAKE_OP, + Name: "FUTEX_WAKE_OP", + }, + { + Value: linux.FUTEX_LOCK_PI, + Name: "FUTEX_LOCK_PI", + }, + { + Value: linux.FUTEX_UNLOCK_PI, + Name: "FUTEX_UNLOCK_PI", + }, + { + Value: linux.FUTEX_TRYLOCK_PI, + Name: "FUTEX_TRYLOCK_PI", + }, + { + Value: linux.FUTEX_WAIT_BITSET, + Name: "FUTEX_WAIT_BITSET", + }, + { + Value: linux.FUTEX_WAKE_BITSET, + Name: "FUTEX_WAKE_BITSET", + }, + { + Value: linux.FUTEX_WAIT_REQUEUE_PI, + Name: "FUTEX_WAIT_REQUEUE_PI", + }, + { + Value: linux.FUTEX_CMP_REQUEUE_PI, + Name: "FUTEX_CMP_REQUEUE_PI", + }, +} + +func futex(op uint64) string { + cmd := op &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) + 
clockRealtime := (op & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME + private := (op & linux.FUTEX_PRIVATE_FLAG) == linux.FUTEX_PRIVATE_FLAG + + s := FutexCmd.Parse(cmd) + if clockRealtime { + s += "|FUTEX_CLOCK_REALTIME" + } + if private { + s += "|FUTEX_PRIVATE_FLAG" + } + return s +} diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go new file mode 100644 index 000000000..90ea8c36f --- /dev/null +++ b/pkg/sentry/strace/linux64.go @@ -0,0 +1,338 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +// linuxAMD64 provides a mapping of the Linux amd64 syscalls and their argument +// types for display / formatting. 
// linuxAMD64 is the strace table for linux/amd64: it maps each syscall
// number to its name plus, per argument, the formatter used to render
// that argument (Hex, Path, ReadBuffer, ...). Entries with no formatter
// take no decoded arguments; commented-out numbers are syscalls that do
// not exist in modern Linux (see the inline notes).
var linuxAMD64 = SyscallMap{
	0:   makeSyscallInfo("read", Hex, ReadBuffer, Hex),
	1:   makeSyscallInfo("write", Hex, WriteBuffer, Hex),
	2:   makeSyscallInfo("open", Path, OpenFlags, Mode),
	3:   makeSyscallInfo("close", Hex),
	4:   makeSyscallInfo("stat", Path, Stat),
	5:   makeSyscallInfo("fstat", Hex, Stat),
	6:   makeSyscallInfo("lstat", Path, Stat),
	7:   makeSyscallInfo("poll", Hex, Hex, Hex),
	8:   makeSyscallInfo("lseek", Hex, Hex, Hex),
	9:   makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, Hex, Hex),
	10:  makeSyscallInfo("mprotect", Hex, Hex, Hex),
	11:  makeSyscallInfo("munmap", Hex, Hex),
	12:  makeSyscallInfo("brk", Hex),
	13:  makeSyscallInfo("rt_sigaction", Hex, Hex, Hex),
	14:  makeSyscallInfo("rt_sigprocmask", Hex, Hex, Hex, Hex),
	15:  makeSyscallInfo("rt_sigreturn"),
	16:  makeSyscallInfo("ioctl", Hex, Hex, Hex),
	17:  makeSyscallInfo("pread64", Hex, ReadBuffer, Hex, Hex),
	18:  makeSyscallInfo("pwrite64", Hex, WriteBuffer, Hex, Hex),
	19:  makeSyscallInfo("readv", Hex, ReadIOVec, Hex),
	20:  makeSyscallInfo("writev", Hex, WriteIOVec, Hex),
	21:  makeSyscallInfo("access", Path, Oct),
	22:  makeSyscallInfo("pipe", PipeFDs),
	23:  makeSyscallInfo("select", Hex, Hex, Hex, Hex, Timeval),
	24:  makeSyscallInfo("sched_yield"),
	25:  makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex),
	26:  makeSyscallInfo("msync", Hex, Hex, Hex),
	27:  makeSyscallInfo("mincore", Hex, Hex, Hex),
	28:  makeSyscallInfo("madvise", Hex, Hex, Hex),
	29:  makeSyscallInfo("shmget", Hex, Hex, Hex),
	30:  makeSyscallInfo("shmat", Hex, Hex, Hex),
	31:  makeSyscallInfo("shmctl", Hex, Hex, Hex),
	32:  makeSyscallInfo("dup", Hex),
	33:  makeSyscallInfo("dup2", Hex, Hex),
	34:  makeSyscallInfo("pause"),
	35:  makeSyscallInfo("nanosleep", Timespec, PostTimespec),
	36:  makeSyscallInfo("getitimer", Hex, PostItimerVal),
	37:  makeSyscallInfo("alarm", Hex),
	38:  makeSyscallInfo("setitimer", Hex, ItimerVal, PostItimerVal),
	39:  makeSyscallInfo("getpid"),
	40:  makeSyscallInfo("sendfile", Hex, Hex, Hex, Hex),
	41:  makeSyscallInfo("socket", SockFamily, SockType, SockProtocol),
	42:  makeSyscallInfo("connect", Hex, SockAddr, Hex),
	43:  makeSyscallInfo("accept", Hex, PostSockAddr, SockLen),
	44:  makeSyscallInfo("sendto", Hex, Hex, Hex, Hex, SockAddr, Hex),
	45:  makeSyscallInfo("recvfrom", Hex, Hex, Hex, Hex, PostSockAddr, SockLen),
	46:  makeSyscallInfo("sendmsg", Hex, SendMsgHdr, Hex),
	47:  makeSyscallInfo("recvmsg", Hex, RecvMsgHdr, Hex),
	48:  makeSyscallInfo("shutdown", Hex, Hex),
	49:  makeSyscallInfo("bind", Hex, SockAddr, Hex),
	50:  makeSyscallInfo("listen", Hex, Hex),
	51:  makeSyscallInfo("getsockname", Hex, PostSockAddr, SockLen),
	52:  makeSyscallInfo("getpeername", Hex, PostSockAddr, SockLen),
	53:  makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex),
	54:  makeSyscallInfo("setsockopt", Hex, Hex, Hex, Hex, Hex),
	55:  makeSyscallInfo("getsockopt", Hex, Hex, Hex, Hex, Hex),
	56:  makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex),
	57:  makeSyscallInfo("fork"),
	58:  makeSyscallInfo("vfork"),
	59:  makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector),
	60:  makeSyscallInfo("exit", Hex),
	61:  makeSyscallInfo("wait4", Hex, Hex, Hex, Rusage),
	62:  makeSyscallInfo("kill", Hex, Hex),
	63:  makeSyscallInfo("uname", Uname),
	64:  makeSyscallInfo("semget", Hex, Hex, Hex),
	65:  makeSyscallInfo("semop", Hex, Hex, Hex),
	66:  makeSyscallInfo("semctl", Hex, Hex, Hex, Hex),
	67:  makeSyscallInfo("shmdt", Hex),
	68:  makeSyscallInfo("msgget", Hex, Hex),
	69:  makeSyscallInfo("msgsnd", Hex, Hex, Hex, Hex),
	70:  makeSyscallInfo("msgrcv", Hex, Hex, Hex, Hex, Hex),
	71:  makeSyscallInfo("msgctl", Hex, Hex, Hex),
	72:  makeSyscallInfo("fcntl", Hex, Hex, Hex),
	73:  makeSyscallInfo("flock", Hex, Hex),
	74:  makeSyscallInfo("fsync", Hex),
	75:  makeSyscallInfo("fdatasync", Hex),
	76:  makeSyscallInfo("truncate", Path, Hex),
	77:  makeSyscallInfo("ftruncate", Hex, Hex),
	78:  makeSyscallInfo("getdents", Hex, Hex, Hex),
	79:  makeSyscallInfo("getcwd", Hex, Hex),
	80:  makeSyscallInfo("chdir", Path),
	81:  makeSyscallInfo("fchdir", Hex),
	82:  makeSyscallInfo("rename", Path, Path),
	83:  makeSyscallInfo("mkdir", Path, Oct),
	84:  makeSyscallInfo("rmdir", Path),
	85:  makeSyscallInfo("creat", Path, Oct),
	86:  makeSyscallInfo("link", Path, Path),
	87:  makeSyscallInfo("unlink", Path),
	88:  makeSyscallInfo("symlink", Path, Path),
	89:  makeSyscallInfo("readlink", Path, ReadBuffer, Hex),
	90:  makeSyscallInfo("chmod", Path, Mode),
	91:  makeSyscallInfo("fchmod", Hex, Mode),
	92:  makeSyscallInfo("chown", Path, Hex, Hex),
	93:  makeSyscallInfo("fchown", Hex, Hex, Hex),
	94:  makeSyscallInfo("lchown", Hex, Hex, Hex),
	95:  makeSyscallInfo("umask", Hex),
	96:  makeSyscallInfo("gettimeofday", Timeval, Hex),
	97:  makeSyscallInfo("getrlimit", Hex, Hex),
	98:  makeSyscallInfo("getrusage", Hex, Rusage),
	99:  makeSyscallInfo("sysinfo", Hex),
	100: makeSyscallInfo("times", Hex),
	101: makeSyscallInfo("ptrace", PtraceRequest, Hex, Hex, Hex),
	102: makeSyscallInfo("getuid"),
	103: makeSyscallInfo("syslog", Hex, Hex, Hex),
	104: makeSyscallInfo("getgid"),
	105: makeSyscallInfo("setuid", Hex),
	106: makeSyscallInfo("setgid", Hex),
	107: makeSyscallInfo("geteuid"),
	108: makeSyscallInfo("getegid"),
	109: makeSyscallInfo("setpgid", Hex, Hex),
	110: makeSyscallInfo("getppid"),
	111: makeSyscallInfo("getpgrp"),
	112: makeSyscallInfo("setsid"),
	113: makeSyscallInfo("setreuid", Hex, Hex),
	114: makeSyscallInfo("setregid", Hex, Hex),
	115: makeSyscallInfo("getgroups", Hex, Hex),
	116: makeSyscallInfo("setgroups", Hex, Hex),
	117: makeSyscallInfo("setresuid", Hex, Hex, Hex),
	118: makeSyscallInfo("getresuid", Hex, Hex, Hex),
	119: makeSyscallInfo("setresgid", Hex, Hex, Hex),
	120: makeSyscallInfo("getresgid", Hex, Hex, Hex),
	121: makeSyscallInfo("getpgid", Hex),
	122: makeSyscallInfo("setfsuid", Hex),
	123: makeSyscallInfo("setfsgid", Hex),
	124: makeSyscallInfo("getsid", Hex),
	125: makeSyscallInfo("capget", Hex, Hex),
	126: makeSyscallInfo("capset", Hex, Hex),
	127: makeSyscallInfo("rt_sigpending", Hex),
	128: makeSyscallInfo("rt_sigtimedwait", Hex, Hex, Timespec, Hex),
	129: makeSyscallInfo("rt_sigqueueinfo", Hex, Hex, Hex),
	130: makeSyscallInfo("rt_sigsuspend", Hex),
	131: makeSyscallInfo("sigaltstack", Hex, Hex),
	132: makeSyscallInfo("utime", Path, Utimbuf),
	133: makeSyscallInfo("mknod", Path, Mode, Hex),
	134: makeSyscallInfo("uselib", Hex),
	135: makeSyscallInfo("personality", Hex),
	136: makeSyscallInfo("ustat", Hex, Hex),
	137: makeSyscallInfo("statfs", Path, Hex),
	138: makeSyscallInfo("fstatfs", Hex, Hex),
	139: makeSyscallInfo("sysfs", Hex, Hex, Hex),
	140: makeSyscallInfo("getpriority", Hex, Hex),
	141: makeSyscallInfo("setpriority", Hex, Hex, Hex),
	142: makeSyscallInfo("sched_setparam", Hex, Hex),
	143: makeSyscallInfo("sched_getparam", Hex, Hex),
	144: makeSyscallInfo("sched_setscheduler", Hex, Hex, Hex),
	145: makeSyscallInfo("sched_getscheduler", Hex),
	146: makeSyscallInfo("sched_get_priority_max", Hex),
	147: makeSyscallInfo("sched_get_priority_min", Hex),
	148: makeSyscallInfo("sched_rr_get_interval", Hex, Hex),
	149: makeSyscallInfo("mlock", Hex, Hex),
	150: makeSyscallInfo("munlock", Hex, Hex),
	151: makeSyscallInfo("mlockall", Hex),
	152: makeSyscallInfo("munlockall"),
	153: makeSyscallInfo("vhangup"),
	154: makeSyscallInfo("modify_ldt", Hex, Hex, Hex),
	155: makeSyscallInfo("pivot_root", Hex, Hex),
	156: makeSyscallInfo("_sysctl", Hex),
	157: makeSyscallInfo("prctl", Hex, Hex, Hex, Hex, Hex),
	158: makeSyscallInfo("arch_prctl", Hex, Hex),
	159: makeSyscallInfo("adjtimex", Hex),
	160: makeSyscallInfo("setrlimit", Hex, Hex),
	161: makeSyscallInfo("chroot", Path),
	162: makeSyscallInfo("sync"),
	163: makeSyscallInfo("acct", Hex),
	164: makeSyscallInfo("settimeofday", Timeval, Hex),
	165: makeSyscallInfo("mount", Path, Path, Path, Hex, Path),
	166: makeSyscallInfo("umount2", Path, Hex),
	167: makeSyscallInfo("swapon", Hex, Hex),
	168: makeSyscallInfo("swapoff", Hex),
	169: makeSyscallInfo("reboot", Hex, Hex, Hex, Hex),
	170: makeSyscallInfo("sethostname", Hex, Hex),
	171: makeSyscallInfo("setdomainname", Hex, Hex),
	172: makeSyscallInfo("iopl", Hex),
	173: makeSyscallInfo("ioperm", Hex, Hex, Hex),
	174: makeSyscallInfo("create_module", Path, Hex),
	175: makeSyscallInfo("init_module", Hex, Hex, Hex),
	176: makeSyscallInfo("delete_module", Hex, Hex),
	177: makeSyscallInfo("get_kernel_syms", Hex),
	// 178: query_module (only present in Linux < 2.6)
	179: makeSyscallInfo("quotactl", Hex, Hex, Hex, Hex),
	180: makeSyscallInfo("nfsservctl", Hex, Hex, Hex),
	// 181: getpmsg (not implemented in the Linux kernel)
	// 182: putpmsg (not implemented in the Linux kernel)
	// 183: afs_syscall (not implemented in the Linux kernel)
	// 184: tuxcall (not implemented in the Linux kernel)
	// 185: security (not implemented in the Linux kernel)
	186: makeSyscallInfo("gettid"),
	187: makeSyscallInfo("readahead", Hex, Hex, Hex),
	188: makeSyscallInfo("setxattr", Path, Path, Hex, Hex, Hex),
	189: makeSyscallInfo("lsetxattr", Path, Path, Hex, Hex, Hex),
	190: makeSyscallInfo("fsetxattr", Hex, Path, Hex, Hex, Hex),
	191: makeSyscallInfo("getxattr", Path, Path, Hex, Hex),
	192: makeSyscallInfo("lgetxattr", Path, Path, Hex, Hex),
	193: makeSyscallInfo("fgetxattr", Hex, Path, Hex, Hex),
	194: makeSyscallInfo("listxattr", Path, Path, Hex),
	195: makeSyscallInfo("llistxattr", Path, Path, Hex),
	196: makeSyscallInfo("flistxattr", Hex, Path, Hex),
	197: makeSyscallInfo("removexattr", Path, Path),
	198: makeSyscallInfo("lremovexattr", Path, Path),
	199: makeSyscallInfo("fremovexattr", Hex, Path),
	200: makeSyscallInfo("tkill", Hex, Hex),
	201: makeSyscallInfo("time", Hex),
	202: makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex),
	203: makeSyscallInfo("sched_setaffinity", Hex, Hex, Hex),
	204: makeSyscallInfo("sched_getaffinity", Hex, Hex, Hex),
	205: makeSyscallInfo("set_thread_area", Hex),
	206: makeSyscallInfo("io_setup", Hex, Hex),
	207: makeSyscallInfo("io_destroy", Hex),
	208: makeSyscallInfo("io_getevents", Hex, Hex, Hex, Hex, Timespec),
	209: makeSyscallInfo("io_submit", Hex, Hex, Hex),
	210: makeSyscallInfo("io_cancel", Hex, Hex, Hex),
	211: makeSyscallInfo("get_thread_area", Hex),
	212: makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex),
	213: makeSyscallInfo("epoll_create", Hex),
	// 214: epoll_ctl_old (not implemented in the Linux kernel)
	// 215: epoll_wait_old (not implemented in the Linux kernel)
	216: makeSyscallInfo("remap_file_pages", Hex, Hex, Hex, Hex, Hex),
	217: makeSyscallInfo("getdents64", Hex, Hex, Hex),
	218: makeSyscallInfo("set_tid_address", Hex),
	219: makeSyscallInfo("restart_syscall"),
	220: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex),
	221: makeSyscallInfo("fadvise64", Hex, Hex, Hex, Hex),
	222: makeSyscallInfo("timer_create", Hex, Hex, Hex),
	223: makeSyscallInfo("timer_settime", Hex, Hex, Hex, Hex),
	224: makeSyscallInfo("timer_gettime", Hex, Hex),
	225: makeSyscallInfo("timer_getoverrun", Hex),
	226: makeSyscallInfo("timer_delete", Hex),
	227: makeSyscallInfo("clock_settime", Hex, Timespec),
	228: makeSyscallInfo("clock_gettime", Hex, PostTimespec),
	229: makeSyscallInfo("clock_getres", Hex, PostTimespec),
	230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec),
	231: makeSyscallInfo("exit_group", Hex),
	232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex),
	233: makeSyscallInfo("epoll_ctl", Hex, Hex, Hex, Hex),
	234: makeSyscallInfo("tgkill", Hex, Hex, Hex),
	235: makeSyscallInfo("utimes", Path, Timeval),
	// 236: vserver (not implemented in the Linux kernel)
	237: makeSyscallInfo("mbind", Hex, Hex, Hex, Hex, Hex, Hex),
	238: makeSyscallInfo("set_mempolicy", Hex, Hex, Hex),
	239: makeSyscallInfo("get_mempolicy", Hex, Hex, Hex, Hex, Hex),
	240: makeSyscallInfo("mq_open", Hex, Hex, Hex, Hex),
	241: makeSyscallInfo("mq_unlink", Hex),
	242: makeSyscallInfo("mq_timedsend", Hex, Hex, Hex, Hex, Hex),
	243: makeSyscallInfo("mq_timedreceive", Hex, Hex, Hex, Hex, Hex),
	244: makeSyscallInfo("mq_notify", Hex, Hex),
	245: makeSyscallInfo("mq_getsetattr", Hex, Hex, Hex),
	246: makeSyscallInfo("kexec_load", Hex, Hex, Hex, Hex),
	247: makeSyscallInfo("waitid", Hex, Hex, Hex, Hex, Rusage),
	248: makeSyscallInfo("add_key", Hex, Hex, Hex, Hex, Hex),
	249: makeSyscallInfo("request_key", Hex, Hex, Hex, Hex),
	250: makeSyscallInfo("keyctl", Hex, Hex, Hex, Hex, Hex),
	251: makeSyscallInfo("ioprio_set", Hex, Hex, Hex),
	252: makeSyscallInfo("ioprio_get", Hex, Hex),
	253: makeSyscallInfo("inotify_init"),
	254: makeSyscallInfo("inotify_add_watch", Hex, Hex, Hex),
	255: makeSyscallInfo("inotify_rm_watch", Hex, Hex),
	256: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex),
	257: makeSyscallInfo("openat", Hex, Path, Hex, Mode),
	258: makeSyscallInfo("mkdirat", Hex, Path, Hex),
	259: makeSyscallInfo("mknodat", Hex, Path, Mode, Hex),
	260: makeSyscallInfo("fchownat", Hex, Path, Hex, Hex, Hex),
	261: makeSyscallInfo("futimesat", Hex, Path, Hex),
	262: makeSyscallInfo("newfstatat", Hex, Path, Stat, Hex),
	263: makeSyscallInfo("unlinkat", Hex, Path, Hex),
	264: makeSyscallInfo("renameat", Hex, Path, Hex, Path),
	265: makeSyscallInfo("linkat", Hex, Path, Hex, Path, Hex),
	266: makeSyscallInfo("symlinkat", Path, Hex, Path),
	267: makeSyscallInfo("readlinkat", Hex, Path, ReadBuffer, Hex),
	268: makeSyscallInfo("fchmodat", Hex, Path, Mode),
	269: makeSyscallInfo("faccessat", Hex, Path, Oct, Hex),
	270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex),
	271: makeSyscallInfo("ppoll", Hex, Hex, Timespec, Hex, Hex),
	272: makeSyscallInfo("unshare", Hex),
	273: makeSyscallInfo("set_robust_list", Hex, Hex),
	274: makeSyscallInfo("get_robust_list", Hex, Hex, Hex),
	275: makeSyscallInfo("splice", Hex, Hex, Hex, Hex, Hex, Hex),
	276: makeSyscallInfo("tee", Hex, Hex, Hex, Hex),
	277: makeSyscallInfo("sync_file_range", Hex, Hex, Hex, Hex),
	278: makeSyscallInfo("vmsplice", Hex, Hex, Hex, Hex),
	279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex),
	280: makeSyscallInfo("utimensat", Hex, Path, UTimeTimespec, Hex),
	281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, Hex, Hex),
	282: makeSyscallInfo("signalfd", Hex, Hex, Hex),
	283: makeSyscallInfo("timerfd_create", Hex, Hex),
	284: makeSyscallInfo("eventfd", Hex),
	285: makeSyscallInfo("fallocate", Hex, Hex, Hex, Hex),
	286: makeSyscallInfo("timerfd_settime", Hex, Hex, Hex, Hex),
	287: makeSyscallInfo("timerfd_gettime", Hex, Hex),
	288: makeSyscallInfo("accept4", Hex, PostSockAddr, SockLen, SockFlags),
	289: makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex),
	290: makeSyscallInfo("eventfd2", Hex, Hex),
	291: makeSyscallInfo("epoll_create1", Hex),
	292: makeSyscallInfo("dup3", Hex, Hex, Hex),
	293: makeSyscallInfo("pipe2", PipeFDs, Hex),
	294: makeSyscallInfo("inotify_init1", Hex),
	295: makeSyscallInfo("preadv", Hex, ReadIOVec, Hex, Hex),
	296: makeSyscallInfo("pwritev", Hex, WriteIOVec, Hex, Hex),
	297: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Hex, Hex),
	298: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex),
	299: makeSyscallInfo("recvmmsg", Hex, Hex, Hex, Hex, Hex),
	300: makeSyscallInfo("fanotify_init", Hex, Hex),
	301: makeSyscallInfo("fanotify_mark", Hex, Hex, Hex, Hex, Hex),
	302: makeSyscallInfo("prlimit64", Hex, Hex, Hex, Hex),
	303: makeSyscallInfo("name_to_handle_at", Hex, Hex, Hex, Hex, Hex),
	304: makeSyscallInfo("open_by_handle_at", Hex, Hex, Hex),
	305: makeSyscallInfo("clock_adjtime", Hex, Hex),
	306: makeSyscallInfo("syncfs", Hex),
	307: makeSyscallInfo("sendmmsg", Hex, Hex, Hex, Hex),
	308: makeSyscallInfo("setns", Hex, Hex),
	309: makeSyscallInfo("getcpu", Hex, Hex, Hex),
	310: makeSyscallInfo("process_vm_readv", Hex, ReadIOVec, Hex, IOVec, Hex, Hex),
	311: makeSyscallInfo("process_vm_writev", Hex, IOVec, Hex, WriteIOVec, Hex, Hex),
	312: makeSyscallInfo("kcmp", Hex, Hex, Hex, Hex, Hex),
	313: makeSyscallInfo("finit_module", Hex, Hex, Hex),
	314: makeSyscallInfo("sched_setattr", Hex, Hex, Hex),
	315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
	316: makeSyscallInfo("renameat2", Hex, Path, Hex, Path, Hex),
	317: makeSyscallInfo("seccomp", Hex, Hex, Hex),
}

// ----- file boundary: pkg/sentry/strace/open.go (new file in this change) -----

// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package strace

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/abi"
)

// OpenMode represents the mode to open(2) a file.
// These are the mutually-exclusive low access-mode bits (O_ACCMODE).
var OpenMode = abi.ValueSet{
	{Value: syscall.O_RDWR, Name: "O_RDWR"},
	{Value: syscall.O_WRONLY, Name: "O_WRONLY"},
	{Value: syscall.O_RDONLY, Name: "O_RDONLY"},
}

// OpenFlagSet is the set of open(2) flags.
+var OpenFlagSet = abi.FlagSet{ + { + Flag: syscall.O_APPEND, + Name: "O_APPEND", + }, + { + Flag: syscall.O_ASYNC, + Name: "O_ASYNC", + }, + { + Flag: syscall.O_CLOEXEC, + Name: "O_CLOEXEC", + }, + { + Flag: syscall.O_CREAT, + Name: "O_CREAT", + }, + { + Flag: syscall.O_DIRECT, + Name: "O_DIRECT", + }, + { + Flag: syscall.O_DIRECTORY, + Name: "O_DIRECTORY", + }, + { + Flag: syscall.O_EXCL, + Name: "O_EXCL", + }, + { + Flag: syscall.O_NOATIME, + Name: "O_NOATIME", + }, + { + Flag: syscall.O_NOCTTY, + Name: "O_NOCTTY", + }, + { + Flag: syscall.O_NOFOLLOW, + Name: "O_NOFOLLOW", + }, + { + Flag: syscall.O_NONBLOCK, + Name: "O_NONBLOCK", + }, + { + Flag: 0x200000, // O_PATH + Name: "O_PATH", + }, + { + Flag: syscall.O_SYNC, + Name: "O_SYNC", + }, + { + Flag: syscall.O_TRUNC, + Name: "O_TRUNC", + }, +} + +func open(val uint64) string { + s := OpenMode.Parse(val & syscall.O_ACCMODE) + if flags := OpenFlagSet.Parse(val &^ syscall.O_ACCMODE); flags != "" { + s += "|" + flags + } + return s +} diff --git a/pkg/sentry/strace/ptrace.go b/pkg/sentry/strace/ptrace.go new file mode 100644 index 000000000..a0dabb27a --- /dev/null +++ b/pkg/sentry/strace/ptrace.go @@ -0,0 +1,178 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// PtraceRequestSet are the possible ptrace(2) requests. 
// PtraceRequestSet are the possible ptrace(2) requests. Requests not
// exported by the syscall package are taken from the sentry's kernel
// package (PTRACE_SEIZE and later additions).
var PtraceRequestSet = abi.ValueSet{
	{Value: syscall.PTRACE_TRACEME, Name: "PTRACE_TRACEME"},
	{Value: syscall.PTRACE_PEEKTEXT, Name: "PTRACE_PEEKTEXT"},
	{Value: syscall.PTRACE_PEEKDATA, Name: "PTRACE_PEEKDATA"},
	{Value: syscall.PTRACE_PEEKUSR, Name: "PTRACE_PEEKUSR"},
	{Value: syscall.PTRACE_POKETEXT, Name: "PTRACE_POKETEXT"},
	{Value: syscall.PTRACE_POKEDATA, Name: "PTRACE_POKEDATA"},
	{Value: syscall.PTRACE_POKEUSR, Name: "PTRACE_POKEUSR"},
	{Value: syscall.PTRACE_CONT, Name: "PTRACE_CONT"},
	{Value: syscall.PTRACE_KILL, Name: "PTRACE_KILL"},
	{Value: syscall.PTRACE_SINGLESTEP, Name: "PTRACE_SINGLESTEP"},
	{Value: syscall.PTRACE_ATTACH, Name: "PTRACE_ATTACH"},
	{Value: syscall.PTRACE_DETACH, Name: "PTRACE_DETACH"},
	{Value: syscall.PTRACE_SYSCALL, Name: "PTRACE_SYSCALL"},
	{Value: syscall.PTRACE_SETOPTIONS, Name: "PTRACE_SETOPTIONS"},
	{Value: syscall.PTRACE_GETEVENTMSG, Name: "PTRACE_GETEVENTMSG"},
	{Value: syscall.PTRACE_GETSIGINFO, Name: "PTRACE_GETSIGINFO"},
	{Value: syscall.PTRACE_SETSIGINFO, Name: "PTRACE_SETSIGINFO"},
	{Value: syscall.PTRACE_GETREGSET, Name: "PTRACE_GETREGSET"},
	{Value: syscall.PTRACE_SETREGSET, Name: "PTRACE_SETREGSET"},
	{Value: kernel.PTRACE_SEIZE, Name: "PTRACE_SEIZE"},
	{Value: kernel.PTRACE_INTERRUPT, Name: "PTRACE_INTERRUPT"},
	{Value: kernel.PTRACE_LISTEN, Name: "PTRACE_LISTEN"},
	{Value: kernel.PTRACE_PEEKSIGINFO, Name: "PTRACE_PEEKSIGINFO"},
	{Value: kernel.PTRACE_GETSIGMASK, Name: "PTRACE_GETSIGMASK"},
	{Value: kernel.PTRACE_SETSIGMASK, Name: "PTRACE_SETSIGMASK"},
	{Value: syscall.PTRACE_GETREGS, Name: "PTRACE_GETREGS"},
	{Value: syscall.PTRACE_SETREGS, Name: "PTRACE_SETREGS"},
	{Value: syscall.PTRACE_GETFPREGS, Name: "PTRACE_GETFPREGS"},
	{Value: syscall.PTRACE_SETFPREGS, Name: "PTRACE_SETFPREGS"},
	{Value: syscall.PTRACE_GETFPXREGS, Name: "PTRACE_GETFPXREGS"},
	{Value: syscall.PTRACE_SETFPXREGS, Name: "PTRACE_SETFPXREGS"},
	{Value: syscall.PTRACE_OLDSETOPTIONS, Name: "PTRACE_OLDSETOPTIONS"},
	{Value: syscall.PTRACE_GET_THREAD_AREA, Name: "PTRACE_GET_THREAD_AREA"},
	{Value: syscall.PTRACE_SET_THREAD_AREA, Name: "PTRACE_SET_THREAD_AREA"},
	{Value: syscall.PTRACE_ARCH_PRCTL, Name: "PTRACE_ARCH_PRCTL"},
	{Value: syscall.PTRACE_SYSEMU, Name: "PTRACE_SYSEMU"},
	{Value: syscall.PTRACE_SYSEMU_SINGLESTEP, Name: "PTRACE_SYSEMU_SINGLESTEP"},
	{Value: syscall.PTRACE_SINGLEBLOCK, Name: "PTRACE_SINGLEBLOCK"},
}

// ----- file boundary: pkg/sentry/strace/socket.go (new file in this change) -----

// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package strace

import (
	"fmt"
	"strings"

	"gvisor.googlesource.com/gvisor/pkg/abi"
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/binary"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink"
	slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// SocketFamily are the possible socket(2) families.
var SocketFamily = abi.ValueSet{
	{Value: linux.AF_UNSPEC, Name: "AF_UNSPEC"},
	{Value: linux.AF_UNIX, Name: "AF_UNIX"},
	{Value: linux.AF_INET, Name: "AF_INET"},
	{Value: linux.AF_AX25, Name: "AF_AX25"},
	{Value: linux.AF_IPX, Name: "AF_IPX"},
	{Value: linux.AF_APPLETALK, Name: "AF_APPLETALK"},
	{Value: linux.AF_NETROM, Name: "AF_NETROM"},
	{Value: linux.AF_BRIDGE, Name: "AF_BRIDGE"},
	{Value: linux.AF_ATMPVC, Name: "AF_ATMPVC"},
	{Value: linux.AF_X25, Name: "AF_X25"},
	{Value: linux.AF_INET6, Name: "AF_INET6"},
	{Value: linux.AF_ROSE, Name: "AF_ROSE"},
	{Value: linux.AF_DECnet, Name: "AF_DECnet"},
	{Value: linux.AF_NETBEUI, Name: "AF_NETBEUI"},
	{Value: linux.AF_SECURITY, Name: "AF_SECURITY"},
	{Value: linux.AF_KEY, Name: "AF_KEY"},
	{Value: linux.AF_NETLINK, Name: "AF_NETLINK"},
	{Value: linux.AF_PACKET, Name: "AF_PACKET"},
	{Value: linux.AF_ASH, Name: "AF_ASH"},
	{Value: linux.AF_ECONET, Name: "AF_ECONET"},
	{Value: linux.AF_ATMSVC, Name: "AF_ATMSVC"},
	{Value: linux.AF_RDS, Name: "AF_RDS"},
	{Value: linux.AF_SNA, Name: "AF_SNA"},
	{Value: linux.AF_IRDA, Name: "AF_IRDA"},
	{Value: linux.AF_PPPOX, Name: "AF_PPPOX"},
	{Value: linux.AF_WANPIPE, Name: "AF_WANPIPE"},
	{Value: linux.AF_LLC, Name: "AF_LLC"},
	{Value: linux.AF_IB, Name: "AF_IB"},
	{Value: linux.AF_MPLS, Name: "AF_MPLS"},
	{Value: linux.AF_CAN, Name: "AF_CAN"},
	{Value: linux.AF_TIPC, Name: "AF_TIPC"},
	{Value: linux.AF_BLUETOOTH, Name: "AF_BLUETOOTH"},
	{Value: linux.AF_IUCV, Name: "AF_IUCV"},
	{Value: linux.AF_RXRPC, Name: "AF_RXRPC"},
	{Value: linux.AF_ISDN, Name: "AF_ISDN"},
	{Value: linux.AF_PHONET, Name: "AF_PHONET"},
	{Value: linux.AF_IEEE802154, Name: "AF_IEEE802154"},
	{Value: linux.AF_CAIF, Name: "AF_CAIF"},
	{Value: linux.AF_ALG, Name: "AF_ALG"},
	{Value: linux.AF_NFC, Name: "AF_NFC"},
	{Value: linux.AF_VSOCK, Name: "AF_VSOCK"},
}

// SocketType are the possible socket(2) types.
var SocketType = abi.ValueSet{
	{Value: linux.SOCK_STREAM, Name: "SOCK_STREAM"},
	{Value: linux.SOCK_DGRAM, Name: "SOCK_DGRAM"},
	{Value: linux.SOCK_RAW, Name: "SOCK_RAW"},
	{Value: linux.SOCK_RDM, Name: "SOCK_RDM"},
	{Value: linux.SOCK_SEQPACKET, Name: "SOCK_SEQPACKET"},
	{Value: linux.SOCK_DCCP, Name: "SOCK_DCCP"},
	{Value: linux.SOCK_PACKET, Name: "SOCK_PACKET"},
}

// SocketFlagSet are the possible socket(2) flags.
var SocketFlagSet = abi.FlagSet{
	{Flag: linux.SOCK_CLOEXEC, Name: "SOCK_CLOEXEC"},
	{Flag: linux.SOCK_NONBLOCK, Name: "SOCK_NONBLOCK"},
}

// ipProtocol are the possible socket(2) types for INET and INET6 sockets.
+var ipProtocol = abi.ValueSet{ + { + Value: linux.IPPROTO_IP, + Name: "IPPROTO_IP", + }, + { + Value: linux.IPPROTO_ICMP, + Name: "IPPROTO_ICMP", + }, + { + Value: linux.IPPROTO_IGMP, + Name: "IPPROTO_IGMP", + }, + { + Value: linux.IPPROTO_IPIP, + Name: "IPPROTO_IPIP", + }, + { + Value: linux.IPPROTO_TCP, + Name: "IPPROTO_TCP", + }, + { + Value: linux.IPPROTO_EGP, + Name: "IPPROTO_EGP", + }, + { + Value: linux.IPPROTO_PUP, + Name: "IPPROTO_PUP", + }, + { + Value: linux.IPPROTO_UDP, + Name: "IPPROTO_UDP", + }, + { + Value: linux.IPPROTO_IDP, + Name: "IPPROTO_IDP", + }, + { + Value: linux.IPPROTO_TP, + Name: "IPPROTO_TP", + }, + { + Value: linux.IPPROTO_DCCP, + Name: "IPPROTO_DCCP", + }, + { + Value: linux.IPPROTO_IPV6, + Name: "IPPROTO_IPV6", + }, + { + Value: linux.IPPROTO_RSVP, + Name: "IPPROTO_RSVP", + }, + { + Value: linux.IPPROTO_GRE, + Name: "IPPROTO_GRE", + }, + { + Value: linux.IPPROTO_ESP, + Name: "IPPROTO_ESP", + }, + { + Value: linux.IPPROTO_AH, + Name: "IPPROTO_AH", + }, + { + Value: linux.IPPROTO_MTP, + Name: "IPPROTO_MTP", + }, + { + Value: linux.IPPROTO_BEETPH, + Name: "IPPROTO_BEETPH", + }, + { + Value: linux.IPPROTO_ENCAP, + Name: "IPPROTO_ENCAP", + }, + { + Value: linux.IPPROTO_PIM, + Name: "IPPROTO_PIM", + }, + { + Value: linux.IPPROTO_COMP, + Name: "IPPROTO_COMP", + }, + { + Value: linux.IPPROTO_SCTP, + Name: "IPPROTO_SCTP", + }, + { + Value: linux.IPPROTO_UDPLITE, + Name: "IPPROTO_UDPLITE", + }, + { + Value: linux.IPPROTO_MPLS, + Name: "IPPROTO_MPLS", + }, + { + Value: linux.IPPROTO_RAW, + Name: "IPPROTO_RAW", + }, +} + +// SocketProtocol are the possible socket(2) protocols for each protocol family. 
+var SocketProtocol = map[int32]abi.ValueSet{ + linux.AF_INET: ipProtocol, + linux.AF_INET6: ipProtocol, + linux.AF_NETLINK: { + { + Value: linux.NETLINK_ROUTE, + Name: "NETLINK_ROUTE", + }, + { + Value: linux.NETLINK_UNUSED, + Name: "NETLINK_UNUSED", + }, + { + Value: linux.NETLINK_USERSOCK, + Name: "NETLINK_USERSOCK", + }, + { + Value: linux.NETLINK_FIREWALL, + Name: "NETLINK_FIREWALL", + }, + { + Value: linux.NETLINK_SOCK_DIAG, + Name: "NETLINK_SOCK_DIAG", + }, + { + Value: linux.NETLINK_NFLOG, + Name: "NETLINK_NFLOG", + }, + { + Value: linux.NETLINK_XFRM, + Name: "NETLINK_XFRM", + }, + { + Value: linux.NETLINK_SELINUX, + Name: "NETLINK_SELINUX", + }, + { + Value: linux.NETLINK_ISCSI, + Name: "NETLINK_ISCSI", + }, + { + Value: linux.NETLINK_AUDIT, + Name: "NETLINK_AUDIT", + }, + { + Value: linux.NETLINK_FIB_LOOKUP, + Name: "NETLINK_FIB_LOOKUP", + }, + { + Value: linux.NETLINK_CONNECTOR, + Name: "NETLINK_CONNECTOR", + }, + { + Value: linux.NETLINK_NETFILTER, + Name: "NETLINK_NETFILTER", + }, + { + Value: linux.NETLINK_IP6_FW, + Name: "NETLINK_IP6_FW", + }, + { + Value: linux.NETLINK_DNRTMSG, + Name: "NETLINK_DNRTMSG", + }, + { + Value: linux.NETLINK_KOBJECT_UEVENT, + Name: "NETLINK_KOBJECT_UEVENT", + }, + { + Value: linux.NETLINK_GENERIC, + Name: "NETLINK_GENERIC", + }, + { + Value: linux.NETLINK_SCSITRANSPORT, + Name: "NETLINK_SCSITRANSPORT", + }, + { + Value: linux.NETLINK_ECRYPTFS, + Name: "NETLINK_ECRYPTFS", + }, + { + Value: linux.NETLINK_RDMA, + Name: "NETLINK_RDMA", + }, + { + Value: linux.NETLINK_CRYPTO, + Name: "NETLINK_CRYPTO", + }, + }, +} + +var controlMessageType = map[int32]string{ + linux.SCM_RIGHTS: "SCM_RIGHTS", + linux.SCM_CREDENTIALS: "SCM_CREDENTIALS", +} + +func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) string { + if length > maxBytes { + return fmt.Sprintf("%#x (error decoding control: invalid length (%d))", addr, length) + } + + buf := make([]byte, length) + if _, err := t.CopyIn(addr, &buf); err != nil { + 
return fmt.Sprintf("%#x (error decoding control: %v)", addr, err) + } + + var strs []string + + for i := 0; i < len(buf); { + if i+linux.SizeOfControlMessageHeader > len(buf) { + strs = append(strs, "{invalid control message (too short)}") + break + } + + var h linux.ControlMessageHeader + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h) + i += linux.SizeOfControlMessageHeader + + var skipData bool + level := "SOL_SOCKET" + if h.Level != linux.SOL_SOCKET { + skipData = true + level = fmt.Sprint(h.Level) + } + + typ, ok := controlMessageType[h.Type] + if !ok { + skipData = true + typ = fmt.Sprint(h.Type) + } + + if h.Length > uint64(len(buf)-i) { + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, content extends beyond buffer}", + level, + typ, + h.Length, + )) + break + } + + width := t.Arch().Width() + length := int(h.Length) - linux.SizeOfControlMessageHeader + + if skipData { + strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length)) + i += control.AlignUp(i+length, width) + continue + } + + switch h.Type { + case linux.SCM_RIGHTS: + rightsSize := control.AlignDown(length, linux.SizeOfControlMessageRight) + + numRights := rightsSize / linux.SizeOfControlMessageRight + fds := make(linux.ControlMessageRights, numRights) + binary.Unmarshal(buf[i:i+rightsSize], usermem.ByteOrder, &fds) + + rights := make([]string, 0, len(fds)) + for _, fd := range fds { + rights = append(rights, fmt.Sprint(fd)) + } + + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, content: %s}", + level, + typ, + h.Length, + strings.Join(rights, ","), + )) + + i += control.AlignUp(length, width) + + case linux.SCM_CREDENTIALS: + if length < linux.SizeOfControlMessageCredentials { + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, content too short}", + level, + typ, + h.Length, + )) + i += control.AlignUp(length, width) + break + } + + var creds linux.ControlMessageCredentials + 
binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds) + + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, pid: %d, uid: %d, gid: %d}", + level, + typ, + h.Length, + creds.PID, + creds.UID, + creds.GID, + )) + + i += control.AlignUp(length, width) + + default: + panic("unreachable") + } + } + + return fmt.Sprintf("%#x %s", addr, strings.Join(strs, ", ")) +} + +func msghdr(t *kernel.Task, addr usermem.Addr, printContent bool, maxBytes uint64) string { + var msg slinux.MessageHeader64 + if err := slinux.CopyInMessageHeader64(t, addr, &msg); err != nil { + return fmt.Sprintf("%#x (error decoding msghdr: %v)", addr, err) + } + s := fmt.Sprintf( + "%#x {name=%#x, namelen=%d, iovecs=%s", + addr, + msg.Name, + msg.NameLen, + iovecs(t, usermem.Addr(msg.Iov), int(msg.IovLen), printContent, maxBytes), + ) + if printContent { + s = fmt.Sprintf("%s, control={%s}", s, cmsghdr(t, usermem.Addr(msg.Control), msg.ControlLen, maxBytes)) + } else { + s = fmt.Sprintf("%s, control=%#x, control_len=%d", s, msg.Control, msg.ControlLen) + } + return fmt.Sprintf("%s, flags=%d}", s, msg.Flags) +} + +func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string { + if addr == 0 { + return "null" + } + + b, err := slinux.CaptureAddress(t, addr, length) + if err != nil { + return fmt.Sprintf("%#x {error reading address: %v}", addr, err) + } + + // Extract address family. 
+ if len(b) < 2 { + return fmt.Sprintf("%#x {address too short: %d bytes}", addr, len(b)) + } + family := usermem.ByteOrder.Uint16(b) + + familyStr := SocketFamily.Parse(uint64(family)) + + switch family { + case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX: + fa, err := epsocket.GetAddress(int(family), b) + if err != nil { + return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) + } + + if family == linux.AF_UNIX { + return fmt.Sprintf("%#x {Family: %s, Addr: %q}", addr, familyStr, string(fa.Addr)) + } + + return fmt.Sprintf("%#x {Family: %s, Addr: %v, Port: %d}", addr, familyStr, fa.Addr, fa.Port) + case linux.AF_NETLINK: + sa, err := netlink.ExtractSockAddr(b) + if err != nil { + return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) + } + return fmt.Sprintf("%#x {Family: %s, PortID: %d, Groups: %d}", addr, familyStr, sa.PortID, sa.Groups) + default: + return fmt.Sprintf("%#x {Family: %s, family addr format unknown}", addr, familyStr) + } +} + +func postSockAddr(t *kernel.Task, addr usermem.Addr, lengthPtr usermem.Addr) string { + if addr == 0 { + return "null" + } + + if lengthPtr == 0 { + return fmt.Sprintf("%#x {length null}", addr) + } + + l, err := copySockLen(t, lengthPtr) + if err != nil { + return fmt.Sprintf("%#x {error reading length: %v}", addr, err) + } + + return sockAddr(t, addr, l) +} + +func copySockLen(t *kernel.Task, addr usermem.Addr) (uint32, error) { + // socklen_t is 32-bits. 
+ var l uint32 + _, err := t.CopyIn(addr, &l) + return l, err +} + +func sockLenPointer(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + l, err := copySockLen(t, addr) + if err != nil { + return fmt.Sprintf("%#x {error reading length: %v}", addr, err) + } + return fmt.Sprintf("%#x {length=%v}", addr, l) +} + +func sockType(stype int32) string { + s := SocketType.Parse(uint64(stype & linux.SOCK_TYPE_MASK)) + if flags := SocketFlagSet.Parse(uint64(stype &^ linux.SOCK_TYPE_MASK)); flags != "" { + s += "|" + flags + } + return s +} + +func sockProtocol(family, protocol int32) string { + protocols, ok := SocketProtocol[family] + if !ok { + return fmt.Sprintf("%#x", protocol) + } + return protocols.Parse(uint64(protocol)) +} + +func sockFlags(flags int32) string { + if flags == 0 { + return "0" + } + return SocketFlagSet.Parse(uint64(flags)) +} diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go new file mode 100644 index 000000000..4cd16d2f8 --- /dev/null +++ b/pkg/sentry/strace/strace.go @@ -0,0 +1,666 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package strace implements the logic to print out the input and the return value +// of each traced syscall. 
+package strace + +import ( + "encoding/binary" + "fmt" + "strconv" + "strings" + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/strace/strace_go_proto" + slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// DefaultLogMaximumSize is the default LogMaximumSize. +const DefaultLogMaximumSize = 1024 + +// LogMaximumSize determines the maximum display size for data blobs (read, +// write, etc.). +var LogMaximumSize uint = DefaultLogMaximumSize + +// EventMaximumSize determines the maximum size for data blobs (read, write, +// etc.) sent over the event channel. Default is 0 because most clients cannot +// do anything useful with binary text dump of byte array arguments. 
+var EventMaximumSize uint + +func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, maxBytes uint64) string { + if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { + return fmt.Sprintf("%#x (error decoding iovecs: invalid iovcnt)", addr) + } + ars, err := t.CopyInIovecs(addr, iovcnt) + if err != nil { + return fmt.Sprintf("%#x (error decoding iovecs: %v)", addr, err) + } + + var totalBytes uint64 + var truncated bool + iovs := make([]string, iovcnt) + for i := 0; !ars.IsEmpty(); i, ars = i+1, ars.Tail() { + ar := ars.Head() + if ar.Length() == 0 || !printContent { + iovs[i] = fmt.Sprintf("{base=%#x, len=%d}", ar.Start, ar.Length()) + continue + } + + size := uint64(ar.Length()) + if truncated || totalBytes+size > maxBytes { + truncated = true + size = maxBytes - totalBytes + } else { + totalBytes += uint64(ar.Length()) + } + + b := make([]byte, size) + amt, err := t.CopyIn(ar.Start, b) + if err != nil { + iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q..., error decoding string: %v}", ar.Start, ar.Length(), b[:amt], err) + continue + } + + dot := "" + if truncated { + // Indicate truncation. + dot = "..." + } + iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q%s}", ar.Start, ar.Length(), b[:amt], dot) + } + + return fmt.Sprintf("%#x %s", addr, strings.Join(iovs, ", ")) +} + +func dump(t *kernel.Task, addr usermem.Addr, size uint, maximumBlobSize uint) string { + origSize := size + if size > maximumBlobSize { + size = maximumBlobSize + } + if size == 0 { + return "" + } + + b := make([]byte, size) + amt, err := t.CopyIn(addr, b) + if err != nil { + return fmt.Sprintf("%#x (error decoding string: %s)", addr, err) + } + + dot := "" + if uint(amt) < origSize { + // ... if we truncated the dump. + dot = "..." 
+ } + + return fmt.Sprintf("%#x %q%s", addr, b[:amt], dot) +} + +func path(t *kernel.Task, addr usermem.Addr) string { + path, err := t.CopyInString(addr, syscall.PathMax) + if err != nil { + return fmt.Sprintf("%#x (error decoding path: %s)", addr, err) + } + return fmt.Sprintf("%#x %s", addr, path) +} + +func fdpair(t *kernel.Task, addr usermem.Addr) string { + var fds [2]int32 + _, err := t.CopyIn(addr, &fds) + if err != nil { + return fmt.Sprintf("%#x (error decoding fds: %s)", addr, err) + } + + return fmt.Sprintf("%#x [%d %d]", addr, fds[0], fds[1]) +} + +func uname(t *kernel.Task, addr usermem.Addr) string { + var u linux.UtsName + if _, err := t.CopyIn(addr, &u); err != nil { + return fmt.Sprintf("%#x (error decoding utsname: %s)", addr, err) + } + + return fmt.Sprintf("%#x %s", addr, u) +} + +func utimensTimespec(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + var tim linux.Timespec + if _, err := t.CopyIn(addr, &tim); err != nil { + return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err) + } + + var ns string + switch tim.Nsec { + case linux.UTIME_NOW: + ns = "UTIME_NOW" + case linux.UTIME_OMIT: + ns = "UTIME_OMIT" + default: + ns = fmt.Sprintf("%v", tim.Nsec) + } + return fmt.Sprintf("%#x {sec=%v nsec=%s}", addr, tim.Sec, ns) +} + +func timespec(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + var tim linux.Timespec + if _, err := t.CopyIn(addr, &tim); err != nil { + return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err) + } + return fmt.Sprintf("%#x {sec=%v nsec=%v}", addr, tim.Sec, tim.Nsec) +} + +func timeval(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + var tim linux.Timeval + if _, err := t.CopyIn(addr, &tim); err != nil { + return fmt.Sprintf("%#x (error decoding timeval: %s)", addr, err) + } + + return fmt.Sprintf("%#x {sec=%v usec=%v}", addr, tim.Sec, tim.Usec) +} + +func utimbuf(t *kernel.Task, addr usermem.Addr) 
string { + if addr == 0 { + return "null" + } + + var utim syscall.Utimbuf + if _, err := t.CopyIn(addr, &utim); err != nil { + return fmt.Sprintf("%#x (error decoding utimbuf: %s)", addr, err) + } + + return fmt.Sprintf("%#x {actime=%v, modtime=%v}", addr, utim.Actime, utim.Modtime) +} + +func stat(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + var stat linux.Stat + if _, err := t.CopyIn(addr, &stat); err != nil { + return fmt.Sprintf("%#x (error decoding stat: %s)", addr, err) + } + return fmt.Sprintf("%#x {dev=%d, ino=%d, mode=%s, nlink=%d, uid=%d, gid=%d, rdev=%d, size=%d, blksize=%d, blocks=%d, atime=%s, mtime=%s, ctime=%s}", addr, stat.Dev, stat.Ino, linux.FileMode(stat.Mode), stat.Nlink, stat.UID, stat.GID, stat.Rdev, stat.Size, stat.Blksize, stat.Blocks, time.Unix(stat.ATime.Sec, stat.ATime.Nsec), time.Unix(stat.MTime.Sec, stat.MTime.Nsec), time.Unix(stat.CTime.Sec, stat.CTime.Nsec)) +} + +func itimerval(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + interval := timeval(t, addr) + value := timeval(t, addr+usermem.Addr(binary.Size(linux.Timeval{}))) + return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) +} + +func stringVector(t *kernel.Task, addr usermem.Addr) string { + vec, err := t.CopyInVector(addr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return fmt.Sprintf("%#x {error copying vector: %v}", addr, err) + } + s := fmt.Sprintf("%#x [", addr) + for i, v := range vec { + if i != 0 { + s += ", " + } + s += fmt.Sprintf("%q", v) + } + s += "]" + return s +} + +func rusage(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + var ru linux.Rusage + if _, err := t.CopyIn(addr, &ru); err != nil { + return fmt.Sprintf("%#x (error decoding rusage: %s)", addr, err) + } + return fmt.Sprintf("%#x %+v", addr, ru) +} + +// pre fills in the pre-execution arguments for a system call. 
If an argument +// cannot be interpreted before the system call is executed, then a hex value +// will be used. Note that a full output slice will always be provided, that is +// len(return) == len(args). +func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlobSize uint) []string { + var output []string + + for arg := range args { + if arg >= len(i.format) { + break + } + switch i.format[arg] { + case WriteBuffer: + output = append(output, dump(t, args[arg].Pointer(), args[arg+1].SizeT(), maximumBlobSize)) + case WriteIOVec: + output = append(output, iovecs(t, args[arg].Pointer(), int(args[arg+1].Int()), true /* content */, uint64(maximumBlobSize))) + case IOVec: + output = append(output, iovecs(t, args[arg].Pointer(), int(args[arg+1].Int()), false /* content */, uint64(maximumBlobSize))) + case SendMsgHdr: + output = append(output, msghdr(t, args[arg].Pointer(), true /* content */, uint64(maximumBlobSize))) + case RecvMsgHdr: + output = append(output, msghdr(t, args[arg].Pointer(), false /* content */, uint64(maximumBlobSize))) + case Path: + output = append(output, path(t, args[arg].Pointer())) + case ExecveStringVector: + output = append(output, stringVector(t, args[arg].Pointer())) + case SockAddr: + output = append(output, sockAddr(t, args[arg].Pointer(), uint32(args[arg+1].Uint64()))) + case SockLen: + output = append(output, sockLenPointer(t, args[arg].Pointer())) + case SockFamily: + output = append(output, SocketFamily.Parse(uint64(args[arg].Int()))) + case SockType: + output = append(output, sockType(args[arg].Int())) + case SockProtocol: + output = append(output, sockProtocol(args[arg-2].Int(), args[arg].Int())) + case SockFlags: + output = append(output, sockFlags(args[arg].Int())) + case Timespec: + output = append(output, timespec(t, args[arg].Pointer())) + case UTimeTimespec: + output = append(output, utimensTimespec(t, args[arg].Pointer())) + case ItimerVal: + output = append(output, itimerval(t, args[arg].Pointer())) + 
case Timeval: + output = append(output, timeval(t, args[arg].Pointer())) + case Utimbuf: + output = append(output, utimbuf(t, args[arg].Pointer())) + case CloneFlags: + output = append(output, CloneFlagSet.Parse(uint64(args[arg].Uint()))) + case OpenFlags: + output = append(output, open(uint64(args[arg].Uint()))) + case Mode: + output = append(output, linux.FileMode(args[arg].ModeT()).String()) + case FutexOp: + output = append(output, futex(uint64(args[arg].Uint()))) + case PtraceRequest: + output = append(output, PtraceRequestSet.Parse(args[arg].Uint64())) + case Oct: + output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) + case Hex: + fallthrough + default: + output = append(output, "0x"+strconv.FormatUint(args[arg].Uint64(), 16)) + } + } + + return output +} + +// post fills in the post-execution arguments for a system call. This modifies +// the given output slice in place with arguments that may only be interpreted +// after the system call has been executed. +func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uintptr, output []string, maximumBlobSize uint) { + for arg := range output { + if arg >= len(i.format) { + break + } + switch i.format[arg] { + case ReadBuffer: + output[arg] = dump(t, args[arg].Pointer(), uint(rval), maximumBlobSize) + case ReadIOVec: + printLength := uint64(rval) + if printLength > uint64(maximumBlobSize) { + printLength = uint64(maximumBlobSize) + } + output[arg] = iovecs(t, args[arg].Pointer(), int(args[arg+1].Int()), true /* content */, printLength) + case WriteIOVec, IOVec, WriteBuffer: + // We already have a big blast from write. + output[arg] = "..." 
+ case SendMsgHdr: + output[arg] = msghdr(t, args[arg].Pointer(), false /* content */, uint64(maximumBlobSize)) + case RecvMsgHdr: + output[arg] = msghdr(t, args[arg].Pointer(), true /* content */, uint64(maximumBlobSize)) + case PipeFDs: + output[arg] = fdpair(t, args[arg].Pointer()) + case Uname: + output[arg] = uname(t, args[arg].Pointer()) + case Stat: + output[arg] = stat(t, args[arg].Pointer()) + case PostSockAddr: + output[arg] = postSockAddr(t, args[arg].Pointer(), args[arg+1].Pointer()) + case SockLen: + output[arg] = sockLenPointer(t, args[arg].Pointer()) + case PostTimespec: + output[arg] = timespec(t, args[arg].Pointer()) + case PostItimerVal: + output[arg] = itimerval(t, args[arg].Pointer()) + case Timeval: + output[arg] = timeval(t, args[arg].Pointer()) + case Rusage: + output[arg] = rusage(t, args[arg].Pointer()) + } + } +} + +// printEnter prints the given system call entry. +func (i *SyscallInfo) printEnter(t *kernel.Task, args arch.SyscallArguments) []string { + output := i.pre(t, args, LogMaximumSize) + + switch len(output) { + case 0: + t.Infof("%s E %s()", t.Name(), i.name) + case 1: + t.Infof("%s E %s(%s)", t.Name(), i.name, + output[0]) + case 2: + t.Infof("%s E %s(%s, %s)", t.Name(), i.name, + output[0], output[1]) + case 3: + t.Infof("%s E %s(%s, %s, %s)", t.Name(), i.name, + output[0], output[1], output[2]) + case 4: + t.Infof("%s E %s(%s, %s, %s, %s)", t.Name(), i.name, + output[0], output[1], output[2], output[3]) + case 5: + t.Infof("%s E %s(%s, %s, %s, %s, %s)", t.Name(), i.name, + output[0], output[1], output[2], output[3], output[4]) + case 6: + t.Infof("%s E %s(%s, %s, %s, %s, %s, %s)", t.Name(), i.name, + output[0], output[1], output[2], output[3], output[4], output[5]) + } + + return output +} + +// printExit prints the given system call exit. 
+func (i *SyscallInfo) printExit(t *kernel.Task, elapsed time.Duration, output []string, args arch.SyscallArguments, retval uintptr, err error, errno int) { + var rval string + if err == nil { + // Fill in the output after successful execution. + i.post(t, args, retval, output, LogMaximumSize) + rval = fmt.Sprintf("%#x (%v)", retval, elapsed) + } else { + rval = fmt.Sprintf("%#x errno=%d (%s) (%v)", retval, errno, err, elapsed) + } + + switch len(output) { + case 0: + t.Infof("%s X %s() = %s", t.Name(), i.name, + rval) + case 1: + t.Infof("%s X %s(%s) = %s", t.Name(), i.name, + output[0], rval) + case 2: + t.Infof("%s X %s(%s, %s) = %s", t.Name(), i.name, + output[0], output[1], rval) + case 3: + t.Infof("%s X %s(%s, %s, %s) = %s", t.Name(), i.name, + output[0], output[1], output[2], rval) + case 4: + t.Infof("%s X %s(%s, %s, %s, %s) = %s", t.Name(), i.name, + output[0], output[1], output[2], output[3], rval) + case 5: + t.Infof("%s X %s(%s, %s, %s, %s, %s) = %s", t.Name(), i.name, + output[0], output[1], output[2], output[3], output[4], rval) + case 6: + t.Infof("%s X %s(%s, %s, %s, %s, %s, %s) = %s", t.Name(), i.name, + output[0], output[1], output[2], output[3], output[4], output[5], rval) + } +} + +// sendEnter sends the syscall enter to event log. +func (i *SyscallInfo) sendEnter(t *kernel.Task, args arch.SyscallArguments) []string { + output := i.pre(t, args, EventMaximumSize) + + event := pb.Strace{ + Process: t.Name(), + Function: i.name, + Info: &pb.Strace_Enter{ + Enter: &pb.StraceEnter{}, + }, + } + for _, arg := range output { + event.Args = append(event.Args, arg) + } + eventchannel.Emit(&event) + + return output +} + +// sendExit sends the syscall exit to event log. +func (i *SyscallInfo) sendExit(t *kernel.Task, elapsed time.Duration, output []string, args arch.SyscallArguments, rval uintptr, err error, errno int) { + if err == nil { + // Fill in the output after successful execution. 
+ i.post(t, args, rval, output, EventMaximumSize) + } + + exit := &pb.StraceExit{ + Return: fmt.Sprintf("%#x", rval), + ElapsedNs: elapsed.Nanoseconds(), + } + if err != nil { + exit.Error = err.Error() + exit.ErrNo = int64(errno) + } + event := pb.Strace{ + Process: t.Name(), + Function: i.name, + Info: &pb.Strace_Exit{Exit: exit}, + } + for _, arg := range output { + event.Args = append(event.Args, arg) + } + eventchannel.Emit(&event) +} + +type syscallContext struct { + info SyscallInfo + args arch.SyscallArguments + start time.Time + logOutput []string + eventOutput []string + flags uint32 +} + +// SyscallEnter implements kernel.Stracer.SyscallEnter. It logs the syscall +// entry trace. +func (s SyscallMap) SyscallEnter(t *kernel.Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} { + info, ok := s[sysno] + if !ok { + info = SyscallInfo{ + name: fmt.Sprintf("sys_%d", sysno), + format: defaultFormat, + } + } + + var output, eventOutput []string + if bits.IsOn32(flags, kernel.StraceEnableLog) { + output = info.printEnter(t, args) + } + if bits.IsOn32(flags, kernel.StraceEnableEvent) { + eventOutput = info.sendEnter(t, args) + } + + return &syscallContext{ + info: info, + args: args, + start: time.Now(), + logOutput: output, + eventOutput: eventOutput, + flags: flags, + } +} + +// SyscallExit implements kernel.Stracer.SyscallExit. It logs the syscall +// exit trace. +func (s SyscallMap) SyscallExit(context interface{}, t *kernel.Task, sysno, rval uintptr, err error) { + errno := t.ExtractErrno(err, int(sysno)) + c := context.(*syscallContext) + + elapsed := time.Since(c.start) + if bits.IsOn32(c.flags, kernel.StraceEnableLog) { + c.info.printExit(t, elapsed, c.logOutput, c.args, rval, err, errno) + } + if bits.IsOn32(c.flags, kernel.StraceEnableEvent) { + c.info.sendExit(t, elapsed, c.eventOutput, c.args, rval, err, errno) + } +} + +// ConvertToSysnoMap converts the names to a map keyed on the syscall number and value set to true. 
+// The map is in a convenient format to call SyscallFlagsTable.Enable(). +func (s SyscallMap) ConvertToSysnoMap(syscalls []string) (map[uintptr]bool, error) { + if syscalls == nil { + // Sentinel: no list. + return nil, nil + } + + l := make(map[uintptr]bool) + for _, sc := range syscalls { + // Try to match this system call. + sysno, ok := s.ConvertToSysno(sc) + if !ok { + return nil, fmt.Errorf("syscall %q not found", sc) + } + l[sysno] = true + } + + // Success. + return l, nil +} + +// ConvertToSysno converts the name to system call number. Returns false +// if syscall with same name is not found. +func (s SyscallMap) ConvertToSysno(syscall string) (uintptr, bool) { + for sysno, info := range s { + if info.name != "" && info.name == syscall { + return sysno, true + } + } + return 0, false +} + +// Name returns the syscall name. +func (s SyscallMap) Name(sysno uintptr) string { + if info, ok := s[sysno]; ok { + return info.name + } + return fmt.Sprintf("sys_%d", sysno) +} + +// Initialize prepares all syscall tables for use by this package. +// +// N.B. This is not in an init function because we can't be sure all syscall +// tables are registered with the kernel when init runs. +// +// TODO: remove kernel package dependencies from this package and +// have the kernel package self-initialize all syscall tables. +func Initialize() { + for _, table := range kernel.SyscallTables() { + // Is this known? + sys, ok := Lookup(table.OS, table.Arch) + if !ok { + continue + } + + table.Stracer = sys + } +} + +// SinkType defines where to send straces to. 
+type SinkType uint32 + +const ( + // SinkTypeLog sends straces to text log + SinkTypeLog SinkType = 1 << iota + + // SinkTypeEvent sends strace to event log + SinkTypeEvent +) + +func convertToSyscallFlag(sinks SinkType) uint32 { + ret := uint32(0) + if bits.IsOn32(uint32(sinks), uint32(SinkTypeLog)) { + ret |= kernel.StraceEnableLog + } + if bits.IsOn32(uint32(sinks), uint32(SinkTypeEvent)) { + ret |= kernel.StraceEnableEvent + } + return ret +} + +// Enable enables the syscalls in whitelist in all syscall tables. +// +// Preconditions: Initialize has been called. +func Enable(whitelist []string, sinks SinkType) error { + flags := convertToSyscallFlag(sinks) + for _, table := range kernel.SyscallTables() { + // Is this known? + sys, ok := Lookup(table.OS, table.Arch) + if !ok { + continue + } + + // Convert to a set of system calls numbers. + wl, err := sys.ConvertToSysnoMap(whitelist) + if err != nil { + return err + } + + table.FeatureEnable.Enable(flags, wl, true) + } + + // Done. + return nil +} + +// Disable will disable Strace for all system calls and missing syscalls. +// +// Preconditions: Initialize has been called. +func Disable(sinks SinkType) { + flags := convertToSyscallFlag(sinks) + for _, table := range kernel.SyscallTables() { + // Strace will be disabled for all syscalls including missing. + table.FeatureEnable.Enable(flags, nil, false) + } +} + +// EnableAll enables all syscalls in all syscall tables. +// +// Preconditions: Initialize has been called. +func EnableAll(sinks SinkType) { + flags := convertToSyscallFlag(sinks) + for _, table := range kernel.SyscallTables() { + // Is this known? + if _, ok := Lookup(table.OS, table.Arch); !ok { + continue + } + + table.FeatureEnable.EnableAll(flags) + } +} diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto new file mode 100644 index 000000000..914e8c7b0 --- /dev/null +++ b/pkg/sentry/strace/strace.proto @@ -0,0 +1,50 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +message Strace { + // Process name that made the syscall. + string process = 1; + + // Syscall function name. + string function = 2; + + // List of syscall arguments formatted as strings. + repeated string args = 3; + + oneof info { + StraceEnter enter = 4; + StraceExit exit = 5; + } +} + +message StraceEnter { +} + +message StraceExit { + // Return value formatted as string. + string return = 1; + + // Formatted error string in case syscall failed. + string error = 2; + + // Value of errno upon syscall exit. + int64 err_no = 3; // errno is a macro and gets expanded :-( + + // Time elapsed between syscall enter and exit. + int64 elapsed_ns = 4; +} diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go new file mode 100644 index 000000000..d0e661706 --- /dev/null +++ b/pkg/sentry/strace/syscalls.go @@ -0,0 +1,217 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// FormatSpecifier values describe how an individual syscall argument should be +// formatted. +type FormatSpecifier int + +// Valid FormatSpecifiers. +// +// Unless otherwise specified, values are formatted before syscall execution +// and not updated after syscall execution (the same value is output). +const ( + // Hex is just a hexadecimal number. + Hex FormatSpecifier = iota + + // Oct is just an octal number. + Oct + + // ReadBuffer is a buffer for a read-style call. The syscall return + // value is used for the length. + // + // Formatted after syscall execution. + ReadBuffer + + // WriteBuffer is a buffer for a write-style call. The following arg is + // used for the length. + // + // Contents omitted after syscall execution. + WriteBuffer + + // ReadIOVec is a pointer to a struct iovec for a writev-style call. + // The following arg is used for the length. The return value is used + // for the total length. + // + // Complete contents only formatted after syscall execution. + ReadIOVec + + // WriteIOVec is a pointer to a struct iovec for a writev-style call. + // The following arg is used for the length. + // + // Complete contents only formatted before syscall execution, omitted + // after. + WriteIOVec + + // IOVec is a generic pointer to a struct iovec. Contents are not dumped. + IOVec + + // SendMsgHdr is a pointer to a struct msghdr for a sendmsg-style call. + // Contents formatted only before syscall execution, omitted after. + SendMsgHdr + + // RecvMsgHdr is a pointer to a struct msghdr for a recvmsg-style call. + // Contents formatted only after syscall execution. + RecvMsgHdr + + // Path is a pointer to a char* path. 
+ Path + + // ExecveStringVector is a NULL-terminated array of strings. Enforces + // the maximum execve array length. + ExecveStringVector + + // PipeFDs is an array of two FDs, formatted after syscall execution. + PipeFDs + + // Uname is a pointer to a struct uname, formatted after syscall execution. + Uname + + // Stat is a pointer to a struct stat, formatted after syscall execution. + Stat + + // SockAddr is a pointer to a struct sockaddr. The following arg is + // used for length. + SockAddr + + // PostSockAddr is a pointer to a struct sockaddr, formatted after + // syscall execution. The following arg is a pointer to the socklen_t + // length. + PostSockAddr + + // SockLen is a pointer to a socklen_t, formatted before and after + // syscall execution. + SockLen + + // SockFamily is a socket protocol family value. + SockFamily + + // SockType is a socket type and flags value. + SockType + + // SockProtocol is a socket protocol value. Argument n-2 is the socket + // protocol family. + SockProtocol + + // SockFlags are socket flags. + SockFlags + + // Timespec is a pointer to a struct timespec. + Timespec + + // PostTimespec is a pointer to a struct timespec, formatted after + // syscall execution. + PostTimespec + + // UTimeTimespec is a pointer to a struct timespec. Formatting includes + // UTIME_NOW and UTIME_OMIT. + UTimeTimespec + + // ItimerVal is a pointer to a struct itimerval. + ItimerVal + + // PostItimerVal is a pointer to a struct itimerval, formatted after + // syscall execution. + PostItimerVal + + // Timeval is a pointer to a struct timeval, formatted before and after + // syscall execution. + Timeval + + // Utimbuf is a pointer to a struct utimbuf. + Utimbuf + + // CloneFlags are clone(2) flags. + CloneFlags + + // OpenFlags are open(2) flags. + OpenFlags + + // Mode is a mode_t. + Mode + + // FutexOp is the futex(2) operation. + FutexOp + + // PtraceRequest is the ptrace(2) request. 
+ PtraceRequest + + // Rusage is a struct rusage, formatted after syscall execution. + Rusage +) + +// defaultFormat is the syscall argument format to use if the actual format is +// not known. It formats all six arguments as hex. +var defaultFormat = []FormatSpecifier{Hex, Hex, Hex, Hex, Hex, Hex} + +// SyscallInfo captures the name and printing format of a syscall. +type SyscallInfo struct { + // name is the name of the syscall. + name string + + // format contains the format specifiers for each argument. + // + // Syscall calls can have up to six arguments. Arguments without a + // corresponding entry in format will not be printed. + format []FormatSpecifier +} + +// makeSyscallInfo returns a SyscallInfo for a syscall. +func makeSyscallInfo(name string, f ...FormatSpecifier) SyscallInfo { + return SyscallInfo{name: name, format: f} +} + +// SyscallMap maps syscalls into names and printing formats. +type SyscallMap map[uintptr]SyscallInfo + +var _ kernel.Stracer = (SyscallMap)(nil) + +// syscallTable contains the syscalls for a specific OS/Arch. +type syscallTable struct { + // os is the operating system this table targets. + os abi.OS + + // arch is the architecture this table targets. + arch arch.Arch + + // syscalls contains the syscall mappings. + syscalls SyscallMap +} + +// syscallTables contains all syscall tables. +var syscallTables = []syscallTable{ + { + os: abi.Linux, + arch: arch.AMD64, + syscalls: linuxAMD64, + }, +} + +// Lookup returns the SyscallMap for the OS/Arch combination. The returned map +// must not be changed. 
+func Lookup(os abi.OS, a arch.Arch) (SyscallMap, bool) { + for _, s := range syscallTables { + if s.os == os && s.arch == a { + return s.syscalls, true + } + } + return nil, false +} diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD new file mode 100644 index 000000000..d667b42c8 --- /dev/null +++ b/pkg/sentry/syscalls/BUILD @@ -0,0 +1,43 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "syscalls", + srcs = [ + "epoll.go", + "polling.go", + "syscalls.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls", + visibility = ["//:sandbox"], + deps = [ + ":unimplemented_syscall_go_proto", + "//pkg/abi/linux", + "//pkg/eventchannel", + "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/syserror", + "//pkg/waiter", + ], +) + +proto_library( + name = "unimplemented_syscall_proto", + srcs = ["unimplemented_syscall.proto"], + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_proto"], +) + +go_proto_library( + name = "unimplemented_syscall_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto", + proto = ":unimplemented_syscall_proto", + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_go_proto"], +) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go new file mode 100644 index 000000000..01dd6fa71 --- /dev/null +++ b/pkg/sentry/syscalls/epoll.go @@ -0,0 +1,174 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package syscalls + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// CreateEpoll implements the epoll_create(2) linux syscall. +func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) { + file := epoll.NewEventPoll(t) + defer file.DecRef() + + flags := kernel.FDFlags{ + CloseOnExec: closeOnExec, + } + fd, err := t.FDMap().NewFDFrom(0, file, flags, t.ThreadGroup().Limits()) + if err != nil { + return 0, err + } + + return fd, nil +} + +// AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD. +func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { + // Get epoll from the file descriptor. + epollfile := t.FDMap().GetFile(epfd) + if epollfile == nil { + return syscall.EBADF + } + defer epollfile.DecRef() + + // Get the target file id. + file := t.FDMap().GetFile(fd) + if file == nil { + return syscall.EBADF + } + defer file.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return syscall.EBADF + } + + // Try to add the entry. + return e.AddEntry(epoll.FileIdentifier{file, fd}, flags, mask, userData) +} + +// UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD. 
+func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error {
+	// Get the epoll file from the epoll file descriptor.
+	epollfile := t.FDMap().GetFile(epfd)
+	if epollfile == nil {
+		return syscall.EBADF
+	}
+	defer epollfile.DecRef()
+
+	// Get the target file being watched.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the EventPoll operations; fail if epfd is not an epoll file.
+	e, ok := epollfile.FileOperations.(*epoll.EventPoll)
+	if !ok {
+		return syscall.EBADF
+	}
+
+	// Try to update the entry.
+	return e.UpdateEntry(epoll.FileIdentifier{file, fd}, flags, mask, userData)
+}
+
+// RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL.
+func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error {
+	// Get the epoll file from the epoll file descriptor.
+	epollfile := t.FDMap().GetFile(epfd)
+	if epollfile == nil {
+		return syscall.EBADF
+	}
+	defer epollfile.DecRef()
+
+	// Get the target file being watched.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the EventPoll operations; fail if epfd is not an epoll file.
+	e, ok := epollfile.FileOperations.(*epoll.EventPoll)
+	if !ok {
+		return syscall.EBADF
+	}
+
+	// Try to remove the entry.
+	return e.RemoveEntry(epoll.FileIdentifier{file, fd})
+}
+
+// WaitEpoll implements the epoll_wait(2) linux syscall.
+func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event, error) {
+	// Get the epoll file from the epoll file descriptor.
+	epollfile := t.FDMap().GetFile(fd)
+	if epollfile == nil {
+		return nil, syscall.EBADF
+	}
+	defer epollfile.DecRef()
+
+	// Extract the EventPoll operations; fail if fd is not an epoll file.
+	e, ok := epollfile.FileOperations.(*epoll.EventPoll)
+	if !ok {
+		return nil, syscall.EBADF
+	}
+
+	// Try to read events and return right away if we got them or if the
+	// caller requested a non-blocking "wait".
+	r := e.ReadEvents(max)
+	if len(r) != 0 || timeout == 0 {
+		return r, nil
+	}
+
+	// We'll have to wait. Set up the timer if a timeout was specified,
+	// and register with the epoll object for readability events.
+	var haveDeadline bool
+	var deadline ktime.Time
+	if timeout > 0 {
+		timeoutDur := time.Duration(timeout) * time.Millisecond
+		deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
+		haveDeadline = true
+	}
+
+	w, ch := waiter.NewChannelEntry(nil)
+	e.EventRegister(&w, waiter.EventIn)
+	defer e.EventUnregister(&w)
+
+	// Try to read the events again until we succeed, timeout or get
+	// interrupted.
+	for {
+		r = e.ReadEvents(max)
+		if len(r) != 0 {
+			return r, nil
+		}
+
+		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
+			if err == syscall.ETIMEDOUT {
+				return nil, nil
+			}
+
+			return nil, err
+		}
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
new file mode 100644
index 000000000..bc67ebf30
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -0,0 +1,103 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_stateify(
+    name = "linux_state",
+    srcs = [
+        "sys_aio.go",
+        "sys_futex.go",
+        "sys_poll.go",
+        "sys_time.go",
+    ],
+    out = "linux_state.go",
+    package = "linux",
+)
+
+go_library(
+    name = "linux",
+    srcs = [
+        "error.go",
+        "flags.go",
+        "linux64.go",
+        "linux_state.go",
+        "sigset.go",
+        "sys_aio.go",
+        "sys_capability.go",
+        "sys_epoll.go",
+        "sys_eventfd.go",
+        "sys_file.go",
+        "sys_futex.go",
+        "sys_getdents.go",
+        "sys_identity.go",
+        "sys_inotify.go",
+        "sys_lseek.go",
+        "sys_mmap.go",
+        "sys_mount.go",
+        "sys_pipe.go",
+        "sys_poll.go",
+        "sys_prctl.go",
+        "sys_random.go",
+        "sys_read.go",
+        "sys_rlimit.go",
+        "sys_rusage.go",
+        "sys_sched.go",
+        "sys_sem.go",
+        "sys_signal.go",
+        "sys_socket.go",
+        "sys_stat.go",
+        "sys_sync.go",
+        "sys_sysinfo.go",
"sys_syslog.go", + "sys_thread.go", + "sys_time.go", + "sys_timer.go", + "sys_timerfd.go", + "sys_tls.go", + "sys_utsname.go", + "sys_write.go", + "timespec.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux", + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/bpf", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/metric", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fs/timerfd", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/eventfd", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/safemem", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/syscalls", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go new file mode 100644 index 000000000..013b385bc --- /dev/null +++ b/pkg/sentry/syscalls/linux/error.go @@ -0,0 +1,117 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "io" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +var ( + partialResultMetric = metric.MustCreateNewUint64Metric("/syscalls/partial_result", true /* sync */, "Whether or not a partial result has occurred for this sandbox.") + partialResultOnce sync.Once +) + +// handleIOError handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// op and f are used only for panics. +func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error { + switch err { + case nil: + // Typical successful syscall. + return nil + case io.EOF: + // EOF is always consumed. If this is a partial read/write + // (result != 0), the application will see that, otherwise + // they will see 0. + return nil + case syserror.ErrExceedsFileSizeLimit: + // Ignore partialResult because this error only applies to + // normal files, and for those files we cannot accumulate + // write results. + // + // Do not consume the error and return it as EFBIG. + // Simultaneously send a SIGXFSZ per setrlimit(2). + t.SendSignal(&arch.SignalInfo{ + Signo: int32(syscall.SIGXFSZ), + Code: arch.SignalInfoKernel, + }) + return syscall.EFBIG + case syserror.ErrInterrupted: + // The syscall was interrupted. Return nil if it completed + // partially, otherwise return the error code that the syscall + // needs (to indicate to the kernel what it should do). + if partialResult { + return nil + } + return intr + } + + if !partialResult { + // Typical syscall error. 
+ return err + } + + switch err { + case syserror.EINTR: + // Syscall interrupted, but completed a partial + // read/write. Like ErrWouldBlock, since we have a + // partial read/write, we consume the error and return + // the partial result. + return nil + case syserror.EFAULT: + // EFAULT is only shown the user if nothing was + // read/written. If we read something (this case), they see + // a partial read/write. They will then presumably try again + // with an incremented buffer, which will EFAULT with + // result == 0. + return nil + case syserror.EPIPE: + // Writes to a pipe or socket will return EPIPE if the other + // side is gone. The partial write is returned. EPIPE will be + // returned on the next call. + // + // TODO: In some cases SIGPIPE should also be sent + // to the application. + return nil + case syserror.ErrWouldBlock: + // Syscall would block, but completed a partial read/write. + // This case should only be returned by IssueIO for nonblocking + // files. Since we have a partial read/write, we consume + // ErrWouldBlock, returning the partial result. + return nil + } + + switch err.(type) { + case kernel.SyscallRestartErrno: + // Identical to the EINTR case. + return nil + } + + // An unknown error is encountered with a partial read/write. + name, _ := f.Dirent.FullName(nil /* ignore chroot */) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) + partialResultOnce.Do(partialResultMetric.Increment) + return nil +} diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go new file mode 100644 index 000000000..82bfd7c2a --- /dev/null +++ b/pkg/sentry/syscalls/linux/flags.go @@ -0,0 +1,95 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// flagsToPermissions returns a Permissions object from Linux flags. +// This includes truncate permission if O_TRUNC is set in the mask. +func flagsToPermissions(mask uint) (p fs.PermMask) { + switch mask & syscall.O_ACCMODE { + case syscall.O_WRONLY: + p.Write = true + case syscall.O_RDWR: + p.Write = true + p.Read = true + case syscall.O_RDONLY: + p.Read = true + } + return +} + +// fdFlagsToLinux converts a kernel.FDFlags object to a Linux representation. +func fdFlagsToLinux(flags kernel.FDFlags) (mask uint) { + if flags.CloseOnExec { + mask |= syscall.FD_CLOEXEC + } + return +} + +// flagsToLinux converts a FileFlags object to a Linux representation. +func flagsToLinux(flags fs.FileFlags) (mask uint) { + if flags.Direct { + mask |= syscall.O_DIRECT + } + if flags.NonBlocking { + mask |= syscall.O_NONBLOCK + } + if flags.Sync { + mask |= syscall.O_SYNC + } + if flags.Append { + mask |= syscall.O_APPEND + } + if flags.Directory { + mask |= syscall.O_DIRECTORY + } + switch { + case flags.Read && flags.Write: + mask |= syscall.O_RDWR + case flags.Write: + mask |= syscall.O_WRONLY + case flags.Read: + mask |= syscall.O_RDONLY + } + return +} + +// linuxToFlags converts linux file flags to a FileFlags object. 
+func linuxToFlags(mask uint) (flags fs.FileFlags) { + return fs.FileFlags{ + Direct: mask&syscall.O_DIRECT != 0, + Sync: mask&syscall.O_SYNC != 0, + NonBlocking: mask&syscall.O_NONBLOCK != 0, + Read: (mask & syscall.O_ACCMODE) != syscall.O_WRONLY, + Write: (mask & syscall.O_ACCMODE) != syscall.O_RDONLY, + Append: mask&syscall.O_APPEND != 0, + Directory: mask&syscall.O_DIRECTORY != 0, + } +} + +// linuxToSettableFlags converts linux file flags to a SettableFileFlags object. +func linuxToSettableFlags(mask uint) fs.SettableFileFlags { + return fs.SettableFileFlags{ + Direct: mask&syscall.O_DIRECT != 0, + NonBlocking: mask&syscall.O_NONBLOCK != 0, + Append: mask&syscall.O_APPEND != 0, + } +} diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go new file mode 100644 index 000000000..44db2d582 --- /dev/null +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -0,0 +1,376 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package linux provides syscall tables for amd64 Linux. +// +// NOTE: Linux i386 support has been removed. 
+package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// AUDIT_ARCH_X86_64 identifies the Linux syscall API on AMD64, and is taken +// from . +const _AUDIT_ARCH_X86_64 = 0xc000003e + +// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall +// numbers from Linux 3.11. The entries commented out are those syscalls we +// don't currently support. +var AMD64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Version: kernel.Version{ + Sysname: "Linux", + Release: "3.11.10", + Version: "#1 SMP Fri Nov 29 10:47:50 PST 2013", + }, + AuditNumber: _AUDIT_ARCH_X86_64, + Table: map[uintptr]kernel.SyscallFn{ + 0: Read, + 1: Write, + 2: Open, + 3: Close, + 4: Stat, + 5: Fstat, + 6: Lstat, + 7: Poll, + 8: Lseek, + 9: Mmap, + 10: Mprotect, + 11: Munmap, + 12: Brk, + 13: RtSigaction, + 14: RtSigprocmask, + 15: RtSigreturn, + 16: Ioctl, + 17: Pread64, + 18: Pwrite64, + 19: Readv, + 20: Writev, + 21: Access, + 22: Pipe, + 23: Select, + 24: SchedYield, + 25: Mremap, + 26: Msync, + 27: Mincore, + 28: Madvise, + // 29: Shmget, TODO + // 30: Shmat, TODO + // 31: Shmctl, TODO + 32: Dup, + 33: Dup2, + 34: Pause, + 35: Nanosleep, + 36: Getitimer, + 37: Alarm, + 38: Setitimer, + 39: Getpid, + 40: Sendfile, + 41: Socket, + 42: Connect, + 43: Accept, + 44: SendTo, + 45: RecvFrom, + 46: SendMsg, + 47: RecvMsg, + 48: Shutdown, + 49: Bind, + 50: Listen, + 51: GetSockName, + 52: GetPeerName, + 53: SocketPair, + 54: SetSockOpt, + 55: GetSockOpt, + 56: Clone, + 57: Fork, + 58: Vfork, + 59: Execve, + 60: Exit, + 61: Wait4, + 62: Kill, + 63: Uname, + 64: Semget, + 65: Semop, + 66: Semctl, + // 67: Shmdt, TODO + // 68: Msgget, 
TODO + // 69: Msgsnd, TODO + // 70: Msgrcv, TODO + // 71: Msgctl, TODO + 72: Fcntl, + 73: Flock, + 74: Fsync, + 75: Fdatasync, + 76: Truncate, + 77: Ftruncate, + 78: Getdents, + 79: Getcwd, + 80: Chdir, + 81: Fchdir, + 82: Rename, + 83: Mkdir, + 84: Rmdir, + 85: Creat, + 86: Link, + 87: Unlink, + 88: Symlink, + 89: Readlink, + 90: Chmod, + 91: Fchmod, + 92: Chown, + 93: Fchown, + 94: Lchown, + 95: Umask, + 96: Gettimeofday, + 97: Getrlimit, + 98: Getrusage, + 99: Sysinfo, + 100: Times, + 101: Ptrace, + 102: Getuid, + 103: Syslog, + 104: Getgid, + 105: Setuid, + 106: Setgid, + 107: Geteuid, + 108: Getegid, + 109: Setpgid, + 110: Getppid, + 111: Getpgrp, + 112: Setsid, + 113: Setreuid, + 114: Setregid, + 115: Getgroups, + 116: Setgroups, + 117: Setresuid, + 118: Getresuid, + 119: Setresgid, + 120: Getresgid, + 121: Getpgid, + // 122: Setfsuid, TODO + // 123: Setfsgid, TODO + 124: Getsid, + 125: Capget, + 126: Capset, + 127: RtSigpending, + 128: RtSigtimedwait, + 129: RtSigqueueinfo, + 130: RtSigsuspend, + 131: Sigaltstack, + 132: Utime, + 133: Mknod, + 134: syscalls.Error(syscall.ENOSYS), // Uselib, obsolete + 135: syscalls.ErrorWithEvent(syscall.EINVAL), // SetPersonality, unable to change personality + 136: syscalls.ErrorWithEvent(syscall.ENOSYS), // Ustat, needs filesystem support + 137: Statfs, + 138: Fstatfs, + // 139: Sysfs, TODO + 140: Getpriority, + 141: Setpriority, + 142: syscalls.CapError(linux.CAP_SYS_NICE), // SchedSetparam, requires cap_sys_nice + 143: SchedGetparam, + 144: SchedSetscheduler, + 145: SchedGetscheduler, + 146: SchedGetPriorityMax, + 147: SchedGetPriorityMin, + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: syscalls.Error(nil), // Mlock, TODO + 150: syscalls.Error(nil), // Munlock, TODO + 151: syscalls.Error(nil), // Mlockall, TODO + 152: syscalls.Error(nil), // Munlockall, TODO + 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, + 154: syscalls.Error(syscall.EPERM), // ModifyLdt, + 155: 
syscalls.Error(syscall.EPERM), // PivotRoot, + 156: syscalls.Error(syscall.EPERM), // Sysctl, syscall is "worthless" + 157: Prctl, + 158: ArchPrctl, + 159: syscalls.CapError(linux.CAP_SYS_TIME), // Adjtimex, requires cap_sys_time + 160: Setrlimit, + 161: Chroot, + 162: Sync, + 163: syscalls.CapError(linux.CAP_SYS_PACCT), // Acct, requires cap_sys_pacct + 164: syscalls.CapError(linux.CAP_SYS_TIME), // Settimeofday, requires cap_sys_time + 165: Mount, + 166: Umount2, + 167: syscalls.CapError(linux.CAP_SYS_ADMIN), // Swapon, requires cap_sys_admin + 168: syscalls.CapError(linux.CAP_SYS_ADMIN), // Swapoff, requires cap_sys_admin + 169: syscalls.CapError(linux.CAP_SYS_BOOT), // Reboot, requires cap_sys_boot + 170: Sethostname, + 171: Setdomainname, + 172: syscalls.CapError(linux.CAP_SYS_RAWIO), // Iopl, requires cap_sys_rawio + 173: syscalls.CapError(linux.CAP_SYS_RAWIO), // Ioperm, requires cap_sys_rawio + 174: syscalls.CapError(linux.CAP_SYS_MODULE), // CreateModule, requires cap_sys_module + 175: syscalls.CapError(linux.CAP_SYS_MODULE), // InitModule, requires cap_sys_module + 176: syscalls.CapError(linux.CAP_SYS_MODULE), // DeleteModule, requires cap_sys_module + 177: syscalls.Error(syscall.ENOSYS), // GetKernelSyms, not supported in > 2.6 + 178: syscalls.Error(syscall.ENOSYS), // QueryModule, not supported in > 2.6 + 179: syscalls.CapError(linux.CAP_SYS_ADMIN), // Quotactl, requires cap_sys_admin (most operations) + 180: syscalls.Error(syscall.ENOSYS), // Nfsservctl, does not exist > 3.1 + 181: syscalls.Error(syscall.ENOSYS), // Getpmsg, not implemented in Linux + 182: syscalls.Error(syscall.ENOSYS), // Putpmsg, not implemented in Linux + 183: syscalls.Error(syscall.ENOSYS), // AfsSyscall, not implemented in Linux + 184: syscalls.Error(syscall.ENOSYS), // Tuxcall, not implemented in Linux + 185: syscalls.Error(syscall.ENOSYS), // Security, not implemented in Linux + 186: Gettid, + 187: nil, // Readahead, TODO + 188: syscalls.ErrorWithEvent(syscall.ENOTSUP), // 
Setxattr, requires filesystem support + 189: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lsetxattr, requires filesystem support + 190: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fsetxattr, requires filesystem support + 191: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Getxattr, requires filesystem support + 192: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lgetxattr, requires filesystem support + 193: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fgetxattr, requires filesystem support + 194: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Listxattr, requires filesystem support + 195: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Llistxattr, requires filesystem support + 196: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Flistxattr, requires filesystem support + 197: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Removexattr, requires filesystem support + 198: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lremovexattr, requires filesystem support + 199: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fremovexattr, requires filesystem support + 200: Tkill, + 201: Time, + 202: Futex, + 203: SchedSetaffinity, + 204: SchedGetaffinity, + 205: syscalls.Error(syscall.ENOSYS), // SetThreadArea, expected to return ENOSYS on 64-bit + 206: IoSetup, + 207: IoDestroy, + 208: IoGetevents, + 209: IoSubmit, + 210: IoCancel, + 211: syscalls.Error(syscall.ENOSYS), // GetThreadArea, expected to return ENOSYS on 64-bit + 212: syscalls.CapError(linux.CAP_SYS_ADMIN), // LookupDcookie, requires cap_sys_admin + 213: EpollCreate, + 214: syscalls.ErrorWithEvent(syscall.ENOSYS), // EpollCtlOld, deprecated (afaik, unused) + 215: syscalls.ErrorWithEvent(syscall.ENOSYS), // EpollWaitOld, deprecated (afaik, unused) + 216: syscalls.ErrorWithEvent(syscall.ENOSYS), // RemapFilePages, deprecated since 3.16 + 217: Getdents64, + 218: SetTidAddress, + 219: RestartSyscall, + // 220: Semtimedop, TODO + 221: Fadvise64, + // 222: TimerCreate, TODO + // 223: TimerSettime, TODO + // 224: TimerGettime, TODO + // 225: 
TimerGetoverrun, TODO + // 226: TimerDelete, TODO + 227: ClockSettime, + 228: ClockGettime, + 229: ClockGetres, + 230: ClockNanosleep, + 231: ExitGroup, + 232: EpollWait, + 233: EpollCtl, + 234: Tgkill, + 235: Utimes, + 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux + 237: syscalls.CapError(linux.CAP_SYS_NICE), // Mbind, may require cap_sys_nice TODO + 238: SetMempolicy, + 239: GetMempolicy, + // 240: MqOpen, TODO + // 241: MqUnlink, TODO + // 242: MqTimedsend, TODO + // 243: MqTimedreceive, TODO + // 244: MqNotify, TODO + // 245: MqGetsetattr, TODO + 246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot + 247: Waitid, + 248: syscalls.Error(syscall.EACCES), // AddKey, not available to user + 249: syscalls.Error(syscall.EACCES), // RequestKey, not available to user + 250: syscalls.Error(syscall.EACCES), // Keyctl, not available to user + 251: syscalls.CapError(linux.CAP_SYS_ADMIN), // IoprioSet, requires cap_sys_nice or cap_sys_admin (depending) + 252: syscalls.CapError(linux.CAP_SYS_ADMIN), // IoprioGet, requires cap_sys_nice or cap_sys_admin (depending) + 253: InotifyInit, + 254: InotifyAddWatch, + 255: InotifyRmWatch, + 256: syscalls.CapError(linux.CAP_SYS_NICE), // MigratePages, requires cap_sys_nice + 257: Openat, + 258: Mkdirat, + 259: Mknodat, + 260: Fchownat, + 261: Futimesat, + 262: Fstatat, + 263: Unlinkat, + 264: Renameat, + 265: Linkat, + 266: Symlinkat, + 267: Readlinkat, + 268: Fchmodat, + 269: Faccessat, + 270: Pselect, + 271: Ppoll, + 272: Unshare, + 273: syscalls.Error(syscall.ENOSYS), // SetRobustList, obsolete + 274: syscalls.Error(syscall.ENOSYS), // GetRobustList, obsolete + // 275: Splice, TODO + // 276: Tee, TODO + // 277: SyncFileRange, TODO + // 278: Vmsplice, TODO + 279: syscalls.CapError(linux.CAP_SYS_NICE), // MovePages, requires cap_sys_nice (mostly) + 280: Utimensat, + 281: EpollPwait, + // 282: Signalfd, TODO + 283: TimerfdCreate, + 284: Eventfd, + 285: Fallocate, + 286: 
TimerfdSettime, + 287: TimerfdGettime, + 288: Accept4, + // 289: Signalfd4, TODO + 290: Eventfd2, + 291: EpollCreate1, + 292: Dup3, + 293: Pipe2, + 294: InotifyInit1, + 295: Preadv, + 296: Pwritev, + 297: RtTgsigqueueinfo, + 298: syscalls.ErrorWithEvent(syscall.ENODEV), // PerfEventOpen, no support for perf counters + 299: RecvMMsg, + 300: syscalls.ErrorWithEvent(syscall.ENOSYS), // FanotifyInit, needs CONFIG_FANOTIFY + 301: syscalls.ErrorWithEvent(syscall.ENOSYS), // FanotifyMark, needs CONFIG_FANOTIFY + 302: Prlimit64, + 303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), // NameToHandleAt, needs filesystem support + 304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), // OpenByHandleAt, needs filesystem support + 305: syscalls.CapError(linux.CAP_SYS_TIME), // ClockAdjtime, requires cap_sys_time + 306: Syncfs, + 307: SendMMsg, + // 308: Setns, TODO + 309: Getcpu, + // 310: ProcessVmReadv, TODO may require cap_sys_ptrace + // 311: ProcessVmWritev, TODO may require cap_sys_ptrace + 312: syscalls.CapError(linux.CAP_SYS_PTRACE), // Kcmp, requires cap_sys_ptrace + 313: syscalls.CapError(linux.CAP_SYS_MODULE), // FinitModule, requires cap_sys_module + // "Backports." + 318: GetRandom, + }, + + Emulate: map[usermem.Addr]uintptr{ + 0xffffffffff600000: 96, // vsyscall gettimeofday(2) + 0xffffffffff600400: 201, // vsyscall time(2) + 0xffffffffff600800: 309, // vsyscall getcpu(2) + }, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + syscalls.UnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go new file mode 100644 index 000000000..bfb541634 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and +// STOP are clear. +func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { + if size != linux.SignalSetSize { + return 0, syscall.EINVAL + } + b := t.CopyScratchBuffer(8) + if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { + return 0, err + } + mask := usermem.ByteOrder.Uint64(b[:]) + return linux.SignalSet(mask) &^ kernel.UnblockableSignals, nil +} + +// copyOutSigSet copies out a sigset_t. +func copyOutSigSet(t *kernel.Task, sigSetAddr usermem.Addr, mask linux.SignalSet) error { + b := t.CopyScratchBuffer(8) + usermem.ByteOrder.PutUint64(b, uint64(mask)) + _, err := t.CopyOutBytes(sigSetAddr, b) + return err +} + +// copyInSigSetWithSize copies in a structure as below +// +// struct { +// sigset_t* sigset_addr; +// size_t sizeof_sigset; +// }; +// +// and returns sigset_addr and size. 
+func copyInSigSetWithSize(t *kernel.Task, addr usermem.Addr) (usermem.Addr, uint, error) { + switch t.Arch().Width() { + case 8: + in := t.CopyScratchBuffer(16) + if _, err := t.CopyInBytes(addr, in); err != nil { + return 0, 0, err + } + maskAddr := usermem.Addr(usermem.ByteOrder.Uint64(in[0:])) + maskSize := uint(usermem.ByteOrder.Uint64(in[8:])) + return maskAddr, maskSize, nil + default: + return 0, 0, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go new file mode 100644 index 000000000..80407a082 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -0,0 +1,402 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "encoding/binary" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// I/O commands. +const ( + _IOCB_CMD_PREAD = 0 + _IOCB_CMD_PWRITE = 1 + _IOCB_CMD_FSYNC = 2 + _IOCB_CMD_FDSYNC = 3 + _IOCB_CMD_NOOP = 6 + _IOCB_CMD_PREADV = 7 + _IOCB_CMD_PWRITEV = 8 +) + +// I/O flags. 
+const ( + _IOCB_FLAG_RESFD = 1 +) + +// ioCallback describes an I/O request. +// +// The priority field is currently ignored in the implementation below. Also +// note that the IOCB_FLAG_RESFD feature is not supported. +type ioCallback struct { + Data uint64 + Key uint32 + Reserved1 uint32 + + OpCode uint16 + ReqPrio int16 + FD uint32 + + Buf uint64 + Bytes uint64 + Offset int64 + + Reserved2 uint64 + Flags uint32 + + // eventfd to signal if IOCB_FLAG_RESFD is set in flags. + ResFD uint32 +} + +// ioEvent describes an I/O result. +type ioEvent struct { + Data uint64 + Obj uint64 + Result int64 + Result2 int64 +} + +// ioEventSize is the size of an ioEvent encoded. +var ioEventSize = binary.Size(ioEvent{}) + +// IoSetup implements linux syscall io_setup(2). +func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nrEvents := args[0].Int() + idAddr := args[1].Pointer() + + // Linux uses the native long as the aio ID. + // + // The context pointer _must_ be zero initially. + var idIn uint64 + if _, err := t.CopyIn(idAddr, &idIn); err != nil { + return 0, nil, err + } + if idIn != 0 { + return 0, nil, syserror.EINVAL + } + + id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents)) + if err != nil { + return 0, nil, err + } + + // Copy out the new ID. + if _, err := t.CopyOut(idAddr, &id); err != nil { + t.MemoryManager().DestroyAIOContext(t, id) + return 0, nil, err + } + + return 0, nil, nil +} + +// IoDestroy implements linux syscall io_destroy(2). +func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + + // Destroy the given context. + if !t.MemoryManager().DestroyAIOContext(t, id) { + // Does not exist. + return 0, nil, syserror.EINVAL + } + // FIXME: Linux blocks until all AIO to the destroyed context is + // done. + return 0, nil, nil +} + +// IoGetevents implements linux syscall io_getevents(2). 
+func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	id := args[0].Uint64()
+	minEvents := args[1].Int()
+	events := args[2].Int()
+	eventsAddr := args[3].Pointer()
+	timespecAddr := args[4].Pointer()
+
+	// Sanity check arguments.
+	if minEvents > events {
+		return 0, nil, syserror.EINVAL
+	}
+
+	ctx, ok := t.MemoryManager().LookupAIOContext(t, id)
+	if !ok {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Set up the timeout, if a timespec was provided.
+	var haveDeadline bool
+	var deadline ktime.Time
+	if timespecAddr != 0 {
+		d, err := copyTimespecIn(t, timespecAddr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if !d.Valid() {
+			return 0, nil, syserror.EINVAL
+		}
+		deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration())
+		haveDeadline = true
+	}
+
+	// Loop over all requests.
+	for count := int32(0); count < events; count++ {
+		// Get a request, per semantics.
+		var v interface{}
+		if count >= minEvents {
+			var ok bool
+			v, ok = ctx.PopRequest()
+			if !ok {
+				return uintptr(count), nil, nil
+			}
+		} else {
+			var err error
+			v, err = waitForRequest(ctx, t, haveDeadline, deadline)
+			if err != nil {
+				if count > 0 || err == syserror.ETIMEDOUT {
+					return uintptr(count), nil, nil
+				}
+				return 0, nil, syserror.ConvertIntr(err, syserror.EINTR)
+			}
+		}
+
+		ev := v.(*ioEvent)
+
+		// Copy out the result.
+		if _, err := t.CopyOut(eventsAddr, ev); err != nil {
+			if count > 0 {
+				return uintptr(count), nil, nil
+			}
+			// Nothing done.
+			return 0, nil, err
+		}
+
+		// Keep rolling.
+		eventsAddr += usermem.Addr(ioEventSize)
+	}
+
+	// Everything finished.
+	return uintptr(events), nil, nil
+}
+
+func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) {
+	for {
+		if v, ok := ctx.PopRequest(); ok {
+			// Request was readily available. Just return it.
+			return v, nil
+		}
+
+		// Need to wait for request completion.
+ done, active := ctx.WaitChannel() + if !active { + // Context has been destroyed. + return nil, syserror.EINVAL + } + if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil { + return nil, err + } + } +} + +// memoryFor returns appropriate memory for the given callback. +func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { + bytes := int(cb.Bytes) + if bytes < 0 { + // Linux also requires that this field fit in ssize_t. + return usermem.IOSequence{}, syserror.EINVAL + } + + // Since this I/O will be asynchronous with respect to t's task goroutine, + // we have no guarantee that t's AddressSpace will be active during the + // I/O. + switch cb.OpCode { + case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE: + return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV: + return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP: + return usermem.IOSequence{}, nil + + default: + // Not a supported command. + return usermem.IOSequence{}, syserror.EINVAL + } +} + +func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) { + ev := &ioEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } + + // Construct a context.Context that will not be interrupted if t is + // interrupted. + c := t.AsyncContext() + + var err error + switch cb.OpCode { + case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV: + ev.Result, err = file.Preadv(c, ioseq, cb.Offset) + case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: + ev.Result, err = file.Pwritev(c, ioseq, cb.Offset) + case _IOCB_CMD_FSYNC: + err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll) + case _IOCB_CMD_FDSYNC: + err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData) + } + + // Update the result. 
+ if err != nil { + err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) + ev.Result = -int64(t.ExtractErrno(err, 0)) + } + + file.DecRef() + + // Queue the result for delivery. + ctx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFile != nil { + eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) + eventFile.DecRef() + } +} + +// submitCallback processes a single callback. +func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { + file := t.FDMap().GetFile(kdefs.FD(cb.FD)) + if file == nil { + // File not found. + return syserror.EBADF + } + defer file.DecRef() + + // Was there an eventFD? Extract it. + var eventFile *fs.File + if cb.Flags&_IOCB_FLAG_RESFD != 0 { + eventFile := t.FDMap().GetFile(kdefs.FD(cb.ResFD)) + if eventFile == nil { + // Bad FD. + return syserror.EBADF + } + defer eventFile.DecRef() + + // Check that it is an eventfd. + if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { + // Not an event FD. + return syserror.EINVAL + } + } + + ioseq, err := memoryFor(t, cb) + if err != nil { + return err + } + + // Prepare the request. + ctx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return syserror.EINVAL + } + if ready := ctx.Prepare(); !ready { + // Context is busy. + return syserror.EAGAIN + } + + if eventFile != nil { + // The request is set. Make sure there's a ref on the file. + // + // This is necessary when the callback executes on completion, + // which is also what will release this reference. + eventFile.IncRef() + } + + // Perform the request asynchronously. + file.IncRef() + fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) }) + + // All set. + return nil +} + +// IoSubmit implements linux syscall io_submit(2). 
+func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + nrEvents := args[1].Int() + addr := args[2].Pointer() + + for i := int32(0); i < nrEvents; i++ { + // Copy in the address. + cbAddrNative := t.Arch().Native(0) + if _, err := t.CopyIn(addr, cbAddrNative); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Copy in this callback. + var cb ioCallback + cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) + if _, err := t.CopyIn(cbAddr, &cb); err != nil { + + if i > 0 { + // Some have been successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Process this callback. + if err := submitCallback(t, id, &cb, cbAddr); err != nil { + if i > 0 { + // Partial success. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Advance to the next one. + addr += usermem.Addr(t.Arch().Width()) + } + + return uintptr(nrEvents), nil, nil +} + +// IoCancel implements linux syscall io_cancel(2). +// +// It is not presently supported (ENOSYS indicates no support on this +// architecture). +func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go new file mode 100644 index 000000000..89c81ac90 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -0,0 +1,149 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) { + if tid < 0 { + err = syserror.EINVAL + return + } + if tid > 0 { + t = t.PIDNamespace().TaskWithID(tid) + } + if t == nil { + err = syserror.ESRCH + return + } + creds := t.Credentials() + permitted, inheritable, effective = creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps + return +} + +// Capget implements Linux syscall capget. +func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + hdrAddr := args[0].Pointer() + dataAddr := args[1].Pointer() + + var hdr linux.CapUserHeader + if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + // hdr.Pid doesn't need to be valid if this capget() is a "version probe" + // (hdr.Version is unrecognized and dataAddr is null), so we can't do the + // lookup yet. 
+ switch hdr.Version { + case linux.LINUX_CAPABILITY_VERSION_1: + if dataAddr == 0 { + return 0, nil, nil + } + p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid)) + if err != nil { + return 0, nil, err + } + data := linux.CapUserData{ + Effective: uint32(e), + Permitted: uint32(p), + Inheritable: uint32(i), + } + _, err = t.CopyOut(dataAddr, &data) + return 0, nil, err + + case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: + if dataAddr == 0 { + return 0, nil, nil + } + p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid)) + if err != nil { + return 0, nil, err + } + data := [2]linux.CapUserData{ + { + Effective: uint32(e), + Permitted: uint32(p), + Inheritable: uint32(i), + }, + { + Effective: uint32(e >> 32), + Permitted: uint32(p >> 32), + Inheritable: uint32(i >> 32), + }, + } + _, err = t.CopyOut(dataAddr, &data) + return 0, nil, err + + default: + hdr.Version = linux.HighestCapabilityVersion + if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + if dataAddr != 0 { + return 0, nil, syserror.EINVAL + } + return 0, nil, nil + } +} + +// Capset implements Linux syscall capset. 
+func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + hdrAddr := args[0].Pointer() + dataAddr := args[1].Pointer() + + var hdr linux.CapUserHeader + if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + switch hdr.Version { + case linux.LINUX_CAPABILITY_VERSION_1: + if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { + return 0, nil, syserror.EPERM + } + var data linux.CapUserData + if _, err := t.CopyIn(dataAddr, &data); err != nil { + return 0, nil, err + } + p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities + i := auth.CapabilitySet(data.Inheritable) & auth.AllCapabilities + e := auth.CapabilitySet(data.Effective) & auth.AllCapabilities + return 0, nil, t.SetCapabilitySets(p, i, e) + + case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: + if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { + return 0, nil, syserror.EPERM + } + var data [2]linux.CapUserData + if _, err := t.CopyIn(dataAddr, &data); err != nil { + return 0, nil, err + } + p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities + i := (auth.CapabilitySet(data[0].Inheritable) | (auth.CapabilitySet(data[1].Inheritable) << 32)) & auth.AllCapabilities + e := (auth.CapabilitySet(data[0].Effective) | (auth.CapabilitySet(data[1].Effective) << 32)) & auth.AllCapabilities + return 0, nil, t.SetCapabilitySets(p, i, e) + + default: + hdr.Version = linux.HighestCapabilityVersion + if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go new file mode 100644 index 000000000..e69dfc77a --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -0,0 +1,171 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// EpollCreate1 implements the epoll_create1(2) linux syscall. +func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags & ^syscall.EPOLL_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + closeOnExec := flags&syscall.EPOLL_CLOEXEC != 0 + fd, err := syscalls.CreateEpoll(t, closeOnExec) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// EpollCreate implements the epoll_create(2) linux syscall. +func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + + if size <= 0 { + return 0, nil, syserror.EINVAL + } + + fd, err := syscalls.CreateEpoll(t, false) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// EpollCtl implements the epoll_ctl(2) linux syscall. 
+func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	epfd := kdefs.FD(args[0].Int())
+	op := args[1].Int()
+	fd := kdefs.FD(args[2].Int())
+	eventAddr := args[3].Pointer()
+
+	// Capture the event state if needed. EPOLL_CTL_DEL is the only op
+	// that takes no event argument.
+	flags := epoll.EntryFlags(0)
+	mask := waiter.EventMask(0)
+	var data [2]int32
+	if op != syscall.EPOLL_CTL_DEL {
+		var e syscall.EpollEvent
+		if _, err := t.CopyIn(eventAddr, &e); err != nil {
+			return 0, nil, err
+		}
+
+		if e.Events&syscall.EPOLLONESHOT != 0 {
+			flags |= epoll.OneShot
+		}
+
+		// syscall.EPOLLET is incorrectly generated as a negative number
+		// in Go, see https://github.com/golang/go/issues/5328 for
+		// details.
+		if e.Events&-syscall.EPOLLET != 0 {
+			flags |= epoll.EdgeTriggered
+		}
+
+		mask = waiter.EventMask(e.Events)
+		// The user's epoll_data is carried opaquely as two int32s
+		// (Fd + Pad) and returned verbatim by epoll_wait.
+		data[0] = e.Fd
+		data[1] = e.Pad
+	}
+
+	// Perform the requested operations.
+	switch op {
+	case syscall.EPOLL_CTL_ADD:
+		// See fs/eventpoll.c.
+		mask |= waiter.EventHUp | waiter.EventErr
+		return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data)
+	case syscall.EPOLL_CTL_DEL:
+		return 0, nil, syscalls.RemoveEpoll(t, epfd, fd)
+	case syscall.EPOLL_CTL_MOD:
+		// Same as EPOLL_CTL_ADD.
+		mask |= waiter.EventHUp | waiter.EventErr
+		return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data)
+	default:
+		return 0, nil, syserror.EINVAL
+	}
+}
+
+// copyOutEvents copies epoll events from the kernel to user memory.
+func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error { + const itemLen = 12 + if _, ok := addr.AddLength(uint64(len(e)) * itemLen); !ok { + return syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for i := range e { + usermem.ByteOrder.PutUint32(b[0:], e[i].Events) + usermem.ByteOrder.PutUint32(b[4:], uint32(e[i].Data[0])) + usermem.ByteOrder.PutUint32(b[8:], uint32(e[i].Data[1])) + if _, err := t.CopyOutBytes(addr, b); err != nil { + return err + } + addr += itemLen + } + + return nil +} + +// EpollWait implements the epoll_wait(2) linux syscall. +func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := kdefs.FD(args[0].Int()) + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeout := int(args[3].Int()) + + r, err := syscalls.WaitEpoll(t, epfd, maxEvents, timeout) + if err != nil { + return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + } + + if len(r) != 0 { + if err := copyOutEvents(t, eventsAddr, r); err != nil { + return 0, nil, err + } + } + + return uintptr(len(r)), nil, nil +} + +// EpollPwait implements the epoll_pwait(2) linux syscall. +func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + + if maskAddr != 0 { + mask, err := copyInSigSet(t, maskAddr, maskSize) + if err != nil { + return 0, nil, err + } + + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + + return EpollWait(t, args) +} diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go new file mode 100644 index 000000000..60fe5a133 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -0,0 +1,65 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd" +) + +const ( + // EFD_SEMAPHORE is a flag used in syscall eventfd(2) and eventfd2(2). Please + // see its man page for more information. + EFD_SEMAPHORE = 1 + EFD_NONBLOCK = 0x800 + EFD_CLOEXEC = 0x80000 +) + +// Eventfd2 implements linux syscall eventfd2(2). +func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + initVal := args[0].Int() + flags := uint(args[1].Uint()) + allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC) + + if flags & ^allOps != 0 { + return 0, nil, syscall.EINVAL + } + + event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0) + event.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags&EFD_NONBLOCK != 0, + }) + defer event.DecRef() + + fd, err := t.FDMap().NewFDFrom(0, event, kernel.FDFlags{ + CloseOnExec: flags&EFD_CLOEXEC != 0, + }, + t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// Eventfd implements linux syscall eventfd(2). 
+func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[1].Value = 0 + return Eventfd2(t, args) +} diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go new file mode 100644 index 000000000..a2dbba7e0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -0,0 +1,1942 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "io" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// fileOpAt performs an operation on the second last component in the path. +func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string) error) error { + // Extract the last component. 
+ dir, name := fs.SplitLast(path) + if dir == "/" { + // Common case: we are accessing a file in the root. + root := t.FSContext().RootDirectory() + err := fn(root, root, name) + root.DecRef() + return err + } else if dir == "." && dirFD == linux.AT_FDCWD { + // Common case: we are accessing a file relative to the current + // working directory; skip the look-up. + wd := t.FSContext().WorkingDirectory() + root := t.FSContext().RootDirectory() + err := fn(root, wd, name) + wd.DecRef() + root.DecRef() + return err + } + + return fileOpOn(t, dirFD, dir, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return fn(root, d, name) + }) +} + +// fileOpOn performs an operation on the last entry of the path. +func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error { + var ( + d *fs.Dirent // The file. + wd *fs.Dirent // The working directory (if required.) + rel *fs.Dirent // The relative directory for search (if required.) + f *fs.File // The file corresponding to dirFD (if required.) + err error + ) + + // Extract the working directory (maybe). + if len(path) > 0 && path[0] == '/' { + // Absolute path; rel can be nil. + } else if dirFD == linux.AT_FDCWD { + // Need to reference the working directory. + wd = t.FSContext().WorkingDirectory() + rel = wd + } else { + // Need to extract the given FD. + f = t.FDMap().GetFile(dirFD) + if f == nil { + return syserror.EBADF + } + rel = f.Dirent + if !fs.IsDir(rel.Inode.StableAttr) { + return syserror.ENOTDIR + } + } + + // Grab the root (always required.) + root := t.FSContext().RootDirectory() + + // Lookup the node. 
+ if resolve { + d, err = t.MountNamespace().FindInode(t, root, rel, path, linux.MaxSymlinkTraversals) + } else { + d, err = t.MountNamespace().FindLink(t, root, rel, path, linux.MaxSymlinkTraversals) + } + root.DecRef() + if wd != nil { + wd.DecRef() + } + if f != nil { + f.DecRef() + } + if err != nil { + return err + } + + err = fn(root, d) + d.DecRef() + return err +} + +// copyInPath copies a path in. +func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) { + path, err = t.CopyInString(addr, syscall.PathMax) + if err != nil { + return "", false, err + } + if path == "" && !allowEmpty { + return "", false, syserror.ENOENT + } + + // If the path ends with a /, then checks must be enforced in various + // ways in the different callers. We pass this back to the caller. + path, dirPath = fs.TrimTrailingSlashes(path) + + return path, dirPath, nil +} + +func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, err + } + + err = fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + // First check a few things about the filesystem before trying to get the file + // reference. + // + // It's required that Check does not try to open files not that aren't backed by + // this dirent (e.g. pipes and sockets) because this would result in opening these + // files an extra time just to check permissions. + if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + return err + } + + fileFlags := linuxToFlags(flags) + isDir := fs.IsDir(d.Inode.StableAttr) + + // If O_DIRECTORY is set, but the file is not a directory, then fail. + if fileFlags.Directory && !isDir { + return syserror.ENOTDIR + } + + // If it's a directory, then make sure. 
+ if dirPath && !isDir { + return syserror.ENOTDIR + } + + // Don't allow directories to be opened writable. + if isDir && fileFlags.Write { + return syserror.EISDIR + } + + file, err := d.Inode.GetFile(t, d, fileFlags) + if err != nil { + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + defer file.DecRef() + + // Success. + fdFlags := kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0} + newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return err + } + + // Set return result in frame. + fd = uintptr(newFD) + + // Generate notification for opened file. + d.InotifyEvent(linux.IN_OPEN, 0) + + return nil + }) + return fd, err // Use result in frame. +} + +func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Do we have the appropriate permissions on the parent? + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Attempt a creation. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + + switch mode.FileType() { + case 0: + // "Zero file type is equivalent to type S_IFREG." - mknod(2) + fallthrough + case linux.ModeRegular: + // We are not going to return the file, so the actual + // flags used don't matter, but they cannot be empty or + // Create will complain. 
+ flags := fs.FileFlags{Read: true, Write: true} + file, err := d.Create(t, root, name, flags, perms) + if err != nil { + return err + } + file.DecRef() + return nil + + case linux.ModeNamedPipe: + return d.CreateFifo(t, root, name, perms) + + case linux.ModeSocket: + // While it is possible create a unix domain socket file on linux + // using mknod(2), in practice this is pretty useless from an + // application. Linux internally uses mknod() to create the socket + // node during bind(2), but we implement bind(2) independently. If + // an application explicitly creates a socket node using mknod(), + // you can't seem to bind() or connect() to the resulting socket. + // + // Instead of emulating this seemingly useless behaviour, we'll + // indicate that the filesystem doesn't support the creation of + // sockets. + return syserror.EOPNOTSUPP + + case linux.ModeCharacterDevice: + fallthrough + case linux.ModeBlockDevice: + // TODO: We don't support creating block or character + // devices at the moment. + // + // When we start supporting block and character devices, we'll + // need to check for CAP_MKNOD here. + return syserror.EPERM + + default: + // "EINVAL - mode requested creation of something other than a + // regular file, device special file, FIFO or socket." - mknod(2) + return syserror.EINVAL + } + }) +} + +// Mknod implements the linux syscall mknod(2). +func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + path := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + // We don't need this argument until we support creation of device nodes. + _ = args[2].Uint() // dev + + return 0, nil, mknodAt(t, linux.AT_FDCWD, path, mode) +} + +// Mknodat implements the linux syscall mknodat(2). 
+func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + path := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + // We don't need this argument until we support creation of device nodes. + _ = args[3].Uint() // dev + + return 0, nil, mknodAt(t, dirFD, path, mode) +} + +func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, err + } + if dirPath { + return 0, syserror.ENOENT + } + + err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does this file exist already? + targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) + var newFile *fs.File + switch err { + case nil: + // The file existed. + defer targetDirent.DecRef() + + // Check if we wanted to create. + if flags&syscall.O_EXCL != 0 { + return syserror.EEXIST + } + + // Like sys_open, check for a few things about the + // filesystem before trying to get a reference to the + // fs.File. The same constraints on Check apply. + if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + return err + } + + // Should we truncate the file? + if flags&syscall.O_TRUNC != 0 { + if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil { + return err + } + } + + // Create a new fs.File. + newFile, err = targetDirent.Inode.GetFile(t, targetDirent, linuxToFlags(flags)) + if err != nil { + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + defer newFile.DecRef() + case syserror.EACCES: + // Permission denied while walking to the file. + return err + default: + // Do we have write permissions on the parent? 
+ if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Attempt a creation. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + newFile, err = d.Create(t, root, name, linuxToFlags(flags), perms) + if err != nil { + // No luck, bail. + return err + } + defer newFile.DecRef() + targetDirent = newFile.Dirent + } + + // Success. + fdFlags := kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0} + newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return err + } + + // Set result in frame. + fd = uintptr(newFD) + + // Queue the open inotify event. The creation event is + // automatically queued when the dirent is targetDirent. The + // open events are implemented at the syscall layer so we need + // to manually queue one here. + targetDirent.InotifyEvent(linux.IN_OPEN, 0) + + return nil + }) + return fd, err // Use result in frame. +} + +// Open implements linux syscall open(2). +func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := uint(args[1].Uint()) + if flags&syscall.O_CREAT != 0 { + mode := linux.FileMode(args[2].ModeT()) + n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode) + return n, nil, err + } + n, err := openAt(t, linux.AT_FDCWD, addr, flags) + return n, nil, err +} + +// Openat implements linux syscall openat(2). +func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + flags := uint(args[2].Uint()) + if flags&syscall.O_CREAT != 0 { + mode := linux.FileMode(args[3].ModeT()) + n, err := createAt(t, dirFD, addr, flags, mode) + return n, nil, err + } + n, err := openAt(t, dirFD, addr, flags) + return n, nil, err +} + +// Creat implements linux syscall creat(2). 
func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	mode := linux.FileMode(args[1].ModeT())
	// creat(2) is equivalent to open(2) with O_CREAT|O_WRONLY|O_TRUNC.
	n, err := createAt(t, linux.AT_FDCWD, addr, syscall.O_WRONLY|syscall.O_TRUNC, mode)
	return n, nil, err
}

// accessContext is a context that overrides the credentials used, but
// otherwise carries the same values as the embedded context.
//
// accessContext should only be used for access(2).
type accessContext struct {
	context.Context
	creds auth.Credentials
}

// Value implements context.Context.
func (ac accessContext) Value(key interface{}) interface{} {
	switch key {
	case auth.CtxCredentials:
		return &ac.creds
	default:
		return ac.Context.Value(key)
	}
}

// accessAt implements the shared logic of access(2) and faccessat(2):
// it checks access to the path at addr (relative to dirFD) using the
// task's *real* UID/GID rather than its effective ones.
func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error {
	// Access mode bits; same values as R_OK/W_OK/X_OK.
	const rOK = 4
	const wOK = 2
	const xOK = 1

	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}

	// Sanity check the mode.
	if mode&^(rOK|wOK|xOK) != 0 {
		return syserror.EINVAL
	}

	return fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error {
		// access(2) and faccessat(2) check permissions using real
		// UID/GID, not effective UID/GID.
		//
		// "access() needs to use the real uid/gid, not the effective
		// uid/gid. We do this by temporarily clearing all FS-related
		// capabilities and switching the fsuid/fsgid around to the
		// real ones." -fs/open.c:faccessat
		creds := t.Credentials()
		creds.EffectiveKUID = creds.RealKUID
		creds.EffectiveKGID = creds.RealKGID
		if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
			creds.EffectiveCaps = creds.PermittedCaps
		} else {
			creds.EffectiveCaps = 0
		}

		ctx := &accessContext{
			Context: t,
			creds:   creds,
		}

		if err := d.Inode.CheckPermission(ctx, fs.PermMask{
			Read:    mode&rOK != 0,
			Write:   mode&wOK != 0,
			Execute: mode&xOK != 0,
		}); err != nil {
			return err
		}
		return nil
	})
}

// Access implements linux syscall access(2).
func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	mode := args[1].ModeT()

	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, true, mode)
}

// Faccessat implements linux syscall faccessat(2).
func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := kdefs.FD(args[0].Int())
	addr := args[1].Pointer()
	mode := args[2].ModeT()
	flags := args[3].Int()

	// AT_SYMLINK_NOFOLLOW disables resolution of a trailing symlink.
	return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
}

// Ioctl implements linux syscall ioctl(2).
func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := kdefs.FD(args[0].Int())
	request := int(args[1].Int())

	file := t.FDMap().GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Shared flags between file and socket.
	switch request {
	case linux.FIONCLEX:
		t.FDMap().SetFlags(fd, kernel.FDFlags{
			CloseOnExec: false,
		})
		return 0, nil, nil
	case linux.FIOCLEX:
		t.FDMap().SetFlags(fd, kernel.FDFlags{
			CloseOnExec: true,
		})
		return 0, nil, nil

	case linux.FIONBIO:
		// Toggle the non-blocking flag based on the int at args[2].
		var set int32
		if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil {
			return 0, nil, err
		}
		flags := file.Flags()
		if set != 0 {
			flags.NonBlocking = true
		} else {
			flags.NonBlocking = false
		}
		file.SetFlags(flags.Settable())
		return 0, nil, nil

	default:
		// Everything else is delegated to the file's own ioctl handler.
		ret, err := file.FileOperations.Ioctl(t, t.MemoryManager(), args)
		if err != nil {
			return 0, nil, err
		}

		return ret, nil, nil
	}
}

// Getcwd implements the linux syscall getcwd(2).
func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	size := args[1].SizeT()
	cwd := t.FSContext().WorkingDirectory()
	defer cwd.DecRef()
	root := t.FSContext().RootDirectory()
	defer root.DecRef()

	// Get our fullname from the root and prepend "(unreachable)" if the root
	// was unreachable from our current dirent; this is the same behavior as
	// on Linux.
	s, reachable := cwd.FullName(root)
	if !reachable {
		s = "(unreachable)" + s
	}

	// Note this is >= because we need a terminator.
	if uint(len(s)) >= size {
		return 0, nil, syserror.ERANGE
	}

	// Copy out the path name for the node.
	bytes, err := t.CopyOutBytes(addr, []byte(s))
	if err != nil {
		return 0, nil, err
	}

	// Top it off with a terminator.
	_, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00"))
	return uintptr(bytes + 1), nil, err
}

// Chroot implements the linux syscall chroot(2).
func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	// chroot(2) requires CAP_SYS_CHROOT.
	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
		return 0, nil, syserror.EPERM
	}

	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, nil, err
	}

	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
		// Is it a directory?
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Does it have execute permissions?
		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
			return err
		}

		t.FSContext().SetRootDirectory(d)
		return nil
	})
}

// Chdir implements the linux syscall chdir(2).
func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, nil, err
	}

	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
		// Is it a directory?
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		// Does it have execute permissions?
		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
			return err
		}

		t.FSContext().SetWorkingDirectory(d)
		return nil
	})
}

// Fchdir implements the linux syscall fchdir(2).
func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := kdefs.FD(args[0].Int())

	file := t.FDMap().GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Is it a directory?
	if !fs.IsDir(file.Dirent.Inode.StableAttr) {
		return 0, nil, syserror.ENOTDIR
	}

	// Does it have execute permissions?
	if err := file.Dirent.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
		return 0, nil, err
	}

	t.FSContext().SetWorkingDirectory(file.Dirent)
	return 0, nil, nil
}

// Close implements linux syscall close(2).
func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := kdefs.FD(args[0].Int())

	// Remove the descriptor first so a concurrent close cannot observe it.
	file, ok := t.FDMap().Remove(fd)
	if !ok {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Flush pending writes; close(2) reports flush errors to the caller.
	err := file.Flush(t)
	return 0, nil, handleIOError(t, false /* partial */, err, syscall.EINTR, "close", file)
}

// Dup implements linux syscall dup(2).
func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := kdefs.FD(args[0].Int())

	file := t.FDMap().GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Any allocation failure is reported as fd exhaustion, per dup(2).
	newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits())
	if err != nil {
		return 0, nil, syserror.EMFILE
	}
	return uintptr(newfd), nil, nil
}

// Dup2 implements linux syscall dup2(2).
func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldfd := kdefs.FD(args[0].Int())
	newfd := kdefs.FD(args[1].Int())

	// If oldfd is a valid file descriptor, and newfd has the same value as oldfd,
	// then dup2() does nothing, and returns newfd.
	if oldfd == newfd {
		oldFile := t.FDMap().GetFile(oldfd)
		if oldFile == nil {
			return 0, nil, syserror.EBADF
		}
		defer oldFile.DecRef()

		return uintptr(newfd), nil, nil
	}

	// Zero out flags arg to be used by Dup3.
	args[2].Value = 0
	return Dup3(t, args)
}

// Dup3 implements linux syscall dup3(2).
+func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := kdefs.FD(args[0].Int()) + newfd := kdefs.FD(args[1].Int()) + flags := args[2].Uint() + + if oldfd == newfd { + return 0, nil, syserror.EINVAL + } + + oldFile := t.FDMap().GetFile(oldfd) + if oldFile == nil { + return 0, nil, syserror.EBADF + } + defer oldFile.DecRef() + + err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(newfd), nil, nil +} + +// Fcntl implements linux syscall fcntl(2). +func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + cmd := args[1].Int() + + file, flags := t.FDMap().GetDescriptor(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch cmd { + case syscall.F_DUPFD, syscall.F_DUPFD_CLOEXEC: + from := kdefs.FD(args[2].Int()) + fdFlags := kernel.FDFlags{CloseOnExec: cmd == syscall.F_DUPFD_CLOEXEC} + fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil + case syscall.F_GETFD: + return uintptr(fdFlagsToLinux(flags)), nil, nil + case syscall.F_SETFD: + flags := args[2].Uint() + t.FDMap().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: flags&syscall.FD_CLOEXEC != 0, + }) + case syscall.F_GETFL: + return uintptr(flagsToLinux(file.Flags())), nil, nil + case syscall.F_SETFL: + flags := uint(args[2].Uint()) + file.SetFlags(linuxToSettableFlags(flags)) + case syscall.F_SETLK, syscall.F_SETLKW: + // In Linux the file system can choose to provide lock operations for an inode. + // Normally pipe and socket types lack lock operations. We diverge and use a heavy + // hammer by only allowing locks on files and directories. 
+ if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EBADF + } + + // Copy in the lock request. + flockAddr := args[2].Pointer() + var flock syscall.Flock_t + if _, err := t.CopyIn(flockAddr, &flock); err != nil { + return 0, nil, err + } + + // Compute the lock whence. + var sw fs.SeekWhence + switch flock.Whence { + case 0: + sw = fs.SeekSet + case 1: + sw = fs.SeekCurrent + case 2: + sw = fs.SeekEnd + default: + return 0, nil, syserror.EINVAL + } + + // Compute the lock offset. + var off int64 + switch sw { + case fs.SeekSet: + off = 0 + case fs.SeekCurrent: + // Note that Linux does not hold any mutexes while retrieving the file offset, + // see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. + off = file.Offset() + case fs.SeekEnd: + uattr, err := file.Dirent.Inode.UnstableAttr(t) + if err != nil { + return 0, nil, err + } + off = uattr.Size + default: + return 0, nil, syserror.EINVAL + } + + // Compute the lock range. + rng, err := lock.ComputeRange(flock.Start, flock.Len, off) + if err != nil { + return 0, nil, err + } + + // The lock uid is that of the Task's FDMap. + lockUniqueID := lock.UniqueID(t.FDMap().ID()) + + // These locks don't block; execute the non-blocking operation using the inode's lock + // context directly. + switch flock.Type { + case syscall.F_RDLCK: + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + if cmd == syscall.F_SETLK { + // Non-blocking lock, provide a nil lock.Blocker. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + return 0, nil, syserror.EAGAIN + } + } else { + // Blocking lock, pass in the task to satisfy the lock.Blocker interface. 
+ if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + return 0, nil, nil + case syscall.F_WRLCK: + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + if cmd == syscall.F_SETLK { + // Non-blocking lock, provide a nil lock.Blocker. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + return 0, nil, syserror.EAGAIN + } + } else { + // Blocking lock, pass in the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + return 0, nil, nil + case syscall.F_UNLCK: + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) + return 0, nil, nil + default: + return 0, nil, syserror.EINVAL + } + default: + // Everything else is not yet supported. + return 0, nil, syserror.EINVAL + } + return 0, nil, nil +} + +const ( + _FADV_NORMAL = 0 + _FADV_RANDOM = 1 + _FADV_SEQUENTIAL = 2 + _FADV_WILLNEED = 3 + _FADV_DONTNEED = 4 + _FADV_NOREUSE = 5 +) + +// Fadvise64 implements linux syscall fadvise64(2). +// This implementation currently ignores the provided advice. +func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + offset := args[1].Int64() + length := args[2].Uint() + advice := args[3].Int() + + if offset < 0 || length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch advice { + case _FADV_NORMAL: + case _FADV_RANDOM: + case _FADV_SEQUENTIAL: + case _FADV_WILLNEED: + case _FADV_DONTNEED: + case _FADV_NOREUSE: + default: + return 0, nil, syserror.EINVAL + } + + // Sure, whatever. 
+ return 0, nil, nil +} + +func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does this directory exist already? + f, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) + switch err { + case nil: + // The directory existed. + defer f.DecRef() + return syserror.EEXIST + case syserror.EACCES: + // Permission denied while walking to the directory. + return err + default: + // Do we have write permissions on the parent? + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Create the directory. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + return d.CreateDirectory(t, root, name, perms) + } + }) +} + +// Mkdir implements linux syscall mkdir(2). +func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + + return 0, nil, mkdirAt(t, linux.AT_FDCWD, addr, mode) +} + +// Mkdirat implements linux syscall mkdirat(2). +func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + + return 0, nil, mkdirAt(t, dirFD, addr, mode) +} + +func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + // Special case: rmdir rejects anything with '.' as last component. + // This would be handled by the busy check for the current working + // directory, but this is how it's done. 
+ if (len(path) == 1 && path == ".") || (len(path) > 1 && path[len(path)-2:] == "/.") { + return syserror.EINVAL + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + if err := fs.MayDelete(t, root, d, name); err != nil { + return err + } + + return d.RemoveDirectory(t, root, name) + }) +} + +// Rmdir implements linux syscall rmdir(2). +func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr) +} + +func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error { + newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + // The oldPath is copied in verbatim. This is because the symlink + // will include all details, including trailing slashes. + oldPath, err := t.CopyInString(oldAddr, syscall.PathMax) + if err != nil { + return err + } + if oldPath == "" { + return syserror.ENOENT + } + + return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Make sure we have write permissions on the parent directory. + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + return d.CreateLink(t, root, oldPath, name) + }) +} + +// Symlink implements linux syscall symlink(2). +func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + newAddr := args[1].Pointer() + + return 0, nil, symlinkAt(t, linux.AT_FDCWD, newAddr, oldAddr) +} + +// Symlinkat implements linux syscall symlinkat(2). 
func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	dirFD := kdefs.FD(args[1].Int())
	newAddr := args[2].Pointer()

	return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
}

// mayLinkAt determines whether t can create a hard link to target.
//
// This corresponds to Linux's fs/namei.c:may_linkat.
func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
	// Technically Linux is more restrictive in 3.11.10 (requires CAP_FOWNER in
	// root user namespace); this is from the later f2ca379642d7 "namei: permit
	// linking with CAP_FOWNER in userns".
	if !target.CheckOwnership(t) {
		return syserror.EPERM
	}

	// Check that the target is not a directory and that permissions are okay.
	if fs.IsDir(target.StableAttr) || target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil {
		return syserror.EPERM
	}

	return nil
}

// linkAt creates a hard link to the target specified by oldDirFD and oldAddr,
// specified by newDirFD and newAddr. If resolve is true, then the symlinks
// will be followed when evaluating the target.
func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error {
	oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
	if err != nil {
		return err
	}
	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
	if err != nil {
		return err
	}
	if dirPath {
		return syserror.ENOENT
	}

	// An empty old path (AT_EMPTY_PATH) means the link target is the file
	// referred to by oldDirFD itself.
	if allowEmpty && oldPath == "" {
		target := t.FDMap().GetFile(oldDirFD)
		if target == nil {
			return syserror.EBADF
		}
		defer target.DecRef()
		if err := mayLinkAt(t, target.Dirent.Inode); err != nil {
			return err
		}

		// Resolve the target directory.
		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error {
			if !fs.IsDir(newParent.Inode.StableAttr) {
				return syserror.ENOTDIR
			}

			// Make sure we have write permissions on the parent directory.
			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
				return err
			}
			return newParent.CreateHardLink(t, root, target.Dirent, newName)
		})
	}

	// Resolve oldDirFD and oldAddr to a dirent. The "resolve" argument
	// only applies to this name.
	return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent) error {
		if err := mayLinkAt(t, target.Inode); err != nil {
			return err
		}

		// Next resolve newDirFD and newAddr to the parent dirent and name.
		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error {
			if !fs.IsDir(newParent.Inode.StableAttr) {
				return syserror.ENOTDIR
			}

			// Make sure we have write permissions on the parent directory.
			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
				return err
			}
			return newParent.CreateHardLink(t, root, target, newName)
		})
	})
}

// Link implements linux syscall link(2).
func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldAddr := args[0].Pointer()
	newAddr := args[1].Pointer()

	// man link(2):
	// POSIX.1-2001 says that link() should dereference oldpath if it is a
	// symbolic link. However, since kernel 2.0, Linux does not do so: if
	// oldpath is a symbolic link, then newpath is created as a (hard) link
	// to the same symbolic link file (i.e., newpath becomes a symbolic
	// link to the same file that oldpath refers to).
	resolve := false
	return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */)
}

// Linkat implements linux syscall linkat(2).
func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldDirFD := kdefs.FD(args[0].Int())
	oldAddr := args[1].Pointer()
	newDirFD := kdefs.FD(args[2].Int())
	newAddr := args[3].Pointer()

	// man linkat(2):
	// By default, linkat(), does not dereference oldpath if it is a
	// symbolic link (like link(2)). Since Linux 2.6.18, the flag
	// AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be
	// dereferenced if it is a symbolic link.
	flags := args[4].Int()
	resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW
	allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH

	// AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH in the root user namespace.
	if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) {
		return 0, nil, syserror.ENOENT
	}

	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
}

// readlinkAt reads the target of the symlink named by addr (relative to
// dirFD) into the user buffer at bufAddr, copying at most size bytes.
// It returns the number of bytes copied out.
func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, err
	}
	if dirPath {
		return 0, syserror.ENOENT
	}

	// Note: the symlink itself is deliberately not resolved.
	err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
		// Check for Read permission.
		if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil {
			return err
		}

		s, err := d.Inode.Readlink(t)
		if err == syserror.ENOLINK {
			// Not a symlink; readlink(2) reports EINVAL.
			return syserror.EINVAL
		}
		if err != nil {
			return err
		}

		// Silently truncate to the buffer size, as Linux does.
		buffer := []byte(s)
		if uint(len(buffer)) > size {
			buffer = buffer[:size]
		}

		n, err := t.CopyOutBytes(bufAddr, buffer)

		// Update frame return value.
		copied = uintptr(n)

		return err
	})
	return copied, err // Return frame value.
}

// Readlink implements linux syscall readlink(2).
func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	bufAddr := args[1].Pointer()
	size := args[2].SizeT()

	n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size)
	return n, nil, err
}

// Readlinkat implements linux syscall readlinkat(2).
func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := kdefs.FD(args[0].Int())
	addr := args[1].Pointer()
	bufAddr := args[2].Pointer()
	size := args[3].SizeT()

	n, err := readlinkAt(t, dirFD, addr, bufAddr, size)
	return n, nil, err
}

// unlinkAt removes the non-directory file named by addr, relative to dirFD.
func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error {
	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return err
	}
	if dirPath {
		return syserror.ENOENT
	}

	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error {
		if !fs.IsDir(d.Inode.StableAttr) {
			return syserror.ENOTDIR
		}

		if err := fs.MayDelete(t, root, d, name); err != nil {
			return err
		}

		return d.Remove(t, root, name)
	})
}

// Unlink implements linux syscall unlink(2).
func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	return 0, nil, unlinkAt(t, linux.AT_FDCWD, addr)
}

// Unlinkat implements linux syscall unlinkat(2).
func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirFD := kdefs.FD(args[0].Int())
	addr := args[1].Pointer()
	flags := args[2].Uint()
	// AT_REMOVEDIR makes unlinkat(2) behave like rmdir(2).
	if flags&linux.AT_REMOVEDIR != 0 {
		return 0, nil, rmdirAt(t, dirFD, addr)
	}
	return 0, nil, unlinkAt(t, dirFD, addr)
}

// Truncate implements linux syscall truncate(2).
func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	length := args[1].Int64()

	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, nil, err
	}
	if dirPath {
		return 0, nil, syserror.EINVAL
	}

	// Exceeding RLIMIT_FSIZE raises SIGXFSZ and fails with EFBIG.
	if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
		t.SendSignal(&arch.SignalInfo{
			Signo: int32(syscall.SIGXFSZ),
			Code:  arch.SignalInfoUser,
		})
		return 0, nil, syserror.EFBIG
	}

	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
		if fs.IsDir(d.Inode.StableAttr) {
			return syserror.EISDIR
		}
		if !fs.IsFile(d.Inode.StableAttr) {
			return syserror.EINVAL
		}

		// Reject truncation if the access permissions do not allow truncation.
		// This is different from the behavior of sys_ftruncate, see below.
		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
			return err
		}

		if err := d.Inode.Truncate(t, d, length); err != nil {
			return err
		}

		// File length modified, generate notification.
		d.InotifyEvent(linux.IN_MODIFY, 0)

		return nil
	})
}

// Ftruncate implements linux syscall ftruncate(2).
func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := kdefs.FD(args[0].Int())
	length := args[1].Int64()

	file := t.FDMap().GetFile(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Reject truncation if the file flags do not permit this operation.
	// This is different from truncate(2) above.
	if !file.Flags().Write {
		return 0, nil, syserror.EINVAL
	}

	// Note that this is different from truncate(2) above, where a
	// directory returns EISDIR.
	if !fs.IsFile(file.Dirent.Inode.StableAttr) {
		return 0, nil, syserror.EINVAL
	}

	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Exceeding RLIMIT_FSIZE raises SIGXFSZ and fails with EFBIG.
	if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
		t.SendSignal(&arch.SignalInfo{
			Signo: int32(syscall.SIGXFSZ),
			Code:  arch.SignalInfoUser,
		})
		return 0, nil, syserror.EFBIG
	}

	if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil {
		return 0, nil, err
	}

	// File length modified, generate notification.
	file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)

	return 0, nil, nil
}

// Umask implements linux syscall umask(2).
func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	mask := args[0].ModeT()
	// Returns the previous umask, per umask(2).
	mask = t.FSContext().SwapUmask(mask & 0777)
	return uintptr(mask), nil, nil
}

// Change ownership of a file.
//
// uid and gid may be -1, in which case they will not be changed.
func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
	owner := fs.FileOwner{
		UID: auth.NoID,
		GID: auth.NoID,
	}

	uattr, err := d.Inode.UnstableAttr(t)
	if err != nil {
		return err
	}
	c := t.Credentials()
	hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN)
	isOwner := uattr.Owner.UID == c.EffectiveKUID
	if uid.Ok() {
		kuid := c.UserNamespace.MapToKUID(uid)
		// Valid UID must be supplied if UID is to be changed.
		if !kuid.Ok() {
			return syserror.EINVAL
		}

		// "Only a privileged process (CAP_CHOWN) may change the owner
		// of a file." -chown(2)
		//
		// Linux also allows chown if you own the file and are
		// explicitly not changing its UID.
		isNoop := uattr.Owner.UID == kuid
		if !(hasCap || (isOwner && isNoop)) {
			return syserror.EPERM
		}

		owner.UID = kuid
	}
	if gid.Ok() {
		kgid := c.UserNamespace.MapToKGID(gid)
		// Valid GID must be supplied if GID is to be changed.
		if !kgid.Ok() {
			return syserror.EINVAL
		}

		// "The owner of a file may change the group of the file to any
		// group of which that owner is a member. A privileged process
		// (CAP_CHOWN) may change the group arbitrarily." -chown(2)
		isNoop := uattr.Owner.GID == kgid
		isMemberGroup := c.InGroup(kgid)
		if !(hasCap || (isOwner && (isNoop || isMemberGroup))) {
			return syserror.EPERM
		}

		owner.GID = kgid
	}

	// FIXME: This is racy; the inode's owner may have changed in
	// the meantime. (Linux holds i_mutex while calling
	// fs/attr.c:notify_change() => inode_operations::setattr =>
	// inode_change_ok().)
	if err := d.Inode.SetOwner(t, d, owner); err != nil {
		return err
	}

	// When the owner or group are changed by an unprivileged user,
	// chown(2) also clears the set-user-ID and set-group-ID bits, but
	// we do not support them.
	return nil
}

// chownAt changes ownership of the file at addr (relative to fd). An empty
// path with allowEmpty set operates on fd itself, like fchown(2).
func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
	path, _, err := copyInPath(t, addr, allowEmpty)
	if err != nil {
		return err
	}

	if path == "" {
		// Annoying. What's wrong with fchown?
		file := t.FDMap().GetFile(fd)
		if file == nil {
			return syserror.EBADF
		}
		defer file.DecRef()

		return chown(t, file.Dirent, uid, gid)
	}

	return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error {
		return chown(t, d, uid, gid)
	})
}

// Chown implements linux syscall chown(2).
func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	uid := auth.UID(args[1].Uint())
	gid := auth.GID(args[2].Uint())

	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid)
}

// Lchown implements linux syscall lchown(2).
+func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + uid := auth.UID(args[1].Uint()) + gid := auth.GID(args[2].Uint()) + + return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid) +} + +// Fchown implements linux syscall fchown(2). +func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + uid := auth.UID(args[1].Uint()) + gid := auth.GID(args[2].Uint()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, chown(t, file.Dirent, uid, gid) +} + +// Fchownat implements Linux syscall fchownat(2). +func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + uid := auth.UID(args[2].Uint()) + gid := auth.GID(args[3].Uint()) + flags := args[4].Int() + + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid) +} + +func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { + // Must own file to change mode. + if !d.Inode.CheckOwnership(t) { + return syserror.EPERM + } + + p := fs.FilePermsFromMode(mode) + if !d.Inode.SetPermissions(t, d, p) { + return syserror.EPERM + } + + // File attribute changed, generate notification. 
+ d.InotifyEvent(linux.IN_ATTRIB, 0) + + return nil +} + +func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpOn(t, fd, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return chmod(t, d, mode) + }) +} + +// Chmod implements linux syscall chmod(2). +func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + + return 0, nil, chmodAt(t, linux.AT_FDCWD, addr, mode) +} + +// Fchmod implements linux syscall fchmod(2). +func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + mode := linux.FileMode(args[1].ModeT()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, chmod(t, file.Dirent, mode) +} + +// Fchmodat implements linux syscall fchmodat(2). +func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + + return 0, nil, chmodAt(t, fd, addr, mode) +} + +// defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime +// to the system time. +func defaultSetToSystemTimeSpec() fs.TimeSpec { + return fs.TimeSpec{ + ATimeSetSystemTime: true, + MTimeSetSystemTime: true, + } +} + +func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { + setTimestamp := func(root *fs.Dirent, d *fs.Dirent) error { + // Does the task own the file? + if !d.Inode.CheckOwnership(t) { + // Trying to set a specific time? Must be owner. 
+ if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) { + return syserror.EPERM + } + + // Trying to set to current system time? Must have write access. + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil { + return err + } + } + + return d.Inode.SetTimestamps(t, d, ts) + } + + // From utimes.c: + // "If filename is NULL and dfd refers to an open file, then operate on + // the file. Otherwise look up filename, possibly using dfd as a + // starting point." + if addr == 0 && dirFD != linux.AT_FDCWD { + if !resolve { + // Linux returns EINVAL in this case. See utimes.c. + return syserror.EINVAL + } + f := t.FDMap().GetFile(dirFD) + if f == nil { + return syserror.EBADF + } + defer f.DecRef() + + root := t.FSContext().RootDirectory() + defer root.DecRef() + + return setTimestamp(root, f.Dirent) + } + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpOn(t, dirFD, path, resolve, setTimestamp) +} + +// Utime implements linux syscall utime(2). +func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times syscall.Utimbuf + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + ts = fs.TimeSpec{ + ATime: ktime.FromSeconds(times.Actime), + MTime: ktime.FromSeconds(times.Modtime), + } + } + return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true) +} + +// Utimes implements linux syscall utimes(2). +func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + // No timesAddr argument will be interpreted as current system time. 
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timeval
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		ts = fs.TimeSpec{
+			ATime: ktime.FromTimeval(times[0]),
+			MTime: ktime.FromTimeval(times[1]),
+		}
+	}
+	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
+}
+
+// timespecIsValid checks that the timespec is valid for use in utimensat.
+func timespecIsValid(ts linux.Timespec) bool {
+	// Nsec must be UTIME_OMIT, UTIME_NOW, or less than 10^9.
+	return ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW || ts.Nsec < 1e9
+}
+
+// Utimensat implements linux syscall utimensat(2).
+func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	dirFD := kdefs.FD(args[0].Int())
+	pathnameAddr := args[1].Pointer()
+	timesAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	// No timesAddr argument will be interpreted as current system time.
+	ts := defaultSetToSystemTimeSpec()
+	if timesAddr != 0 {
+		var times [2]linux.Timespec
+		if _, err := t.CopyIn(timesAddr, &times); err != nil {
+			return 0, nil, err
+		}
+		if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
+			return 0, nil, syserror.EINVAL
+		}
+
+		// If both are UTIME_OMIT, this is a noop.
+		if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT {
+			return 0, nil, nil
+		}
+
+		// Convert to fs.TimeSpec. Per utimensat(2), times[0] governs
+		// atime and times[1] governs mtime; each timestamp's Omit and
+		// SetSystemTime flags must be derived from its own entry.
+		ts = fs.TimeSpec{
+			ATime:              ktime.FromTimespec(times[0]),
+			ATimeOmit:          times[0].Nsec == linux.UTIME_OMIT,
+			ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
+			MTime:              ktime.FromTimespec(times[1]),
+			MTimeOmit:          times[1].Nsec == linux.UTIME_OMIT,
+			MTimeSetSystemTime: times[1].Nsec == linux.UTIME_NOW,
+		}
+	}
+	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0)
+}
+
+// Futimesat implements linux syscall futimesat(2).
+func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + pathnameAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + if times[0].Usec >= 1e6 || times[0].Usec < 0 || + times[1].Usec >= 1e6 || times[1].Usec < 0 { + return 0, nil, syserror.EINVAL + } + + ts = fs.TimeSpec{ + ATime: ktime.FromTimeval(times[0]), + MTime: ktime.FromTimeval(times[1]), + } + } + return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) +} + +func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error { + newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) + if err != nil { + return err + } + oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string) error { + if !fs.IsDir(oldParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Root cannot be renamed to anything. + // + // TODO: This catches the case when the rename + // argument is exactly "/", but we should return EBUSY when + // renaming any mount point, or when the argument is not + // exactly "/" but still resolves to the root, like "/.." or + // "/bin/..". + if oldParent == root && oldName == "." { + return syscall.EBUSY + } + return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error { + if !fs.IsDir(newParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Nothing can be renamed to root. + // + // TODO: Same as above. + if newParent == root && newName == "." 
{ + return syscall.EBUSY + } + return fs.Rename(t, root, oldParent, oldName, newParent, newName) + }) + }) +} + +// Rename implements linux syscall rename(2). +func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldPathAddr := args[0].Pointer() + newPathAddr := args[1].Pointer() + return 0, nil, renameAt(t, linux.AT_FDCWD, oldPathAddr, linux.AT_FDCWD, newPathAddr) +} + +// Renameat implements linux syscall renameat(2). +func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldDirFD := kdefs.FD(args[0].Int()) + oldPathAddr := args[1].Pointer() + newDirFD := kdefs.FD(args[2].Int()) + newPathAddr := args[3].Pointer() + return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) +} + +// Fallocate implements linux system call fallocate(2). +// (well, not really, but at least we return the expected error codes) +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + offset := args[2].Int64() + length := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + return 0, nil, syserror.EOPNOTSUPP +} + +// Flock implements linux syscall flock(2). +func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + operation := args[1].Int() + + file := t.FDMap().GetFile(fd) + if file == nil { + // flock(2): EBADF fd is not an open file descriptor. + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + nonblocking := operation&linux.LOCK_NB != 0 + operation &^= linux.LOCK_NB + + // flock(2): + // Locks created by flock() are associated with an open file table entry. 
This means that + // duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the + // same lock, and this lock may be modified or released using any of these descriptors. Furthermore, + // the lock is released either by an explicit LOCK_UN operation on any of these duplicate + // descriptors, or when all such descriptors have been closed. + // + // If a process uses open(2) (or similar) to obtain more than one descriptor for the same file, + // these descriptors are treated independently by flock(). An attempt to lock the file using + // one of these file descriptors may be denied by a lock that the calling process has already placed via + // another descriptor. + // + // We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2) + // and fork(2). + lockUniqueID := lock.UniqueID(file.UniqueID) + + // A BSD style lock spans the entire file. + rng := lock.LockRange{ + Start: 0, + End: lock.LockEOF, + } + + switch operation { + case linux.LOCK_EX: + if nonblocking { + // Since we're nonblocking we pass a nil lock.Blocker implementation. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + return 0, nil, syserror.EWOULDBLOCK + } + } else { + // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + case linux.LOCK_SH: + if nonblocking { + // Since we're nonblocking we pass a nil lock.Blocker implementation. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + return 0, nil, syserror.EWOULDBLOCK + } + } else { + // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. 
+ if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + case linux.LOCK_UN: + file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng) + default: + // flock(2): EINVAL operation is invalid. + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} + +// Sendfile implements linux system call sendfile(2). +func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + outFD := kdefs.FD(args[0].Int()) + inFD := kdefs.FD(args[1].Int()) + offsetAddr := args[2].Pointer() + count := int64(args[3].SizeT()) + + // Don't send a negative number of bytes. + if count < 0 { + return 0, nil, syserror.EINVAL + } + + // Get files. + outFile := t.FDMap().GetFile(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + inFile := t.FDMap().GetFile(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + + // Verify that the outfile is writable. + outFlags := outFile.Flags() + if !outFlags.Write { + return 0, nil, syserror.EBADF + } + + // Verify that the outfile Append flag is not set. + if outFlags.Append { + return 0, nil, syserror.EINVAL + } + + // Verify that we have a regular infile. + // http://elixir.free-electrons.com/linux/latest/source/fs/splice.c#L933 + if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EINVAL + } + + // Verify that the infile is readable. + if !inFile.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Setup for sending data. + var offset uint64 + var n int64 + var err error + w := &fs.FileWriter{t, outFile} + hasOffset := offsetAddr != 0 + // If we have a provided offset. + if hasOffset { + // Copy in the offset. + if _, err := t.CopyIn(offsetAddr, &offset); err != nil { + return 0, nil, err + } + // Send data using Preadv. 
+ r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), count) + n, err = io.Copy(w, r) + // Copy out the new offset. + if _, err := t.CopyOut(offsetAddr, n+int64(offset)); err != nil { + return 0, nil, err + } + // If we don't have a provided offset. + } else { + // Send data using readv. + r := &io.LimitedReader{R: &fs.FileReader{t, inFile}, N: count} + n, err = io.Copy(w, r) + } + + // We can only pass a single file to handleIOError, so pick inFile + // arbitrarily. + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile) +} diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go new file mode 100644 index 000000000..57762d058 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -0,0 +1,319 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// futexChecker is a futex.Checker that uses a Task's MemoryManager. +type futexChecker struct { + t *kernel.Task +} + +// Check checks if the address contains the given value, and returns +// syserror.EAGAIN if it doesn't. 
See Checker interface in futex package +// for more information. +func (f futexChecker) Check(addr uintptr, val uint32) error { + in := f.t.CopyScratchBuffer(4) + _, err := f.t.CopyInBytes(usermem.Addr(addr), in) + if err != nil { + return err + } + nval := usermem.ByteOrder.Uint32(in) + if val != nval { + return syserror.EAGAIN + } + return nil +} + +func (f futexChecker) atomicOp(addr uintptr, op func(uint32) uint32) (uint32, error) { + in := f.t.CopyScratchBuffer(4) + _, err := f.t.CopyInBytes(usermem.Addr(addr), in) + if err != nil { + return 0, err + } + o := usermem.ByteOrder.Uint32(in) + mm := f.t.MemoryManager() + for { + n := op(o) + r, err := mm.CompareAndSwapUint32(f.t, usermem.Addr(addr), o, n, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + if r == o { + return o, nil + } + o = r + } +} + +// Op performs an operation on addr and returns a result based on the operation. +func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) { + op := (opIn >> 28) & 0xf + cmp := (opIn >> 24) & 0xf + opArg := (opIn >> 12) & 0xfff + cmpArg := opIn & 0xfff + + if op&linux.FUTEX_OP_OPARG_SHIFT != 0 { + opArg = 1 << opArg + op &^= linux.FUTEX_OP_OPARG_SHIFT // clear flag + } + + var oldVal uint32 + var err error + switch op { + case linux.FUTEX_OP_SET: + oldVal, err = f.t.MemoryManager().SwapUint32(f.t, usermem.Addr(addr), opArg, usermem.IOOpts{ + AddressSpaceActive: true, + }) + case linux.FUTEX_OP_ADD: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a + opArg + }) + case linux.FUTEX_OP_OR: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a | opArg + }) + case linux.FUTEX_OP_ANDN: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a & ^opArg + }) + case linux.FUTEX_OP_XOR: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a ^ opArg + }) + default: + return false, syserror.ENOSYS + } + if err != nil { + return false, err + } + + switch cmp { + case 
linux.FUTEX_OP_CMP_EQ: + return oldVal == cmpArg, nil + case linux.FUTEX_OP_CMP_NE: + return oldVal != cmpArg, nil + case linux.FUTEX_OP_CMP_LT: + return oldVal < cmpArg, nil + case linux.FUTEX_OP_CMP_LE: + return oldVal <= cmpArg, nil + case linux.FUTEX_OP_CMP_GT: + return oldVal > cmpArg, nil + case linux.FUTEX_OP_CMP_GE: + return oldVal >= cmpArg, nil + default: + return false, syserror.ENOSYS + } +} + +// futexWaitRestartBlock encapsulates the state required to restart futex(2) +// via restart_syscall(2). +type futexWaitRestartBlock struct { + duration time.Duration + + // addr stored as uint64 since uintptr is not save-able. + addr uint64 + + val uint32 + mask uint32 +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return futexWaitDuration(t, f.duration, false, uintptr(f.addr), f.val, f.mask) +} + +// futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is +// complete. +// +// The wait blocks forever if forever is true, otherwise it blocks until ts. +// +// If blocking is interrupted, the syscall is restarted with the original +// arguments. 
+func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr uintptr, val, mask uint32) (uintptr, error) { + w := t.FutexWaiter() + err := t.Futex().WaitPrepare(w, futexChecker{t}, addr, val, mask) + if err != nil { + return 0, err + } + + if forever { + err = t.Block(w.C) + } else if clockRealtime { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) + timer.Swap(ktime.Setting{ + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + err = t.BlockWithTimer(w.C, tchan) + timer.Destroy() + } else { + err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) + } + + t.Futex().WaitComplete(w) + return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is +// complete. +// +// The wait blocks forever if forever is true, otherwise is blocks for +// duration. +// +// If blocking is interrupted, forever determines how to restart the +// syscall. If forever is true, the syscall is restarted with the original +// arguments. If forever is false, duration is a relative timeout and the +// syscall is restarted with the remaining timeout. +func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr uintptr, val, mask uint32) (uintptr, error) { + w := t.FutexWaiter() + err := t.Futex().WaitPrepare(w, futexChecker{t}, addr, val, mask) + if err != nil { + return 0, err + } + + remaining, err := t.BlockWithTimeout(w.C, !forever, duration) + t.Futex().WaitComplete(w) + if err == nil { + return 0, nil + } + + // The wait was unsuccessful for some reason other than interruption. Simply + // forward the error. + if err != syserror.ErrInterrupted { + return 0, err + } + + // The wait was interrupted and we need to restart. Decide how. + + // The wait duration was absolute, restart with the original arguments. 
+ if forever { + return 0, kernel.ERESTARTSYS + } + + // The wait duration was relative, restart with the remaining duration. + t.SetSyscallRestartBlock(&futexWaitRestartBlock{ + duration: remaining, + addr: uint64(addr), + val: val, + mask: mask, + }) + return 0, kernel.ERESTART_RESTARTBLOCK +} + +// Futex implements linux syscall futex(2). +// It provides a method for a program to wait for a value at a given address to +// change, and a method to wake up anyone waiting on a particular address. +func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + uaddr := args[0].Pointer() + futexOp := args[1].Int() + val := int(args[2].Int()) + nreq := int(args[3].Int()) + timeout := args[3].Pointer() + uaddr2 := args[4].Pointer() + val3 := args[5].Int() + + addr := uintptr(uaddr) + naddr := uintptr(uaddr2) + cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) + clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME + mask := uint32(val3) + + switch cmd { + case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: + // WAIT{_BITSET} wait forever if the timeout isn't passed. + forever := timeout == 0 + + var timespec linux.Timespec + if !forever { + var err error + timespec, err = copyTimespecIn(t, timeout) + if err != nil { + return 0, nil, err + } + } + + switch cmd { + case linux.FUTEX_WAIT: + // WAIT uses a relative timeout. + mask = ^uint32(0) + var timeoutDur time.Duration + if !forever { + timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond + } + n, err := futexWaitDuration(t, timeoutDur, forever, addr, uint32(val), mask) + return n, nil, err + + case linux.FUTEX_WAIT_BITSET: + // WAIT_BITSET uses an absolute timeout which is either + // CLOCK_MONOTONIC or CLOCK_REALTIME. 
+ if mask == 0 { + return 0, nil, syserror.EINVAL + } + n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, uint32(val), mask) + return n, nil, err + default: + panic("unreachable") + } + + case linux.FUTEX_WAKE: + mask = ^uint32(0) + fallthrough + + case linux.FUTEX_WAKE_BITSET: + if mask == 0 { + return 0, nil, syserror.EINVAL + } + n, err := t.Futex().Wake(addr, mask, val) + return uintptr(n), nil, err + + case linux.FUTEX_REQUEUE: + n, err := t.Futex().Requeue(addr, naddr, val, nreq) + return uintptr(n), nil, err + + case linux.FUTEX_CMP_REQUEUE: + // 'val3' contains the value to be checked at 'addr' and + // 'val' is the number of waiters that should be woken up. + nval := uint32(val3) + n, err := t.Futex().RequeueCmp(futexChecker{t}, addr, nval, naddr, val, nreq) + return uintptr(n), nil, err + + case linux.FUTEX_WAKE_OP: + op := uint32(val3) + n, err := t.Futex().WakeOp(futexChecker{t}, addr, naddr, val, nreq, op) + return uintptr(n), nil, err + + case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: + // We don't support any priority inversion futexes. + return 0, nil, syserror.ENOSYS + + default: + // We don't even know about this command. + return 0, nil, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go new file mode 100644 index 000000000..178714b07 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -0,0 +1,269 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "bytes" + "io" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Getdents implements linux syscall getdents(2) for 64bit systems. +func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := int(args[2].Uint()) + + minSize := int(smallestDirent(t.Arch())) + if size < minSize { + // size is smaller than smallest possible dirent. + return 0, nil, syserror.EINVAL + } + + n, err := getdents(t, fd, addr, size, (*dirent).Serialize) + return n, nil, err +} + +// Getdents64 implements linux syscall getdents64(2). +func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := int(args[2].Uint()) + + minSize := int(smallestDirent64(t.Arch())) + if size < minSize { + // size is smaller than smallest possible dirent. + return 0, nil, syserror.EINVAL + } + + n, err := getdents(t, fd, addr, size, (*dirent).Serialize64) + return n, nil, err +} + +// getdents implements the core of getdents(2)/getdents64(2). +// f is the syscall implementation dirent serialization function. 
+func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { + dir := t.FDMap().GetFile(fd) + if dir == nil { + return 0, syserror.EBADF + } + defer dir.DecRef() + + w := &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: addr, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + } + + ds := newDirentSerializer(f, w, t.Arch(), size) + rerr := dir.Readdir(t, ds) + + switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err { + case nil: + dir.Dirent.InotifyEvent(syscall.IN_ACCESS, 0) + return uintptr(ds.Written()), nil + case io.EOF: + return 0, nil + default: + return 0, err + } +} + +// oldDirentHdr is a fixed sized header matching the fixed size +// fields found in the old linux dirent struct. +type oldDirentHdr struct { + Ino uint64 + Off uint64 + Reclen uint16 +} + +// direntHdr is a fixed sized header matching the fixed size +// fields found in the new linux dirent struct. +type direntHdr struct { + OldHdr oldDirentHdr + Typ uint8 +} + +// dirent contains the data pointed to by a new linux dirent struct. +type dirent struct { + Hdr direntHdr + Name []byte +} + +// newDirent returns a dirent from an fs.InodeOperationsInfo. +func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent { + d := &dirent{ + Hdr: direntHdr{ + OldHdr: oldDirentHdr{ + Ino: attr.InodeID, + Off: offset, + }, + Typ: toType(attr.Type), + }, + Name: []byte(name), + } + d.Hdr.OldHdr.Reclen = d.padRec(int(width)) + return d +} + +// smallestDirent returns the size of the smallest possible dirent using +// the old linux dirent format. +func smallestDirent(a arch.Context) uint { + d := dirent{} + return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1 +} + +// smallestDirent64 returns the size of the smallest possible dirent using +// the new linux dirent format. 
+func smallestDirent64(a arch.Context) uint { + d := dirent{} + return uint(binary.Size(d.Hdr)) + a.Width() +} + +// toType converts an fs.InodeOperationsInfo to a linux dirent typ field. +func toType(nodeType fs.InodeType) uint8 { + switch nodeType { + case fs.RegularFile, fs.SpecialFile: + return syscall.DT_REG + case fs.Symlink: + return syscall.DT_LNK + case fs.Directory: + return syscall.DT_DIR + case fs.Pipe: + return syscall.DT_FIFO + case fs.CharacterDevice: + return syscall.DT_CHR + case fs.BlockDevice: + return syscall.DT_BLK + case fs.Socket: + return syscall.DT_SOCK + default: + return syscall.DT_UNKNOWN + } +} + +// padRec pads the name field until the rec length is a multiple of the width, +// which must be a power of 2. It returns the padded rec length. +func (d *dirent) padRec(width int) uint16 { + a := int(binary.Size(d.Hdr)) + len(d.Name) + r := (a + width) &^ (width - 1) + padding := r - a + d.Name = append(d.Name, make([]byte, padding)...) + return uint16(r) +} + +// Serialize64 serializes a Dirent struct to a byte slice, keeping the new +// linux dirent format. Returns the number of bytes serialized or an error. +func (d *dirent) Serialize64(w io.Writer) (int, error) { + n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr)) + if err != nil { + return 0, err + } + n2, err := w.Write(d.Name) + if err != nil { + return 0, err + } + return n1 + n2, nil +} + +// Serialize serializes a Dirent struct to a byte slice, using the old linux +// dirent format. +// Returns the number of bytes serialized or an error. 
+func (d *dirent) Serialize(w io.Writer) (int, error) { + n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr)) + if err != nil { + return 0, err + } + n2, err := w.Write(d.Name) + if err != nil { + return 0, err + } + n3, err := w.Write([]byte{d.Hdr.Typ}) + if err != nil { + return 0, err + } + return n1 + n2 + n3, nil +} + +// direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an +// io.Writer. +type direntSerializer struct { + serialize func(*dirent, io.Writer) (int, error) + w io.Writer + // width is the arch native value width. + width uint + // offset is the current dirent offset. + offset uint64 + // written is the total bytes serialized. + written int + // size is the size of the buffer to serialize into. + size int +} + +func newDirentSerializer(f func(d *dirent, w io.Writer) (int, error), w io.Writer, ac arch.Context, size int) *direntSerializer { + return &direntSerializer{ + serialize: f, + w: w, + width: ac.Width(), + size: size, + } +} + +// CopyOut implements fs.InodeOperationsInfoSerializer.CopyOut. +// It serializes and writes the fs.DentAttr to the direntSerializer io.Writer. +func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error { + ds.offset++ + + d := newDirent(ds.width, name, attr, ds.offset) + + // Serialize dirent into a temp buffer. + var b bytes.Buffer + n, err := ds.serialize(d, &b) + if err != nil { + ds.offset-- + return err + } + + // Check that we have enough room remaining to write the dirent. + if n > (ds.size - ds.written) { + ds.offset-- + return io.EOF + } + + // Write out the temp buffer. + if _, err := b.WriteTo(ds.w); err != nil { + ds.offset-- + return err + } + + ds.written += n + return nil +} + +// Written returns the total number of bytes written. 
+func (ds *direntSerializer) Written() int { + return ds.written +} diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go new file mode 100644 index 000000000..4fd0ed794 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -0,0 +1,180 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + // As NGROUPS_MAX in include/uapi/linux/limits.h. + maxNGroups = 65536 +) + +// Getuid implements the Linux syscall getuid. +func Getuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() + return uintptr(ruid), nil, nil +} + +// Geteuid implements the Linux syscall geteuid. +func Geteuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() + return uintptr(euid), nil, nil +} + +// Getresuid implements the Linux syscall getresuid. 
+func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ruidAddr := args[0].Pointer() + euidAddr := args[1].Pointer() + suidAddr := args[2].Pointer() + c := t.Credentials() + ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() + euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() + suid := c.SavedKUID.In(c.UserNamespace).OrOverflow() + if _, err := t.CopyOut(ruidAddr, ruid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(euidAddr, euid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(suidAddr, suid); err != nil { + return 0, nil, err + } + return 0, nil, nil +} + +// Getgid implements the Linux syscall getgid. +func Getgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() + return uintptr(rgid), nil, nil +} + +// Getegid implements the Linux syscall getegid. +func Getegid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() + return uintptr(egid), nil, nil +} + +// Getresgid implements the Linux syscall getresgid. +func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + rgidAddr := args[0].Pointer() + egidAddr := args[1].Pointer() + sgidAddr := args[2].Pointer() + c := t.Credentials() + rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() + egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() + sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow() + if _, err := t.CopyOut(rgidAddr, rgid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(egidAddr, egid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(sgidAddr, sgid); err != nil { + return 0, nil, err + } + return 0, nil, nil +} + +// Setuid implements the Linux syscall setuid. 
+func Setuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	uid := auth.UID(args[0].Int())
+	return 0, nil, t.SetUID(uid)
+}
+
+// Setreuid implements the Linux syscall setreuid.
+func Setreuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ruid := auth.UID(args[0].Int())
+	euid := auth.UID(args[1].Int())
+	return 0, nil, t.SetREUID(ruid, euid)
+}
+
+// Setresuid implements the Linux syscall setresuid.
+func Setresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	ruid := auth.UID(args[0].Int())
+	euid := auth.UID(args[1].Int())
+	suid := auth.UID(args[2].Int())
+	return 0, nil, t.SetRESUID(ruid, euid, suid)
+}
+
+// Setgid implements the Linux syscall setgid.
+func Setgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	gid := auth.GID(args[0].Int())
+	return 0, nil, t.SetGID(gid)
+}
+
+// Setregid implements the Linux syscall setregid.
+func Setregid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	rgid := auth.GID(args[0].Int())
+	egid := auth.GID(args[1].Int())
+	return 0, nil, t.SetREGID(rgid, egid)
+}
+
+// Setresgid implements the Linux syscall setresgid.
+func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	rgid := auth.GID(args[0].Int())
+	egid := auth.GID(args[1].Int())
+	sgid := auth.GID(args[2].Int())
+	return 0, nil, t.SetRESGID(rgid, egid, sgid)
+}
+
+// Getgroups implements the Linux syscall getgroups.
+func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := int(args[0].Int())
+	if size < 0 {
+		return 0, nil, syserror.EINVAL
+	}
+	kgids := t.Credentials().ExtraKGIDs
+	// "If size is zero, list is not modified, but the total number of
+	// supplementary group IDs for the process is returned." - getgroups(2)
+	if size == 0 {
+		return uintptr(len(kgids)), nil, nil
+	}
+	if size < len(kgids) {
+		return 0, nil, syserror.EINVAL
+	}
+	gids := make([]auth.GID, len(kgids))
+	for i, kgid := range kgids {
+		gids[i] = kgid.In(t.UserNamespace()).OrOverflow()
+	}
+	if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil {
+		return 0, nil, err
+	}
+	return uintptr(len(gids)), nil, nil
+}
+
+// Setgroups implements the Linux syscall setgroups.
+func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	size := args[0].Int()
+	if size < 0 || size > maxNGroups {
+		return 0, nil, syserror.EINVAL
+	}
+	if size == 0 {
+		return 0, nil, t.SetExtraGIDs(nil)
+	}
+	gids := make([]auth.GID, size)
+	if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil {
+		return 0, nil, err
+	}
+	return 0, nil, t.SetExtraGIDs(gids)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go
new file mode 100644
index 000000000..725204dff
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_inotify.go
@@ -0,0 +1,135 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+)
+
+const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC)
+
+// InotifyInit1 implements the inotify_init1() syscall.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := int(args[0].Int())
+
+	if flags&^allFlags != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	dirent := fs.NewDirent(anon.NewInode(t), "inotify")
+	fileFlags := fs.FileFlags{
+		Read:        true,
+		Write:       true,
+		NonBlocking: flags&linux.IN_NONBLOCK != 0,
+	}
+	n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t))
+	defer n.DecRef()
+
+	fd, err := t.FDMap().NewFDFrom(0, n, kernel.FDFlags{
+		CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+	}, t.ThreadGroup().Limits())
+
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscall.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	args[0].Value = 0
+	return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) {
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		// Invalid fd.
+		return nil, nil, syscall.EBADF
+	}
+
+	ino, ok := file.FileOperations.(*fs.Inotify)
+	if !ok {
+		// Not an inotify fd.
+		file.DecRef()
+		return nil, nil, syscall.EINVAL
+	}
+
+	return ino, file, nil
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + mask := args[2].Uint() + + // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." + // -- inotify(7) + resolve := mask&linux.IN_DONT_FOLLOW == 0 + + // "EINVAL: The given event mask contains no valid events." + // -- inotify_add_watch(2) + if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { + return 0, nil, syscall.EINVAL + } + + ino, file, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent) error { + // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7) + if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) { + return syscall.ENOTDIR + } + + // Copy out to the return frame. + fd = kdefs.FD(ino.AddWatch(dirent, mask)) + + return nil + }) + return uintptr(fd), nil, err // Return from the existing value. +} + +// InotifyRmWatch implements the inotify_rm_watch() syscall. +func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + wd := args[1].Int() + + ino, file, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + return 0, nil, ino.RmWatch(wd) +} diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go new file mode 100644 index 000000000..97b51ba7c --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Lseek implements linux syscall lseek(2). +func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + offset := args[1].Int64() + whence := args[2].Int() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var sw fs.SeekWhence + switch whence { + case 0: + sw = fs.SeekSet + case 1: + sw = fs.SeekCurrent + case 2: + sw = fs.SeekEnd + default: + return 0, nil, syserror.EINVAL + } + + offset, serr := file.Seek(t, sw, offset) + err := handleIOError(t, false /* partialResult */, serr, kernel.ERESTARTSYS, "lseek", file) + if err != nil { + return 0, nil, err + } + return uintptr(offset), nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go new file mode 100644 index 000000000..2c7d41de0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -0,0 +1,435 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "bytes" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Brk implements linux syscall brk(2). +func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr, _ := t.MemoryManager().Brk(t, args[0].Pointer()) + // "However, the actual Linux system call returns the new program break on + // success. On failure, the system call returns the current break." - + // brk(2) + return uintptr(addr), nil, nil +} + +// Mmap implements linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := kdefs.FD(args[4].Int()) + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. 
+ if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + flags := file.Flags() + // mmap unconditionally requires that the FD is readable. + if !flags.Read { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !flags.Write { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} + +// Munmap implements linux syscall munmap(2). +func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) +} + +// Mremap implements linux syscall mremap(2). 
+func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + oldSize := args[1].Uint64() + newSize := args[2].Uint64() + flags := args[3].Uint64() + newAddr := args[4].Pointer() + + if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { + return 0, nil, syserror.EINVAL + } + mayMove := flags&linux.MREMAP_MAYMOVE != 0 + fixed := flags&linux.MREMAP_FIXED != 0 + var moveMode mm.MRemapMoveMode + switch { + case !mayMove && !fixed: + moveMode = mm.MRemapNoMove + case mayMove && !fixed: + moveMode = mm.MRemapMayMove + case mayMove && fixed: + moveMode = mm.MRemapMustMove + case !mayMove && fixed: + // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be + // specified." - mremap(2) + return 0, nil, syserror.EINVAL + } + + rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ + Move: moveMode, + NewAddr: newAddr, + }) + return uintptr(rv), nil, err +} + +// Mprotect implements linux syscall mprotect(2). +func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + length := args[1].Uint64() + prot := args[2].Int() + err := t.MemoryManager().MProtect(args[0].Pointer(), length, usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, linux.PROT_GROWSDOWN&prot != 0) + return 0, nil, err +} + +// Madvise implements linux syscall madvise(2). +func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := uint64(args[1].SizeT()) + adv := args[2].Int() + + // "The Linux implementation requires that the address addr be + // page-aligned, and allows length to be zero." - madvise(2) + if addr.RoundDown() != addr { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + // Not explicitly stated: length need not be page-aligned. 
+ lenAddr, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.EINVAL + } + length = uint64(lenAddr) + + switch adv { + case linux.MADV_DONTNEED: + return 0, nil, t.MemoryManager().Decommit(addr, length) + case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE: + fallthrough + case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: + fallthrough + case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: + // Do nothing, we totally ignore the suggestions above. + return 0, nil, nil + case linux.MADV_REMOVE, linux.MADV_DOFORK, linux.MADV_DONTFORK: + // These "suggestions" have application-visible side effects, so we + // have to indicate that we don't support them. + return 0, nil, syserror.ENOSYS + case linux.MADV_HWPOISON: + // Only privileged processes are allowed to poison pages. + return 0, nil, syserror.EPERM + default: + // If adv is not a valid value tell the caller. + return 0, nil, syserror.EINVAL + } +} + +func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) { + if ptr != 0 { + return t.CopyOut(ptr, val) + } + return 0, nil +} + +// GetMempolicy implements the syscall get_mempolicy(2). +func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mode := args[0].Pointer() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + addr := args[3].Pointer() + flags := args[4].Uint() + + memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 + nodeFlag := flags&linux.MPOL_F_NODE != 0 + addrFlag := flags&linux.MPOL_F_ADDR != 0 + + // TODO: Once sysfs is implemented, report a single numa node in + // /sys/devices/system/node. + if nodemask != 0 && maxnode < 1 { + return 0, nil, syserror.EINVAL + } + + // 'addr' provided iff 'addrFlag' set. + if addrFlag == (addr == 0) { + return 0, nil, syserror.EINVAL + } + + // Default policy for the thread. 
+ if flags == 0 { + policy, nodemaskVal := t.NumaPolicy() + if _, err := copyOutIfNotNull(t, mode, policy); err != nil { + return 0, nil, syserror.EFAULT + } + if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + // Report all nodes available to caller. + if memsAllowed { + // MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED. + if nodeFlag || addrFlag { + return 0, nil, syserror.EINVAL + } + + // Report a single numa node. + if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + if addrFlag { + if nodeFlag { + // Return the id for the node where 'addr' resides, via 'mode'. + // + // The real get_mempolicy(2) allocates the page referenced by 'addr' + // by simulating a read, if it is unallocated before the call. It + // then returns the node the page is allocated on through the mode + // pointer. + b := t.CopyScratchBuffer(1) + _, err := t.CopyInBytes(addr, b) + if err != nil { + return 0, nil, syserror.EFAULT + } + if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { + return 0, nil, syserror.EFAULT + } + } else { + storedPolicy, _ := t.NumaPolicy() + // Return the policy governing the memory referenced by 'addr'. + if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil { + return 0, nil, syserror.EFAULT + } + } + return 0, nil, nil + } + + storedPolicy, _ := t.NumaPolicy() + if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) { + // Policy for current thread is to interleave memory between + // nodes. Return the next node we'll allocate on. Since we only have a + // single node, this is always node 0. 
+ if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + return 0, nil, syserror.EINVAL +} + +func allowedNodesMask() uint32 { + const maxNodes = 1 + return ^uint32((1 << maxNodes) - 1) +} + +// SetMempolicy implements the syscall set_mempolicy(2). +func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + modeWithFlags := args[0].Int() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + + if maxnode < 1 { + return 0, nil, syserror.EINVAL + } + + if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS { + // Can't specify multiple modes simultaneously. Must also contain a + // valid mode, which we check below. + return 0, nil, syserror.EINVAL + } + + mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS + if mode < 0 || mode >= linux.MPOL_MAX { + return 0, nil, syserror.EINVAL + } + + var nodemaskVal uint32 + if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } + + // When setting MPOL_INTERLEAVE, nodemask must not be empty. + if mode == linux.MPOL_INTERLEAVE && nodemaskVal == 0 { + return 0, nil, syserror.EINVAL + } + + if nodemaskVal&allowedNodesMask() != 0 { + // Invalid node specified. + return 0, nil, syserror.EINVAL + } + + t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal) + + return 0, nil, nil +} + +// Mincore implements the syscall mincore(2). +func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + vec := args[2].Pointer() + + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + // "The length argument need not be a multiple of the page size, but since + // residency information is returned for whole pages, length is effectively + // rounded up to the next multiple of the page size." 
- mincore(2) + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM + } + + // Pretend that all mapped pages are "resident in core". + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + // "ENOMEM: addr to addr + length contained unmapped memory." + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM + } + resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize)) + _, err := t.CopyOut(vec, resident) + return 0, nil, err +} + +// Msync implements Linux syscall msync(2). +func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } + // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, + // and may additionally include the MS_INVALIDATE bit. ... However, Linux + // permits a call to msync() that specifies neither of these flags, with + // semantics that are (currently) equivalent to specifying MS_ASYNC." - + // msync(2) + if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { + return 0, nil, syserror.EINVAL + } + sync := flags&linux.MS_SYNC != 0 + if sync && flags&linux.MS_ASYNC != 0 { + return 0, nil, syserror.EINVAL + } + + // MS_INVALIDATE "asks to invalidate other mappings of the same file (so + // that they can be updated with the fresh values just written)". This is a + // no-op given that shared memory exists. However, MS_INVALIDATE can also + // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, + // and a memory lock exists for the specified address range." 
Given that + // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since + // some user program could be using it for synchronization. + if flags&linux.MS_INVALIDATE != 0 { + return 0, nil, syserror.EINVAL + } + // MS_SYNC "requests an update and waits for it to complete." + if sync { + err := t.MemoryManager().Sync(t, addr, uint64(la)) + // Sync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + // MS_ASYNC "specifies that an update be scheduled, but the call returns + // immediately". As long as dirty pages are tracked and eventually written + // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC + // is in fact a no-op, since the kernel properly tracks dirty pages and + // flushes them to storage as necessary.") + // + // However: "ENOMEM: The indicated memory (or part of it) was not mapped." + // This applies even for MS_ASYNC. + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM + } + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM + } + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go new file mode 100644 index 000000000..d70b79e4f --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -0,0 +1,140 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Mount implements Linux syscall mount(2). +func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sourceAddr := args[0].Pointer() + targetAddr := args[1].Pointer() + typeAddr := args[2].Pointer() + flags := args[3].Uint64() + dataAddr := args[4].Pointer() + + fsType, err := t.CopyInString(typeAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + sourcePath, _, err := copyInPath(t, sourceAddr, true /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + targetPath, _, err := copyInPath(t, targetAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + // In Linux, a full page is always copied in regardless of null + // character placement, and the address is passed to each file system. + // Most file systems always treat this data as a string, though, and so + // do all of the ones we implement. + data, err := t.CopyInString(dataAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + // Ignore magic value that was required before Linux 2.4. + if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL { + flags = flags &^ linux.MS_MGC_MSK + } + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. 
+ if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { + return 0, nil, syserror.EPERM + } + + const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | + linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | + linux.MS_UNBINDABLE | linux.MS_MOVE + + // Silently allow MS_NOSUID, since we don't implement set-id bits + // anyway. + const unsupportedFlags = linux.MS_NODEV | linux.MS_NOEXEC | + linux.MS_NODIRATIME | linux.MS_STRICTATIME + + // Linux just allows passing any flags to mount(2) - it won't fail when + // unknown or unsupported flags are passed. Since we don't implement + // everything, we fail explicitly on flags that are unimplemented. + if flags&(unsupportedOps|unsupportedFlags) != 0 { + return 0, nil, syserror.EINVAL + } + + rsys, ok := fs.FindFilesystem(fsType) + if !ok { + return 0, nil, syserror.ENODEV + } + if !rsys.AllowUserMount() { + return 0, nil, syserror.EPERM + } + + var superFlags fs.MountSourceFlags + if flags&linux.MS_NOATIME == linux.MS_NOATIME { + superFlags.NoAtime = true + } + if flags&linux.MS_RDONLY == linux.MS_RDONLY { + superFlags.ReadOnly = true + } + + rootInode, err := rsys.Mount(t, sourcePath, superFlags, data) + if err != nil { + return 0, nil, syserror.EINVAL + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return t.MountNamespace().Mount(t, d, rootInode) + }) +} + +// Umount2 implements Linux syscall umount2(2). +func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + + const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE + if flags&unsupported != 0 { + return 0, nil, syserror.EINVAL + } + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. 
+ // + // Currently, this is always the init task's user namespace. + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { + return 0, nil, syserror.EPERM + } + + resolve := flags&linux.UMOUNT_NOFOLLOW != linux.UMOUNT_NOFOLLOW + detachOnly := flags&linux.MNT_DETACH == linux.MNT_DETACH + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { + return t.MountNamespace().Unmount(t, d, detachOnly) + }) +} diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go new file mode 100644 index 000000000..3efc06a27 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -0,0 +1,78 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// pipe2 implements the actual system call with flags. 
+func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { + if flags&^(syscall.O_NONBLOCK|syscall.O_CLOEXEC) != 0 { + return 0, syscall.EINVAL + } + r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) + + r.SetFlags(linuxToFlags(flags).Settable()) + defer r.DecRef() + + w.SetFlags(linuxToFlags(flags).Settable()) + defer w.DecRef() + + rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{ + CloseOnExec: flags&syscall.O_CLOEXEC != 0}, + t.ThreadGroup().Limits()) + if err != nil { + return 0, err + } + + wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{ + CloseOnExec: flags&syscall.O_CLOEXEC != 0}, + t.ThreadGroup().Limits()) + if err != nil { + t.FDMap().Remove(rfd) + return 0, err + } + + if _, err := t.CopyOut(addr, []kdefs.FD{rfd, wfd}); err != nil { + t.FDMap().Remove(rfd) + t.FDMap().Remove(wfd) + return 0, syscall.EFAULT + } + return 0, nil +} + +// Pipe implements linux syscall pipe(2). +func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + n, err := pipe2(t, addr, 0) + return n, nil, err +} + +// Pipe2 implements linux syscall pipe2(2). +func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := uint(args[1].Uint()) + + n, err := pipe2(t, addr, flags) + return n, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go new file mode 100644 index 000000000..d4dbfd285 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -0,0 +1,429 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// fileCap is the maximum allowable files for poll & select. +const fileCap = 1024 * 1024 + +// Masks for "readable", "writable", and "exceptional" events as defined by +// select(2). +const ( + // selectReadEvents is analogous to the Linux kernel's + // fs/select.c:POLLIN_SET. + selectReadEvents = waiter.EventIn | waiter.EventHUp | waiter.EventErr + + // selectWriteEvents is analogous to the Linux kernel's + // fs/select.c:POLLOUT_SET. + selectWriteEvents = waiter.EventOut | waiter.EventErr + + // selectExceptEvents is analogous to the Linux kernel's + // fs/select.c:POLLEX_SET. 
+ selectExceptEvents = waiter.EventPri +) + +func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { + if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return timeout, 0, syserror.EINVAL + } + + pfd := make([]syscalls.PollFD, nfds) + if nfds > 0 { + if _, err := t.CopyIn(pfdAddr, &pfd); err != nil { + return timeout, 0, err + } + } + + // Compatibility warning: Linux adds POLLHUP and POLLERR just before + // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after + // polling, changing event masks here is an application-visible difference. + // (Linux also doesn't copy out event masks at all, only revents.) + for i := range pfd { + pfd[i].Events |= waiter.EventHUp | waiter.EventErr + } + remainingTimeout, n, err := syscalls.Poll(t, pfd, timeout) + err = syserror.ConvertIntr(err, syserror.EINTR) + + // The poll entries are copied out regardless of whether + // any are set or not. This aligns with the Linux behavior. + if nfds > 0 && err == nil { + if _, err := t.CopyOut(pfdAddr, pfd); err != nil { + return remainingTimeout, 0, err + } + } + + return remainingTimeout, n, err +} + +func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { + if nfds < 0 || uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return 0, syserror.EINVAL + } + + // Capture all the provided input vectors. + // + // N.B. This only works on little-endian architectures. + byteCount := (nfds + 7) / 8 + bitsInLastPartialByte := uint(nfds % 8) + r := make([]byte, byteCount) + w := make([]byte, byteCount) + e := make([]byte, byteCount) + + if readFDs != 0 { + if _, err := t.CopyIn(readFDs, &r); err != nil { + return 0, err + } + // Mask out bits above nfds. 
+ if bitsInLastPartialByte != 0 { + r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte + } + } + + if writeFDs != 0 { + if _, err := t.CopyIn(writeFDs, &w); err != nil { + return 0, err + } + if bitsInLastPartialByte != 0 { + w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte + } + } + + if exceptFDs != 0 { + if _, err := t.CopyIn(exceptFDs, &e); err != nil { + return 0, err + } + if bitsInLastPartialByte != 0 { + e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte + } + } + + // Count how many FDs are actually being requested so that we can build + // a PollFD array. + fdCount := 0 + for i := 0; i < byteCount; i++ { + v := r[i] | w[i] | e[i] + for v != 0 { + v &= (v - 1) + fdCount++ + } + } + + // Build the PollFD array. + pfd := make([]syscalls.PollFD, 0, fdCount) + fd := kdefs.FD(0) + for i := 0; i < byteCount; i++ { + rV, wV, eV := r[i], w[i], e[i] + v := rV | wV | eV + m := byte(1) + for j := 0; j < 8; j++ { + if (v & m) != 0 { + // Make sure the fd is valid and decrement the reference + // immediately to ensure we don't leak. Note, another thread + // might be about to close fd. This is racy, but that's + // OK. Linux is racy in the same way. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, syserror.EBADF + } + file.DecRef() + + mask := waiter.EventMask(0) + if (rV & m) != 0 { + mask |= selectReadEvents + } + + if (wV & m) != 0 { + mask |= selectWriteEvents + } + + if (eV & m) != 0 { + mask |= selectExceptEvents + } + + pfd = append(pfd, syscalls.PollFD{ + FD: fd, + Events: mask, + }) + } + + fd++ + m <<= 1 + } + } + + // Do the syscall, then count the number of bits set. + _, _, err := syscalls.Poll(t, pfd, timeout) + if err != nil { + return 0, syserror.ConvertIntr(err, syserror.EINTR) + } + + // r, w, and e are currently event mask bitsets; unset bits corresponding + // to events that *didn't* occur. 
+ bitSetCount := uintptr(0) + for idx := range pfd { + events := pfd[idx].REvents + i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) + m := byte(1) << j + if r[i]&m != 0 { + if (events & selectReadEvents) != 0 { + bitSetCount++ + } else { + r[i] &^= m + } + } + if w[i]&m != 0 { + if (events & selectWriteEvents) != 0 { + bitSetCount++ + } else { + w[i] &^= m + } + } + if e[i]&m != 0 { + if (events & selectExceptEvents) != 0 { + bitSetCount++ + } else { + e[i] &^= m + } + } + } + + // Copy updated vectors back. + if readFDs != 0 { + if _, err := t.CopyOut(readFDs, r); err != nil { + return 0, err + } + } + + if writeFDs != 0 { + if _, err := t.CopyOut(writeFDs, w); err != nil { + return 0, err + } + } + + if exceptFDs != 0 { + if _, err := t.CopyOut(exceptFDs, e); err != nil { + return 0, err + } + } + + return bitSetCount, nil +} + +// timeoutRemaining returns the amount of time remaining for the specified +// timeout or 0 if it has elapsed. +// +// startNs must be from CLOCK_MONOTONIC. +func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { + now := t.Kernel().MonotonicClock().Now() + remaining := timeout - now.Sub(startNs) + if remaining < 0 { + remaining = 0 + } + return remaining +} + +// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) + return copyTimespecOut(t, timespecAddr, &tsRemaining) +} + +// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. +// +// startNs must be from CLOCK_MONOTONIC. 
+func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) + return copyTimevalOut(t, timevalAddr, &tvRemaining) +} + +// pollRestartBlock encapsulates the state required to restart poll(2) via +// restart_syscall(2). +type pollRestartBlock struct { + pfdAddr usermem.Addr + nfds uint + timeout time.Duration +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return poll(t, p.pfdAddr, p.nfds, p.timeout) +} + +func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) { + remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) + // On an interrupt poll(2) is restarted with the remaining timeout. + if err == syserror.EINTR { + t.SetSyscallRestartBlock(&pollRestartBlock{ + pfdAddr: pfdAddr, + nfds: nfds, + timeout: remainingTimeout, + }) + return 0, kernel.ERESTART_RESTARTBLOCK + } + return n, err +} + +// Poll implements linux syscall poll(2). +func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timeout := time.Duration(args[2].Int()) * time.Millisecond + n, err := poll(t, pfdAddr, nfds, timeout) + return n, nil, err +} + +// Ppoll implements linux syscall ppoll(2). +func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. 
+ timespecAddr := args[2].Pointer() + maskAddr := args[3].Pointer() + maskSize := uint(args[4].Uint()) + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskAddr != 0 { + mask, err := copyInSigSet(t, maskAddr, maskSize) + if err != nil { + return 0, nil, err + } + + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + + _, n, err := doPoll(t, pfdAddr, nfds, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // doPoll returns EINTR if interrupted, but ppoll is normally restartable + // if interrupted by something other than a signal handled by the + // application (i.e. returns ERESTARTNOHAND). However, if + // copyOutTimespecRemaining failed, then the restarted ppoll would use the + // wrong timeout, so the error should be left as EINTR. + // + // Note that this means that if err is nil but copyErr is not, copyErr is + // ignored. This is consistent with Linux. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Select implements linux syscall select(2). +func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timevalAddr := args[4].Pointer() + + // Use a negative Duration to indicate "no timeout". 
+ timeout := time.Duration(-1) + if timevalAddr != 0 { + timeval, err := copyTimevalIn(t, timevalAddr) + if err != nil { + return 0, nil, err + } + if timeval.Sec < 0 || timeval.Usec < 0 { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(timeval.ToNsecCapped()) + } + startNs := t.Kernel().MonotonicClock().Now() + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Pselect implements linux syscall pselect(2). +func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timespecAddr := args[4].Pointer() + maskWithSizeAddr := args[5].Pointer() + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskWithSizeAddr != 0 { + maskAddr, size, err := copyInSigSetWithSize(t, maskWithSizeAddr) + if err != nil { + return 0, nil, err + } + + if maskAddr != 0 { + mask, err := copyInSigSet(t, maskAddr, size) + if err != nil { + return 0, nil, err + } + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + } + + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // See comment in Ppoll. 
+ if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go new file mode 100644 index 000000000..2ca7471cf --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -0,0 +1,188 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// userSockFprog is equivalent to Linux's struct sock_fprog on amd64. +type userSockFprog struct { + // Len is the length of the filter in BPF instructions. + Len uint16 + + _ [6]byte // padding for alignment + + // Filter is a user pointer to the struct sock_filter array that makes up + // the filter program. Filter is a uint64 rather than a usermem.Addr + // because usermem.Addr is actually uintptr, which is not a fixed-size + // type, and encoding/binary.Read objects to this. + Filter uint64 +} + +// Prctl implements linux syscall prctl(2). +// It has a list of subfunctions which operate on the process. 
The arguments are +// all based on each subfunction. +func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + option := args[0].Int() + + switch option { + case linux.PR_SET_PDEATHSIG: + sig := linux.Signal(args[1].Int()) + if sig != 0 && !sig.IsValid() { + return 0, nil, syscall.EINVAL + } + t.SetParentDeathSignal(sig) + return 0, nil, nil + + case linux.PR_GET_PDEATHSIG: + _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal())) + return 0, nil, err + + case linux.PR_GET_KEEPCAPS: + if t.Credentials().KeepCaps { + return 1, nil, nil + } + + return 0, nil, nil + + case linux.PR_SET_KEEPCAPS: + val := args[1].Int() + // prctl(2): arg2 must be either 0 (permitted capabilities are cleared) + // or 1 (permitted capabilities are kept). + if val == 0 { + t.SetKeepCaps(false) + } else if val == 1 { + t.SetKeepCaps(true) + } else { + return 0, nil, syscall.EINVAL + } + + return 0, nil, nil + + case linux.PR_SET_NAME: + addr := args[1].Pointer() + name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1) + if err != nil && err != syscall.ENAMETOOLONG { + return 0, nil, err + } + t.SetName(name) + + case linux.PR_GET_NAME: + addr := args[1].Pointer() + buf := make([]byte, linux.TASK_COMM_LEN) + len := copy(buf, t.Name()) + if len < linux.TASK_COMM_LEN { + buf[len] = 0 + len++ + } + _, err := t.CopyOut(addr, buf[:len]) + if err != nil { + return 0, nil, err + } + + case linux.PR_SET_MM: + switch args[1].Int() { + case linux.PR_SET_MM_EXE_FILE: + fd := kdefs.FD(args[2].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Are they trying to set exe to a non-file? + if !fs.IsFile(file.Dirent.Inode.StableAttr) { + return 0, nil, syscall.EBADF + } + + // Set the underlying executable. 
+ t.MemoryManager().SetExecutable(file.Dirent) + default: + return 0, nil, syscall.EINVAL + } + + case linux.PR_SET_NO_NEW_PRIVS: + if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { + return 0, nil, syscall.EINVAL + } + // no_new_privs is assumed to always be set. See + // auth.Credentials.UpdateForExec. + return 0, nil, nil + + case linux.PR_GET_NO_NEW_PRIVS: + if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { + return 0, nil, syscall.EINVAL + } + return 1, nil, nil + + case linux.PR_SET_SECCOMP: + if args[1].Int() != linux.SECCOMP_MODE_FILTER { + // Unsupported mode. + return 0, nil, syscall.EINVAL + } + var fprog userSockFprog + if _, err := t.CopyIn(args[2].Pointer(), &fprog); err != nil { + return 0, nil, err + } + filter := make([]linux.BPFInstruction, int(fprog.Len)) + if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { + return 0, nil, err + } + compiledFilter, err := bpf.Compile(filter) + if err != nil { + t.Debugf("Invalid seccomp-bpf filter: %v", err) + return 0, nil, syscall.EINVAL + } + return 0, nil, t.AppendSyscallFilter(compiledFilter) + + case linux.PR_GET_SECCOMP: + return uintptr(t.SeccompMode()), nil, nil + + case linux.PR_CAPBSET_READ: + cp := linux.Capability(args[1].Uint64()) + if !cp.Ok() { + return 0, nil, syscall.EINVAL + } + var rv uintptr + if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { + rv = 1 + } + return rv, nil, nil + + case linux.PR_CAPBSET_DROP: + cp := linux.Capability(args[1].Uint64()) + if !cp.Ok() { + return 0, nil, syscall.EINVAL + } + return 0, nil, t.DropBoundingCapability(cp) + + default: + t.Warningf("Unsupported prctl %d", option) + return 0, nil, syscall.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go new file mode 100644 index 000000000..2dd59b1c3 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -0,0 +1,92 
@@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "crypto/rand" + "io" + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + _GRND_NONBLOCK = 0x1 + _GRND_RANDOM = 0x2 +) + +// GetRandom implements the linux syscall getrandom(2). +// +// In a multi-tenant/shared environment, the only valid implementation is to +// fetch data from the urandom pool, otherwise starvation attacks become +// possible. The urandom pool is also expected to have plenty of entropy, thus +// the GRND_RANDOM flag is ignored. The GRND_NONBLOCK flag does not apply, as +// the pool will already be initialized. +func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + // Flags are checked for validity but otherwise ignored. See above. 
+ if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 { + return 0, nil, syserror.EINVAL + } + + if length > math.MaxInt32 { + length = math.MaxInt32 + } + ar, ok := addr.ToRange(uint64(length)) + if !ok { + return 0, nil, syserror.EFAULT + } + + // "If the urandom source has been initialized, reads of up to 256 bytes + // will always return as many bytes as requested and will not be + // interrupted by signals. No such guarantees apply for larger buffer + // sizes." - getrandom(2) + min := int(length) + if min > 256 { + min = 256 + } + n, err := t.MemoryManager().CopyOutFrom(t, usermem.AddrRangeSeqOf(ar), safemem.FromIOReader{&randReader{-1, min}}, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if n >= int64(min) { + return uintptr(n), nil, nil + } + return 0, nil, err +} + +// randReader is an io.Reader that handles partial reads from rand.Reader. +type randReader struct { + done int + min int +} + +// Read implements io.Reader.Read. +func (r *randReader) Read(dst []byte) (int, error) { + if r.done >= r.min { + return rand.Reader.Read(dst) + } + min := r.min - r.done + if min > len(dst) { + min = len(dst) + } + return io.ReadAtLeast(rand.Reader, dst, min) +} diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go new file mode 100644 index 000000000..0be2d195a --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -0,0 +1,274 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + // EventMaskRead contains events that can be triggered on reads. + EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr +) + +// Read implements linux syscall read(2). Note that we try to get a buffer that +// is exactly the size requested because some applications like qemu expect +// they can do large reads all at once. Bug for bug. Same for other read +// calls below. +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := readv(t, file, dst) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +// Pread64 implements linux syscall pread64(2). 
+func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is reading at an offset supported? + if !file.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := preadv(t, file, dst, offset) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) +} + +// Readv implements linux syscall readv(2). +func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := readv(t, file, dst) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) +} + +// Preadv implements linux syscall preadv(2). 
+func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is reading at an offset supported? + if !file.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := preadv(t, file, dst, offset) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) +} + +func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { + n, err := f.Readv(t, dst) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst64(n) + + // Issue the request and break out if it completes with anything + // other than "would block". + n, err = f.Readv(t, dst) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we read anything. 
+ f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + + return total, err +} + +func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + n, err := f.Preadv(t, dst, offset) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst64(n) + + // Issue the request and break out if it completes with anything + // other than "would block". + n, err = f.Preadv(t, dst, offset+total) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + + return total, err +} diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go new file mode 100644 index 000000000..481e79eaa --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -0,0 +1,217 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// rlimit describes an implementation of 'struct rlimit', which may vary from +// system-to-system. +type rlimit interface { + // toLimit converts an rlimit to a limits.Limit. + toLimit() *limits.Limit + + // fromLimit converts a limits.Limit to an rlimit. + fromLimit(lim limits.Limit) + + // copyIn copies an rlimit from the untrusted app to the kernel. + copyIn(t *kernel.Task, addr usermem.Addr) error + + // copyOut copies an rlimit from the kernel to the untrusted app. + copyOut(t *kernel.Task, addr usermem.Addr) error +} + +// newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system. +func newRlimit(t *kernel.Task) (rlimit, error) { + switch t.Arch().Width() { + case 8: + // On 64-bit system, struct rlimit and struct rlimit64 are identical. + return &rlimit64{}, nil + default: + return nil, syserror.ENOSYS + } +} + +type rlimit64 struct { + Cur uint64 + Max uint64 +} + +func (r *rlimit64) toLimit() *limits.Limit { + return &limits.Limit{ + Cur: limits.FromLinux(r.Cur), + Max: limits.FromLinux(r.Max), + } +} + +func (r *rlimit64) fromLimit(lim limits.Limit) { + *r = rlimit64{ + Cur: limits.ToLinux(lim.Cur), + Max: limits.ToLinux(lim.Max), + } +} + +func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error { + _, err := t.CopyIn(addr, r) + return err +} + +func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error { + _, err := t.CopyOut(addr, *r) + return err +} + +func makeRlimit64(lim limits.Limit) *rlimit64 { + return &rlimit64{Cur: lim.Cur, Max: lim.Max} +} + +// setableLimits is the set of supported setable limits. 
+var setableLimits = map[limits.LimitType]struct{}{ + limits.NumberOfFiles: {}, + limits.AS: {}, + limits.CPU: {}, + limits.Data: {}, + limits.FileSize: {}, + limits.Stack: {}, + // These are not enforced, but we include them here to avoid returning + // EPERM, since some apps expect them to succeed. + limits.Core: {}, + limits.ProcessCount: {}, +} + +func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) (limits.Limit, error) { + if newLim == nil { + return t.ThreadGroup().Limits().Get(resource), nil + } + + if _, ok := setableLimits[resource]; !ok { + return limits.Limit{}, syserror.EPERM + } + oldLim, err := t.ThreadGroup().Limits().Set(resource, *newLim) + if err != nil { + return limits.Limit{}, err + } + + if resource == limits.CPU { + t.ThreadGroup().SetCPUTimer(newLim) + } + return oldLim, nil +} + +// Getrlimit implements linux syscall getrlimit(2). +func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + resource, ok := limits.FromLinuxResource[int(args[0].Int())] + if !ok { + // Return err; unknown limit. + return 0, nil, syserror.EINVAL + } + addr := args[1].Pointer() + rlim, err := newRlimit(t) + if err != nil { + return 0, nil, err + } + lim, err := prlimit64(t, resource, nil) + if err != nil { + return 0, nil, err + } + rlim.fromLimit(lim) + return 0, nil, rlim.copyOut(t, addr) +} + +// Setrlimit implements linux syscall setrlimit(2). +func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + resource, ok := limits.FromLinuxResource[int(args[0].Int())] + if !ok { + // Return err; unknown limit. 
+ return 0, nil, syserror.EINVAL + } + addr := args[1].Pointer() + rlim, err := newRlimit(t) + if err != nil { + return 0, nil, err + } + if err := rlim.copyIn(t, addr); err != nil { + return 0, nil, syserror.EFAULT + } + _, err = prlimit64(t, resource, rlim.toLimit()) + return 0, nil, err +} + +// Prlimit64 implements linux syscall prlimit64(2). +func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + resource, ok := limits.FromLinuxResource[int(args[1].Int())] + if !ok { + // Return err; unknown limit. + return 0, nil, syserror.EINVAL + } + newRlimAddr := args[2].Pointer() + oldRlimAddr := args[3].Pointer() + + var newLim *limits.Limit + if newRlimAddr != 0 { + var nrl rlimit64 + if err := nrl.copyIn(t, newRlimAddr); err != nil { + return 0, nil, syserror.EFAULT + } + newLim = nrl.toLimit() + } + + if tid < 0 { + return 0, nil, syserror.EINVAL + } + ot := t + if tid > 0 { + if ot = t.PIDNamespace().TaskWithID(tid); ot == nil { + return 0, nil, syserror.ESRCH + } + } + + // "To set or get the resources of a process other than itself, the caller + // must have the CAP_SYS_RESOURCE capability, or the real, effective, and + // saved set user IDs of the target process must match the real user ID of + // the caller and the real, effective, and saved set group IDs of the + // target process must match the real group ID of the caller." 
+ if !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) { + cred, tcred := t.Credentials(), ot.Credentials() + if cred.RealKUID != tcred.RealKUID || + cred.RealKUID != tcred.EffectiveKUID || + cred.RealKUID != tcred.SavedKUID || + cred.RealKGID != tcred.RealKGID || + cred.RealKGID != tcred.EffectiveKGID || + cred.RealKGID != tcred.SavedKGID { + return 0, nil, syserror.EPERM + } + } + + oldLim, err := prlimit64(ot, resource, newLim) + if err != nil { + return 0, nil, err + } + + if oldRlimAddr != 0 { + if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil { + return 0, nil, syserror.EFAULT + } + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go new file mode 100644 index 000000000..82e42b589 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -0,0 +1,112 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 

package linux

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// getrusage collects CPU statistics for the scope selected by which
// (RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or RUSAGE_BOTH) and packs
// them into a linux.Rusage.
//
// Any other which value falls through the switch and yields zero CPU stats;
// callers are expected to validate which first.
//
// NOTE(review): only ru_utime, ru_stime, ru_nvcsw, and ru_maxrss are
// populated here. NIvCSw (involuntary context switches) is never set even
// though it is available in the Rusage struct -- usage.CPUStats appears not
// to track it; confirm before documenting it as supported.
func getrusage(t *kernel.Task, which int32) linux.Rusage {
	var cs usage.CPUStats

	switch which {
	case linux.RUSAGE_SELF:
		// Whole thread group, excluding children.
		cs = t.ThreadGroup().CPUStats()

	case linux.RUSAGE_CHILDREN:
		// Waited-for (joined) children only.
		cs = t.ThreadGroup().JoinedChildCPUStats()

	case linux.RUSAGE_THREAD:
		// Just the calling task.
		cs = t.CPUStats()

	case linux.RUSAGE_BOTH:
		// Thread group plus joined children.
		tg := t.ThreadGroup()
		cs = tg.CPUStats()
		cs.Accumulate(tg.JoinedChildCPUStats())
	}

	return linux.Rusage{
		UTime: linux.NsecToTimeval(cs.UserTime.Nanoseconds()),
		STime: linux.NsecToTimeval(cs.SysTime.Nanoseconds()),
		NVCSw: int64(cs.VoluntarySwitches),
		// ru_maxrss is expressed in kilobytes; assumes MaxRSS returns
		// bytes -- TODO confirm.
		MaxRSS: int64(t.MaxRSS(which) / 1024),
	}
}

// Getrusage implements linux syscall getrusage(2).
+// marked "y" are supported now +// marked "*" are not used on Linux +// marked "p" are pending for support +// +// y struct timeval ru_utime; /* user CPU time used */ +// y struct timeval ru_stime; /* system CPU time used */ +// p long ru_maxrss; /* maximum resident set size */ +// * long ru_ixrss; /* integral shared memory size */ +// * long ru_idrss; /* integral unshared data size */ +// * long ru_isrss; /* integral unshared stack size */ +// p long ru_minflt; /* page reclaims (soft page faults) */ +// p long ru_majflt; /* page faults (hard page faults) */ +// * long ru_nswap; /* swaps */ +// p long ru_inblock; /* block input operations */ +// p long ru_oublock; /* block output operations */ +// * long ru_msgsnd; /* IPC messages sent */ +// * long ru_msgrcv; /* IPC messages received */ +// * long ru_nsignals; /* signals received */ +// y long ru_nvcsw; /* voluntary context switches */ +// y long ru_nivcsw; /* involuntary context switches */ +func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + addr := args[1].Pointer() + + if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD { + return 0, nil, syserror.EINVAL + } + + ru := getrusage(t, which) + _, err := t.CopyOut(addr, &ru) + return 0, nil, err +} + +// Times implements linux syscall times(2). +func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + // Calculate the ticks first, and figure out if any additional work is + // necessary. Linux allows for a NULL addr, in which case only the + // return value is meaningful. We don't need to do anything else. 
+ ticks := uintptr(ktime.NowFromContext(t).Nanoseconds() / linux.ClockTick.Nanoseconds()) + if addr == 0 { + return ticks, nil, nil + } + + cs1 := t.ThreadGroup().CPUStats() + cs2 := t.ThreadGroup().JoinedChildCPUStats() + r := linux.Tms{ + UTime: linux.ClockTFromDuration(cs1.UserTime), + STime: linux.ClockTFromDuration(cs1.SysTime), + CUTime: linux.ClockTFromDuration(cs2.UserTime), + CSTime: linux.ClockTFromDuration(cs2.SysTime), + } + if _, err := t.CopyOut(addr, &r); err != nil { + return 0, nil, err + } + + return ticks, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go new file mode 100644 index 000000000..ff9e46077 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +const ( + onlyScheduler = linux.SCHED_NORMAL + onlyPriority = 0 +) + +// SchedParam replicates struct sched_param in sched.h. +type SchedParam struct { + schedPriority int64 +} + +// SchedGetparam implements linux syscall sched_getparam(2). 
+func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + param := args[1].Pointer() + if param == 0 { + return 0, nil, syscall.EINVAL + } + if pid < 0 { + return 0, nil, syscall.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syscall.ESRCH + } + r := SchedParam{schedPriority: onlyPriority} + if _, err := t.CopyOut(param, r); err != nil { + return 0, nil, err + } + + return 0, nil, nil +} + +// SchedGetscheduler implements linux syscall sched_getscheduler(2). +func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + if pid < 0 { + return 0, nil, syscall.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syscall.ESRCH + } + return onlyScheduler, nil, nil +} + +// SchedSetscheduler implements linux syscall sched_setscheduler(2). +func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + policy := args[1].Int() + param := args[2].Pointer() + if pid < 0 { + return 0, nil, syscall.EINVAL + } + if policy != onlyScheduler { + return 0, nil, syscall.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syscall.ESRCH + } + var r SchedParam + if _, err := t.CopyIn(param, &r); err != nil { + return 0, nil, syscall.EINVAL + } + if r.schedPriority != onlyPriority { + return 0, nil, syscall.EINVAL + } + return 0, nil, nil +} + +// SchedGetPriorityMax implements linux syscall sched_get_priority_max(2). +func SchedGetPriorityMax(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return onlyPriority, nil, nil +} + +// SchedGetPriorityMin implements linux syscall sched_get_priority_min(2). 
+func SchedGetPriorityMin(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return onlyPriority, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go new file mode 100644 index 000000000..a8983705b --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -0,0 +1,166 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const opsMax = 500 // SEMOPM + +// Semget handles: semget(key_t key, int nsems, int semflg) +func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + key := args[0].Int() + nsems := args[1].Int() + flag := args[2].Int() + + private := key == linux.IPC_PRIVATE + create := flag&linux.IPC_CREAT == linux.IPC_CREAT + exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL + mode := linux.FileMode(flag & 0777) + + r := t.IPCNamespace().SemaphoreRegistry() + set, err := r.FindOrCreate(t, key, nsems, mode, private, create, exclusive) + if err != nil { + return 0, nil, err + } + return uintptr(set.ID), nil, nil +} + +// Semop handles: semop(int 
semid, struct sembuf *sops, size_t nsops) +func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + sembufAddr := args[1].Pointer() + nsops := args[2].SizeT() + + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, nil, syserror.EINVAL + } + if nsops <= 0 { + return 0, nil, syserror.EINVAL + } + if nsops > opsMax { + return 0, nil, syserror.E2BIG + } + + ops := make([]linux.Sembuf, nsops) + if _, err := t.CopyIn(sembufAddr, ops); err != nil { + return 0, nil, err + } + + creds := auth.CredentialsFromContext(t) + for { + ch, num, err := set.ExecuteOps(t, ops, creds) + if ch == nil || err != nil { + // We're done (either on success or a failure). + return 0, nil, err + } + if err = t.Block(ch); err != nil { + set.AbortWait(num, ch) + return 0, nil, err + } + } +} + +// Semctl handles: semctl(int semid, int semnum, int cmd, ...) +func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + num := args[1].Int() + cmd := args[2].Int() + + switch cmd { + case linux.SETVAL: + val := args[3].Int() + if val > math.MaxInt16 { + return 0, nil, syserror.ERANGE + } + return 0, nil, setVal(t, id, num, int16(val)) + + case linux.GETVAL: + v, err := getVal(t, id, num) + return uintptr(v), nil, err + + case linux.IPC_RMID: + return 0, nil, remove(t, id) + + case linux.IPC_SET: + arg := args[3].Pointer() + s := linux.SemidDS{} + if _, err := t.CopyIn(arg, &s); err != nil { + return 0, nil, err + } + + perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777)) + return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms) + + default: + return 0, nil, syserror.EINVAL + } +} + +func remove(t *kernel.Task, id int32) error { + r := t.IPCNamespace().SemaphoreRegistry() + creds := auth.CredentialsFromContext(t) + return r.RemoveID(id, creds) +} + +func ipcSet(t *kernel.Task, id int32, 
uid auth.UID, gid auth.GID, perms fs.FilePermissions) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + + creds := auth.CredentialsFromContext(t) + kuid := creds.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + kgid := creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + owner := fs.FileOwner{UID: kuid, GID: kgid} + return set.Change(t, creds, owner, perms) +} + +func setVal(t *kernel.Task, id int32, num int32, val int16) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + return set.SetVal(t, num, val, creds) +} + +func getVal(t *kernel.Task, id int32, num int32) (int16, error) { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + return set.GetVal(num, creds) +} diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go new file mode 100644 index 000000000..93b3f531a --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -0,0 +1,553 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package linux + +import ( + "math" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// "For a process to have permission to send a signal it must +// - either be privileged (CAP_KILL), or +// - the real or effective user ID of the sending process must be equal to the +// real or saved set-user-ID of the target process. +// +// In the case of SIGCONT it suffices when the sending and receiving processes +// belong to the same session." - kill(2) +// +// Equivalent to kernel/signal.c:check_kill_permission. +func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool { + // kernel/signal.c:check_kill_permission also allows a signal if the + // sending and receiving tasks share a thread group, which is not + // mentioned in kill(2) since kill does not allow task-level + // granularity in signal sending. + if t.ThreadGroup() == target.ThreadGroup() { + return true + } + + if t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) { + return true + } + + creds := t.Credentials() + tcreds := target.Credentials() + if creds.EffectiveKUID == tcreds.SavedKUID || + creds.EffectiveKUID == tcreds.RealKUID || + creds.RealKUID == tcreds.SavedKUID || + creds.RealKUID == tcreds.RealKUID { + return true + } + + if sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session() { + return true + } + return false +} + +// Kill implements linux syscall kill(2). +func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + + switch { + case pid > 0: + // "If pid is positive, then signal sig is sent to the process with the + // ID specified by pid." - kill(2) + // This loops to handle races with execve where target dies between + // TaskWithID and SendGroupSignal. 
Compare Linux's + // kernel/signal.c:kill_pid_info(). + for { + target := t.PIDNamespace().TaskWithID(pid) + if target == nil { + return 0, nil, syserror.ESRCH + } + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(target.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) + if err := target.SendGroupSignal(info); err != syserror.ESRCH { + return 0, nil, err + } + } + case pid == -1: + // "If pid equals -1, then sig is sent to every process for which the + // calling process has permission to send signals, except for process 1 + // (init), but see below. ... POSIX.1-2001 requires that kill(-1,sig) + // send sig to all processes that the calling process may send signals + // to, except possibly for some implementation-defined system + // processes. Linux allows a process to signal itself, but on Linux the + // call kill(-1,sig) does not signal the calling process." + var ( + lastErr error + delivered int + ) + for _, tg := range t.PIDNamespace().ThreadGroups() { + if tg == t.ThreadGroup() { + continue + } + if t.PIDNamespace().IDOfThreadGroup(tg) == kernel.InitTID { + continue + } + + // If pid == -1, the returned error is the last non-EPERM error + // from any call to group_send_sig_info. + if !mayKill(t, tg.Leader(), sig) { + continue + } + // Here and below, whether or not kill returns an error may + // depend on the iteration order. We at least implement the + // semantics documented by the man page: "On success (at least + // one signal was sent), zero is returned." 
+ info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(tg.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) + err := tg.SendSignal(info) + if err == syserror.ESRCH { + // ESRCH is ignored because it means the task + // exited while we were iterating. This is a + // race which would not normally exist on + // Linux, so we suppress it. + continue + } + delivered++ + if err != nil { + lastErr = err + } + } + if delivered > 0 { + return 0, nil, lastErr + } + return 0, nil, syserror.ESRCH + default: + // "If pid equals 0, then sig is sent to every process in the process + // group of the calling process." + // + // "If pid is less than -1, then sig is sent to every process + // in the process group whose ID is -pid." + pgid := kernel.ProcessGroupID(-pid) + if pgid == 0 { + pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) + } + + // If pid != -1 (i.e. signalling a process group), the returned error + // is the last error from any call to group_send_sig_info. + lastErr := syserror.ESRCH + for _, tg := range t.PIDNamespace().ThreadGroups() { + if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { + if !mayKill(t, tg.Leader(), sig) { + lastErr = syserror.EPERM + continue + } + + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(tg.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) + // See note above regarding ESRCH race above. 
+ if err := tg.SendSignal(info); err != syserror.ESRCH { + lastErr = err + } + } + } + + return 0, nil, lastErr + } +} + +func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoTkill, + } + info.SetPid(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup()))) + info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + return info +} + +// Tkill implements linux syscall tkill(2). +func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tid <=0 by EINVAL. This isn't the same for all signal calls. + if tid <= 0 { + return 0, nil, syserror.EINVAL + } + + target := t.PIDNamespace().TaskWithID(tid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) +} + +// Tgkill implements linux syscall tgkill(2). +func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tgid := kernel.ThreadID(args[0].Int()) + tid := kernel.ThreadID(args[1].Int()) + sig := linux.Signal(args[2].Int()) + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. 
+ if tgid <= 0 || tid <= 0 { + return 0, nil, syserror.EINVAL + } + + targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) + target := t.PIDNamespace().TaskWithID(tid) + if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { + return 0, nil, syserror.ESRCH + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) +} + +// RtSigaction implements linux syscall rt_sigaction(2). +func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sig := linux.Signal(args[0].Int()) + newactarg := args[1].Pointer() + oldactarg := args[2].Pointer() + + var newactptr *arch.SignalAct + if newactarg != 0 { + newact, err := t.CopyInSignalAct(newactarg) + if err != nil { + return 0, nil, err + } + newactptr = &newact + } + oldact, err := t.ThreadGroup().SetSignalAct(sig, newactptr) + if err != nil { + return 0, nil, err + } + if oldactarg != 0 { + if err := t.CopyOutSignalAct(oldactarg, &oldact); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// Sigreturn implements linux syscall sigreturn(2). +func Sigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ctrl, err := t.SignalReturn(false) + return 0, ctrl, err +} + +// RtSigreturn implements linux syscall rt_sigreturn(2). +func RtSigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ctrl, err := t.SignalReturn(true) + return 0, ctrl, err +} + +// RtSigprocmask implements linux syscall rt_sigprocmask(2). 
+func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + how := args[0].Int() + setaddr := args[1].Pointer() + oldaddr := args[2].Pointer() + sigsetsize := args[3].SizeT() + + if sigsetsize != linux.SignalSetSize { + return 0, nil, syserror.EINVAL + } + oldmask := t.SignalMask() + if setaddr != 0 { + mask, err := copyInSigSet(t, setaddr, sigsetsize) + if err != nil { + return 0, nil, err + } + + switch how { + case linux.SIG_BLOCK: + t.SetSignalMask(oldmask | mask) + case linux.SIG_UNBLOCK: + t.SetSignalMask(oldmask &^ mask) + case linux.SIG_SETMASK: + t.SetSignalMask(mask) + default: + return 0, nil, syserror.EINVAL + } + } + if oldaddr != 0 { + return 0, nil, copyOutSigSet(t, oldaddr, oldmask) + } + + return 0, nil, nil +} + +// Sigaltstack implements linux syscall sigaltstack(2). +func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + setaddr := args[0].Pointer() + oldaddr := args[1].Pointer() + + if oldaddr != 0 { + alt := t.SignalStack() + if t.OnSignalStack(alt) { + alt.Flags |= arch.SignalStackFlagOnStack + } + if err := t.CopyOutSignalStack(oldaddr, &alt); err != nil { + return 0, nil, err + } + } + if setaddr != 0 { + if t.OnSignalStack(t.SignalStack()) { + return 0, nil, syserror.EPERM + } + alt, err := t.CopyInSignalStack(setaddr) + if err != nil { + return 0, nil, err + } + if err := t.SetSignalStack(alt); err != nil { + return 0, nil, err + } + } + + return 0, nil, nil +} + +// Pause implements linux syscall pause(2). +func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) +} + +func sigtimedwait(t *kernel.Task, mask linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { + // Is it already pending? 
+ if info := t.TakeSignal(^mask); info != nil { + return info, nil + } + + // No signals available immediately and asked not to wait. + if timeout == 0 { + return nil, syserror.EAGAIN + } + + // No signals available yet. Temporarily unblock the ones we are interested + // in then wait for either a timeout or a new signal. + oldmask := t.SignalMask() + t.SetSignalMask(oldmask &^ mask) + _, err := t.BlockWithTimeout(nil, true, timeout) + t.SetSignalMask(oldmask) + + // How did the wait go? + switch err { + case syserror.ErrInterrupted: + if info := t.TakeSignal(^mask); info != nil { + // Got one of the signals we were waiting for. + return info, nil + } + // Got a signal we weren't waiting for. + return nil, syserror.EINTR + case syserror.ETIMEDOUT: + // Timed out and still no signals. + return nil, syserror.EAGAIN + default: + // Some other error? Shouldn't be possible. The event channel + // passed to BlockWithTimeout was nil, so the only two ways the + // block could've ended are a timeout or an interrupt. + panic("unreachable") + } +} + +// RtSigpending implements linux syscall rt_sigpending(2). +func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + pending := t.PendingSignals() + _, err := t.CopyOut(addr, pending) + return 0, nil, err +} + +// RtSigtimedwait implements linux syscall rt_sigtimedwait(2). 
+func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sigset := args[0].Pointer() + siginfo := args[1].Pointer() + timespec := args[2].Pointer() + sigsetsize := args[3].SizeT() + + mask, err := copyInSigSet(t, sigset, sigsetsize) + if err != nil { + return 0, nil, err + } + + var timeout time.Duration + if timespec != 0 { + d, err := copyTimespecIn(t, timespec) + if err != nil { + return 0, nil, err + } + if !d.Valid() { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(d.ToNsecCapped()) + } else { + timeout = time.Duration(math.MaxInt64) + } + + si, err := sigtimedwait(t, mask, timeout) + if err != nil { + return 0, nil, err + } + + if si != nil { + if siginfo != 0 { + si.FixSignalCodeForUser() + if _, err := t.CopyOut(siginfo, si); err != nil { + return 0, nil, err + } + } + return uintptr(si.Signo), nil, nil + } + + // sigtimedwait's not supposed to return nil si and err... + return 0, nil, nil +} + +// RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2). +func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + infoAddr := args[2].Pointer() + + // Copy in the info. + // + // We must ensure that the Signo is set (Linux overrides this in the + // same way), and that the code is in the allowed set. This same logic + // appears below in RtSigtgqueueinfo and should be kept in sync. + var info arch.SignalInfo + if _, err := t.CopyIn(infoAddr, &info); err != nil { + return 0, nil, err + } + info.Signo = int32(sig) + + // This must loop to handle the race with execve described in Kill. + for { + // Deliver to the given task's thread group. + target := t.PIDNamespace().TaskWithID(pid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + // If the sender is not the receiver, it can't use si_codes used by the + // kernel or SI_TKILL. 
+ if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t { + return 0, nil, syserror.EPERM + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + + if err := target.SendGroupSignal(&info); err != syserror.ESRCH { + return 0, nil, err + } + } +} + +// RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2). +func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tgid := kernel.ThreadID(args[0].Int()) + tid := kernel.ThreadID(args[1].Int()) + sig := linux.Signal(args[2].Int()) + infoAddr := args[3].Pointer() + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. + if tgid <= 0 || tid <= 0 { + return 0, nil, syserror.EINVAL + } + + // Copy in the info. See RtSigqueueinfo above. + var info arch.SignalInfo + if _, err := t.CopyIn(infoAddr, &info); err != nil { + return 0, nil, err + } + info.Signo = int32(sig) + + // Deliver to the given task. + targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) + target := t.PIDNamespace().TaskWithID(tid) + if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { + return 0, nil, syserror.ESRCH + } + + // If the sender is not the receiver, it can't use si_codes used by the + // kernel or SI_TKILL. + if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t { + return 0, nil, syserror.EPERM + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(&info) +} + +// RtSigsuspend implements linux syscall rt_sigsuspend(2). +func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sigset := args[0].Pointer() + + // Copy in the signal mask. + var mask linux.SignalSet + if _, err := t.CopyIn(sigset, &mask); err != nil { + return 0, nil, err + } + mask &^= kernel.UnblockableSignals + + // Swap the mask. 
+ oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + + // Perform the wait. + return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) +} + +// RestartSyscall implements the linux syscall restart_syscall(2). +func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if r := t.SyscallRestartBlock(); r != nil { + n, err := r.Restart(t) + return n, nil, err + } + // The restart block should never be nil here, but it's possible + // ERESTART_RESTARTBLOCK was set by ptrace without the current syscall + // setting up a restart block. If ptrace didn't manipulate the return value, + // finding a nil restart block is a bug. Linux ensures that the restart + // function is never null by (re)initializing it with one that translates + // the restart into EINTR. We'll emulate that behaviour. + t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?") + return 0, nil, syserror.EINTR +} diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go new file mode 100644 index 000000000..3797c0a5d --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -0,0 +1,1059 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 

package linux

import (
	"syscall"
	"time"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/binary"
	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/control"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
)

// minListenBacklog is the minimum reasonable backlog for listening sockets.
const minListenBacklog = 8

// maxListenBacklog is the maximum allowed backlog for listening sockets.
const maxListenBacklog = 1024

// maxAddrLen is the maximum socket address length we're willing to accept.
const maxAddrLen = 200

// maxOptLen is the maximum sockopt parameter length we're willing to accept.
const maxOptLen = 1024

// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
// willing to accept. Note that this limit is smaller than Linux, which allows
// buffers up to INT_MAX.
const maxControlLen = 10 * 1024 * 1024

// nameLenOffset is the offset from the start of the MessageHeader64 struct to
// the NameLen field (NameLen follows the 8-byte Name field).
const nameLenOffset = 8

// controlLenOffset is the offset from the start of the MessageHeader64 struct
// to the ControlLen field (ControlLen follows the 8-byte Control field at
// offset 32).
const controlLenOffset = 40

// messageHeader64Len is the length of a MessageHeader64 struct.
var messageHeader64Len = uint64(binary.Size(MessageHeader64{}))

// multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct.
var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{}))

// MessageHeader64 is the 64-bit representation of the msghdr struct used in
// the recvmsg and sendmsg syscalls.
type MessageHeader64 struct {
	// Name is the optional pointer to a network address buffer.
	Name uint64

	// NameLen is the length of the buffer pointed to by Name.
	NameLen uint32
	_       uint32 // padding; keeps Iov 8-byte aligned

	// Iov is a pointer to an array of io vectors that describe the memory
	// locations involved in the io operation.
	Iov uint64

	// IovLen is the length of the array pointed to by Iov.
	IovLen uint64

	// Control is the optional pointer to ancillary control data.
	Control uint64

	// ControlLen is the length of the data pointed to by Control.
	ControlLen uint64

	// Flags on the sent/received message.
	Flags int32
	_     int32 // padding; rounds the struct up to an 8-byte multiple
}

// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
// the recvmmsg and sendmmsg syscalls.
type multipleMessageHeader64 struct {
	msgHdr MessageHeader64
	msgLen uint32 // bytes transferred for this message
	_      int32  // padding
}

// CopyInMessageHeader64 copies a message header from user to kernel memory.
//
// On success msg holds the decoded msghdr fields; on failure msg is left
// partially unmodified and the copy-in error is returned.
func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error {
	// 52 bytes covers every field through Flags (offset 48 + 4 bytes);
	// the struct's trailing 4 bytes of padding are deliberately not read.
	b := t.CopyScratchBuffer(52)
	if _, err := t.CopyInBytes(addr, b); err != nil {
		return err
	}

	// Decode each field at its fixed offset in the 64-bit msghdr layout.
	msg.Name = usermem.ByteOrder.Uint64(b[0:])
	msg.NameLen = usermem.ByteOrder.Uint32(b[8:])
	msg.Iov = usermem.ByteOrder.Uint64(b[16:])
	msg.IovLen = usermem.ByteOrder.Uint64(b[24:])
	msg.Control = usermem.ByteOrder.Uint64(b[32:])
	msg.ControlLen = usermem.ByteOrder.Uint64(b[40:])
	msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:]))

	return nil
}

// CaptureAddress allocates memory for and copies a socket address structure
// from the untrusted address space range.
+func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) {
+	if addrlen > maxAddrLen {
+		return nil, syscall.EINVAL
+	}
+
+	addrBuf := make([]byte, addrlen)
+	if _, err := t.CopyInBytes(addr, addrBuf); err != nil {
+		return nil, err
+	}
+
+	return addrBuf, nil
+}
+
+// writeAddress writes a sockaddr structure and its length to an output buffer
+// in the untrusted address space range. If the address is bigger than the
+// buffer, it is truncated.
+func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error {
+	// Get the buffer length.
+	var bufLen uint32
+	if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil {
+		return err
+	}
+
+	if int32(bufLen) < 0 { // a length that is negative when read as signed is invalid.
+		return syscall.EINVAL
+	}
+
+	// Write the length unconditionally.
+	if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil {
+		return err
+	}
+
+	if addr == nil {
+		return nil
+	}
+
+	if bufLen > addrLen {
+		bufLen = addrLen
+	}
+
+	// Copy as much of the address as will fit in the buffer.
+	encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr)
+	if bufLen > uint32(len(encodedAddr)) {
+		bufLen = uint32(len(encodedAddr))
+	}
+	_, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)])
+	return err
+}
+
+// Socket implements the linux syscall socket(2).
+func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+
+	// Check and initialize the flags.
+	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Create the new socket.
+	s, e := socket.New(t, domain, unix.SockType(stype&0xf), protocol)
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+	s.SetFlags(fs.SettableFileFlags{
+		NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
+	})
+	defer s.DecRef()
+
+	fd, err := t.FDMap().NewFDFrom(0, s, kernel.FDFlags{
+		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+	}, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// SocketPair implements the linux syscall socketpair(2).
+func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	domain := int(args[0].Int())
+	stype := args[1].Int()
+	protocol := int(args[2].Int())
+	socks := args[3].Pointer()
+
+	// Check and initialize the flags.
+	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	fileFlags := fs.SettableFileFlags{
+		NonBlocking: stype&linux.SOCK_NONBLOCK != 0,
+	}
+	fdFlags := kernel.FDFlags{
+		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
+	}
+
+	// Create the socket pair.
+	s1, s2, e := socket.Pair(t, domain, unix.SockType(stype&0xf), protocol)
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+	s1.SetFlags(fileFlags)
+	s2.SetFlags(fileFlags)
+	defer s1.DecRef()
+	defer s2.DecRef()
+
+	// Create the FDs for the sockets.
+	fd1, err := t.FDMap().NewFDFrom(0, s1, fdFlags, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+	fd2, err := t.FDMap().NewFDFrom(0, s2, fdFlags, t.ThreadGroup().Limits())
+	if err != nil {
+		t.FDMap().Remove(fd1)
+		return 0, nil, err
+	}
+
+	// Copy the file descriptors out.
+	if _, err := t.CopyOut(socks, []int32{int32(fd1), int32(fd2)}); err != nil {
+		t.FDMap().Remove(fd1)
+		t.FDMap().Remove(fd2)
+		return 0, nil, err
+	}
+
+	return 0, nil, nil
+}
+
+// Connect implements the linux syscall connect(2).
+func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Capture address and call syscall implementation.
+	a, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	blocking := !file.Flags().NonBlocking
+	return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS)
+}
+
+// accept is the implementation of the accept syscall. It is called by accept
+// and accept4 syscall handlers.
+func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) {
+	// Check that no unsupported flags are passed in.
+	if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
+		return 0, syscall.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, syscall.ENOTSOCK
+	}
+
+	// Call the syscall implementation for this socket, then copy the
+	// output address if one is specified.
+	blocking := !file.Flags().NonBlocking
+
+	peerRequested := addrLen != 0
+	nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	if peerRequested {
+		// NOTE: Linux does not give you an error if it can't
+		// write the data back out so neither do we.
+		if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL { // only EINVAL is propagated.
+			return 0, err
+		}
+	}
+	return uintptr(nfd), nil
+}
+
+// Accept4 implements the linux syscall accept4(2).
+func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+	flags := int(args[3].Int())
+
+	n, err := accept(t, fd, addr, addrlen, flags)
+	return n, nil, err
+}
+
+// Accept implements the linux syscall accept(2).
+func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	n, err := accept(t, fd, addr, addrlen, 0)
+	return n, nil, err
+}
+
+// Bind implements the linux syscall bind(2).
+func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Uint()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Capture address and call syscall implementation.
+	a, err := CaptureAddress(t, addr, addrlen)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, s.Bind(t, a).ToError()
+}
+
+// Listen implements the linux syscall listen(2).
+func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	backlog := args[1].Int()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Per Linux, the backlog is silently capped to reasonable values.
+	if backlog <= 0 {
+		backlog = minListenBacklog
+	}
+	if backlog > maxListenBacklog {
+		backlog = maxListenBacklog
+	}
+
+	return 0, nil, s.Listen(t, int(backlog)).ToError()
+}
+
+// Shutdown implements the linux syscall shutdown(2).
+func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	how := args[1].Int()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Validate how, then call syscall implementation.
+	switch how {
+	case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
+	default:
+		return 0, nil, syscall.EINVAL
+	}
+
+	return 0, nil, s.Shutdown(t, int(how)).ToError()
+}
+
+// GetSockOpt implements the linux syscall getsockopt(2).
+func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLenAddr := args[4].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Read the length if present. Reject negative values.
+	optLen := int32(0)
+	if optLenAddr != 0 {
+		if _, err := t.CopyIn(optLenAddr, &optLen); err != nil {
+			return 0, nil, err
+		}
+
+		if optLen < 0 {
+			return 0, nil, syscall.EINVAL
+		}
+	}
+
+	// Call syscall implementation then copy both value and value len out.
+	v, e := s.GetSockOpt(t, int(level), int(name), int(optLen))
+	if e != nil {
+		return 0, nil, e.ToError()
+	}
+
+	if optLenAddr != 0 {
+		vLen := int32(binary.Size(v))
+		if _, err := t.CopyOut(optLenAddr, vLen); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if v != nil {
+		if _, err := t.CopyOut(optValAddr, v); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	return 0, nil, nil
+}
+
+// SetSockOpt implements the linux syscall setsockopt(2).
+//
+// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
+func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	level := args[1].Int()
+	name := args[2].Int()
+	optValAddr := args[3].Pointer()
+	optLen := args[4].Int()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	if optLen <= 0 {
+		return 0, nil, syscall.EINVAL
+	}
+	if optLen > maxOptLen {
+		return 0, nil, syscall.EINVAL
+	}
+	buf := make([]byte, optLen)
+	if _, err := t.CopyIn(optValAddr, &buf); err != nil {
+		return 0, nil, err
+	}
+
+	// Call syscall implementation.
+	if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, nil
+}
+
+// GetSockName implements the linux syscall getsockname(2).
+func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Get the socket name and copy it to the caller.
+	v, vl, err := s.GetSockName(t)
+	if err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// GetPeerName implements the linux syscall getpeername(2).
+func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	addrlen := args[2].Pointer()
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Get the socket peer name and copy it to the caller.
+	v, vl, err := s.GetPeerName(t)
+	if err != nil {
+		return 0, nil, err.ToError()
+	}
+
+	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
+}
+
+// RecvMsg implements the linux syscall recvmsg(2).
+func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := recvSingleMsg(t, s, msgPtr, flags, false, ktime.Time{})
+	return n, nil, err
+}
+
+// RecvMMsg implements the linux syscall recvmmsg(2).
+func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+	toPtr := args[4].Pointer()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_TRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+	if toPtr != 0 {
+		ts, err := copyTimespecIn(t, toPtr)
+		if err != nil {
+			return 0, nil, err
+		}
+		if !ts.Valid() {
+			return 0, nil, syscall.EINVAL
+		}
+		deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
+		haveDeadline = true
+	}
+
+	if !haveDeadline {
+		dl := s.RecvTimeout()
+		if dl != 0 {
+			deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+			haveDeadline = true
+		}
+	}
+
+	var count uint32
+	var err error
+	for i := uint64(0); i < uint64(vlen); i++ {
+		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syscall.EFAULT
+		}
+		var n uintptr
+		if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil {
+			break
+		}
+
+		// Copy the received length to the caller.
+		lp, ok := mp.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syscall.EFAULT
+		}
+		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+			break
+		}
+		count++
+	}
+
+	if count == 0 {
+		return 0, nil, err
+	}
+	return uintptr(count), nil, nil
+}
+
+func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
+	// Capture the message header and io vectors.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syscall.EMSGSIZE
+	}
+	dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// FIXME: Pretend we have an empty error queue.
+	if flags&linux.MSG_ERRQUEUE != 0 {
+		return 0, syscall.EAGAIN
+	}
+
+	// Fast path when no control message nor name buffers are provided.
+	if msg.ControlLen == 0 && msg.NameLen == 0 {
+		n, _, _, _, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
+		if err != nil {
+			return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS)
+		}
+		return uintptr(n), nil
+	}
+
+	if msg.ControlLen > maxControlLen {
+		return 0, syscall.ENOBUFS
+	}
+	n, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+	defer cms.Release()
+
+	controlData := make([]byte, 0, msg.ControlLen)
+
+	if cr, ok := s.(unix.Credentialer); ok && cr.Passcred() {
+		creds, _ := cms.Credentials.(control.SCMCredentials)
+		controlData = control.PackCredentials(t, creds, controlData)
+	}
+
+	if cms.Rights != nil {
+		controlData = control.PackRights(t, cms.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData)
+	}
+
+	// Copy the address to the caller.
+	if msg.NameLen != 0 {
+		if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy the control data to the caller.
+	if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
+		return 0, err
+	}
+	if len(controlData) > 0 {
+		if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	return uintptr(n), nil
+}
+
+// recvFrom is the implementation of the recvfrom syscall. It is called by
+// recvfrom and recv syscall handlers.
+func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) {
+	if int(bufLen) < 0 {
+		return 0, syscall.EINVAL
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC) != 0 {
+		return 0, syscall.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, syscall.ENOTSOCK
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	var haveDeadline bool
+	var deadline ktime.Time
+
+	if dl := s.RecvTimeout(); dl != 0 {
+		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
+		haveDeadline = true
+	}
+
+	n, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
+	cm.Release()
+	if e != nil {
+		return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
+	}
+
+	// Copy the address to the caller.
+	if nameLenPtr != 0 {
+		if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
+			return 0, err
+		}
+	}
+
+	return uintptr(n), nil
+}
+
+// RecvFrom implements the linux syscall recvfrom(2).
+func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLenPtr := args[5].Pointer()
+
+	n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
+	return n, nil, err
+}
+
+// SendMsg implements the linux syscall sendmsg(2).
+func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	msgPtr := args[1].Pointer()
+	flags := args[2].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	n, err := sendSingleMsg(t, s, file, msgPtr, flags)
+	return n, nil, err
+}
+
+// SendMMsg implements the linux syscall sendmmsg(2).
+func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	msgPtr := args[1].Pointer()
+	vlen := args[2].Uint()
+	flags := args[3].Int()
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		return 0, nil, syscall.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, nil, syscall.ENOTSOCK
+	}
+
+	// Reject flags that we don't handle yet.
+	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
+		return 0, nil, syscall.EINVAL
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	var count uint32
+	var err error
+	for i := uint64(0); i < uint64(vlen); i++ {
+		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
+		if !ok {
+			return 0, nil, syscall.EFAULT
+		}
+		var n uintptr
+		if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
+			break
+		}
+
+		// Copy the sent length to the caller.
+		lp, ok := mp.AddLength(messageHeader64Len)
+		if !ok {
+			return 0, nil, syscall.EFAULT
+		}
+		if _, err = t.CopyOut(lp, uint32(n)); err != nil {
+			break
+		}
+		count++
+	}
+
+	if count == 0 {
+		return 0, nil, err
+	}
+	return uintptr(count), nil, nil
+}
+
+func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) {
+	// Capture the message header.
+	var msg MessageHeader64
+	if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil {
+		return 0, err
+	}
+
+	var controlData []byte
+	if msg.ControlLen > 0 {
+		// Put an upper bound to prevent large allocations.
+		if msg.ControlLen > maxControlLen {
+			return 0, syscall.ENOBUFS
+		}
+		controlData = make([]byte, msg.ControlLen)
+		if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil {
+			return 0, err
+		}
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	if msg.NameLen != 0 {
+		var err error
+		to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	// Read data then call the sendmsg implementation.
+	if msg.IovLen > linux.UIO_MAXIOV {
+		return 0, syscall.EMSGSIZE
+	}
+	src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	controlMessages, err := control.Parse(t, s, controlData)
+	if err != nil {
+		return 0, err
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), controlMessages)
+	err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
+	if err != nil {
+		controlMessages.Release() // released only on error; presumably SendMsg takes ownership on success — TODO(review): confirm.
+	}
+	return uintptr(n), err
+}
+
+// sendTo is the implementation of the sendto syscall. It is called by sendto
+// and send syscall handlers.
+func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) {
+	bl := int(bufLen)
+	if bl < 0 { // length overflows the native int.
+		return 0, syscall.EINVAL
+	}
+
+	// Get socket from the file descriptor.
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, syscall.EBADF
+	}
+	defer file.DecRef()
+
+	// Extract the socket.
+	s, ok := file.FileOperations.(socket.Socket)
+	if !ok {
+		return 0, syscall.ENOTSOCK
+	}
+
+	if file.Flags().NonBlocking {
+		flags |= linux.MSG_DONTWAIT
+	}
+
+	// Read the destination address if one is specified.
+	var to []byte
+	var err error
+	if namePtr != 0 {
+		to, err = CaptureAddress(t, namePtr, nameLen)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	if err != nil {
+		return 0, err
+	}
+
+	// Call the syscall implementation.
+	n, e := s.SendMsg(t, src, to, int(flags), control.New(t, s, nil)) // sendto(2) carries no control messages.
+	return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file)
+}
+
+// SendTo implements the linux syscall sendto(2).
+func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	bufPtr := args[1].Pointer()
+	bufLen := args[2].Uint64()
+	flags := args[3].Int()
+	namePtr := args[4].Pointer()
+	nameLen := args[5].Uint()
+
+	n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
+	return n, nil, err
+}
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
new file mode 100644
index 000000000..6e21b34fd
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -0,0 +1,209 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Stat implements linux syscall stat(2).
+func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		return stat(t, d, dirPath, statAddr)
+	})
+}
+
+// Fstatat implements linux syscall newfstatat, i.e. fstatat(2).
+func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	addr := args[1].Pointer()
+	statAddr := args[2].Pointer()
+	flags := args[3].Int()
+
+	path, dirPath, err := copyInPath(t, addr, flags&linux.AT_EMPTY_PATH != 0)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if path == "" {
+		// Annoying. What's wrong with fstat?
+		file := t.FDMap().GetFile(fd)
+		if file == nil {
+			return 0, nil, syserror.EBADF
+		}
+		defer file.DecRef()
+
+		return 0, nil, stat(t, file.Dirent, false /* dirPath */, statAddr)
+	}
+
+	return 0, nil, fileOpOn(t, fd, path, flags&linux.AT_SYMLINK_NOFOLLOW == 0, func(root *fs.Dirent, d *fs.Dirent) error {
+		return stat(t, d, dirPath, statAddr)
+	})
+}
+
+// Lstat implements linux syscall lstat(2).
+func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	statAddr := args[1].Pointer()
+
+	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		return stat(t, d, dirPath, statAddr)
+	})
+}
+
+// Fstat implements linux syscall fstat(2).
+func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	statAddr := args[1].Pointer()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, stat(t, file.Dirent, false /* dirPath */, statAddr)
+}
+
+// stat implements stat from the given *fs.Dirent.
+func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) error {
+	if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+		return syserror.ENOTDIR
+	}
+	uattr, err := d.Inode.UnstableAttr(t)
+	if err != nil {
+		return err
+	}
+
+	// Map the internal inode type onto the Linux file-type mode bits.
+	var mode uint32
+	switch d.Inode.StableAttr.Type {
+	case fs.RegularFile, fs.SpecialFile:
+		mode |= linux.ModeRegular
+	case fs.Symlink:
+		mode |= linux.ModeSymlink
+	case fs.Directory, fs.SpecialDirectory:
+		mode |= linux.ModeDirectory
+	case fs.Pipe:
+		mode |= linux.ModeNamedPipe
+	case fs.CharacterDevice:
+		mode |= linux.ModeCharacterDevice
+	case fs.BlockDevice:
+		mode |= linux.ModeBlockDevice
+	case fs.Socket:
+		mode |= linux.ModeSocket
+	}
+
+	_, err = t.CopyOut(statAddr, linux.Stat{
+		Dev:     uint64(d.Inode.StableAttr.DeviceID),
+		Rdev:    uint64(linux.MakeDeviceID(d.Inode.StableAttr.DeviceFileMajor, d.Inode.StableAttr.DeviceFileMinor)),
+		Ino:     uint64(d.Inode.StableAttr.InodeID),
+		Nlink:   uattr.Links,
+		Mode:    mode | uint32(uattr.Perms.LinuxMode()),
+		UID:     uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+		GID:     uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+		Size:    uattr.Size,
+		Blksize: d.Inode.StableAttr.BlockSize,
+		Blocks:  uattr.Usage / 512, // stat(2) reports st_blocks in 512-byte units.
+		ATime:   uattr.AccessTime.Timespec(),
+		MTime:   uattr.ModificationTime.Timespec(),
+		CTime:   uattr.StatusChangeTime.Timespec(),
+	})
+	return err
+}
+
+// Statfs implements linux syscall statfs(2).
+func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	statfsAddr := args[1].Pointer()
+
+	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+		return statfsImpl(t, d, statfsAddr)
+	})
+}
+
+// Fstatfs implements linux syscall fstatfs(2).
+func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := kdefs.FD(args[0].Int())
+	statfsAddr := args[1].Pointer()
+
+	file := t.FDMap().GetFile(fd)
+	if file == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	return 0, nil, statfsImpl(t, file.Dirent, statfsAddr)
+}
+
+// statfsImpl implements the linux syscall statfs and fstatfs based on a Dirent,
+// copying the statfs structure out to addr on success, otherwise an error is
+// returned.
+func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
+	info, err := d.Inode.StatFS(t)
+	if err != nil {
+		return err
+	}
+	// Construct the statfs structure and copy it out.
+	statfs := linux.Statfs{
+		Type: info.Type,
+		// Treat block size and fragment size as the same, as
+		// most consumers of this structure will expect one
+		// or the other to be filled in.
+		BlockSize: d.Inode.StableAttr.BlockSize,
+		Blocks:    info.TotalBlocks,
+		// We don't have the concept of reserved blocks, so
+		// report blocks free the same as available blocks.
+		// This is a normal thing for filesystems to do; see
+		// udf, hugetlbfs, tmpfs, among others.
+		BlocksFree:      info.FreeBlocks,
+		BlocksAvailable: info.FreeBlocks,
+		Files:           info.TotalFiles,
+		FilesFree:       info.FreeFiles,
+		// Same as Linux for simple_statfs, see fs/libfs.c.
+		NameLength:   syscall.PathMax,
+		FragmentSize: d.Inode.StableAttr.BlockSize,
+		// Leave other fields 0 like simple_statfs does.
+	}
+	if _, err := t.CopyOut(addr, &statfs); err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go
new file mode 100644
index 000000000..902d210db
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_sync.go
@@ -0,0 +1,75 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Sync implements linux system call sync(2). +func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.MountNamespace().SyncAll(t) + // Sync is always successful. + return 0, nil, nil +} + +// Syncfs implements linux system call syncfs(2). +func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Use "sync-the-world" for now, it's guaranteed that fd is at least + // on the root filesystem. + return Sync(t, args) +} + +// Fsync implements linux syscall fsync(2). +func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Fdatasync implements linux syscall fdatasync(2). +// +// At the moment, it just calls Fsync, which is a big hammer, but correct. 
+func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go new file mode 100644 index 000000000..bd0ffcd5c --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -0,0 +1,42 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo. +func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + mem := t.Kernel().Platform.Memory() + mem.UpdateUsage() + _, totalUsage := usage.MemoryAccounting.Copy() + totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage) + + // Only a subset of the fields in sysinfo_t make sense to return. 
+ si := linux.Sysinfo{ + Procs: uint16(len(t.PIDNamespace().Tasks())), + Uptime: t.Kernel().MonotonicClock().Now().Seconds(), + TotalRAM: totalSize, + FreeRAM: totalSize - totalUsage, + } + _, err := t.CopyOut(addr, si) + return 0, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go new file mode 100644 index 000000000..792040c81 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + _SYSLOG_ACTION_READ_ALL = 3 + _SYSLOG_ACTION_SIZE_BUFFER = 10 +) + +// logBufLen is the default syslog buffer size on Linux. +const logBufLen = 1 << 17 + +// Syslog implements part of Linux syscall syslog. +// +// Only the unpriviledged commands are implemented, allowing applications to +// read a fun dmesg. 
+func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + command := args[0].Int() + buf := args[1].Pointer() + size := int(args[2].Int()) + + switch command { + case _SYSLOG_ACTION_READ_ALL: + if size < 0 { + return 0, nil, syserror.EINVAL + } + if size > logBufLen { + size = logBufLen + } + + log := t.Kernel().Syslog().Log() + if len(log) > size { + log = log[:size] + } + + n, err := t.CopyOutBytes(buf, log) + return uintptr(n), nil, err + case _SYSLOG_ACTION_SIZE_BUFFER: + return logBufLen, nil, nil + default: + return 0, nil, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go new file mode 100644 index 000000000..0adbf160f --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -0,0 +1,704 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + // ExecMaxTotalSize is the maximum length of all argv and envv entries. + // + // N.B. The behavior here is different than Linux. 
Linux provides a limit on + // individual arguments of 32 pages, and an aggregate limit of at least 32 pages + // but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement + // any behavior based on the stack size, and instead provide a fixed hard-limit of + // 2 MB (which should work well given that 8 MB stack limits are common). + ExecMaxTotalSize = 2 * 1024 * 1024 + + // ExecMaxElemSize is the maximum length of a single argv or envv entry. + ExecMaxElemSize = 32 * usermem.PageSize + + // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux. + exitSignalMask = 0xff +) + +// Possible values for the idtype argument to waitid(2), defined in Linux's +// include/uapi/linux/wait.h. +const ( + _P_ALL = 0 + _P_PID = 1 + _P_PGID = 2 +) + +// Getppid implements linux syscall getppid(2). +func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + parent := t.Parent() + if parent == nil { + return 0, nil, nil + } + return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil +} + +// Getpid implements linux syscall getpid(2). +func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return uintptr(t.ThreadGroup().ID()), nil, nil +} + +// Gettid implements linux syscall gettid(2). +func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return uintptr(t.ThreadID()), nil, nil +} + +// Execve implements linux syscall execve(2). +func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + argvAddr := args[1].Pointer() + envvAddr := args[2].Pointer() + + // Extract our arguments. 
+ filename, err := t.CopyInString(filenameAddr, syscall.PathMax) + if err != nil { + return 0, nil, err + } + + var argv, envv []string + if argvAddr != 0 { + var err error + argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + if envvAddr != 0 { + var err error + envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + + root := t.FSContext().RootDirectory() + defer root.DecRef() + wd := t.FSContext().WorkingDirectory() + defer wd.DecRef() + + // Load the new TaskContext. + tc, err := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, linux.MaxSymlinkTraversals, filename, argv, envv, t.Arch().FeatureSet()) + if err != nil { + return 0, nil, err + } + + ctrl, err := t.Execve(tc) + return 0, ctrl, err +} + +// Exit implements linux syscall exit(2). +func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + status := int(args[0].Int()) + t.PrepareExit(kernel.ExitStatus{Code: status}) + return 0, kernel.CtrlDoExit, nil +} + +// ExitGroup implements linux syscall exit_group(2). +func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + status := int(args[0].Int()) + t.PrepareGroupExit(kernel.ExitStatus{Code: status}) + return 0, kernel.CtrlDoExit, nil +} + +// clone is used by Clone, Fork, and VFork. 
+func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) { + opts := kernel.CloneOptions{ + SharingOptions: kernel.SharingOptions{ + NewAddressSpace: flags&syscall.CLONE_VM == 0, + NewSignalHandlers: flags&syscall.CLONE_SIGHAND == 0, + NewThreadGroup: flags&syscall.CLONE_THREAD == 0, + TerminationSignal: linux.Signal(flags & exitSignalMask), + NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, + NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, + NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, + NewFiles: flags&syscall.CLONE_FILES == 0, + NewFSContext: flags&syscall.CLONE_FS == 0, + NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, + NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + }, + Stack: stack, + SetTLS: flags&syscall.CLONE_SETTLS == syscall.CLONE_SETTLS, + TLS: tls, + ChildClearTID: flags&syscall.CLONE_CHILD_CLEARTID == syscall.CLONE_CHILD_CLEARTID, + ChildSetTID: flags&syscall.CLONE_CHILD_SETTID == syscall.CLONE_CHILD_SETTID, + ChildTID: childTID, + ParentSetTID: flags&syscall.CLONE_PARENT_SETTID == syscall.CLONE_PARENT_SETTID, + ParentTID: parentTID, + Vfork: flags&syscall.CLONE_VFORK == syscall.CLONE_VFORK, + Untraced: flags&syscall.CLONE_UNTRACED == syscall.CLONE_UNTRACED, + InheritTracer: flags&syscall.CLONE_PTRACE == syscall.CLONE_PTRACE, + } + ntid, ctrl, err := t.Clone(&opts) + return uintptr(ntid), ctrl, err +} + +// Clone implements linux syscall clone(2). +// sys_clone has so many flavors. 
We implement the default one in the +// current linux 3.11 x86_64: +// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val) +func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := int(args[0].Int()) + stack := args[1].Pointer() + parentTID := args[2].Pointer() + childTID := args[3].Pointer() + tls := args[4].Pointer() + return clone(t, flags, stack, parentTID, childTID, tls) +} + +// Fork implements Linux syscall fork(2). +func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // "A call to fork() is equivalent to a call to clone(2) specifying flags + // as just SIGCHLD." - fork(2) + return clone(t, int(syscall.SIGCHLD), 0, 0, 0, 0) +} + +// Vfork implements Linux syscall vfork(2). +func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // """ + // A call to vfork() is equivalent to calling clone(2) with flags specified as: + // + // CLONE_VM | CLONE_VFORK | SIGCHLD + // """ - vfork(2) + return clone(t, syscall.CLONE_VM|syscall.CLONE_VFORK|int(syscall.SIGCHLD), 0, 0, 0, 0) +} + +// wait4 waits for the given child process to exit. 
// wait4 waits for a child process matching pid/options to change state.
//
// statusAddr and rusageAddr are optional; address 0 skips the
// corresponding copy-out. On success the reaped task's TID is returned.
func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) {
	// Reject any option bits outside the supported set.
	if options&^(syscall.WNOHANG|syscall.WUNTRACED|syscall.WCONTINUED|syscall.WALL|syscall.WCLONE) != 0 {
		return 0, syscall.EINVAL
	}
	wopts := kernel.WaitOptions{
		Events:       kernel.EventExit | kernel.EventTraceeStop,
		ConsumeEvent: true,
	}
	// There are four cases to consider:
	//
	// pid < -1    any child process whose process group ID is equal to the absolute value of pid
	// pid == -1   any child process
	// pid == 0    any child process whose process group ID is equal to that of the calling process
	// pid > 0     the child whose process ID is equal to the value of pid
	switch {
	case pid < -1:
		wopts.SpecificPGID = kernel.ProcessGroupID(-pid)
	case pid == -1:
		// Any process is the default.
	case pid == 0:
		wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
	default:
		wopts.SpecificTID = kernel.ThreadID(pid)
	}

	// WCLONE/WALL select which kinds of children are eligible.
	switch options & (syscall.WCLONE | syscall.WALL) {
	case 0:
		wopts.NonCloneTasks = true
	case syscall.WCLONE:
		wopts.CloneTasks = true
	case syscall.WALL:
		wopts.NonCloneTasks = true
		wopts.CloneTasks = true
	default:
		// WCLONE and WALL set together is invalid.
		return 0, syscall.EINVAL
	}
	if options&syscall.WUNTRACED != 0 {
		wopts.Events |= kernel.EventChildGroupStop
	}
	if options&syscall.WCONTINUED != 0 {
		wopts.Events |= kernel.EventGroupContinue
	}
	if options&syscall.WNOHANG == 0 {
		// Block until an event arrives; restart on interruption.
		wopts.BlockInterruptErr = kernel.ERESTARTSYS
	}

	wr, err := t.Wait(&wopts)
	if err != nil {
		if err == kernel.ErrNoWaitableEvent {
			// WNOHANG with nothing waitable: return 0, not an error.
			return 0, nil
		}
		return 0, err
	}
	if statusAddr != 0 {
		if _, err := t.CopyOut(statusAddr, wr.Status); err != nil {
			return 0, err
		}
	}
	if rusageAddr != 0 {
		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
		if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
			return 0, err
		}
	}
	return uintptr(wr.TID), nil
}
// Wait4 implements linux syscall wait4(2).
func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := int(args[0].Int())
	statusAddr := args[1].Pointer()
	options := int(args[2].Uint())
	rusageAddr := args[3].Pointer()

	n, err := wait4(t, pid, statusAddr, options, rusageAddr)
	return n, nil, err
}

// WaitPid implements linux syscall waitpid(2).
//
// Same as wait4(2) without the rusage argument.
func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := int(args[0].Int())
	statusAddr := args[1].Pointer()
	options := int(args[2].Uint())

	n, err := wait4(t, pid, statusAddr, options, 0)
	return n, nil, err
}

// Waitid implements linux syscall waitid(2).
func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	idtype := args[0].Int()
	id := args[1].Int()
	infop := args[2].Pointer()
	options := int(args[3].Uint())
	rusageAddr := args[4].Pointer()

	// Reject unsupported option bits; at least one of
	// WEXITED/WSTOPPED/WCONTINUED must be given.
	if options&^(syscall.WNOHANG|syscall.WEXITED|syscall.WSTOPPED|syscall.WCONTINUED|syscall.WNOWAIT) != 0 {
		return 0, nil, syscall.EINVAL
	}
	if options&(syscall.WEXITED|syscall.WSTOPPED|syscall.WCONTINUED) == 0 {
		return 0, nil, syscall.EINVAL
	}
	wopts := kernel.WaitOptions{
		NonCloneTasks: true,
		Events:        kernel.EventTraceeStop,
		// WNOWAIT leaves the child in a waitable state.
		ConsumeEvent: options&syscall.WNOWAIT == 0,
	}
	switch idtype {
	case _P_ALL:
	case _P_PID:
		wopts.SpecificTID = kernel.ThreadID(id)
	case _P_PGID:
		wopts.SpecificPGID = kernel.ProcessGroupID(id)
	default:
		return 0, nil, syscall.EINVAL
	}
	if options&syscall.WEXITED != 0 {
		wopts.Events |= kernel.EventExit
	}
	if options&syscall.WSTOPPED != 0 {
		wopts.Events |= kernel.EventChildGroupStop
	}
	if options&syscall.WCONTINUED != 0 {
		wopts.Events |= kernel.EventGroupContinue
	}
	if options&syscall.WNOHANG == 0 {
		wopts.BlockInterruptErr = kernel.ERESTARTSYS
	}

	wr, err := t.Wait(&wopts)
	if err != nil {
		if err == kernel.ErrNoWaitableEvent {
			err = nil
			// "If WNOHANG was specified in options and there were no children
			// in a waitable state, then waitid() returns 0 immediately and the
			// state of the siginfo_t structure pointed to by infop is
			// unspecified." - waitid(2). But Linux's waitid actually zeroes
			// out the fields it would set for a successful waitid in this case
			// as well.
			if infop != 0 {
				var si arch.SignalInfo
				_, err = t.CopyOut(infop, &si)
			}
		}
		return 0, nil, err
	}
	if rusageAddr != 0 {
		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
		if _, err := t.CopyOut(rusageAddr, &ru); err != nil {
			return 0, nil, err
		}
	}
	if infop == 0 {
		return 0, nil, nil
	}
	// Fill in the siginfo_t describing the wait result.
	si := arch.SignalInfo{
		Signo: int32(syscall.SIGCHLD),
	}
	si.SetPid(int32(wr.TID))
	si.SetUid(int32(wr.UID))
	// TODO: convert kernel.ExitStatus to functions and make
	// WaitResult.Status a linux.WaitStatus
	s := syscall.WaitStatus(wr.Status)
	switch {
	case s.Exited():
		si.Code = arch.CLD_EXITED
		si.SetStatus(int32(s.ExitStatus()))
	case s.Signaled():
		si.Code = arch.CLD_KILLED
		si.SetStatus(int32(s.Signal()))
	case s.CoreDump():
		si.Code = arch.CLD_DUMPED
		si.SetStatus(int32(s.Signal()))
	case s.Stopped():
		if wr.Event == kernel.EventTraceeStop {
			si.Code = arch.CLD_TRAPPED
			si.SetStatus(int32(s.TrapCause()))
		} else {
			si.Code = arch.CLD_STOPPED
			si.SetStatus(int32(s.StopSignal()))
		}
	case s.Continued():
		si.Code = arch.CLD_CONTINUED
		si.SetStatus(int32(syscall.SIGCONT))
	default:
		t.Warningf("waitid got incomprehensible wait status %d", s)
	}
	_, err = t.CopyOut(infop, &si)
	return 0, nil, err
}

// SetTidAddress implements linux syscall set_tid_address(2).
func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	// Always succeed, return caller's tid.
	t.SetClearTID(addr)
	return uintptr(t.ThreadID()), nil, nil
}
+func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + opts := kernel.SharingOptions{ + NewAddressSpace: flags&syscall.CLONE_VM == syscall.CLONE_VM, + NewSignalHandlers: flags&syscall.CLONE_SIGHAND == syscall.CLONE_SIGHAND, + NewThreadGroup: flags&syscall.CLONE_THREAD == syscall.CLONE_THREAD, + NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, + NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, + NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, + NewFiles: flags&syscall.CLONE_FILES == syscall.CLONE_FILES, + NewFSContext: flags&syscall.CLONE_FS == syscall.CLONE_FS, + NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, + NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + } + // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) + if opts.NewPIDNamespace { + opts.NewThreadGroup = true + } + // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since + // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." + if opts.NewUserNamespace { + opts.NewThreadGroup = true + opts.NewFSContext = true + } + return 0, nil, t.Unshare(&opts) +} + +// SchedYield implements linux syscall sched_yield(2). +func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.Yield() + return 0, nil, nil +} + +// SchedSetaffinity implements linux syscall sched_setaffinity(2). 
// SchedSetaffinity implements linux syscall sched_setaffinity(2).
func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := args[0].Int()
	size := args[1].SizeT()
	maskAddr := args[2].Pointer()

	// tid == 0 means the calling task.
	var task *kernel.Task
	if tid == 0 {
		task = t
	} else {
		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
		if task == nil {
			return 0, nil, syserror.ESRCH
		}
	}

	mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
	// Only bytes that fit in the kernel mask are read; any excess in
	// the user's buffer is ignored.
	if size > mask.Size() {
		size = mask.Size()
	}
	if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil {
		return 0, nil, err
	}
	return 0, nil, task.SetCPUMask(mask)
}

// SchedGetaffinity implements linux syscall sched_getaffinity(2).
func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := args[0].Int()
	size := args[1].SizeT()
	maskAddr := args[2].Pointer()

	// This limitation is because linux stores the cpumask
	// in an array of "unsigned long" so the buffer needs to
	// be a multiple of the word size.
	if size&(t.Arch().Width()-1) > 0 {
		return 0, nil, syserror.EINVAL
	}

	// tid == 0 means the calling task.
	var task *kernel.Task
	if tid == 0 {
		task = t
	} else {
		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
		if task == nil {
			return 0, nil, syserror.ESRCH
		}
	}

	mask := task.CPUMask()
	// The buffer needs to be big enough to hold a cpumask with
	// all possible cpus.
	if size < mask.Size() {
		return 0, nil, syserror.EINVAL
	}
	_, err := t.CopyOutBytes(maskAddr, mask)

	// NOTE: The syscall interface is slightly different than the glibc
	// interface. The raw sched_getaffinity syscall returns the number of
	// bytes used to represent a cpu mask.
	return uintptr(mask.Size()), nil, err
}

// Getcpu implements linux syscall getcpu(2).
func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	cpu := args[0].Pointer()
	node := args[1].Pointer()
	// third argument to this system call is nowadays unused.

	// Both output pointers are optional; address 0 skips the write.
	if cpu != 0 {
		buf := t.CopyScratchBuffer(4)
		usermem.ByteOrder.PutUint32(buf, uint32(t.CPU()))
		if _, err := t.CopyOutBytes(cpu, buf); err != nil {
			return 0, nil, err
		}
	}
	// We always return node 0.
	if node != 0 {
		if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{
			AddressSpaceActive: true,
		}); err != nil {
			return 0, nil, err
		}
	}
	return 0, nil, nil
}

// Setpgid implements the linux syscall setpgid(2).
func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// Note that throughout this function, pgid is interpreted with respect
	// to t's namespace, not with respect to the selected ThreadGroup's
	// namespace (which may be different).
	pid := kernel.ThreadID(args[0].Int())
	pgid := kernel.ProcessGroupID(args[1].Int())

	// "If pid is zero, then the process ID of the calling process is used."
	tg := t.ThreadGroup()
	if pid != 0 {
		ot := t.PIDNamespace().TaskWithID(pid)
		if ot == nil {
			return 0, nil, syserror.ESRCH
		}
		tg = ot.ThreadGroup()
		// Only thread group leaders may have their PGID changed.
		if tg.Leader() != ot {
			return 0, nil, syserror.EINVAL
		}

		// Setpgid only operates on child threadgroups.
		if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) {
			return 0, nil, syserror.ESRCH
		}
	}

	// "If pgid is zero, then the PGID of the process specified by pid is made
	// the same as its process ID."
	defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg))
	if pgid == 0 {
		pgid = defaultPGID
	} else if pgid < 0 {
		return 0, nil, syserror.EINVAL
	}

	// If the pgid is the same as the group, then create a new one. Otherwise,
	// we attempt to join an existing process group.
	if pgid == defaultPGID {
		// For convenience, errors line up with Linux syscall API.
		if err := tg.CreateProcessGroup(); err != nil {
			// Is the process group already as expected? If so,
			// just return success. This is the same behavior as
			// Linux.
			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID {
				return 0, nil, nil
			}
			return 0, nil, err
		}
	} else {
		// Same as CreateProcessGroup, above.
		if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil {
			// See above.
			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
				return 0, nil, nil
			}
			return 0, nil, err
		}
	}

	// Success.
	return 0, nil, nil
}

// Getpgrp implements the linux syscall getpgrp(2).
func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil
}

// Getpgid implements the linux syscall getpgid(2).
func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := kernel.ThreadID(args[0].Int())
	// tid == 0 means the calling process.
	if tid == 0 {
		return Getpgrp(t, args)
	}

	target := t.PIDNamespace().TaskWithID(tid)
	if target == nil {
		return 0, nil, syserror.ESRCH
	}

	return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil
}

// Setsid implements the linux syscall setsid(2).
func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, t.ThreadGroup().CreateSession()
}

// Getsid implements the linux syscall getsid(2).
func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := kernel.ThreadID(args[0].Int())
	// tid == 0 means the calling process.
	if tid == 0 {
		return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil
	}

	target := t.PIDNamespace().TaskWithID(tid)
	if target == nil {
		return 0, nil, syserror.ESRCH
	}

	return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil
}
+// +// This is a stub; real priorities require a full scheduler. +func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + who := kernel.ThreadID(args[1].Int()) + + switch which { + case syscall.PRIO_PROCESS: + // Look for who, return ESRCH if not found. + var task *kernel.Task + if who == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(who) + } + + if task == nil { + return 0, nil, syscall.ESRCH + } + + // From kernel/sys.c:getpriority: + // "To avoid negative return values, 'getpriority()' + // will not return the normal nice-value, but a negated + // value that has been offset by 20" + return uintptr(20 - task.Niceness()), nil, nil + case syscall.PRIO_USER: + fallthrough + case syscall.PRIO_PGRP: + // PRIO_USER and PRIO_PGRP have no further implementation yet. + return 0, nil, nil + default: + return 0, nil, syscall.EINVAL + } +} + +// Setpriority pretends to implement the linux syscall setpriority(2). +// +// This is a stub; real priorities require a full scheduler. +func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + who := kernel.ThreadID(args[1].Int()) + niceval := int(args[2].Int()) + + // In the kernel's implementation, values outside the range + // of [-20, 19] are truncated to these minimum and maximum + // values. + if niceval < -20 /* min niceval */ { + niceval = -20 + } else if niceval > 19 /* max niceval */ { + niceval = 19 + } + + switch which { + case syscall.PRIO_PROCESS: + // Look for who, return ESRCH if not found. + var task *kernel.Task + if who == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(who) + } + + if task == nil { + return 0, nil, syscall.ESRCH + } + + task.SetNiceness(niceval) + case syscall.PRIO_USER: + fallthrough + case syscall.PRIO_PGRP: + // PRIO_USER and PRIO_PGRP have no further implementation yet. 
+ return 0, nil, nil + default: + return 0, nil, syscall.EINVAL + } + + return 0, nil, nil +} + +// Ptrace implements linux system call ptrace(2). +func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + req := args[0].Int64() + pid := kernel.ThreadID(args[1].Int()) + addr := args[2].Pointer() + data := args[3].Pointer() + + return 0, nil, t.Ptrace(req, pid, addr, data) +} diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go new file mode 100644 index 000000000..dcee694b2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -0,0 +1,338 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// The most significant 29 bits hold either a pid or a file descriptor. +func pidOfClockID(c int32) kernel.ThreadID { + return kernel.ThreadID(^(c >> 3)) +} + +// whichCPUClock returns one of CPUCLOCK_PERF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or +// CLOCK_FD. 
// whichCPUClock returns the clock component of a CPU clock id: one of
// CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or CLOCK_FD.
func whichCPUClock(c int32) int32 {
	return c & linux.CPUCLOCK_CLOCK_MASK
}

// isCPUClockPerThread returns true if the CPUCLOCK_PERTHREAD bit is set in the
// clock id.
func isCPUClockPerThread(c int32) bool {
	return c&linux.CPUCLOCK_PERTHREAD_MASK != 0
}

// isValidCPUClock checks that the cpu clock id is valid.
func isValidCPUClock(c int32) bool {
	// Bits 0, 1, and 2 cannot all be set.
	if c&7 == 7 {
		return false
	}
	if whichCPUClock(c) >= linux.CPUCLOCK_MAX {
		return false
	}
	return true
}

// targetTask returns the kernel.Task for the given clock id, or nil if no
// such task exists.
func targetTask(t *kernel.Task, c int32) *kernel.Task {
	pid := pidOfClockID(c)
	// Encoded PID 0 refers to the calling task.
	if pid == 0 {
		return t
	}
	return t.PIDNamespace().TaskWithID(pid)
}

// ClockGetres implements linux syscall clock_getres(2).
func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	clockID := int32(args[0].Int())
	addr := args[1].Pointer()
	// Every supported clock reports a resolution of 1ns.
	r := linux.Timespec{
		Sec:  0,
		Nsec: 1,
	}

	// Validate the clock id even when the result is not copied out.
	if _, err := getClock(t, clockID); err != nil {
		return 0, nil, syserror.EINVAL
	}

	if addr == 0 {
		// Don't need to copy out.
		return 0, nil, nil
	}

	return 0, nil, copyTimespecOut(t, addr, &r)
}

// cpuClocker is the common interface through which getClock reads the CPU
// clocks of either a single task or a whole thread group.
type cpuClocker interface {
	UserCPUClock() ktime.Clock
	CPUClock() ktime.Clock
}

// getClock maps a clockid_t to a ktime.Clock, returning EINVAL for unknown
// or unlocatable clocks. Negative ids encode per-task/per-process CPU
// clocks.
func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) {
	if clockID < 0 {
		if !isValidCPUClock(clockID) {
			return nil, syserror.EINVAL
		}

		targetTask := targetTask(t, clockID)
		if targetTask == nil {
			return nil, syserror.EINVAL
		}

		// Select the task's or the whole thread group's clocks,
		// depending on CPUCLOCK_PERTHREAD.
		var target cpuClocker
		if isCPUClockPerThread(clockID) {
			target = targetTask
		} else {
			target = targetTask.ThreadGroup()
		}

		switch whichCPUClock(clockID) {
		case linux.CPUCLOCK_VIRT:
			return target.UserCPUClock(), nil
		case linux.CPUCLOCK_PROF, linux.CPUCLOCK_SCHED:
			// CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF.
			return target.CPUClock(), nil
		default:
			return nil, syserror.EINVAL
		}
	}

	switch clockID {
	case linux.CLOCK_REALTIME, linux.CLOCK_REALTIME_COARSE:
		return t.Kernel().RealtimeClock(), nil
	case linux.CLOCK_MONOTONIC, linux.CLOCK_MONOTONIC_COARSE, linux.CLOCK_MONOTONIC_RAW:
		// CLOCK_MONOTONIC approximates CLOCK_MONOTONIC_RAW.
		return t.Kernel().MonotonicClock(), nil
	case linux.CLOCK_PROCESS_CPUTIME_ID:
		return t.ThreadGroup().CPUClock(), nil
	case linux.CLOCK_THREAD_CPUTIME_ID:
		return t.CPUClock(), nil
	default:
		return nil, syserror.EINVAL
	}
}

// ClockGettime implements linux syscall clock_gettime(2).
func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	clockID := int32(args[0].Int())
	addr := args[1].Pointer()

	c, err := getClock(t, clockID)
	if err != nil {
		return 0, nil, err
	}
	ts := c.Now().Timespec()
	return 0, nil, copyTimespecOut(t, addr, &ts)
}

// ClockSettime implements linux syscall clock_settime(2).
//
// Setting clocks is never permitted.
func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, syserror.EPERM
}

// Time implements linux syscall time(2).
func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	r := t.Kernel().RealtimeClock().Now().TimeT()
	// A null address means the result is only returned, not stored.
	if addr == usermem.Addr(0) {
		return uintptr(r), nil, nil
	}

	if _, err := t.CopyOut(addr, r); err != nil {
		return 0, nil, err
	}
	return uintptr(r), nil, nil
}

// clockNanosleepRestartBlock encapsulates the state required to restart
// clock_nanosleep(2) via restart_syscall(2).
type clockNanosleepRestartBlock struct {
	c        ktime.Clock    // clock to sleep against
	duration time.Duration  // remaining sleep duration
	rem      usermem.Addr   // user address for the remaining-time copy-out (may be 0)
}

// Restart implements kernel.SyscallRestartBlock.Restart.
+func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return 0, clockNanosleepFor(t, n.c, n.duration, n.rem) +} + +// clockNanosleepUntil blocks until a specified time. +// +// If blocking is interrupted, the syscall is restarted with the original +// arguments. +func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(c, notifier) + + // Turn on the timer. + timer.Swap(ktime.Setting{ + Period: 0, + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + + err := t.BlockWithTimer(nil, tchan) + + timer.Destroy() + + // Did we just block until the timeout happened? + if err == syserror.ETIMEDOUT { + return nil + } + + return syserror.ConvertIntr(err, kernel.ERESTARTNOHAND) +} + +// clockNanosleepFor blocks for a specified duration. +// +// If blocking is interrupted, the syscall is restarted with the remaining +// duration timeout. +func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem usermem.Addr) error { + timer, start, tchan := ktime.After(c, dur) + + err := t.BlockWithTimer(nil, tchan) + + after := c.Now() + + timer.Destroy() + + var remaining time.Duration + // Did we just block for the entire duration? + if err == syserror.ETIMEDOUT { + remaining = 0 + } else { + remaining = dur - after.Sub(start) + if remaining < 0 { + remaining = time.Duration(0) + } + } + + // Copy out remaining time. + if err != nil && rem != usermem.Addr(0) { + timeleft := linux.NsecToTimespec(remaining.Nanoseconds()) + if err := copyTimespecOut(t, rem, &timeleft); err != nil { + return err + } + } + + // Did we just block for the entire duration? + if err == syserror.ETIMEDOUT { + return nil + } + + // If interrupted, arrange for a restart with the remaining duration. 
+ if err == syserror.ErrInterrupted { + t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{ + c: c, + duration: remaining, + rem: rem, + }) + return kernel.ERESTART_RESTARTBLOCK + } + + return err +} + +// Nanosleep implements linux syscall Nanosleep(2). +func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + rem := args[1].Pointer() + + ts, err := copyTimespecIn(t, addr) + if err != nil { + return 0, nil, err + } + + if !ts.Valid() { + return 0, nil, syserror.EINVAL + } + + // Just like linux, we cap the timeout with the max number that int64 can + // represent which is roughly 292 years. + dur := time.Duration(ts.ToNsecCapped()) * time.Nanosecond + return 0, nil, clockNanosleepFor(t, t.Kernel().MonotonicClock(), dur, rem) +} + +// ClockNanosleep implements linux syscall clock_nanosleep(2). +func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := int32(args[0].Int()) + flags := args[1].Int() + addr := args[2].Pointer() + rem := args[3].Pointer() + + req, err := copyTimespecIn(t, addr) + if err != nil { + return 0, nil, err + } + + if !req.Valid() { + return 0, nil, syserror.EINVAL + } + + // Only allow clock constants also allowed by Linux. + if clockID > 0 { + if clockID != linux.CLOCK_REALTIME && + clockID != linux.CLOCK_MONOTONIC && + clockID != linux.CLOCK_PROCESS_CPUTIME_ID { + return 0, nil, syserror.EINVAL + } + } + + c, err := getClock(t, clockID) + if err != nil { + return 0, nil, err + } + + if flags&linux.TIMER_ABSTIME != 0 { + return 0, nil, clockNanosleepUntil(t, c, req) + } + + dur := time.Duration(req.ToNsecCapped()) * time.Nanosecond + return 0, nil, clockNanosleepFor(t, c, dur, rem) +} + +// Gettimeofday implements linux syscall gettimeofday(2). 
+func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tv := args[0].Pointer() + tz := args[1].Pointer() + + if tv != usermem.Addr(0) { + nowTv := t.Kernel().RealtimeClock().Now().Timeval() + if err := copyTimevalOut(t, tv, &nowTv); err != nil { + return 0, nil, err + } + } + + if tz != usermem.Addr(0) { + // Ask the time package for the timezone. + _, offset := time.Now().Zone() + // This int32 array mimics linux's struct timezone. + timezone := [2]int32{-int32(offset) / 60, 0} + _, err := t.CopyOut(tz, timezone) + return 0, nil, err + } + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go new file mode 100644 index 000000000..4ed077626 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -0,0 +1,168 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// ItimerType denotes the type of interval timer. +type ItimerType int + +// Interval timer types from . +const ( + // ItimerReal equals to ITIMER_REAL. + ItimerReal ItimerType = iota + // ItimerVirtual equals to ITIMER_VIRTUAL. 
+ ItimerVirtual + // ItimerProf equals to ITIMER_PROF. + ItimerProf +) + +const nsecPerSec = int64(time.Second) + +// copyItimerValIn copies an ItimerVal from the untrusted app range to the +// kernel. The ItimerVal may be either 32 or 64 bits. +// A NULL address is allowed because because Linux allows +// setitimer(which, NULL, &old_value) which disables the timer. +// There is a KERN_WARN message saying this misfeature will be removed. +// However, that hasn't happened as of 3.19, so we continue to support it. +func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) { + if addr == usermem.Addr(0) { + return linux.ItimerVal{}, nil + } + + switch t.Arch().Width() { + case 8: + // Native size, just copy directly. + var itv linux.ItimerVal + if _, err := t.CopyIn(addr, &itv); err != nil { + return linux.ItimerVal{}, err + } + + return itv, nil + default: + return linux.ItimerVal{}, syscall.ENOSYS + } +} + +// copyItimerValOut copies an ItimerVal to the untrusted app range. +// The ItimerVal may be either 32 or 64 bits. +// A NULL address is allowed, in which case no copy takes place +func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error { + if addr == usermem.Addr(0) { + return nil + } + + switch t.Arch().Width() { + case 8: + // Native size, just copy directly. + _, err := t.CopyOut(addr, itv) + return err + default: + return syscall.ENOSYS + } +} + +func findTimer(t *kernel.Task, w ItimerType) (*ktime.Timer, error) { + switch w { + case ItimerReal: + return t.ThreadGroup().Timer().RealTimer, nil + case ItimerVirtual: + return t.ThreadGroup().Timer().VirtualTimer, nil + case ItimerProf: + return t.ThreadGroup().Timer().ProfTimer, nil + default: + return nil, syscall.EINVAL + } +} + +// Getitimer implements linux syscall getitimer(2). 
+func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := ItimerType(args[0].Int()) + val := args[1].Pointer() + + timer, err := findTimer(t, timerID) + if err != nil { + return 0, nil, err + } + value, interval := ktime.SpecFromSetting(timer.Get()) + olditv := linux.ItimerVal{ + Value: linux.DurationToTimeval(value), + Interval: linux.DurationToTimeval(interval), + } + + return 0, nil, copyItimerValOut(t, val, &olditv) +} + +// Setitimer implements linux syscall setitimer(2). +func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := ItimerType(args[0].Int()) + newVal := args[1].Pointer() + oldVal := args[2].Pointer() + + timer, err := findTimer(t, timerID) + if err != nil { + return 0, nil, err + } + + itv, err := copyItimerValIn(t, newVal) + if err != nil { + return 0, nil, err + } + // Just like linux, we cap the timer value and interval with the max + // number that int64 can represent which is roughly 292 years. + s, err := ktime.SettingFromSpec(itv.Value.ToDuration(), + itv.Interval.ToDuration(), timer.Clock()) + if err != nil { + return 0, nil, err + } + + valueNS, intervalNS := ktime.SpecFromSetting(timer.Swap(s)) + olditv := linux.ItimerVal{ + Value: linux.DurationToTimeval(valueNS), + Interval: linux.DurationToTimeval(intervalNS), + } + + return 0, nil, copyItimerValOut(t, oldVal, &olditv) +} + +// Alarm implements linux syscall alarm(2). +func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + duration := time.Duration(args[0].Uint()) * time.Second + + timer := t.ThreadGroup().Timer().RealTimer + s, err := ktime.SettingFromSpec(duration, 0, timer.Clock()) + if err != nil { + return 0, nil, err + } + + value, _ := ktime.SpecFromSetting(timer.Swap(s)) + sec := int64(value) / nsecPerSec + nsec := int64(value) % nsecPerSec + // We can't return 0 if we have an alarm pending ... 
+ if (sec == 0 && nsec > 0) || nsec >= nsecPerSec/2 { + sec++ + } + + return uintptr(sec), nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go new file mode 100644 index 000000000..cb81d42b9 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -0,0 +1,135 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TimerfdCreate implements Linux syscall timerfd_create(2). 
+func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := args[0].Int() + flags := args[1].Int() + + if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { + return 0, nil, syserror.EINVAL + } + + var c ktime.Clock + switch clockID { + case linux.CLOCK_REALTIME: + c = t.Kernel().RealtimeClock() + case linux.CLOCK_MONOTONIC: + c = t.Kernel().MonotonicClock() + default: + return 0, nil, syserror.EINVAL + } + f := timerfd.NewFile(t, c) + defer f.DecRef() + f.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags&linux.TFD_NONBLOCK != 0, + }) + + fd, err := t.FDMap().NewFDFrom(0, f, kernel.FDFlags{ + CloseOnExec: flags&linux.TFD_CLOEXEC != 0, + }, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// TimerfdSettime implements Linux syscall timerfd_settime(2). +func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + flags := args[1].Int() + newValAddr := args[2].Pointer() + oldValAddr := args[3].Pointer() + + if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { + return 0, nil, syserror.EINVAL + } + + f := t.FDMap().GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + tf, ok := f.FileOperations.(*timerfd.TimerOperations) + if !ok { + return 0, nil, syserror.EINVAL + } + + var newVal linux.Itimerspec + if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + return 0, nil, err + } + var s ktime.Setting + var err error + if flags&linux.TFD_TIMER_ABSTIME != 0 { + s, err = ktime.SettingFromAbsSpec(ktime.FromTimespec(newVal.Value), + newVal.Interval.ToDuration()) + } else { + s, err = ktime.SettingFromSpec(newVal.Value.ToDuration(), + newVal.Interval.ToDuration(), tf.Clock()) + } + if err != nil { + return 0, nil, err + } + valueNS, intervalNS := ktime.SpecFromSetting(tf.SetTime(s)) + if oldValAddr == 0 { + return 0, nil, nil + } + oldVal := 
linux.Itimerspec{ + Interval: linux.DurationToTimespec(intervalNS), + Value: linux.DurationToTimespec(valueNS), + } + _, err = t.CopyOut(oldValAddr, &oldVal) + return 0, nil, err +} + +// TimerfdGettime implements Linux syscall timerfd_gettime(2). +func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + curValAddr := args[1].Pointer() + + f := t.FDMap().GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + tf, ok := f.FileOperations.(*timerfd.TimerOperations) + if !ok { + return 0, nil, syserror.EINVAL + } + + valueNS, intervalNS := ktime.SpecFromSetting(tf.GetTime()) + curVal := linux.Itimerspec{ + Interval: linux.DurationToTimespec(intervalNS), + Value: linux.DurationToTimespec(valueNS), + } + _, err := t.CopyOut(curValAddr, &curVal) + return 0, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go new file mode 100644 index 000000000..1047364b3 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -0,0 +1,48 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//+build amd64 + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// ArchPrctl implements linux syscall arch_prctl(2). 
+// It sets architecture-specific process or thread state for t. +func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + switch args[0].Int() { + case linux.ARCH_GET_FS: + addr := args[1].Pointer() + _, err := t.CopyOut(addr, &t.Arch().StateData().Regs.Fs_base) + if err != nil { + return 0, nil, err + } + + case linux.ARCH_SET_FS: + regs := &t.Arch().StateData().Regs + regs.Fs = 0 + regs.Fs_base = args[1].Uint64() + + default: + return 0, nil, syscall.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go new file mode 100644 index 000000000..899116374 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -0,0 +1,89 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Uname implements linux syscall uname. +func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + version := t.SyscallTable().Version + + uts := t.UTSNamespace() + + // Fill in structure fields. 
+ var u linux.UtsName + copy(u.Sysname[:], version.Sysname) + copy(u.Nodename[:], uts.HostName()) + copy(u.Release[:], version.Release) + copy(u.Version[:], version.Version) + copy(u.Machine[:], "x86_64") // +build tag above. + copy(u.Domainname[:], uts.DomainName()) + + // Copy out the result. + va := args[0].Pointer() + _, err := t.CopyOut(va, u) + return 0, nil, err +} + +// Setdomainname implements Linux syscall setdomainname. +func Setdomainname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + size := args[1].Int() + + utsns := t.UTSNamespace() + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { + return 0, nil, syserror.EPERM + } + if size < 0 || size > linux.UTSLen { + return 0, nil, syserror.EINVAL + } + + name, err := t.CopyInString(nameAddr, int(size)) + if err != nil { + return 0, nil, err + } + + utsns.SetDomainName(name) + return 0, nil, nil +} + +// Sethostname implements Linux syscall sethostname. +func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + size := args[1].Int() + + utsns := t.UTSNamespace() + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { + return 0, nil, syserror.EPERM + } + if size < 0 || size > linux.UTSLen { + return 0, nil, syserror.EINVAL + } + + name, err := t.CopyInString(nameAddr, int(size)) + if err != nil { + return 0, nil, err + } + + utsns.SetHostName(name) + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go new file mode 100644 index 000000000..caa7b01ea --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -0,0 +1,274 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + // EventMaskWrite contains events that can be triggered on writes. + // + // Note that EventHUp is not going to happen for pipes but may for + // implementations of poll on some sockets, see net/core/datagram.c. + EventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr +) + +// Write implements linux syscall write(2). +func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. 
+ src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := writev(t, file, src) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "write", file) +} + +// Pwrite64 implements linux syscall pwrite64(2). +func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is writing at an offset supported? + if !file.Flags().Pwrite { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwritev(t, file, src, offset) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) +} + +// Writev implements linux syscall writev(2). +func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the source of the write. 
+ src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := writev(t, file, src) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) +} + +// Pwritev implements linux syscall pwritev(2). +func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is writing at an offset supported? + if !file.Flags().Pwrite { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwritev(t, file, src, offset) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) +} + +func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { + n, err := f.Writev(t, src) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst64(n) + + // Issue the request and break out if it completes with + // anything other than "would block". 
+ n, err = f.Writev(t, src) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + + return total, err +} + +func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + n, err := f.Pwritev(t, src, offset) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst64(n) + + // Issue the request and break out if it completes with + // anything other than "would block". + n, err = f.Pwritev(t, src, offset+total) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + + return total, err +} diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go new file mode 100644 index 000000000..e865c6fc0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -0,0 +1,112 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// copyTimespecIn copies a Timespec from the untrusted app range to the kernel. +func copyTimespecIn(t *kernel.Task, addr usermem.Addr) (linux.Timespec, error) { + switch t.Arch().Width() { + case 8: + ts := linux.Timespec{} + in := t.CopyScratchBuffer(16) + _, err := t.CopyInBytes(addr, in) + if err != nil { + return ts, err + } + ts.Sec = int64(usermem.ByteOrder.Uint64(in[0:])) + ts.Nsec = int64(usermem.ByteOrder.Uint64(in[8:])) + return ts, nil + default: + return linux.Timespec{}, syserror.ENOSYS + } +} + +// copyTimespecOut copies a Timespec to the untrusted app range. +func copyTimespecOut(t *kernel.Task, addr usermem.Addr, ts *linux.Timespec) error { + switch t.Arch().Width() { + case 8: + out := t.CopyScratchBuffer(16) + usermem.ByteOrder.PutUint64(out[0:], uint64(ts.Sec)) + usermem.ByteOrder.PutUint64(out[8:], uint64(ts.Nsec)) + _, err := t.CopyOutBytes(addr, out) + return err + default: + return syserror.ENOSYS + } +} + +// copyTimevalIn copies a Timeval from the untrusted app range to the kernel. 
+func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) { + switch t.Arch().Width() { + case 8: + tv := linux.Timeval{} + in := t.CopyScratchBuffer(16) + _, err := t.CopyInBytes(addr, in) + if err != nil { + return tv, err + } + tv.Sec = int64(usermem.ByteOrder.Uint64(in[0:])) + tv.Usec = int64(usermem.ByteOrder.Uint64(in[8:])) + return tv, nil + default: + return linux.Timeval{}, syscall.ENOSYS + } +} + +// copyTimevalOut copies a Timeval to the untrusted app range. +func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error { + switch t.Arch().Width() { + case 8: + out := t.CopyScratchBuffer(16) + usermem.ByteOrder.PutUint64(out[0:], uint64(tv.Sec)) + usermem.ByteOrder.PutUint64(out[8:], uint64(tv.Usec)) + _, err := t.CopyOutBytes(addr, out) + return err + default: + return syscall.ENOSYS + } +} + +// copyTimespecInToDuration copies a Timespec from the untrusted app range, +// validates it and converts it to a Duration. +// +// If the Timespec is larger than what can be represented in a Duration, the +// returned value is the maximum that Duration will allow. +// +// If timespecAddr is NULL, the returned value is negative. +func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) { + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timespecAddr != 0 { + timespec, err := copyTimespecIn(t, timespecAddr) + if err != nil { + return 0, err + } + if !timespec.Valid() { + return 0, syscall.EINVAL + } + timeout = time.Duration(timespec.ToNsecCapped()) + } + return timeout, nil +} diff --git a/pkg/sentry/syscalls/polling.go b/pkg/sentry/syscalls/polling.go new file mode 100644 index 000000000..fd90184ef --- /dev/null +++ b/pkg/sentry/syscalls/polling.go @@ -0,0 +1,137 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package syscalls + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// PollFD describes a pollable FD. +type PollFD struct { + FD kdefs.FD + Events waiter.EventMask + REvents waiter.EventMask +} + +// pollState tracks the associated file descriptor and waiter of a PollFD. +type pollState struct { + file *fs.File + waiter waiter.Entry +} + +// initReadiness gets the current ready mask for the file represented by the FD +// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is +// used to register with the file for event notifications, and a reference to +// the file is stored in "state". +func (pfd *PollFD) initReadiness(t *kernel.Task, state *pollState, ch chan struct{}) { + if pfd.FD < 0 { + pfd.REvents = 0 + return + } + + file := t.FDMap().GetFile(pfd.FD) + if file == nil { + pfd.REvents = waiter.EventNVal + return + } + + if ch == nil { + defer file.DecRef() + } else { + state.file = file + state.waiter, _ = waiter.NewChannelEntry(ch) + file.EventRegister(&state.waiter, pfd.Events) + } + + pfd.REvents = file.Readiness(pfd.Events) & pfd.Events +} + +// releaseState releases all the pollState in "state". 
// releaseState releases all the pollState in "state".
func releaseState(state []pollState) {
	for i := range state {
		// Only entries that registered a waiter hold a file reference
		// (see PollFD.initReadiness).
		if state[i].file != nil {
			state[i].file.EventUnregister(&state[i].waiter)
			state[i].file.DecRef()
		}
	}
}

// Poll polls the PollFDs in "pfd" with a bounded time specified in "timeout"
// when "timeout" is greater than zero.
//
// Poll returns the remaining timeout, which is always 0 on a timeout; and 0 or
// positive if interrupted by a signal.
func Poll(t *kernel.Task, pfd []PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
	// Only allocate a notification channel if we may block. A zero timeout
	// means a pure readiness probe.
	var ch chan struct{}
	if timeout != 0 {
		ch = make(chan struct{}, 1)
	}

	// Register for event notification in the files involved if we may
	// block (timeout not zero). Once we find a file that has a non-zero
	// result, we stop registering for events but still go through all files
	// to get their ready masks.
	state := make([]pollState, len(pfd))
	defer releaseState(state)
	n := uintptr(0)
	for i := range pfd {
		pfd[i].initReadiness(t, &state[i], ch)
		if pfd[i].REvents != 0 {
			n++
			// Something is already ready, so the syscall will not
			// block; no need to register further waiters.
			ch = nil
		}
	}

	if timeout == 0 {
		return timeout, n, nil
	}

	// A negative timeout means "wait forever" (poll(2) semantics).
	forever := timeout < 0

	for n == 0 {
		var err error
		// Wait for a notification.
		timeout, err = t.BlockWithTimeout(ch, !forever, timeout)
		if err != nil {
			// Per the contract above, a timeout is reported as a
			// successful poll with zero ready FDs.
			if err == syscall.ETIMEDOUT {
				err = nil
			}
			return timeout, 0, err
		}

		// We got notified, count how many files are ready. If none,
		// then this was a spurious notification, and we just go back
		// to sleep with the remaining timeout.
		for i := range state {
			if state[i].file == nil {
				continue
			}

			ready := state[i].file.Readiness(pfd[i].Events) & pfd[i].Events
			if ready != 0 {
				pfd[i].REvents = ready
				n++
			}
		}
	}

	return timeout, n, nil
}
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package syscalls is the interface from the application to the kernel. +// Traditionally, syscalls is the interface that is used by applications to +// request services from the kernel of a operating system. We provide a +// user-mode kernel that needs to handle those requests coming from unmodified +// applications. Therefore, we still use the term "syscalls" to denote this +// interface. +// +// Note that the stubs in this package may merely provide the interface, not +// the actual implementation. It just makes writing syscall stubs +// straightforward. +package syscalls + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + uspb "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Error returns a syscall handler that will always give the passed error. +func Error(err error) kernel.SyscallFn { + return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, err + } +} + +// ErrorWithEvent gives a syscall function that sends an unimplemented +// syscall event via the event channel and returns the passed error. 
+func ErrorWithEvent(err error) kernel.SyscallFn { + return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + UnimplementedEvent(t) + return 0, nil, err + } +} + +// CapError gives a syscall function that checks for capability c. If the task +// has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged +// tasks, it will seem like there is an implementation. +func CapError(c linux.Capability) kernel.SyscallFn { + return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if !t.HasCapability(c) { + return 0, nil, syserror.EPERM + } + UnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } +} + +// UnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. +func UnimplementedEvent(t *kernel.Task) { + eventchannel.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} diff --git a/pkg/sentry/syscalls/unimplemented_syscall.proto b/pkg/sentry/syscalls/unimplemented_syscall.proto new file mode 100644 index 000000000..d6febf5b1 --- /dev/null +++ b/pkg/sentry/syscalls/unimplemented_syscall.proto @@ -0,0 +1,27 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +import "pkg/sentry/arch/registers.proto"; + +message UnimplementedSyscall { + // Task ID. + int32 tid = 1; + + // Registers at the time of the call. 
+ Registers registers = 2; +} diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD new file mode 100644 index 000000000..cbcd699d5 --- /dev/null +++ b/pkg/sentry/time/BUILD @@ -0,0 +1,48 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "seqatomic_parameters", + out = "seqatomic_parameters.go", + package = "time", + suffix = "Parameters", + template = "//pkg/sync:generic_seqatomic", + types = { + "Value": "Parameters", + }, +) + +go_library( + name = "time", + srcs = [ + "calibrated_clock.go", + "clock_id.go", + "clocks.go", + "muldiv_amd64.s", + "parameters.go", + "sampler.go", + "sampler_unsafe.go", + "seqatomic_parameters.go", + "tsc_amd64.s", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/time", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/metric", + "//pkg/sync", + "//pkg/syserror", + ], +) + +go_test( + name = "time_test", + srcs = [ + "calibrated_clock_test.go", + "parameters_test.go", + "sampler_test.go", + ], + embed = [":time"], +) diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go new file mode 100644 index 000000000..cbb95e2d7 --- /dev/null +++ b/pkg/sentry/time/calibrated_clock.go @@ -0,0 +1,269 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package time provides a calibrated clock synchronized to a system reference +// clock. +package time + +import ( + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// fallbackMetric tracks failed updates. It is not sync, as it is not critical +// that all occurrences are captured and CalibratedClock may fallback many +// times. +var fallbackMetric = metric.MustCreateNewUint64Metric("/time/fallback", false /* sync */, "Incremented when a clock falls back to system calls due to a failed update") + +// CalibratedClock implements a clock that tracks a reference clock. +// +// Users should call Update at regular intervals of around approxUpdateInterval +// to ensure that the clock does not drift significantly from the reference +// clock. +type CalibratedClock struct { + // mu protects the fields below. + // TODO: consider a sequence counter for read locking. + mu sync.RWMutex + + // ref sample the reference clock that this clock is calibrated + // against. + ref *sampler + + // ready indicates that the fields below are ready for use calculating + // time. + ready bool + + // params are the current timekeeping parameters. + params Parameters + + // errorNS is the estimated clock error in nanoseconds. + errorNS ReferenceNS +} + +// NewCalibratedClock creates a CalibratedClock that tracks the given ClockID. +func NewCalibratedClock(c ClockID) *CalibratedClock { + return &CalibratedClock{ + ref: newSampler(c), + } +} + +// Debugf logs at debug level. +func (c *CalibratedClock) Debugf(format string, v ...interface{}) { + if log.IsLogging(log.Debug) { + args := []interface{}{c.ref.clockID} + args = append(args, v...) + log.Debugf("CalibratedClock(%v): "+format, args...) + } +} + +// Infof logs at debug level. 
+func (c *CalibratedClock) Infof(format string, v ...interface{}) { + if log.IsLogging(log.Info) { + args := []interface{}{c.ref.clockID} + args = append(args, v...) + log.Infof("CalibratedClock(%v): "+format, args...) + } +} + +// Warningf logs at debug level. +func (c *CalibratedClock) Warningf(format string, v ...interface{}) { + if log.IsLogging(log.Warning) { + args := []interface{}{c.ref.clockID} + args = append(args, v...) + log.Warningf("CalibratedClock(%v): "+format, args...) + } +} + +// reset forces the clock to restart the calibration process, logging the +// passed message. +func (c *CalibratedClock) reset(str string, v ...interface{}) { + c.mu.Lock() + defer c.mu.Unlock() + c.resetLocked(str, v...) +} + +// resetLocked is equivalent to reset with c.mu already held for writing. +func (c *CalibratedClock) resetLocked(str string, v ...interface{}) { + c.Warningf(str+" Resetting clock; time may jump.", v...) + c.ready = false + c.ref.Reset() + fallbackMetric.Increment() +} + +// updateParams updates the timekeeping parameters based on the passed +// parameters. +// +// actual is the actual estimated timekeeping parameters. The stored parameters +// may need to be adjusted slightly from these values to compensate for error. +// +// Preconditions: c.mu must be held for writing. +func (c *CalibratedClock) updateParams(actual Parameters) { + if !c.ready { + // At initial calibration there is nothing to correct. + c.params = actual + c.ready = true + + c.Infof("ready") + + return + } + + // Otherwise, adjust the params to correct for errors. + newParams, errorNS, err := errorAdjust(c.params, actual, actual.BaseCycles) + if err != nil { + // Something is very wrong. Reset and try again from the + // beginning. + c.resetLocked("Unable to update params: %v.", err) + return + } + logErrorAdjustment(c.ref.clockID, errorNS, c.params, newParams) + + if errorNS.Magnitude() >= MaxClockError { + // We should never get such extreme error, something is very + // wrong. 
Reset everything and start again. + // + // N.B. logErrorAdjustment will have already logged the error + // at warning level. + // + // TODO: We could allow Realtime clock jumps here. + c.resetLocked("Extreme clock error.") + return + } + + c.params = newParams + c.errorNS = errorNS +} + +// Update runs the update step of the clock, updating its synchronization with +// the reference clock. +// +// Update returns timekeeping and true with the new timekeeping parameters if +// the clock is calibrated. Update should be called regularly to prevent the +// clock from getting significantly out of sync from the reference clock. +// +// The returned timekeeping parameters are invalidated on the next call to +// Update. +func (c *CalibratedClock) Update() (Parameters, bool) { + c.mu.Lock() + defer c.mu.Unlock() + + if err := c.ref.Sample(); err != nil { + c.resetLocked("Unable to update calibrated clock: %v.", err) + return Parameters{}, false + } + + oldest, newest, ok := c.ref.Range() + if !ok { + // Not ready yet. 
+ return Parameters{}, false + } + + minCount := uint64(newest.before - oldest.after) + maxCount := uint64(newest.after - oldest.before) + refInterval := uint64(newest.ref - oldest.ref) + + // freq hz = count / (interval ns) * (nsPerS ns) / (1 s) + nsPerS := uint64(time.Second.Nanoseconds()) + + minHz, ok := muldiv64(minCount, nsPerS, refInterval) + if !ok { + c.resetLocked("Unable to update calibrated clock: (%v - %v) * %v / %v overflows.", newest.before, oldest.after, nsPerS, refInterval) + return Parameters{}, false + } + + maxHz, ok := muldiv64(maxCount, nsPerS, refInterval) + if !ok { + c.resetLocked("Unable to update calibrated clock: (%v - %v) * %v / %v overflows.", newest.after, oldest.before, nsPerS, refInterval) + return Parameters{}, false + } + + c.updateParams(Parameters{ + Frequency: (minHz + maxHz) / 2, + BaseRef: newest.ref, + BaseCycles: newest.after, + }) + + return c.params, true +} + +// GetTime returns the current time based on the clock calibration. +func (c *CalibratedClock) GetTime() (int64, error) { + c.mu.RLock() + + if !c.ready { + // Fallback to a syscall. + now, err := c.ref.Syscall() + c.mu.RUnlock() + return int64(now), err + } + + now := c.ref.Cycles() + v, ok := c.params.ComputeTime(now) + if !ok { + // Something is seriously wrong with the clock. Try + // again with syscalls. + c.resetLocked("Time computation overflowed. params = %+v, now = %v.", c.params, now) + now, err := c.ref.Syscall() + c.mu.RUnlock() + return int64(now), err + } + + c.mu.RUnlock() + return v, nil +} + +// CalibratedClocks contains calibrated monotonic and realtime clocks. +// +// TODO: We know that Linux runs the monotonic and realtime clocks at +// the same rate, so rather than tracking both individually, we could do one +// calibration for both clocks. +type CalibratedClocks struct { + // monotonic is the clock tracking the system monotonic clock. + monotonic *CalibratedClock + + // realtime is the realtime equivalent of monotonic. 
+ realtime *CalibratedClock +} + +// NewCalibratedClocks creates a CalibratedClocks. +func NewCalibratedClocks() *CalibratedClocks { + return &CalibratedClocks{ + monotonic: NewCalibratedClock(Monotonic), + realtime: NewCalibratedClock(Realtime), + } +} + +// Update implements Clocks.Update. +func (c *CalibratedClocks) Update() (Parameters, bool, Parameters, bool) { + monotonicParams, monotonicOk := c.monotonic.Update() + realtimeParams, realtimeOk := c.realtime.Update() + + return monotonicParams, monotonicOk, realtimeParams, realtimeOk +} + +// GetTime implements Clocks.GetTime. +func (c *CalibratedClocks) GetTime(id ClockID) (int64, error) { + switch id { + case Monotonic: + return c.monotonic.GetTime() + case Realtime: + return c.realtime.GetTime() + default: + return 0, syserror.EINVAL + } +} diff --git a/pkg/sentry/time/calibrated_clock_test.go b/pkg/sentry/time/calibrated_clock_test.go new file mode 100644 index 000000000..8b6dd5592 --- /dev/null +++ b/pkg/sentry/time/calibrated_clock_test.go @@ -0,0 +1,186 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "testing" + "time" +) + +// newTestCalibratedClock returns a CalibratedClock that collects samples from +// the given sample list and cycle counts from the given cycle list. 
func newTestCalibratedClock(samples []sample, cycles []TSCValue) *CalibratedClock {
	return &CalibratedClock{
		ref: newTestSampler(samples, cycles),
	}
}

// TestConstantFrequency feeds samples from a perfectly steady reference clock
// and checks that time projected slightly past the last sample lands where
// expected.
func TestConstantFrequency(t *testing.T) {
	// Perfectly constant frequency.
	samples := []sample{
		{before: 100000, after: 100000 + defaultOverheadCycles, ref: 100},
		{before: 200000, after: 200000 + defaultOverheadCycles, ref: 200},
		{before: 300000, after: 300000 + defaultOverheadCycles, ref: 300},
		{before: 400000, after: 400000 + defaultOverheadCycles, ref: 400},
		{before: 500000, after: 500000 + defaultOverheadCycles, ref: 500},
		{before: 600000, after: 600000 + defaultOverheadCycles, ref: 600},
		{before: 700000, after: 700000 + defaultOverheadCycles, ref: 700},
	}

	c := newTestCalibratedClock(samples, nil)

	// Update from all samples.
	for range samples {
		c.Update()
	}

	c.mu.RLock()
	if !c.ready {
		c.mu.RUnlock()
		t.Fatalf("clock not ready")
	}
	// A bit after the last sample.
	now, ok := c.params.ComputeTime(750000)
	c.mu.RUnlock()
	if !ok {
		t.Fatalf("ComputeTime ok got %v want true", ok)
	}

	t.Logf("now: %v", now)

	// Time should be between the current sample and where we'd expect the
	// next sample.
	if now < 700 || now > 800 {
		t.Errorf("now got %v want > 700 && < 800", now)
	}
}

// TestErrorCorrection verifies that when the reference clock changes rate, the
// calibrated clock stays continuous (no jumps) and corrects the accumulated
// error over subsequent updates.
func TestErrorCorrection(t *testing.T) {
	testCases := []struct {
		name               string
		samples            [5]sample
		projectedTimeStart int64
		projectedTimeEnd   int64
	}{
		// Initial calibration should be ~1MHz for each of these, and
		// the reference clock changes in samples[2].
		{
			name: "slow-down",
			samples: [5]sample{
				{before: 1000000, after: 1000001, ref: ReferenceNS(1 * ApproxUpdateInterval.Nanoseconds())},
				{before: 2000000, after: 2000001, ref: ReferenceNS(2 * ApproxUpdateInterval.Nanoseconds())},
				// Reference clock has slowed down, causing 100ms of error.
				{before: 3010000, after: 3010001, ref: ReferenceNS(3 * ApproxUpdateInterval.Nanoseconds())},
				{before: 4020000, after: 4020001, ref: ReferenceNS(4 * ApproxUpdateInterval.Nanoseconds())},
				{before: 5030000, after: 5030001, ref: ReferenceNS(5 * ApproxUpdateInterval.Nanoseconds())},
			},
			projectedTimeStart: 3005 * time.Millisecond.Nanoseconds(),
			projectedTimeEnd:   3015 * time.Millisecond.Nanoseconds(),
		},
		{
			name: "speed-up",
			samples: [5]sample{
				{before: 1000000, after: 1000001, ref: ReferenceNS(1 * ApproxUpdateInterval.Nanoseconds())},
				{before: 2000000, after: 2000001, ref: ReferenceNS(2 * ApproxUpdateInterval.Nanoseconds())},
				// Reference clock has sped up, causing 100ms of error.
				{before: 2990000, after: 2990001, ref: ReferenceNS(3 * ApproxUpdateInterval.Nanoseconds())},
				{before: 3980000, after: 3980001, ref: ReferenceNS(4 * ApproxUpdateInterval.Nanoseconds())},
				{before: 4970000, after: 4970001, ref: ReferenceNS(5 * ApproxUpdateInterval.Nanoseconds())},
			},
			projectedTimeStart: 2985 * time.Millisecond.Nanoseconds(),
			projectedTimeEnd:   2995 * time.Millisecond.Nanoseconds(),
		},
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			c := newTestCalibratedClock(tc.samples[:], nil)

			// Initial calibration takes two updates.
			_, ok := c.Update()
			if ok {
				t.Fatalf("Update ready too early")
			}

			params, ok := c.Update()
			if !ok {
				t.Fatalf("Update not ready")
			}

			// Initial calibration is ~1MHz.
			hz := params.Frequency
			if hz < 990000 || hz > 1010000 {
				t.Fatalf("Frequency got %v want > 990kHz && < 1010kHz", hz)
			}

			// Project time at the next update. Given the 1MHz
			// calibration, it is expected to be ~3.1s/2.9s, not
			// the actual 3s.
			//
			// N.B. the next update time is the "after" time above.
			projected, ok := params.ComputeTime(tc.samples[2].after)
			if !ok {
				t.Fatalf("ComputeTime ok got %v want true", ok)
			}
			if projected < tc.projectedTimeStart || projected > tc.projectedTimeEnd {
				t.Fatalf("ComputeTime(%v) got %v want > %v && < %v", tc.samples[2].after, projected, tc.projectedTimeStart, tc.projectedTimeEnd)
			}

			// Update again to see the changed reference clock.
			params, ok = c.Update()
			if !ok {
				t.Fatalf("Update not ready")
			}

			// We now know that TSC = tc.samples[2].after -> 3s,
			// but with the previous params indicated that TSC
			// tc.samples[2].after -> 3.5s/2.5s. We can't allow the
			// clock to go backwards, and having the clock jump
			// forwards is undesirable. There should be a smooth
			// transition that corrects the clock error over time.
			// Check that the clock is continuous at TSC =
			// tc.samples[2].after.
			newProjected, ok := params.ComputeTime(tc.samples[2].after)
			if !ok {
				t.Fatalf("ComputeTime ok got %v want true", ok)
			}
			if newProjected != projected {
				t.Errorf("Discontinuous time; ComputeTime(%v) got %v want %v", tc.samples[2].after, newProjected, projected)
			}

			// As the reference clock stabilizes, ensure that the clock error
			// decreases.
// ClockID is a Linux clock identifier.
type ClockID int32

// These are the supported Linux clock identifiers.
const (
	Realtime ClockID = iota
	Monotonic
)

// String implements fmt.Stringer.String.
func (c ClockID) String() string {
	if c == Realtime {
		return "Realtime"
	}
	if c == Monotonic {
		return "Monotonic"
	}
	// Unknown identifiers are rendered as their numeric value.
	return strconv.Itoa(int(c))
}
// Clocks represents a clock source that contains both a monotonic and realtime
// clock.
type Clocks interface {
	// Update performs an update step, keeping the clocks in sync with the
	// reference host clocks, and returning the new timekeeping parameters.
	//
	// A false ok value indicates that the corresponding clock is not yet
	// calibrated and the returned Parameters must not be used.
	//
	// Update should be called at approximately ApproxUpdateInterval.
	Update() (monotonicParams Parameters, monotonicOk bool, realtimeParam Parameters, realtimeOk bool)

	// GetTime returns the current time in nanoseconds for the given clock.
	//
	// Clocks implementations must support at least Monotonic and
	// Realtime.
	GetTime(c ClockID) (int64, error)
}
#include "textflag.h"

// Documentation is available in parameters.go.
//
// func muldiv64(value, multiplier, divisor uint64) (uint64, bool)
TEXT ·muldiv64(SB),NOSPLIT,$0-33
	MOVQ value+0(FP), AX
	MOVQ multiplier+8(FP), BX
	MOVQ divisor+16(FP), CX

	// Multiply AX*BX and store result in DX:AX.
	MULQ BX

	// If divisor <= (value*multiplier) / 2^64, then the division will overflow.
	//
	// (value*multiplier) / 2^64 is DX:AX >> 64, or simply DX.
	//
	// All operands are unsigned, so the comparison must use an unsigned
	// jump: JLS ("lower or same"). The previous signed JLE misclassified
	// operands with bit 63 set — e.g. a zero divisor paired with
	// DX >= 2^63 fell through to DIVQ and raised #DE instead of
	// returning !ok as documented.
	CMPQ CX, DX
	JLS overflow

	// Divide DX:AX by CX.
	DIVQ CX

	MOVQ AX, result+24(FP)
	MOVB $1, ok+32(FP)
	RET

overflow:
	MOVQ $0, result+24(FP)
	MOVB $0, ok+32(FP)
	RET
	//
	// If an update occurs after ApproxUpdateInterval passes, the clock
	// will overshoot its error correction target and begin accumulating
	// error in the other direction.
	//
	// If updates occur after more than 2*ApproxUpdateInterval passes, the
	// clock becomes unstable, accumulating more error than it had
	// originally. Repeated updates after more than 2*ApproxUpdateInterval
	// will cause unbounded increases in error.
	//
	// These statements assume that the host clock does not change. Actual
	// error will depend upon host clock changes.
	//
	// TODO: make error correction more robust to delayed
	// updates.
	ApproxUpdateInterval = 1 * time.Second

	// MaxClockError is the maximum amount of error that the clocks will
	// try to correct.
	//
	// This limit:
	//
	// * Puts a limit on cases of otherwise unbounded increases in error.
	//
	// * Avoids unreasonably large frequency adjustments required to
	//   correct large errors over a single update interval.
	//
	// It is a quarter of the update interval, expressed in nanoseconds.
	MaxClockError = ReferenceNS(ApproxUpdateInterval) / 4
)

// Parameters are the timekeeping parameters needed to compute the current
// time.
type Parameters struct {
	// BaseCycles was the TSC counter value when the time was BaseRef.
	BaseCycles TSCValue

	// BaseRef is the reference clock time in nanoseconds corresponding to
	// BaseCycles.
	BaseRef ReferenceNS

	// Frequency is the frequency of the cycle clock in Hertz.
	Frequency uint64
}

// muldiv64 multiplies two 64-bit numbers, then divides the result by another
// 64-bit number.
//
// It requires that the result fit in 64 bits, but doesn't require that
// intermediate values do; in particular, the result of the multiplication may
// require 128 bits.
//
// It returns !ok if divisor is zero or the result does not fit in 64 bits.
//
// Implemented in assembly; see muldiv_amd64.s.
func muldiv64(value, multiplier, divisor uint64) (uint64, bool)

// ComputeTime calculates the current time from a "now" TSC value.
//
// time = ref + (now - base) / f
func (p Parameters) ComputeTime(nowCycles TSCValue) (int64, bool) {
	diffCycles := nowCycles - p.BaseCycles
	// Clamp to the base time rather than going backwards if the TSC
	// appears to have moved behind the calibration point.
	if diffCycles < 0 {
		log.Warningf("now cycles %v < base cycles %v", nowCycles, p.BaseCycles)
		diffCycles = 0
	}

	// Overflow "won't ever happen". If diffCycles is the max value
	// (2^63 - 1), then to overflow,
	//
	// frequency <= ((2^63 - 1) * 10^9) / 2^64 = 500Mhz
	//
	// A TSC running at 2GHz takes 201 years to reach 2^63-1. 805 years at
	// 500MHz.
	diffNS, ok := muldiv64(uint64(diffCycles), uint64(time.Second.Nanoseconds()), p.Frequency)
	return int64(uint64(p.BaseRef) + diffNS), ok
}

// errorAdjust returns a new Parameters struct "adjusted" that satisfies:
//
// 1. adjusted.ComputeTime(now) = prevParams.ComputeTime(now)
//   * i.e., the current time does not jump.
//
// 2. adjusted.ComputeTime(TSC at next update) = newParams.ComputeTime(TSC at next update)
//   * i.e., Any error between prevParams and newParams will be corrected over
//     the course of the next update period.
//
// errorAdjust also returns the current clock error.
//
// Preconditions:
// * newParams.BaseCycles >= prevParams.BaseCycles; i.e., TSC must not go
//   backwards.
// * newParams.BaseCycles <= now; i.e., the new parameters be computed at or
//   before now.
func errorAdjust(prevParams Parameters, newParams Parameters, now TSCValue) (Parameters, ReferenceNS, error) {
	if newParams.BaseCycles < prevParams.BaseCycles {
		// Oh dear! Something is very wrong.
		return Parameters{}, 0, fmt.Errorf("TSC went backwards in updated clock params: %v < %v", newParams.BaseCycles, prevParams.BaseCycles)
	}
	if newParams.BaseCycles > now {
		return Parameters{}, 0, fmt.Errorf("parameters contain base cycles later than now: %v > %v", newParams.BaseCycles, now)
	}

	intervalNS := int64(ApproxUpdateInterval.Nanoseconds())
	nsPerSec := uint64(time.Second.Nanoseconds())

	// Current time as computed by prevParams.
	oldNowNS, ok := prevParams.ComputeTime(now)
	if !ok {
		return Parameters{}, 0, fmt.Errorf("old now time computation overflowed. params = %+v, now = %v", prevParams, now)
	}

	// We expect the update ticker to run based on this clock (i.e., it has
	// been using prevParams and will use the returned adjusted
	// parameters). Hence it will decide to fire intervalNS from the
	// current (oldNowNS) "now".
	nextNS := oldNowNS + intervalNS

	if nextNS <= int64(newParams.BaseRef) {
		// The next update time already passed before the new
		// parameters were created! We definitely can't correct the
		// error by then.
		return Parameters{}, 0, fmt.Errorf("unable to correct error in single period. oldNowNS = %v, nextNS = %v, p = %v", oldNowNS, nextNS, newParams)
	}

	// For what TSC value next will newParams.ComputeTime(next) = nextNS?
	//
	// Solve ComputeTime for next:
	//
	// next = newParams.Frequency * (nextNS - newParams.BaseRef) + newParams.BaseCycles
	c, ok := muldiv64(newParams.Frequency, uint64(nextNS-int64(newParams.BaseRef)), nsPerSec)
	if !ok {
		return Parameters{}, 0, fmt.Errorf("%v * (%v - %v) / %v overflows", newParams.Frequency, nextNS, newParams.BaseRef, nsPerSec)
	}

	cycles := TSCValue(c)
	next := cycles + newParams.BaseCycles

	if next <= now {
		// The next update time already passed now with the new
		// parameters! We can't correct the error in a single period.
		return Parameters{}, 0, fmt.Errorf("unable to correct error in single period. oldNowNS = %v, nextNS = %v, now = %v, next = %v", oldNowNS, nextNS, now, next)
	}

	// We want to solve for parameters that satisfy:
	//
	// adjusted.ComputeTime(now) = oldNowNS
	//
	// adjusted.ComputeTime(next) = nextNS
	//
	// i.e., the current time does not change, but by the time we reach
	// next we reach the same time as newParams.

	// We choose to keep BaseCycles fixed.
	adjusted := Parameters{
		BaseCycles: newParams.BaseCycles,
	}

	// We want a slope such that time goes from oldNowNS to nextNS when
	// we reach next.
	//
	// In other words, cycles should increase by next - now in the next
	// interval.

	cycles = next - now
	ns := intervalNS

	// adjusted.Frequency = cycles / ns
	adjusted.Frequency, ok = muldiv64(uint64(cycles), nsPerSec, uint64(ns))
	if !ok {
		return Parameters{}, 0, fmt.Errorf("(%v - %v) * %v / %v overflows", next, now, nsPerSec, ns)
	}

	// Now choose a base reference such that the current time remains the
	// same. Note that this is just ComputeTime, solving for BaseRef:
	//
	// oldNowNS = BaseRef + (now - BaseCycles) / Frequency
	// BaseRef = oldNowNS - (now - BaseCycles) / Frequency
	diffNS, ok := muldiv64(uint64(now-adjusted.BaseCycles), nsPerSec, adjusted.Frequency)
	if !ok {
		return Parameters{}, 0, fmt.Errorf("(%v - %v) * %v / %v overflows", now, adjusted.BaseCycles, nsPerSec, adjusted.Frequency)
	}

	adjusted.BaseRef = ReferenceNS(oldNowNS - int64(diffNS))

	// The error is the difference between the current time and what the
	// new parameters say the current time should be.
	newNowNS, ok := newParams.ComputeTime(now)
	if !ok {
		return Parameters{}, 0, fmt.Errorf("new now time computation overflowed. params = %+v, now = %v", newParams, now)
	}

	errorNS := ReferenceNS(oldNowNS - newNowNS)

	return adjusted, errorNS, nil
}

// logErrorAdjustment logs the clock error and associated error correction
// frequency adjustment.
//
// The log level is determined by the error severity.
+func logErrorAdjustment(clock ClockID, errorNS ReferenceNS, orig, adjusted Parameters) { + fn := log.Debugf + if int64(errorNS.Magnitude()) > time.Millisecond.Nanoseconds() { + fn = log.Warningf + } else if int64(errorNS.Magnitude()) > 10*time.Microsecond.Nanoseconds() { + fn = log.Infof + } + + fn("Clock(%v): error: %v ns, adjusted frequency from %v Hz to %v Hz", clock, errorNS, orig.Frequency, adjusted.Frequency) +} diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go new file mode 100644 index 000000000..7394fc5ee --- /dev/null +++ b/pkg/sentry/time/parameters_test.go @@ -0,0 +1,486 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "math" + "testing" + "time" +) + +func TestParametersComputeTime(t *testing.T) { + testCases := []struct { + name string + params Parameters + now TSCValue + want int64 + }{ + { + // Now is the same as the base cycles. + name: "base-cycles", + params: Parameters{ + BaseCycles: 10000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + now: 10000, + want: 5000 * time.Millisecond.Nanoseconds(), + }, + { + // Now is the behind the base cycles. Time is frozen. 
+ name: "backwards", + params: Parameters{ + BaseCycles: 10000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + now: 9000, + want: 5000 * time.Millisecond.Nanoseconds(), + }, + { + // Now is ahead of the base cycles. + name: "ahead", + params: Parameters{ + BaseCycles: 10000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + now: 15000, + want: 5500 * time.Millisecond.Nanoseconds(), + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got, ok := tc.params.ComputeTime(tc.now) + if !ok { + t.Errorf("ComputeTime ok got %v want true", got) + } + if got != tc.want { + t.Errorf("ComputeTime got %+v want %+v", got, tc.want) + } + }) + } +} + +func TestParametersErrorAdjust(t *testing.T) { + testCases := []struct { + name string + oldParams Parameters + now TSCValue + newParams Parameters + want Parameters + errorNS ReferenceNS + wantErr bool + }{ + { + // newParams are perfectly continuous with oldParams + // and don't need adjustment. + name: "continuous", + oldParams: Parameters{ + BaseCycles: 0, + BaseRef: 0, + Frequency: 10000, + }, + now: 50000, + newParams: Parameters{ + BaseCycles: 50000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + want: Parameters{ + BaseCycles: 50000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + }, + { + // Same as "continuous", but with now ahead of + // newParams.BaseCycles. The result is the same as + // there is no error to correct. 
+ name: "continuous-nowdiff", + oldParams: Parameters{ + BaseCycles: 0, + BaseRef: 0, + Frequency: 10000, + }, + now: 60000, + newParams: Parameters{ + BaseCycles: 50000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + want: Parameters{ + BaseCycles: 50000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + }, + { + // errorAdjust bails out if the TSC goes backwards. + name: "tsc-backwards", + oldParams: Parameters{ + BaseCycles: 10000, + BaseRef: ReferenceNS(1000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + now: 9000, + newParams: Parameters{ + BaseCycles: 9000, + BaseRef: ReferenceNS(1100 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + wantErr: true, + }, + { + // errorAdjust bails out if new params are from after now. + name: "params-after-now", + oldParams: Parameters{ + BaseCycles: 10000, + BaseRef: ReferenceNS(1000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + now: 11000, + newParams: Parameters{ + BaseCycles: 12000, + BaseRef: ReferenceNS(1200 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + wantErr: true, + }, + { + // Host clock sped up. + name: "speed-up", + oldParams: Parameters{ + BaseCycles: 0, + BaseRef: 0, + Frequency: 10000, + }, + now: 45000, + // Host frequency changed to 9000 immediately after + // oldParams was returned. + newParams: Parameters{ + BaseCycles: 45000, + // From oldParams, we think ref = 4.5s at cycles = 45000. + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 9000, + }, + want: Parameters{ + BaseCycles: 45000, + BaseRef: ReferenceNS(4500 * time.Millisecond.Nanoseconds()), + // We must decrease the new frequency by 50% to + // correct 0.5s of error in 1s + // (ApproxUpdateInterval). + Frequency: 4500, + }, + errorNS: ReferenceNS(-500 * time.Millisecond.Nanoseconds()), + }, + { + // Host clock sped up, with now ahead of newParams. 
+ name: "speed-up-nowdiff", + oldParams: Parameters{ + BaseCycles: 0, + BaseRef: 0, + Frequency: 10000, + }, + now: 50000, + // Host frequency changed to 9000 immediately after + // oldParams was returned. + newParams: Parameters{ + BaseCycles: 45000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 9000, + }, + // nextRef = 6000ms + // nextCycles = 9000 * (6000ms - 5000ms) + 45000 + // nextCycles = 9000 * (1s) + 45000 + // nextCycles = 54000 + // f = (54000 - 50000) / 1s = 4000 + // + // ref = 5000ms - (50000 - 45000) / 4000 + // ref = 3.75s + want: Parameters{ + BaseCycles: 45000, + BaseRef: ReferenceNS(3750 * time.Millisecond.Nanoseconds()), + Frequency: 4000, + }, + // oldNow = 50000 * 10000 = 5s + // newNow = (50000 - 45000) / 9000 + 5s = 5.555s + errorNS: ReferenceNS((5000*time.Millisecond - 5555555555).Nanoseconds()), + }, + { + // Host clock sped up. The new parameters are so far + // ahead that the next update time already passed. + name: "speed-up-uncorrectable-baseref", + oldParams: Parameters{ + BaseCycles: 0, + BaseRef: 0, + Frequency: 10000, + }, + now: 50000, + // Host frequency changed to 5000 immediately after + // oldParams was returned. + newParams: Parameters{ + BaseCycles: 45000, + BaseRef: ReferenceNS(9000 * time.Millisecond.Nanoseconds()), + Frequency: 5000, + }, + // The next update should be at 10s, but newParams + // already passed 6s. Thus it is impossible to correct + // the clock by then. + wantErr: true, + }, + { + // Host clock sped up. The new parameters are moving so + // fast that the next update should be before now. + name: "speed-up-uncorrectable-frequency", + oldParams: Parameters{ + BaseCycles: 0, + BaseRef: 0, + Frequency: 10000, + }, + now: 55000, + // Host frequency changed to 7500 immediately after + // oldParams was returned. 
+ newParams: Parameters{ + BaseCycles: 45000, + BaseRef: ReferenceNS(6000 * time.Millisecond.Nanoseconds()), + Frequency: 7500, + }, + // The next update should be at 6.5s, but newParams are + // so far ahead and fast that they reach 6.5s at cycle + // 48750, which before now! Thus it is impossible to + // correct the clock by then. + wantErr: true, + }, + { + // Host clock slowed down. + name: "slow-down", + oldParams: Parameters{ + BaseCycles: 0, + BaseRef: 0, + Frequency: 10000, + }, + now: 55000, + // Host frequency changed to 11000 immediately after + // oldParams was returned. + newParams: Parameters{ + BaseCycles: 55000, + // From oldParams, we think ref = 5.5s at cycles = 55000. + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 11000, + }, + want: Parameters{ + BaseCycles: 55000, + BaseRef: ReferenceNS(5500 * time.Millisecond.Nanoseconds()), + // We must increase the new frequency by 50% to + // correct 0.5s of error in 1s + // (ApproxUpdateInterval). + Frequency: 16500, + }, + errorNS: ReferenceNS(500 * time.Millisecond.Nanoseconds()), + }, + { + // Host clock slowed down, with now ahead of newParams. + name: "slow-down-nowdiff", + oldParams: Parameters{ + BaseCycles: 0, + BaseRef: 0, + Frequency: 10000, + }, + now: 60000, + // Host frequency changed to 11000 immediately after + // oldParams was returned. 
+ newParams: Parameters{ + BaseCycles: 55000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 11000, + }, + // nextRef = 7000ms + // nextCycles = 11000 * (7000ms - 5000ms) + 55000 + // nextCycles = 11000 * (2000ms) + 55000 + // nextCycles = 77000 + // f = (77000 - 60000) / 1s = 17000 + // + // ref = 6000ms - (60000 - 55000) / 17000 + // ref = 5705882353ns + want: Parameters{ + BaseCycles: 55000, + BaseRef: ReferenceNS(5705882353), + Frequency: 17000, + }, + // oldNow = 60000 * 10000 = 6s + // newNow = (60000 - 55000) / 11000 + 5s = 5.4545s + errorNS: ReferenceNS((6*time.Second - 5454545454).Nanoseconds()), + }, + { + // Host time went backwards. + name: "time-backwards", + oldParams: Parameters{ + BaseCycles: 50000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + now: 60000, + newParams: Parameters{ + BaseCycles: 60000, + // From oldParams, we think ref = 6s at cycles = 60000. + BaseRef: ReferenceNS(4000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + want: Parameters{ + BaseCycles: 60000, + BaseRef: ReferenceNS(6000 * time.Millisecond.Nanoseconds()), + // We must increase the frequency by 200% to + // correct 2s of error in 1s + // (ApproxUpdateInterval). + Frequency: 30000, + }, + errorNS: ReferenceNS(2000 * time.Millisecond.Nanoseconds()), + }, + { + // Host time went backwards, with now ahead of newParams. 
+ name: "time-backwards-nowdiff", + oldParams: Parameters{ + BaseCycles: 50000, + BaseRef: ReferenceNS(5000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + now: 65000, + // nextRef = 7500ms + // nextCycles = 10000 * (7500ms - 4000ms) + 60000 + // nextCycles = 10000 * (3500ms) + 60000 + // nextCycles = 95000 + // f = (95000 - 65000) / 1s = 30000 + // + // ref = 6500ms - (65000 - 60000) / 30000 + // ref = 6333333333ns + newParams: Parameters{ + BaseCycles: 60000, + BaseRef: ReferenceNS(4000 * time.Millisecond.Nanoseconds()), + Frequency: 10000, + }, + want: Parameters{ + BaseCycles: 60000, + BaseRef: ReferenceNS(6333333334), + Frequency: 30000, + }, + // oldNow = 65000 * 10000 = 6.5s + // newNow = (65000 - 60000) / 10000 + 4s = 4.5s + errorNS: ReferenceNS(2000 * time.Millisecond.Nanoseconds()), + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got, errorNS, err := errorAdjust(tc.oldParams, tc.newParams, tc.now) + if err != nil && !tc.wantErr { + t.Errorf("err got %v want nil", err) + } else if err == nil && tc.wantErr { + t.Errorf("err got nil want non-nil") + } + + if got != tc.want { + t.Errorf("Parameters got %+v want %+v", got, tc.want) + } + if errorNS != tc.errorNS { + t.Errorf("errorNS got %v want %v", errorNS, tc.errorNS) + } + }) + } +} + +func testMuldiv(t *testing.T, v uint64) { + for i := uint64(1); i <= 1000000; i++ { + mult := uint64(1000000000) + div := i * mult + res, ok := muldiv64(v, mult, div) + if !ok { + t.Errorf("Result of %v * %v / %v ok got false want true", v, mult, div) + } + if want := v / i; res != want { + t.Errorf("Bad result of %v * %v / %v: got %v, want %v", v, mult, div, res, want) + } + } +} + +func TestMulDiv(t *testing.T) { + testMuldiv(t, math.MaxUint64) + for i := int64(-10); i <= 10; i++ { + testMuldiv(t, uint64(i)) + } +} + +func TestMulDivZero(t *testing.T) { + if r, ok := muldiv64(2, 4, 0); ok { + t.Errorf("muldiv64(2, 4, 0) got %d, ok want !ok", r) + } + + if r, ok := muldiv64(0, 
0, 0); ok {
		t.Errorf("muldiv64(0, 0, 0) got %d, ok want !ok", r)
	}
}

// TestMulDivOverflow checks that muldiv64 detects results that overflow
// uint64 and reports them via the ok return value.
func TestMulDivOverflow(t *testing.T) {
	testCases := []struct {
		name string
		val  uint64
		mult uint64
		div  uint64
		ok   bool
		ret  uint64
	}{
		{
			// (2^63 * 4) / 8 = 2^62; fits in uint64.
			name: "2^62",
			val:  1 << 63,
			mult: 4,
			div:  8,
			ok:   true,
			ret:  1 << 62,
		},
		{
			// Maximum representable value passes through unchanged.
			name: "2^64-1",
			val:  0xffffffffffffffff,
			mult: 1,
			div:  1,
			ok:   true,
			ret:  0xffffffffffffffff,
		},
		{
			// (2^63 * 4) / 2 = 2^64; overflows uint64 by one bit.
			name: "2^64",
			val:  1 << 63,
			mult: 4,
			div:  2,
			ok:   false,
		},
		{
			// (2^63 * 2^63) / 2 = 2^125; far exceeds uint64.
			name: "2^125",
			val:  1 << 63,
			mult: 1 << 63,
			div:  2,
			ok:   false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			r, ok := muldiv64(tc.val, tc.mult, tc.div)
			if ok != tc.ok {
				t.Errorf("ok got %v want %v", ok, tc.ok)
			}
			if tc.ok && r != tc.ret {
				t.Errorf("ret got %v want %v", r, tc.ret)
			}
		})
	}
}
diff --git a/pkg/sentry/time/sampler.go b/pkg/sentry/time/sampler.go
new file mode 100644
index 000000000..cf581b5fa
--- /dev/null
+++ b/pkg/sentry/time/sampler.go
@@ -0,0 +1,225 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package time

import (
	"errors"

	"gvisor.googlesource.com/gvisor/pkg/log"
)

const (
	// defaultOverheadCycles is the default estimated syscall overhead in
	// TSC cycles. It is further refined as syscalls are made.
	defaultOverheadCycles = 1 * 1000

	// maxOverheadCycles is the maximum allowed syscall overhead in TSC cycles.
	maxOverheadCycles = 100 * defaultOverheadCycles

	// maxSampleLoops is the maximum number of times to try to get a clock sample
	// under the expected overhead.
	maxSampleLoops = 5

	// maxSamples is the maximum number of samples to collect.
	maxSamples = 10
)

// errOverheadTooHigh is returned from sampler.Sample if the syscall
// overhead is too high.
var errOverheadTooHigh = errors.New("time syscall overhead exceeds maximum")

// TSCValue is a value from the TSC.
type TSCValue int64

// Rdtsc reads the TSC.
//
// Intel SDM, Vol 3, Ch 17.15:
// "The RDTSC instruction reads the time-stamp counter and is guaranteed to
// return a monotonically increasing unique value whenever executed, except for
// a 64-bit counter wraparound. Intel guarantees that the time-stamp counter
// will not wraparound within 10 years after being reset."
//
// We use int64, so we have 5 years before wrap-around.
//
// Implemented in assembly (tsc_amd64.s); this is only the declaration.
func Rdtsc() TSCValue

// ReferenceNS are nanoseconds in the reference clock domain.
// int64 gives us ~290 years before this overflows.
type ReferenceNS int64

// Magnitude returns the absolute value of r.
//
// NOTE(review): overflows for math.MinInt64; values that large are not
// expected in practice — confirm if callers can produce them.
func (r ReferenceNS) Magnitude() ReferenceNS {
	if r < 0 {
		return -r
	}
	return r
}

// cycleClock is a TSC-based cycle clock.
type cycleClock interface {
	// Cycles returns a count value from the TSC.
	Cycles() TSCValue
}

// tscCycleClock is a cycleClock that uses the real TSC.
type tscCycleClock struct{}

// Cycles implements cycleClock.Cycles.
func (tscCycleClock) Cycles() TSCValue {
	return Rdtsc()
}

// sample contains a sample from the reference clock, with TSC values from
// before and after the reference clock value was captured.
type sample struct {
	// before is the TSC value read immediately before the reference clock.
	before TSCValue
	// after is the TSC value read immediately after the reference clock.
	after TSCValue
	// ref is the reference clock reading captured between before and after.
	ref ReferenceNS
}

// Overhead returns the sample overhead in TSC cycles.
+func (s *sample) Overhead() TSCValue { + return s.after - s.before +} + +// referenceClocks collects individual samples from a reference clock ID and +// TSC. +type referenceClocks interface { + cycleClock + + // Sample returns a single sample from the reference clock ID. + Sample(c ClockID) (sample, error) +} + +// sampler collects samples from a reference system clock, minimizing +// the overhead in each sample. +type sampler struct { + // clockID is the reference clock ID (e.g., CLOCK_MONOTONIC). + clockID ClockID + + // clocks provides raw samples. + clocks referenceClocks + + // overhead is the estimated sample overhead in TSC cycles. + overhead TSCValue + + // samples is a ring buffer of the latest samples collected. + samples []sample +} + +// newSampler creates a sampler for clockID. +func newSampler(c ClockID) *sampler { + return &sampler{ + clockID: c, + clocks: syscallTSCReferenceClocks{}, + overhead: defaultOverheadCycles, + } +} + +// Reset discards previously collected clock samples. +func (s *sampler) Reset() { + s.overhead = defaultOverheadCycles + s.samples = []sample{} +} + +// lowOverheadSample returns a reference clock sample with minimized syscall overhead. +func (s *sampler) lowOverheadSample() (sample, error) { + for { + for i := 0; i < maxSampleLoops; i++ { + samp, err := s.clocks.Sample(s.clockID) + if err != nil { + return sample{}, err + } + + if samp.before > samp.after { + log.Warningf("TSC went backwards: %v > %v", samp.before, samp.after) + continue + } + + if samp.Overhead() <= s.overhead { + return samp, nil + } + } + + // Couldn't get a sample with the current overhead. Increase it. + newOverhead := 2 * s.overhead + if newOverhead > maxOverheadCycles { + // We'll give it one more shot with the max overhead. 
+ + if s.overhead == maxOverheadCycles { + return sample{}, errOverheadTooHigh + } + + newOverhead = maxOverheadCycles + } + + s.overhead = newOverhead + log.Debugf("Time: Adjusting syscall overhead up to %v", s.overhead) + } +} + +// Sample collects a reference clock sample. +func (s *sampler) Sample() error { + sample, err := s.lowOverheadSample() + if err != nil { + return err + } + + s.samples = append(s.samples, sample) + if len(s.samples) > maxSamples { + s.samples = s.samples[1:] + } + + // If the 4 most recent samples all have an overhead less than half the + // expected overhead, adjust downwards. + if len(s.samples) < 4 { + return nil + } + + for _, sample := range s.samples[len(s.samples)-4:] { + if sample.Overhead() > s.overhead/2 { + return nil + } + } + + s.overhead -= s.overhead / 8 + log.Debugf("Time: Adjusting syscall overhead down to %v", s.overhead) + + return nil +} + +// Syscall returns the current raw reference time without storing TSC +// samples. +func (s *sampler) Syscall() (ReferenceNS, error) { + sample, err := s.clocks.Sample(s.clockID) + if err != nil { + return 0, err + } + + return sample.ref, nil +} + +// Cycles returns a raw TSC value. +func (s *sampler) Cycles() TSCValue { + return s.clocks.Cycles() +} + +// Range returns the widest range of clock samples available. +func (s *sampler) Range() (sample, sample, bool) { + if len(s.samples) < 2 { + return sample{}, sample{}, false + } + + return s.samples[0], s.samples[len(s.samples)-1], true +} diff --git a/pkg/sentry/time/sampler_test.go b/pkg/sentry/time/sampler_test.go new file mode 100644 index 000000000..caf7e5c53 --- /dev/null +++ b/pkg/sentry/time/sampler_test.go @@ -0,0 +1,183 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "errors" + "testing" +) + +// errNoSamples is returned when testReferenceClocks runs out of samples. +var errNoSamples = errors.New("no samples available") + +// testReferenceClocks returns a preset list of samples and cycle counts. +type testReferenceClocks struct { + samples []sample + cycles []TSCValue +} + +// Sample implements referenceClocks.Sample, returning the next sample in the list. +func (t *testReferenceClocks) Sample(_ ClockID) (sample, error) { + if len(t.samples) == 0 { + return sample{}, errNoSamples + } + + s := t.samples[0] + if len(t.samples) == 1 { + t.samples = nil + } else { + t.samples = t.samples[1:] + } + + return s, nil +} + +// Cycles implements referenceClocks.Cycles, returning the next TSCValue in the list. +func (t *testReferenceClocks) Cycles() TSCValue { + if len(t.cycles) == 0 { + return 0 + } + + c := t.cycles[0] + if len(t.cycles) == 1 { + t.cycles = nil + } else { + t.cycles = t.cycles[1:] + } + + return c +} + +// newTestSampler returns a sampler that collects samples from +// the given sample list and cycle counts from the given cycle list. +func newTestSampler(samples []sample, cycles []TSCValue) *sampler { + return &sampler{ + clocks: &testReferenceClocks{ + samples: samples, + cycles: cycles, + }, + overhead: defaultOverheadCycles, + } +} + +// generateSamples generates n samples with the given overhead. 
+func generateSamples(n int, overhead TSCValue) []sample { + samples := []sample{{before: 1000000, after: 1000000 + overhead, ref: 100}} + for i := 0; i < n-1; i++ { + prev := samples[len(samples)-1] + samples = append(samples, sample{ + before: prev.before + 1000000, + after: prev.after + 1000000, + ref: prev.ref + 100, + }) + } + return samples +} + +// TestSample ensures that samples can be collected. +func TestSample(t *testing.T) { + testCases := []struct { + name string + samples []sample + err error + }{ + { + name: "basic", + samples: []sample{ + {before: 100000, after: 100000 + defaultOverheadCycles, ref: 100}, + }, + err: nil, + }, + { + // Sample with backwards TSC ignored. + // referenceClock should retry and get errNoSamples. + name: "backwards-tsc-ignored", + samples: []sample{ + {before: 100000, after: 90000, ref: 100}, + }, + err: errNoSamples, + }, + { + // Sample far above overhead skipped. + // referenceClock should retry and get errNoSamples. + name: "reject-overhead", + samples: []sample{ + {before: 100000, after: 100000 + 5*defaultOverheadCycles, ref: 100}, + }, + err: errNoSamples, + }, + { + // Maximum overhead allowed is bounded. + name: "over-max-overhead", + // Generate a bunch of samples. The reference clock + // needs a while to ramp up its expected overhead. + samples: generateSamples(100, 2*maxOverheadCycles), + err: errOverheadTooHigh, + }, + { + // Overhead at maximum overhead is allowed. + name: "max-overhead", + // Generate a bunch of samples. The reference clock + // needs a while to ramp up its expected overhead. + samples: generateSamples(100, maxOverheadCycles), + err: nil, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + s := newTestSampler(tc.samples, nil) + err := s.Sample() + if err != tc.err { + t.Errorf("Sample err got %v want %v", err, tc.err) + } + }) + } +} + +// TestOutliersIgnored tests that referenceClock ignores samples with very high +// overhead. 
+func TestOutliersIgnored(t *testing.T) { + s := newTestSampler([]sample{ + {before: 100000, after: 100000 + defaultOverheadCycles, ref: 100}, + {before: 200000, after: 200000 + defaultOverheadCycles, ref: 200}, + {before: 300000, after: 300000 + defaultOverheadCycles, ref: 300}, + {before: 400000, after: 400000 + defaultOverheadCycles, ref: 400}, + {before: 500000, after: 500000 + 5*defaultOverheadCycles, ref: 500}, // Ignored + {before: 600000, after: 600000 + defaultOverheadCycles, ref: 600}, + {before: 700000, after: 700000 + defaultOverheadCycles, ref: 700}, + }, nil) + + // Collect 5 samples. + for i := 0; i < 5; i++ { + err := s.Sample() + if err != nil { + t.Fatalf("Unexpected error while sampling: %v", err) + } + } + + oldest, newest, ok := s.Range() + if !ok { + t.Fatalf("Range not ok") + } + + if oldest.ref != 100 { + t.Errorf("oldest.ref got %v want %v", oldest.ref, 100) + } + + // We skipped the high-overhead sample. + if newest.ref != 600 { + t.Errorf("newest.ref got %v want %v", newest.ref, 600) + } +} diff --git a/pkg/sentry/time/sampler_unsafe.go b/pkg/sentry/time/sampler_unsafe.go new file mode 100644 index 000000000..7ea19d387 --- /dev/null +++ b/pkg/sentry/time/sampler_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 

package time

import (
	"syscall"
	"unsafe"
)

// syscallTSCReferenceClocks is the standard referenceClocks, collecting
// samples using CLOCK_GETTIME and RDTSC.
type syscallTSCReferenceClocks struct {
	tscCycleClock
}

// Sample implements referenceClocks.Sample. It reads the TSC immediately
// before and after the clock_gettime syscall so the caller can bound the
// sampling overhead.
func (syscallTSCReferenceClocks) Sample(c ClockID) (sample, error) {
	var s sample

	s.before = Rdtsc()

	// Don't call clockGettime to avoid a call which may call morestack.
	var ts syscall.Timespec
	_, _, e := syscall.RawSyscall(syscall.SYS_CLOCK_GETTIME, uintptr(c), uintptr(unsafe.Pointer(&ts)), 0)
	if e != 0 {
		// e is a syscall.Errno, which implements error.
		return sample{}, e
	}

	s.after = Rdtsc()
	s.ref = ReferenceNS(ts.Nano())

	return s, nil
}

// clockGettime calls SYS_CLOCK_GETTIME, returning time in nanoseconds.
func clockGettime(c ClockID) (ReferenceNS, error) {
	var ts syscall.Timespec
	_, _, e := syscall.RawSyscall(syscall.SYS_CLOCK_GETTIME, uintptr(c), uintptr(unsafe.Pointer(&ts)), 0)
	if e != 0 {
		return 0, e
	}

	return ReferenceNS(ts.Nano()), nil
}
diff --git a/pkg/sentry/time/tsc_amd64.s b/pkg/sentry/time/tsc_amd64.s
new file mode 100644
index 000000000..4cc604392
--- /dev/null
+++ b/pkg/sentry/time/tsc_amd64.s
@@ -0,0 +1,27 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "textflag.h"

TEXT ·Rdtsc(SB),NOSPLIT,$0-8
	// N.B. We need LFENCE on Intel, AMD is more complicated.
+ // Modern AMD CPUs with modern kernels make LFENCE behave like it does + // on Intel with MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT. MFENCE is + // otherwise needed on AMD. + LFENCE + RDTSC + SHLQ $32, DX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD new file mode 100644 index 000000000..c8ab03c3d --- /dev/null +++ b/pkg/sentry/uniqueid/BUILD @@ -0,0 +1,11 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "uniqueid", + srcs = ["context.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid", + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sentry/context"], +) diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go new file mode 100644 index 000000000..eeb8c4286 --- /dev/null +++ b/pkg/sentry/uniqueid/context.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package uniqueid defines context.Context keys for obtaining system-wide +// unique identifiers. +package uniqueid + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the kernel package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxGlobalUniqueID is a Context.Value key for a system-wide + // unique identifier. 
+ CtxGlobalUniqueID contextID = iota + + // CtxInotifyCookie is a Context.Value key for a unique inotify + // event cookie. + CtxInotifyCookie +) + +// GlobalFromContext returns a system-wide unique identifier from ctx. +func GlobalFromContext(ctx context.Context) uint64 { + return ctx.Value(CtxGlobalUniqueID).(uint64) +} + +// InotifyCookie generates a unique inotify event cookie from ctx. +func InotifyCookie(ctx context.Context) uint32 { + return ctx.Value(CtxInotifyCookie).(uint32) +} diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD new file mode 100644 index 000000000..a0fe0aa07 --- /dev/null +++ b/pkg/sentry/usage/BUILD @@ -0,0 +1,38 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "usage_state", + srcs = [ + "cpu.go", + "io.go", + "memory.go", + ], + out = "usage_state.go", + package = "usage", +) + +go_library( + name = "usage", + srcs = [ + "cpu.go", + "io.go", + "memory.go", + "memory_unsafe.go", + "usage.go", + "usage_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usage", + visibility = [ + "//pkg/sentry:internal", + ], + deps = [ + "//pkg/bits", + "//pkg/log", + "//pkg/sentry/memutil", + "//pkg/state", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/usage/cpu.go b/pkg/sentry/usage/cpu.go new file mode 100644 index 000000000..1c2cc90e1 --- /dev/null +++ b/pkg/sentry/usage/cpu.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usage + +import ( + "time" +) + +// CPUStats contains the subset of struct rusage fields that relate to CPU +// scheduling. +type CPUStats struct { + // UserTime is the amount of time spent executing application code. + UserTime time.Duration + + // SysTime is the amount of time spent executing sentry code. + SysTime time.Duration + + // VoluntarySwitches is the number of times control has been voluntarily + // ceded due to blocking, etc. + VoluntarySwitches uint64 + + // InvoluntarySwitches (struct rusage::ru_nivcsw) is unsupported, since + // "preemptive" scheduling is managed by the Go runtime, which doesn't + // provide this information. +} + +// Accumulate adds s2 to s. +func (s *CPUStats) Accumulate(s2 CPUStats) { + s.UserTime += s2.UserTime + s.SysTime += s2.SysTime + s.VoluntarySwitches += s2.VoluntarySwitches +} diff --git a/pkg/sentry/usage/io.go b/pkg/sentry/usage/io.go new file mode 100644 index 000000000..a05053c32 --- /dev/null +++ b/pkg/sentry/usage/io.go @@ -0,0 +1,88 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package usage + +import ( + "sync/atomic" +) + +// IO contains I/O-related statistics. +type IO struct { + // CharsRead is the number of bytes read by read syscalls. + CharsRead uint64 + + // CharsWritten is the number of bytes written by write syscalls. + CharsWritten uint64 + + // ReadSyscalls is the number of read syscalls. + ReadSyscalls uint64 + + // WriteSyscalls is the number of write syscalls. + WriteSyscalls uint64 + + // The following counter is only meaningful when Sentry has internal + // pagecache. + + // BytesRead is the number of bytes actually read into pagecache. + BytesRead uint64 + + // BytesWritten is the number of bytes actually written from pagecache. + BytesWritten uint64 + + // BytesWriteCancelled is the number of bytes not written out due to + // truncation. + BytesWriteCancelled uint64 +} + +// AccountReadSyscall does the accounting for a read syscall. +func (i *IO) AccountReadSyscall(bytes int64) { + atomic.AddUint64(&i.ReadSyscalls, 1) + if bytes > 0 { + atomic.AddUint64(&i.CharsRead, uint64(bytes)) + } +} + +// AccountWriteSyscall does the accounting for a write syscall. +func (i *IO) AccountWriteSyscall(bytes int64) { + atomic.AddUint64(&i.WriteSyscalls, 1) + if bytes > 0 { + atomic.AddUint64(&i.CharsWritten, uint64(bytes)) + } +} + +// AccountReadIO does the accounting for a read IO into the file system. +func (i *IO) AccountReadIO(bytes int64) { + if bytes > 0 { + atomic.AddUint64(&i.BytesRead, uint64(bytes)) + } +} + +// AccountWriteIO does the accounting for a write IO into the file system. +func (i *IO) AccountWriteIO(bytes int64) { + if bytes > 0 { + atomic.AddUint64(&i.BytesWritten, uint64(bytes)) + } +} + +// Accumulate adds up io usages. 
+func (i *IO) Accumulate(io *IO) { + atomic.AddUint64(&i.CharsRead, atomic.LoadUint64(&io.CharsRead)) + atomic.AddUint64(&i.CharsWritten, atomic.LoadUint64(&io.CharsWritten)) + atomic.AddUint64(&i.ReadSyscalls, atomic.LoadUint64(&io.ReadSyscalls)) + atomic.AddUint64(&i.WriteSyscalls, atomic.LoadUint64(&io.WriteSyscalls)) + atomic.AddUint64(&i.BytesRead, atomic.LoadUint64(&io.BytesRead)) + atomic.AddUint64(&i.BytesWritten, atomic.LoadUint64(&io.BytesWritten)) + atomic.AddUint64(&i.BytesWriteCancelled, atomic.LoadUint64(&io.BytesWriteCancelled)) +} diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go new file mode 100644 index 000000000..5d1b3a595 --- /dev/null +++ b/pkg/sentry/usage/memory.go @@ -0,0 +1,282 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usage + +import ( + "fmt" + "os" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" +) + +// MemoryKind represents a type of memory used by the application. +// +// For efficiency reasons, it is assumed that the Memory implementation is +// responsible for specific stats (documented below), and those may be reported +// in aggregate independently. See the platform.Memory interface as well as the +// control.Usage.Collect method for more information. +type MemoryKind int + +const ( + // System represents miscellaneous system memory. 
This may include + // memory that is in the process of being reclaimed, system caches, + // page tables, swap, etc. + // + // This memory kind is backed by platform memory. + System MemoryKind = iota + + // Anonymous represents anonymous application memory. + // + // This memory kind is backed by platform memory. + Anonymous + + // PageCache represents memory allocated to back sandbox-visible files that + // do not have a local fd. The contents of these files are buffered in + // memory to support application mmaps. + // + // This memory kind is backed by platform memory. + PageCache + + // Tmpfs represents memory used by the sandbox-visible tmpfs. + // + // This memory kind is backed by platform memory. + Tmpfs + + // Ramdiskfs represents memory used by the ramdiskfs. + // + // This memory kind is backed by platform memory. + Ramdiskfs + + // Mapped represents memory related to files which have a local fd on the + // host, and thus can be directly mapped. Typically these are files backed + // by gofers with donated-fd support. Note that this value may not track the + // exact amount of memory used by mapping on the host, because we don't have + // any visibility into the host kernel memory management. In particular, + // once we map some part of a host file, the host kernel is free to + // arbitrarily populate/decommit the pages, which it may do for various + // reasons (ex. host memory reclaim, NUMA balancing). + // + // This memory kind is backed by the host pagecache, via host mmaps. + Mapped +) + +// MemoryStats tracks application memory usage in bytes. All fields correspond to the +// memory category with the same name. This object is thread-safe if accessed +// through the provided methods. The public fields may be safely accessed +// directly on a copy of the object obtained from Memory.Copy(). +type MemoryStats struct { + System uint64 + Anonymous uint64 + PageCache uint64 + Tmpfs uint64 + // Lazily updated based on the value in RTMapped. 
+ Mapped uint64 + Ramdiskfs uint64 +} + +// RTMemoryStats contains the memory usage values that need to be directly +// exposed through a shared memory file for real-time access. These are +// categories not backed by platform memory. For details about how this works, +// see the memory accounting docs. +// +// N.B. Please keep the struct in sync with the API. Notably, changes to this +// struct require a version bump and addition of compatibility logic in the +// control server. As a special-case, adding fields without re-ordering existing +// ones does not require a version bump because the mapped page we use is +// initially zeroed. Any added field will be ignored by an older API and will be +// zero if read by a newer API. +type RTMemoryStats struct { + RTMapped uint64 +} + +// MemoryLocked is Memory with access methods. +type MemoryLocked struct { + mu sync.RWMutex + // MemoryStats records the memory stats. + MemoryStats + // RTMemoryStats records the memory stats that need to be exposed through + // shared page. + *RTMemoryStats + // File is the backing file storing the memory stats. + File *os.File +} + +func newMemoryLocked() MemoryLocked { + name := "memory-usage" + fd, err := memutil.CreateMemFD(name, 0) + if err != nil { + panic("error creating usage file: " + err.Error()) + } + file := os.NewFile(uintptr(fd), name) + if err := file.Truncate(int64(RTMemoryStatsSize)); err != nil { + panic("error truncating usage file: " + err.Error()) + } + // Note: We rely on the returned page being initially zeroed. This will + // always be the case for a newly mapped page from /dev/shm. If we obtain + // the shared memory through some other means in the future, we may have to + // explicitly zero the page. 
+ mmap, err := syscall.Mmap(int(file.Fd()), 0, int(RTMemoryStatsSize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + panic("error mapping usage file: " + err.Error()) + } + + return MemoryLocked{ + File: file, + RTMemoryStats: RTMemoryStatsPointer(mmap), + } +} + +// MemoryAccounting is the global memory stats. +// +// There is no need to save or restore the global memory accounting object, +// because individual frame kinds are saved and charged only when they become +// resident. +var MemoryAccounting = newMemoryLocked() + +func (m *MemoryLocked) incLocked(val uint64, kind MemoryKind) { + switch kind { + case System: + atomic.AddUint64(&m.System, val) + case Anonymous: + atomic.AddUint64(&m.Anonymous, val) + case PageCache: + atomic.AddUint64(&m.PageCache, val) + case Mapped: + atomic.AddUint64(&m.RTMapped, val) + case Tmpfs: + atomic.AddUint64(&m.Tmpfs, val) + case Ramdiskfs: + atomic.AddUint64(&m.Ramdiskfs, val) + default: + panic(fmt.Sprintf("invalid memory kind: %v", kind)) + } +} + +// Inc adds an additional usage of 'val' bytes to memory category 'kind'. +// +// This method is thread-safe. +func (m *MemoryLocked) Inc(val uint64, kind MemoryKind) { + m.mu.RLock() + m.incLocked(val, kind) + m.mu.RUnlock() +} + +func (m *MemoryLocked) decLocked(val uint64, kind MemoryKind) { + switch kind { + case System: + atomic.AddUint64(&m.System, ^(val - 1)) + case Anonymous: + atomic.AddUint64(&m.Anonymous, ^(val - 1)) + case PageCache: + atomic.AddUint64(&m.PageCache, ^(val - 1)) + case Mapped: + atomic.AddUint64(&m.RTMapped, ^(val - 1)) + case Tmpfs: + atomic.AddUint64(&m.Tmpfs, ^(val - 1)) + case Ramdiskfs: + atomic.AddUint64(&m.Ramdiskfs, ^(val - 1)) + default: + panic(fmt.Sprintf("invalid memory kind: %v", kind)) + } +} + +// Dec removes a usage of 'val' bytes from memory category 'kind'. +// +// This method is thread-safe. 
+func (m *MemoryLocked) Dec(val uint64, kind MemoryKind) { + m.mu.RLock() + m.decLocked(val, kind) + m.mu.RUnlock() +} + +// Move moves a usage of 'val' bytes from 'from' to 'to'. +// +// This method is thread-safe. +func (m *MemoryLocked) Move(val uint64, to MemoryKind, from MemoryKind) { + m.mu.RLock() + // Just call decLocked and incLocked directly. We held the RLock to + // protect against concurrent callers to Total(). + m.decLocked(val, from) + m.incLocked(val, to) + m.mu.RUnlock() +} + +// totalLocked returns a total usage. +// +// Precondition: must be called when locked. +func (m *MemoryLocked) totalLocked() (total uint64) { + total += atomic.LoadUint64(&m.System) + total += atomic.LoadUint64(&m.Anonymous) + total += atomic.LoadUint64(&m.PageCache) + total += atomic.LoadUint64(&m.RTMapped) + total += atomic.LoadUint64(&m.Tmpfs) + total += atomic.LoadUint64(&m.Ramdiskfs) + return +} + +// Total returns a total memory usage. +// +// This method is thread-safe. +func (m *MemoryLocked) Total() uint64 { + m.mu.Lock() + defer m.mu.Unlock() + return m.totalLocked() +} + +// Copy returns a copy of the structure with a total. +// +// This method is thread-safe. +func (m *MemoryLocked) Copy() (MemoryStats, uint64) { + m.mu.Lock() + defer m.mu.Unlock() + ms := m.MemoryStats + ms.Mapped = m.RTMapped + return ms, m.totalLocked() +} + +// MinimumTotalMemoryBytes is the minimum reported total system memory. +var MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB + +// TotalMemory returns the "total usable memory" available. +// +// This number doesn't really have a true value so it's based on the following +// inputs and further bounded to be above some minimum guaranteed value (2GB), +// additionally ensuring that total memory reported is always less than used. 
+// +// memSize should be the platform.Memory size reported by platform.Memory.TotalSize() +// used is the total memory reported by MemoryLocked.Total() +func TotalMemory(memSize, used uint64) uint64 { + if memSize < MinimumTotalMemoryBytes { + memSize = MinimumTotalMemoryBytes + } + if memSize < used { + memSize = used + // Bump memSize to the next largest power of 2, if one exists, so + // that MemFree isn't 0. + if msb := bits.MostSignificantOne64(memSize); msb < 63 { + memSize = uint64(1) << (uint(msb) + 1) + } + } + return memSize +} + +// IncrementalMappedAccounting controls whether host mapped memory is accounted +// incrementally during map translation. This may be modified during early +// initialization, and is read-only afterward. +var IncrementalMappedAccounting = false diff --git a/pkg/sentry/usage/memory_unsafe.go b/pkg/sentry/usage/memory_unsafe.go new file mode 100644 index 000000000..f990a7750 --- /dev/null +++ b/pkg/sentry/usage/memory_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usage + +import ( + "unsafe" +) + +// RTMemoryStatsSize is the size of the RTMemoryStats struct. +var RTMemoryStatsSize = unsafe.Sizeof(RTMemoryStats{}) + +// RTMemoryStatsPointer casts the address of the byte slice into a RTMemoryStats pointer. 
+func RTMemoryStatsPointer(b []byte) *RTMemoryStats { + return (*RTMemoryStats)(unsafe.Pointer(&b[0])) +} diff --git a/pkg/sentry/usage/usage.go b/pkg/sentry/usage/usage.go new file mode 100644 index 000000000..3b3118659 --- /dev/null +++ b/pkg/sentry/usage/usage.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package usage provides representations of resource usage. +package usage diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD new file mode 100644 index 000000000..36c0760dd --- /dev/null +++ b/pkg/sentry/usermem/BUILD @@ -0,0 +1,70 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "usermem_state", + srcs = [ + "access_type.go", + "addr.go", + "addr_range.go", + "addr_range_seq_unsafe.go", + ], + out = "usermem_state.go", + package = "usermem", +) + +go_template_instance( + name = "addr_range", + out = "addr_range.go", + package = "usermem", + prefix = "Addr", + template = "//pkg/segment:generic_range", + types = { + "T": "Addr", + }, +) + +go_library( + name = "usermem", + srcs = [ + "access_type.go", + "addr.go", + "addr_range.go", + "addr_range_seq_unsafe.go", + "bytes_io.go", + "bytes_io_unsafe.go", + "usermem.go", + "usermem_state.go", + "usermem_x86.go", + ], + importpath = 
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/atomicbitops", + "//pkg/binary", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/safemem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/buffer", + ], +) + +go_test( + name = "usermem_test", + size = "small", + srcs = [ + "addr_range_seq_test.go", + "usermem_test.go", + ], + embed = [":usermem"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/safemem", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/usermem/README.md b/pkg/sentry/usermem/README.md new file mode 100644 index 000000000..2ebd3bcc1 --- /dev/null +++ b/pkg/sentry/usermem/README.md @@ -0,0 +1,31 @@ +This package defines primitives for sentry access to application memory. + +Major types: + +- The `IO` interface represents a virtual address space and provides I/O methods + on that address space. `IO` is the lowest-level primitive. The primary + implementation of the `IO` interface is `mm.MemoryManager`. + +- `IOSequence` represents a collection of individually-contiguous address ranges + in a `IO` that is operated on sequentially, analogous to Linux's `struct + iov_iter`. + +Major usage patterns: + +- Access to a task's virtual memory, subject to the application's memory + protections and while running on that task's goroutine, from a context that is + at or above the level of the `kernel` package (e.g. most syscall + implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers + defined in `kernel/task_usermem.go`. + +- Access to a task's virtual memory, from a context that is at or above the + level of the `kernel` package, but where any of the above constraints does not + hold (e.g. `PTRACE_POKEDATA`, which ignores application memory protections); + obtain the task's `mm.MemoryManager` by calling `kernel.Task.MemoryManager`, + and call its `IO` methods directly. 
+ +- Access to a task's virtual memory, from a context that is below the level of + the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments + from higher layers, usually in the form of an `IOSequence`. The + `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions in + `kernel/task_usermem.go` are convenience functions for doing so. diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go new file mode 100644 index 000000000..7eabecf30 --- /dev/null +++ b/pkg/sentry/usermem/access_type.go @@ -0,0 +1,117 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "syscall" +) + +// AccessType specifies memory access types. This is used for +// setting mapping permissions, as well as communicating faults. +type AccessType struct { + // Read is read access. + Read bool + + // Write is write access. + Write bool + + // Execute is executable access. + Execute bool +} + +// String returns a pretty representation of access. This looks like the +// familiar r-x, rw-, etc. and can be relied on as such. +func (a AccessType) String() string { + bits := [3]byte{'-', '-', '-'} + if a.Read { + bits[0] = 'r' + } + if a.Write { + bits[1] = 'w' + } + if a.Execute { + bits[2] = 'x' + } + return string(bits[:]) +} + +// Any returns true iff at least one of Read, Write or Execute is true. 
+func (a AccessType) Any() bool { + return a.Read || a.Write || a.Execute +} + +// Prot returns the system prot (syscall.PROT_READ, etc.) for this access. +func (a AccessType) Prot() int { + var prot int + if a.Read { + prot |= syscall.PROT_READ + } + if a.Write { + prot |= syscall.PROT_WRITE + } + if a.Execute { + prot |= syscall.PROT_EXEC + } + return prot +} + +// SupersetOf returns true iff the access types in a are a superset of the +// access types in other. +func (a AccessType) SupersetOf(other AccessType) bool { + if !a.Read && other.Read { + return false + } + if !a.Write && other.Write { + return false + } + if !a.Execute && other.Execute { + return false + } + return true +} + +// Intersect returns the access types set in both a and other. +func (a AccessType) Intersect(other AccessType) AccessType { + return AccessType{ + Read: a.Read && other.Read, + Write: a.Write && other.Write, + Execute: a.Execute && other.Execute, + } +} + +// Effective returns the set of effective access types allowed by a, even if +// some types are not explicitly allowed. +func (a AccessType) Effective() AccessType { + // In Linux, Write and Execute access generally imply Read access. See + // mm/mmap.c:protection_map. + // + // The notable exception is get_user_pages, which only checks against + // the original vma flags. That said, most user memory accesses do not + // use GUP. + if a.Write || a.Execute { + a.Read = true + } + return a +} + +// Convenient access types. +var ( + NoAccess = AccessType{} + Read = AccessType{Read: true} + Write = AccessType{Write: true} + Execute = AccessType{Execute: true} + ReadWrite = AccessType{Read: true, Write: true} + AnyAccess = AccessType{Read: true, Write: true, Execute: true} +) diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go new file mode 100644 index 000000000..d175fdc74 --- /dev/null +++ b/pkg/sentry/usermem/addr.go @@ -0,0 +1,106 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "fmt" +) + +// Addr represents a generic virtual address. +type Addr uintptr + +// AddLength adds the given length to start and returns the result. ok is true +// iff adding the length did not overflow the range of Addr. +// +// Note: This function is usually used to get the end of an address range +// defined by its start address and length. Since the resulting end is +// exclusive, end == 0 is technically valid, and corresponds to a range that +// extends to the end of the address space, but ok will be false. This isn't +// expected to ever come up in practice. +func (v Addr) AddLength(length uint64) (end Addr, ok bool) { + end = v + Addr(length) + // The second half of the following check is needed in case uintptr is + // smaller than 64 bits. + ok = end >= v && length <= uint64(^Addr(0)) + return +} + +// RoundDown returns the address rounded down to the nearest page boundary. +func (v Addr) RoundDown() Addr { + return v & ^Addr(PageSize-1) +} + +// RoundUp returns the address rounded up to the nearest page boundary. ok is +// true iff rounding up did not wrap around. +func (v Addr) RoundUp() (addr Addr, ok bool) { + addr = Addr(v + PageSize - 1).RoundDown() + ok = addr >= v + return +} + +// MustRoundUp is equivalent to RoundUp, but panics if rounding up wraps +// around. 
+func (v Addr) MustRoundUp() Addr { + addr, ok := v.RoundUp() + if !ok { + panic(fmt.Sprintf("usermem.Addr(%d).RoundUp() wraps", v)) + } + return addr +} + +// HugeRoundDown returns the address rounded down to the nearest huge page +// boundary. +func (v Addr) HugeRoundDown() Addr { + return v & ^Addr(HugePageSize-1) +} + +// HugeRoundUp returns the address rounded up to the nearest huge page boundary. +// ok is true iff rounding up did not wrap around. +func (v Addr) HugeRoundUp() (addr Addr, ok bool) { + addr = Addr(v + HugePageSize - 1).HugeRoundDown() + ok = addr >= v + return +} + +// PageOffset returns the offset of v into the current page. +func (v Addr) PageOffset() uint64 { + return uint64(v & Addr(PageSize-1)) +} + +// IsPageAligned returns true if v.PageOffset() == 0. +func (v Addr) IsPageAligned() bool { + return v.PageOffset() == 0 +} + +// AddrRange is a range of Addrs. +// +// type AddrRange + +// ToRange returns [v, v+length). +func (v Addr) ToRange(length uint64) (AddrRange, bool) { + end, ok := v.AddLength(length) + return AddrRange{v, end}, ok +} + +// IsPageAligned returns true if ar.Start.IsPageAligned() and +// ar.End.IsPageAligned(). +func (ar AddrRange) IsPageAligned() bool { + return ar.Start.IsPageAligned() && ar.End.IsPageAligned() +} + +// String implements fmt.Stringer.String. +func (ar AddrRange) String() string { + return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) +} diff --git a/pkg/sentry/usermem/addr_range_seq_test.go b/pkg/sentry/usermem/addr_range_seq_test.go new file mode 100644 index 000000000..cf9d785ed --- /dev/null +++ b/pkg/sentry/usermem/addr_range_seq_test.go @@ -0,0 +1,197 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "testing" +) + +var addrRangeSeqTests = []struct { + desc string + ranges []AddrRange +}{ + { + desc: "Empty sequence", + }, + { + desc: "Single empty AddrRange", + ranges: []AddrRange{ + {0x10, 0x10}, + }, + }, + { + desc: "Single non-empty AddrRange of length 1", + ranges: []AddrRange{ + {0x10, 0x11}, + }, + }, + { + desc: "Single non-empty AddrRange of length 2", + ranges: []AddrRange{ + {0x10, 0x12}, + }, + }, + { + desc: "Multiple non-empty AddrRanges", + ranges: []AddrRange{ + {0x10, 0x11}, + {0x20, 0x22}, + }, + }, + { + desc: "Multiple AddrRanges including empty AddrRanges", + ranges: []AddrRange{ + {0x10, 0x10}, + {0x20, 0x20}, + {0x30, 0x33}, + {0x40, 0x44}, + {0x50, 0x50}, + {0x60, 0x60}, + {0x70, 0x77}, + {0x80, 0x88}, + {0x90, 0x90}, + {0xa0, 0xa0}, + }, + }, +} + +func testAddrRangeSeqEqualityWithTailIteration(t *testing.T, ars AddrRangeSeq, wantRanges []AddrRange) { + var wantLen int64 + for _, ar := range wantRanges { + wantLen += int64(ar.Length()) + } + + var i int + for !ars.IsEmpty() { + if gotLen := ars.NumBytes(); gotLen != wantLen { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) + } + if gotN, wantN := ars.NumRanges(), len(wantRanges)-i; gotN != wantN { + t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted %d", i, ars, gotN, wantN) + } + got := ars.Head() + if i >= len(wantRanges) { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) + } else if want := wantRanges[i]; got != want { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted 
%s", i, ars, got, want) + } + ars = ars.Tail() + wantLen -= int64(got.Length()) + i++ + } + if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) + } + if gotN := ars.NumRanges(); gotN != 0 { + t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted 0", i, ars, gotN) + } +} + +func TestAddrRangeSeqTailIteration(t *testing.T) { + for _, test := range addrRangeSeqTests { + t.Run(test.desc, func(t *testing.T) { + testAddrRangeSeqEqualityWithTailIteration(t, AddrRangeSeqFromSlice(test.ranges), test.ranges) + }) + } +} + +func TestAddrRangeSeqDropFirstEmpty(t *testing.T) { + var ars AddrRangeSeq + if got, want := ars.DropFirst(1), ars; got != want { + t.Errorf("%v.DropFirst(1): got %v, wanted %v", ars, got, want) + } +} + +func TestAddrRangeSeqDropSingleByteIteration(t *testing.T) { + // Tests AddrRangeSeq iteration using Head/DropFirst, simulating + // I/O-per-AddrRange. + for _, test := range addrRangeSeqTests { + t.Run(test.desc, func(t *testing.T) { + // Figure out what AddrRanges we expect to see. + var wantLen int64 + var wantRanges []AddrRange + for _, ar := range test.ranges { + wantLen += int64(ar.Length()) + wantRanges = append(wantRanges, ar) + if ar.Length() == 0 { + // We "do" 0 bytes of I/O and then call DropFirst(0), + // advancing to the next AddrRange. + continue + } + // Otherwise we "do" 1 byte of I/O and then call DropFirst(1), + // advancing the AddrRange by 1 byte, or to the next AddrRange + // if this one is exhausted. 
+ for ar.Start++; ar.Length() != 0; ar.Start++ { + wantRanges = append(wantRanges, ar) + } + } + t.Logf("Expected AddrRanges: %s (%d bytes)", wantRanges, wantLen) + + ars := AddrRangeSeqFromSlice(test.ranges) + var i int + for !ars.IsEmpty() { + if gotLen := ars.NumBytes(); gotLen != wantLen { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) + } + got := ars.Head() + if i >= len(wantRanges) { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) + } else if want := wantRanges[i]; got != want { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want) + } + if got.Length() == 0 { + ars = ars.DropFirst(0) + } else { + ars = ars.DropFirst(1) + wantLen-- + } + i++ + } + if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) + } + }) + } +} + +func TestAddrRangeSeqTakeFirstEmpty(t *testing.T) { + var ars AddrRangeSeq + if got, want := ars.TakeFirst(1), ars; got != want { + t.Errorf("%v.TakeFirst(1): got %v, wanted %v", ars, got, want) + } +} + +func TestAddrRangeSeqTakeFirst(t *testing.T) { + ranges := []AddrRange{ + {0x10, 0x11}, + {0x20, 0x22}, + {0x30, 0x30}, + {0x40, 0x44}, + {0x50, 0x55}, + {0x60, 0x60}, + {0x70, 0x77}, + } + ars := AddrRangeSeqFromSlice(ranges).TakeFirst(5) + want := []AddrRange{ + {0x10, 0x11}, // +1 byte (total 1 byte), not truncated + {0x20, 0x22}, // +2 bytes (total 3 bytes), not truncated + {0x30, 0x30}, // +0 bytes (total 3 bytes), no change + {0x40, 0x42}, // +2 bytes (total 5 bytes), partially truncated + {0x50, 0x50}, // +0 bytes (total 5 bytes), fully truncated + {0x60, 0x60}, // +0 bytes (total 5 bytes), "fully truncated" (no change) + {0x70, 0x70}, // +0 bytes (total 5 bytes), fully truncated + } + testAddrRangeSeqEqualityWithTailIteration(t, ars, want) +} diff --git a/pkg/sentry/usermem/addr_range_seq_unsafe.go b/pkg/sentry/usermem/addr_range_seq_unsafe.go 
new file mode 100644 index 000000000..13b2998b3 --- /dev/null +++ b/pkg/sentry/usermem/addr_range_seq_unsafe.go @@ -0,0 +1,277 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "bytes" + "fmt" + "reflect" + "unsafe" +) + +// An AddrRangeSeq represents a sequence of AddrRanges. +// +// AddrRangeSeqs are immutable and may be copied by value. The zero value of +// AddrRangeSeq represents an empty sequence. +// +// An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary +// since zero-length AddrRanges are significant to MM bounds checks. +type AddrRangeSeq struct { + // If length is 0, then the AddrRangeSeq represents no AddrRanges. + // Invariants: data == 0; offset == 0; limit == 0. + // + // If length is 1, then the AddrRangeSeq represents the single + // AddrRange{offset, offset+limit}. Invariants: data == 0. + // + // Otherwise, length >= 2, and the AddrRangeSeq represents the `length` + // AddrRanges in the array of AddrRanges starting at address `data`, + // starting at `offset` bytes into the first AddrRange and limited to the + // following `limit` bytes. (AddrRanges after `limit` are still iterated, + // but are truncated to a length of 0.) Invariants: data != 0; offset <= + // data[0].Length(); limit > 0; offset+limit <= the combined length of all + // AddrRanges in the array. 
+ data unsafe.Pointer + length int + offset Addr + limit Addr +} + +// AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar. +func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq { + return AddrRangeSeq{ + length: 1, + offset: ar.Start, + limit: ar.Length(), + } +} + +// AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in +// slice. +// +// Whether the returned AddrRangeSeq shares memory with slice is unspecified; +// clients should avoid mutating slices passed to AddrRangeSeqFromSlice. +// +// Preconditions: The combined length of all AddrRanges in slice <= +// math.MaxInt64. +func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq { + var limit int64 + for _, ar := range slice { + len64 := int64(ar.Length()) + if len64 < 0 { + panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar)) + } + sum := limit + len64 + if sum < limit { + panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice)) + } + limit = sum + } + return addrRangeSeqFromSliceLimited(slice, limit) +} + +// Preconditions: The combined length of all AddrRanges in slice <= limit. +// limit >= 0. If len(slice) != 0, then limit > 0. +func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq { + switch len(slice) { + case 0: + return AddrRangeSeq{} + case 1: + return AddrRangeSeq{ + length: 1, + offset: slice[0].Start, + limit: Addr(limit), + } + default: + return AddrRangeSeq{ + data: unsafe.Pointer(&slice[0]), + length: len(slice), + limit: Addr(limit), + } + } +} + +// IsEmpty returns true if ars.NumRanges() == 0. +// +// Note that since AddrRangeSeq may contain AddrRanges with a length of zero, +// an AddrRange representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not +// necessarily empty. +func (ars AddrRangeSeq) IsEmpty() bool { + return ars.length == 0 +} + +// NumRanges returns the number of AddrRanges in ars. 
+func (ars AddrRangeSeq) NumRanges() int { + return ars.length +} + +// NumBytes returns the number of bytes represented by ars. +func (ars AddrRangeSeq) NumBytes() int64 { + return int64(ars.limit) +} + +// Head returns the first AddrRange in ars. +// +// Preconditions: !ars.IsEmpty(). +func (ars AddrRangeSeq) Head() AddrRange { + if ars.length == 0 { + panic("empty AddrRangeSeq") + } + if ars.length == 1 { + return AddrRange{ars.offset, ars.offset + ars.limit} + } + ar := *(*AddrRange)(ars.data) + ar.Start += ars.offset + if ar.Length() > ars.limit { + ar.End = ar.Start + ars.limit + } + return ar +} + +// Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the +// first. +// +// Preconditions: !ars.IsEmpty(). +func (ars AddrRangeSeq) Tail() AddrRangeSeq { + if ars.length == 0 { + panic("empty AddrRangeSeq") + } + if ars.length == 1 { + return AddrRangeSeq{} + } + return ars.externalTail() +} + +// Preconditions: ars.length >= 2. +func (ars AddrRangeSeq) externalTail() AddrRangeSeq { + headLen := (*AddrRange)(ars.data).Length() - ars.offset + var tailLimit int64 + if ars.limit > headLen { + tailLimit = int64(ars.limit - headLen) + } + var extSlice []AddrRange + extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) + extSliceHdr.Data = uintptr(ars.data) + extSliceHdr.Len = ars.length + extSliceHdr.Cap = ars.length + return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit) +} + +// DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n +// bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty +// AddrRangeSeq. +// +// If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit +// at least ars.Head(), even if n == 0. This guarantees that the basic pattern +// of: +// +// for !ars.IsEmpty() { +// n, err = doIOWith(ars.Head()) +// if err != nil { +// return err +// } +// ars = ars.DropFirst(n) +// } +// +// works even in the presence of zero-length AddrRanges. 
//
// Preconditions: n >= 0.
func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	return ars.DropFirst64(int64(n))
}

// DropFirst64 is equivalent to DropFirst but takes an int64.
func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	// Dropping more bytes than the sequence represents empties it entirely.
	if Addr(n) > ars.limit {
		return AddrRangeSeq{}
	}
	// Handle initial empty AddrRange.
	switch ars.length {
	case 0:
		return AddrRangeSeq{}
	case 1:
		if ars.limit == 0 {
			return AddrRangeSeq{}
		}
	default:
		// If offset has already consumed the entire first AddrRange, advance
		// to the next one so the loop below sees a non-exhausted head.
		if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen {
			ars = ars.externalTail()
		}
	}
	for n != 0 {
		// Calling ars.Head() here is surprisingly expensive, so inline getting
		// the head's length.
		var headLen Addr
		if ars.length == 1 {
			headLen = ars.limit
		} else {
			headLen = (*AddrRange)(ars.data).Length() - ars.offset
		}
		if Addr(n) < headLen {
			// Dropping ends partway through the head AddrRange.
			ars.offset += Addr(n)
			ars.limit -= Addr(n)
			return ars
		}
		n -= int64(headLen)
		ars = ars.Tail()
	}
	return ars
}

// TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n
// bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the
// first n bytes are reduced to a length of zero, but will still be iterated.
//
// Preconditions: n >= 0.
func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	return ars.TakeFirst64(int64(n))
}

// TakeFirst64 is equivalent to TakeFirst but takes an int64.
func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	// Truncation only ever lowers limit; the underlying AddrRanges are
	// unchanged, so trailing ranges are still iterated (with zero length).
	if ars.limit > Addr(n) {
		ars.limit = Addr(n)
	}
	return ars
}

// String implements fmt.Stringer.String.
+func (ars AddrRangeSeq) String() string { + // This is deliberately chosen to be the same as fmt's automatic stringer + // for []AddrRange. + var buf bytes.Buffer + buf.WriteByte('[') + var sep string + for !ars.IsEmpty() { + buf.WriteString(sep) + sep = " " + buf.WriteString(ars.Head().String()) + ars = ars.Tail() + } + buf.WriteByte(']') + return buf.String() +} diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go new file mode 100644 index 000000000..01a746404 --- /dev/null +++ b/pkg/sentry/usermem/bytes_io.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const maxInt = int(^uint(0) >> 1) + +// BytesIO implements IO using a byte slice. Addresses are interpreted as +// offsets into the slice. Reads and writes beyond the end of the slice return +// EFAULT. +type BytesIO struct { + Bytes []byte +} + +// CopyOut implements IO.CopyOut. +func (b *BytesIO) CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) { + rngN, rngErr := b.rangeCheck(addr, len(src)) + if rngN == 0 { + return 0, rngErr + } + return copy(b.Bytes[int(addr):], src[:rngN]), rngErr +} + +// CopyIn implements IO.CopyIn. 
+func (b *BytesIO) CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) { + rngN, rngErr := b.rangeCheck(addr, len(dst)) + if rngN == 0 { + return 0, rngErr + } + return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr +} + +// ZeroOut implements IO.ZeroOut. +func (b *BytesIO) ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) { + if toZero > int64(maxInt) { + return 0, syserror.EINVAL + } + rngN, rngErr := b.rangeCheck(addr, int(toZero)) + if rngN == 0 { + return 0, rngErr + } + zeroSlice := b.Bytes[int(addr) : int(addr)+rngN] + for i := range zeroSlice { + zeroSlice[i] = 0 + } + return int64(rngN), rngErr +} + +// CopyOutFrom implements IO.CopyOutFrom. +func (b *BytesIO) CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) { + dsts, rngErr := b.blocksFromAddrRanges(ars) + n, err := src.ReadToBlocks(dsts) + if err != nil { + return int64(n), err + } + return int64(n), rngErr +} + +// CopyInTo implements IO.CopyInTo. 
func (b *BytesIO) CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) {
	// Collect readable blocks, then hand them to the writer in one call.
	// A range error is returned only if the write itself succeeded.
	srcs, rngErr := b.blocksFromAddrRanges(ars)
	n, err := dst.WriteFromBlocks(srcs)
	if err != nil {
		return int64(n), err
	}
	return int64(n), rngErr
}

// rangeCheck returns the number of bytes in [addr, addr+length) that lie
// within b.Bytes, and a non-nil error if that is less than length. A partial
// count may be returned together with EFAULT (access runs off the end of the
// slice or the address space wraps).
func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) {
	if length == 0 {
		return 0, nil
	}
	if length < 0 {
		return 0, syserror.EINVAL
	}
	max := Addr(len(b.Bytes))
	if addr >= max {
		// Nothing at all is accessible.
		return 0, syserror.EFAULT
	}
	end, ok := addr.AddLength(uint64(length))
	if !ok || end > max {
		// Only the prefix up to the end of the slice is accessible.
		return int(max - addr), syserror.EFAULT
	}
	return length, nil
}

// blocksFromAddrRanges converts ars into a BlockSeq of the accessible
// sub-slices of b.Bytes, stopping (with the blocks gathered so far) at the
// first range error.
func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) {
	blocks := make([]safemem.Block, 0, ars.NumRanges())
	for !ars.IsEmpty() {
		ar := ars.Head()
		n, err := b.rangeCheck(ar.Start, int(ar.Length()))
		if n != 0 {
			blocks = append(blocks, safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start):int(ar.Start)+n]))
		}
		if err != nil {
			return safemem.BlockSeqFromSlice(blocks), err
		}
		ars = ars.Tail()
	}
	return safemem.BlockSeqFromSlice(blocks), nil
}

// BytesIOSequence returns an IOSequence representing the given byte slice.
func BytesIOSequence(buf []byte) IOSequence {
	return IOSequence{
		IO:    &BytesIO{buf},
		Addrs: AddrRangeSeqOf(AddrRange{0, Addr(len(buf))}),
	}
}
diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go
new file mode 100644
index 000000000..efd71fcbc
--- /dev/null
+++ b/pkg/sentry/usermem/bytes_io_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "sync/atomic" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// SwapUint32 implements IO.SwapUint32. +func (b *BytesIO) SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) { + if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { + return 0, rngErr + } + return atomic.SwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), new), nil +} + +// CompareAndSwapUint32 implements IO.CompareAndSwapUint32. +func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) { + if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { + return 0, rngErr + } + return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil +} diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go new file mode 100644 index 000000000..5d8a1c558 --- /dev/null +++ b/pkg/sentry/usermem/usermem.go @@ -0,0 +1,572 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package usermem governs access to user memory. +package usermem + +import ( + "errors" + "io" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// IO provides access to the contents of a virtual memory space. +// +// FIXME: Implementations of IO cannot expect ctx to contain any +// meaningful data. +type IO interface { + // CopyOut copies len(src) bytes from src to the memory mapped at addr. It + // returns the number of bytes copied. If the number of bytes copied is < + // len(src), it returns a non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. + CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) + + // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. + // It returns the number of bytes copied. If the number of bytes copied is + // < len(dst), it returns a non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. + CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) + + // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number + // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a + // non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. toZero >= 0. + ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) + + // CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at + // ars. 
It returns the number of bytes copied, which may be less than the + // number of bytes read from src if copying fails. CopyOutFrom may return a + // partial copy without an error iff src.ReadToBlocks returns a partial + // read without an error. + // + // CopyOutFrom calls src.ReadToBlocks at most once. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. src.ReadToBlocks must not block + // on mm.MemoryManager.activeMu or any preceding locks in the lock order. + CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) + + // CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to + // dst. It returns the number of bytes copied. CopyInTo may return a + // partial copy without an error iff dst.WriteFromBlocks returns a partial + // write without an error. + // + // CopyInTo calls dst.WriteFromBlocks at most once. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. dst.WriteFromBlocks must not + // block on mm.MemoryManager.activeMu or any preceding locks in the lock + // order. + CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) + + // TODO: The requirement that CopyOutFrom/CopyInTo call src/dst + // at most once, which is unnecessary in most cases, forces implementations + // to gather safemem.Blocks into a single slice to pass to src/dst. Add + // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid + // this allocation. + + // SwapUint32 atomically sets the uint32 value at addr to new and + // returns the previous value. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. 
+ SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) + + // CompareAndSwapUint32 atomically compares the uint32 value at addr to + // old; if they are equal, the value in memory is replaced by new. In + // either case, the previous value stored in memory is returned. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. + CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) +} + +// IOOpts contains options applicable to all IO methods. +type IOOpts struct { + // If IgnorePermissions is true, application-defined memory protections set + // by mmap(2) or mprotect(2) will be ignored. (Memory protections required + // by the target of the mapping are never ignored.) + IgnorePermissions bool + + // If AddressSpaceActive is true, the IO implementation may assume that it + // has an active AddressSpace and can therefore use AddressSpace copying + // without performing activation. See mm/io.go for details. + AddressSpaceActive bool +} + +// IOReadWriter is an io.ReadWriter that reads from / writes to addresses +// starting at addr in IO. The preconditions that apply to IO.CopyIn and +// IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write +// respectively. +type IOReadWriter struct { + Ctx context.Context + IO IO + Addr Addr + Opts IOOpts +} + +// Read implements io.Reader.Read. +// +// Note that an address space does not have an "end of file", so Read can only +// return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or +// unreadable memory, or beyond the end of the address space, should return +// EFAULT. +func (rw *IOReadWriter) Read(dst []byte) (int, error) { + n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts) + end, ok := rw.Addr.AddLength(uint64(n)) + if ok { + rw.Addr = end + } else { + // Disallow wraparound. 
+ rw.Addr = ^Addr(0) + if err != nil { + err = syserror.EFAULT + } + } + return n, err +} + +// Writer implements io.Writer.Write. +func (rw *IOReadWriter) Write(src []byte) (int, error) { + n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts) + end, ok := rw.Addr.AddLength(uint64(n)) + if ok { + rw.Addr = end + } else { + // Disallow wraparound. + rw.Addr = ^Addr(0) + if err != nil { + err = syserror.EFAULT + } + } + return n, err +} + +// CopyObjectOut copies a fixed-size value or slice of fixed-size values from +// src to the memory mapped at addr in uio. It returns the number of bytes +// copied. +// +// CopyObjectOut must use reflection to encode src; performance-sensitive +// clients should do encoding manually and use uio.CopyOut directly. +// +// Preconditions: As for IO.CopyOut. +func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) { + w := &IOReadWriter{ + Ctx: ctx, + IO: uio, + Addr: addr, + Opts: opts, + } + return w.Write(binary.Marshal(nil, ByteOrder, src)) +} + +// CopyObjectIn copies a fixed-size value or slice of fixed-size values from +// the memory mapped at addr in uio to dst. It returns the number of bytes +// copied. +// +// CopyObjectIn must use reflection to decode dst; performance-sensitive +// clients should use uio.CopyIn directly and do decoding manually. +// +// Preconditions: As for IO.CopyIn. +func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) { + r := &IOReadWriter{ + Ctx: ctx, + IO: uio, + Addr: addr, + Opts: opts, + } + buf := make([]byte, binary.Size(dst)) + if _, err := io.ReadFull(r, buf); err != nil { + return 0, err + } + binary.Unmarshal(buf, ByteOrder, dst) + return int(r.Addr - addr), nil +} + +// copyStringIncrement is the maximum number of bytes that are copied from +// virtual memory at a time by CopyStringIn. 
const copyStringIncrement = 64

// CopyStringIn copies a NUL-terminated string of unknown length from the
// memory mapped at addr in uio and returns it as a string (not including the
// trailing NUL). If the length of the string, including the terminating NUL,
// would exceed maxlen, CopyStringIn returns the string truncated to maxlen and
// ENAMETOOLONG.
//
// Preconditions: As for IO.CopyFromUser. maxlen >= 0.
func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) {
	buf := make([]byte, maxlen)
	var done int
	for done < maxlen {
		start, ok := addr.AddLength(uint64(done))
		if !ok {
			// Last page of kernel memory. The application can't use this
			// anyway.
			return string(buf[:done]), syserror.EFAULT
		}
		// Read up to copyStringIncrement bytes at a time.
		readlen := copyStringIncrement
		if readlen > maxlen-done {
			readlen = maxlen - done
		}
		end, ok := start.AddLength(uint64(readlen))
		if !ok {
			return string(buf[:done]), syserror.EFAULT
		}
		// Shorten the read to avoid crossing page boundaries, since faulting
		// in a page unnecessarily is expensive. This also ensures that partial
		// copies up to the end of application-mappable memory succeed.
		if start.RoundDown() != end.RoundDown() {
			end = end.RoundDown()
		}
		n, err := uio.CopyIn(ctx, start, buf[done:done+int(end-start)], opts)
		// Look for the terminating zero byte, which may have occurred before
		// hitting err.
		for i, c := range buf[done : done+n] {
			if c == 0 {
				// Found the NUL; return the string without it.
				return string(buf[:done+i]), nil
			}
		}
		done += n
		if err != nil {
			return string(buf[:done]), err
		}
	}
	// Read maxlen bytes without finding a NUL terminator.
	return string(buf), syserror.ENAMETOOLONG
}

// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The
// maximum number of bytes copied is ars.NumBytes() or len(src), whichever is
// less. CopyOutVec returns the number of bytes copied; if this is less than
// the maximum, it returns a non-nil error explaining why.
+// +// Preconditions: As for IO.CopyOut. +func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) { + var done int + for !ars.IsEmpty() && done < len(src) { + ar := ars.Head() + cplen := len(src) - done + if Addr(cplen) >= ar.Length() { + cplen = int(ar.Length()) + } + n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts) + done += n + if err != nil { + return done, err + } + ars = ars.DropFirst(n) + } + return done, nil +} + +// CopyInVec copies bytes from the memory mapped at ars in uio to dst. The +// maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is +// less. CopyInVec returns the number of bytes copied; if this is less than the +// maximum, it returns a non-nil error explaining why. +// +// Preconditions: As for IO.CopyIn. +func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) { + var done int + for !ars.IsEmpty() && done < len(dst) { + ar := ars.Head() + cplen := len(dst) - done + if Addr(cplen) >= ar.Length() { + cplen = int(ar.Length()) + } + n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts) + done += n + if err != nil { + return done, err + } + ars = ars.DropFirst(n) + } + return done, nil +} + +// ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum +// number of bytes written is ars.NumBytes() or toZero, whichever is less. +// ZeroOutVec returns the number of bytes written; if this is less than the +// maximum, it returns a non-nil error explaining why. +// +// Preconditions: As for IO.ZeroOut. 
func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) {
	var done int64
	for !ars.IsEmpty() && done < toZero {
		ar := ars.Head()
		// Zero at most the head range's length, and at most what remains of toZero.
		cplen := toZero - done
		if Addr(cplen) >= ar.Length() {
			cplen = int64(ar.Length())
		}
		n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts)
		done += n
		if err != nil {
			return done, err
		}
		// Advance past the bytes actually zeroed (skips zero-length heads).
		ars = ars.DropFirst64(n)
	}
	return done, nil
}

// isASCIIWhitespace returns true if b is an ASCII whitespace byte as defined
// by Linux's ctype tables.
func isASCIIWhitespace(b byte) bool {
	// Compare Linux include/linux/ctype.h, lib/ctype.c.
	// 9  => horizontal tab '\t'
	// 10 => line feed '\n'
	// 11 => vertical tab '\v'
	// 12 => form feed '\f'
	// 13 => carriage return '\r'
	return b == ' ' || (b >= 9 && b <= 13)
}

// CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal
// strings from the memory mapped at ars in uio and converts them to int32
// values in dsts. It returns the number of bytes read.
//
// CopyInt32StringsInVec shares the following properties with Linux's
// kernel/sysctl.c:proc_dointvec(write=1):
//
// - If any read value overflows the range of int32, or any invalid characters
// are encountered during the read, CopyInt32StringsInVec returns EINVAL.
//
// - If, upon reaching the end of ars, fewer than len(dsts) values have been
// read, CopyInt32StringsInVec returns no error if at least 1 value was read
// and EINVAL otherwise.
//
// - Trailing whitespace after the last successfully read value is counted in
// the number of bytes read.
//
// Unlike proc_dointvec():
//
// - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to
// PageSize-1; callers that require this must do so explicitly.
//
// - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0.
//
// Preconditions: As for CopyInVec.
+func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) { + if len(dsts) == 0 { + return 0, nil + } + + buf := make([]byte, ars.NumBytes()) + n, cperr := CopyInVec(ctx, uio, ars, buf, opts) + buf = buf[:n] + + var i, j int + for ; j < len(dsts); j++ { + // Skip leading whitespace. + for i < len(buf) && isASCIIWhitespace(buf[i]) { + i++ + } + if i == len(buf) { + break + } + + // Find the end of the value to be parsed (next whitespace or end of string). + nextI := i + 1 + for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) { + nextI++ + } + + // Parse a single value. + val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) + if err != nil { + return int64(i), syserror.EINVAL + } + dsts[j] = int32(val) + + i = nextI + } + + // Skip trailing whitespace. + for i < len(buf) && isASCIIWhitespace(buf[i]) { + i++ + } + + if cperr != nil { + return int64(i), cperr + } + if j == 0 { + return int64(i), syserror.EINVAL + } + return int64(i), nil +} + +// CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at +// most one int32. +func CopyInt32StringInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) { + dsts := [1]int32{*dst} + n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts) + *dst = dsts[0] + return n, err +} + +// IOSequence holds arguments to IO methods. +type IOSequence struct { + IO IO + Addrs AddrRangeSeq + Opts IOOpts +} + +// NumBytes returns s.Addrs.NumBytes(). +// +// Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since +// s.Addrs may contain a non-zero number of zero-length AddrRanges. 
+// Many clients of +// IOSequence currently do something like: +// +// if ioseq.NumBytes() == 0 { +// return 0, nil +// } +// if f.availableBytes == 0 { +// return 0, syserror.ErrWouldBlock +// } +// return ioseq.CopyOutFrom(..., reader) +// +// In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong +// behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means +// that we will return success for zero-length I/O in cases where Linux would +// return EFAULT due to a failed access_ok() check, so in the long term we +// should move checks for ErrWouldBlock etc. into the body of +// reader.ReadToBlocks and use s.Addrs.IsEmpty() instead. +func (s IOSequence) NumBytes() int64 { + return s.Addrs.NumBytes() +} + +// DropFirst returns a copy of s with s.Addrs.DropFirst(n). +// +// Preconditions: As for AddrRangeSeq.DropFirst. +func (s IOSequence) DropFirst(n int) IOSequence { + return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts} +} + +// DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n). +// +// Preconditions: As for AddrRangeSeq.DropFirst64. +func (s IOSequence) DropFirst64(n int64) IOSequence { + return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts} +} + +// TakeFirst returns a copy of s with s.Addrs.TakeFirst(n). +// +// Preconditions: As for AddrRangeSeq.TakeFirst. +func (s IOSequence) TakeFirst(n int) IOSequence { + return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts} +} + +// TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n). +// +// Preconditions: As for AddrRangeSeq.TakeFirst64. +func (s IOSequence) TakeFirst64(n int64) IOSequence { + return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts} +} + +// CopyOut invokes CopyOutVec over s.Addrs. +// +// As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated +// to s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for CopyOutVec. 
+func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { + return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts) +} + +// CopyIn invokes CopyInVec over s.Addrs. +// +// As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to +// s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for CopyInVec. +func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { + return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts) +} + +// ZeroOut invokes ZeroOutVec over s.Addrs. +// +// As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated +// to s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for ZeroOutVec. +func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) { + return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts) +} + +// CopyOutFrom invokes s.CopyOutFrom over s.Addrs. +// +// Preconditions: As for IO.CopyOutFrom. +func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) { + return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts) +} + +// CopyInTo invokes s.CopyInTo over s.Addrs. +// +// Preconditions: As for IO.CopyInTo. +func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) { + return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts) +} + +// Reader returns an io.Reader that reads from s. Reads beyond the end of s +// return io.EOF. The preconditions that apply to s.CopyIn also apply to the +// returned io.Reader.Read. +func (s IOSequence) Reader(ctx context.Context) io.Reader { + return &ioSequenceReadWriter{ctx, s} +} + +// Writer returns an io.Writer that writes to s. Writes beyond the end of s +// return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also +// apply to the returned io.Writer.Write. 
+func (s IOSequence) Writer(ctx context.Context) io.Writer { + return &ioSequenceReadWriter{ctx, s} +} + +// ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when +// attempting to write beyond the end of the IOSequence. +var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence") + +type ioSequenceReadWriter struct { + ctx context.Context + s IOSequence +} + +// Read implements io.Reader.Read. +func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) { + n, err := rw.s.CopyIn(rw.ctx, dst) + rw.s = rw.s.DropFirst(n) + if err == nil && rw.s.NumBytes() == 0 { + err = io.EOF + } + return n, err +} + +// Write implements io.Writer.Write. +func (rw *ioSequenceReadWriter) Write(src []byte) (int, error) { + n, err := rw.s.CopyOut(rw.ctx, src) + rw.s = rw.s.DropFirst(n) + if err == nil && n < len(src) { + err = ErrEndOfIOSequence + } + return n, err +} diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go new file mode 100644 index 000000000..563560da8 --- /dev/null +++ b/pkg/sentry/usermem/usermem_test.go @@ -0,0 +1,411 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package usermem + +import ( + "bytes" + "encoding/binary" + "fmt" + "reflect" + "strings" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// newContext returns a context.Context that we can use in these tests (we +// can't use contexttest because it depends on usermem). +func newContext() context.Context { + return context.Background() +} + +func newBytesIOString(s string) *BytesIO { + return &BytesIO{[]byte(s)} +} + +func TestBytesIOCopyOutSuccess(t *testing.T) { + b := newBytesIOString("ABCDE") + n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("AfooE"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFailure(t *testing.T) { + b := newBytesIOString("ABC") + n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) + if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInSuccess(t *testing.T) { + b := newBytesIOString("AfooE") + var dst [3]byte + n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInFailure(t *testing.T) { + b := newBytesIOString("Afo") + var dst [3]byte + n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) + if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + 
t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +func TestBytesIOZeroOutSuccess(t *testing.T) { + b := newBytesIOString("ABCD") + n, err := b.ZeroOut(newContext(), 1, 2, IOOpts{}) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("A\x00\x00D"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOZeroOutFailure(t *testing.T) { + b := newBytesIOString("ABC") + n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{}) + if wantN, wantErr := int64(2), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFromSuccess(t *testing.T) { + b := newBytesIOString("ABCDEFGH") + n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 4, End: 7}, + {Start: 1, End: 4}, + }), safemem.FromIOReader{bytes.NewBufferString("barfoo")}, IOOpts{}) + if wantN := int64(6); n != wantN || err != nil { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("AfoobarH"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFromFailure(t *testing.T) { + b := newBytesIOString("ABCDE") + n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 1, End: 4}, + {Start: 4, End: 7}, + }), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{}) + if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } 
+ if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInToSuccess(t *testing.T) { + b := newBytesIOString("AfoobarH") + var dst bytes.Buffer + n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 4, End: 7}, + {Start: 1, End: 4}, + }), safemem.FromIOWriter{&dst}, IOOpts{}) + if wantN := int64(6); n != wantN || err != nil { + t.Errorf("CopyInTo: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("barfoo"); !bytes.Equal(got, want) { + t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInToFailure(t *testing.T) { + b := newBytesIOString("Afoob") + var dst bytes.Buffer + n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 1, End: 4}, + {Start: 4, End: 7}, + }), safemem.FromIOWriter{&dst}, IOOpts{}) + if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { + t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) + } +} + +type testStruct struct { + Int8 int8 + Uint8 uint8 + Int16 int16 + Uint16 uint16 + Int32 int32 + Uint32 uint32 + Int64 int64 + Uint64 uint64 +} + +func TestCopyObject(t *testing.T) { + wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8} + wantN := binary.Size(wantObj) + b := &BytesIO{make([]byte, wantN)} + ctx := newContext() + if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil { + t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + var gotObj testStruct + if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil { + t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if gotObj != wantObj { + t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, 
wantObj) + } +} + +func TestCopyStringInShort(t *testing.T) { + want := strings.Repeat("A", copyStringIncrement-2) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + +func TestCopyStringInLong(t *testing.T) { + want := strings.Repeat("A", copyStringIncrement+1) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + +func TestCopyStringInNoTerminatingZeroByte(t *testing.T) { + want := strings.Repeat("A", copyStringIncrement-1) + got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{}) + if wantErr := syserror.EFAULT; got != want || err != wantErr { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) + } +} + +func TestCopyStringInTruncatedByMaxlen(t *testing.T) { + got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{}) + if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) + } +} + +func TestCopyInt32StringsInVec(t *testing.T) { + for _, test := range []struct { + str string + n int + initial []int32 + final []int32 + }{ + { + str: "100 200", + n: len("100 200"), + initial: []int32{1, 2}, + final: []int32{100, 200}, + }, + { + // Fewer values ok + str: "100", + n: len("100"), + initial: []int32{1, 2}, + final: []int32{100, 2}, + }, + { + // Extra values ok + str: "100 200 300", + n: len("100 200 "), + initial: []int32{1, 2}, + final: []int32{100, 200}, + }, + { + // Leading and trailing whitespace ok + str: " 100\t200\n", + n: len(" 100\t200\n"), + initial: 
[]int32{1, 2}, + final: []int32{100, 200}, + }, + } { + t.Run(fmt.Sprintf("%q", test.str), func(t *testing.T) { + src := BytesIOSequence([]byte(test.str)) + dsts := append([]int32(nil), test.initial...) + if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); n != int64(test.n) || err != nil { + t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (%d, nil)", n, err, test.n) + } + if !reflect.DeepEqual(dsts, test.final) { + t.Errorf("dsts: got %v, wanted %v", dsts, test.final) + } + }) + } +} + +func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) { + for _, s := range []string{"", "\n", "a123"} { + t.Run(fmt.Sprintf("%q", s), func(t *testing.T) { + src := BytesIOSequence([]byte(s)) + initial := []int32{1, 2} + dsts := append([]int32(nil), initial...) + if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); err != syserror.EINVAL { + t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL) + } + if !reflect.DeepEqual(dsts, initial) { + t.Errorf("dsts: got %v, wanted %v", dsts, initial) + } + }) + } +} + +func TestIOSequenceCopyOut(t *testing.T) { + buf := []byte("ABCD") + s := BytesIOSequence(buf) + + // CopyOut limited by len(src). + n, err := s.CopyOut(newContext(), []byte("fo")) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foCD"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // CopyOut limited by s.NumBytes(). 
+ n, err = s.CopyOut(newContext(), []byte("obar")) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foob"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceCopyIn(t *testing.T) { + s := BytesIOSequence([]byte("foob")) + dst := []byte("ABCDEF") + + // CopyIn limited by len(dst). + n, err := s.CopyIn(newContext(), dst[:2]) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foCDEF"); !bytes.Equal(dst, want) { + t.Errorf("dst: got %q, wanted %q", dst, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // CopyIn limited by s.Remaining(). + n, err = s.CopyIn(newContext(), dst[2:]) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foobEF"); !bytes.Equal(dst, want) { + t.Errorf("dst: got %q, wanted %q", dst, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceZeroOut(t *testing.T) { + buf := []byte("ABCD") + s := BytesIOSequence(buf) + + // ZeroOut limited by toZero. + n, err := s.ZeroOut(newContext(), 2) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("\x00\x00CD"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // ZeroOut limited by s.NumBytes(). 
+ n, err = s.ZeroOut(newContext(), 4) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("\x00\x00\x00\x00"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceTakeFirst(t *testing.T) { + s := BytesIOSequence([]byte("foobar")) + if got, want := s.NumBytes(), int64(6); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + s = s.TakeFirst(3) + if got, want := s.NumBytes(), int64(3); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // TakeFirst(n) where n > s.NumBytes() is a no-op. + s = s.TakeFirst(9) + if got, want := s.NumBytes(), int64(3); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + var dst [3]byte + n, err := s.CopyIn(newContext(), dst[:]) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } + s = s.DropFirst(3) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} diff --git a/pkg/sentry/usermem/usermem_x86.go b/pkg/sentry/usermem/usermem_x86.go new file mode 100644 index 000000000..2484b0d82 --- /dev/null +++ b/pkg/sentry/usermem/usermem_x86.go @@ -0,0 +1,38 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 i386 + +package usermem + +import "encoding/binary" + +const ( + // PageSize is the system page size. + PageSize = 1 << PageShift + + // HugePageSize is the system huge page size. + HugePageSize = 1 << HugePageShift + + // PageShift is the binary log of the system page size. + PageShift = 12 + + // HugePageShift is the binary log of the system huge page size. + HugePageShift = 21 +) + +var ( + // ByteOrder is the native byte order (little endian). + ByteOrder = binary.LittleEndian +) diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD new file mode 100644 index 000000000..28fae4490 --- /dev/null +++ b/pkg/sentry/watchdog/BUILD @@ -0,0 +1,17 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "watchdog", + srcs = ["watchdog.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog", + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/metric", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/time", + ], +) diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go new file mode 100644 index 000000000..5b620693d --- /dev/null +++ b/pkg/sentry/watchdog/watchdog.go @@ -0,0 +1,279 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package watchdog is responsible for monitoring the sentry for tasks that may +// potentially be stuck or looping inderterminally causing hard to debug hungs in +// the untrusted app. +// +// It works by periodically querying all tasks to check whether they are in user +// mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks +// that have been running in kernel mode for a long time in the same syscall +// without blocking are considered stuck and are reported. +// +// When a stuck task is detected, the watchdog can take one of the following actions: +// 1. LogWarning: Logs a warning message followed by a stack dump of all goroutines. +// If a tasks continues to be stuck, the message will repeat every minute, unless +// a new stuck task is detected +// 2. Panic: same as above, followed by panic() +// +package watchdog + +import ( + "bytes" + "fmt" + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" +) + +// DefaultTimeout is a resonable timeout value for most applications. +const DefaultTimeout = 3 * time.Minute + +// descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period +// is discounted from task's last update time. It's set high enough that small scheduling delays won't +// trigger it. 
+const descheduleThreshold = 1 * time.Second + +var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected") + +// Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck. +var stackDumpSameTaskPeriod = time.Minute + +// Action defines what action to take when a stuck task is detected. +type Action int + +const ( + // LogWarning logs warning message followed by stack trace. + LogWarning Action = iota + // Panic will do the same logging as LogWarning and panic(). + Panic +) + +// String returns Action's string representation. +func (a Action) String() string { + switch a { + case LogWarning: + return "LogWarning" + case Panic: + return "Panic" + default: + panic(fmt.Sprintf("Invalid action: %d", a)) + } +} + +// Watchdog is the main watchdog class. It controls a goroutine that periodically +// analyses all tasks and reports if any of them appear to be stuck. +type Watchdog struct { + // period indicates how often to check all tasks. It's calculated based on + // 'taskTimeout'. + period time.Duration + + // taskTimeout is the amount of time to allow a task to execute the same syscall + // without blocking before it's declared stuck. + taskTimeout time.Duration + + // timeoutAction indicates what action to take when a stuck tasks is detected. + timeoutAction Action + + // k is where the tasks come from. + k *kernel.Kernel + + // stop is used to notify to watchdog should stop. + stop chan struct{} + + // done is used to notify when the watchdog has stopped. + done chan struct{} + + // offenders map contains all tasks that are currently stuck. + offenders map[*kernel.Task]*offender + + // lastStackDump tracks the last time a stack dump was generated to prevent + // spamming the log. + lastStackDump time.Time + + // lastRun is set to the last time the watchdog executed a monitoring loop. + lastRun ktime.Time + + // mu protects the fields below. 
+ mu sync.Mutex + + // started is true if the watchdog has been started before. + started bool +} + +type offender struct { + lastUpdateTime ktime.Time +} + +// New creates a new watchdog. +func New(k *kernel.Kernel, taskTimeout time.Duration, a Action) *Watchdog { + // 4 is arbitrary, just don't want to prolong 'taskTimeout' too much. + period := taskTimeout / 4 + return &Watchdog{ + k: k, + period: period, + taskTimeout: taskTimeout, + timeoutAction: a, + offenders: make(map[*kernel.Task]*offender), + stop: make(chan struct{}), + done: make(chan struct{}), + } +} + +// Start starts the watchdog. +func (w *Watchdog) Start() { + if w.taskTimeout == 0 { + log.Infof("Watchdog disabled") + return + } + + w.mu.Lock() + defer w.mu.Unlock() + if w.started { + return + } + + w.lastRun = w.k.MonotonicClock().Now() + + log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.taskTimeout, w.timeoutAction) + go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore. + w.started = true +} + +// Stop requests the watchdog to stop and wait for it. +func (w *Watchdog) Stop() { + if w.taskTimeout == 0 { + return + } + + w.mu.Lock() + defer w.mu.Unlock() + if !w.started { + return + } + log.Infof("Stopping watchdog") + w.stop <- struct{}{} + <-w.done + w.started = false + log.Infof("Watchdog stopped") +} + +// loop is the main watchdog routine. It only returns when 'Stop()' is called. +func (w *Watchdog) loop() { + // Loop until someone stops it. + for { + select { + case <-w.stop: + w.done <- struct{}{} + return + case <-time.After(w.period): + w.runTurn() + } + } +} + +// runTurn runs a single pass over all tasks and reports anything it finds. 
+func (w *Watchdog) runTurn() { + tasks := w.k.TaskSet().Root.Tasks() + + newOffenders := make(map[*kernel.Task]*offender) + newTaskFound := false + now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick))) + + // The process may be running with low CPU limit making tasks appear stuck because + // are starved of CPU cycles. An estimate is that Tasks could have been starved + // since the last time the watchdog run. If the watchdog detects that scheduling + // is off, it will discount the entire duration since last run from 'lastUpdateTime'. + discount := time.Duration(0) + if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold { + discount = now.Sub(w.lastRun) + } + w.lastRun = now + + log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount) + for _, t := range tasks { + tsched := t.TaskGoroutineSchedInfo() + + // An offender is a task running inside the kernel for longer than the specified timeout. + if tsched.State == kernel.TaskGoroutineRunningSys { + lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick))) + elapsed := now.Sub(lastUpdateTime) - discount + if elapsed > w.taskTimeout { + tc, ok := w.offenders[t] + if !ok { + // New stuck task detected. + // + // TODO: Tasks blocked doing IO may be considered stuck in kernel. + tc = &offender{lastUpdateTime: lastUpdateTime} + stuckTasks.Increment() + newTaskFound = true + } + newOffenders[t] = tc + } + } + } + if len(newOffenders) > 0 { + w.report(newOffenders, newTaskFound, now) + } + + // Remember which tasks have been reported. + w.offenders = newOffenders +} + +// report takes appropriate action when a stuck task is detected. 
+func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) { + var buf bytes.Buffer + buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders))) + for t, o := range offenders { + tid := w.k.TaskSet().Root.IDOfTask(t) + buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime))) + } + buf.WriteString("Search for '(*Task).run(0x..., 0x)' in the stack dump to find the offending goroutine") + + switch w.timeoutAction { + case LogWarning: + // Dump stack only if a new task is detected or if it sometime has passed since + // the last time a stack dump was generated. + if !newTaskFound && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod { + buf.WriteString("\n...[stack dump skipped]...") + log.Warningf(buf.String()) + } else { + log.TracebackAll(buf.String()) + w.lastStackDump = time.Now() + } + + case Panic: + // Panic will skip over running tasks, which is likely the culprit here. So manually + // dump all stacks before panic'ing. + log.TracebackAll(buf.String()) + + // Attempt to flush metrics, timeout and move on in case metrics are stuck as well. + metricsEmitted := make(chan struct{}, 1) + go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore. + // Flush metrics before killing process. + metric.EmitMetricUpdate() + metricsEmitted <- struct{}{} + }() + select { + case <-metricsEmitted: + case <-time.After(1 * time.Second): + } + panic("Sentry detected stuck task(s). 
See stack trace and message above for more details") + } +} -- cgit v1.2.3 From b701ee221434572881b9b3b0164d5a5b54714fa9 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 1 May 2018 08:06:11 -0700 Subject: Fix SO_RCVTIMEOUT for recvmsg PiperOrigin-RevId: 194938091 Change-Id: Id17f26df13a915ec0c388aad3198207ea1c28d53 --- pkg/sentry/syscalls/linux/sys_socket.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 3797c0a5d..70c618398 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -610,7 +610,14 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca flags |= linux.MSG_DONTWAIT } - n, err := recvSingleMsg(t, s, msgPtr, flags, false, ktime.Time{}) + var haveDeadline bool + var deadline ktime.Time + if dl := s.RecvTimeout(); dl != 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } + + n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline) return n, nil, err } -- cgit v1.2.3 From 3d3deef573a54e031cb98038b9f617f5fac31044 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 1 May 2018 22:11:07 -0700 Subject: Implement SO_TIMESTAMP PiperOrigin-RevId: 195047018 Change-Id: I6d99528a00a2125f414e1e51e067205289ec9d3d --- pkg/dhcp/client.go | 4 +- pkg/dhcp/dhcp_test.go | 2 +- pkg/dhcp/server.go | 2 +- pkg/sentry/fs/host/socket_test.go | 2 +- pkg/sentry/kernel/kernel.go | 9 +++ pkg/sentry/socket/BUILD | 1 + pkg/sentry/socket/control/control.go | 35 +++++++++++ pkg/sentry/socket/epsocket/epsocket.go | 69 +++++++++++++++------- pkg/sentry/socket/hostinet/socket.go | 10 ++-- pkg/sentry/socket/netlink/socket.go | 16 ++--- pkg/sentry/socket/rpcinet/socket.go | 20 +++---- pkg/sentry/socket/socket.go | 12 +++- pkg/sentry/socket/unix/unix.go | 14 ++--- pkg/sentry/strace/socket.go | 29 
++++++++- pkg/sentry/syscalls/linux/sys_socket.go | 21 ++++--- pkg/tcpip/adapters/gonet/gonet.go | 4 +- pkg/tcpip/adapters/gonet/gonet_test.go | 2 +- pkg/tcpip/network/arp/arp_test.go | 2 +- pkg/tcpip/network/ipv4/icmp_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 4 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 4 +- pkg/tcpip/stack/stack.go | 17 +++++- pkg/tcpip/stack/stack_test.go | 22 +++---- pkg/tcpip/stack/transport_test.go | 16 ++--- pkg/tcpip/tcpip.go | 48 +++++++++++++-- pkg/tcpip/transport/tcp/endpoint.go | 22 +++---- pkg/tcpip/transport/tcp/tcp_test.go | 46 +++++++-------- pkg/tcpip/transport/tcp/tcp_timestamp_test.go | 4 +- pkg/tcpip/transport/tcp/testing/context/context.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 37 ++++++++++-- pkg/tcpip/transport/udp/udp_test.go | 10 ++-- runsc/boot/loader.go | 7 ++- 32 files changed, 345 insertions(+), 150 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 9a4fd7ae4..37deb69ff 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -162,7 +162,7 @@ func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) error // DHCPOFFER for { var addr tcpip.FullAddress - v, err := epin.Read(&addr) + v, _, err := epin.Read(&addr) if err == tcpip.ErrWouldBlock { select { case <-ch: @@ -216,7 +216,7 @@ func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) error // DHCPACK for { var addr tcpip.FullAddress - v, err := epin.Read(&addr) + v, _, err := epin.Read(&addr) if err == tcpip.ErrWouldBlock { select { case <-ch: diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index d56b93997..ed884fcb6 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -36,7 +36,7 @@ func TestDHCP(t *testing.T) { } }() - s := stack.New([]string{ipv4.ProtocolName}, []string{udp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{udp.ProtocolName}) const nicid tcpip.NICID = 1 if err := s.CreateNIC(nicid, 
id); err != nil { diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index d132d90b4..8816203a8 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -104,7 +104,7 @@ func (s *Server) reader(ctx context.Context) { for { var addr tcpip.FullAddress - v, err := s.ep.Read(&addr) + v, _, err := s.ep.Read(&addr) if err == tcpip.ErrWouldBlock { select { case <-ch: diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 80c46dcfa..9b73c5173 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -142,7 +142,7 @@ func TestSocketSendMsgLen0(t *testing.T) { defer sfile.DecRef() s := sfile.FileOperations.(socket.Socket) - n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, unix.ControlMessages{}) + n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, socket.ControlMessages{}) if n != 0 { t.Fatalf("socket sendmsg() failed: %v wrote: %d", terr, n) } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 0932965e0..25c8dd885 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -887,6 +887,15 @@ func (k *Kernel) SetExitError(err error) { } } +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (k *Kernel) NowNanoseconds() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Realtime) + if err != nil { + panic("Kernel.NowNanoseconds: " + err.Error()) + } + return now +} + // SupervisorContext returns a Context with maximum privileges in k. It should // only be used by goroutines outside the control of the emulated kernel // defined by e. 
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 87e32df37..5500a676e 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/syserr", + "//pkg/tcpip", "//pkg/tcpip/transport/unix", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index cb34cbc85..17ecdd11c 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -208,6 +208,31 @@ func putCmsg(buf []byte, msgType uint32, align uint, data []int32) []byte { return alignSlice(buf, align) } +func putCmsgStruct(buf []byte, msgType uint32, align uint, data interface{}) []byte { + if cap(buf)-len(buf) < linux.SizeOfControlMessageHeader { + return buf + } + ob := buf + + buf = putUint64(buf, uint64(linux.SizeOfControlMessageHeader)) + buf = putUint32(buf, linux.SOL_SOCKET) + buf = putUint32(buf, msgType) + + hdrBuf := buf + + buf = binary.Marshal(buf, usermem.ByteOrder, data) + + // Check if we went over. + if cap(buf) != cap(ob) { + return hdrBuf + } + + // Fix up length. + putUint64(ob, uint64(len(buf)-len(ob))) + + return alignSlice(buf, align) +} + // Credentials implements SCMCredentials.Credentials. func (c *scmCredentials) Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { // "When a process's user and group IDs are passed over a UNIX domain @@ -261,6 +286,16 @@ func alignSlice(buf []byte, align uint) []byte { return buf[:aligned] } +// PackTimestamp packs a SO_TIMESTAMP socket control message. +func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { + return putCmsgStruct( + buf, + linux.SO_TIMESTAMP, + t.Arch().Width(), + linux.NsecToTimeval(timestamp), + ) +} + // Parse parses a raw socket control message into portable objects. 
func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.ControlMessages, error) { var ( diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 3fc3ea58f..5701ecfac 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -109,6 +109,7 @@ type SocketOperations struct { // readMu protects access to readView, control, and sender. readMu sync.Mutex `state:"nosave"` readView buffer.View + readCM tcpip.ControlMessages sender tcpip.FullAddress } @@ -210,12 +211,13 @@ func (s *SocketOperations) fetchReadView() *syserr.Error { s.readView = nil s.sender = tcpip.FullAddress{} - v, err := s.Endpoint.Read(&s.sender) + v, cms, err := s.Endpoint.Read(&s.sender) if err != nil { return syserr.TranslateNetstackError(err) } s.readView = v + s.readCM = cms return nil } @@ -230,7 +232,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS if dst.NumBytes() == 0 { return 0, nil } - n, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + n, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { return int64(n), syserror.ErrWouldBlock } @@ -552,6 +554,18 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, } return linux.NsecToTimeval(s.RecvTimeout()), nil + + case linux.SO_TIMESTAMP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TimestampOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil } case syscall.SOL_TCP: @@ -659,6 +673,14 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) s.SetRecvTimeout(v.ToNsecCapped()) return nil + + case linux.SO_TIMESTAMP: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := 
usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TimestampOption(v))) } case syscall.SOL_TCP: @@ -823,7 +845,9 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq } // nonBlockingRead issues a non-blocking read. -func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, interface{}, uint32, *syserr.Error) { +// +// TODO: Support timestamps for stream sockets. +func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() // Fast path for regular reads from stream (e.g., TCP) endpoints. Note @@ -839,14 +863,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe s.readMu.Lock() n, err := s.coalescingRead(ctx, dst, trunc) s.readMu.Unlock() - return n, nil, 0, err + return n, nil, 0, socket.ControlMessages{}, err } s.readMu.Lock() defer s.readMu.Unlock() if err := s.fetchReadView(); err != nil { - return 0, nil, 0, err + return 0, nil, 0, socket.ControlMessages{}, err } if !isPacket && peek && trunc { @@ -854,14 +878,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // amount that could be read. 
var rql tcpip.ReceiveQueueSizeOption if err := s.Endpoint.GetSockOpt(&rql); err != nil { - return 0, nil, 0, syserr.TranslateNetstackError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) } available := len(s.readView) + int(rql) bufLen := int(dst.NumBytes()) if available < bufLen { - return available, nil, 0, nil + return available, nil, 0, socket.ControlMessages{}, nil } - return bufLen, nil, 0, nil + return bufLen, nil, 0, socket.ControlMessages{}, nil } n, err := dst.CopyOut(ctx, s.readView) @@ -874,17 +898,18 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe if peek { if l := len(s.readView); trunc && l > n { // isPacket must be true. - return l, addr, addrLen, syserr.FromError(err) + return l, addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } if isPacket || err != nil { - return int(n), addr, addrLen, syserr.FromError(err) + return int(n), addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } // We need to peek beyond the first message. dst = dst.DropFirst(n) num, err := dst.CopyOutFrom(ctx, safemem.FromVecReaderFunc{func(dsts [][]byte) (int64, error) { - n, err := s.Endpoint.Peek(dsts) + n, _, err := s.Endpoint.Peek(dsts) + // TODO: Handle peek timestamp. if err != nil { return int64(n), syserr.TranslateNetstackError(err).ToError() } @@ -895,7 +920,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // We got some data, so no need to return an error. 
err = nil } - return int(n), nil, 0, syserr.FromError(err) + return int(n), nil, 0, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } var msgLen int @@ -908,15 +933,15 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe } if trunc { - return msgLen, addr, addrLen, syserr.FromError(err) + return msgLen, addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } - return int(n), addr, addrLen, syserr.FromError(err) + return int(n), addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) { +func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 @@ -924,7 +949,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // Stream sockets ignore the sender address. 
senderRequested = false } - n, senderAddr, senderAddrLen, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + n, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return } @@ -936,25 +961,25 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags defer s.EventUnregister(&e) for { - n, senderAddr, senderAddrLen, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + n, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) if err != syserr.ErrWouldBlock { return } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } // SendMsg implements the linux syscall sendmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { - // Reject control messages. - if !controlMessages.Empty() { +func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { + // Reject Unix control messages. 
+ if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument } diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index defa3db2c..02fad1c60 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -57,6 +57,8 @@ type socketOperations struct { queue waiter.Queue } +var _ = socket.Socket(&socketOperations{}) + func newSocketFile(ctx context.Context, fd int, nonblock bool) (*fs.File, *syserr.Error) { s := &socketOperations{fd: fd} if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { @@ -339,14 +341,14 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ } // RecvMsg implements socket.Socket.RecvMsg. -func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // // FIXME: We can't support MSG_ERRQUEUE because it uses ancillary // messages that netstack/tcpip/transport/unix doesn't understand. Kill the // Socket interface's dependence on netstack. 
if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrInvalidArgument + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument } var senderAddr []byte @@ -411,11 +413,11 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } } - return int(n), senderAddr, uint32(len(senderAddr)), unix.ControlMessages{}, syserr.FromError(err) + return int(n), senderAddr, uint32(len(senderAddr)), socket.ControlMessages{}, syserr.FromError(err) } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 2d0e59ceb..0b8f528d0 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -305,7 +305,7 @@ func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error } // RecvMsg implements socket.Socket.RecvMsg. 
-func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { +func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { from := linux.SockAddrNetlink{ Family: linux.AF_NETLINK, PortID: 0, @@ -323,7 +323,7 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, unix.ControlMessages{}, syserr.FromError(err) + return int(n), from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } // We'll have to block. Register for notification and keep trying to @@ -337,14 +337,14 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, unix.ControlMessages{}, syserr.FromError(err) + return int(n), from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } @@ -459,7 +459,7 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error } // sendMsg is the core of message send, used for SendMsg and Write. 
-func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { dstPort := int32(0) if len(to) != 0 { @@ -506,12 +506,12 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, } // SendMsg implements socket.Socket.SendMsg. -func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { return s.sendMsg(t, src, to, flags, controlMessages) } // Write implements fs.FileOperations.Write. func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { - n, err := s.sendMsg(ctx, src, nil, 0, unix.ControlMessages{}) + n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 574d99ba5..15047df01 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -402,7 +402,7 @@ func rpcRecvMsg(t *kernel.Task, req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResp } // RecvMsg implements socket.Socket.RecvMsg. 
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ Fd: s.fd, Length: uint32(dst.NumBytes()), @@ -414,10 +414,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags res, err := rpcRecvMsg(t, req) if err == nil { n, e := dst.CopyOut(t, res.Data) - return int(n), res.Address.GetAddress(), res.Address.GetLength(), unix.ControlMessages{}, syserr.FromError(e) + return int(n), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { - return 0, nil, 0, unix.ControlMessages{}, err + return 0, nil, 0, socket.ControlMessages{}, err } // We'll have to block. 
Register for notifications and keep trying to @@ -430,17 +430,17 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags res, err := rpcRecvMsg(t, req) if err == nil { n, e := dst.CopyOut(t, res.Data) - return int(n), res.Address.GetAddress(), res.Address.GetLength(), unix.ControlMessages{}, syserr.FromError(e) + return int(n), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } if err != syserr.ErrWouldBlock { - return 0, nil, 0, unix.ControlMessages{}, err + return 0, nil, 0, socket.ControlMessages{}, err } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } @@ -459,14 +459,14 @@ func rpcSendMsg(t *kernel.Task, req *pb.SyscallRequest_Sendmsg) (uint32, *syserr } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument } - // Reject control messages. - if !controlMessages.Empty() { + // Reject Unix control messages. 
+ if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index be3026bfa..bd4858a34 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -31,9 +31,17 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) +// ControlMessages represents the union of unix control messages and tcpip +// control messages. +type ControlMessages struct { + Unix unix.ControlMessages + IP tcpip.ControlMessages +} + // Socket is the interface containing socket syscalls used by the syscall layer // to redirect them to the appropriate implementation. type Socket interface { @@ -78,11 +86,11 @@ type Socket interface { // // senderAddrLen is the address length to be returned to the application, // not necessarily the actual length of the address. - RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) + RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error) // SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take // ownership of the ControlMessage on error. 
- SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (n int, err *syserr.Error) + SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages ControlMessages) (n int, err *syserr.Error) // SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means // no timeout. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index a4b414851..f83156c8e 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -358,10 +358,10 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by // a unix.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { +func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Endpoint: s.ep, - Control: controlMessages, + Control: controlMessages.Unix, To: nil, } if len(to) > 0 { @@ -452,7 +452,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // a unix.Endpoint. 
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages unix.ControlMessages, err *syserr.Error) { +func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 @@ -490,7 +490,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, r.Control, syserr.FromError(err) + return int(n), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } // We'll have to block. Register for notification and keep trying to @@ -509,14 +509,14 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, r.Control, syserr.FromError(err) + return int(n), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, unix.ControlMessages{}, syserr.ErrTryAgain + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, unix.ControlMessages{}, syserr.FromError(err) + return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 48c072e96..1a2e8573e 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -440,6 +440,7 @@ var SocketProtocol = map[int32]abi.ValueSet{ var controlMessageType = map[int32]string{ linux.SCM_RIGHTS: "SCM_RIGHTS", 
linux.SCM_CREDENTIALS: "SCM_CREDENTIALS", + linux.SO_TIMESTAMP: "SO_TIMESTAMP", } func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) string { @@ -477,7 +478,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) typ = fmt.Sprint(h.Type) } - if h.Length > uint64(len(buf)-i) { + if h.Length > uint64(len(buf)-i+linux.SizeOfControlMessageHeader) { strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, content extends beyond buffer}", level, @@ -546,6 +547,32 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) i += control.AlignUp(length, width) + case linux.SO_TIMESTAMP: + if length < linux.SizeOfTimeval { + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, content too short}", + level, + typ, + h.Length, + )) + i += control.AlignUp(length, width) + break + } + + var tv linux.Timeval + binary.Unmarshal(buf[i:i+linux.SizeOfTimeval], usermem.ByteOrder, &tv) + + strs = append(strs, fmt.Sprintf( + "{level=%s, type=%s, length=%d, Sec: %d, Usec: %d}", + level, + typ, + h.Length, + tv.Sec, + tv.Usec, + )) + + i += control.AlignUp(length, width) + default: panic("unreachable") } diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 70c618398..6258a1539 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -731,10 +731,11 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // Fast path when no control message nor name buffers are provided. 
if msg.ControlLen == 0 && msg.NameLen == 0 { - n, _, _, _, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) + n, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) } + cms.Unix.Release() return uintptr(n), nil } @@ -745,17 +746,21 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Release() + defer cms.Unix.Release() controlData := make([]byte, 0, msg.ControlLen) if cr, ok := s.(unix.Credentialer); ok && cr.Passcred() { - creds, _ := cms.Credentials.(control.SCMCredentials) + creds, _ := cms.Unix.Credentials.(control.SCMCredentials) controlData = control.PackCredentials(t, creds, controlData) } - if cms.Rights != nil { - controlData = control.PackRights(t, cms.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData) + if cms.IP.HasTimestamp { + controlData = control.PackTimestamp(t, cms.IP.Timestamp, controlData) + } + + if cms.Unix.Rights != nil { + controlData = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData) } // Copy the address to the caller. @@ -823,7 +828,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } n, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Unix.Release() if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -997,7 +1002,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme } // Call the syscall implementation. 
- n, e := s.SendMsg(t, src, to, int(flags), controlMessages) + n, e := s.SendMsg(t, src, to, int(flags), socket.ControlMessages{Unix: controlMessages}) err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { controlMessages.Release() @@ -1048,7 +1053,7 @@ func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, fla } // Call the syscall implementation. - n, e := s.SendMsg(t, src, to, int(flags), control.New(t, s, nil)) + n, e := s.SendMsg(t, src, to, int(flags), socket.ControlMessages{Unix: control.New(t, s, nil)}) return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) } diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index 96a2d670d..5aa6b1aa2 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -268,7 +268,7 @@ type opErrorer interface { // commonRead implements the common logic between net.Conn.Read and // net.PacketConn.ReadFrom. func commonRead(ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, addr *tcpip.FullAddress, errorer opErrorer) ([]byte, error) { - read, err := ep.Read(addr) + read, _, err := ep.Read(addr) if err == tcpip.ErrWouldBlock { // Create wait queue entry that notifies a channel. @@ -276,7 +276,7 @@ func commonRead(ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, a wq.EventRegister(&waitEntry, waiter.EventIn) defer wq.EventUnregister(&waitEntry) for { - read, err = ep.Read(addr) + read, _, err = ep.Read(addr) if err != tcpip.ErrWouldBlock { break } diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index 2f86469eb..e3d0c6c84 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -47,7 +47,7 @@ func TestTimeouts(t *testing.T) { func newLoopbackStack() (*stack.Stack, *tcpip.Error) { // Create the stack and add a NIC. 
- s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName, udp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName, udp.ProtocolName}) if err := s.CreateNIC(NICID, loopback.New()); err != nil { return nil, err diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 91ffdce4b..47b10e64e 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -32,7 +32,7 @@ type testContext struct { } func newTestContext(t *testing.T) *testContext { - s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, []string{ipv4.PingProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, arp.ProtocolName}, []string{ipv4.PingProtocolName}) const defaultMTU = 65536 id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) diff --git a/pkg/tcpip/network/ipv4/icmp_test.go b/pkg/tcpip/network/ipv4/icmp_test.go index 378fba74b..c55aa1835 100644 --- a/pkg/tcpip/network/ipv4/icmp_test.go +++ b/pkg/tcpip/network/ipv4/icmp_test.go @@ -26,7 +26,7 @@ type testContext struct { } func newTestContext(t *testing.T) *testContext { - s := stack.New([]string{ipv4.ProtocolName}, []string{ipv4.PingProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{ipv4.PingProtocolName}) const defaultMTU = 65536 id, linkEP := channel.New(256, defaultMTU, "") diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 332929c85..ef5c7ec60 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -113,7 +113,7 @@ func main() { // Create the stack with ipv4 and tcp protocols, then add a tun-based // NIC and ipv4 address. 
- s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) mtu, err := rawfile.GetMTU(tunName) if err != nil { @@ -183,7 +183,7 @@ func main() { // connection from its side. wq.EventRegister(&waitEntry, waiter.EventIn) for { - v, err := ep.Read(nil) + v, _, err := ep.Read(nil) if err != nil { if err == tcpip.ErrClosedForReceive { break diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index 10cd701af..8c166f643 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -42,7 +42,7 @@ func echo(wq *waiter.Queue, ep tcpip.Endpoint) { defer wq.EventUnregister(&waitEntry) for { - v, err := ep.Read(nil) + v, _, err := ep.Read(nil) if err != nil { if err == tcpip.ErrWouldBlock { <-notifyCh @@ -99,7 +99,7 @@ func main() { // Create the stack with ip and tcp protocols, then add a tun-based // NIC and address. - s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}, []string{tcp.ProtocolName}) mtu, err := rawfile.GetMTU(tunName) if err != nil { diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 558ecdb72..b480bf812 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -270,6 +270,9 @@ type Stack struct { // If not nil, then any new endpoints will have this probe function // invoked everytime they receive a TCP segment. tcpProbeFunc TCPProbeFunc + + // clock is used to generate user-visible times. + clock tcpip.Clock } // New allocates a new networking stack with only the requested networking and @@ -279,7 +282,7 @@ type Stack struct { // SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the // stack. 
Please refer to individual protocol implementations as to what options // are supported. -func New(network []string, transport []string) *Stack { +func New(clock tcpip.Clock, network []string, transport []string) *Stack { s := &Stack{ transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), @@ -287,6 +290,7 @@ func New(network []string, transport []string) *Stack { nics: make(map[tcpip.NICID]*NIC), linkAddrCache: newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts), PortManager: ports.NewPortManager(), + clock: clock, } // Add specified network protocols. @@ -388,6 +392,11 @@ func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h f } } +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (s *Stack) NowNanoseconds() int64 { + return s.clock.NowNanoseconds() +} + // Stats returns a snapshot of the current stats. // // NOTE: The underlying stats are updated using atomic instructions as a result @@ -474,6 +483,12 @@ func (s *Stack) CreateDisabledNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) * return s.createNIC(id, "", linkEP, false) } +// CreateDisabledNamedNIC is a combination of CreateNamedNIC and +// CreateDisabledNIC. +func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { + return s.createNIC(id, name, linkEP, false) +} + // EnableNIC enables the given NIC so that the link-layer endpoint can start // delivering packets to it. func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error { diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index b416065d7..ea7dccdc2 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -176,7 +176,7 @@ func TestNetworkReceive(t *testing.T) { // Create a stack with the fake network protocol, one nic, and two // addresses attached to it: 1 & 2. 
id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -270,7 +270,7 @@ func TestNetworkSend(t *testing.T) { // address: 1. The route table sends all packets through the only // existing nic. id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("NewNIC failed: %v", err) } @@ -292,7 +292,7 @@ func TestNetworkSendMultiRoute(t *testing.T) { // Create a stack with the fake network protocol, two nics, and two // addresses per nic, the first nic has odd address, the second one has // even addresses. - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id1, linkEP1 := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id1); err != nil { @@ -371,7 +371,7 @@ func TestRoutes(t *testing.T) { // Create a stack with the fake network protocol, two nics, and two // addresses per nic, the first nic has odd address, the second one has // even addresses. 
- s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id1, _ := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id1); err != nil { @@ -435,7 +435,7 @@ func TestRoutes(t *testing.T) { } func TestAddressRemoval(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -479,7 +479,7 @@ func TestAddressRemoval(t *testing.T) { } func TestDelayedRemovalDueToRoute(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -547,7 +547,7 @@ func TestDelayedRemovalDueToRoute(t *testing.T) { } func TestPromiscuousMode(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -607,7 +607,7 @@ func TestAddressSpoofing(t *testing.T) { srcAddr := tcpip.Address("\x01") dstAddr := tcpip.Address("\x02") - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, _ := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -648,7 +648,7 @@ func TestAddressSpoofing(t *testing.T) { // Set the subnet, then check that packet is delivered. func TestSubnetAcceptsMatchingPacket(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -682,7 +682,7 @@ func TestSubnetAcceptsMatchingPacket(t *testing.T) { // Set destination outside the subnet, then check it doesn't get delivered. 
func TestSubnetRejectsNonmatchingPacket(t *testing.T) { - s := stack.New([]string{"fakeNet"}, nil) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, nil) id, linkEP := channel.New(10, defaultMTU, "") if err := s.CreateNIC(1, id); err != nil { @@ -714,7 +714,7 @@ func TestSubnetRejectsNonmatchingPacket(t *testing.T) { } func TestNetworkOptions(t *testing.T) { - s := stack.New([]string{"fakeNet"}, []string{}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{}) // Try an unsupported network protocol. if err := s.SetNetworkProtocolOption(tcpip.NetworkProtocolNumber(99999), fakeNetGoodOption(false)); err != tcpip.ErrUnknownProtocol { diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 7e072e96e..b870ab375 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -46,8 +46,8 @@ func (*fakeTransportEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask return mask } -func (*fakeTransportEndpoint) Read(*tcpip.FullAddress) (buffer.View, *tcpip.Error) { - return buffer.View{}, nil +func (*fakeTransportEndpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + return buffer.View{}, tcpip.ControlMessages{}, nil } func (f *fakeTransportEndpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tcpip.Error) { @@ -67,8 +67,8 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) return uintptr(len(v)), nil } -func (f *fakeTransportEndpoint) Peek([][]byte) (uintptr, *tcpip.Error) { - return 0, nil +func (f *fakeTransportEndpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil } // SetSockOpt sets a socket option. Currently not supported. 
@@ -210,7 +210,7 @@ func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error { func TestTransportReceive(t *testing.T) { id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -270,7 +270,7 @@ func TestTransportReceive(t *testing.T) { func TestTransportControlReceive(t *testing.T) { id, linkEP := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -336,7 +336,7 @@ func TestTransportControlReceive(t *testing.T) { func TestTransportSend(t *testing.T) { id, _ := channel.New(10, defaultMTU, "") - s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) if err := s.CreateNIC(1, id); err != nil { t.Fatalf("CreateNIC failed: %v", err) } @@ -373,7 +373,7 @@ func TestTransportSend(t *testing.T) { } func TestTransportOptions(t *testing.T) { - s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}) + s := stack.New(&tcpip.StdClock{}, []string{"fakeNet"}, []string{"fakeTrans"}) // Try an unsupported transport protocol. 
if err := s.SetTransportProtocolOption(tcpip.TransportProtocolNumber(99999), fakeTransportGoodOption(false)); err != tcpip.ErrUnknownProtocol { diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index f3a94f353..f9df1d989 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -23,6 +23,7 @@ import ( "fmt" "strconv" "strings" + "time" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -80,6 +81,24 @@ var ( errSubnetAddressMasked = errors.New("subnet address has bits set outside the mask") ) +// A Clock provides the current time. +// +// Times returned by a Clock should always be used for application-visible +// time, but never for netstack internal timekeeping. +type Clock interface { + // NowNanoseconds returns the current real time as a number of + // nanoseconds since some epoch. + NowNanoseconds() int64 +} + +// StdClock implements Clock with the time package. +type StdClock struct{} + +// NowNanoseconds implements Clock.NowNanoseconds. +func (*StdClock) NowNanoseconds() int64 { + return time.Now().UnixNano() +} + // Address is a byte slice cast as a string that represents the address of a // network node. Or, in the case of unix endpoints, it may represent a path. type Address string @@ -210,6 +229,16 @@ func (s SlicePayload) Size() int { return len(s) } +// A ControlMessages contains socket control messages for IP sockets. +type ControlMessages struct { + // HasTimestamp indicates whether Timestamp is valid/set. + HasTimestamp bool + + // Timestamp is the time (in ns) that the last packed used to create + // the read data was received. + Timestamp int64 +} + // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp) // that exposes functionality like read, write, connect, etc. to users of the // networking stack. @@ -219,9 +248,13 @@ type Endpoint interface { Close() // Read reads data from the endpoint and optionally returns the sender. 
- // This method does not block if there is no data pending. - // It will also either return an error or data, never both. - Read(*FullAddress) (buffer.View, *Error) + // + // This method does not block if there is no data pending. It will also + // either return an error or data, never both. + // + // A timestamp (in ns) is optionally returned. A zero value indicates + // that no timestamp was available. + Read(*FullAddress) (buffer.View, ControlMessages, *Error) // Write writes data to the endpoint's peer. This method does not block if // the data cannot be written. @@ -238,7 +271,10 @@ type Endpoint interface { // Peek reads data without consuming it from the endpoint. // // This method does not block if there is no data pending. - Peek([][]byte) (uintptr, *Error) + // + // A timestamp (in ns) is optionally returned. A zero value indicates + // that no timestamp was available. + Peek([][]byte) (uintptr, ControlMessages, *Error) // Connect connects the endpoint to its peer. Specifying a NIC is // optional. @@ -347,6 +383,10 @@ type ReuseAddressOption int // Only supported on Unix sockets. type PasscredOption int +// TimestampOption is used by SetSockOpt/GetSockOpt to specify whether +// SO_TIMESTAMP socket control messages are enabled. +type TimestampOption int + // TCPInfoOption is used by GetSockOpt to expose TCP statistics. // // TODO: Add and populate stat fields. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 5d62589d8..d84171b0c 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -374,7 +374,7 @@ func (e *endpoint) cleanup() { } // Read reads data from the endpoint. -func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, *tcpip.Error) { +func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.mu.RLock() // The endpoint can be read if it's connected, or if it's already closed // but has some pending unread data. 
Also note that a RST being received @@ -383,9 +383,9 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, *tcpip.Error) { if s := e.state; s != stateConnected && s != stateClosed && e.rcvBufUsed == 0 { e.mu.RUnlock() if s == stateError { - return buffer.View{}, e.hardError + return buffer.View{}, tcpip.ControlMessages{}, e.hardError } - return buffer.View{}, tcpip.ErrInvalidEndpointState + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState } e.rcvListMu.Lock() @@ -394,7 +394,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, *tcpip.Error) { e.mu.RUnlock() - return v, err + return v, tcpip.ControlMessages{}, err } func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { @@ -498,7 +498,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc // Peek reads data without consuming it from the endpoint. // // This method does not block if there is no data pending. -func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { +func (e *endpoint) Peek(vec [][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() @@ -506,9 +506,9 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { // but has some pending unread data. if s := e.state; s != stateConnected && s != stateClosed { if s == stateError { - return 0, e.hardError + return 0, tcpip.ControlMessages{}, e.hardError } - return 0, tcpip.ErrInvalidEndpointState + return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState } e.rcvListMu.Lock() @@ -516,9 +516,9 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { if e.rcvBufUsed == 0 { if e.rcvClosed || e.state != stateConnected { - return 0, tcpip.ErrClosedForReceive + return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive } - return 0, tcpip.ErrWouldBlock + return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock } // Make a copy of vec so we can modify the slide headers. 
@@ -534,7 +534,7 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { for len(v) > 0 { if len(vec) == 0 { - return num, nil + return num, tcpip.ControlMessages{}, nil } if len(vec[0]) == 0 { vec = vec[1:] @@ -549,7 +549,7 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, *tcpip.Error) { } } - return num, nil + return num, tcpip.ControlMessages{}, nil } // zeroReceiveWindow checks if the receive window to be announced now would be diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 118d861ba..3c21a1ec3 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -147,7 +147,7 @@ func TestSimpleReceive(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -169,7 +169,7 @@ func TestSimpleReceive(t *testing.T) { } // Receive data. - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -199,7 +199,7 @@ func TestOutOfOrderReceive(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -226,7 +226,7 @@ func TestOutOfOrderReceive(t *testing.T) { // Wait 200ms and check that no data has been received. time.Sleep(200 * time.Millisecond) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -243,7 +243,7 @@ func TestOutOfOrderReceive(t *testing.T) { // Receive data. 
read := make([]byte, 0, 6) for len(read) < len(data) { - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { if err == tcpip.ErrWouldBlock { // Wait for receive to be notified. @@ -284,7 +284,7 @@ func TestOutOfOrderFlood(t *testing.T) { opt := tcpip.ReceiveBufferSizeOption(10) c.CreateConnected(789, 30000, &opt) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -361,7 +361,7 @@ func TestRstOnCloseWithUnreadData(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -414,7 +414,7 @@ func TestFullWindowReceive(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - _, err := c.EP.Read(nil) + _, _, err := c.EP.Read(nil) if err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -449,7 +449,7 @@ func TestFullWindowReceive(t *testing.T) { ) // Receive data and check it. - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -487,7 +487,7 @@ func TestNoWindowShrinking(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - _, err := c.EP.Read(nil) + _, _, err := c.EP.Read(nil) if err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -551,7 +551,7 @@ func TestNoWindowShrinking(t *testing.T) { // Receive data and check it. read := make([]byte, 0, 10) for len(read) < len(data) { - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -954,7 +954,7 @@ func TestZeroScaledWindowReceive(t *testing.T) { } // Read some data. An ack should be sent in response to that. 
- v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -1337,7 +1337,7 @@ func TestReceiveOnResetConnection(t *testing.T) { loop: for { - switch _, err := c.EP.Read(nil); err { + switch _, _, err := c.EP.Read(nil); err { case nil: t.Fatalf("Unexpected success.") case tcpip.ErrWouldBlock: @@ -2293,7 +2293,7 @@ func TestReadAfterClosedState(t *testing.T) { c.WQ.EventRegister(&we, waiter.EventIn) defer c.WQ.EventUnregister(&we) - if _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { t.Fatalf("Unexpected error from Read: %v", err) } @@ -2345,7 +2345,7 @@ func TestReadAfterClosedState(t *testing.T) { // Check that peek works. peekBuf := make([]byte, 10) - n, err := c.EP.Peek([][]byte{peekBuf}) + n, _, err := c.EP.Peek([][]byte{peekBuf}) if err != nil { t.Fatalf("Unexpected error from Peek: %v", err) } @@ -2356,7 +2356,7 @@ func TestReadAfterClosedState(t *testing.T) { } // Receive data. - v, err := c.EP.Read(nil) + v, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -2367,11 +2367,11 @@ func TestReadAfterClosedState(t *testing.T) { // Now that we drained the queue, check that functions fail with the // right error code. 
- if _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive { t.Fatalf("Unexpected return from Read: got %v, want %v", err, tcpip.ErrClosedForReceive) } - if _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive { + if _, _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive { t.Fatalf("Unexpected return from Peek: got %v, want %v", err, tcpip.ErrClosedForReceive) } } @@ -2479,7 +2479,7 @@ func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) { } func TestDefaultBufferSizes(t *testing.T) { - s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) // Check the default values. ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) @@ -2525,7 +2525,7 @@ func TestDefaultBufferSizes(t *testing.T) { } func TestMinMaxBufferSizes(t *testing.T) { - s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) // Check the default values. ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) @@ -2575,7 +2575,7 @@ func TestSelfConnect(t *testing.T) { // it checks that if an endpoint binds to say 127.0.0.1:1000 then // connects to 127.0.0.1:1000, then it will be connected to itself, and // is able to send and receive data through the same endpoint. - s := stack.New([]string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{tcp.ProtocolName}) id := loopback.New() if testing.Verbose() { @@ -2637,13 +2637,13 @@ func TestSelfConnect(t *testing.T) { // Read back what was written. 
wq.EventUnregister(&waitEntry) wq.EventRegister(&waitEntry, waiter.EventIn) - rd, err := ep.Read(nil) + rd, _, err := ep.Read(nil) if err != nil { if err != tcpip.ErrWouldBlock { t.Fatalf("Read failed: %v", err) } <-notifyCh - rd, err = ep.Read(nil) + rd, _, err = ep.Read(nil) if err != nil { t.Fatalf("Read failed: %v", err) } diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index d12081bb7..335262e43 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -95,7 +95,7 @@ func TestTimeStampEnabledConnect(t *testing.T) { // There should be 5 views to read and each of them should // contain the same data. for i := 0; i < 5; i++ { - got, err := c.EP.Read(nil) + got, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } @@ -296,7 +296,7 @@ func TestSegmentDropWhenTimestampMissing(t *testing.T) { } // Issue a read and we should data. - got, err := c.EP.Read(nil) + got, _, err := c.EP.Read(nil) if err != nil { t.Fatalf("Unexpected error from Read: %v", err) } diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 6a402d150..eb928553f 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -129,7 +129,7 @@ type Context struct { // New allocates and initializes a test context containing a new // stack and a link-layer endpoint. func New(t *testing.T, mtu uint32) *Context { - s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{tcp.ProtocolName}) // Allow minimum send/receive buffer sizes to be 1 during tests. 
if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{1, tcp.DefaultBufferSize, tcp.DefaultBufferSize * 10}); err != nil { diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 80fa88c4c..f86fc6d5a 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -19,6 +19,8 @@ type udpPacket struct { udpPacketEntry senderAddress tcpip.FullAddress data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + timestamp int64 + hasTimestamp bool // views is used as buffer for data when its length is large // enough to store a VectorisedView. views [8]buffer.View `state:"nosave"` @@ -52,6 +54,7 @@ type endpoint struct { rcvBufSizeMax int `state:".(int)"` rcvBufSize int rcvClosed bool + rcvTimestamp bool // The following fields are protected by the mu mutex. mu sync.RWMutex `state:"nosave"` @@ -134,7 +137,7 @@ func (e *endpoint) Close() { // Read reads data from the endpoint. This method does not block if // there is no data pending. -func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, *tcpip.Error) { +func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.rcvMu.Lock() if e.rcvList.Empty() { @@ -143,12 +146,13 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, *tcpip.Error) { err = tcpip.ErrClosedForReceive } e.rcvMu.Unlock() - return buffer.View{}, err + return buffer.View{}, tcpip.ControlMessages{}, err } p := e.rcvList.Front() e.rcvList.Remove(p) e.rcvBufSize -= p.data.Size() + ts := e.rcvTimestamp e.rcvMu.Unlock() @@ -156,7 +160,12 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, *tcpip.Error) { *addr = p.senderAddress } - return p.data.ToView(), nil + if ts && !p.hasTimestamp { + // Linux uses the current time. 
+ p.timestamp = e.stack.NowNanoseconds() + } + + return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: ts, Timestamp: p.timestamp}, nil } // prepareForWrite prepares the endpoint for sending data. In particular, it @@ -299,8 +308,8 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc } // Peek only returns data from a single datagram, so do nothing here. -func (e *endpoint) Peek([][]byte) (uintptr, *tcpip.Error) { - return 0, nil +func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil } // SetSockOpt sets a socket option. Currently not supported. @@ -322,6 +331,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } e.v6only = v != 0 + + case tcpip.TimestampOption: + e.rcvMu.Lock() + e.rcvTimestamp = v != 0 + e.rcvMu.Unlock() } return nil } @@ -370,6 +384,14 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } e.rcvMu.Unlock() return nil + + case *tcpip.TimestampOption: + e.rcvMu.Lock() + *o = 0 + if e.rcvTimestamp { + *o = 1 + } + e.rcvMu.Unlock() } return tcpip.ErrUnknownProtocolOption @@ -733,6 +755,11 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv e.rcvList.PushBack(pkt) e.rcvBufSize += vv.Size() + if e.rcvTimestamp { + pkt.timestamp = e.stack.NowNanoseconds() + pkt.hasTimestamp = true + } + e.rcvMu.Unlock() // Notify any waiters that there's data to be read now. 
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 65c567952..1eb9ecb80 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -56,7 +56,7 @@ type headers struct { } func newDualTestContext(t *testing.T, mtu uint32) *testContext { - s := stack.New([]string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{udp.ProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, ipv6.ProtocolName}, []string{udp.ProtocolName}) id, linkEP := channel.New(256, mtu, "") if testing.Verbose() { @@ -260,12 +260,12 @@ func testV4Read(c *testContext) { defer c.wq.EventUnregister(&we) var addr tcpip.FullAddress - v, err := c.ep.Read(&addr) + v, _, err := c.ep.Read(&addr) if err == tcpip.ErrWouldBlock { // Wait for data to become available. select { case <-ch: - v, err = c.ep.Read(&addr) + v, _, err = c.ep.Read(&addr) if err != nil { c.t.Fatalf("Read failed: %v", err) } @@ -355,12 +355,12 @@ func TestV6ReadOnV6(t *testing.T) { defer c.wq.EventUnregister(&we) var addr tcpip.FullAddress - v, err := c.ep.Read(&addr) + v, _, err := c.ep.Read(&addr) if err == tcpip.ErrWouldBlock { // Wait for data to become available. 
select { case <-ch: - v, err = c.ep.Read(&addr) + v, _, err = c.ep.Read(&addr) if err != nil { c.t.Fatalf("Read failed: %v", err) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index a470cb054..d63a9028e 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -37,6 +37,7 @@ import ( slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" @@ -177,7 +178,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). - networkStack := newEmptyNetworkStack(conf) + networkStack := newEmptyNetworkStack(conf, k) // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. @@ -337,7 +338,7 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newEmptyNetworkStack(conf *Config) inet.Stack { +func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) inet.Stack { switch conf.Network { case NetworkHost: return hostinet.NewStack() @@ -346,7 +347,7 @@ func newEmptyNetworkStack(conf *Config) inet.Stack { // NetworkNone sets up loopback using netstack. 
netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} protoNames := []string{tcp.ProtocolName, udp.ProtocolName} - return &epsocket.Stack{stack.New(netProtos, protoNames)} + return &epsocket.Stack{stack.New(clock, netProtos, protoNames)} default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) -- cgit v1.2.3 From 65df95516898f077cda44ace15e45e4c777fdaf3 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 1 May 2018 22:17:13 -0700 Subject: Set LMA in EFER As of Linux 4.15 (f29810335965ac1f7bcb501ee2af5f039f792416 KVM/x86: Check input paging mode when cs.l is set), KVM validates that LMA is set along with LME. PiperOrigin-RevId: 195047401 Change-Id: I8b43d8f758a85b1f58ccbd747dcacd4056ef3f66 --- pkg/sentry/platform/ring0/kernel_amd64.go | 2 +- pkg/sentry/platform/ring0/x86.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index c82613a9c..76ba65b3f 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -149,7 +149,7 @@ func (c *CPU) CR4() uint64 { // //go:nosplit func (c *CPU) EFER() uint64 { - return _EFER_LME | _EFER_SCE | _EFER_NX + return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX } // IsCanonical indicates whether addr is canonical per the amd64 spec. 
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index e16f6c599..74b140066 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -46,6 +46,7 @@ const ( _EFER_SCE = 0x001 _EFER_LME = 0x100 + _EFER_LMA = 0x400 _EFER_NX = 0x800 _MSR_STAR = 0xc0000081 -- cgit v1.2.3 From eb5414ee29f20b1805345820e6174afff84276c2 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 1 May 2018 22:50:55 -0700 Subject: Add support for ping sockets PiperOrigin-RevId: 195049322 Change-Id: I09f6dd58cf10a2e50e53d17d2823d540102913c5 --- pkg/sentry/socket/epsocket/BUILD | 1 + pkg/sentry/socket/epsocket/provider.go | 19 +- pkg/sentry/syscalls/linux/sys_socket.go | 2 +- pkg/tcpip/network/arp/BUILD | 1 + pkg/tcpip/network/arp/arp_test.go | 3 +- pkg/tcpip/network/ipv4/BUILD | 17 +- pkg/tcpip/network/ipv4/icmp.go | 190 +-------- pkg/tcpip/network/ipv4/icmp_test.go | 124 ------ pkg/tcpip/transport/ping/BUILD | 50 +++ pkg/tcpip/transport/ping/endpoint.go | 665 +++++++++++++++++++++++++++++ pkg/tcpip/transport/ping/endpoint_state.go | 61 +++ pkg/tcpip/transport/ping/protocol.go | 106 +++++ pkg/tcpip/transport/udp/endpoint_state.go | 2 +- runsc/boot/BUILD | 1 + runsc/boot/loader.go | 3 +- 15 files changed, 914 insertions(+), 331 deletions(-) delete mode 100644 pkg/tcpip/network/ipv4/icmp_test.go create mode 100644 pkg/tcpip/transport/ping/BUILD create mode 100644 pkg/tcpip/transport/ping/endpoint.go create mode 100644 pkg/tcpip/transport/ping/endpoint_state.go create mode 100644 pkg/tcpip/transport/ping/protocol.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 0e463a92a..8430886cb 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", diff --git 
a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 5616435b3..6c1e3b6b9 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -23,6 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" @@ -37,8 +38,8 @@ type provider struct { netProto tcpip.NetworkProtocolNumber } -// GetTransportProtocol figures out transport protocol. Currently only TCP and -// UDP are supported. +// GetTransportProtocol figures out transport protocol. Currently only TCP, +// UDP, and ICMP are supported. func GetTransportProtocol(stype unix.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { switch stype { case linux.SOCK_STREAM: @@ -48,14 +49,16 @@ func GetTransportProtocol(stype unix.SockType, protocol int) (tcpip.TransportPro return tcp.ProtocolNumber, nil case linux.SOCK_DGRAM: - if protocol != 0 && protocol != syscall.IPPROTO_UDP { - return 0, syserr.ErrInvalidArgument + switch protocol { + case 0, syscall.IPPROTO_UDP: + return udp.ProtocolNumber, nil + case syscall.IPPROTO_ICMP: + return header.ICMPv4ProtocolNumber, nil + case syscall.IPPROTO_ICMPV6: + return header.ICMPv6ProtocolNumber, nil } - return udp.ProtocolNumber, nil - - default: - return 0, syserr.ErrInvalidArgument } + return 0, syserr.ErrInvalidArgument } // Socket creates a new socket object for the AF_INET or AF_INET6 family. 
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 6258a1539..d6d5dba8a 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -791,7 +791,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC) != 0 { + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CONFIRM) != 0 { return 0, syscall.EINVAL } diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index e6d0899a9..58d174965 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -30,5 +30,6 @@ go_test( "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/ping", ], ) diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 47b10e64e..6d61ff1d7 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -16,6 +16,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping" ) const ( @@ -32,7 +33,7 @@ type testContext struct { } func newTestContext(t *testing.T) *testContext { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, arp.ProtocolName}, []string{ipv4.PingProtocolName}) + s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName, arp.ProtocolName}, []string{ping.ProtocolName4}) const defaultMTU = 65536 id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 9df113df1..02d55355c 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # BSD 
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "ipv4", @@ -19,20 +19,5 @@ go_library( "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/network/hash", "//pkg/tcpip/stack", - "//pkg/waiter", - ], -) - -go_test( - name = "ipv4_test", - size = "small", - srcs = ["icmp_test.go"], - deps = [ - ":ipv4", - "//pkg/tcpip", - "//pkg/tcpip/buffer", - "//pkg/tcpip/link/channel", - "//pkg/tcpip/link/sniffer", - "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index ffd761350..3c382fdc2 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -5,26 +5,14 @@ package ipv4 import ( - "context" "encoding/binary" - "time" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// PingProtocolName is a pseudo transport protocol used to handle ping replies. -// Use it when constructing a stack that intends to use ipv4.Ping. -const PingProtocolName = "icmpv4ping" - -// pingProtocolNumber is a fake transport protocol used to -// deliver incoming ICMP echo replies. The ICMP identifier -// number is used as a port number for multiplexing. -const pingProtocolNumber tcpip.TransportProtocolNumber = 256 + 11 - // handleControl handles the case when an ICMP packet contains the headers of // the original packet that caused the ICMP one to be sent. 
This information is // used to find out which transport endpoint must be notified about the ICMP @@ -78,7 +66,10 @@ func (e *endpoint) handleICMP(r *stack.Route, vv *buffer.VectorisedView) { } case header.ICMPv4EchoReply: - e.dispatcher.DeliverTransportPacket(r, pingProtocolNumber, vv) + if len(v) < header.ICMPv4EchoMinimumSize { + return + } + e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, vv) case header.ICMPv4DstUnreachable: if len(v) < header.ICMPv4DstUnreachableMinimumSize { @@ -104,179 +95,20 @@ type echoRequest struct { func (e *endpoint) echoReplier() { for req := range e.echoRequests { - sendICMPv4(&req.r, header.ICMPv4EchoReply, 0, req.v) + sendPing4(&req.r, 0, req.v) req.r.Release() } } -func sendICMPv4(r *stack.Route, typ header.ICMPv4Type, code byte, data buffer.View) *tcpip.Error { - hdr := buffer.NewPrependable(header.ICMPv4MinimumSize + int(r.MaxHeaderLength())) +func sendPing4(r *stack.Route, code byte, data buffer.View) *tcpip.Error { + hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) - icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) - icmpv4.SetType(typ) + icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + icmpv4.SetType(header.ICMPv4EchoReply) icmpv4.SetCode(code) + copy(icmpv4[header.ICMPv4MinimumSize:], data) + data = data[header.ICMPv4EchoMinimumSize-header.ICMPv4MinimumSize:] icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) return r.WritePacket(&hdr, data, header.ICMPv4ProtocolNumber) } - -// A Pinger can send echo requests to an address. -type Pinger struct { - Stack *stack.Stack - NICID tcpip.NICID - Addr tcpip.Address - LocalAddr tcpip.Address // optional - Wait time.Duration // if zero, defaults to 1 second - Count uint16 // if zero, defaults to MaxUint16 -} - -// Ping sends echo requests to an ICMPv4 endpoint. -// Responses are streamed to the channel ch. 
-func (p *Pinger) Ping(ctx context.Context, ch chan<- PingReply) *tcpip.Error { - count := p.Count - if count == 0 { - count = 1<<16 - 1 - } - wait := p.Wait - if wait == 0 { - wait = 1 * time.Second - } - - r, err := p.Stack.FindRoute(p.NICID, p.LocalAddr, p.Addr, ProtocolNumber) - if err != nil { - return err - } - - netProtos := []tcpip.NetworkProtocolNumber{ProtocolNumber} - ep := &pingEndpoint{ - stack: p.Stack, - pktCh: make(chan buffer.View, 1), - } - id := stack.TransportEndpointID{ - LocalAddress: r.LocalAddress, - RemoteAddress: p.Addr, - } - - _, err = p.Stack.PickEphemeralPort(func(port uint16) (bool, *tcpip.Error) { - id.LocalPort = port - err := p.Stack.RegisterTransportEndpoint(p.NICID, netProtos, pingProtocolNumber, id, ep) - switch err { - case nil: - return true, nil - case tcpip.ErrPortInUse: - return false, nil - default: - return false, err - } - }) - if err != nil { - return err - } - defer p.Stack.UnregisterTransportEndpoint(p.NICID, netProtos, pingProtocolNumber, id) - - v := buffer.NewView(4) - binary.BigEndian.PutUint16(v[0:], id.LocalPort) - - start := time.Now() - - done := make(chan struct{}) - go func(count int) { - loop: - for ; count > 0; count-- { - select { - case v := <-ep.pktCh: - seq := binary.BigEndian.Uint16(v[header.ICMPv4MinimumSize+2:]) - ch <- PingReply{ - Duration: time.Since(start) - time.Duration(seq)*wait, - SeqNumber: seq, - } - case <-ctx.Done(): - break loop - } - } - close(done) - }(int(count)) - defer func() { <-done }() - - t := time.NewTicker(wait) - defer t.Stop() - for seq := uint16(0); seq < count; seq++ { - select { - case <-t.C: - case <-ctx.Done(): - return nil - } - binary.BigEndian.PutUint16(v[2:], seq) - sent := time.Now() - if err := sendICMPv4(&r, header.ICMPv4Echo, 0, v); err != nil { - ch <- PingReply{ - Error: err, - Duration: time.Since(sent), - SeqNumber: seq, - } - } - } - return nil -} - -// PingReply summarizes an ICMP echo reply. 
-type PingReply struct { - Error *tcpip.Error // reports any errors sending a ping request - Duration time.Duration - SeqNumber uint16 -} - -type pingProtocol struct{} - -func (*pingProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return nil, tcpip.ErrNotSupported // endpoints are created directly -} - -func (*pingProtocol) Number() tcpip.TransportProtocolNumber { return pingProtocolNumber } - -func (*pingProtocol) MinimumPacketSize() int { return header.ICMPv4EchoMinimumSize } - -func (*pingProtocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { - ident := binary.BigEndian.Uint16(v[4:]) - return 0, ident, nil -} - -func (*pingProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *buffer.VectorisedView) bool { - return true -} - -// SetOption implements TransportProtocol.SetOption. -func (p *pingProtocol) SetOption(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// Option implements TransportProtocol.Option. -func (p *pingProtocol) Option(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -func init() { - stack.RegisterTransportProtocolFactory(PingProtocolName, func() stack.TransportProtocol { - return &pingProtocol{} - }) -} - -type pingEndpoint struct { - stack *stack.Stack - pktCh chan buffer.View -} - -func (e *pingEndpoint) Close() { - close(e.pktCh) -} - -func (e *pingEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv *buffer.VectorisedView) { - select { - case e.pktCh <- vv.ToView(): - default: - } -} - -// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. 
-func (e *pingEndpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv *buffer.VectorisedView) { -} diff --git a/pkg/tcpip/network/ipv4/icmp_test.go b/pkg/tcpip/network/ipv4/icmp_test.go deleted file mode 100644 index c55aa1835..000000000 --- a/pkg/tcpip/network/ipv4/icmp_test.go +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2016 The Netstack Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package ipv4_test - -import ( - "context" - "testing" - "time" - - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/channel" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" -) - -const stackAddr = "\x0a\x00\x00\x01" - -type testContext struct { - t *testing.T - linkEP *channel.Endpoint - s *stack.Stack -} - -func newTestContext(t *testing.T) *testContext { - s := stack.New(&tcpip.StdClock{}, []string{ipv4.ProtocolName}, []string{ipv4.PingProtocolName}) - - const defaultMTU = 65536 - id, linkEP := channel.New(256, defaultMTU, "") - if testing.Verbose() { - id = sniffer.New(id) - } - if err := s.CreateNIC(1, id); err != nil { - t.Fatalf("CreateNIC failed: %v", err) - } - - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr); err != nil { - t.Fatalf("AddAddress failed: %v", err) - } - - s.SetRouteTable([]tcpip.Route{{ - Destination: "\x00\x00\x00\x00", - Mask: "\x00\x00\x00\x00", - Gateway: "", - NIC: 1, - }}) - - return &testContext{ - t: t, - s: s, - linkEP: linkEP, - } -} - -func (c *testContext) cleanup() { - close(c.linkEP.C) -} - -func (c *testContext) loopback() { - go func() { - for pkt := range c.linkEP.C { - v := make(buffer.View, len(pkt.Header)+len(pkt.Payload)) - copy(v, pkt.Header) - copy(v[len(pkt.Header):], pkt.Payload) 
- vv := v.ToVectorisedView([1]buffer.View{}) - c.linkEP.Inject(pkt.Proto, &vv) - } - }() -} - -func TestEcho(t *testing.T) { - c := newTestContext(t) - defer c.cleanup() - c.loopback() - - ch := make(chan ipv4.PingReply, 1) - p := ipv4.Pinger{ - Stack: c.s, - NICID: 1, - Addr: stackAddr, - Wait: 10 * time.Millisecond, - Count: 1, // one ping only - } - if err := p.Ping(context.Background(), ch); err != nil { - t.Fatalf("icmp.Ping failed: %v", err) - } - - ping := <-ch - if ping.Error != nil { - t.Errorf("bad ping response: %v", ping.Error) - } -} - -func TestEchoSequence(t *testing.T) { - c := newTestContext(t) - defer c.cleanup() - c.loopback() - - const numPings = 3 - ch := make(chan ipv4.PingReply, numPings) - p := ipv4.Pinger{ - Stack: c.s, - NICID: 1, - Addr: stackAddr, - Wait: 10 * time.Millisecond, - Count: numPings, - } - if err := p.Ping(context.Background(), ch); err != nil { - t.Fatalf("icmp.Ping failed: %v", err) - } - - for i := uint16(0); i < numPings; i++ { - ping := <-ch - if ping.Error != nil { - t.Errorf("i=%d bad ping response: %v", i, ping.Error) - } - if ping.SeqNumber != i { - t.Errorf("SeqNumber=%d, want %d", ping.SeqNumber, i) - } - } -} diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD new file mode 100644 index 000000000..a39a887b6 --- /dev/null +++ b/pkg/tcpip/transport/ping/BUILD @@ -0,0 +1,50 @@ +package(licenses = ["notice"]) # BSD + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "ping_state", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "ping_packet_list.go", + ], + out = "ping_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], + package = "ping", +) + +go_template_instance( + name = "ping_packet_list", + out = "ping_packet_list.go", + package = "ping", + prefix = "pingPacket", + template = "//pkg/ilist:generic_list", + types = { + 
"Linker": "*pingPacket", + }, +) + +go_library( + name = "ping", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "ping_packet_list.go", + "ping_state.go", + "protocol.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping", + visibility = ["//visibility:public"], + deps = [ + "//pkg/sleep", + "//pkg/state", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + "//pkg/waiter", + ], +) diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go new file mode 100644 index 000000000..609e7d947 --- /dev/null +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -0,0 +1,665 @@ +// Copyright 2016 The Netstack Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ping + +import ( + "encoding/binary" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sleep" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +type pingPacket struct { + pingPacketEntry + senderAddress tcpip.FullAddress + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + timestamp int64 + hasTimestamp bool + // views is used as buffer for data when its length is large + // enough to store a VectorisedView. + views [8]buffer.View `state:"nosave"` +} + +type endpointState int + +const ( + stateInitial endpointState = iota + stateBound + stateConnected + stateClosed +) + +// endpoint represents a ping endpoint. This struct serves as the interface +// between users of the endpoint and the protocol implementation; it is legal to +// have concurrent goroutines make calls into the endpoint, they are properly +// synchronized. 
+type endpoint struct { + // The following fields are initialized at creation time and do not + // change throughout the lifetime of the endpoint. + stack *stack.Stack `state:"manual"` + netProto tcpip.NetworkProtocolNumber + waiterQueue *waiter.Queue + + // The following fields are used to manage the receive queue, and are + // protected by rcvMu. + rcvMu sync.Mutex `state:"nosave"` + rcvReady bool + rcvList pingPacketList + rcvBufSizeMax int + rcvBufSize int + rcvClosed bool + rcvTimestamp bool + + // The following fields are protected by the mu mutex. + mu sync.RWMutex `state:"nosave"` + sndBufSize int + id stack.TransportEndpointID + state endpointState + bindNICID tcpip.NICID + bindAddr tcpip.Address + regNICID tcpip.NICID + route stack.Route `state:"manual"` +} + +func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { + return &endpoint{ + stack: stack, + netProto: netProto, + waiterQueue: waiterQueue, + rcvBufSizeMax: 32 * 1024, + sndBufSize: 32 * 1024, + } +} + +// Close puts the endpoint in a closed state and frees all resources +// associated with it. +func (e *endpoint) Close() { + e.mu.Lock() + defer e.mu.Unlock() + + switch e.state { + case stateBound, stateConnected: + e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, ProtocolNumber4, e.id) + } + + // Close the receive list and drain it. + e.rcvMu.Lock() + e.rcvClosed = true + e.rcvBufSize = 0 + for !e.rcvList.Empty() { + p := e.rcvList.Front() + e.rcvList.Remove(p) + } + e.rcvMu.Unlock() + + e.route.Release() + + // Update the state. + e.state = stateClosed +} + +// Read reads data from the endpoint. This method does not block if +// there is no data pending. 
+func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + e.rcvMu.Lock() + + if e.rcvList.Empty() { + err := tcpip.ErrWouldBlock + if e.rcvClosed { + err = tcpip.ErrClosedForReceive + } + e.rcvMu.Unlock() + return buffer.View{}, tcpip.ControlMessages{}, err + } + + p := e.rcvList.Front() + e.rcvList.Remove(p) + e.rcvBufSize -= p.data.Size() + ts := e.rcvTimestamp + + e.rcvMu.Unlock() + + if addr != nil { + *addr = p.senderAddress + } + + if ts && !p.hasTimestamp { + // Linux uses the current time. + p.timestamp = e.stack.NowNanoseconds() + } + + return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: ts, Timestamp: p.timestamp}, nil +} + +// prepareForWrite prepares the endpoint for sending data. In particular, it +// binds it if it's still in the initial state. To do so, it must first +// reacquire the mutex in exclusive mode. +// +// Returns true for retry if preparation should be retried. +func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) { + switch e.state { + case stateInitial: + case stateConnected: + return false, nil + + case stateBound: + if to == nil { + return false, tcpip.ErrDestinationRequired + } + return false, nil + default: + return false, tcpip.ErrInvalidEndpointState + } + + e.mu.RUnlock() + defer e.mu.RLock() + + e.mu.Lock() + defer e.mu.Unlock() + + // The state changed when we released the shared locked and re-acquired + // it in exclusive mode. Try again. + if e.state != stateInitial { + return true, nil + } + + // The state is still 'initial', so try to bind the endpoint. + if err := e.bindLocked(tcpip.FullAddress{}, nil); err != nil { + return false, err + } + + return true, nil +} + +// Write writes data to the endpoint's peer. This method does not block +// if the data cannot be written. +func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tcpip.Error) { + // MSG_MORE is unimplemented. 
(This also means that MSG_EOR is a no-op.) + if opts.More { + return 0, tcpip.ErrInvalidOptionValue + } + + to := opts.To + + e.mu.RLock() + defer e.mu.RUnlock() + + // Prepare for write. + for { + retry, err := e.prepareForWrite(to) + if err != nil { + return 0, err + } + + if !retry { + break + } + } + + var route *stack.Route + if to == nil { + route = &e.route + + if route.IsResolutionRequired() { + // Promote lock to exclusive if using a shared route, given that it may + // need to change in Route.Resolve() call below. + e.mu.RUnlock() + defer e.mu.RLock() + + e.mu.Lock() + defer e.mu.Unlock() + + // Recheck state after lock was re-acquired. + if e.state != stateConnected { + return 0, tcpip.ErrInvalidEndpointState + } + } + } else { + // Reject destination address if it goes through a different + // NIC than the endpoint was bound to. + nicid := to.NIC + if e.bindNICID != 0 { + if nicid != 0 && nicid != e.bindNICID { + return 0, tcpip.ErrNoRoute + } + + nicid = e.bindNICID + } + + toCopy := *to + to = &toCopy + netProto, err := e.checkV4Mapped(to, true) + if err != nil { + return 0, err + } + + // Find the enpoint. + r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto) + if err != nil { + return 0, err + } + defer r.Release() + + route = &r + } + + if route.IsResolutionRequired() { + waker := &sleep.Waker{} + if err := route.Resolve(waker); err != nil { + if err == tcpip.ErrWouldBlock { + // Link address needs to be resolved. Resolution was triggered the + // background. Better luck next time. + // + // TODO: queue up the request and send after link address + // is resolved. + route.RemoveWaker(waker) + return 0, tcpip.ErrNoLinkAddress + } + return 0, err + } + } + + v, err := p.Get(p.Size()) + if err != nil { + return 0, err + } + + switch e.netProto { + case header.IPv4ProtocolNumber: + err = sendPing4(route, e.id.LocalPort, v) + + case header.IPv6ProtocolNumber: + // TODO: Support IPv6. 
+ } + + return uintptr(len(v)), err +} + +// Peek only returns data from a single datagram, so do nothing here. +func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil +} + +// SetSockOpt sets a socket option. Currently not supported. +func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + switch v := opt.(type) { + case tcpip.TimestampOption: + e.rcvMu.Lock() + e.rcvTimestamp = v != 0 + e.rcvMu.Unlock() + } + return nil +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. +func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + return nil + + case *tcpip.SendBufferSizeOption: + e.mu.Lock() + *o = tcpip.SendBufferSizeOption(e.sndBufSize) + e.mu.Unlock() + return nil + + case *tcpip.ReceiveBufferSizeOption: + e.rcvMu.Lock() + *o = tcpip.ReceiveBufferSizeOption(e.rcvBufSizeMax) + e.rcvMu.Unlock() + return nil + + case *tcpip.ReceiveQueueSizeOption: + e.rcvMu.Lock() + if e.rcvList.Empty() { + *o = 0 + } else { + p := e.rcvList.Front() + *o = tcpip.ReceiveQueueSizeOption(p.data.Size()) + } + e.rcvMu.Unlock() + return nil + + case *tcpip.TimestampOption: + e.rcvMu.Lock() + *o = 0 + if e.rcvTimestamp { + *o = 1 + } + e.rcvMu.Unlock() + } + + return tcpip.ErrUnknownProtocolOption +} + +func sendPing4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { + if len(data) < header.ICMPv4EchoMinimumSize { + return tcpip.ErrInvalidEndpointState + } + + // Set the ident. Sequence number is provided by the user. + binary.BigEndian.PutUint16(data[header.ICMPv4MinimumSize:], ident) + + hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) + + icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + copy(icmpv4, data) + data = data[header.ICMPv4EchoMinimumSize:] + + // Linux performs these basic checks. 
+ if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 { + return tcpip.ErrInvalidEndpointState + } + + icmpv4.SetChecksum(0) + icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) + + return r.WritePacket(&hdr, data, header.ICMPv4ProtocolNumber) +} + +func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { + netProto := e.netProto + if header.IsV4MappedAddress(addr.Addr) { + return 0, tcpip.ErrNoRoute + } + + // Fail if we're bound to an address length different from the one we're + // checking. + if l := len(e.id.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) { + return 0, tcpip.ErrInvalidEndpointState + } + + return netProto, nil +} + +// Connect connects the endpoint to its peer. Specifying a NIC is optional. +func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + nicid := addr.NIC + localPort := uint16(0) + switch e.state { + case stateBound, stateConnected: + localPort = e.id.LocalPort + if e.bindNICID == 0 { + break + } + + if nicid != 0 && nicid != e.bindNICID { + return tcpip.ErrInvalidEndpointState + } + + nicid = e.bindNICID + default: + return tcpip.ErrInvalidEndpointState + } + + netProto, err := e.checkV4Mapped(&addr, false) + if err != nil { + return err + } + + // Find a route to the desired destination. + r, err := e.stack.FindRoute(nicid, e.bindAddr, addr.Addr, netProto) + if err != nil { + return err + } + defer r.Release() + + id := stack.TransportEndpointID{ + LocalAddress: r.LocalAddress, + LocalPort: localPort, + RemoteAddress: r.RemoteAddress, + } + + // Even if we're connected, this endpoint can still be used to send + // packets on a different network protocol, so we register both even if + // v6only is set to false and this is an ipv6 endpoint. 
+ netProtos := []tcpip.NetworkProtocolNumber{netProto} + + id, err = e.registerWithStack(nicid, netProtos, id) + if err != nil { + return err + } + + e.id = id + e.route = r.Clone() + e.regNICID = nicid + + e.state = stateConnected + + e.rcvMu.Lock() + e.rcvReady = true + e.rcvMu.Unlock() + + return nil +} + +// ConnectEndpoint is not supported. +func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Shutdown closes the read and/or write end of the endpoint connection +// to its peer. +func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + e.mu.RLock() + defer e.mu.RUnlock() + + if e.state != stateConnected { + return tcpip.ErrNotConnected + } + + if flags&tcpip.ShutdownRead != 0 { + e.rcvMu.Lock() + wasClosed := e.rcvClosed + e.rcvClosed = true + e.rcvMu.Unlock() + + if !wasClosed { + e.waiterQueue.Notify(waiter.EventIn) + } + } + + return nil +} + +// Listen is not supported by UDP, it just fails. +func (*endpoint) Listen(int) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Accept is not supported by UDP, it just fails. +func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { + return nil, nil, tcpip.ErrNotSupported +} + +func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { + if id.LocalPort != 0 { + // The endpoint already has a local port, just attempt to + // register it. + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber4, id, e) + return id, err + } + + // We need to find a port for the endpoint. 
+ _, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { + id.LocalPort = p + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber4, id, e) + switch err { + case nil: + return true, nil + case tcpip.ErrPortInUse: + return false, nil + default: + return false, err + } + }) + + return id, err +} + +func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + // Don't allow binding once endpoint is not in the initial state + // anymore. + if e.state != stateInitial { + return tcpip.ErrInvalidEndpointState + } + + netProto, err := e.checkV4Mapped(&addr, false) + if err != nil { + return err + } + + // Expand netProtos to include v4 and v6 if the caller is binding to a + // wildcard (empty) address, and this is an IPv6 endpoint with v6only + // set to false. + netProtos := []tcpip.NetworkProtocolNumber{netProto} + + if len(addr.Addr) != 0 { + // A local address was specified, verify that it's valid. + if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 { + return tcpip.ErrBadLocalAddress + } + } + + id := stack.TransportEndpointID{ + LocalPort: addr.Port, + LocalAddress: addr.Addr, + } + id, err = e.registerWithStack(addr.NIC, netProtos, id) + if err != nil { + return err + } + if commit != nil { + if err := commit(); err != nil { + // Unregister, the commit failed. + e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, ProtocolNumber4, id) + return err + } + } + + e.id = id + e.regNICID = addr.NIC + + // Mark endpoint as bound. + e.state = stateBound + + e.rcvMu.Lock() + e.rcvReady = true + e.rcvMu.Unlock() + + return nil +} + +// Bind binds the endpoint to a specific local address and port. +// Specifying a NIC is optional. 
+func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + err := e.bindLocked(addr, commit) + if err != nil { + return err + } + + e.bindNICID = addr.NIC + e.bindAddr = addr.Addr + + return nil +} + +// GetLocalAddress returns the address to which the endpoint is bound. +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + e.mu.RLock() + defer e.mu.RUnlock() + + return tcpip.FullAddress{ + NIC: e.regNICID, + Addr: e.id.LocalAddress, + Port: e.id.LocalPort, + }, nil +} + +// GetRemoteAddress returns the address to which the endpoint is connected. +func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + e.mu.RLock() + defer e.mu.RUnlock() + + if e.state != stateConnected { + return tcpip.FullAddress{}, tcpip.ErrNotConnected + } + + return tcpip.FullAddress{ + NIC: e.regNICID, + Addr: e.id.RemoteAddress, + Port: e.id.RemotePort, + }, nil +} + +// Readiness returns the current readiness of the endpoint. For example, if +// waiter.EventIn is set, the endpoint is immediately readable. +func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + // The endpoint is always writable. + result := waiter.EventOut & mask + + // Determine if the endpoint is readable if requested. + if (mask & waiter.EventIn) != 0 { + e.rcvMu.Lock() + if !e.rcvList.Empty() || e.rcvClosed { + result |= waiter.EventIn + } + e.rcvMu.Unlock() + } + + return result +} + +// HandlePacket is called by the stack when new packets arrive to this transport +// endpoint. +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv *buffer.VectorisedView) { + e.rcvMu.Lock() + + // Drop the packet if our buffer is currently full. + if !e.rcvReady || e.rcvClosed || e.rcvBufSize >= e.rcvBufSizeMax { + e.rcvMu.Unlock() + return + } + + wasEmpty := e.rcvBufSize == 0 + + // Push new packet into receive list and increment the buffer size. 
+ pkt := &pingPacket{ + senderAddress: tcpip.FullAddress{ + NIC: r.NICID(), + Addr: id.RemoteAddress, + }, + } + pkt.data = vv.Clone(pkt.views[:]) + e.rcvList.PushBack(pkt) + e.rcvBufSize += vv.Size() + + if e.rcvTimestamp { + pkt.timestamp = e.stack.NowNanoseconds() + pkt.hasTimestamp = true + } + + e.rcvMu.Unlock() + + // Notify any waiters that there's data to be read now. + if wasEmpty { + e.waiterQueue.Notify(waiter.EventIn) + } +} + +// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. +func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv *buffer.VectorisedView) { +} diff --git a/pkg/tcpip/transport/ping/endpoint_state.go b/pkg/tcpip/transport/ping/endpoint_state.go new file mode 100644 index 000000000..e1664f049 --- /dev/null +++ b/pkg/tcpip/transport/ping/endpoint_state.go @@ -0,0 +1,61 @@ +// Copyright 2016 The Netstack Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ping + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" +) + +// saveData saves pingPacket.data field. +func (p *pingPacket) saveData() buffer.VectorisedView { + // We cannot save p.data directly as p.data.views may alias to p.views, + // which is not allowed by state framework (in-struct pointer). + return p.data.Clone(nil) +} + +// loadData loads pingPacket.data field. +func (p *pingPacket) loadData(data buffer.VectorisedView) { + // NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization + // here because data.views is not guaranteed to be loaded by now. Plus, + // data.views will be allocated anyway so there really is little point + // of utilizing p.views for data.views. + p.data = data +} + +// beforeSave is invoked by stateify. 
+func (e *endpoint) beforeSave() { + // Stop incoming packets from being handled (and mutate endpoint state). + e.rcvMu.Lock() +} + +// afterLoad is invoked by stateify. +func (e *endpoint) afterLoad() { + e.stack = stack.StackFromEnv + + if e.state != stateBound && e.state != stateConnected { + return + } + + var err *tcpip.Error + if e.state == stateConnected { + e.route, err = e.stack.FindRoute(e.regNICID, e.bindAddr, e.id.RemoteAddress, e.netProto) + if err != nil { + panic(*err) + } + + e.id.LocalAddress = e.route.LocalAddress + } else if len(e.id.LocalAddress) != 0 { // stateBound + if e.stack.CheckLocalAddress(e.regNICID, e.netProto, e.id.LocalAddress) == 0 { + panic(tcpip.ErrBadLocalAddress) + } + } + + e.id, err = e.registerWithStack(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.id) + if err != nil { + panic(*err) + } +} diff --git a/pkg/tcpip/transport/ping/protocol.go b/pkg/tcpip/transport/ping/protocol.go new file mode 100644 index 000000000..1459b4d60 --- /dev/null +++ b/pkg/tcpip/transport/ping/protocol.go @@ -0,0 +1,106 @@ +// Copyright 2016 The Netstack Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package ping contains the implementation of the ICMP and IPv6-ICMP transport +// protocols for use in ping. To use it in the networking stack, this package +// must be added to the project, and +// activated on the stack by passing ping.ProtocolName (or "ping") and/or +// ping.ProtocolName6 (or "ping6") as one of the transport protocols when +// calling stack.New(). Then endpoints can be created by passing +// ping.ProtocolNumber or ping.ProtocolNumber6 as the transport protocol number +// when calling Stack.NewEndpoint(). 
+package ping + +import ( + "encoding/binary" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + // ProtocolName4 is the string representation of the ping protocol name. + ProtocolName4 = "ping4" + + // ProtocolNumber4 is the ICMP protocol number. + ProtocolNumber4 = header.ICMPv4ProtocolNumber + + // ProtocolName6 is the string representation of the ping protocol name. + ProtocolName6 = "ping6" + + // ProtocolNumber6 is the IPv6-ICMP protocol number. + ProtocolNumber6 = header.ICMPv6ProtocolNumber +) + +type protocol struct { + number tcpip.TransportProtocolNumber +} + +// Number returns the ICMP protocol number. +func (p *protocol) Number() tcpip.TransportProtocolNumber { + return p.number +} + +func (p *protocol) netProto() tcpip.NetworkProtocolNumber { + switch p.number { + case ProtocolNumber4: + return header.IPv4ProtocolNumber + case ProtocolNumber6: + return header.IPv6ProtocolNumber + } + panic(fmt.Sprint("unknown protocol number: ", p.number)) +} + +// NewEndpoint creates a new ping endpoint. +func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if netProto != p.netProto() { + return nil, tcpip.ErrUnknownProtocol + } + return newEndpoint(stack, netProto, waiterQueue), nil +} + +// MinimumPacketSize returns the minimum valid ping packet size. +func (p *protocol) MinimumPacketSize() int { + switch p.number { + case ProtocolNumber4: + return header.ICMPv4EchoMinimumSize + case ProtocolNumber6: + return header.ICMPv6EchoMinimumSize + } + panic(fmt.Sprint("unknown protocol number: ", p.number)) +} + +// ParsePorts returns the source and destination ports stored in the given udp +// packet. 
+func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { + return 0, binary.BigEndian.Uint16(v[header.ICMPv4MinimumSize:]), nil +} + +// HandleUnknownDestinationPacket handles packets targeted at this protocol but +// that don't match any existing endpoint. +func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *buffer.VectorisedView) bool { + return true +} + +// SetOption implements TransportProtocol.SetOption. +func (p *protocol) SetOption(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// Option implements TransportProtocol.Option. +func (p *protocol) Option(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +func init() { + stack.RegisterTransportProtocolFactory(ProtocolName4, func() stack.TransportProtocol { + return &protocol{ProtocolNumber4} + }) + + // TODO: Support IPv6. +} diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 41b98424a..e20d59ca3 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -13,7 +13,7 @@ import ( // saveData saves udpPacket.data field. func (u *udpPacket) saveData() buffer.VectorisedView { - // We canoot save u.data directly as u.data.views may alias to u.views, + // We cannot save u.data directly as u.data.views may alias to u.views, // which is not allowed by state framework (in-struct pointer). 
return u.data.Clone(nil) } diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 88736cfa4..16522c668 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -64,6 +64,7 @@ go_library( "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/ping", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index d63a9028e..af577f571 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -43,6 +43,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp" "gvisor.googlesource.com/gvisor/runsc/boot/filter" @@ -346,7 +347,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) inet.Stack { case NetworkNone, NetworkSandbox: // NetworkNone sets up loopback using netstack. 
netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} - protoNames := []string{tcp.ProtocolName, udp.ProtocolName} + protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4} return &epsocket.Stack{stack.New(clock, netProtos, protoNames)} default: -- cgit v1.2.3 From 2264ce7f62994fa0476f8e1eb9fe497d60547bda Mon Sep 17 00:00:00 2001 From: Zhengyu He Date: Wed, 2 May 2018 03:42:54 -0700 Subject: Use png for the run states diagram PiperOrigin-RevId: 195071508 Change-Id: I63314bf7529560e4c779ef07cc9399ad8d53f0a2 --- pkg/sentry/kernel/README.md | 2 +- pkg/sentry/kernel/g3doc/run_states.png | Bin 0 -> 234152 bytes 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 pkg/sentry/kernel/g3doc/run_states.png (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/README.md b/pkg/sentry/kernel/README.md index 3306780d6..88760a9bb 100644 --- a/pkg/sentry/kernel/README.md +++ b/pkg/sentry/kernel/README.md @@ -87,7 +87,7 @@ kept separate from the main "app" state to reduce the size of the latter. 4. `SyscallReinvoke`, which does not correspond to anything in Linux, and exists solely to serve the autosave feature. 
-![dot -Tsvg -Goverlap=false -orun_states.svg run_states.dot](g3doc/run_states.dot "Task control flow graph") +![dot -Tpng -Goverlap=false -orun_states.png run_states.dot](g3doc/run_states.png "Task control flow graph") States before which a stop may occur are represented as implementations of the `taskRunState` interface named `run(state)`, allowing them to be saved and diff --git a/pkg/sentry/kernel/g3doc/run_states.png b/pkg/sentry/kernel/g3doc/run_states.png new file mode 100644 index 000000000..b63b60f02 Binary files /dev/null and b/pkg/sentry/kernel/g3doc/run_states.png differ -- cgit v1.2.3 From a61def1b368a9042e346787008e12770e4e67b35 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 2 May 2018 17:39:12 -0700 Subject: Remove detach for exec options Detachable exec commands are handled in the client entirely and the detach option is not used anymore. PiperOrigin-RevId: 195181272 Change-Id: I6e82a2876d2c173709c099be59670f71702e5bf0 --- pkg/sentry/control/proc.go | 9 --------- runsc/cmd/exec.go | 3 +-- runsc/sandbox/sandbox_test.go | 8 +------- 3 files changed, 2 insertions(+), 18 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 7d06a1d04..d77b30c90 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -72,9 +72,6 @@ type ExecArgs struct { // Capabilities is the list of capabilities to give to the process. Capabilities *auth.TaskCapabilities - // Detach indicates whether Exec should detach once the process starts. - Detach bool - // FilePayload determines the files to give to the new process. urpc.FilePayload } @@ -135,12 +132,6 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { return err } - // If we're supposed to detach, don't wait for the process to exit. - if args.Detach { - *waitStatus = 0 - return nil - } - // Wait for completion. 
newTG.WaitExited() *waitStatus = newTG.ExitStatus().Status() diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 8379f552d..576031b5b 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -99,7 +99,6 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("error parsing process spec: %v", err) } - e.Detach = ex.detach conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) @@ -123,7 +122,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // executed. If detach was specified, starts a child in non-detach mode, // write the child's PID to the pid file. So when the container returns, the // child process will also return and signal containerd. - if e.Detach { + if ex.detach { binPath, err := specutils.BinPath() if err != nil { Fatalf("error getting bin path: %v", err) diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index 6c71cac30..6e3125b7b 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -365,7 +365,6 @@ func TestExec(t *testing.T) { Envv: []string{"PATH=" + os.Getenv("PATH")}, WorkingDirectory: "/", KUID: uid, - Detach: false, } // Verify that "sleep 100" and "sleep 5" are running after exec. @@ -472,7 +471,6 @@ func TestCapabilities(t *testing.T) { KUID: uid, KGID: gid, Capabilities: &auth.TaskCapabilities{}, - Detach: true, } // "exe" should fail because we don't have the necessary permissions. @@ -484,14 +482,10 @@ func TestCapabilities(t *testing.T) { execArgs.Capabilities = &auth.TaskCapabilities{ EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), } - // First, start running exec. + // "exe" should not fail this time. if _, err := s.Execute(&execArgs); err != nil { t.Fatalf("sandbox failed to exec %v: %v", execArgs, err) } - - if err := waitForProcessList(s, expectedPL); err != nil { - t.Error(err) - } } // Test that an tty FD is sent over the console socket if one is provided. 
-- cgit v1.2.3 From 9739b8c21cd1716c4c1c81a27121c50e63fcf906 Mon Sep 17 00:00:00 2001 From: Christopher Koch Date: Thu, 3 May 2018 09:59:32 -0700 Subject: Don't prematurely remove MountSource from parent's children. Otherwise, mounts that fail to be unmounted (EBUSY) will be removed from the children list anyway. At this point, this just affects /proc/pid/mounts and /proc/pid/mountinfo. PiperOrigin-RevId: 195267588 Change-Id: I79114483d73b90f9a7d764a7d513b5b2f251182e --- pkg/sentry/fs/mounts.go | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 1e6b5b70e..87da4ee0e 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -272,31 +272,8 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly panic("cannot unmount initial dirent") } + m := node.Inode.MountSource if !detachOnly { - m := node.Inode.MountSource - - // Lock the parent MountSource first, if it exists. We are - // holding mns.Lock, so the parent can not change out - // from under us. - parent := m.Parent() - if parent != nil { - parent.mu.Lock() - defer parent.mu.Unlock() - } - - // Lock the mount that is being unmounted. - m.mu.Lock() - defer m.mu.Unlock() - - if m.parent != nil { - // Sanity check. - if _, ok := m.parent.children[m]; !ok { - panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent)) - } - delete(m.parent.children, m) - m.parent = nil - } - // Flush all references on the mounted node. m.FlushDirentRefs() @@ -315,6 +292,27 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly } } + // Lock the parent MountSource first, if it exists. We are + // holding mns.Lock, so the parent can not change out + // from under us. + parent := m.Parent() + if parent != nil { + parent.mu.Lock() + defer parent.mu.Unlock() + } + + // Lock the mount that is being unmounted. 
+ m.mu.Lock() + defer m.mu.Unlock() + + if m.parent != nil { + // Sanity check. + if _, ok := m.parent.children[m]; !ok { + panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent)) + } + delete(m.parent.children, m) + } + original := origs[len(origs)-1] if err := node.unmount(ctx, original); err != nil { return err -- cgit v1.2.3 From 04b79137babed361fb227e3ad579adb2df4bb188 Mon Sep 17 00:00:00 2001 From: Cyrille Hemidy Date: Thu, 3 May 2018 14:05:25 -0700 Subject: Fix misspellings. PiperOrigin-RevId: 195307689 Change-Id: I499f19af49875a43214797d63376f20ae788d2f4 --- pkg/log/log.go | 2 +- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/fsutil/file.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 4 ++-- pkg/sentry/fs/tty/line_discipline.go | 4 ++-- pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/mm/vma.go | 2 +- pkg/sentry/socket/rpcinet/socket.go | 2 +- pkg/sentry/strace/syscalls.go | 2 +- pkg/tcpip/header/ipv6.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/transport/tcp/snd.go | 2 +- runsc/sandbox/sandbox.go | 4 ++-- 14 files changed, 17 insertions(+), 17 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/log/log.go b/pkg/log/log.go index 110e0e196..cdfc0601a 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -149,7 +149,7 @@ func (t TestEmitter) Emit(level Level, timestamp time.Time, format string, v ... // Logger is a high-level logging interface. It is in fact, not used within the // log package. Rather it is provided for others to provide contextual loggers // that may append some addition information to log statement. BasicLogger -// satifies this interface, and may be passed around as a Logger. +// satisfies this interface, and may be passed around as a Logger. type Logger interface { // Debugf logs a debug statement. 
Debugf(format string, v ...interface{}) diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index de2e80bf0..f2683bbd2 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -376,7 +376,7 @@ func (r *FileReader) Read(buf []byte) (int, error) { return int(n), err } -// ReadAt implementes io.Reader.ReadAt. +// ReadAt implements io.Reader.ReadAt. func (r *FileReader) ReadAt(buf []byte, offset int64) (int, error) { n, err := r.File.Preadv(r.Ctx, usermem.BytesIOSequence(buf), offset) return int(n), err diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index a7329f1c9..b17f11a5a 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -34,7 +34,7 @@ func (NoopRelease) Release() {} // SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor // is not nil and the seek was on a directory, the cursor will be updated. // -// Currenly only seeking to 0 on a directory is supported. +// Currently only seeking to 0 on a directory is supported. // // FIXME: Lift directory seeking limitations. func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) { diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index bf4cd8dfd..19d5612ed 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -103,7 +103,7 @@ func (d *Dir) addChildLocked(name string, inode *fs.Inode) { } // Given we're now adding this inode to the directory we must also - // increase its link count. Similiarly we decremented it in removeChildLocked. + // increase its link count. Similarly we decremented it in removeChildLocked. inode.AddLink() } @@ -144,7 +144,7 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er inode.NotifyStatusChange(ctx) // Given we're now removing this inode to the directory we must also - // decrease its link count. Similiarly it is increased in addChildLocked. 
+ // decrease its link count. Similarly it is increased in addChildLocked. inode.DropLink() return inode, nil diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index fde4e7941..a3aa95ece 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -244,8 +244,8 @@ func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, return int64(n), err } -// transformOutput does ouput processing for one end of the pty. See -// drivers/tty/n_tty.c:do_output_char for an analagous kernel function. +// transformOutput does output processing for one end of the pty. See +// drivers/tty/n_tty.c:do_output_char for an analogous kernel function. // // Precondition: l.termiosMu must be held. func (l *lineDiscipline) transformOutput(buf []byte) *bytes.Buffer { diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 19ad5d537..fb8c2f98c 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -298,7 +298,7 @@ func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { } // ExecuteOps attempts to execute a list of operations to the set. It only -// suceeds when all operations can be applied. No changes are made if it fails. +// succeeds when all operations can be applied. No changes are made if it fails. // // On failure, it may return an error (retries are hopeless) or it may return // a channel that can be waited on before attempting again. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 3d49ae350..d6604f37b 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -125,7 +125,7 @@ func (t *Task) killLocked() { Signo: int32(linux.SIGKILL), // Linux just sets SIGKILL in the pending signal bitmask without // enqueueing an actual siginfo, such that - // kernel/signal.c:collect_signal() initalizes si_code to SI_USER. 
+ // kernel/signal.c:collect_signal() initializes si_code to SI_USER. Code: arch.SignalInfoUser, }) t.interrupt() diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index b6af48cb7..61aaa3195 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -243,7 +243,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange } // getVecVMAsLocked ensures that vmas exist for all addresses in ars, and -// support access to type of (at, ignorePermissions). It retuns the subset of +// support access to type of (at, ignorePermissions). It returns the subset of // ars for which vmas exist. If this is not equal to ars, it returns a non-nil // error explaining why. // diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 15047df01..2911d3fd6 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -530,7 +530,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protoc // Only accept TCP and UDP. // // Try to restrict the flags we will accept to minimize backwards - // incompatability with netstack. + // incompatibility with netstack. stype := int(stypeflags) & linux.SOCK_TYPE_MASK switch stype { case syscall.SOCK_STREAM: diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index d0e661706..eccee733e 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -82,7 +82,7 @@ const ( // PipeFDs is an array of two FDs, formatted after syscall execution. PipeFDs - // Uname is a pointer to a struct uname, formatted after syscall exection. + // Uname is a pointer to a struct uname, formatted after syscall execution. Uname // Stat is a pointer to a struct stat, formatted after syscall execution. 
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index d8dc138b3..da0210539 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -60,7 +60,7 @@ const ( // IPv6ProtocolNumber is IPv6's network protocol number. IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd - // IPv6Version is the version of the ipv6 procotol. + // IPv6Version is the version of the ipv6 protocol. IPv6Version = 6 // IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 2460, diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index b480bf812..f0fbd8aad 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -165,7 +165,7 @@ type TCPSenderState struct { // window size from a segment. SndWndScale uint8 - // MaxSentAck is the highest acknowledgemnt number sent till now. + // MaxSentAck is the highest acknowledgement number sent till now. MaxSentAck seqnum.Value // FastRecovery holds the fast recovery state for the endpoint. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index ad94aecd8..6c363a929 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -152,7 +152,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint // updateMaxPayloadSize updates the maximum payload size based on the given // MTU. If this is in response to "packet too big" control packets (indicated -// by the count argument), it also reduces the number of oustanding packets and +// by the count argument), it also reduces the number of outstanding packets and // attempts to retransmit the first packet above the MTU size. 
func (s *sender) updateMaxPayloadSize(mtu, count int) { m := mtu - header.TCPMinimumSize diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 64810b4ea..954824ada 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -58,7 +58,7 @@ func validateID(id string) error { // // Within a root directory, we maintain subdirectories for each sandbox named // with the sandbox id. The sandbox metadata is is stored as json within the -// sandbox directoy in a file named "meta.json". This metadata format is +// sandbox directory in a file named "meta.json". This metadata format is // defined by us, and is not part of the OCI spec. // // Sandboxes must write this metadata file after any change to their internal @@ -199,7 +199,7 @@ func Load(rootDir, id string) (*Sandbox, error) { // If the status is "Running" or "Created", check that the process // still exists, and set it to Stopped if it does not. // - // This is inherintly racey. + // This is inherently racey. if s.Status == Running || s.Status == Created { // Send signal 0 to check if process exists. 
if err := s.Signal(0); err != nil { -- cgit v1.2.3 From 58235b1840db01aa2ede311efa782eac60767722 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 3 May 2018 16:25:39 -0700 Subject: Clean up control message strace logging PiperOrigin-RevId: 195329972 Change-Id: I42f7d8800e6692c45ffa9683741f8de89f9a69bb --- pkg/sentry/strace/socket.go | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 1a2e8573e..26831edd6 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -463,7 +463,6 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) var h linux.ControlMessageHeader binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h) - i += linux.SizeOfControlMessageHeader var skipData bool level := "SOL_SOCKET" @@ -478,7 +477,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) typ = fmt.Sprint(h.Type) } - if h.Length > uint64(len(buf)-i+linux.SizeOfControlMessageHeader) { + if h.Length > uint64(len(buf)-i) { strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, content extends beyond buffer}", level, @@ -488,12 +487,13 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) break } + i += linux.SizeOfControlMessageHeader width := t.Arch().Width() length := int(h.Length) - linux.SizeOfControlMessageHeader if skipData { strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length)) - i += control.AlignUp(i+length, width) + i += control.AlignUp(length, width) continue } @@ -518,8 +518,6 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) strings.Join(rights, ","), )) - i += control.AlignUp(length, width) - case linux.SCM_CREDENTIALS: if length < linux.SizeOfControlMessageCredentials { strs = append(strs, fmt.Sprintf( @@ -528,7 +526,6 @@ func cmsghdr(t 
*kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) typ, h.Length, )) - i += control.AlignUp(length, width) break } @@ -545,8 +542,6 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) creds.GID, )) - i += control.AlignUp(length, width) - case linux.SO_TIMESTAMP: if length < linux.SizeOfTimeval { strs = append(strs, fmt.Sprintf( @@ -555,7 +550,6 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) typ, h.Length, )) - i += control.AlignUp(length, width) break } @@ -571,11 +565,10 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) tv.Usec, )) - i += control.AlignUp(length, width) - default: panic("unreachable") } + i += control.AlignUp(length, width) } return fmt.Sprintf("%#x %s", addr, strings.Join(strs, ", ")) -- cgit v1.2.3 From 0ce9c81b416494e3c3da793c278dfc767341fa6d Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Fri, 4 May 2018 13:55:06 -0700 Subject: sentry: capture CPU usage metadata for save. 
PiperOrigin-RevId: 195466647 Change-Id: Ib5ca815f7b64a4881441e58567adedf344b206f1 --- pkg/sentry/state/BUILD | 2 ++ pkg/sentry/state/state.go | 6 +++++- pkg/sentry/state/state_metadata.go | 16 ++++++++++++++++ pkg/sentry/state/state_unsafe.go | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 pkg/sentry/state/state_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 7148df395..9bd98f445 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -7,10 +7,12 @@ go_library( srcs = [ "state.go", "state_metadata.go", + "state_unsafe.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/state", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/log", "//pkg/sentry/inet", "//pkg/sentry/kernel", diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 5bec4e018..c306091da 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -27,6 +27,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/state/statefile" ) +var previousMetadata map[string]string + // ErrStateFile is returned when the state file cannot be opened. type ErrStateFile struct { err error @@ -103,11 +105,13 @@ type LoadOpts struct { // Load loads the given kernel, setting the provided platform and stack. func (opts LoadOpts) Load(k *kernel.Kernel, p platform.Platform, n inet.Stack) error { // Open the file. - r, _, err := statefile.NewReader(opts.Source, opts.Key) + r, m, err := statefile.NewReader(opts.Source, opts.Key) if err != nil { return ErrStateFile{err} } + previousMetadata = m + // Restore the Kernel object graph. 
return k.LoadFrom(r, p, n) } diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index ac374f428..b6d3dbcb4 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -17,13 +17,29 @@ package state import ( "fmt" "time" + + "gvisor.googlesource.com/gvisor/pkg/log" ) // The save metadata keys for timestamp. const ( + cpuUsage = "cpu_usage" metadataTimestamp = "timestamp" ) func addSaveMetadata(m map[string]string) { + t, err := cpuTime() + if err != nil { + log.Warningf("Error getting cpu time: %v", err) + } + if previousMetadata != nil { + p, err := time.ParseDuration(previousMetadata[cpuUsage]) + if err != nil { + log.Warningf("Error parsing previous runs' cpu time: %v", err) + } + t += p + } + m[cpuUsage] = t.String() + m[metadataTimestamp] = fmt.Sprintf("%v", time.Now()) } diff --git a/pkg/sentry/state/state_unsafe.go b/pkg/sentry/state/state_unsafe.go new file mode 100644 index 000000000..53814ef70 --- /dev/null +++ b/pkg/sentry/state/state_unsafe.go @@ -0,0 +1,33 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package state + +import ( + "fmt" + "syscall" + "time" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +func cpuTime() (time.Duration, error) { + var ts syscall.Timespec + _, _, errno := syscall.RawSyscall(syscall.SYS_CLOCK_GETTIME, uintptr(linux.CLOCK_PROCESS_CPUTIME_ID), uintptr(unsafe.Pointer(&ts)), 0) + if errno != 0 { + return 0, fmt.Errorf("failed calling clock_gettime(CLOCK_PROCESS_CPUTIME_ID): errno=%d", errno) + } + return time.Duration(ts.Nano()), nil +} -- cgit v1.2.3 From f47174f06b9904b830268d46a7e817053b6235c8 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 4 May 2018 14:15:24 -0700 Subject: Run gofmt -s on everything PiperOrigin-RevId: 195469901 Change-Id: I66d5c7a334bbb8b47e40d266a2661291c2d91c7f --- pkg/sentry/control/proc_test.go | 8 ++++---- pkg/sentry/kernel/semaphore/semaphore_test.go | 8 ++++---- runsc/fsgofer/fsgofer.go | 4 ++-- runsc/fsgofer/fsgofer_test.go | 8 ++++---- runsc/sandbox/sandbox.go | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index 18286496f..22c826236 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -38,7 +38,7 @@ func TestProcessListTable(t *testing.T) { }, { pl: []*Process{ - &Process{ + { UID: 0, PID: 0, PPID: 0, @@ -47,7 +47,7 @@ func TestProcessListTable(t *testing.T) { Time: "0", Cmd: "zero", }, - &Process{ + { UID: 1, PID: 1, PPID: 1, @@ -83,7 +83,7 @@ func TestProcessListJSON(t *testing.T) { }, { pl: []*Process{ - &Process{ + { UID: 0, PID: 0, PPID: 0, @@ -92,7 +92,7 @@ func TestProcessListJSON(t *testing.T) { Time: "0", Cmd: "zero", }, - &Process{ + { UID: 1, PID: 1, PPID: 1, diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index 0386586ab..1c6a2e1e9 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -57,7 +57,7 @@ func 
TestBasic(t *testing.T) { ctx := contexttest.Context(t) set := &Set{ID: 123, sems: make([]sem, 1)} ops := []linux.Sembuf{ - linux.Sembuf{SemOp: 1}, + {SemOp: 1}, } executeOps(ctx, t, set, ops, false) @@ -78,7 +78,7 @@ func TestWaitForZero(t *testing.T) { ctx := contexttest.Context(t) set := &Set{ID: 123, sems: make([]sem, 1)} ops := []linux.Sembuf{ - linux.Sembuf{SemOp: 0}, + {SemOp: 0}, } executeOps(ctx, t, set, ops, false) @@ -117,7 +117,7 @@ func TestNoWait(t *testing.T) { ctx := contexttest.Context(t) set := &Set{ID: 123, sems: make([]sem, 1)} ops := []linux.Sembuf{ - linux.Sembuf{SemOp: 1}, + {SemOp: 1}, } executeOps(ctx, t, set, ops, false) @@ -146,7 +146,7 @@ func TestUnregister(t *testing.T) { } ops := []linux.Sembuf{ - linux.Sembuf{SemOp: -1}, + {SemOp: -1}, } chs := make([]chan struct{}, 0, 5) for i := 0; i < 5; i++ { diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 5ddc75a9d..be2ac5f3c 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -614,8 +614,8 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { if valid.ATime || valid.MTime { utimes := [2]syscall.Timespec{ - syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}, - syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}, + {Sec: 0, Nsec: linux.UTIME_OMIT}, + {Sec: 0, Nsec: linux.UTIME_OMIT}, } if valid.ATime { if valid.ATimeNotSystemTime { diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 7d834d596..58d04aefa 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -39,12 +39,12 @@ var ( allConfs []Config rwConfs = []Config{ - Config{ROMount: false, LazyOpenForWrite: false}, - Config{ROMount: false, LazyOpenForWrite: true}, + {ROMount: false, LazyOpenForWrite: false}, + {ROMount: false, LazyOpenForWrite: true}, } roConfs = []Config{ - Config{ROMount: true, LazyOpenForWrite: false}, - Config{ROMount: true, LazyOpenForWrite: true}, + {ROMount: true, LazyOpenForWrite: false}, + {ROMount: 
true, LazyOpenForWrite: true}, } ) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 13bf5d800..0354a64b9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -493,8 +493,8 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, common // namespace for these. log.Infof("Sandbox will be started in empty IPC and UTS namespaces") nss := []specs.LinuxNamespace{ - specs.LinuxNamespace{Type: specs.IPCNamespace}, - specs.LinuxNamespace{Type: specs.UTSNamespace}, + {Type: specs.IPCNamespace}, + {Type: specs.UTSNamespace}, } if conf.Platform == boot.PlatformPtrace { -- cgit v1.2.3 From d70787d340b3967fd691fbbd079dece329f7a65c Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 4 May 2018 16:21:38 -0700 Subject: sentry: Adds the SIOCGIFNETMASK ioctl to epsocket. PiperOrigin-RevId: 195489319 Change-Id: I0841d41d042c6f91aa8d7f62c127213aa7953eac --- pkg/sentry/socket/epsocket/epsocket.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 5701ecfac..a45dcd551 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1109,7 +1109,20 @@ func (s *SocketOperations) interfaceIoctl(ctx context.Context, io usermem.IO, ar case syscall.SIOCGIFNETMASK: // Gets the network mask of a device. - // TODO: Implement. + for _, addr := range s.stack.InterfaceAddrs()[index] { + // This ioctl is only compatible with AF_INET addresses. + if addr.Family != linux.AF_INET { + continue + } + // Populate ifr.ifr_netmask (type sockaddr). + usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(linux.AF_INET)) + usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0) + var mask uint32 = 0xffffffff << (32 - addr.PrefixLen) + // Netmask is expected to be returned as a big endian + // value. 
+ binary.BigEndian.PutUint32(ifr.Data[4:8], mask) + break + } default: // Not a valid call. -- cgit v1.2.3 From 268edf0e6230c9455972c3343125d416a71a8759 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 4 May 2018 23:19:13 -0700 Subject: Remove ineffectual code in sentry ELF loader PiperOrigin-RevId: 195517702 Change-Id: Id90309a6365cac06e68e8774aa79dc76ce1b11c7 --- pkg/sentry/loader/elf.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index d23dc1096..8579eeee4 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -408,11 +408,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el path := make([]byte, phdr.Filesz) _, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off)) if err != nil { - ctx.Infof("Error reading PT_INTERP path: %v", err) // If an interpreter was specified, it should exist. - if err == io.EOF || err == io.ErrUnexpectedEOF { - err = syserror.ENOEXEC - } + ctx.Infof("Error reading PT_INTERP path: %v", err) return loadedELF{}, syserror.ENOEXEC } -- cgit v1.2.3 From 7c8c3705ea5d891a3d6126090b1f49d8bae44177 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 7 May 2018 16:37:08 -0700 Subject: Fix misspellings PiperOrigin-RevId: 195742598 Change-Id: Ibd4a8e4394e268c87700b6d1e50b4b37dfce5182 --- pkg/cpuid/cpuid.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener.go | 2 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inotify.go | 2 +- pkg/sentry/kernel/auth/credentials.go | 2 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 20 ++++++++++---------- .../platform/ring0/pagetables/pagetables_amd64.go | 2 +- pkg/sentry/usage/memory.go | 2 +- pkg/tcpip/header/ipv4.go | 2 +- runsc/sandbox/sandbox_test.go | 2 +- 14 files changed, 23 
insertions(+), 23 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index aa248dd98..b486ab037 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -720,7 +720,7 @@ func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32) // HostFeatureSet uses cpuid to get host values and construct a feature set // that matches that of the host machine. Note that there are several places -// where there appear to be some unecessary assignments between register names +// where there appear to be some unnecessary assignments between register names // (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show // where the different feature blocks come from, to make the code easier to // inspect and read. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index a0d59575f..945cfaf08 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -94,7 +94,7 @@ func unwrapError(err error) error { // TryOpen uses a NonBlockingOpener to try to open a host pipe, respecting the fs.FileFlags. func (p *pipeOpenState) TryOpen(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (*pipeOperations, error) { switch { - // Reject invalid configurations so they don't accidently succeed below. + // Reject invalid configurations so they don't accidentally succeed below. case !flags.Read && !flags.Write: return nil, syscall.EINVAL diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 0c6e622b9..c27c5946e 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -151,7 +151,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See // If this was a seek on a directory, we must update the cursor. if seekDir && whence == SeekSet && offset == 0 { - // Currenly only seeking to 0 on a directory is supported. + // Currently only seeking to 0 on a directory is supported. 
// FIXME: Lift directory seeking limitations. f.dirCursor = "" } diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index b624f4182..6c8e6f188 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -26,7 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) -// Inode is a file system object that can be simulatenously referenced by different +// Inode is a file system object that can be simultaneously referenced by different // components of the VFS (Dirent, fs.File, etc). type Inode struct { // AtomicRefCount is our reference count. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 9f50cb800..a87be8590 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -316,7 +316,7 @@ func (i *Inotify) RmWatch(wd int32) error { // The watch is now isolated and we can safely drop the instance lock. We // need to do so because watch.destroy() acquires Watch.mu, which cannot be - // aquired with Inotify.mu held. + // acquired with Inotify.mu held. i.mu.Unlock() // Generate the event for the removal. diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index b832b28fe..f6fb05285 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -126,7 +126,7 @@ func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *T creds.InheritableCaps = capabilities.InheritableCaps // // TODO: Support ambient capabilities. } else { - // If no capabilities are specified, grant the same capabilites + // If no capabilities are specified, grant the same capabilities // that NewRootCredentials does. 
creds.PermittedCaps = AllCapabilities creds.EffectiveCaps = AllCapabilities diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 1656c6ff3..9a21df5b4 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -284,7 +284,7 @@ func (p *Pipe) rReadinessLocked() waiter.EventMask { ready |= waiter.EventIn } if !p.HasWriters() && p.hadWriter { - // POLLHUP must be supressed until the pipe has had at least one writer + // POLLHUP must be suppressed until the pipe has had at least one writer // at some point. Otherwise a reader thread may poll and immediately get // a POLLHUP before the writer ever opens the pipe, which the reader may // interpret as the writer opening then closing the pipe. diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 94c281b72..4ed796493 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -118,7 +118,7 @@ func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch const ( // maxLoaderAttempts is the maximum number of attempts to try to load - // an interpreter scripts, to prevent loops. 6 (inital + 5 changes) is + // an interpreter scripts, to prevent loops. 6 (initial + 5 changes) is // what the Linux kernel allows (fs/exec.c:search_binary_handler). maxLoaderAttempts = 6 ) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 791f038b0..a4b9198cc 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -178,7 +178,7 @@ func (as *addressSpace) MapFile(addr usermem.Addr, fd int, fr platform.FileRange // we create distinct mappings for each address space. Unfortunately, // there's not a better way to manage this here. The file underlying // this fd can change at any time, so we can't actually index the file - // and share between address space. Oh well. It's all refering to the + // and share between address space. Oh well. 
It's all referring to the // same physical pages, hopefully we don't run out of address space. if fd != int(as.filemem.File().Fd()) { // N.B. precommit is ignored for host files. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 3cbf0bfa5..ee7f27601 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -44,18 +44,18 @@ type PageTables struct { // root is the pagetable root. root *Node - // translater is the translater passed at creation. - translater Translater + // translator is the translator passed at creation. + translator Translator // archPageTables includes architecture-specific features. archPageTables - // allNodes is a set of nodes indexed by translater address. + // allNodes is a set of nodes indexed by translator address. allNodes map[uintptr]*Node } -// Translater translates to guest physical addresses. -type Translater interface { +// Translator translates to guest physical addresses. +type Translator interface { // TranslateToPhysical translates the given pointer object into a // "physical" address. We do not require that it translates back, the // reverse mapping is maintained internally. @@ -63,9 +63,9 @@ type Translater interface { } // New returns new PageTables. -func New(t Translater, opts Opts) *PageTables { +func New(t Translator, opts Opts) *PageTables { p := &PageTables{ - translater: t, + translator: t, allNodes: make(map[uintptr]*Node), } p.root = p.allocNode() @@ -80,7 +80,7 @@ func New(t Translater, opts Opts) *PageTables { // managing multiple sets of pagetables. func (p *PageTables) New() *PageTables { np := &PageTables{ - translater: p.translater, + translator: p.translator, allNodes: make(map[uintptr]*Node), } np.root = np.allocNode() @@ -90,7 +90,7 @@ func (p *PageTables) New() *PageTables { // setPageTable sets the given index as a page table. 
func (p *PageTables) setPageTable(n *Node, index int, child *Node) { - phys := p.translater.TranslateToPhysical(child.PTEs()) + phys := p.translator.TranslateToPhysical(child.PTEs()) p.allNodes[phys] = child pte := &n.PTEs()[index] pte.setPageTable(phys) @@ -188,6 +188,6 @@ func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType use // allocNode allocates a new page. func (p *PageTables) allocNode() *Node { n := new(Node) - n.physical = p.translater.TranslateToPhysical(n.PTEs()) + n.physical = p.translator.TranslateToPhysical(n.PTEs()) return n } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index b89665c96..a2050b99c 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -301,7 +301,7 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun } // This level has 2-MB huge pages. If this - // region is contined in a single PMD entry? + // region is contained in a single PMD entry? // As above, we can skip allocating a new page. if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSuper() diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 5d1b3a595..4a1527b5f 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -95,7 +95,7 @@ type MemoryStats struct { // categories not backed by platform memory. For details about how this works, // see the memory accounting docs. // -// N.B. Please keep the struct in sync with the API. Noteably, changes to this +// N.B. Please keep the struct in sync with the API. Notably, changes to this // struct requires a version bump and addition of compatibility logic in the // control server. 
As a special-case, adding fields without re-ordering existing // ones do not require a version bump because the mapped page we use is diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index cb0d42093..6e2a3d6f4 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -81,7 +81,7 @@ const ( // IPv4ProtocolNumber is IPv4's network protocol number. IPv4ProtocolNumber tcpip.NetworkProtocolNumber = 0x0800 - // IPv4Version is the version of the ipv4 procotol. + // IPv4Version is the version of the ipv4 protocol. IPv4Version = 4 ) diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index 6e3125b7b..a46212173 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -131,7 +131,7 @@ func waitForProcessList(s *sandbox.Sandbox, expected []*control.Process) error { return fmt.Errorf("sandbox got process list: %s, want: %s", procListToString(got), procListToString(expected)) } -// TestLifecycle tests the basic Create/Start/Signal/Destory sandbox lifecycle. +// TestLifecycle tests the basic Create/Start/Signal/Destroy sandbox lifecycle. // It verifies after each step that the sandbox can be loaded from disk, and // has the correct status. 
func TestLifecycle(t *testing.T) { -- cgit v1.2.3 From d0d01a18963ed7cfc29e5b8334e30b1234b6048b Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 8 May 2018 09:38:55 -0700 Subject: Fix format string type in test PiperOrigin-RevId: 195831778 Change-Id: I413dc909cedc18fbf5320a4f75d876f1be133c6c --- pkg/sentry/kernel/fd_map_test.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go index e1ac900e8..95123aef3 100644 --- a/pkg/sentry/kernel/fd_map_test.go +++ b/pkg/sentry/kernel/fd_map_test.go @@ -119,16 +119,18 @@ func TestDescriptorFlags(t *testing.T) { limitSet := limits.NewLimitSet() limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) - if err := f.NewFDAt(2, file, FDFlags{CloseOnExec: true}, limitSet); err != nil { + origFlags := FDFlags{CloseOnExec: true} + + if err := f.NewFDAt(2, file, origFlags, limitSet); err != nil { t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) } - newFile, flags := f.GetDescriptor(2) + newFile, newFlags := f.GetDescriptor(2) if newFile == nil { t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile) } - if !flags.CloseOnExec { - t.Fatalf("new File flags %d don't match original %d\n", flags, 0) + if newFlags != origFlags { + t.Fatalf("new File flags %+v don't match original %+v", newFlags, origFlags) } } -- cgit v1.2.3 From b4765f782d91443ab0415dc00e727d783632e2ad Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 8 May 2018 09:51:07 -0700 Subject: Fix warning: redundant if ...; err != nil check, just return error instead. This warning is produced by golint. 
PiperOrigin-RevId: 195833381 Change-Id: Idd6a7e57e3cfdf00819f2374b19fc113585dc1e1 --- pkg/bpf/decoder.go | 5 +---- pkg/compressio/compressio.go | 5 +---- pkg/p9/local_server/local_server.go | 6 +----- pkg/sentry/fs/fdpipe/pipe.go | 5 +---- pkg/sentry/fs/host/fs_test.go | 6 +----- pkg/sentry/fs/host/socket.go | 5 +---- pkg/sentry/kernel/task_signals.go | 10 ++-------- pkg/sentry/syscalls/linux/sys_file.go | 7 ++----- pkg/tcpip/stack/transport_demuxer.go | 6 +----- 9 files changed, 11 insertions(+), 44 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/bpf/decoder.go b/pkg/bpf/decoder.go index 6873ffa5c..ef41e9edc 100644 --- a/pkg/bpf/decoder.go +++ b/pkg/bpf/decoder.go @@ -161,10 +161,7 @@ func decodeAlu(inst linux.BPFInstruction, w *bytes.Buffer) error { default: return fmt.Errorf("invalid BPF ALU instruction: %v", inst) } - if err := decodeSource(inst, w); err != nil { - return err - } - return nil + return decodeSource(inst, w) } func decodeSource(inst linux.BPFInstruction, w *bytes.Buffer) error { diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go index e0d36aee9..ef8cbd2a5 100644 --- a/pkg/compressio/compressio.go +++ b/pkg/compressio/compressio.go @@ -184,10 +184,7 @@ func handleResult(r result, callback func(*chunk) error) error { if r.err != nil { return r.err } - if err := callback(r.chunk); err != nil { - return err - } - return nil + return callback(r.chunk) } // schedule schedules the given buffers. diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index 5b1e97711..7a3e4cffe 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -246,11 +246,7 @@ func (l *local) Symlink(oldname string, newname string, _ p9.UID, _ p9.GID) (p9. // // Not properly implemented. 
func (l *local) Link(target p9.File, newname string) error { - if err := os.Link(target.(*local).path, path.Join(l.path, newname)); err != nil { - return err - } - - return nil + return os.Link(target.(*local).path, path.Join(l.path, newname)) } // Mknod implements p9.File.Mknod. diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index f7bbd4aff..7b318e35f 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -90,10 +90,7 @@ func (p *pipeOperations) init() error { if err := syscall.SetNonblock(p.file.FD(), true); err != nil { return err } - if err := fdnotifier.AddFD(int32(p.file.FD()), &p.Queue); err != nil { - return err - } - return nil + return fdnotifier.AddFD(int32(p.file.FD()), &p.Queue) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index c000afc49..b08125ca8 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -141,11 +141,7 @@ func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) err return err } - if err := symlinks.CreateLink(ctx, r, "/symlinks", "recursive"); err != nil { - return err - } - - return nil + return symlinks.CreateLink(ctx, r, "/symlinks", "recursive") } // allPaths returns a slice of all paths of entries visible in the rootfs. diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 8e36ed7ee..467633052 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -69,10 +69,7 @@ func (e *endpoint) init() error { } e.stype = unix.SockType(stype) - if err := fdnotifier.AddFD(int32(e.fd), &e.queue); err != nil { - return err - } - return nil + return fdnotifier.AddFD(int32(e.fd), &e.queue) } // newEndpoint creates a new host endpoint. 
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 2340256b0..e4ef7fd67 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -127,10 +127,7 @@ func (t *Task) dequeueSignalLocked() *arch.SignalInfo { if info := t.pendingSignals.dequeue(t.tr.SignalMask); info != nil { return info } - if info := t.tg.pendingSignals.dequeue(t.tr.SignalMask); info != nil { - return info - } - return nil + return t.tg.pendingSignals.dequeue(t.tr.SignalMask) } // TakeSignal returns a pending signal not blocked by mask. Signal handlers are @@ -144,10 +141,7 @@ func (t *Task) TakeSignal(mask linux.SignalSet) *arch.SignalInfo { if info := t.pendingSignals.dequeue(mask); info != nil { return info } - if info := t.tg.pendingSignals.dequeue(mask); info != nil { - return info - } - return nil + return t.tg.pendingSignals.dequeue(mask) } // discardSpecificLocked removes all instances of the given signal from all diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index a2dbba7e0..5fbacc15e 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -460,14 +460,11 @@ func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, m creds: creds, } - if err := d.Inode.CheckPermission(ctx, fs.PermMask{ + return d.Inode.CheckPermission(ctx, fs.PermMask{ Read: mode&rOK != 0, Write: mode&wOK != 0, Execute: mode&xOK != 0, - }); err != nil { - return err - } - return nil + }) }) } diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index 3c0d7aa31..7bb853622 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -158,9 +158,5 @@ func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv *buffe // Try to find a match with only the local port. 
nid.LocalAddress = "" - if ep := eps.endpoints[nid]; ep != nil { - return ep - } - - return nil + return eps.endpoints[nid] } -- cgit v1.2.3 From 09c323910d7f28fec9e4c92e5faaa92bb63bd431 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 8 May 2018 09:58:09 -0700 Subject: Reword misleading log line PiperOrigin-RevId: 195834310 Change-Id: I8af748f75ab87ad1cd29c4c8904d07fd729ba6c9 --- pkg/sentry/loader/loader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 4ed796493..a68ab33e7 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -85,7 +85,7 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m // No exec-ing directories, pipes, etc! if !fs.IsRegular(d.Inode.StableAttr) { - ctx.Infof("Error regularing %s: %v", name, d.Inode.StableAttr) + ctx.Infof("%s is not regular: %v", name, d.Inode.StableAttr) return nil, nil, syserror.EACCES } -- cgit v1.2.3 From fea624b37a90c0e1efc0c1e7ae7dda7b2d1a0050 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Tue, 8 May 2018 10:06:14 -0700 Subject: Sentry: always use "best speed" compression for save and remove the option. PiperOrigin-RevId: 195835861 Change-Id: Ib696b1b571a6b061725a33c535cd7215fe518b97 --- pkg/sentry/state/state.go | 7 +------ pkg/state/statefile/statefile.go | 10 +++++++--- pkg/state/statefile/statefile_test.go | 5 ++--- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index c306091da..393289926 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -50,11 +50,6 @@ type SaveOpts struct { // Metadata is save metadata. Metadata map[string]string - // CompressionLevel is the compression level to use. - // - // See statefile.NewWriter for details. - CompressionLevel int - // Callback is called prior to unpause, with any save error. 
Callback func(err error) } @@ -76,7 +71,7 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { addSaveMetadata(opts.Metadata) // Open the statefile. - wc, err := statefile.NewWriter(opts.Destination, opts.Key, opts.Metadata, opts.CompressionLevel) + wc, err := statefile.NewWriter(opts.Destination, opts.Key, opts.Metadata) if err != nil { err = ErrStateFile{err} } else { diff --git a/pkg/state/statefile/statefile.go b/pkg/state/statefile/statefile.go index b25b743b7..64b0a6312 100644 --- a/pkg/state/statefile/statefile.go +++ b/pkg/state/statefile/statefile.go @@ -45,6 +45,7 @@ package statefile import ( "bytes" + "compress/flate" "crypto/hmac" "crypto/sha256" "encoding/json" @@ -86,7 +87,7 @@ var ErrMetadataInvalid = fmt.Errorf("metadata invalid, can't start with _") // NewWriter returns a state data writer for a statefile. // // Note that the returned WriteCloser must be closed. -func NewWriter(w io.Writer, key []byte, metadata map[string]string, compressionLevel int) (io.WriteCloser, error) { +func NewWriter(w io.Writer, key []byte, metadata map[string]string) (io.WriteCloser, error) { if metadata == nil { metadata = make(map[string]string) } @@ -140,8 +141,11 @@ func NewWriter(w io.Writer, key []byte, metadata map[string]string, compressionL w = hashio.NewWriter(w, h) - // Wrap in compression. - return compressio.NewWriter(w, compressionChunkSize, compressionLevel) + // Wrap in compression. We always use "best speed" mode here. When using + // "best compression" mode, there is usually only a little gain in file + // size reduction, which translate to even smaller gain in restore + // latency reduction, while inccuring much more CPU usage at save time. 
+ return compressio.NewWriter(w, compressionChunkSize, flate.BestSpeed) } // MetadataUnsafe reads out the metadata from a state file without verifying any diff --git a/pkg/state/statefile/statefile_test.go b/pkg/state/statefile/statefile_test.go index 6e67b51de..66d9581ed 100644 --- a/pkg/state/statefile/statefile_test.go +++ b/pkg/state/statefile/statefile_test.go @@ -16,7 +16,6 @@ package statefile import ( "bytes" - "compress/flate" crand "crypto/rand" "encoding/base64" "io" @@ -89,7 +88,7 @@ func TestStatefile(t *testing.T) { var bufDecoded bytes.Buffer // Do all the writing. - w, err := NewWriter(&bufEncoded, key, c.metadata, flate.BestSpeed) + w, err := NewWriter(&bufEncoded, key, c.metadata) if err != nil { t.Fatalf("error creating writer: got %v, expected nil", err) } @@ -195,7 +194,7 @@ func benchmark(b *testing.B, size int, write bool, compressible bool) { var stateBuf bytes.Buffer writeState := func() { stateBuf.Reset() - w, err := NewWriter(&stateBuf, key, nil, flate.BestSpeed) + w, err := NewWriter(&stateBuf, key, nil) if err != nil { b.Fatalf("error creating writer: %v", err) } -- cgit v1.2.3 From 174161013de22be6a42b02ee06611a9de9e20b18 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Tue, 8 May 2018 11:36:11 -0700 Subject: Capture restore file system corruption errors in exit error. 
PiperOrigin-RevId: 195850822 Change-Id: I4d7bdd8fe129c5ed461b73e1d7458be2cf5680c2 --- pkg/sentry/fs/fdpipe/pipe_state.go | 9 +++--- pkg/sentry/fs/fs.go | 63 ++++++++++++++++++++++++++++++++++++-- pkg/sentry/fs/gofer/file_state.go | 9 ++++-- pkg/sentry/fs/gofer/inode_state.go | 23 ++++++++------ pkg/sentry/fs/host/inode_state.go | 6 ++-- pkg/sentry/kernel/kernel.go | 4 ++- 6 files changed, 90 insertions(+), 24 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index 8996a2178..99c40d8ed 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -63,7 +63,7 @@ func (p *pipeOperations) loadFlags(flags fs.FileFlags) { // afterLoad is invoked by stateify. func (p *pipeOperations) afterLoad() { - load := func() { + load := func() error { if !p.flags.Read { readPipeOperationsLoading.Wait() } else { @@ -75,14 +75,15 @@ func (p *pipeOperations) afterLoad() { Write: p.flags.Write, }) if err != nil { - panic(fmt.Sprintf("unable to open pipe %v: %v", p, err)) + return fmt.Errorf("unable to open pipe %v: %v", p, err) } if err := p.init(); err != nil { - panic(fmt.Sprintf("unable to initialize pipe %v: %v", p, err)) + return fmt.Errorf("unable to initialize pipe %v: %v", p, err) } + return nil } // Do background opening of pipe ends. Note for write-only pipe ends we // have to do it asynchronously to avoid blocking the restore. - fs.Async(load) + fs.Async(fs.CatchError(load)) } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index f54f767d3..6ec9ff446 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -55,11 +55,19 @@ package fs import ( "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" ) -// work is a sync.WaitGroup that can be used to queue asynchronous operations -// via Do. Callers can use Barrier to ensure no operations are outstanding. 
-var work sync.WaitGroup +var ( + // work is a sync.WaitGroup that can be used to queue asynchronous + // operations via Do. Callers can use Barrier to ensure no operations + // are outstanding. + work sync.WaitGroup + + // asyncError is used to store up to one asynchronous execution error. + asyncError = make(chan error, 1) +) // AsyncBarrier waits for all outstanding asynchronous work to complete. func AsyncBarrier() { @@ -75,6 +83,43 @@ func Async(f func()) { }() } +// AsyncErrorBarrier waits for all outstanding asynchronous work to complete, or +// the first async error to arrive. Other unfinished async executions will +// continue in the background. Other past and future async errors are ignored. +func AsyncErrorBarrier() error { + wait := make(chan struct{}, 1) + go func() { // S/R-SAFE: Does not touch persistent state. + work.Wait() + wait <- struct{}{} + }() + select { + case <-wait: + select { + case err := <-asyncError: + return err + default: + return nil + } + case err := <-asyncError: + return err + } +} + +// CatchError tries to capture the potential async error returned by the +// function. At most one async error will be captured globally so excessive +// errors will be dropped. +func CatchError(f func() error) func() { + return func() { + if err := f(); err != nil { + select { + case asyncError <- err: + default: + log.Warningf("excessive async error dropped: %v", err) + } + } + } +} + // ErrSaveRejection indicates a failed save due to unsupported file system state // such as dangling open fd, etc. type ErrSaveRejection struct { @@ -86,3 +131,15 @@ type ErrSaveRejection struct { func (e ErrSaveRejection) Error() string { return "save rejected due to unsupported file system state: " + e.Err.Error() } + +// ErrCorruption indicates a failed restore due to external file system state in +// corruption. +type ErrCorruption struct { + // Err is the wrapped error. + Err error +} + +// Error returns a sensible description of the save rejection error. 
+func (e ErrCorruption) Error() string { + return "restore failed due to external file system state in corruption: " + e.Err.Error() +} diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index 1d63e33ec..715af8f16 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -15,13 +15,15 @@ package gofer import ( + "fmt" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ) // afterLoad is invoked by stateify. func (f *fileOperations) afterLoad() { - load := func() { + load := func() error { f.inodeOperations.fileState.waitForLoad() // Manually load the open handles. @@ -29,9 +31,10 @@ func (f *fileOperations) afterLoad() { // TODO: Context is not plumbed to save/restore. f.handles, err = newHandles(context.Background(), f.inodeOperations.fileState.file, f.flags) if err != nil { - panic("failed to re-open handle: " + err.Error()) + return fmt.Errorf("failed to re-open handle: %v", err) } f.inodeOperations.fileState.setHandlesForCachedIO(f.flags, f.handles) + return nil } - fs.Async(load) + fs.Async(fs.CatchError(load)) } diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 997a7d1c1..82d1dd4da 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -15,6 +15,7 @@ package gofer import ( + "errors" "fmt" "strings" @@ -83,7 +84,7 @@ func (i *inodeFileState) loadLoading(_ struct{}) { // afterLoad is invoked by stateify. func (i *inodeFileState) afterLoad() { - load := func() { + load := func() error { // See comment on i.loading(). defer i.loading.Unlock() @@ -92,14 +93,14 @@ func (i *inodeFileState) afterLoad() { if !ok { // This should be impossible, see assertion in // beforeSave. - panic(fmt.Sprintf("failed to find path for inode number %d. 
Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings))) + return fmt.Errorf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings)) } // TODO: Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} var err error _, i.file, err = i.s.attach.walk(ctx, strings.Split(name, "/")) if err != nil { - panic(fmt.Sprintf("failed to walk to %q: %v", name, err)) + return fmt.Errorf("failed to walk to %q: %v", name, err) } // Remap the saved inode number into the gofer device using the @@ -107,10 +108,10 @@ func (i *inodeFileState) afterLoad() { // environment. qid, mask, attrs, err := i.file.getAttr(ctx, p9.AttrMaskAll()) if err != nil { - panic(fmt.Sprintf("failed to get file attributes of %s: %v", name, err)) + return fmt.Errorf("failed to get file attributes of %s: %v", name, err) } if !mask.RDev { - panic(fmt.Sprintf("file %s lacks device", name)) + return fs.ErrCorruption{fmt.Errorf("file %s lacks device", name)} } i.key = device.MultiDeviceKey{ Device: attrs.RDev, @@ -118,24 +119,26 @@ func (i *inodeFileState) afterLoad() { Inode: qid.Path, } if !goferDevice.Load(i.key, i.sattr.InodeID) { - panic(fmt.Sprintf("gofer device %s -> %d conflict in gofer device mappings: %s", i.key, i.sattr.InodeID, goferDevice)) + return fs.ErrCorruption{fmt.Errorf("gofer device %s -> %d conflict in gofer device mappings: %s", i.key, i.sattr.InodeID, goferDevice)} } if i.sattr.Type == fs.RegularFile { env, ok := fs.CurrentRestoreEnvironment() if !ok { - panic("missing restore environment") + return errors.New("missing restore environment") } uattr := unstable(ctx, mask, attrs, i.s.mounter, i.s.client) if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { - panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.s.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)) + return fs.ErrCorruption{fmt.Errorf("file size has 
changed for %s: previously %d, now %d", i.s.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)} } if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { - panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.s.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)) + return fs.ErrCorruption{fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.s.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)} } i.savedUAttr = nil } + + return nil } - fs.Async(load) + fs.Async(fs.CatchError(load)) } diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index 80066512a..135c75fd5 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -59,7 +59,7 @@ func (i *inodeFileState) afterLoad() { // saved filesystem are no longer unique on this filesystem. // Since this violates the contract that filesystems cannot // change across save and restore, error out. 
- panic(fmt.Sprintf("host %s conflict in host device mappings: %s", key, hostFileDevice)) + panic(fs.ErrCorruption{fmt.Errorf("host %s conflict in host device mappings: %s", key, hostFileDevice)}) } if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { @@ -69,10 +69,10 @@ func (i *inodeFileState) afterLoad() { } uattr := unstableAttr(i.mops, &s) if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { - panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)) + panic(fs.ErrCorruption{fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)}) } if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { - panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)) + panic(fs.ErrCorruption{fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)}) } i.savedUAttr = nil } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 25c8dd885..536461bbd 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -418,7 +418,9 @@ func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) erro // Ensure that all pending asynchronous work is complete: // - namedpipe opening // - inode file opening - fs.AsyncBarrier() + if err := fs.AsyncErrorBarrier(); err != nil { + return err + } log.Infof("Overall load took [%s]", time.Since(loadStart)) -- cgit v1.2.3 From 10a2cfc6a9216cb32e3a930016178d3c15ccc383 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 8 May 2018 16:14:00 -0700 Subject: Implement /proc/[pid]/statm. 
PiperOrigin-RevId: 195893391 Change-Id: I645b7042d7f4f9dd54723afde3e5df0986e43160 --- pkg/sentry/fs/proc/task.go | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 3e9a1e50e..147d57a8f 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -81,6 +81,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), "ns": newNamespaceDir(t, msrc), "stat": newTaskStat(t, msrc, showSubtasks, pidns), + "statm": newStatm(t, msrc), "status": newStatus(t, msrc, pidns), "uid_map": newUIDMap(t, msrc), }, fs.RootOwner, fs.FilePermsFromMode(0555)) @@ -389,6 +390,40 @@ func (s *taskStatData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*taskStatData)(nil)}}, 0 } +// statmData implements seqfile.SeqSource for /proc/[pid]/statm. +type statmData struct { + t *kernel.Task +} + +func newStatm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &statmData{t}), msrc, fs.SpecialFile, t) +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (s *statmData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (s *statmData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + + var buf bytes.Buffer + fmt.Fprintf(&buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) + + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statmData)(nil)}}, 0 +} + // statusData implements seqfile.SeqSource for /proc/[pid]/status. 
type statusData struct { t *kernel.Task -- cgit v1.2.3 From 4453b56bd92c4ab9c0480dd99e80c65994711d33 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 9 May 2018 15:43:47 -0700 Subject: Increment link count in CreateHardlink Closes #28 PiperOrigin-RevId: 196041391 Change-Id: I5d79f1735b9d72744e8bebc6897002b27df9aa7a --- pkg/sentry/fs/gofer/path.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index d696f1561..6c4c2eed9 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -150,8 +150,10 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, _ *fs.Inode, targe if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { return err } - // TODO: Don't increase link count because we can't properly accounts for links - // with gofers. + if i.session().cachePolicy == cacheAll { + // Increase link count. + targetOpts.cachingInodeOps.IncLinks(ctx) + } i.touchModificationTime(ctx) return nil } -- cgit v1.2.3 From c97f0978b7ced0a31891fab639cc6c9a80e7fb37 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 9 May 2018 16:57:31 -0700 Subject: Cache symlinks in addition to files and directories. PiperOrigin-RevId: 196051326 Change-Id: I4195b110e9a7d38d1ce1ed9c613971dea1be3bf0 --- pkg/sentry/fs/gofer/session.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index ab3b964e0..1076e3e55 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -126,9 +126,12 @@ func (s *session) Revalidate(*fs.Dirent) bool { // TakeRefs takes an extra reference on dirent if possible. func (s *session) Keep(dirent *fs.Dirent) bool { - // NOTE: Only cache files and directories. 
sattr := dirent.Inode.StableAttr - return s.cachePolicy != cacheNone && (fs.IsFile(sattr) || fs.IsDir(sattr)) + if s.cachePolicy == cacheNone { + return false + } + // NOTE: Only cache files, directories, and symlinks. + return fs.IsFile(sattr) || fs.IsDir(sattr) || fs.IsSymlink(sattr) } // ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. -- cgit v1.2.3 From 31a4fefbe0a44377f75888284c9be0a3bec2a017 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 10 May 2018 12:46:27 -0700 Subject: Make cachePolicy int to avoid string comparison PiperOrigin-RevId: 196157086 Change-Id: Ia7f7ffe1bf486b21ef8091e2e8ef9a9faf733dfc --- pkg/sentry/fs/gofer/fs.go | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 0a1a49bbd..a8a3ec19d 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -57,20 +57,26 @@ const ( ) // cachePolicy is a 9p cache policy. -type cachePolicy string +type cachePolicy int const ( - // Use virtual file system cache. - cacheAll cachePolicy = "fscache" - // TODO: fully support cache=none. - cacheNone cachePolicy = "none" + cacheNone cachePolicy = iota - // defaultCache is cacheAll. Note this diverges from the 9p Linux - // client whose default is "none". See TODO above. - defaultCache = cacheAll + // Use virtual file system cache. + cacheAll ) +func parseCachePolicy(policy string) (cachePolicy, error) { + switch policy { + case "fscache": + return cacheAll, nil + case "none": + return cacheNone, nil + } + return cacheNone, fmt.Errorf("unsupported cache mode: %s", policy) +} + // defaultAname is the default attach name. const defaultAname = "/" @@ -206,11 +212,12 @@ func options(data string) (opts, error) { // Parse the cache policy. Reject unsupported policies. 
o.policy = cacheAll - if cp, ok := options[cacheKey]; ok { - if cachePolicy(cp) != cacheAll && cachePolicy(cp) != cacheNone { - return o, fmt.Errorf("unsupported cache mode: 'cache=%s'", cp) + if policy, ok := options[cacheKey]; ok { + cp, err := parseCachePolicy(policy) + if err != nil { + return o, err } - o.policy = cachePolicy(cp) + o.policy = cp delete(options, cacheKey) } -- cgit v1.2.3 From ac01f245ff4515af2b69225e8b7fb2cf28808275 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 10 May 2018 14:58:51 -0700 Subject: Skip atime and mtime update when file is backed by host FD When file is backed by host FD, atime and mtime for the host file and the cached attributes in the Sentry must be close together. In this case, the call to update atime and mtime can be skipped. This is important when host filesystem is using overlay because updating atime and mtime explicitly forces a copy up for every file that is touched. PiperOrigin-RevId: 196176413 Change-Id: I3933ea91637a071ba2ea9db9d8ac7cdba5dc0482 --- pkg/sentry/fs/gofer/inode.go | 31 ++++++++++++++++++++++++++++++- runsc/boot/fs.go | 6 +----- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 454242923..c00da5fec 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -206,7 +206,7 @@ func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.Blo // SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { - if mask.Empty() { + if i.skipSetAttr(mask) { return nil } as, ans := attr.AccessTime.Unix() @@ -237,6 +237,35 @@ func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMa }) } +// skipSetAttr checks if attribute change can be skipped. 
It can be skipped +// when: +// - Mask is empty +// - Mask contains only atime and/or mtime, and host FD exists +// +// Updates to atime and mtime can be skipped because cached value will be +// "close enough" to host value, given that operation went directly to host FD. +// Skipping atime updates is particularly important to reduce the number of +// operations sent to the Gofer for readonly files. +func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool { + if mask.Empty() { + return true + } + + cpy := mask + cpy.AccessTime = false + cpy.ModificationTime = false + if !cpy.Empty() { + // More than just atime and mtime is being set. + return false + } + + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return (i.readonly != nil && i.readonly.Host != nil) || + (i.readthrough != nil && i.readthrough.Host != nil) || + (i.writeback != nil && i.writeback.Host != nil) +} + // Sync implements fsutil.CachedFileObject.Sync. func (i *inodeFileState) Sync(ctx context.Context) error { i.handlesMu.RLock() diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 2073bd0b1..86cbe1169 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -141,10 +141,7 @@ func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, i // createRootMount creates the root filesystem. func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. - mf := fs.MountSourceFlags{ - ReadOnly: spec.Root.Readonly, - NoAtime: true, - } + mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} var ( rootInode *fs.Inode @@ -261,7 +258,6 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. // All writes go to upper, be paranoid and make lower readonly. 
mf.ReadOnly = true } - mf.NoAtime = true inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ",")) if err != nil { -- cgit v1.2.3 From 12c161f27865d0e389cd593c669bd740d7f24692 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 11 May 2018 11:16:57 -0700 Subject: Implement MAP_32BIT. PiperOrigin-RevId: 196281052 Change-Id: Ie620a0f983a1bf2570d0003d4754611879335c1c --- pkg/abi/linux/mm.go | 1 + pkg/sentry/memmap/memmap.go | 6 +++++ pkg/sentry/mm/vma.go | 46 +++++++++++++++++++++++++---------- pkg/sentry/syscalls/linux/sys_mmap.go | 14 ++++++----- 4 files changed, 48 insertions(+), 19 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index 2263653cc..b48e1d18a 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -31,6 +31,7 @@ const ( MAP_PRIVATE = 1 << 1 MAP_FIXED = 1 << 4 MAP_ANONYMOUS = 1 << 5 + MAP_32BIT = 1 << 6 // arch/x86/include/uapi/asm/mman.h MAP_GROWSDOWN = 1 << 8 MAP_DENYWRITE = 1 << 11 MAP_EXECUTABLE = 1 << 12 diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 14fed55bc..72986cbb9 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -266,6 +266,12 @@ type MMapOpts struct { // be replaced. If Unmap is true, Fixed must be true. Unmap bool + // If Map32Bit is true, all addresses in the created mapping must fit in a + // 32-bit integer. (Note that the "end address" of the mapping, i.e. the + // address of the first byte *after* the mapping, need not fit in a 32-bit + // integer.) Map32Bit is ignored if Fixed is true. + Map32Bit bool + // Perms is the set of permissions to the applied to this mapping. Perms usermem.AccessType diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 61aaa3195..b81e861f1 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -34,9 +34,10 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp // Find a useable range. 
addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ - Addr: opts.Addr, - Fixed: opts.Fixed, - Unmap: opts.Unmap, + Addr: opts.Addr, + Fixed: opts.Fixed, + Unmap: opts.Unmap, + Map32Bit: opts.Map32Bit, }) if err != nil { return vmaIterator{}, usermem.AddrRange{}, err @@ -93,24 +94,40 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp } type findAvailableOpts struct { - // Addr is a suggested address. Addr must be page-aligned. - Addr usermem.Addr - - // Fixed is true if only the suggested address is acceptable. - Fixed bool - - // Unmap is true if existing vmas and guard pages may exist in the returned - // range. - Unmap bool + // These fields are equivalent to those in memmap.MMapOpts, except that: + // + // - Addr must be page-aligned. + // + // - Unmap allows existing guard pages in the returned range. + + Addr usermem.Addr + Fixed bool + Unmap bool + Map32Bit bool } +// map32Start/End are the bounds to which MAP_32BIT mappings are constrained, +// and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively. +const ( + map32Start = 0x40000000 + map32End = 0x80000000 +) + // findAvailableLocked finds an allocatable range. // // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) { + if opts.Fixed { + opts.Map32Bit = false + } + allowedAR := mm.applicationAddrRange() + if opts.Map32Bit { + allowedAR = allowedAR.Intersect(usermem.AddrRange{map32Start, map32End}) + } + // Does the provided suggestion work? 
if ar, ok := opts.Addr.ToRange(length); ok { - if mm.applicationAddrRange().IsSupersetOf(ar) { + if allowedAR.IsSupersetOf(ar) { if opts.Unmap { return ar.Start, nil } @@ -132,6 +149,9 @@ func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOp alignment = usermem.HugePageSize } + if opts.Map32Bit { + return mm.findLowestAvailableLocked(length, alignment, allowedAR) + } if mm.layout.DefaultDirection == arch.MmapBottomUp { return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) } diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 2c7d41de0..bfa23f6a8 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -45,6 +45,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC private := flags&linux.MAP_PRIVATE != 0 shared := flags&linux.MAP_SHARED != 0 anon := flags&linux.MAP_ANONYMOUS != 0 + map32bit := flags&linux.MAP_32BIT != 0 // Require exactly one of MAP_PRIVATE and MAP_SHARED. 
if private == shared { @@ -52,12 +53,13 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } opts := memmap.MMapOpts{ - Length: args[1].Uint64(), - Offset: args[5].Uint64(), - Addr: args[0].Pointer(), - Fixed: fixed, - Unmap: fixed, - Private: private, + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Map32Bit: map32bit, + Private: private, Perms: usermem.AccessType{ Read: linux.PROT_READ&prot != 0, Write: linux.PROT_WRITE&prot != 0, -- cgit v1.2.3 From 8deabbaae1fc45b042d551891080deef866dc0f8 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 11 May 2018 12:23:25 -0700 Subject: Remove error return from AddressSpace.Release() PiperOrigin-RevId: 196291289 Change-Id: Ie3487be029850b0b410b82416750853a6c4a2b00 --- pkg/sentry/kernel/task_usermem.go | 4 +--- pkg/sentry/mm/address_space.go | 17 ++++++----------- pkg/sentry/platform/kvm/address_space.go | 3 +-- pkg/sentry/platform/platform.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 3 +-- 5 files changed, 10 insertions(+), 19 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 7a62ab674..54964dd0d 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -39,9 +39,7 @@ func (t *Task) Activate() { // Deactivate relinquishes the task's active address space. func (t *Task) Deactivate() { if mm := t.MemoryManager(); mm != nil { - if err := mm.Deactivate(); err != nil { - panic("unable to deactivate mm: " + err.Error()) - } + mm.Deactivate() } } diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 4dd67b1ea..27554f163 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -114,12 +114,12 @@ func (mm *MemoryManager) Activate() error { } } -// Deactivate releases a release to the MemoryManager. 
-func (mm *MemoryManager) Deactivate() error { +// Deactivate releases a reference to the MemoryManager. +func (mm *MemoryManager) Deactivate() { // Fast path: this is not the last goroutine to deactivate the // MemoryManager. if atomicbitops.DecUnlessOneInt32(&mm.active) { - return nil + return } mm.activeMu.Lock() @@ -128,26 +128,21 @@ func (mm *MemoryManager) Deactivate() error { // Still active? if atomic.AddInt32(&mm.active, -1) > 0 { mm.activeMu.Unlock() - return nil + return } // Can we hold on to the address space? if !mm.p.CooperativelySchedulesAddressSpace() { mm.activeMu.Unlock() - return nil + return } // Release the address space. - if err := mm.as.Release(); err != nil { - atomic.StoreInt32(&mm.active, 1) - mm.activeMu.Unlock() - return err - } + mm.as.Release() // Lost it. mm.as = nil mm.activeMu.Unlock() - return nil } // mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index a4b9198cc..173885867 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -200,8 +200,7 @@ func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { } // Release releases the page tables. -func (as *addressSpace) Release() error { +func (as *addressSpace) Release() { as.Unmap(0, ^uint64(0)) as.pageTables.Release() - return nil } diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 6219dada7..1c385bc5a 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -205,7 +205,7 @@ type AddressSpace interface { // Release releases this address space. After releasing, a new AddressSpace // must be acquired via platform.NewAddressSpace(). - Release() error + Release() // AddressSpaceIO methods are supported iff the associated platform's // Platform.SupportsAddressSpaceIO() == true. 
AddressSpaces for which this diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 0d6a38f15..035ebc332 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -242,14 +242,13 @@ func (s *subprocess) unmap() { // Therefore we simply unmap everything in the subprocess and return it to the // globalPool. This has the added benefit of reducing creation time for new // subprocesses. -func (s *subprocess) Release() error { +func (s *subprocess) Release() { go func() { // S/R-SAFE: Platform. s.unmap() globalPool.mu.Lock() globalPool.available = append(globalPool.available, s) globalPool.mu.Unlock() }() - return nil } // newThread creates a new traced thread. -- cgit v1.2.3 From 08879266fef3a67fac1a77f1ea133c3ac75759dd Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 11 May 2018 17:18:56 -0700 Subject: sentry: Adds canonical mode support. PiperOrigin-RevId: 196331627 Change-Id: Ifef4485f8202c52481af317cedd52d2ef48cea6a --- pkg/abi/linux/tty.go | 28 +++ pkg/sentry/fs/tty/line_discipline.go | 354 ++++++++++++++++++++++++++--------- pkg/sentry/fs/tty/master.go | 3 + pkg/sentry/fs/tty/slave.go | 3 + 4 files changed, 298 insertions(+), 90 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index f9e641af9..84b6ccc87 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -14,9 +14,16 @@ package linux +import ( + "unicode/utf8" +) + const ( // NumControlCharacters is the number of control characters in Termios. NumControlCharacters = 19 + // disabledChar is used to indicate that a control character is + // disabled. + disabledChar = 0 ) // Termios is struct termios, defined in uapi/asm-generic/termbits.h. @@ -86,6 +93,27 @@ func (t *KernelTermios) FromTermios(term Termios) { t.ControlCharacters = term.ControlCharacters } +// IsTerminating returns whether c is a line terminating character. 
+func (t *KernelTermios) IsTerminating(c rune) bool { + if t.IsEOF(c) { + return true + } + switch byte(c) { + case disabledChar: + return false + case '\n', t.ControlCharacters[VEOL]: + return true + case t.ControlCharacters[VEOL2]: + return t.LEnabled(IEXTEN) + } + return false +} + +// IsEOF returns whether c is the EOF character. +func (t *KernelTermios) IsEOF(c rune) bool { + return utf8.RuneLen(c) == 1 && byte(c) == t.ControlCharacters[VEOF] && t.ControlCharacters[VEOF] != disabledChar +} + // Input flags. const ( IGNBRK = 0000001 diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index a3aa95ece..bdc4f5b92 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -28,9 +28,90 @@ import ( ) const ( + // canonMaxBytes is the number of bytes that fit into a single line of + // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE + // in include/linux/tty.h. + canonMaxBytes = 4096 + + // nonCanonMaxBytes is the maximum number of bytes that can be read at + // a time in noncanonical mode. + nonCanonMaxBytes = canonMaxBytes - 1 + spacesPerTab = 8 ) +// queue represents one of the input or output queues between a pty master and +// slave. Bytes written to a queue are added to the read buffer until it is +// full, at which point they are written to the wait buffer. Bytes are +// processed (i.e. undergo termios transformations) as they are added to the +// read buffer. The read buffer is readable when its length is nonzero and +// readable is true. +type queue struct { + waiter.Queue `state:"nosave"` + + // readBuf is buffer of data ready to be read when readable is true. + // This data has been processed. + readBuf bytes.Buffer `state:".([]byte)"` + + // waitBuf contains data that can't fit into readBuf. It is put here + // until it can be loaded into the read buffer. waitBuf contains data + // that hasn't been processed. 
+ waitBuf bytes.Buffer `state:".([]byte)"` + + // readable indicates whether the read buffer can be read from. In + // canonical mode, there can be an unterminated line in the read buffer, + // so readable must be checked. + readable bool +} + +// saveReadBuf is invoked by stateify. +func (q *queue) saveReadBuf() []byte { + return append([]byte(nil), q.readBuf.Bytes()...) +} + +// loadReadBuf is invoked by stateify. +func (q *queue) loadReadBuf(b []byte) { + q.readBuf.Write(b) +} + +// saveWaitBuf is invoked by stateify. +func (q *queue) saveWaitBuf() []byte { + return append([]byte(nil), q.waitBuf.Bytes()...) +} + +// loadWaitBuf is invoked by stateify. +func (q *queue) loadWaitBuf(b []byte) { + q.waitBuf.Write(b) +} + +// readReadiness returns whether q is ready to be read from. +func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { + if q.readBuf.Len() > 0 && q.readable { + return waiter.EventIn + } + return waiter.EventMask(0) +} + +// writeReadiness returns whether q is ready to be written to. +func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { + // Like Linux, we don't impose a maximum size on what can be enqueued. + return waiter.EventOut +} + +// readableSize writes the number of readable bytes to userspace. +func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + var size int32 + if q.readable { + size = int32(q.readBuf.Len()) + } + + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err + +} + // lineDiscipline dictates how input and output are handled between the // pseudoterminal (pty) master and slave. It can be configured to alter I/O, // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man @@ -50,7 +131,7 @@ const ( // // input from terminal +-------------+ input to process (e.g. 
bash) // +------------------------>| input queue |---------------------------+ -// | +-------------+ | +// | (inputQueueWrite) +-------------+ (inputQueueRead) | // | | // | v // masterFD slaveFD @@ -58,7 +139,7 @@ const ( // | | // | output to terminal +--------------+ output from process | // +------------------------| output queue |<--------------------------+ -// +--------------+ +// (outputQueueRead) +--------------+ (outputQueueWrite) // // Lock order: // inMu @@ -102,14 +183,25 @@ func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arc // setTermios sets a linux.Termios for the tty. func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.inMu.Lock() + defer l.inMu.Unlock() l.termiosMu.Lock() defer l.termiosMu.Unlock() + oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ AddressSpaceActive: true, }) l.termios.FromTermios(t) + + // If canonical mode is turned off, move bytes from inQueue's wait + // buffer to its read buffer. Anything already in the read buffer is + // now readable. + if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { + l.pushWaitBuf(&l.inQueue, transformInput) + } + return 0, err } @@ -118,7 +210,9 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask { defer l.inMu.Unlock() l.outMu.Lock() defer l.outMu.Unlock() - return l.inQueue.writeReadiness() | l.outQueue.readReadiness() + // We don't have to lock a termios because the default master termios + // is immutable. 
+ return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) } func (l *lineDiscipline) slaveReadiness() waiter.EventMask { @@ -126,93 +220,97 @@ func (l *lineDiscipline) slaveReadiness() waiter.EventMask { defer l.inMu.Unlock() l.outMu.Lock() defer l.outMu.Unlock() - return l.outQueue.writeReadiness() | l.inQueue.readReadiness() -} - -// queue represents one of the input or output queues between a pty master and -// slave. -type queue struct { - waiter.Queue `state:"nosave"` - buf bytes.Buffer `state:".([]byte)"` -} - -// saveBuf is invoked by stateify. -func (q *queue) saveBuf() []byte { - return append([]byte(nil), q.buf.Bytes()...) -} - -// loadBuf is invoked by stateify. -func (q *queue) loadBuf(b []byte) { - q.buf.Write(b) -} - -// readReadiness returns whether q is ready to be read from. -// -// Preconditions: q's mutex must be held. -func (q *queue) readReadiness() waiter.EventMask { - ready := waiter.EventMask(0) - if q.buf.Len() > 0 { - ready |= waiter.EventIn - } - return ready + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } -// writeReadiness returns whether q is ready to be written to. 
-func (q *queue) writeReadiness() waiter.EventMask { - return waiter.EventOut +func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.inMu.Lock() + defer l.inMu.Unlock() + return l.inQueue.readableSize(ctx, io, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.inMu.Lock() defer l.inMu.Unlock() - return l.queueRead(ctx, dst, &l.inQueue) + return l.queueRead(ctx, dst, &l.inQueue, transformInput) } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.inMu.Lock() defer l.inMu.Unlock() - return l.queueWrite(ctx, src, &l.inQueue, false) + return l.queueWrite(ctx, src, &l.inQueue, transformInput) +} + +func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.outMu.Lock() + defer l.outMu.Unlock() + return l.outQueue.readableSize(ctx, io, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.outMu.Lock() defer l.outMu.Unlock() - return l.queueRead(ctx, dst, &l.outQueue) + return l.queueRead(ctx, dst, &l.outQueue, transformOutput) } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.outMu.Lock() defer l.outMu.Unlock() - return l.queueWrite(ctx, src, &l.outQueue, true) + return l.queueWrite(ctx, src, &l.outQueue, transformOutput) } // queueRead reads from q to userspace. // // Preconditions: q's lock must be held. -func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, q *queue) (int64, error) { - // Copy bytes out to user-space. queueRead doesn't have to do any - // processing or other extra work -- that's all taken care of when - // writing to a queue. 
- n, err := q.buf.WriteTo(dst.Writer(ctx)) +func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, q *queue, f transform) (int64, error) { + if !q.readable { + return 0, syserror.ErrWouldBlock + } + + // Read out from the read buffer. + n := canonMaxBytes + if n > int(dst.NumBytes()) { + n = int(dst.NumBytes()) + } + if n > q.readBuf.Len() { + n = q.readBuf.Len() + } + n, err := dst.Writer(ctx).Write(q.readBuf.Bytes()[:n]) + if err != nil { + return 0, err + } + // Discard bytes read out. + q.readBuf.Next(n) + + // If we read everything, this queue is no longer readable. + if q.readBuf.Len() == 0 { + q.readable = false + } + + // Move data from the queue's wait buffer to its read buffer. + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + l.pushWaitBuf(q, f) // If state changed, notify any waiters. If nothing was available to // read, let the caller know we could block. if n > 0 { q.Notify(waiter.EventOut) - } else if err == nil { + } else { return 0, syserror.ErrWouldBlock } - return int64(n), err + return int64(n), nil } -// queueWrite writes to q from userspace. `output` is whether the queue being -// written to should be subject to output processing (i.e. whether it is the -// output queue). +// queueWrite writes to q from userspace. f is the function used to perform +// processing on data being written and write it to the read buffer. // // Precondition: q's lock must be held. -func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, q *queue, output bool) (int64, error) { +func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, q *queue, f transform) (int64, error) { // TODO: Use CopyInTo/safemem to avoid extra copying. - // Get the bytes to write from user-space. + // Copy in the bytes to write from user-space. 
b := make([]byte, src.NumBytes()) n, err := src.CopyIn(ctx, b) if err != nil { @@ -220,49 +318,69 @@ func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, } b = b[:n] + // Write as much as possible to the read buffer. + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + n = f(l, q, b) + + // Write remaining data to the wait buffer. + nWaiting, _ := q.waitBuf.Write(b[n:]) + // If state changed, notify any waiters. If we were unable to write // anything, let the caller know we could block. if n > 0 { q.Notify(waiter.EventIn) - } else { + } else if nWaiting == 0 { return 0, syserror.ErrWouldBlock } + return int64(n + nWaiting), nil +} - // Optionally perform line discipline transformations depending on - // whether we're writing to the input queue or output queue. - var buf *bytes.Buffer - l.termiosMu.Lock() - if output { - buf = l.transformOutput(b) - } else { - buf = l.transformInput(b) - } - l.termiosMu.Unlock() +// pushWaitBuf fills the queue's read buffer with data from the wait buffer. +// +// Precondition: l.inMu and l.termiosMu must be held. +func (l *lineDiscipline) pushWaitBuf(q *queue, f transform) { + // Remove bytes from the wait buffer and move them to the read buffer. + n := f(l, q, q.waitBuf.Bytes()) + q.waitBuf.Next(n) - // Enqueue buf at the end of the queue. - buf.WriteTo(&q.buf) - return int64(n), err + // If state changed, notify any waiters. + if n > 0 { + q.Notify(waiter.EventIn) + } } +// transform functions require the passed in lineDiscipline's mutex to be held. +type transform func(*lineDiscipline, *queue, []byte) int + // transformOutput does output processing for one end of the pty. See // drivers/tty/n_tty.c:do_output_char for an analogous kernel function. // -// Precondition: l.termiosMu must be held. -func (l *lineDiscipline) transformOutput(buf []byte) *bytes.Buffer { +// Precondition: l.termiosMu and q's mutex must be held. 
+func transformOutput(l *lineDiscipline, q *queue, buf []byte) int { + // transformOutput is effectively always in noncanonical mode, as the + // master termios never has ICANON set. + if !l.termios.OEnabled(linux.OPOST) { - return bytes.NewBuffer(buf) + n, _ := q.readBuf.Write(buf) + if q.readBuf.Len() > 0 { + q.readable = true + } + return n } - var ret bytes.Buffer + var ret int for len(buf) > 0 { - c := l.removeRune(&buf) + c, size := l.peekRune(buf) + ret += size + buf = buf[size:] switch c { case '\n': if l.termios.OEnabled(linux.ONLRET) { l.column = 0 } if l.termios.OEnabled(linux.ONLCR) { - ret.Write([]byte{'\r', '\n'}) + q.readBuf.Write([]byte{'\r', '\n'}) continue } case '\r': @@ -281,7 +399,7 @@ func (l *lineDiscipline) transformOutput(buf []byte) *bytes.Buffer { spaces := spacesPerTab - l.column%spacesPerTab if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { l.column += spaces - ret.Write(bytes.Repeat([]byte{' '}, 8)) + q.readBuf.Write(bytes.Repeat([]byte{' '}, spacesPerTab)) continue } l.column += spaces @@ -292,24 +410,40 @@ func (l *lineDiscipline) transformOutput(buf []byte) *bytes.Buffer { default: l.column++ } - ret.WriteRune(c) + q.readBuf.WriteRune(c) + } + if q.readBuf.Len() > 0 { + q.readable = true } - return &ret + return ret } -// transformInput does input processing for one end of the pty. Characters -// read are transformed according to flags set in the termios struct. See +// transformInput does input processing for one end of the pty. Characters read +// are transformed according to flags set in the termios struct. See // drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel // function. // -// Precondition: l.termiosMu must be held. -func (l *lineDiscipline) transformInput(buf []byte) *bytes.Buffer { - var ret bytes.Buffer - for len(buf) > 0 { - c := l.removeRune(&buf) +// Precondition: l.termiosMu and q's mutex must be held. 
+func transformInput(l *lineDiscipline, q *queue, buf []byte) int { + // If there's a line waiting to be read in canonical mode, don't write + // anything else to the read buffer. + if l.termios.LEnabled(linux.ICANON) && q.readable { + return 0 + } + + maxBytes := nonCanonMaxBytes + if l.termios.LEnabled(linux.ICANON) { + maxBytes = canonMaxBytes + } + + var ret int + for len(buf) > 0 && q.readBuf.Len() < canonMaxBytes { + c, size := l.peekRune(buf) switch c { case '\r': if l.termios.IEnabled(linux.IGNCR) { + buf = buf[size:] + ret += size continue } if l.termios.IEnabled(linux.ICRNL) { @@ -320,23 +454,63 @@ func (l *lineDiscipline) transformInput(buf []byte) *bytes.Buffer { c = '\r' } } - ret.WriteRune(c) + + // In canonical mode, we discard non-terminating characters + // after the first 4095. + if l.shouldDiscard(q, c) { + buf = buf[size:] + ret += size + continue + } + + // Stop if the buffer would be overfilled. + if q.readBuf.Len()+size > maxBytes { + break + } + buf = buf[size:] + ret += size + + // If we get EOF, make the buffer available for reading. + if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(c) { + q.readable = true + break + } + + q.readBuf.WriteRune(c) + + // If we finish a line, make it available for reading. + if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(c) { + q.readable = true + break + } + } + + // In noncanonical mode, everything is readable. + if !l.termios.LEnabled(linux.ICANON) && q.readBuf.Len() > 0 { + q.readable = true } - return &ret + + return ret +} + +// shouldDiscard returns whether c should be discarded. In canonical mode, if +// too many bytes are enqueued, we keep reading input and discarding it until +// we find a terminating character. Signal/echo processing still occurs. 
+func (l *lineDiscipline) shouldDiscard(q *queue, c rune) bool { + return l.termios.LEnabled(linux.ICANON) && q.readBuf.Len()+utf8.RuneLen(c) >= canonMaxBytes && !l.termios.IsTerminating(c) } -// removeRune removes and returns the first rune from the byte array. The -// buffer's length is updated accordingly. -func (l *lineDiscipline) removeRune(b *[]byte) rune { +// peekRune returns the first rune from the byte array depending on whether +// UTF8 is enabled. +func (l *lineDiscipline) peekRune(b []byte) (rune, int) { var c rune var size int // If UTF-8 support is enabled, runes might be multiple bytes. if l.termios.IEnabled(linux.IUTF8) { - c, size = utf8.DecodeRune(*b) + c, size = utf8.DecodeRune(b) } else { - c = rune((*b)[0]) + c = rune(b[0]) size = 1 } - *b = (*b)[size:] - return c + return c, size } diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 3c47ee517..74cdbe874 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -148,6 +148,9 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm // Ioctl implements fs.FileOperations.Ioctl. func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch args[1].Uint() { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the output queue read buffer. + return 0, mf.t.ld.outputQueueReadSize(ctx, io, args) case linux.TCGETS: // N.B. TCGETS on the master actually returns the configuration // of the slave end. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 9178071a4..f5eec726e 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -133,6 +133,9 @@ func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src userme // Ioctl implements fs.FileOperations.Ioctl. 
func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch args[1].Uint() { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the input queue read buffer. + return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args) case linux.TCGETS: return sf.si.t.ld.getTermios(ctx, io, args) case linux.TCSETS: -- cgit v1.2.3 From 17a0fa3af05dbb147cdd3d5ec898d31812a0ea66 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 14 May 2018 20:26:35 -0700 Subject: Ignore spurious KVM emulation failures. PiperOrigin-RevId: 196609789 Change-Id: Ie261eea3b7fa05b6c348ca93e229de26cbd4dc7d --- pkg/sentry/platform/kvm/bluepill_unsafe.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 85703ff18..9e252af64 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -105,7 +105,11 @@ func bluepillHandler(context unsafe.Pointer) { case _KVM_EXIT_IO: throw("I/O") case _KVM_EXIT_INTERNAL_ERROR: - throw("internal error") + // An internal error is typically thrown when emulation + // fails. This can occur via the MMIO path below (and + // it might fail because we have multiple regions that + // are not mapped). We would actually prefer that no + // emulation occur, and don't mind at all if it fails. case _KVM_EXIT_HYPERCALL: throw("hypercall") case _KVM_EXIT_DEBUG: -- cgit v1.2.3 From 825e9ea8098d91e9770d27124717c08d1f5d2952 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 14 May 2018 20:44:56 -0700 Subject: Simplify KVM host map handling. 
PiperOrigin-RevId: 196611084 Change-Id: I6afa6b01e1dcd2aa9776dfc0f910874cc6b8d72c --- pkg/sentry/platform/kvm/address_space.go | 8 ++- pkg/sentry/platform/kvm/host_map.go | 108 ++++++++++++++++++------------- 2 files changed, 68 insertions(+), 48 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 173885867..2302f78e1 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -46,6 +46,8 @@ type addressSpace struct { dirtySet sync.Map // files contains files mapped in the host address space. + // + // See host_map.go for more information. files hostMap } @@ -112,7 +114,8 @@ func (as *addressSpace) mapHostFile(addr usermem.Addr, fd int, fr platform.FileR inv := false for _, m := range ms { // The host mapped slices are guaranteed to be aligned. - inv = inv || as.mapHost(addr, m, at) + prev := as.mapHost(addr, m, at) + inv = inv || prev addr += usermem.Addr(m.length) } if inv { @@ -157,10 +160,11 @@ func (as *addressSpace) mapFilemem(addr usermem.Addr, fr platform.FileRange, at _ = s[i] // Touch to commit. } } - inv = inv || as.mapHost(addr, hostMapEntry{ + prev := as.mapHost(addr, hostMapEntry{ addr: reflect.ValueOf(&s[0]).Pointer(), length: uintptr(len(s)), }, at) + inv = inv || prev addr += usermem.Addr(len(s)) } if inv { diff --git a/pkg/sentry/platform/kvm/host_map.go b/pkg/sentry/platform/kvm/host_map.go index 357f8c92e..fc16ad2de 100644 --- a/pkg/sentry/platform/kvm/host_map.go +++ b/pkg/sentry/platform/kvm/host_map.go @@ -35,28 +35,48 @@ type hostMapEntry struct { length uintptr } -func (hm *hostMap) forEachEntry(r usermem.AddrRange, fn func(offset uint64, m hostMapEntry)) { - for seg := hm.set.FindSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { - length := uintptr(seg.Range().Length()) - segOffset := uint64(0) // Adjusted below. 
- if seg.End() > r.End { - length -= uintptr(seg.End() - r.End) - } - if seg.Start() < r.Start { - length -= uintptr(r.Start - seg.Start()) +// forEach iterates over all mappings in the given range. +// +// Precondition: segFn and gapFn must be non-nil. +func (hm *hostMap) forEach( + r usermem.AddrRange, + segFn func(offset uint64, m hostMapEntry), + gapFn func(offset uint64, length uintptr) (uintptr, bool)) { + + seg, gap := hm.set.Find(r.Start) + for { + if seg.Ok() && seg.Start() < r.End { + // A valid segment: pass information. + overlap := seg.Range().Intersect(r) + segOffset := uintptr(overlap.Start - seg.Start()) + mapOffset := uint64(overlap.Start - r.Start) + segFn(mapOffset, hostMapEntry{ + addr: seg.Value() + segOffset, + length: uintptr(overlap.Length()), + }) + seg, gap = seg.NextNonEmpty() + } else if gap.Ok() && gap.Start() < r.End { + // A gap: pass gap information. + overlap := gap.Range().Intersect(r) + mapOffset := uint64(overlap.Start - r.Start) + addr, ok := gapFn(mapOffset, uintptr(overlap.Length())) + if ok { + seg = hm.set.Insert(gap, overlap, addr) + seg, gap = seg.NextNonEmpty() + } else { + seg = gap.NextSegment() + gap = hostMapGapIterator{} // Invalid. + } } else { - segOffset = uint64(seg.Start() - r.Start) + // Terminal. + break } - fn(segOffset, hostMapEntry{ - addr: seg.Value(), - length: length, - }) } } func (hm *hostMap) createMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) { - // Replace any existing mappings. - hm.forEachEntry(r, func(segOffset uint64, m hostMapEntry) { + hm.forEach(r, func(mapOffset uint64, m hostMapEntry) { + // Replace any existing mappings. 
_, _, errno := syscall.RawSyscall6( syscall.SYS_MMAP, m.addr, @@ -64,48 +84,40 @@ func (hm *hostMap) createMappings(r usermem.AddrRange, at usermem.AccessType, fd uintptr(at.Prot()), syscall.MAP_FIXED|syscall.MAP_SHARED, uintptr(fd), - uintptr(offset+segOffset)) + uintptr(offset+mapOffset)) if errno != 0 && err == nil { err = errno } - }) - if err != nil { - return nil, err - } - - // Add in necessary new mappings. - for gap := hm.set.FindGap(r.Start); gap.Ok() && gap.Start() < r.End; { - length := uintptr(gap.Range().Length()) - gapOffset := uint64(0) // Adjusted below. - if gap.End() > r.End { - length -= uintptr(gap.End() - r.End) - } - if gap.Start() < r.Start { - length -= uintptr(r.Start - gap.Start()) - } else { - gapOffset = uint64(gap.Start() - r.Start) - } - - // Map the host file memory. - hostAddr, _, errno := syscall.RawSyscall6( + }, func(mapOffset uint64, length uintptr) (uintptr, bool) { + // Create a new mapping. + addr, _, errno := syscall.RawSyscall6( syscall.SYS_MMAP, 0, length, uintptr(at.Prot()), syscall.MAP_SHARED, uintptr(fd), - uintptr(offset+gapOffset)) + uintptr(offset+mapOffset)) if errno != 0 { - return nil, errno + err = errno + return 0, false } - - // Insert into the host set and move to the next gap. - gap = hm.set.Insert(gap, gap.Range().Intersect(r), hostAddr).NextGap() + return addr, true + }) + if err != nil { + return nil, err } - // Collect all slices. - hm.forEachEntry(r, func(_ uint64, m hostMapEntry) { + // Collect all entries. + // + // We do this after the first iteration because some segments may have + // been merged in the above, and we'll return the simplest form. This + // also provides a basic sanity check in the form of no gaps. + hm.forEach(r, func(_ uint64, m hostMapEntry) { ms = append(ms, m) + }, func(uint64, uintptr) (uintptr, bool) { + // Should not happen: we just mapped this above. 
+ panic("unexpected gap") }) return ms, nil @@ -121,7 +133,7 @@ func (hm *hostMap) CreateMappings(r usermem.AddrRange, at usermem.AccessType, fd func (hm *hostMap) deleteMapping(r usermem.AddrRange) { // Remove all the existing mappings. - hm.forEachEntry(r, func(_ uint64, m hostMapEntry) { + hm.forEach(r, func(_ uint64, m hostMapEntry) { _, _, errno := syscall.RawSyscall( syscall.SYS_MUNMAP, m.addr, @@ -131,9 +143,13 @@ func (hm *hostMap) deleteMapping(r usermem.AddrRange) { // Should never happen. panic(fmt.Sprintf("unmap error: %v", errno)) } + }, func(uint64, uintptr) (uintptr, bool) { + // Sometimes deleteMapping will be called on a larger range + // than physical mappings are defined. That's okay. + return 0, false }) - // Knock the range out. + // Knock the entire range out. hm.set.RemoveRange(r) } -- cgit v1.2.3 From 2ab754cff7b2d45e1d59798562e47317aa480ecf Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 14 May 2018 21:13:28 -0700 Subject: Make KVM system call first check. PiperOrigin-RevId: 196613447 Change-Id: Ib76902896798f072c3031b0c5cf7b433718928b7 --- pkg/sentry/platform/kvm/machine_amd64.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index dfa691e88..fe4d31702 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -111,8 +111,11 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab vector = c.CPU.SwitchToUser(regs, fpState, pt, flags) exitsyscall() - // Free and clear. switch vector { + case ring0.Syscall, ring0.SyscallInt80: + // Fast path: system call executed. 
+ return nil, usermem.NoAccess, nil + case ring0.Debug, ring0.Breakpoint: info := &arch.SignalInfo{Signo: int32(syscall.SIGTRAP)} return info, usermem.AccessType{}, platform.ErrContextSignal @@ -158,10 +161,6 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab redpill() // Bail and reacqire. return nil, usermem.NoAccess, platform.ErrContextInterrupt - case ring0.Syscall, ring0.SyscallInt80: - // System call executed. - return nil, usermem.NoAccess, nil - default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } -- cgit v1.2.3 From ed02ac4f668ec41063cd51cbbd451baba9e9a6e7 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 14 May 2018 21:39:31 -0700 Subject: Disable INVPCID check; it's not used. PiperOrigin-RevId: 196615029 Change-Id: Idfa383a9aee6a9397167a4231ce99d0b0e5b9912 --- pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index 389412d87..834e6b96d 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -25,11 +25,10 @@ import ( ) var ( - runDataSize int - hasGuestPCID bool - hasGuestINVPCID bool - pagetablesOpts pagetables.Opts - cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES} + runDataSize int + hasGuestPCID bool + pagetablesOpts pagetables.Opts + cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES} ) func updateSystemValues(fd int) error { @@ -74,17 +73,8 @@ func updateSystemValues(fd int) error { if entry.function == 1 && entry.index == 0 && entry.ecx&(1<<17) != 0 { hasGuestPCID = true // Found matching PCID in guest feature set. } - if entry.function == 7 && entry.index == 0 && entry.ebx&(1<<10) != 0 { - hasGuestINVPCID = true // Found matching INVPCID in guest feature set. 
- } } - // A basic sanity check: ensure that we don't attempt to - // invpcid if guest PCIDs are not supported; it's not clear - // what the semantics of this would be (or why some CPU or - // hypervisor would export this particular combination). - hasGuestINVPCID = hasGuestPCID && hasGuestINVPCID - // Set the pagetables to use PCID if it's available. pagetablesOpts.EnablePCID = hasGuestPCID -- cgit v1.2.3 From 9889c29d6d26ba86b5e3590eac85bfb8393dd54e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 15 May 2018 14:38:32 -0700 Subject: Fix problem with sendfile(2) writing less data When the amount of data read is more than the amount written, sendfile would not adjust 'in file' position and would resume from the wrong location. Closes #33 PiperOrigin-RevId: 196731287 Change-Id: Ia219895dd765016ed9e571fd5b366963c99afb27 --- pkg/sentry/syscalls/linux/sys_file.go | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 5fbacc15e..1d61ac9f0 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1929,8 +1929,16 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // If we don't have a provided offset. } else { // Send data using readv. + inOff := inFile.Offset() r := &io.LimitedReader{R: &fs.FileReader{t, inFile}, N: count} n, err = io.Copy(w, r) + inOff += n + if inFile.Offset() != inOff { + // Adjust file position in case more bytes were read than written. + if _, err := inFile.Seek(t, fs.SeekSet, inOff); err != nil { + return 0, nil, syserror.EIO + } + } } // We can only pass a single file to handleIOError, so pick inFile -- cgit v1.2.3 From 96c28a43682e8a665142da5b8b0734198fff3a00 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 15 May 2018 14:55:29 -0700 Subject: sentry: Replaces saving of inet.Stack with retrieval via context. 
Previously, inet.Stack was referenced in 2 structs in sentry/socket that can be saved/restored. If an app is saved and restored on another machine, it may try to use the old stack, which will have been replaced by a new stack on the new machine. PiperOrigin-RevId: 196733985 Change-Id: I6a8cfe73b5d7a90749734677dada635ab3389cb9 --- pkg/sentry/fs/context.go | 2 +- pkg/sentry/inet/BUILD | 2 ++ pkg/sentry/inet/context.go | 35 +++++++++++++++++++++++++++++ pkg/sentry/kernel/task.go | 3 +++ pkg/sentry/socket/epsocket/epsocket.go | 25 ++++++++++++++------- pkg/sentry/socket/netlink/route/protocol.go | 21 +++++++---------- 6 files changed, 66 insertions(+), 22 deletions(-) create mode 100644 pkg/sentry/inet/context.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index b521bce75..da46ad77f 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -20,7 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ) -// contextID is the kernel package's type for context.Context.Value keys. +// contextID is the fs package's type for context.Context.Value keys. type contextID int const ( diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 207cdb692..1150ced57 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -17,12 +17,14 @@ go_stateify( go_library( name = "inet", srcs = [ + "context.go", "inet.go", "inet_state.go", "test_stack.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/inet", deps = [ + "//pkg/sentry/context", "//pkg/state", ], ) diff --git a/pkg/sentry/inet/context.go b/pkg/sentry/inet/context.go new file mode 100644 index 000000000..370381f41 --- /dev/null +++ b/pkg/sentry/inet/context.go @@ -0,0 +1,35 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package inet + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the inet package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxStack is a Context.Value key for a network stack. + CtxStack contextID = iota +) + +// StackFromContext returns the network stack associated with ctx. +func StackFromContext(ctx context.Context) Stack { + if v := ctx.Value(CtxStack); v != nil { + return v.(Stack) + } + return nil +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 3d2e035e9..490f795c2 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/bpf" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" @@ -560,6 +561,8 @@ func (t *Task) Value(key interface{}) interface{} { return t.creds case fs.CtxRoot: return t.FSContext().RootDirectory() + case inet.CtxStack: + return t.NetworkContext() case ktime.CtxRealtimeClock: return t.k.RealtimeClock() case limits.CtxLimits: diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index a45dcd551..3e4887e16 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -33,6 +33,7 @@ import ( 
"gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -102,7 +103,6 @@ type SocketOperations struct { *waiter.Queue family int - stack inet.Stack Endpoint tcpip.Endpoint skType unix.SockType @@ -119,7 +119,6 @@ func New(t *kernel.Task, family int, skType unix.SockType, queue *waiter.Queue, return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true}, &SocketOperations{ Queue: queue, family: family, - stack: t.NetworkContext(), Endpoint: endpoint, skType: skType, }) @@ -1042,7 +1041,12 @@ func (s *SocketOperations) interfaceIoctl(ctx context.Context, io usermem.IO, ar ) // Find the relevant device. - for index, iface = range s.stack.Interfaces() { + stack := inet.StackFromContext(ctx) + if stack == nil { + log.Warningf("Couldn't find a network stack.") + return syserr.ErrInvalidArgument + } + for index, iface = range stack.Interfaces() { if iface.Name == ifr.Name() { found = true break @@ -1074,7 +1078,7 @@ func (s *SocketOperations) interfaceIoctl(ctx context.Context, io usermem.IO, ar case syscall.SIOCGIFADDR: // Copy the IPv4 address out. - for _, addr := range s.stack.InterfaceAddrs()[index] { + for _, addr := range stack.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue @@ -1109,7 +1113,7 @@ func (s *SocketOperations) interfaceIoctl(ctx context.Context, io usermem.IO, ar case syscall.SIOCGIFNETMASK: // Gets the network mask of a device. - for _, addr := range s.stack.InterfaceAddrs()[index] { + for _, addr := range stack.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. 
if addr.Family != linux.AF_INET { continue @@ -1189,15 +1193,20 @@ func (s *SocketOperations) ifconfIoctl(ctx context.Context, io usermem.IO, ifc * // If Ptr is NULL, return the necessary buffer size via Len. // Otherwise, write up to Len bytes starting at Ptr containing ifreq // structs. + stack := inet.StackFromContext(ctx) + if stack == nil { + log.Warningf("Couldn't find a network stack.") + return syserr.ErrInvalidArgument.ToError() + } if ifc.Ptr == 0 { - ifc.Len = int32(len(s.stack.Interfaces())) * int32(linux.SizeOfIFReq) + ifc.Len = int32(len(stack.Interfaces())) * int32(linux.SizeOfIFReq) return nil } max := ifc.Len ifc.Len = 0 - for key, ifaceAddrs := range s.stack.InterfaceAddrs() { - iface := s.stack.Interfaces()[key] + for key, ifaceAddrs := range stack.InterfaceAddrs() { + iface := stack.Interfaces()[key] for _, ifaceAddr := range ifaceAddrs { // Don't write past the end of the buffer. if ifc.Len+int32(linux.SizeOfIFReq) > max { diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index d611519d4..e8030c518 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -43,20 +43,13 @@ func typeKind(typ uint16) commandKind { } // Protocol implements netlink.Protocol. -type Protocol struct { - // stack is the network stack that this provider describes. - // - // May be nil. - stack inet.Stack -} +type Protocol struct{} var _ netlink.Protocol = (*Protocol)(nil) // NewProtocol creates a NETLINK_ROUTE netlink.Protocol. func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { - return &Protocol{ - stack: t.NetworkContext(), - }, nil + return &Protocol{}, nil } // Protocol implements netlink.Protocol.Protocol. @@ -83,12 +76,13 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader // We always send back an NLMSG_DONE. 
ms.Multi = true - if p.stack == nil { + stack := inet.StackFromContext(ctx) + if stack == nil { // No network devices. return nil } - for id, i := range p.stack.Interfaces() { + for id, i := range stack.Interfaces() { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.RTM_NEWLINK, }) @@ -124,12 +118,13 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader // We always send back an NLMSG_DONE. ms.Multi = true - if p.stack == nil { + stack := inet.StackFromContext(ctx) + if stack == nil { // No network devices. return nil } - for id, as := range p.stack.InterfaceAddrs() { + for id, as := range stack.InterfaceAddrs() { for _, a := range as { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.RTM_NEWADDR, -- cgit v1.2.3 From 310a99228b9254ad3c09ecdaa66e5747be4f46c5 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 15 May 2018 18:33:19 -0700 Subject: Simplify KVM state handling. This also removes the dependency on tmutex. PiperOrigin-RevId: 196764317 Change-Id: I523fb67454318e1a2ca9da3a08e63bfa3c1eeed3 --- pkg/sentry/platform/kvm/BUILD | 2 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/bluepill_unsafe.go | 27 ++-- pkg/sentry/platform/kvm/kvm_test.go | 20 ++- pkg/sentry/platform/kvm/machine.go | 243 +++++++++++++++++------------ pkg/sentry/platform/kvm/machine_amd64.go | 1 - pkg/sentry/platform/kvm/machine_unsafe.go | 36 ++--- 7 files changed, 183 insertions(+), 148 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index d902e344a..adc43c21b 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -51,6 +51,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/atomicbitops", "//pkg/cpuid", "//pkg/log", "//pkg/sentry/arch", @@ -63,7 +64,6 @@ go_library( "//pkg/sentry/platform/safecopy", "//pkg/sentry/time", "//pkg/sentry/usermem", - "//pkg/tmutex", ], ) diff --git 
a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 2302f78e1..3d57ae0cb 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -57,7 +57,7 @@ func (as *addressSpace) Invalidate() { c := key.(*vCPU) v := value.(*uint32) atomic.StoreUint32(v, 0) // Invalidation required. - c.Bounce() // Force a kernel transition. + c.BounceToKernel() // Force a kernel transition. return true // Keep iterating. }) } diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 9e252af64..2c1e098d7 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -51,15 +51,13 @@ func bluepillHandler(context unsafe.Pointer) { // Increment the number of switches. atomic.AddUint32(&c.switches, 1) - // Store vCPUGuest. - // - // This is fine even if we're not in guest mode yet. In this signal - // handler, we'll already have all the relevant signals blocked, so an - // interrupt is only deliverable when we actually execute the KVM_RUN. - // - // The state will be returned to vCPUReady by Phase2. - if state := atomic.SwapUintptr(&c.state, vCPUGuest); state != vCPUReady { - throw("vCPU not in ready state") + // Mark this as guest mode. + switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) { + case vCPUUser: // Expected case. + case vCPUUser | vCPUWaiter: + c.notify() + default: + throw("invalid state") } for { @@ -118,11 +116,12 @@ func bluepillHandler(context unsafe.Pointer) { // Copy out registers. bluepillArchExit(c, bluepillArchContext(context)) - // Notify any waiters. - switch state := atomic.SwapUintptr(&c.state, vCPUReady); state { - case vCPUGuest: - case vCPUWaiter: - c.notify() // Safe from handler. + // Return to the vCPUReady state; notify any waiters. + user := atomic.LoadUint32(&c.state) & vCPUUser + switch atomic.SwapUint32(&c.state, user) { + case user | vCPUGuest: // Expected case. 
+ case user | vCPUGuest | vCPUWaiter: + c.notify() default: throw("invalid state") } diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 61cfdd8fd..778a6d187 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -17,6 +17,7 @@ package kvm import ( "math/rand" "reflect" + "sync/atomic" "syscall" "testing" "time" @@ -84,7 +85,7 @@ func bluepillTest(t testHarness, fn func(*vCPU)) { func TestKernelSyscall(t *testing.T) { bluepillTest(t, func(c *vCPU) { redpill() // Leave guest mode. - if got := c.State(); got != vCPUReady { + if got := atomic.LoadUint32(&c.state); got != vCPUUser { t.Errorf("vCPU not in ready state: got %v", got) } }) @@ -102,7 +103,7 @@ func TestKernelFault(t *testing.T) { hostFault() // Ensure recovery works. bluepillTest(t, func(c *vCPU) { hostFault() - if got := c.State(); got != vCPUReady { + if got := atomic.LoadUint32(&c.state); got != vCPUUser { t.Errorf("vCPU not in ready state: got %v", got) } }) @@ -229,7 +230,7 @@ func TestBounce(t *testing.T) { applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) - c.Bounce() + c.BounceToKernel() }() if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) @@ -239,7 +240,7 @@ func TestBounce(t *testing.T) { applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) - c.Bounce() + c.BounceToKernel() }() if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextInterrupt { t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt) @@ -264,17 +265,15 @@ func TestBounceStress(t *testing.T) { // kernel is in various stages of 
the switch. go func() { randomSleep() - c.Bounce() + c.BounceToKernel() }() randomSleep() - // Execute the switch. if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } - // Simulate work. - c.Unlock() + c.unlock() randomSleep() - c.Lock() + c.lock() } return false }) @@ -289,8 +288,7 @@ func TestInvalidate(t *testing.T) { } // Unmap the page containing data & invalidate. pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize) - c.Invalidate() // Ensure invalidation. - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFlush); err != platform.ErrContextSignal { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) } return false diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index a5be0cee3..7a962e316 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -21,11 +21,11 @@ import ( "sync/atomic" "syscall" + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/tmutex" ) // machine contains state associated with the VM as a whole. @@ -57,20 +57,19 @@ type machine struct { } const ( - // vCPUReady is the lock value for an available vCPU. - // - // Legal transitions: vCPUGuest (bluepill). - vCPUReady uintptr = iota + // vCPUReady is an alias for all the below clear. + vCPUReady uint32 = 0 + + // vCPUser indicates that the vCPU is in or about to enter user mode. 
+ vCPUUser uint32 = 1 << 0 // vCPUGuest indicates the vCPU is in guest mode. - // - // Legal transition: vCPUReady (bluepill), vCPUWaiter (wait). - vCPUGuest + vCPUGuest uint32 = 1 << 1 - // vCPUWaiter indicates that the vCPU should be released. + // vCPUWaiter indicates that there is a waiter. // - // Legal transition: vCPUReady (bluepill). - vCPUWaiter + // If this is set, then notify must be called on any state transitions. + vCPUWaiter uint32 = 1 << 2 ) // vCPU is a single KVM vCPU. @@ -93,17 +92,16 @@ type vCPU struct { // faults is a count of world faults (informational only). faults uint32 - // state is the vCPU state; all are described above. - state uintptr + // state is the vCPU state. + // + // This is a bitmask of the three fields (vCPU*) described above. + state uint32 // runData for this vCPU. runData *runData // machine associated with this vCPU. machine *machine - - // mu applies across get/put; it does not protect the above. - mu tmutex.Mutex } // newMachine returns a new VM context. @@ -145,7 +143,6 @@ func newMachine(vm int, vCPUs int) (*machine, error) { fd: int(fd), machine: m, } - c.mu.Init() c.CPU.Init(m.kernel) c.CPU.KernelSyscall = bluepillSyscall c.CPU.KernelException = bluepillException @@ -253,27 +250,17 @@ func (m *machine) Destroy() { // Ensure the vCPU is not still running in guest mode. This is // possible iff teardown has been done by other threads, and // somehow a single thread has not executed any system calls. - c.wait() - - // Teardown the vCPU itself. - switch state := c.State(); state { - case vCPUReady: - // Note that the runData may not be mapped if an error - // occurs during the middle of initialization. 
- if c.runData != nil { - if err := unmapRunData(c.runData); err != nil { - panic(fmt.Sprintf("error unmapping rundata: %v", err)) - } - } - if err := syscall.Close(int(c.fd)); err != nil { - panic(fmt.Sprintf("error closing vCPU fd: %v", err)) + c.BounceToHost() + + // Note that the runData may not be mapped if an error occurs + // during the middle of initialization. + if c.runData != nil { + if err := unmapRunData(c.runData); err != nil { + panic(fmt.Sprintf("error unmapping rundata: %v", err)) } - case vCPUGuest, vCPUWaiter: - // Should never happen; waited above. - panic("vCPU disposed in guest state") - default: - // Should never happen; not a valid state. - panic(fmt.Sprintf("vCPU in invalid state: %v", state)) + } + if err := syscall.Close(int(c.fd)); err != nil { + panic(fmt.Sprintf("error closing vCPU fd: %v", err)) } } @@ -296,14 +283,19 @@ func (m *machine) Get() (*vCPU, error) { for { // Check for an exact match. - if c := m.vCPUs[tid]; c != nil && c.mu.TryLock() { + if c := m.vCPUs[tid]; c != nil { + c.lock() m.mu.Unlock() return c, nil } // Scan for an available vCPU. for origTID, c := range m.vCPUs { - if c.LockInState(vCPUReady) { + // We can only steal a vCPU that is the vCPUReady + // state. That is, it must not be heading to user mode + // with some other thread, have a waiter registered, or + // be in guest mode already. + if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { delete(m.vCPUs, origTID) m.vCPUs[tid] = c m.mu.Unlock() @@ -317,96 +309,151 @@ func (m *machine) Get() (*vCPU, error) { } } - // Everything is busy executing user code (locked). + // Everything is already in guest mode. // - // We hold the pool lock here, so we should be able to kick something - // out of kernel mode and have it bounce into host mode when it tries - // to grab the vCPU again. + // We hold the pool lock here, so we should be able to kick + // something out of kernel mode and have it bounce into host + // mode when it tries to grab the vCPU again. 
for _, c := range m.vCPUs { - if c.State() != vCPUWaiter { - c.Bounce() - } + c.BounceToHost() } - // Give other threads an opportunity to run. + // Give other threads an opportunity to run. We don't yield the + // pool lock above, so if they try to regrab the lock we will + // serialize at this point. This is extreme, but we don't + // expect to exhaust all vCPUs frequently. yield() } } // Put puts the current vCPU. func (m *machine) Put(c *vCPU) { - c.Unlock() + c.unlock() runtime.UnlockOSThread() } -// State returns the current state. -func (c *vCPU) State() uintptr { - return atomic.LoadUintptr(&c.state) -} - -// Lock locks the vCPU. -func (c *vCPU) Lock() { - c.mu.Lock() -} - -// Invalidate invalidates caches. -func (c *vCPU) Invalidate() { +// lock marks the vCPU as in user mode. +// +// This should only be called directly when known to be safe, i.e. when +// the vCPU is owned by the current TID with no chance of theft. +// +//go:nosplit +func (c *vCPU) lock() { + atomicbitops.OrUint32(&c.state, vCPUUser) } -// LockInState locks the vCPU if it is in the given state and TryLock succeeds. -func (c *vCPU) LockInState(state uintptr) bool { - if c.State() == state && c.mu.TryLock() { - if c.State() != state { - c.mu.Unlock() - return false - } - return true +// unlock clears the vCPUUser bit. +// +//go:nosplit +func (c *vCPU) unlock() { + if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) { + // Happy path: no exits are forced, and we can continue + // executing on our merry way with a single atomic access. + return } - return false -} -// Unlock unlocks the given vCPU. -func (c *vCPU) Unlock() { - // Ensure we're out of guest mode, if necessary. - if c.State() == vCPUWaiter { - redpill() // Force guest mode exit. + // Clear the lock. + origState := atomic.LoadUint32(&c.state) + atomicbitops.AndUint32(&c.state, ^vCPUUser) + switch origState { + case vCPUUser: + // Normal state. 
+ case vCPUUser | vCPUGuest | vCPUWaiter: + // Force a transition: this must trigger a notification when we + // return from guest mode. + redpill() + case vCPUUser | vCPUWaiter: + // Waiting for the lock to be released; the responsibility is + // on us to notify the waiter and clear the associated bit. + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) + c.notify() + default: + panic("invalid state") } - c.mu.Unlock() } // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. +// +//go:nosplit func (c *vCPU) NotifyInterrupt() { - c.Bounce() + c.BounceToKernel() } // pid is used below in bounce. var pid = syscall.Getpid() -// Bounce ensures that the vCPU bounces back to the kernel. +// bounce forces a return to the kernel or to host mode. // -// In practice, this means returning EAGAIN from running user code. The vCPU -// will be unlocked and relock, and the kernel is guaranteed to check for -// interrupt notifications (e.g. injected via Notify) and invalidations. -func (c *vCPU) Bounce() { +// This effectively unwinds the state machine. +func (c *vCPU) bounce(forceGuestExit bool) { for { - if c.mu.TryLock() { - // We know that the vCPU must be in the kernel already, - // because the lock was not acquired. We specifically - // don't want to call bounce in this case, because it's - // not necessary to knock the vCPU out of guest mode. - c.mu.Unlock() + switch state := atomic.LoadUint32(&c.state); state { + case vCPUReady, vCPUWaiter: + // There is nothing to be done, we're already in the + // kernel pre-acquisition. The Bounce criteria have + // been satisfied. return + case vCPUUser: + // We need to register a waiter for the actual guest + // transition. When the transition takes place, then we + // can inject an interrupt to ensure a return to host + // mode. + atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) + case vCPUUser | vCPUWaiter: + // Wait for the transition to guest mode. This should + // come from the bluepill handler. 
+ c.waitUntilNot(state) + case vCPUGuest, vCPUUser | vCPUGuest: + if state == vCPUGuest && !forceGuestExit { + // The vCPU is already not acquired, so there's + // no need to do a fresh injection here. + return + } + // The vCPU is in user or kernel mode. Attempt to + // register a notification on change. + if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) { + break // Retry. + } + for { + // We need to spin here until the signal is + // delivered, because Tgkill can return EAGAIN + // under memory pressure. Since we already + // marked ourselves as a waiter, we need to + // ensure that a signal is actually delivered. + if err := syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil { + break + } else if err.(syscall.Errno) == syscall.EAGAIN { + continue + } else { + // Nothing else should be returned by tgkill. + panic(fmt.Sprintf("unexpected tgkill error: %v", err)) + } + } + case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: + if state == vCPUGuest|vCPUWaiter && !forceGuestExit { + // See above. + return + } + // Wait for the transition. This again should happen + // from the bluepill handler, but on the way out. + c.waitUntilNot(state) + default: + // Should not happen: the above is exhaustive. + panic("invalid state") } + } +} - if state := c.State(); state == vCPUGuest || state == vCPUWaiter { - // We know that the vCPU was in guest mode, so a single signal - // interruption will guarantee that a transition takes place. - syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal) - return - } +// BounceToKernel ensures that the vCPU bounces back to the kernel. +// +//go:nosplit +func (c *vCPU) BounceToKernel() { + c.bounce(false) +} - // Someone holds the lock, but the vCPU is not yet transitioned - // into guest mode. It's in the critical section; give it time. - yield() - } +// BounceToHost ensures that the vCPU is in host mode. 
+// +//go:nosplit +func (c *vCPU) BounceToHost() { + c.bounce(true) } diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index fe4d31702..4e42f2c87 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -158,7 +158,6 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab return info, usermem.AccessType{}, platform.ErrContextSignal case ring0.Vector(bounce): - redpill() // Bail and reacqire. return nil, usermem.NoAccess, platform.ErrContextInterrupt default: diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index da67e23f6..9f7fcd135 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -16,7 +16,6 @@ package kvm import ( "fmt" - "sync/atomic" "syscall" "unsafe" @@ -69,7 +68,7 @@ func unmapRunData(r *runData) error { return nil } -// notify notifies that the vCPU has returned to host mode. +// notify notifies that the vCPU has transitioned modes. // // This may be called by a signal handler and therefore throws on error. // @@ -86,27 +85,20 @@ func (c *vCPU) notify() { } } -// wait waits for the vCPU to return to host mode. +// waitUntilNot waits for the vCPU to transition modes. +// +// The state should have been previously set to vCPUWaiter after performing an +// appropriate action to cause a transition (e.g. interrupt injection). // // This panics on error. -func (c *vCPU) wait() { - if !atomic.CompareAndSwapUintptr(&c.state, vCPUGuest, vCPUWaiter) { - return // Nothing to wait for. - } - for { - _, _, errno := syscall.Syscall6( - syscall.SYS_FUTEX, - uintptr(unsafe.Pointer(&c.state)), - linux.FUTEX_WAIT, - uintptr(vCPUWaiter), // Expected value. 
- 0, 0, 0) - if errno == syscall.EINTR { - continue - } else if errno == syscall.EAGAIN { - break - } else if errno != 0 { - panic("futex wait error") - } - break +func (c *vCPU) waitUntilNot(state uint32) { + _, _, errno := syscall.Syscall6( + syscall.SYS_FUTEX, + uintptr(unsafe.Pointer(&c.state)), + linux.FUTEX_WAIT, + uintptr(state), + 0, 0, 0) + if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN { + panic("futex wait error") } } -- cgit v1.2.3 From 00adea3a3f0f3501809901bdac1a01c543d5e116 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 15 May 2018 22:20:36 -0700 Subject: Simplify KVM invalidation logic. PiperOrigin-RevId: 196780209 Change-Id: I89f39eec914ce54a7c6c4f28e1b6d5ff5a7dd38d --- pkg/sentry/platform/kvm/BUILD | 1 + pkg/sentry/platform/kvm/address_space.go | 34 +++++++++---------- pkg/sentry/platform/kvm/address_space_unsafe.go | 44 +++++++++++++++++++++++++ pkg/sentry/platform/kvm/context.go | 17 +++++++--- pkg/sentry/platform/kvm/kvm.go | 1 + pkg/sentry/platform/kvm/machine.go | 8 +++++ pkg/sentry/platform/kvm/machine_unsafe.go | 23 +++++++++++++ 7 files changed, 106 insertions(+), 22 deletions(-) create mode 100644 pkg/sentry/platform/kvm/address_space_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index adc43c21b..004938080 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -27,6 +27,7 @@ go_library( name = "kvm", srcs = [ "address_space.go", + "address_space_unsafe.go", "bluepill.go", "bluepill_amd64.go", "bluepill_amd64.s", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 3d57ae0cb..e81cc0caf 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -16,8 +16,6 @@ package kvm import ( "reflect" - "sync" - "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" @@ 
-40,10 +38,10 @@ type addressSpace struct { // dirtySet is the set of dirty vCPUs. // - // The key is the vCPU, the value is a shared uint32 pointer that - // indicates whether or not the context is clean. A zero here indicates - // that the context should be cleaned prior to re-entry. - dirtySet sync.Map + // These are actually vCPU pointers that are stored iff the vCPU is + // dirty. If the vCPU is not dirty and requires invalidation, then a + // nil value is stored here instead. + dirtySet dirtySet // files contains files mapped in the host address space. // @@ -53,22 +51,22 @@ type addressSpace struct { // Invalidate interrupts all dirty contexts. func (as *addressSpace) Invalidate() { - as.dirtySet.Range(func(key, value interface{}) bool { - c := key.(*vCPU) - v := value.(*uint32) - atomic.StoreUint32(v, 0) // Invalidation required. - c.BounceToKernel() // Force a kernel transition. - return true // Keep iterating. - }) + for i := 0; i < as.dirtySet.size(); i++ { + if c := as.dirtySet.swap(i, nil); c != nil && c.active.get() == as { + c.BounceToKernel() // Force a kernel transition. + } + } } // Touch adds the given vCPU to the dirty list. -func (as *addressSpace) Touch(c *vCPU) *uint32 { - value, ok := as.dirtySet.Load(c) - if !ok { - value, _ = as.dirtySet.LoadOrStore(c, new(uint32)) +// +// The return value indicates whether a flush is required. +func (as *addressSpace) Touch(c *vCPU) bool { + if old := as.dirtySet.swap(c.id, c); old == nil { + return true // Flush is required. } - return value.(*uint32) + // Already dirty: no flush required. + return false } func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { diff --git a/pkg/sentry/platform/kvm/address_space_unsafe.go b/pkg/sentry/platform/kvm/address_space_unsafe.go new file mode 100644 index 000000000..b6c31ce10 --- /dev/null +++ b/pkg/sentry/platform/kvm/address_space_unsafe.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "sync/atomic" + "unsafe" +) + +// dirtySet tracks vCPUs for invalidation. +type dirtySet struct { + vCPUs []unsafe.Pointer +} + +// makeDirtySet makes a new dirtySet. +func makeDirtySet(size int) dirtySet { + return dirtySet{ + vCPUs: make([]unsafe.Pointer, size), + } +} + +// size is the size of the set. +func (ds *dirtySet) size() int { + return len(ds.vCPUs) +} + +// swap sets the given index and returns the previous value. +// +// The index is typically the id for a non-nil vCPU. +func (ds *dirtySet) swap(index int, c *vCPU) *vCPU { + return (*vCPU)(atomic.SwapPointer(&ds.vCPUs[index], unsafe.Pointer(c))) +} diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index fd04a2c47..c9bfbc136 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -15,8 +15,6 @@ package kvm import ( - "sync/atomic" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt" @@ -54,10 +52,18 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a return nil, usermem.NoAccess, platform.ErrContextInterrupt } + // Set the active address space. + // + // This must be done prior to the call to Touch below. 
If the address + // space is invalidated between this line and the call below, we will + // flag on entry anyways. When the active address space below is + // cleared, it indicates that we don't need an explicit interrupt and + // that the flush can occur naturally on the next user entry. + cpu.active.set(localAS) + // Mark the address space as dirty. flags := ring0.Flags(0) - dirty := localAS.Touch(cpu) - if v := atomic.SwapUint32(dirty, 1); v == 0 { + if localAS.Touch(cpu) { flags |= ring0.FlagFlush } if ac.FullRestore() { @@ -67,6 +73,9 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a // Take the blue pill. si, at, err := cpu.SwitchToUser(regs, fp, localAS.pageTables, flags) + // Clear the address space. + cpu.active.set(nil) + // Release resources. c.machine.Put(cpu) diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 31928c9f0..15a241f01 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -133,6 +133,7 @@ func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan stru filemem: k.FileMem, machine: k.machine, pageTables: pageTables, + dirtySet: makeDirtySet(len(k.machine.vCPUs)), }, nil, nil } diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 7a962e316..3ee21fe21 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -80,6 +80,9 @@ type vCPU struct { // by the bluepill code (see bluepill_amd64.s). ring0.CPU + // id is the vCPU id. + id int + // fd is the vCPU fd. fd int @@ -102,6 +105,10 @@ type vCPU struct { // machine associated with this vCPU. machine *machine + + // active is the current addressSpace: this is set and read atomically, + // it is used to elide unnecessary interrupts due to invalidations. + active atomicAddressSpace } // newMachine returns a new VM context. 
@@ -140,6 +147,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) { return nil, fmt.Errorf("error creating VCPU: %v", errno) } c := &vCPU{ + id: id, fd: int(fd), machine: m, } diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 9f7fcd135..516098a2b 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -16,6 +16,7 @@ package kvm import ( "fmt" + "sync/atomic" "syscall" "unsafe" @@ -68,6 +69,28 @@ func unmapRunData(r *runData) error { return nil } +// atomicAddressSpace is an atomic address space pointer. +type atomicAddressSpace struct { + pointer unsafe.Pointer +} + +// set sets the address space value. +// +//go:nosplit +func (a *atomicAddressSpace) set(as *addressSpace) { + atomic.StorePointer(&a.pointer, unsafe.Pointer(as)) +} + +// get gets the address space value. +// +// Note that this should be considered best-effort, and may have changed by the +// time this function returns. +// +//go:nosplit +func (a *atomicAddressSpace) get() *addressSpace { + return (*addressSpace)(atomic.LoadPointer(&a.pointer)) +} + // notify notifies that the vCPU has transitioned modes. // // This may be called by a signal handler and therefore throws on error. -- cgit v1.2.3 From 4b7e4f3d3612dde08a37a040d5be92c37cd0ee57 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 15 May 2018 22:43:52 -0700 Subject: Fix KVM EFAULT handling. 
PiperOrigin-RevId: 196781718 Change-Id: I889766eed871929cdc247c6b9aa634398adea9c9 --- pkg/sentry/platform/kvm/bluepill_unsafe.go | 20 +++++++++-- pkg/sentry/platform/kvm/kvm_const.go | 1 + pkg/sentry/platform/kvm/machine_amd64.go | 53 +++++++++++++++++++----------- 3 files changed, 52 insertions(+), 22 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 2c1e098d7..216d4b4b6 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -61,8 +61,9 @@ func bluepillHandler(context unsafe.Pointer) { } for { - _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) - if errno == syscall.EINTR { + switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno { + case 0: // Expected case. + case syscall.EINTR: // First, we process whatever pending signal // interrupted KVM. Since we're in a signal handler // currently, all signals are masked and the signal @@ -93,7 +94,20 @@ func bluepillHandler(context unsafe.Pointer) { // Force injection below; the vCPU is ready. c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN } - } else if errno != 0 { + case syscall.EFAULT: + // If a fault is not serviceable due to the host + // backing pages having page permissions, instead of an + // MMIO exit we receive EFAULT from the run ioctl. We + // always inject an NMI here since we may be in kernel + // mode and have interrupts disabled. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_NMI, 0); errno != 0 { + throw("NMI injection failed") + } + continue // Rerun vCPU. 
+ default: throw("run failed") } diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 0ec6a4a00..c819fd16f 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -24,6 +24,7 @@ const ( _KVM_CREATE_VCPU = 0xae41 _KVM_SET_TSS_ADDR = 0xae47 _KVM_RUN = 0xae80 + _KVM_NMI = 0xae9a _KVM_INTERRUPT = 0x4004ae86 _KVM_SET_MSRS = 0x4008ae89 _KVM_SET_USER_MEMORY_REGION = 0x4020ae46 diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 4e42f2c87..f583f68f7 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -97,6 +97,29 @@ func (c *vCPU) initArchState() error { return c.setSystemTime() } +// fault generates an appropriate fault return. +// +//go:nosplit +func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) { + bluepill(c) // Probably no-op, but may not be. + faultAddr := ring0.ReadCR2() + code, user := c.ErrorCode() + if !user { + // The last fault serviced by this CPU was not a user + // fault, so we can't reliably trust the faultAddr or + // the code provided here. We need to re-execute. + return nil, usermem.NoAccess, platform.ErrContextInterrupt + } + info := &arch.SignalInfo{Signo: signal} + info.SetAddr(uint64(faultAddr)) + accessType := usermem.AccessType{ + Read: code&(1<<1) == 0, + Write: code&(1<<1) != 0, + Execute: code&(1<<4) != 0, + } + return info, accessType, platform.ErrContextSignal +} + // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags ring0.Flags) (*arch.SignalInfo, usermem.AccessType, error) { // See below. @@ -116,29 +139,13 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab // Fast path: system call executed. 
return nil, usermem.NoAccess, nil + case ring0.PageFault: + return c.fault(int32(syscall.SIGSEGV)) + case ring0.Debug, ring0.Breakpoint: info := &arch.SignalInfo{Signo: int32(syscall.SIGTRAP)} return info, usermem.AccessType{}, platform.ErrContextSignal - case ring0.PageFault: - bluepill(c) // Probably no-op, but may not be. - faultAddr := ring0.ReadCR2() - code, user := c.ErrorCode() - if !user { - // The last fault serviced by this CPU was not a user - // fault, so we can't reliably trust the faultAddr or - // the code provided here. We need to re-execute. - return nil, usermem.NoAccess, platform.ErrContextInterrupt - } - info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)} - info.SetAddr(uint64(faultAddr)) - accessType := usermem.AccessType{ - Read: code&(1<<1) == 0, - Write: code&(1<<1) != 0, - Execute: code&(1<<4) != 0, - } - return info, accessType, platform.ErrContextSignal - case ring0.GeneralProtectionFault: if !ring0.IsCanonical(regs.Rip) { // If the RIP is non-canonical, it's a SEGV. @@ -160,6 +167,14 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab case ring0.Vector(bounce): return nil, usermem.NoAccess, platform.ErrContextInterrupt + case ring0.NMI: + // An NMI is generated only when a fault is not servicable by + // KVM itself, so we think some mapping is writeable but it's + // really not. This could happen, e.g. if some file is + // truncated (and would generate a SIGBUS) and we map it + // directly into the instance. + return c.fault(int32(syscall.SIGBUS)) + default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } -- cgit v1.2.3 From f295e26b8abe395eaf1d4bee9a792a79b34d156f Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Wed, 16 May 2018 13:06:23 -0700 Subject: Release mutex in BidirectionalConnect to avoid deadlock. When doing a BidirectionalConnect we don't need to continue holding the ConnectingEndpoint's mutex when creating the NewConnectedEndpoint as it was held during the Connect. 
Additionally, we're not holding the baseEndpoint mutex while Unregistering an event. PiperOrigin-RevId: 196875557 Change-Id: Ied4ceed89de883121c6cba81bc62aa3a8549b1e9 --- pkg/sentry/fs/gofer/socket.go | 20 ++++++++---- pkg/sentry/fs/host/socket.go | 67 +++++++++++++++++++++------------------ pkg/sentry/fs/host/socket_test.go | 64 ++++++++++++++++++------------------- pkg/tcpip/transport/unix/unix.go | 4 +-- 4 files changed, 85 insertions(+), 70 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 954000ef0..406756f5f 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -79,26 +79,33 @@ func (e *endpoint) BidirectionalConnect(ce unix.ConnectingEndpoint, returnConnec // No lock ordering required as only the ConnectingEndpoint has a mutex. ce.Lock() - defer ce.Unlock() // Check connecting state. if ce.Connected() { + ce.Unlock() return tcpip.ErrAlreadyConnected } if ce.Listening() { + ce.Unlock() return tcpip.ErrInvalidEndpointState } hostFile, err := e.file.Connect(cf) if err != nil { + ce.Unlock() return tcpip.ErrConnectionRefused } - r, c, terr := host.NewConnectedEndpoint(hostFile, ce.WaiterQueue(), e.path) + c, terr := host.NewConnectedEndpoint(hostFile, ce.WaiterQueue(), e.path) if terr != nil { + ce.Unlock() return terr } - returnConnect(r, c) + + returnConnect(c, c) + ce.Unlock() + c.Init() + return nil } @@ -109,14 +116,15 @@ func (e *endpoint) UnidirectionalConnect() (unix.ConnectedEndpoint, *tcpip.Error return nil, tcpip.ErrConnectionRefused } - r, c, terr := host.NewConnectedEndpoint(hostFile, &waiter.Queue{}, e.path) + c, terr := host.NewConnectedEndpoint(hostFile, &waiter.Queue{}, e.path) if terr != nil { return nil, terr } + c.Init() // We don't need the receiver. 
- r.CloseRecv() - r.Release() + c.CloseRecv() + c.Release() return c, nil } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 467633052..f4689f51f 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -286,26 +286,33 @@ func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.Fu return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil } -// NewConnectedEndpoint creates a new unix.Receiver and unix.ConnectedEndpoint -// backed by a host FD that will pretend to be bound at a given sentry path. -func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (unix.Receiver, unix.ConnectedEndpoint, *tcpip.Error) { - if err := fdnotifier.AddFD(int32(file.FD()), queue); err != nil { - return nil, nil, translateError(err) - } - - e := &connectedEndpoint{path: path, queue: queue, file: file} +// NewConnectedEndpoint creates a new ConnectedEndpoint backed by +// a host FD that will pretend to be bound at a given sentry path. +// +// The caller is responsible for calling Init(). Additionaly, Release needs +// to be called twice because host.ConnectedEndpoint is both a +// unix.Receiver and unix.ConnectedEndpoint. +func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *tcpip.Error) { + e := &ConnectedEndpoint{path: path, queue: queue, file: file} // AtomicRefCounters start off with a single reference. We need two. e.ref.IncRef() - return e, e, nil + return e, nil +} + +// Init will do initialization required without holding other locks. +func (c *ConnectedEndpoint) Init() { + if err := fdnotifier.AddFD(int32(c.file.FD()), c.queue); err != nil { + panic(err) + } } -// connectedEndpoint is a host FD backed implementation of +// ConnectedEndpoint is a host FD backed implementation of // unix.ConnectedEndpoint and unix.Receiver. // -// connectedEndpoint does not support save/restore for now. 
-type connectedEndpoint struct { +// ConnectedEndpoint does not support save/restore for now. +type ConnectedEndpoint struct { queue *waiter.Queue path string @@ -328,7 +335,7 @@ type connectedEndpoint struct { } // Send implements unix.ConnectedEndpoint.Send. -func (c *connectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { +func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { c.mu.RLock() defer c.mu.RUnlock() if c.writeClosed { @@ -341,20 +348,20 @@ func (c *connectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMess } // SendNotify implements unix.ConnectedEndpoint.SendNotify. -func (c *connectedEndpoint) SendNotify() {} +func (c *ConnectedEndpoint) SendNotify() {} // CloseSend implements unix.ConnectedEndpoint.CloseSend. -func (c *connectedEndpoint) CloseSend() { +func (c *ConnectedEndpoint) CloseSend() { c.mu.Lock() c.writeClosed = true c.mu.Unlock() } // CloseNotify implements unix.ConnectedEndpoint.CloseNotify. -func (c *connectedEndpoint) CloseNotify() {} +func (c *ConnectedEndpoint) CloseNotify() {} // Writable implements unix.ConnectedEndpoint.Writable. -func (c *connectedEndpoint) Writable() bool { +func (c *ConnectedEndpoint) Writable() bool { c.mu.RLock() defer c.mu.RUnlock() if c.writeClosed { @@ -364,18 +371,18 @@ func (c *connectedEndpoint) Writable() bool { } // Passcred implements unix.ConnectedEndpoint.Passcred. -func (c *connectedEndpoint) Passcred() bool { +func (c *ConnectedEndpoint) Passcred() bool { // We don't support credential passing for host sockets. return false } // GetLocalAddress implements unix.ConnectedEndpoint.GetLocalAddress. 
-func (c *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { +func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil } // EventUpdate implements unix.ConnectedEndpoint.EventUpdate. -func (c *connectedEndpoint) EventUpdate() { +func (c *ConnectedEndpoint) EventUpdate() { c.mu.RLock() defer c.mu.RUnlock() if c.file.FD() != -1 { @@ -384,7 +391,7 @@ func (c *connectedEndpoint) EventUpdate() { } // Recv implements unix.Receiver.Recv. -func (c *connectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, unix.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { +func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, unix.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { c.mu.RLock() defer c.mu.RUnlock() if c.readClosed { @@ -397,24 +404,24 @@ func (c *connectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p } // close releases all resources related to the endpoint. -func (c *connectedEndpoint) close() { +func (c *ConnectedEndpoint) close() { fdnotifier.RemoveFD(int32(c.file.FD())) c.file.Close() c.file = nil } // RecvNotify implements unix.Receiver.RecvNotify. -func (c *connectedEndpoint) RecvNotify() {} +func (c *ConnectedEndpoint) RecvNotify() {} // CloseRecv implements unix.Receiver.CloseRecv. -func (c *connectedEndpoint) CloseRecv() { +func (c *ConnectedEndpoint) CloseRecv() { c.mu.Lock() c.readClosed = true c.mu.Unlock() } // Readable implements unix.Receiver.Readable. -func (c *connectedEndpoint) Readable() bool { +func (c *ConnectedEndpoint) Readable() bool { c.mu.RLock() defer c.mu.RUnlock() if c.readClosed { @@ -424,21 +431,21 @@ func (c *connectedEndpoint) Readable() bool { } // SendQueuedSize implements unix.Receiver.SendQueuedSize. 
-func (c *connectedEndpoint) SendQueuedSize() int64 { +func (c *ConnectedEndpoint) SendQueuedSize() int64 { // SendQueuedSize isn't supported for host sockets because we don't allow the // sentry to call ioctl(2). return -1 } // RecvQueuedSize implements unix.Receiver.RecvQueuedSize. -func (c *connectedEndpoint) RecvQueuedSize() int64 { +func (c *ConnectedEndpoint) RecvQueuedSize() int64 { // RecvQueuedSize isn't supported for host sockets because we don't allow the // sentry to call ioctl(2). return -1 } // SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize. -func (c *connectedEndpoint) SendMaxQueueSize() int64 { +func (c *ConnectedEndpoint) SendMaxQueueSize() int64 { v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) if err != nil { return -1 @@ -447,7 +454,7 @@ func (c *connectedEndpoint) SendMaxQueueSize() int64 { } // RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize. -func (c *connectedEndpoint) RecvMaxQueueSize() int64 { +func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_RCVBUF) if err != nil { return -1 @@ -456,7 +463,7 @@ func (c *connectedEndpoint) RecvMaxQueueSize() int64 { } // Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release. -func (c *connectedEndpoint) Release() { +func (c *ConnectedEndpoint) Release() { c.ref.DecRefWithDestructor(c.close) } diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 9b73c5173..8b752737d 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -31,11 +31,11 @@ import ( ) var ( - // Make sure that connectedEndpoint implements unix.ConnectedEndpoint. - _ = unix.ConnectedEndpoint(new(connectedEndpoint)) + // Make sure that ConnectedEndpoint implements unix.ConnectedEndpoint. + _ = unix.ConnectedEndpoint(new(ConnectedEndpoint)) - // Make sure that connectedEndpoint implements unix.Receiver. 
- _ = unix.Receiver(new(connectedEndpoint)) + // Make sure that ConnectedEndpoint implements unix.Receiver. + _ = unix.Receiver(new(ConnectedEndpoint)) ) func getFl(fd int) (uint32, error) { @@ -198,28 +198,28 @@ func TestListen(t *testing.T) { } func TestSend(t *testing.T) { - e := connectedEndpoint{writeClosed: true} + e := ConnectedEndpoint{writeClosed: true} if _, _, err := e.Send(nil, unix.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend { t.Errorf("Got %#v.Send() = %v, want = %v", e, err, tcpip.ErrClosedForSend) } } func TestRecv(t *testing.T) { - e := connectedEndpoint{readClosed: true} + e := ConnectedEndpoint{readClosed: true} if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != tcpip.ErrClosedForReceive { t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, tcpip.ErrClosedForReceive) } } func TestPasscred(t *testing.T) { - e := connectedEndpoint{} + e := ConnectedEndpoint{} if got, want := e.Passcred(), false; got != want { t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want) } } func TestGetLocalAddress(t *testing.T) { - e := connectedEndpoint{path: "foo"} + e := ConnectedEndpoint{path: "foo"} want := tcpip.FullAddress{Addr: tcpip.Address("foo")} if got, err := e.GetLocalAddress(); err != nil || got != want { t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil) @@ -227,7 +227,7 @@ func TestGetLocalAddress(t *testing.T) { } func TestQueuedSize(t *testing.T) { - e := connectedEndpoint{} + e := ConnectedEndpoint{} tests := []struct { name string f func() int64 @@ -244,14 +244,14 @@ func TestQueuedSize(t *testing.T) { } func TestReadable(t *testing.T) { - e := connectedEndpoint{readClosed: true} + e := ConnectedEndpoint{readClosed: true} if got, want := e.Readable(), true; got != want { t.Errorf("Got %#v.Readable() = %t, want = %t", e, got, want) } } func TestWritable(t *testing.T) { - e := connectedEndpoint{writeClosed: true} + e := ConnectedEndpoint{writeClosed: true} if got, want := 
e.Writable(), true; got != want { t.Errorf("Got %#v.Writable() = %t, want = %t", e, got, want) } @@ -262,8 +262,8 @@ func TestRelease(t *testing.T) { if err != nil { t.Fatal("Creating socket:", err) } - c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} - want := &connectedEndpoint{queue: c.queue} + c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + want := &ConnectedEndpoint{queue: c.queue} want.ref.DecRef() fdnotifier.AddFD(int32(c.file.FD()), nil) c.Release() @@ -275,119 +275,119 @@ func TestRelease(t *testing.T) { func TestClose(t *testing.T) { type testCase struct { name string - cep *connectedEndpoint + cep *ConnectedEndpoint addFD bool f func() - want *connectedEndpoint + want *ConnectedEndpoint } var tests []testCase - // nil is the value used by connectedEndpoint to indicate a closed file. + // nil is the value used by ConnectedEndpoint to indicate a closed file. // Non-nil files are used to check if the file gets closed. f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { t.Fatal("Creating socket:", err) } - c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} tests = append(tests, testCase{ name: "First CloseRecv", cep: c, addFD: false, f: c.CloseRecv, - want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, + want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, }) f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { t.Fatal("Creating socket:", err) } - c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} + c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} tests = append(tests, testCase{ name: "Second CloseRecv", cep: c, addFD: false, f: c.CloseRecv, - want: &connectedEndpoint{queue: c.queue, file: c.file, 
readClosed: true}, + want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, }) f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { t.Fatal("Creating socket:", err) } - c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} tests = append(tests, testCase{ name: "First CloseSend", cep: c, addFD: false, f: c.CloseSend, - want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, + want: &ConnectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, }) f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { t.Fatal("Creating socket:", err) } - c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} + c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} tests = append(tests, testCase{ name: "Second CloseSend", cep: c, addFD: false, f: c.CloseSend, - want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, + want: &ConnectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, }) f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { t.Fatal("Creating socket:", err) } - c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} + c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} tests = append(tests, testCase{ name: "CloseSend then CloseRecv", cep: c, addFD: true, f: c.CloseRecv, - want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, }) f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { t.Fatal("Creating socket:", 
err) } - c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} + c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} tests = append(tests, testCase{ name: "CloseRecv then CloseSend", cep: c, addFD: true, f: c.CloseSend, - want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, }) f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { t.Fatal("Creating socket:", err) } - c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} + c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} tests = append(tests, testCase{ name: "Full close then CloseRecv", cep: c, addFD: false, f: c.CloseRecv, - want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, }) f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { t.Fatal("Creating socket:", err) } - c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} + c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} tests = append(tests, testCase{ name: "Full close then CloseSend", cep: c, addFD: false, f: c.CloseSend, - want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, }) for _, test := range tests { diff --git a/pkg/tcpip/transport/unix/unix.go b/pkg/tcpip/transport/unix/unix.go index 5fe37eb71..72c21a432 100644 --- a/pkg/tcpip/transport/unix/unix.go +++ 
b/pkg/tcpip/transport/unix/unix.go @@ -677,8 +677,8 @@ type baseEndpoint struct { // EventRegister implements waiter.Waitable.EventRegister. func (e *baseEndpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { - e.Lock() e.Queue.EventRegister(we, mask) + e.Lock() if e.connected != nil { e.connected.EventUpdate() } @@ -687,8 +687,8 @@ func (e *baseEndpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { // EventUnregister implements waiter.Waitable.EventUnregister. func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { - e.Lock() e.Queue.EventUnregister(we) + e.Lock() if e.connected != nil { e.connected.EventUpdate() } -- cgit v1.2.3 From d154c6a25f9d2b88b8ce22cff575467b159f06bc Mon Sep 17 00:00:00 2001 From: Christopher Koch Date: Wed, 16 May 2018 13:28:29 -0700 Subject: Refcount socket Dirents correctly. This should fix the socket Dirent memory leak. fs.NewFile takes a new reference. It should hold the *only* reference. DecRef that socket Dirent. Before the globalDirentMap was introduced, a mis-refcounted Dirent would be garbage collected when all references to it were gone. For socket Dirents, this meant that they would be garbage collected when the associated fs.Files disappeared. After the globalDirentMap, Dirents *must* be reference-counted correctly to be garbage collected, as Dirents remove themselves from the global map when their refcount goes to -1 (see Dirent.destroy). That removes the last pointer to that Dirent. 
PiperOrigin-RevId: 196878973 Change-Id: Ic7afcd1de97c7101ccb13be5fc31de0fb50963f0 --- pkg/sentry/socket/epsocket/epsocket.go | 1 + pkg/sentry/socket/hostinet/socket.go | 1 + pkg/sentry/socket/netlink/provider.go | 1 + pkg/sentry/socket/unix/unix.go | 1 + 4 files changed, 4 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 3e4887e16..18cb70c96 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -116,6 +116,7 @@ type SocketOperations struct { // New creates a new endpoint socket. func New(t *kernel.Task, family int, skType unix.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) *fs.File { dirent := socket.NewDirent(t, epsocketDevice) + defer dirent.DecRef() return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true}, &SocketOperations{ Queue: queue, family: family, diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 02fad1c60..8f901df6c 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -65,6 +65,7 @@ func newSocketFile(ctx context.Context, fd int, nonblock bool) (*fs.File, *syser return nil, syserr.FromError(err) } dirent := socket.NewDirent(ctx, socketDevice) + defer dirent.DecRef() return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true}, s), nil } diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 36800da4d..e874216f4 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -89,6 +89,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype unix.SockType, protocol int) } d := socket.NewDirent(t, netlinkSocketDevice) + defer d.DecRef() return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true}, s), nil } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index f83156c8e..27bacbbc3 
100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -56,6 +56,7 @@ type SocketOperations struct { // New creates a new unix socket. func New(ctx context.Context, endpoint unix.Endpoint) *fs.File { dirent := socket.NewDirent(ctx, unixSocketDevice) + defer dirent.DecRef() return NewWithDirent(ctx, dirent, endpoint, fs.FileFlags{Read: true, Write: true}) } -- cgit v1.2.3 From 3131a6b131127e70b5e3941e3c4d292d99312fa5 Mon Sep 17 00:00:00 2001 From: Chanwit Kaewkasi Date: Wed, 16 May 2018 14:19:34 -0700 Subject: Verify that when offset address is not null, infile must be seekable Change-Id: Id247399baeac58f6cd774acabd5d1da05e5b5697 PiperOrigin-RevId: 196887768 --- pkg/sentry/syscalls/linux/sys_file.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 1d61ac9f0..9b8374ef6 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1915,6 +1915,10 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc hasOffset := offsetAddr != 0 // If we have a provided offset. if hasOffset { + // Verify that when offset address is not null, infile must be seekable + if !inFile.Flags().Pread { + return 0, nil, syserror.ESPIPE + } // Copy in the offset. if _, err := t.CopyIn(offsetAddr, &offset); err != nil { return 0, nil, err -- cgit v1.2.3 From 8e1deb2ab8fb67da9a1f6521e31c5635ac587e71 Mon Sep 17 00:00:00 2001 From: Christopher Koch Date: Wed, 16 May 2018 14:53:57 -0700 Subject: Fix another socket Dirent refcount. 
PiperOrigin-RevId: 196893452 Change-Id: I5ea0f851fcabc5eac5859e61f15213323d996337 --- pkg/sentry/socket/rpcinet/socket.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 2911d3fd6..11925f8d8 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -71,6 +71,7 @@ func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, pr stack.notifier.AddFD(fd, &wq) dirent := socket.NewDirent(ctx, socketDevice) + defer dirent.DecRef() return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{ wq: &wq, fd: fd, @@ -274,6 +275,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, s.notifier.AddFD(payload.Fd, &wq) dirent := socket.NewDirent(t, socketDevice) + defer dirent.DecRef() file := fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonBlocking: flags&linux.SOCK_NONBLOCK != 0}, &socketOperations{ wq: &wq, fd: payload.Fd, -- cgit v1.2.3 From 8878a66a565733493e702199b284cd7855f80bf0 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 17 May 2018 15:05:15 -0700 Subject: Implement sysv shm. 
PiperOrigin-RevId: 197058289 Change-Id: I3946c25028b7e032be4894d61acb48ac0c24d574 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/shm.go | 75 +++++ pkg/refs/refcounter.go | 8 +- pkg/sentry/context/context.go | 20 ++ pkg/sentry/fs/dirent_refs_test.go | 62 ++-- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/ipc_namespace.go | 15 +- pkg/sentry/kernel/shm/BUILD | 40 +++ pkg/sentry/kernel/shm/device.go | 20 ++ pkg/sentry/kernel/shm/shm.go | 630 +++++++++++++++++++++++++++++++++++ pkg/sentry/kernel/task.go | 3 + pkg/sentry/kernel/task_clone.go | 4 +- pkg/sentry/mm/BUILD | 2 + pkg/sentry/mm/shm.go | 66 ++++ pkg/sentry/syscalls/linux/BUILD | 2 + pkg/sentry/syscalls/linux/linux64.go | 8 +- pkg/sentry/syscalls/linux/sys_shm.go | 155 +++++++++ runsc/boot/loader.go | 2 +- 18 files changed, 1072 insertions(+), 42 deletions(-) create mode 100644 pkg/abi/linux/shm.go create mode 100644 pkg/sentry/kernel/shm/BUILD create mode 100644 pkg/sentry/kernel/shm/device.go create mode 100644 pkg/sentry/kernel/shm/shm.go create mode 100644 pkg/sentry/mm/shm.go create mode 100644 pkg/sentry/syscalls/linux/sys_shm.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index a428e61a3..693ce0fdd 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -51,6 +51,7 @@ go_library( "sched.go", "seccomp.go", "sem.go", + "shm.go", "signal.go", "socket.go", "time.go", diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go new file mode 100644 index 000000000..9149ed094 --- /dev/null +++ b/pkg/abi/linux/shm.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// shmat(2) flags. Source: include/uapi/linux/shm.h +const ( + SHM_RDONLY = 010000 // Read-only access. + SHM_RND = 020000 // Round attach address to SHMLBA boundary. + SHM_REMAP = 040000 // Take-over region on attach. + SHM_EXEC = 0100000 // Execution access. +) + +// IPCPerm.Mode upper byte flags. Source: include/linux/shm.h +const ( + SHM_DEST = 01000 // Segment will be destroyed on last detach. + SHM_LOCKED = 02000 // Segment will not be swapped. + SHM_HUGETLB = 04000 // Segment will use huge TLB pages. + SHM_NORESERVE = 010000 // Don't check for reservations. +) + +// Additional Linux-only flags for shmctl(2). Source: include/uapi/linux/shm.h +const ( + SHM_LOCK = 11 + SHM_UNLOCK = 12 + SHM_STAT = 13 + SHM_INFO = 14 +) + +// ShmidDS is equivalent to struct shmid64_ds. Source: +// include/uapi/asm-generic/shmbuf.h +type ShmidDS struct { + ShmPerm IPCPerm + ShmSegsz uint64 + ShmAtime TimeT + ShmDtime TimeT + ShmCtime TimeT + ShmCpid int32 + ShmLpid int32 + ShmNattach uint64 + + Unused4 uint64 + Unused5 uint64 +} + +// ShmParams is equivalent to struct shminfo. Source: include/uapi/linux/shm.h +type ShmParams struct { + ShmMax uint64 + ShmMin uint64 + ShmMni uint64 + ShmSeg uint64 + ShmAll uint64 +} + +// ShmInfo is equivalent to struct shm_info. Source: include/uapi/linux/shm.h +type ShmInfo struct { + UsedIDs int32 // Number of currently existing segments. + _ [4]byte + ShmTot uint64 // Total number of shared memory pages. + ShmRss uint64 // Number of resident shared memory pages. 
+ ShmSwp uint64 // Number of swapped shared memory pages. + SwapAttempts uint64 // Unused since Linux 2.4. + SwapSuccesses uint64 // Unused since Linux 2.4. +} diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 1036553c7..3162001e1 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -194,9 +194,11 @@ type AtomicRefCount struct { weakRefs ilist.List `state:"nosave"` } -// TestReadRefs returns the current reference count of r. Use only for tests. -func (r *AtomicRefCount) TestReadRefs() int64 { - return atomic.LoadInt64(&r.refCount) +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *AtomicRefCount) ReadRefs() int64 { + // Account for the internal -1 offset on refcounts. + return atomic.LoadInt64(&r.refCount) + 1 } // IncRef increments this object's reference count. While the count is kept diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index e0dffafba..598c5b4ff 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -20,6 +20,26 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" ) +type contextID int + +// Globally accessible values from a context. These keys are defined in the +// context package to resolve dependency cycles by not requiring the caller to +// import packages usually required to get these information. +const ( + // CtxThreadGroupID is the current thread group ID when a context represents + // a task context. The value is represented as an int32. + CtxThreadGroupID contextID = iota +) + +// ThreadGroupIDFromContext returns the current thread group ID when ctx +// represents a task context. 
+func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { + if tgid := ctx.Value(CtxThreadGroupID); tgid != nil { + return tgid.(int32), true + } + return 0, false +} + // A Context represents a thread of execution (hereafter "goroutine" to reflect // Go idiosyncrasy). It carries state associated with the goroutine across API // boundaries. diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 8ce9ba02d..f9dcba316 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -33,8 +33,8 @@ func TestWalkPositive(t *testing.T) { ctx := contexttest.Context(t) root := NewDirent(newMockDirInode(ctx, nil), "root") - if got := root.TestReadRefs(); got != 0 { - t.Fatalf("root has a ref count of %d, want %d", got, 0) + if got := root.ReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) } name := "d" @@ -43,22 +43,22 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("root.walk(root, %q) got %v, want nil", name, err) } - if got := root.TestReadRefs(); got != 1 { - t.Fatalf("root has a ref count of %d, want %d", got, 1) + if got := root.ReadRefs(); got != 2 { + t.Fatalf("root has a ref count of %d, want %d", got, 2) } - if got := d.TestReadRefs(); got != 0 { - t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) + if got := d.ReadRefs(); got != 1 { + t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 1) } d.DecRef() - if got := root.TestReadRefs(); got != 0 { - t.Fatalf("root has a ref count of %d, want %d", got, 0) + if got := root.ReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) } - if got := d.TestReadRefs(); got != -1 { - t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, -1) + if got := d.ReadRefs(); got != 0 { + t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) } root.flush() @@ -76,8 +76,8 @@ func TestWalkNegative(t *testing.T) { root := 
NewDirent(NewEmptyDir(ctx, nil), "root") mn := root.Inode.InodeOperations.(*mockInodeOperationsLookupNegative) - if got := root.TestReadRefs(); got != 0 { - t.Fatalf("root has a ref count of %d, want %d", got, 0) + if got := root.ReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) } name := "d" @@ -88,7 +88,7 @@ func TestWalkNegative(t *testing.T) { } } - if got := root.TestReadRefs(); got != 0 { + if got := root.ReadRefs(); got != 1 { t.Fatalf("root has a ref count of %d, want %d", got, 1) } @@ -110,14 +110,14 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("root found positive child at %q, want negative", name) } - if got := child.(*Dirent).TestReadRefs(); got != 1 { - t.Fatalf("child has a ref count of %d, want %d", got, 1) + if got := child.(*Dirent).ReadRefs(); got != 2 { + t.Fatalf("child has a ref count of %d, want %d", got, 2) } child.DecRef() - if got := child.(*Dirent).TestReadRefs(); got != 0 { - t.Fatalf("child has a ref count of %d, want %d", got, 0) + if got := child.(*Dirent).ReadRefs(); got != 1 { + t.Fatalf("child has a ref count of %d, want %d", got, 1) } if got := len(root.children); got != 1 { @@ -126,7 +126,7 @@ func TestWalkNegative(t *testing.T) { root.DecRef() - if got := root.TestReadRefs(); got != -1 { + if got := root.ReadRefs(); got != 0 { t.Fatalf("root has a ref count of %d, want %d", got, 0) } @@ -184,12 +184,12 @@ func TestHashNegativeToPositive(t *testing.T) { t.Fatalf("got negative Dirent, want positive") } - if got := d.TestReadRefs(); got != 0 { - t.Fatalf("child %q has a ref count of %d, want %d", name, got, 0) + if got := d.ReadRefs(); got != 1 { + t.Fatalf("child %q has a ref count of %d, want %d", name, got, 1) } - if got := root.TestReadRefs(); got != 1 { - t.Fatalf("root has a ref count of %d, want %d", got, 1) + if got := root.ReadRefs(); got != 2 { + t.Fatalf("root has a ref count of %d, want %d", got, 2) } if got := len(root.children); got != 1 { @@ -291,12 +291,12 @@ func 
TestCreateExtraRefs(t *testing.T) { { desc: "Create caching", root: NewDirent(NewEmptyDir(ctx, NewDirentCache(1)), "root"), - refs: 1, + refs: 2, }, { desc: "Create not caching", root: NewDirent(NewEmptyDir(ctx, nil), "root"), - refs: 0, + refs: 1, }, } { t.Run(test.desc, func(t *testing.T) { @@ -307,7 +307,7 @@ func TestCreateExtraRefs(t *testing.T) { } d := f.Dirent - if got := d.TestReadRefs(); got != test.refs { + if got := d.ReadRefs(); got != test.refs { t.Errorf("dirent has a ref count of %d, want %d", got, test.refs) } }) @@ -347,8 +347,8 @@ func TestRemoveExtraRefs(t *testing.T) { t.Fatalf("root.Remove(root, %q) failed: %v", name, err) } - if got := d.TestReadRefs(); got != 0 { - t.Fatalf("dirent has a ref count of %d, want %d", got, 0) + if got := d.ReadRefs(); got != 1 { + t.Fatalf("dirent has a ref count of %d, want %d", got, 1) } d.DecRef() @@ -406,11 +406,11 @@ func TestRenameExtraRefs(t *testing.T) { newParent.flush() // Expect to have only active references. - if got := renamed.TestReadRefs(); got != 0 { - t.Errorf("renamed has ref count %d, want only active references %d", got, 0) + if got := renamed.ReadRefs(); got != 1 { + t.Errorf("renamed has ref count %d, want only active references %d", got, 1) } - if got := replaced.TestReadRefs(); got != 0 { - t.Errorf("replaced has ref count %d, want only active references %d", got, 0) + if got := replaced.ReadRefs(); got != 1 { + t.Errorf("replaced has ref count %d, want only active references %d", got, 1) } }) } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 62794cff5..377c94e4c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -184,6 +184,7 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/shm", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/loader", diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 78737f58f..3049fead4 100644 
--- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,18 +15,26 @@ package kernel import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm" ) // IPCNamespace represents an IPC namespace. type IPCNamespace struct { + // User namespace which owns this IPC namespace. Immutable. + userNS *auth.UserNamespace + semaphores *semaphore.Registry + shms *shm.Registry } // NewIPCNamespace creates a new IPC namespace. -func NewIPCNamespace() *IPCNamespace { +func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { return &IPCNamespace{ + userNS: userNS, semaphores: semaphore.NewRegistry(), + shms: shm.NewRegistry(userNS), } } @@ -35,6 +43,11 @@ func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { return i.semaphores } +// ShmRegistry returns the shm segment registry for this namespace. +func (i *IPCNamespace) ShmRegistry() *shm.Registry { + return i.shms +} + // IPCNamespace returns the task's IPC namespace. 
func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD new file mode 100644 index 000000000..182cc1c76 --- /dev/null +++ b/pkg/sentry/kernel/shm/BUILD @@ -0,0 +1,40 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "shm_state", + srcs = [ + "shm.go", + ], + out = "shm_autogen_state.go", + package = "shm", +) + +go_library( + name = "shm", + srcs = [ + "device.go", + "shm.go", + "shm_autogen_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go new file mode 100644 index 000000000..b0dacdbe0 --- /dev/null +++ b/pkg/sentry/kernel/shm/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package shm + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// shmDevice is the kernel shm device. 
+var shmDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go new file mode 100644 index 000000000..7217e8103 --- /dev/null +++ b/pkg/sentry/kernel/shm/shm.go @@ -0,0 +1,630 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package shm implements sysv shared memory segments. +// +// Known missing features: +// +// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement +// memory locking in general. +// +// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy +// way to implement hugetlb support on a per-map basis, and it has no impact +// on correctness. +// +// - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap +// so it's meaningless to reserve space for swap. +// +// - No per-process segment size enforcement. This feature probably isn't used +// much anyways, since Linux sets the per-process limits to the system-wide +// limits by default. 
+// +// Lock ordering: mm.mappingMu -> shm registry lock -> shm lock +package shm + +import ( + "fmt" + "math" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Various limits for shared memory segments. +const ( + // shmsTotalMaxPages is the system-wide limit on all shared memory segments, measured + // in number of pages. + shmsTotalMaxPages = math.MaxInt64 // SHMALL + + // shmMaxSize is the maximum size of a single segment, in bytes. + shmMaxSize = math.MaxInt64 // SHMMAX + + // shmMinSize is the minimum specifiable size of a segment, effectively + // yielding a size rounded up to the next page size. Measured in bytes. + shmMinSize = 1 // SHMMIN + + // shmsTotalMax is the maximum number of segments on the system. + shmsTotalMax = 4096 // SHMMNI +) + +// Registry tracks all shared memory segments in an IPC namespace. The registry +// provides the mechanisms for creating and finding segments, and reporting +// global shm parameters. +type Registry struct { + // userNS owns the IPC namespace this registry belong to. Immutable. + userNS *auth.UserNamespace + + mu sync.Mutex `state:"nosave"` + + // shms maps segment ids to segments. Protected by mu. + shms map[int32]*Shm + + // Sum of the sizes of all existing segments rounded up to page size, in + // units of page size. Protected by mu. + totalPages uint64 + + // lastIDUsed is protected by mu. 
+ lastIDUsed int32 +} + +// NewRegistry creates a new shm registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + shms: make(map[int32]*Shm), + } +} + +// FindByID looks up a segment given an ID. +func (r *Registry) FindByID(id int32) *Shm { + r.mu.Lock() + defer r.mu.Unlock() + return r.shms[id] +} + +// Precondition: Caller must hold r.mu. +func (r *Registry) findByKey(key int32) *Shm { + for _, v := range r.shms { + if v.key == key { + return v + } + } + return nil +} + +// FindOrCreate looks up or creates a segment in the registry. It's functionally +// analogous to open(2). +func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { + if create && (size < shmMinSize || size > shmMaxSize) { + // "A new segment was to be created and size is less than SHMMIN or + // greater than SHMMAX." - man shmget(2) + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if len(r.shms) >= shmsTotalMax { + // "All possible shared memory IDs have been taken (SHMMNI) ..." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + if !private { + // Look up an existing segment. + if shm := r.findByKey(key); shm != nil { + shm.mu.Lock() + defer shm.mu.Unlock() + + // Check that caller can access the segment. + if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { + // "The user does not have permission to access the shared + // memory segment, and does not have the CAP_IPC_OWNER + // capability in the user namespace that governs its IPC + // namespace." - man shmget(2) + return nil, syserror.EACCES + } + + if size > shm.size { + // "A segment for the given key exists, but size is greater than + // the size of that segment." - man shmget(2) + return nil, syserror.EINVAL + } + + if create && exclusive { + // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a + // shared memory segment already exists for key." 
+ // - man shmget(2) + return nil, syserror.EEXIST + } + + return shm, nil + } + + if !create { + // "No segment exists for the given key, and IPC_CREAT was not + // specified." - man shmget(2) + return nil, syserror.ENOENT + } + } + + var sizeAligned uint64 + if val, ok := usermem.Addr(size).RoundUp(); ok { + sizeAligned = uint64(val) + } else { + return nil, syserror.EINVAL + } + + if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > shmsTotalMaxPages { + // "... allocating a segment of the requested size would cause the + // system to exceed the system-wide limit on shared memory (SHMALL)." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + // Need to create a new segment. + creator := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + return r.newShm(ctx, pid, key, creator, perms, size) +} + +// newShm creates a new segment in the registry. +func (r *Registry) newShm(ctx context.Context, pid, key int32, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { + p := platform.FromContext(ctx) + if p == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + } + + effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) + fr, err := p.Memory().Allocate(effectiveSize, usage.Anonymous) + if err != nil { + return nil, err + } + + shm := &Shm{ + p: p, + registry: r, + creator: creator, + size: size, + effectiveSize: effectiveSize, + fr: fr, + key: key, + perms: perms, + owner: creator, + creatorPID: pid, + changeTime: ktime.NowFromContext(ctx), + } + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. 
+ if id < 0 { + id = 0 + continue + } + if r.shms[id] == nil { + r.lastIDUsed = id + r.shms[id] = shm + shm.ID = id + + r.totalPages += effectiveSize / usermem.PageSize + + return shm, nil + } + } + + log.Warningf("Shm ids exhuasted, they may be leaking") + return nil, syserror.ENOSPC +} + +// IPCInfo reports global parameters for sysv shared memory segments on this +// system. See shmctl(IPC_INFO). +func (r *Registry) IPCInfo() *linux.ShmParams { + return &linux.ShmParams{ + ShmMax: shmMaxSize, + ShmMin: shmMinSize, + ShmMni: shmsTotalMax, + ShmSeg: shmsTotalMax, // Linux also sets this to SHMMNI. + ShmAll: shmsTotalMaxPages, + } +} + +// ShmInfo reports linux-specific global parameters for sysv shared memory +// segments on this system. See shmctl(SHM_INFO). +func (r *Registry) ShmInfo() *linux.ShmInfo { + r.mu.Lock() + defer r.mu.Unlock() + + return &linux.ShmInfo{ + UsedIDs: int32(r.lastIDUsed), + ShmTot: r.totalPages, + ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. + ShmSwp: 0, // No reclaim at the moment. + } +} + +// remove unregisters a segment from this registry, preventing it from being +// discovered in the future. Caller is responsible for ensuring s is destroyed. +// +// Precondition: To preserve lock ordering, caller must not hold s.mu. +func (r *Registry) remove(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + delete(r.shms, s.ID) + r.totalPages -= s.effectiveSize / usermem.PageSize +} + +// Shm represents a single shared memory segment. +// +// Shm segment are backed directly by an allocation from platform +// memory. Segments are always mapped as a whole, greatly simplifying how +// mappings are tracked. However note that mremap and munmap calls may cause the +// vma for a segment to become fragmented; which requires special care when +// unmapping a segment. See mm/shm.go. +// +// Segments persist until they are explicitly marked for destruction via +// shmctl(SHM_RMID). 
+// +// Shm implements memmap.Mappable and memmap.MappingIdentity. +type Shm struct { + // AtomicRefCount tracks the number of references to this segment from + // maps. A segment always holds a reference to itself, until it's marked for + // destruction. + refs.AtomicRefCount + + p platform.Platform + + // registry points to the shm registry containing this segment. Immutable. + registry *Registry + + // ID is the kernel identifier for this segment. Immutable. + ID int32 + + // creator is the user that created the segment. Immutable. + creator fs.FileOwner + + // size is the requested size of the segment at creation, in + // bytes. Immutable. + size uint64 + + // effectiveSize of the segment, rounding up to the next page + // boundary. Immutable. + // + // Invariant: effectiveSize must be a multiple of usermem.PageSize. + effectiveSize uint64 + + // fr is the offset into platform.Memory() that backs this contents of this + // segment. Immutable. + fr platform.FileRange + + // key is the public identifier for this segment. + key int32 + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // perms is the access permissions for the segment. + perms fs.FilePermissions + + // owner of this segment. + owner fs.FileOwner + // attachTime is updated on every successful shmat. + attachTime ktime.Time + // detachTime is updated on every successful shmdt. + detachTime ktime.Time + // changeTime is updated on every successful changes to the segment via + // shmctl(IPC_SET). + changeTime ktime.Time + + // creatorPID is the PID of the process that created the segment. + creatorPID int32 + // lastAttachDetachPID is the pid of the process that issued the last shmat + // or shmdt syscall. + lastAttachDetachPID int32 + + // pendingDestruction indicates the segment was marked as destroyed through + // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found + // in the registry and can no longer be attached. 
When the last user + // detaches from the segment, it is destroyed. Protected by mu. + pendingDestruction bool +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (s *Shm) MappedName(ctx context.Context) string { + return fmt.Sprintf("SYSV%08d", s.key) +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (s *Shm) DeviceID() uint64 { + return shmDevice.DeviceID() +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (s *Shm) InodeID() uint64 { + // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use + // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() + return uint64(s.ID) +} + +// DecRef overrides refs.RefCount.DecRef with a destructor. +func (s *Shm) DecRef() { + s.DecRefWithDestructor(s.destroy) +} + +// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm +// segments. +func (s *Shm) Msync(context.Context, memmap.MappableRange) error { + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (s *Shm) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + s.mu.Lock() + defer s.mu.Unlock() + s.attachTime = ktime.NowFromContext(ctx) + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + // AddMapping is called during a syscall, so ctx should always be a task + // context. + log.Warningf("Adding mapping to shm %+v but couldn't get the current pid; not updating the last attach pid", s) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (s *Shm) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + s.mu.Lock() + defer s.mu.Unlock() + // TODO: RemoveMapping may be called during task exit, when ctx + // is context.Background. Gracefully handle missing clocks. Failing to + // update the detach time in these cases is ok, since no one can observe the + // omission. 
+ if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { + s.detachTime = clock.Now() + } + + // If called from a non-task context we also won't have a threadgroup + // id. Silently skip updating the lastAttachDetachPid in that case. + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + log.Debugf("Couldn't obtain pid when removing mapping to shm %+v, not updating the last detach pid.", s) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (s *Shm) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > s.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: s.p.Memory(), + Offset: s.fr.Start + source.Start, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (s *Shm) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// AttachOpts describes various flags passed to shmat(2). +type AttachOpts struct { + Execute bool + Readonly bool + Remap bool +} + +// ConfigureAttach creates an mmap configuration for the segment with the +// requested attach options. +// +// ConfigureAttach returns with a ref on s on success. The caller should drop +// this once the map is installed. This reference prevents s from being +// destroyed before the returned configuration is used. 
+func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.pendingDestruction && s.ReadRefs() == 0 { + return memmap.MMapOpts{}, syserror.EIDRM + } + + if !s.checkPermissions(ctx, fs.PermMask{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }) { + // "The calling process does not have the required permissions for the + // requested attach type, and does not have the CAP_IPC_OWNER capability + // in the user namespace that governs its IPC namespace." - man shmat(2) + return memmap.MMapOpts{}, syserror.EACCES + } + s.IncRef() + return memmap.MMapOpts{ + Length: s.size, + Offset: 0, + Addr: addr, + Fixed: opts.Remap, + Perms: usermem.AccessType{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }, + MaxPerms: usermem.AnyAccess, + Mappable: s, + MappingIdentity: s, + }, nil +} + +// EffectiveSize returns the size of the underlying shared memory segment. This +// may be larger than the requested size at creation, due to rounding to page +// boundaries. +func (s *Shm) EffectiveSize() uint64 { + return s.effectiveSize +} + +// IPCStat returns information about a shm. See shmctl(IPC_STAT). +func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The caller must have read permission on the shared memory segment." + // - man shmctl(2) + if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { + // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow + // read access for shmid, and the calling process does not have the + // CAP_IPC_OWNER capability in the user namespace that governs its IPC + // namespace." 
- man shmctl(2) + return nil, syserror.EACCES + } + + var mode uint16 + if s.pendingDestruction { + mode |= linux.SHM_DEST + } + creds := auth.CredentialsFromContext(ctx) + + nattach := uint64(s.ReadRefs()) + // Don't report the self-reference we keep prior to being marked for + // destruction. However, also don't report a count of -1 for segments marked + // as destroyed, with no mappings. + if !s.pendingDestruction { + nattach-- + } + + ds := &linux.ShmidDS{ + ShmPerm: linux.IPCPerm{ + Key: uint32(s.key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), + Mode: mode | uint16(s.perms.LinuxMode()), + Seq: 0, // IPC sequences not supported. + }, + ShmSegsz: s.size, + ShmAtime: s.attachTime.TimeT(), + ShmDtime: s.detachTime.TimeT(), + ShmCtime: s.changeTime.TimeT(), + ShmCpid: s.creatorPID, + ShmLpid: s.lastAttachDetachPID, + ShmNattach: nattach, + } + + return ds, nil +} + +// Set modifies attributes for a segment. See shmctl(IPC_SET). +func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { + s.mu.Lock() + defer s.mu.Unlock() + + if !s.checkOwnership(ctx) { + return syserror.EPERM + } + + creds := auth.CredentialsFromContext(ctx) + uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) + gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) + if !uid.Ok() || !gid.Ok() { + return syserror.EINVAL + } + + // User may only modify the lower 9 bits of the mode. All the other bits are + // always 0 for the underlying inode. + mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) + s.perms = fs.FilePermsFromMode(mode) + + s.owner.UID = uid + s.owner.GID = gid + + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +func (s *Shm) destroy() { + s.registry.remove(s) + s.p.Memory().DecRef(s.fr) +} + +// MarkDestroyed marks a shm for destruction. 
The shm is actually destroyed once +// it has no references. See shmctl(IPC_RMID). +func (s *Shm) MarkDestroyed() { + s.mu.Lock() + defer s.mu.Unlock() + // Prevent the segment from being found in the registry. + s.key = linux.IPC_PRIVATE + s.pendingDestruction = true + s.DecRef() +} + +// checkOwnership verifies whether a segment may be accessed by ctx as an +// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkOwnership(ctx context.Context) bool { + creds := auth.CredentialsFromContext(ctx) + if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { + return true + } + + // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux + // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented + // for use to "override IPC ownership checks". + return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) +} + +// checkPermissions verifies whether a segment is accessible by ctx for access +// described by req. See ipc/util.c:ipcperms() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { + creds := auth.CredentialsFromContext(ctx) + + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + if p.SupersetOf(req) { + return true + } + + // Tasks with CAP_IPC_OWNER may bypass permission checks. 
+ return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 490f795c2..7763050a5 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bpf" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -559,6 +560,8 @@ func (t *Task) Value(key interface{}) interface{} { return t case auth.CtxCredentials: return t.creds + case context.CtxThreadGroupID: + return int32(t.ThreadGroup().ID()) case fs.CtxRoot: return t.FSContext().RootDirectory() case inet.CtxStack: diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 3a74abdfb..0c2427952 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -197,7 +197,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { if opts.NewIPCNamespace { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" - ipcns = NewIPCNamespace() + ipcns = NewIPCNamespace(userns) } tc, err := t.tc.Fork(t, !opts.NewAddressSpace) @@ -449,7 +449,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" - t.ipcns = NewIPCNamespace() + t.ipcns = NewIPCNamespace(t.creds.UserNamespace) } if opts.NewFiles { oldFDMap := t.tr.FDMap diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 39bde2be3..258389bb2 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -107,6 +107,7 @@ go_library( "pma_set.go", "proc_pid_maps.go", "save_restore.go", + "shm.go", "special_mappable.go", "syscalls.go", "vma.go", @@ -123,6 +124,7 @@ go_library( 
"//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/platform", diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go new file mode 100644 index 000000000..bab137a5a --- /dev/null +++ b/pkg/sentry/mm/shm.go @@ -0,0 +1,66 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// DetachShm unmaps a sysv shared memory segment. +func (mm *MemoryManager) DetachShm(ctx context.Context, addr usermem.Addr) error { + if addr != addr.RoundDown() { + // "... shmaddr is not aligned on a page boundary." - man shmdt(2) + return syserror.EINVAL + } + + var detached *shm.Shm + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + // Find and remove the first vma containing an address >= addr that maps a + // segment originally attached at addr. 
+ vseg := mm.vmas.LowerBoundSegment(addr) + for vseg.Ok() { + vma := vseg.ValuePtr() + if shm, ok := vma.mappable.(*shm.Shm); ok && vseg.Start() >= addr && uint64(vseg.Start()-addr) == vma.off { + detached = shm + vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() + break + } else { + vseg = vseg.NextSegment() + } + } + + if detached == nil { + // There is no shared memory segment attached at addr. + return syserror.EINVAL + } + + // Remove all vmas that could have been created by the same attach. + end := addr + usermem.Addr(detached.EffectiveSize()) + for vseg.Ok() && vseg.End() <= end { + vma := vseg.ValuePtr() + if vma.mappable == detached && uint64(vseg.Start()-addr) == vma.off { + vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() + } else { + vseg = vseg.NextSegment() + } + } + + return nil +} diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index bc67ebf30..f9e0a4be3 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -44,6 +44,7 @@ go_library( "sys_rusage.go", "sys_sched.go", "sys_sem.go", + "sys_shm.go", "sys_signal.go", "sys_socket.go", "sys_stat.go", @@ -84,6 +85,7 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/shm", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/memmap", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 44db2d582..237c61007 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -75,9 +75,9 @@ var AMD64 = &kernel.SyscallTable{ 26: Msync, 27: Mincore, 28: Madvise, - // 29: Shmget, TODO - // 30: Shmat, TODO - // 31: Shmctl, TODO + 29: Shmget, + 30: Shmat, + 31: Shmctl, 32: Dup, 33: Dup2, 34: Pause, @@ -113,7 +113,7 @@ var AMD64 = &kernel.SyscallTable{ 64: Semget, 65: Semop, 66: Semctl, - // 67: Shmdt, TODO + 67: Shmdt, // 68: Msgget, TODO // 69: Msgsnd, TODO // 70: Msgrcv, TODO diff 
--git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go new file mode 100644 index 000000000..48ff1d5f0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -0,0 +1,155 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Shmget implements shmget(2). +func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + key := args[0].Int() + size := uint64(args[1].SizeT()) + flag := args[2].Int() + + private := key == linux.IPC_PRIVATE + create := flag&linux.IPC_CREAT == linux.IPC_CREAT + exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL + mode := linux.FileMode(flag & 0777) + + pid := int32(t.ThreadGroup().ID()) + r := t.IPCNamespace().ShmRegistry() + segment, err := r.FindOrCreate(t, pid, key, size, mode, private, create, exclusive) + if err != nil { + return 0, nil, err + } + return uintptr(segment.ID), nil, nil +} + +// findSegment retrives a shm segment by the given id. +func findSegment(t *kernel.Task, id int32) (*shm.Shm, error) { + r := t.IPCNamespace().ShmRegistry() + segment := r.FindByID(id) + if segment == nil { + // No segment with provided id. 
+ return nil, syserror.EINVAL + } + return segment, nil +} + +// Shmat implements shmat(2). +func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + addr := args[1].Pointer() + flag := args[2].Int() + + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + + opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ + Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, + Readonly: flag&linux.SHM_RDONLY == linux.SHM_RDONLY, + Remap: flag&linux.SHM_REMAP == linux.SHM_REMAP, + }) + if err != nil { + return 0, nil, err + } + defer segment.DecRef() + addr, err = t.MemoryManager().MMap(t, opts) + return uintptr(addr), nil, err +} + +// Shmdt implements shmdt(2). +func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + err := t.MemoryManager().DetachShm(t, addr) + return 0, nil, err +} + +// Shmctl implements shmctl(2). +func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + cmd := args[1].Int() + buf := args[2].Pointer() + + r := t.IPCNamespace().ShmRegistry() + + switch cmd { + case linux.SHM_STAT: + // Technically, we should be treating id as "an index into the kernel's + // internal array that maintains information about all shared memory + // segments on the system". Since we don't track segments in an array, + // we'll just pretend the shmid is the index and do the same thing as + // IPC_STAT. Linux also uses the index as the shmid. 
+ fallthrough + case linux.IPC_STAT: + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + + stat, err := segment.IPCStat(t) + if err == nil { + _, err = t.CopyOut(buf, stat) + } + return 0, nil, err + + case linux.IPC_INFO: + params := r.IPCInfo() + _, err := t.CopyOut(buf, params) + return 0, nil, err + + case linux.SHM_INFO: + info := r.ShmInfo() + _, err := t.CopyOut(buf, info) + return 0, nil, err + } + + // Remaining commands refer to a specific segment. + segment, err := findSegment(t, id) + if err != nil { + return 0, nil, syserror.EINVAL + } + + switch cmd { + case linux.IPC_SET: + var ds linux.ShmidDS + _, err = t.CopyIn(buf, &ds) + if err != nil { + return 0, nil, err + } + err = segment.Set(t, &ds) + return 0, nil, err + + case linux.IPC_RMID: + segment.MarkDestroyed() + return 0, nil, nil + + case linux.SHM_LOCK, linux.SHM_UNLOCK: + // We currently do not support memmory locking anywhere. + // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the + // same here. + return 0, nil, nil + + default: + return 0, nil, syserror.EINVAL + } +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0ff54d349..566f2eb46 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -146,7 +146,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // not configurable from runtime spec. utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) - ipcns := kernel.NewIPCNamespace() + ipcns := kernel.NewIPCNamespace(creds.UserNamespace) if err := enableStrace(conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) -- cgit v1.2.3 From b904250b862c5c14da84e08b6a5400c7bf2458b0 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 17 May 2018 15:37:19 -0700 Subject: Fix capability check for sysv semaphores. Capabilities for sysv sem operations were being checked against the current task's user namespace. 
They should be checked against the user namespace owning the ipc namespace for the sems instead, per ipc/util.c:ipcperms(). PiperOrigin-RevId: 197063111 Change-Id: Iba29486b316f2e01ee331dda4e48a6ab7960d589 --- pkg/sentry/kernel/ipc_namespace.go | 2 +- pkg/sentry/kernel/semaphore/semaphore.go | 15 ++++++++++++--- pkg/sentry/kernel/semaphore/semaphore_test.go | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 3049fead4..a86bda77b 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -33,7 +33,7 @@ type IPCNamespace struct { func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { return &IPCNamespace{ userNS: userNS, - semaphores: semaphore.NewRegistry(), + semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } } diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index fb8c2f98c..e9027dc14 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -43,6 +43,8 @@ const ( // Registry maintains a set of semaphores that can be found by key or ID. type Registry struct { + // userNS owning the ipc name this registry belongs to. Immutable. + userNS *auth.UserNamespace // mu protects all fields below. mu sync.Mutex `state:"nosave"` semaphores map[int32]*Set @@ -51,6 +53,9 @@ type Registry struct { // Set represents a set of semaphores that can be operated atomically. type Set struct { + // registry owning this sem set. Immutable. + registry *Registry + // Id is a handle that identifies the set. ID int32 @@ -90,8 +95,11 @@ type waiter struct { } // NewRegistry creates a new semaphore set registry. 
-func NewRegistry() *Registry { - return &Registry{semaphores: make(map[int32]*Set)} +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + semaphores: make(map[int32]*Set), + } } // FindOrCreate searches for a semaphore set that matches 'key'. If not found, @@ -175,6 +183,7 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { set := &Set{ + registry: r, key: key, owner: owner, creator: owner, @@ -415,7 +424,7 @@ func (s *Set) checkCredentials(creds *auth.Credentials) bool { } func (s *Set) checkCapability(creds *auth.Credentials) bool { - return creds.HasCapability(linux.CAP_IPC_OWNER) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() + return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() } func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index 1c6a2e1e9..f9eb382e9 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -136,7 +136,7 @@ func TestNoWait(t *testing.T) { func TestUnregister(t *testing.T) { ctx := contexttest.Context(t) - r := NewRegistry() + r := NewRegistry(auth.NewRootUserNamespace()) set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true) if err != nil { t.Fatalf("FindOrCreate() failed, err: %v", err) -- cgit v1.2.3 From b960559fdb9a22c986af11ba4e886ffb316a3574 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 17 May 2018 16:25:51 -0700 Subject: Cleanup docs This brings the proc document more up-to-date. 
PiperOrigin-RevId: 197070161 Change-Id: Iae2cf9dc44e3e748a33f497bb95bd3c10d0c094a --- pkg/sentry/fs/proc/README.md | 96 ++++++++++++++++++++---------------- pkg/sentry/syscalls/linux/linux64.go | 2 - 2 files changed, 54 insertions(+), 44 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index c510ee63a..6ad7297d2 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -10,17 +10,16 @@ inconsistency, please file a bug. The following files are implemented: -| File /proc/ | Content | -| :------------------------ | :----------------------------------------------- | -| [cpuinfo](#cpuinfo) | Info about the CPU | -| [filesystem](#filesystem) | Supported filesystems | -| [loadavg](#loadavg) | Load average of last 1, 5 & 15 minutes | -| [meminfo](#meminfo) | Overall memory info | -| [stat](#stat) | Overall kernel statistics | -| [sys](#sys) | Change parameters within the kernel | -| [uptime](#uptime) | Wall clock since boot, combined idle time of all | -: : cpus : -| [version](#version) | Kernel version | +| File /proc/ | Content | +| :------------------------ | :---------------------------------------------------- | +| [cpuinfo](#cpuinfo) | Info about the CPU | +| [filesystems](#filesystems) | Supported filesystems | +| [loadavg](#loadavg) | Load average of last 1, 5 & 15 minutes | +| [meminfo](#meminfo) | Overall memory info | +| [stat](#stat) | Overall kernel statistics | +| [sys](#sys) | Change parameters within the kernel | +| [uptime](#uptime) | Wall clock since boot, combined idle time of all cpus | +| [version](#version) | Kernel version | ### cpuinfo @@ -62,26 +61,20 @@ cache_alignment | Always 64 address sizes | Always 46 bits physical, 48 bits virtual power management | Always blank -Otherwise fields are derived from the SentryCPUIDSpec proto config. +Otherwise fields are derived from the sentry configuration. 
-### filesystem +### filesystems ```bash -$ cat /proc/filesystem +$ cat /proc/filesystems nodev 9p +nodev devpts nodev devtmpfs nodev proc -nodev ramdiskfs nodev sysfs nodev tmpfs ``` -Notable divergences: - -Filesystem | Notes -:--------- | :-------------------------------------------------------- -ramdiskfs | No Linux equivalent, see the SentryRamdiskFS proto config - ### loadavg ```bash @@ -166,10 +159,6 @@ DirectMap4k | Missing DirectMap2M | Missing DirectMap1G | Missing -See [Memory -Accounting](pkg/sentry/usage/g3doc/memory-accounting.md) -for general caveats. - ### stat ```bash @@ -236,22 +225,26 @@ Linux version 3.11.10 #1 SMP Fri Nov 29 10:47:50 PST 2013 The following files are implemented: -File /proc/PID | Content -:------------------ | :--------------------------------------------------- -[auxv](#auxv) | Copy of auxiliary vector for the process -[cmdline](#cmdline) | Command line arguments -[comm](#comm) | Command name associated with the process -[exe](#exe) | Symlink to the process's executable -[fd](#fd) | Directory containing links to open file descriptors -[fdinfo](#fdinfo) | Information associated with open file descriptors -[gid_map](#gid_map) | Mappings for group IDs inside the user namespace -[io](#io) | IO statistics -[maps](#maps) | Memory mappings (anon, executables, library files) -[ns](#ns) | Directory containing info about supported namespaces -[stat](#stat) | Process statistics -[status](#status) | Process status in human readable format -[task](#task) | Directory containing info about running threads -[uid_map](#uid_map) | Mappings for user IDs inside the user namespace +File /proc/PID | Content +:---------------------- | :--------------------------------------------------- +[auxv](#auxv) | Copy of auxiliary vector for the process +[cmdline](#cmdline) | Command line arguments +[comm](#comm) | Command name associated with the process +[environ](#environ) | Process environment +[exe](#exe) | Symlink to the process's executable +[fd](#fd) 
| Directory containing links to open file descriptors +[fdinfo](#fdinfo) | Information associated with open file descriptors +[gid_map](#gid_map) | Mappings for group IDs inside the user namespace +[io](#io) | IO statistics +[maps](#maps) | Memory mappings (anon, executables, library files) +[mounts](#mounts) | Mounted filesystems +[mountinfo](#mountinfo) | Information about mounts +[ns](#ns) | Directory containing info about supported namespaces +[stat](#stat) | Process statistics +[statm](#statm) | Process memory statistics +[status](#status) | Process status in human readable format +[task](#task) | Directory containing info about running threads +[uid_map](#uid_map) | Mappings for user IDs inside the user namespace ### auxv @@ -265,6 +258,10 @@ TODO TODO +### environment + +TODO + ### exe TODO @@ -291,6 +288,14 @@ TODO: add more detail. TODO +### mounts + +TODO + +### mountinfo + +TODO + ### ns TODO @@ -302,9 +307,16 @@ num_threads, and exit_signal. TODO: add more detail. +### statm + +Only has data for vss and rss. + +TODO: add more detail. + ### status -Statically created, most of the fields have no data. +Contains data for Name, State, Tgid, Pid, Ppid, TracerPid, FDSize, VmSize, +VmRSS, Threads, CapInh, CapPrm, CapEff, CapBnd, Seccomp. TODO: add more detail. diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 237c61007..edfcdca3f 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -13,8 +13,6 @@ // limitations under the License. // Package linux provides syscall tables for amd64 Linux. -// -// NOTE: Linux i386 support has been removed. package linux import ( -- cgit v1.2.3 From d4c81b7a2135293474f787e4a9abf3802258838b Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 18 May 2018 10:42:52 -0700 Subject: sentry: Get "ip link" working. In Linux, many UDS ioctls are passed through to the NIC driver. 
We do the same here, passing ioctl calls to Unix sockets through to epsocket. In Linux you can see this path at net/socket.c:sock_ioctl, which calls sock_do_ioctl, which calls net/core/dev_ioctl.c:dev_ioctl. SIOCGIFNAME is also added. PiperOrigin-RevId: 197167508 Change-Id: I62c326a4792bd0a473e9c9108aafb6a6354f2b64 --- pkg/sentry/socket/epsocket/epsocket.go | 226 +++++++++++++++++---------------- 1 file changed, 119 insertions(+), 107 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 18cb70c96..2495ba459 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -33,7 +33,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/binary" - "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -703,18 +702,18 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v))) } - } - - // FIXME: Disallow IP-level multicast group options by - // default. These will need to be supported by appropriately plumbing - // the level through to the network stack (if at all). However, we - // still allow setting TTL, and multicast-enable/disable type options. - if level == 0 { + case syscall.SOL_IP: const ( + _IP_MULTICAST_IF = 32 _IP_ADD_MEMBERSHIP = 35 _MCAST_JOIN_GROUP = 42 ) - if name == _IP_ADD_MEMBERSHIP || name == _MCAST_JOIN_GROUP { + switch name { + case _IP_ADD_MEMBERSHIP, _MCAST_JOIN_GROUP, _IP_MULTICAST_IF: + // FIXME: Disallow IP-level multicast group options by + // default. These will need to be supported by appropriately plumbing + // the level through to the network stack (if at all). 
However, we + // still allow setting TTL, and multicast-enable/disable type options. return syserr.ErrInvalidArgument } } @@ -1033,8 +1032,99 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] } } +// Ioctl implements fs.FileOperations.Ioctl. +func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return Ioctl(ctx, s.Endpoint, io, args) +} + +// Ioctl performs a socket ioctl. +func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch arg := int(args[1].Int()); arg { + case syscall.SIOCGIFFLAGS, + syscall.SIOCGIFADDR, + syscall.SIOCGIFBRDADDR, + syscall.SIOCGIFDSTADDR, + syscall.SIOCGIFHWADDR, + syscall.SIOCGIFINDEX, + syscall.SIOCGIFMAP, + syscall.SIOCGIFMETRIC, + syscall.SIOCGIFMTU, + syscall.SIOCGIFNAME, + syscall.SIOCGIFNETMASK, + syscall.SIOCGIFTXQLEN: + + var ifr linux.IFReq + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { + return 0, err.ToError() + } + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case syscall.SIOCGIFCONF: + // Return a list of interface addresses or the buffer size + // necessary to hold the list. 
+ var ifc linux.IFConf + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + if err := ifconfIoctl(ctx, io, &ifc); err != nil { + return 0, err + } + + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{ + AddressSpaceActive: true, + }) + + return 0, err + + case linux.TIOCINQ: + var v tcpip.ReceiveQueueSizeOption + if err := ep.GetSockOpt(&v); err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + + if v > math.MaxInt32 { + v = math.MaxInt32 + } + // Copy result to user-space. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCOUTQ: + var v tcpip.SendQueueSizeOption + if err := ep.GetSockOpt(&v); err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + + if v > math.MaxInt32 { + v = math.MaxInt32 + } + + // Copy result to user-space. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + } + + return 0, syserror.ENOTTY +} + // interfaceIoctl implements interface requests. -func (s *SocketOperations) interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { +func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { var ( iface inet.Interface index int32 @@ -1044,9 +1134,23 @@ func (s *SocketOperations) interfaceIoctl(ctx context.Context, io usermem.IO, ar // Find the relevant device. stack := inet.StackFromContext(ctx) if stack == nil { - log.Warningf("Couldn't find a network stack.") - return syserr.ErrInvalidArgument + return syserr.ErrNoDevice } + + // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to + // identify a device. 
+ if arg == syscall.SIOCGIFNAME { + // Gets the name of the interface given the interface index + // stored in ifr_ifindex. + index = int32(usermem.ByteOrder.Uint32(ifr.Data[:4])) + if iface, ok := stack.Interfaces()[index]; ok { + ifr.SetName(iface.Name) + return nil + } + return syserr.ErrNoDevice + } + + // Find the relevant device. for index, iface = range stack.Interfaces() { if iface.Name == ifr.Name() { found = true @@ -1137,68 +1241,16 @@ func (s *SocketOperations) interfaceIoctl(ctx context.Context, io usermem.IO, ar return nil } -// Ioctl implements fs.FileOperations.Ioctl. -func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - switch arg := int(args[1].Int()); arg { - case syscall.SIOCGIFFLAGS, - syscall.SIOCGIFADDR, - syscall.SIOCGIFBRDADDR, - syscall.SIOCGIFDSTADDR, - syscall.SIOCGIFHWADDR, - syscall.SIOCGIFINDEX, - syscall.SIOCGIFMAP, - syscall.SIOCGIFMETRIC, - syscall.SIOCGIFMTU, - syscall.SIOCGIFNETMASK, - syscall.SIOCGIFTXQLEN: - - var ifr linux.IFReq - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, err - } - if err := s.interfaceIoctl(ctx, io, arg, &ifr); err != nil { - return 0, err.ToError() - } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - case syscall.SIOCGIFCONF: - // Return a list of interface addresses or the buffer size - // necessary to hold the list. 
- var ifc linux.IFConf - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, err - } - - if err := s.ifconfIoctl(ctx, io, &ifc); err != nil { - return 0, err - } - - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{ - AddressSpaceActive: true, - }) - - return 0, err - } - - return Ioctl(ctx, s.Endpoint, io, args) -} - // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. -func (s *SocketOperations) ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { +func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { // If Ptr is NULL, return the necessary buffer size via Len. // Otherwise, write up to Len bytes starting at Ptr containing ifreq // structs. stack := inet.StackFromContext(ctx) if stack == nil { - log.Warningf("Couldn't find a network stack.") - return syserr.ErrInvalidArgument.ToError() + return syserr.ErrNoDevice.ToError() } + if ifc.Ptr == 0 { ifc.Len = int32(len(stack.Interfaces())) * int32(linux.SizeOfIFReq) return nil @@ -1236,43 +1288,3 @@ func (s *SocketOperations) ifconfIoctl(ctx context.Context, io usermem.IO, ifc * } return nil } - -// Ioctl implements fs.FileOperations.Ioctl for sockets backed by a -// commonEndpoint. -func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // Switch on ioctl request. - switch int(args[1].Int()) { - case linux.TIOCINQ: - var v tcpip.ReceiveQueueSizeOption - if err := ep.GetSockOpt(&v); err != nil { - return 0, syserr.TranslateNetstackError(err).ToError() - } - - if v > math.MaxInt32 { - v = math.MaxInt32 - } - // Copy result to user-space. 
- _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - - case linux.TIOCOUTQ: - var v tcpip.SendQueueSizeOption - if err := ep.GetSockOpt(&v); err != nil { - return 0, syserr.TranslateNetstackError(err).ToError() - } - - if v > math.MaxInt32 { - v = math.MaxInt32 - } - - // Copy result to user-space. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - } - - return 0, syserror.ENOTTY -} -- cgit v1.2.3 From 61b0b19497e9ac417de5a600e6ff06d52db4268f Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 21 May 2018 16:48:41 -0700 Subject: Dramatically improve handling of KVM vCPU pool. Especially in situations with small numbers of vCPUs, the existing system resulted in excessive thrashing. Now, execution contexts co-ordinate as smoothly as they can to share a small number of cores. PiperOrigin-RevId: 197483323 Change-Id: I0afc0c5363ea9386994355baf3904bf5fe08c56c --- pkg/sentry/platform/kvm/context.go | 5 +-- pkg/sentry/platform/kvm/kvm_test.go | 5 +-- pkg/sentry/platform/kvm/machine.go | 71 ++++++++++++++++++++++++------------- 3 files changed, 48 insertions(+), 33 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index c9bfbc136..dec26a23a 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -41,10 +41,7 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a fp := (*byte)(ac.FloatingPointData()) // Grab a vCPU. - cpu, err := c.machine.Get() - if err != nil { - return nil, usermem.NoAccess, err - } + cpu := c.machine.Get() // Enable interrupts (i.e. calls to vCPU.Notify). 
if !c.interrupt.Enable(cpu) { diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 778a6d187..a3466fbed 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -59,10 +59,7 @@ func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) { } }() for { - c, err = k.machine.Get() - if err != nil { - t.Fatalf("error getting vCPU: %v", err) - } + c = k.machine.Get() if !fn(c) { break } diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 3ee21fe21..9b7e5130c 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -48,6 +48,9 @@ type machine struct { // mu protects vCPUs. mu sync.Mutex + // available is notified when vCPUs are available. + available sync.Cond + // vCPUs are the machine vCPUs. // // This is eventually keyed by system TID, but is initially indexed by @@ -118,6 +121,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) { fd: vm, vCPUs: make(map[uint64]*vCPU), } + m.available.L = &m.mu if vCPUs > _KVM_NR_VCPUS { // Hard cap at KVM's limit. vCPUs = _KVM_NR_VCPUS @@ -284,25 +288,21 @@ func (m *machine) Destroy() { } // Get gets an available vCPU. -func (m *machine) Get() (*vCPU, error) { +func (m *machine) Get() *vCPU { runtime.LockOSThread() tid := procid.Current() m.mu.Lock() - for { - // Check for an exact match. - if c := m.vCPUs[tid]; c != nil { - c.lock() - m.mu.Unlock() - return c, nil - } + // Check for an exact match. + if c := m.vCPUs[tid]; c != nil { + c.lock() + m.mu.Unlock() + return c + } + for { // Scan for an available vCPU. for origTID, c := range m.vCPUs { - // We can only steal a vCPU that is the vCPUReady - // state. That is, it must not be heading to user mode - // with some other thread, have a waiter registered, or - // be in guest mode already. 
if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { delete(m.vCPUs, origTID) m.vCPUs[tid] = c @@ -313,24 +313,44 @@ func (m *machine) Get() (*vCPU, error) { // may be stale. c.loadSegments() atomic.StoreUint64(&c.tid, tid) - return c, nil + return c } } - // Everything is already in guest mode. - // - // We hold the pool lock here, so we should be able to kick - // something out of kernel mode and have it bounce into host - // mode when it tries to grab the vCPU again. - for _, c := range m.vCPUs { - c.BounceToHost() + // Scan for something not in user mode. + for origTID, c := range m.vCPUs { + if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) { + continue + } + + // The vCPU is not able to transition to + // vCPUGuest|vCPUUser or to vCPUUser because that + // transition requires holding the machine mutex, as we + // do now. There is no path to register a waiter on + // just the vCPUReady state. + for { + c.waitUntilNot(vCPUGuest | vCPUWaiter) + if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { + break + } + } + + // Steal the vCPU. + delete(m.vCPUs, origTID) + m.vCPUs[tid] = c + m.mu.Unlock() + + // See above. + c.loadSegments() + atomic.StoreUint64(&c.tid, tid) + return c } - // Give other threads an opportunity to run. We don't yield the - // pool lock above, so if they try to regrab the lock we will - // serialize at this point. This is extreme, but we don't - // expect to exhaust all vCPUs frequently. - yield() + // Everything is executing in user mode. Wait until something + // is available. Note that signaling the condition variable + // will have the extra effect of kicking the vCPUs out of guest + // mode if that's where they were. + m.available.Wait() } } @@ -338,6 +358,7 @@ func (m *machine) Get() (*vCPU, error) { func (m *machine) Put(c *vCPU) { c.unlock() runtime.UnlockOSThread() + m.available.Signal() } // lock marks the vCPU as in user mode. 
-- cgit v1.2.3 From 705605f9011cfbd58f407ca84bc4c2d8cf39d80b Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 22 May 2018 13:46:37 -0700 Subject: sentry: Add simple SIOCGIFFLAGS support (IFF_RUNNING and IFF_PROMIS). Establishes a way of communicating interface flags between netstack and epsocket. More flags can be added over time. PiperOrigin-RevId: 197616669 Change-Id: I230448c5fb5b7d2e8d69b41a451eb4e1096a0e30 --- pkg/sentry/socket/epsocket/epsocket.go | 57 ++++++++++++++++++++++++++++++-- pkg/tcpip/link/channel/channel.go | 5 +++ pkg/tcpip/link/fdbased/endpoint.go | 13 ++++++-- pkg/tcpip/link/loopback/loopback.go | 5 +++ pkg/tcpip/link/sharedmem/sharedmem.go | 7 ++++ pkg/tcpip/link/sniffer/sniffer.go | 5 +++ pkg/tcpip/link/waitable/waitable.go | 5 +++ pkg/tcpip/link/waitable/waitable_test.go | 5 +++ pkg/tcpip/network/ip_test.go | 5 +++ pkg/tcpip/stack/registration.go | 4 +++ pkg/tcpip/stack/stack.go | 33 ++++++++++++++++++ 11 files changed, 138 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 2495ba459..9ff9af0bc 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -48,12 +48,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + nstack "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) const sizeOfInt32 int = 4 +var errStackType = syserr.New("expected but did not receive an epsocket.Stack", linux.EINVAL) + // ntohs converts a 16-bit number from network byte order to host byte order. It // assumes that the host is little endian. 
func ntohs(v uint16) uint16 { @@ -1177,9 +1180,11 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(n)) case syscall.SIOCGIFFLAGS: - // TODO: Implement. For now, return only that the - // device is up so that ifconfig prints it. - usermem.ByteOrder.PutUint16(ifr.Data[:2], linux.IFF_UP) + f, err := interfaceStatusFlags(stack, iface.Name) + if err != nil { + return err + } + usermem.ByteOrder.PutUint16(ifr.Data[:2], f) case syscall.SIOCGIFADDR: // Copy the IPv4 address out. @@ -1288,3 +1293,49 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { } return nil } + +// interfaceStatusFlags returns status flags for an interface in the stack. +// Flag values and meanings are described in greater detail in netdevice(7) in +// the SIOCGIFFLAGS section. +func interfaceStatusFlags(stack inet.Stack, name string) (uint16, *syserr.Error) { + // epsocket should only ever be passed an epsocket.Stack. + epstack, ok := stack.(*Stack) + if !ok { + return 0, errStackType + } + + // Find the NIC corresponding to this interface. + var ( + nicid tcpip.NICID + info nstack.NICInfo + found bool + ) + ns := epstack.Stack + for nicid, info = range ns.NICInfo() { + if info.Name == name { + found = true + break + } + } + if !found { + return 0, syserr.ErrNoDevice + } + + // Set flags based on NIC state. 
+ nicFlags, err := ns.NICFlags(nicid) + if err != nil { + return 0, syserr.TranslateNetstackError(err) + } + + var retFlags uint16 + if nicFlags.Up { + retFlags |= linux.IFF_UP + } + if nicFlags.Running { + retFlags |= linux.IFF_RUNNING + } + if nicFlags.Promiscuous { + retFlags |= linux.IFF_PROMISC + } + return retFlags, nil +} diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index cebc34553..3f5440cc1 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -67,6 +67,11 @@ func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.dispatcher = dispatcher } +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *Endpoint) IsAttached() bool { + return e.dispatcher != nil +} + // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized // during construction. func (e *Endpoint) MTU() uint32 { diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index da74cd644..668514454 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -45,9 +45,10 @@ type endpoint struct { // its end of the communication pipe. closed func(*tcpip.Error) - vv *buffer.VectorisedView - iovecs []syscall.Iovec - views []buffer.View + vv *buffer.VectorisedView + iovecs []syscall.Iovec + views []buffer.View + attached bool } // Options specify the details about the fd-based endpoint to be created. @@ -96,9 +97,15 @@ func New(opts *Options) tcpip.LinkEndpointID { // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { + e.attached = true go e.dispatchLoop(dispatcher) // S/R-FIXME } +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *endpoint) IsAttached() bool { + return e.attached +} + // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized // during construction. 
func (e *endpoint) MTU() uint32 { diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 1a9cd09d7..f38847949 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -32,6 +32,11 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.dispatcher = dispatcher } +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *endpoint) IsAttached() bool { + return e.dispatcher != nil +} + // MTU implements stack.LinkEndpoint.MTU. It returns a constant that matches the // linux loopback interface. func (*endpoint) MTU() uint32 { diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 2c0f1b294..5369ebc68 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -137,6 +137,13 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Unlock() } +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *endpoint) IsAttached() bool { + e.mu.Lock() + defer e.mu.Unlock() + return e.workerStarted +} + // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized // during construction. func (e *endpoint) MTU() uint32 { diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 72d9a0f1c..3a40081c0 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -143,6 +143,11 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.lower.Attach(e) } +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *endpoint) IsAttached() bool { + return e.dispatcher != nil +} + // MTU implements stack.LinkEndpoint.MTU. It just forwards the request to the // lower endpoint. 
func (e *endpoint) MTU() uint32 { diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index 2c6e73f22..91aed7a12 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -58,6 +58,11 @@ func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.lower.Attach(e) } +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *Endpoint) IsAttached() bool { + return e.dispatcher != nil +} + // MTU implements stack.LinkEndpoint.MTU. It just forwards the request to the // lower endpoint. func (e *Endpoint) MTU() uint32 { diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index cb433dc19..188049322 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -34,6 +34,11 @@ func (e *countedEndpoint) Attach(dispatcher stack.NetworkDispatcher) { e.dispatcher = dispatcher } +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (e *countedEndpoint) IsAttached() bool { + return e.dispatcher != nil +} + func (e *countedEndpoint) MTU() uint32 { return e.mtu } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 797501858..c5f8714da 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -90,6 +90,11 @@ func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, // Attach is only implemented to satisfy the LinkEndpoint interface. func (*testObject) Attach(stack.NetworkDispatcher) {} +// IsAttached implements stack.LinkEndpoint.IsAttached. +func (*testObject) IsAttached() bool { + return true +} + // MTU implements stack.LinkEndpoint.MTU. It just returns a constant that // matches the linux loopback MTU. 
func (*testObject) MTU() uint32 { diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index e7e6381ac..15b2418ad 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -224,6 +224,10 @@ type LinkEndpoint interface { // Attach attaches the data link layer endpoint to the network-layer // dispatcher of the stack. Attach(dispatcher NetworkDispatcher) + + // IsAttached returns whether a NetworkDispatcher is attached to the + // endpoint. + IsAttached() bool } // A LinkAddressResolver is an extension to a NetworkProtocol that diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index f0fbd8aad..3976f585c 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -541,6 +541,39 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { return nics } +// NICStateFlags holds information about the state of an NIC. +type NICStateFlags struct { + // Up indicates whether the interface is running. + Up bool + + // Running indicates whether resources are allocated. + Running bool + + // Promiscuous indicates whether the interface is in promiscuous mode. + Promiscuous bool +} + +// NICFlags returns flags about the state of the NIC. It returns an error if +// the NIC corresponding to id cannot be found. +func (s *Stack) NICFlags(id tcpip.NICID) (NICStateFlags, *tcpip.Error) { + s.mu.RLock() + defer s.mu.RUnlock() + + nic := s.nics[id] + if nic == nil { + return NICStateFlags{}, tcpip.ErrUnknownNICID + } + + ret := NICStateFlags{ + // Netstack interfaces are always up. + Up: true, + + Running: nic.linkEP.IsAttached(), + Promiscuous: nic.promiscuous, + } + return ret, nil +} + // AddAddress adds a new network-layer address to the specified NIC. 
func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { s.mu.RLock() -- cgit v1.2.3 From 7b2b7a394601ae477538838702a2c5924da83751 Mon Sep 17 00:00:00 2001 From: Chanwit Kaewkasi Date: Tue, 22 May 2018 13:46:52 -0700 Subject: Change length type, and let fadvise64 return ESPIPE if file is a pipe Kernel before 2.6.16 return EINVAL, but later return ESPIPE for this case. Also change type of "length" from Uint(uint32) to Int64. Because C header uses type "size_t" (unsigned long) or "off_t" (long) for length. And it makes more sense to check length < 0 with Int64 because Uint cannot be negative. Change-Id: Ifd7fea2dcded7577a30760558d0d31f479f074c4 PiperOrigin-RevId: 197616743 --- pkg/sentry/syscalls/linux/sys_file.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 9b8374ef6..94a876332 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -871,7 +871,7 @@ const ( func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) offset := args[1].Int64() - length := args[2].Uint() + length := args[2].Int64() advice := args[3].Int() if offset < 0 || length < 0 { @@ -884,6 +884,11 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } defer file.DecRef() + // If the FD refers to a pipe or FIFO, return error. + if fs.IsPipe(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ESPIPE + } + switch advice { case _FADV_NORMAL: case _FADV_RANDOM: -- cgit v1.2.3 From 51c95c270be3e0c3867c1bc93cc454b32b276721 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 22 May 2018 16:35:58 -0700 Subject: Remove offset check to match with Linux implementation. 
PiperOrigin-RevId: 197644246 Change-Id: I63eb0a58889e69fbc4af2af8232f6fa1c399d43f --- pkg/sentry/syscalls/linux/sys_file.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 94a876332..a2db9d4c9 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -870,11 +870,11 @@ const ( // This implementation currently ignores the provided advice. func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) - offset := args[1].Int64() length := args[2].Int64() advice := args[3].Int() - if offset < 0 || length < 0 { + // Note: offset is allowed to be negative. + if length < 0 { return 0, nil, syserror.EINVAL } -- cgit v1.2.3 From 7f62e9c32ea6af19ccd92107252fd869e6ef1005 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 24 May 2018 15:17:42 -0700 Subject: rpcinet connect doesn't handle all errnos correctly. These were causing non-blocking related errnos to be returned to the sentry when they were created as blocking FDs internally. PiperOrigin-RevId: 197962932 Change-Id: I3f843535ff87ebf4cb5827e9f3d26abfb79461b0 --- pkg/sentry/socket/rpcinet/socket.go | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 11925f8d8..bca91ab5f 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -213,21 +213,19 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Register for notification when the endpoint becomes writable, then // initiate the connection. 
e, ch := waiter.NewChannelEntry(nil) - s.EventRegister(&e, waiter.EventOut) + s.EventRegister(&e, waiter.EventOut|waiter.EventIn|waiter.EventHUp) defer s.EventUnregister(&e) + for { + if err := rpcConnect(t, s.fd, sockaddr); err == nil || err != syserr.ErrInProgress && err != syserr.ErrAlreadyInProgress { + return err + } - if err := rpcConnect(t, s.fd, sockaddr); err != syserr.ErrConnectStarted && err != syserr.ErrAlreadyConnecting { - return err - } - - // It's pending, so we have to wait for a notification, and fetch the - // result once the wait completes. - if err := t.Block(ch); err != nil { - return syserr.FromError(err) + // It's pending, so we have to wait for a notification, and fetch the + // result once the wait completes. + if err := t.Block(ch); err != nil { + return syserr.FromError(err) + } } - - // Call Connect() again after blocking to find connect's result. - return rpcConnect(t, s.fd, sockaddr) } func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultPayload, *syserr.Error) { -- cgit v1.2.3 From c59475599dbcc226e1ef516f40b581d6f2f3be75 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 30 May 2018 15:13:36 -0700 Subject: Change ring0 & page tables arguments to structs. This is a refactor of ring0 and ring0/pagetables that changes from individual arguments to opts structures. This should involve no functional changes, but sets the stage for subsequent changes. 
PiperOrigin-RevId: 198627556 Change-Id: Id4460340f6a73f0c793cd879324398139cd58ae9 --- pkg/sentry/platform/kvm/address_space.go | 5 +- pkg/sentry/platform/kvm/context.go | 19 ++- pkg/sentry/platform/kvm/kvm.go | 8 +- pkg/sentry/platform/kvm/kvm_test.go | 108 ++++++++++++++--- pkg/sentry/platform/kvm/machine.go | 15 ++- pkg/sentry/platform/kvm/machine_amd64.go | 7 +- pkg/sentry/platform/ring0/defs.go | 19 +++ pkg/sentry/platform/ring0/kernel_amd64.go | 46 +++---- pkg/sentry/platform/ring0/pagetables/BUILD | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 18 ++- .../platform/ring0/pagetables/pagetables_amd64.go | 100 +-------------- .../ring0/pagetables/pagetables_amd64_test.go | 79 ++++++++++++ .../platform/ring0/pagetables/pagetables_test.go | 52 ++++---- .../platform/ring0/pagetables/pagetables_x86.go | 134 +++++++++++++++++++++ .../ring0/pagetables/pagetables_x86_test.go | 79 ------------ 15 files changed, 409 insertions(+), 282 deletions(-) create mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go delete mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index e81cc0caf..a777533c5 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -89,7 +89,10 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac // important; if the pagetable mappings were installed before // ensuring the physical pages were available, then some other // thread could theoretically access them. 
- prev := as.pageTables.Map(addr, length, true /* user */, at, physical) + prev := as.pageTables.Map(addr, length, pagetables.MapOpts{ + AccessType: at, + User: true, + }, physical) inv = inv || prev m.addr += length m.length -= length diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index dec26a23a..aac84febf 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -35,10 +35,7 @@ type context struct { // Switch runs the provided context in the given address space. func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { - // Extract data. localAS := as.(*addressSpace) - regs := &ac.StateData().Regs - fp := (*byte)(ac.FloatingPointData()) // Grab a vCPU. cpu := c.machine.Get() @@ -58,17 +55,17 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a // that the flush can occur naturally on the next user entry. cpu.active.set(localAS) - // Mark the address space as dirty. - flags := ring0.Flags(0) - if localAS.Touch(cpu) { - flags |= ring0.FlagFlush - } - if ac.FullRestore() { - flags |= ring0.FlagFull + // Prepare switch options. + switchOpts := ring0.SwitchOpts{ + Registers: &ac.StateData().Regs, + FloatingPointState: (*byte)(ac.FloatingPointData()), + PageTables: localAS.pageTables, + Flush: localAS.Touch(cpu), + FullRestore: ac.FullRestore(), } // Take the blue pill. - si, at, err := cpu.SwitchToUser(regs, fp, localAS.pageTables, flags) + si, at, err := cpu.SwitchToUser(switchOpts) // Clear the address space. 
cpu.active.set(nil) diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 15a241f01..6defb1c46 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -123,8 +124,11 @@ func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan stru pageTables := k.machine.kernel.PageTables.New() applyPhysicalRegions(func(pr physicalRegion) bool { // Map the kernel in the upper half. - kernelVirtual := usermem.Addr(ring0.KernelStartAddress | pr.virtual) - pageTables.Map(kernelVirtual, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + pageTables.Map( + usermem.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) return true // Keep iterating. }) diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index a3466fbed..00919b214 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -142,7 +142,10 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func // done for regular user code, but is fine for test // purposes.) applyPhysicalRegions(func(pr physicalRegion) bool { - pt.Map(usermem.Addr(pr.virtual), pr.length, true /* user */, usermem.AnyAccess, pr.physical) + pt.Map(usermem.Addr(pr.virtual), pr.length, pagetables.MapOpts{ + AccessType: usermem.AnyAccess, + User: true, + }, pr.physical) return true // Keep iterating. 
}) } @@ -154,13 +157,22 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func func TestApplicationSyscall(t *testing.T) { applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != nil { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }); err != nil { t.Errorf("application syscall with full restore failed: %v", err) } return false }) applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err != nil { t.Errorf("application syscall with partial restore failed: %v", err) } return false @@ -170,14 +182,23 @@ func TestApplicationSyscall(t *testing.T) { func TestApplicationFault(t *testing.T) { applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. 
- if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { t.Errorf("application fault with full restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) } return false }) applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. - if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { t.Errorf("application fault with partial restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) } return false @@ -187,7 +208,11 @@ func TestApplicationFault(t *testing.T) { func TestRegistersSyscall(t *testing.T) { applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. 
- if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err != nil { t.Errorf("application register check with partial restore got unexpected error: %v", err) } if err := testutil.CheckTestRegs(regs, false); err != nil { @@ -200,7 +225,12 @@ func TestRegistersSyscall(t *testing.T) { func TestRegistersFault(t *testing.T) { applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. - if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { + if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }); err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { t.Errorf("application register check with full restore got unexpected error: %v", err) } if err := testutil.CheckTestRegs(regs, true); err != nil { @@ -213,7 +243,12 @@ func TestRegistersFault(t *testing.T) { func TestSegments(t *testing.T) { applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestSegments(regs) - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != nil { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }); err != nil { t.Errorf("application segment check with full restore got unexpected error: %v", err) } if err := testutil.CheckTestSegments(regs); err != nil { @@ -229,7 +264,11 @@ func TestBounce(t *testing.T) { time.Sleep(time.Millisecond) c.BounceToKernel() }() - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != 
platform.ErrContextInterrupt { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } return false @@ -239,7 +278,12 @@ func TestBounce(t *testing.T) { time.Sleep(time.Millisecond) c.BounceToKernel() }() - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextInterrupt { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }); err != platform.ErrContextInterrupt { t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } return false @@ -265,7 +309,11 @@ func TestBounceStress(t *testing.T) { c.BounceToKernel() }() randomSleep() - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } c.unlock() @@ -280,12 +328,21 @@ func TestInvalidate(t *testing.T) { var data uintptr // Used below. applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, &data) // Read legitimate value. - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err != nil { t.Errorf("application partial restore: got %v, wanted nil", err) } // Unmap the page containing data & invalidate. 
pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize) - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFlush); err != platform.ErrContextSignal { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + Flush: true, + }); err != platform.ErrContextSignal { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) } return false @@ -299,14 +356,23 @@ func IsFault(err error, si *arch.SignalInfo) bool { func TestEmptyAddressSpace(t *testing.T) { applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); !IsFault(err, si) { + if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); !IsFault(err, si) { t.Errorf("first fault with partial restore failed got %v", err) t.Logf("registers: %#v", ®s) } return false }) applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); !IsFault(err, si) { + if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }); !IsFault(err, si) { t.Errorf("first fault with full restore failed got %v", err) t.Logf("registers: %#v", ®s) } @@ -357,7 +423,11 @@ func BenchmarkApplicationSyscall(b *testing.B) { a int // Count for ErrContextInterrupt. 
) applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err != nil { if err == platform.ErrContextInterrupt { a++ return true // Ignore. @@ -390,7 +460,11 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { a int ) applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err != nil { if err == platform.ErrContextInterrupt { a++ return true // Ignore. diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 9b7e5130c..5a6109ced 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -186,10 +186,19 @@ func newMachine(vm int, vCPUs int) (*machine, error) { // physical pages are mapped on demand, see kernel_unsafe.go. applyPhysicalRegions(func(pr physicalRegion) bool { // Map everything in the lower half. - m.kernel.PageTables.Map(usermem.Addr(pr.virtual), pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + m.kernel.PageTables.Map( + usermem.Addr(pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + // And keep everything in the upper half. - kernelAddr := usermem.Addr(ring0.KernelStartAddress | pr.virtual) - m.kernel.PageTables.Map(kernelAddr, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + m.kernel.PageTables.Map( + usermem.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + return true // Keep iterating. 
}) diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index f583f68f7..ba7bbcb91 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -24,7 +24,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -121,7 +120,7 @@ func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) } // SwitchToUser unpacks architectural-details. -func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags ring0.Flags) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, usermem.AccessType, error) { // See below. var vector ring0.Vector @@ -131,7 +130,7 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab // allocations occur. entersyscall() bluepill(c) - vector = c.CPU.SwitchToUser(regs, fpState, pt, flags) + vector = c.CPU.SwitchToUser(switchOpts) exitsyscall() switch vector { @@ -147,7 +146,7 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab return info, usermem.AccessType{}, platform.ErrContextSignal case ring0.GeneralProtectionFault: - if !ring0.IsCanonical(regs.Rip) { + if !ring0.IsCanonical(switchOpts.Registers.Rip) { // If the RIP is non-canonical, it's a SEGV. 
info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)} return info, usermem.AccessType{}, platform.ErrContextSignal diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 9d947b73d..7b3bed1c7 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -91,3 +91,22 @@ type CPU struct { func (c *CPU) Registers() *syscall.PtraceRegs { return &c.registers } + +// SwitchOpts are passed to the Switch function. +type SwitchOpts struct { + // Registers are the user register state. + Registers *syscall.PtraceRegs + + // FloatingPointState is a byte pointer where floating point state is + // saved and restored. + FloatingPointState *byte + + // PageTables are the application page tables. + PageTables *pagetables.PageTables + + // Flush indicates that a TLB flush should be forced on switch. + Flush bool + + // FullRestore indicates that an iret-based restore should be used. + FullRestore bool +} diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 76ba65b3f..02d6d0de4 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -18,9 +18,6 @@ package ring0 import ( "encoding/binary" - "syscall" - - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" ) const ( @@ -159,18 +156,6 @@ func IsCanonical(addr uint64) bool { return addr <= 0x00007fffffffffff || addr > 0xffff800000000000 } -// Flags contains flags related to switch. -type Flags uintptr - -const ( - // FlagFull indicates that a full restore should be not, not a fast - // restore (on the syscall return path.) - FlagFull = 1 << iota - - // FlagFlush indicates that a full TLB flush is required. - FlagFlush -) - // SwitchToUser performs either a sysret or an iret. // // The return value is the vector that interrupted execution. @@ -189,8 +174,9 @@ const ( // the case for amd64, but may not be the case for other architectures. 
// //go:nosplit -func (c *CPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags Flags) (vector Vector) { +func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { // Check for canonical addresses. + regs := switchOpts.Registers if !IsCanonical(regs.Rip) || !IsCanonical(regs.Rsp) || !IsCanonical(regs.Fs_base) || !IsCanonical(regs.Gs_base) { return GeneralProtectionFault } @@ -201,10 +187,10 @@ func (c *CPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetabl ) // Sanitize registers. - if flags&FlagFlush != 0 { - userCR3 = pt.FlushCR3() + if switchOpts.Flush { + userCR3 = switchOpts.PageTables.FlushCR3() } else { - userCR3 = pt.CR3() + userCR3 = switchOpts.PageTables.CR3() } regs.Eflags &= ^uint64(UserFlagsClear) regs.Eflags |= UserFlagsSet @@ -213,21 +199,21 @@ func (c *CPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetabl kernelCR3 = c.kernel.PageTables.CR3() // Perform the switch. - swapgs() // GS will be swapped on return. - wrfs(uintptr(regs.Fs_base)) // Set application FS. - wrgs(uintptr(regs.Gs_base)) // Set application GS. - LoadFloatingPoint(fpState) // Copy in floating point. - jumpToKernel() // Switch to upper half. - writeCR3(uintptr(userCR3)) // Change to user address space. - if flags&FlagFull != 0 { + swapgs() // GS will be swapped on return. + wrfs(uintptr(regs.Fs_base)) // Set application FS. + wrgs(uintptr(regs.Gs_base)) // Set application GS. + LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point. + jumpToKernel() // Switch to upper half. + writeCR3(uintptr(userCR3)) // Change to user address space. + if switchOpts.FullRestore { vector = iret(c, regs) } else { vector = sysret(c, regs) } - writeCR3(uintptr(kernelCR3)) // Return to kernel address space. - jumpToUser() // Return to lower half. - SaveFloatingPoint(fpState) // Copy out floating point. - wrfs(uintptr(c.registers.Fs_base)) // Restore kernel FS. 
+ writeCR3(uintptr(kernelCR3)) // Return to kernel address space. + jumpToUser() // Return to lower half. + SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point. + wrfs(uintptr(c.registers.Fs_base)) // Restore kernel FS. return } diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index c0c481ab3..1a8b7931e 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -23,8 +23,8 @@ go_test( name = "pagetables_test", size = "small", srcs = [ + "pagetables_amd64_test.go", "pagetables_test.go", - "pagetables_x86_test.go", "pcids_x86_test.go", ], embed = [":pagetables"], diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index ee7f27601..2df6792f7 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -117,8 +117,8 @@ func (p *PageTables) getPageTable(n *Node, index int) *Node { // True is returned iff there was a previous mapping in the range. // // Precondition: addr & length must be aligned, their sum must not overflow. 
-func (p *PageTables) Map(addr usermem.Addr, length uintptr, user bool, at usermem.AccessType, physical uintptr) bool { - if at == usermem.NoAccess { +func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool { + if !opts.AccessType.Any() { return p.Unmap(addr, length) } prev := false @@ -129,7 +129,7 @@ func (p *PageTables) Map(addr usermem.Addr, length uintptr, user bool, at userme } p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) { p := physical + (s - uintptr(addr)) - prev = prev || (pte.Valid() && (p != pte.Address() || at.Write != pte.Writeable() || at.Execute != pte.Executable())) + prev = prev || (pte.Valid() && (p != pte.Address() || opts != pte.Opts())) if p&align != 0 { // We will install entries at a smaller granulaity if // we don't install a valid entry here, however we must @@ -137,7 +137,7 @@ func (p *PageTables) Map(addr usermem.Addr, length uintptr, user bool, at userme pte.Clear() return } - pte.Set(p, at.Write, at.Execute, user) + pte.Set(p, opts) }) p.mu.Unlock() return prev @@ -167,7 +167,7 @@ func (p *PageTables) Release() { } // Lookup returns the physical address for the given virtual address. -func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType usermem.AccessType) { +func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) { mask := uintptr(usermem.PageSize - 1) off := uintptr(addr) & mask addr = addr &^ usermem.Addr(mask) @@ -176,13 +176,9 @@ func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType use return } physical = pte.Address() + (s - uintptr(addr)) + off - accessType = usermem.AccessType{ - Read: true, - Write: pte.Writeable(), - Execute: pte.Executable(), - } + opts = pte.Opts() }) - return physical, accessType + return } // allocNode allocates a new page. 
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index a2050b99c..8dc50f9dd 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -18,7 +18,6 @@ package pagetables import ( "fmt" - "sync/atomic" ) // Address constraints. @@ -43,98 +42,11 @@ const ( pmdSize = 1 << pmdShift pudSize = 1 << pudShift pgdSize = 1 << pgdShift -) -// Bits in page table entries. -const ( - present = 0x001 - writable = 0x002 - user = 0x004 - writeThrough = 0x008 - cacheDisable = 0x010 - accessed = 0x020 - dirty = 0x040 - super = 0x080 executeDisable = 1 << 63 + entriesPerPage = 512 ) -// PTE is a page table entry. -type PTE uint64 - -// Clear clears this PTE, including super page information. -func (p *PTE) Clear() { - atomic.StoreUint64((*uint64)(p), 0) -} - -// Valid returns true iff this entry is valid. -func (p *PTE) Valid() bool { - return atomic.LoadUint64((*uint64)(p))&present != 0 -} - -// Writeable returns true iff the page is writable. -func (p *PTE) Writeable() bool { - return atomic.LoadUint64((*uint64)(p))&writable != 0 -} - -// User returns true iff the page is user-accessible. -func (p *PTE) User() bool { - return atomic.LoadUint64((*uint64)(p))&user != 0 -} - -// Executable returns true iff the page is executable. -func (p *PTE) Executable() bool { - return atomic.LoadUint64((*uint64)(p))&executeDisable == 0 -} - -// SetSuper sets this page as a super page. -// -// The page must not be valid or a panic will result. -func (p *PTE) SetSuper() { - if p.Valid() { - // This is not allowed. - panic("SetSuper called on valid page!") - } - atomic.StoreUint64((*uint64)(p), super) -} - -// IsSuper returns true iff this page is a super page. -func (p *PTE) IsSuper() bool { - return atomic.LoadUint64((*uint64)(p))&super != 0 -} - -// Set sets this PTE value. 
-func (p *PTE) Set(addr uintptr, write, execute bool, userAccessible bool) { - v := uint64(addr)&^uint64(0xfff) | present | accessed - if userAccessible { - v |= user - } - if !execute { - v |= executeDisable - } - if write { - v |= writable | dirty - } - if p.IsSuper() { - v |= super - } - atomic.StoreUint64((*uint64)(p), v) -} - -// setPageTable sets this PTE value and forces the write bit and super bit to -// be cleared. This is used explicitly for breaking super pages. -func (p *PTE) setPageTable(addr uintptr) { - v := uint64(addr)&^uint64(0xfff) | present | user | writable | accessed | dirty - atomic.StoreUint64((*uint64)(p), v) -} - -// Address extracts the address. This should only be used if Valid returns true. -func (p *PTE) Address() uintptr { - return uintptr(atomic.LoadUint64((*uint64)(p)) & ^uint64(executeDisable|0xfff)) -} - -// entriesPerPage is the number of PTEs per page. -const entriesPerPage = 512 - // PTEs is a collection of entries. type PTEs [entriesPerPage]PTE @@ -255,9 +167,6 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun // Does this page need to be split? if start&(pudSize-1) != 0 || end < next(start, pudSize) { currentAddr := uint64(pudEntry.Address()) - writeable := pudEntry.Writeable() - executable := pudEntry.Executable() - user := pudEntry.User() // Install the relevant entries. pmdNode := p.allocNode() @@ -265,7 +174,7 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun for index := 0; index < entriesPerPage; index++ { pmdEntry := &pmdEntries[index] pmdEntry.SetSuper() - pmdEntry.Set(uintptr(currentAddr), writeable, executable, user) + pmdEntry.Set(uintptr(currentAddr), pudEntry.Opts()) currentAddr += pmdSize } @@ -319,16 +228,13 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun // Does this page need to be split? 
if start&(pmdSize-1) != 0 || end < next(start, pmdSize) { currentAddr := uint64(pmdEntry.Address()) - writeable := pmdEntry.Writeable() - executable := pmdEntry.Executable() - user := pmdEntry.User() // Install the relevant entries. pteNode := p.allocNode() pteEntries := pteNode.PTEs() for index := 0; index < entriesPerPage; index++ { pteEntry := &pteEntries[index] - pteEntry.Set(uintptr(currentAddr), writeable, executable, user) + pteEntry.Set(uintptr(currentAddr), pmdEntry.Opts()) currentAddr += pteSize } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go new file mode 100644 index 000000000..4f15c6b58 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package pagetables + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func Test2MAnd4K(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a small page and a huge page. 
+ pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) + pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: usermem.Read}, pmdSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, + {0x00007f0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read}}, + }) + pt.Release() +} + +func Test1GAnd4K(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a small page and a super page. + pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) + pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: usermem.Read}, pudSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, + {0x00007f0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read}}, + }) + pt.Release() +} + +func TestSplit1GPage(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a super page and knock out the middle. + pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: usermem.Read}, pudSize*42) + pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize)) + + checkMappings(t, pt, []mapping{ + {0x00007f0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read}}, + {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read}}, + }) + pt.Release() +} + +func TestSplit2MPage(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a huge page and knock out the middle. 
+ pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: usermem.Read}, pmdSize*42) + pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize)) + + checkMappings(t, pt, []mapping{ + {0x00007f0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read}}, + {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read}}, + }) + pt.Release() +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index 9cbc0e3b0..a4f684af2 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -28,10 +28,10 @@ func (r reflectTranslater) TranslateToPhysical(ptes *PTEs) uintptr { } type mapping struct { - start uintptr - length uintptr - addr uintptr - writeable bool + start uintptr + length uintptr + addr uintptr + opts MapOpts } func checkMappings(t *testing.T, pt *PageTables, m []mapping) { @@ -44,10 +44,10 @@ func checkMappings(t *testing.T, pt *PageTables, m []mapping) { // Iterate over all the mappings. pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) { found = append(found, mapping{ - start: s, - length: e - s, - addr: pte.Address(), - writeable: pte.Writeable(), + start: s, + length: e - s, + addr: pte.Address(), + opts: pte.Opts(), }) if failed != "" { // Don't keep looking for errors. @@ -62,8 +62,8 @@ func checkMappings(t *testing.T, pt *PageTables, m []mapping) { failed = "end didn't match expected" } else if m[current].addr != pte.Address() { failed = "address didn't match expected" - } else if m[current].writeable != pte.Writeable() { - failed = "writeable didn't match" + } else if m[current].opts != pte.Opts() { + failed = "opts didn't match" } current++ }) @@ -88,7 +88,7 @@ func TestUnmap(t *testing.T) { pt := New(reflectTranslater{}, Opts{}) // Map and unmap one entry. 
- pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) pt.Unmap(0x400000, pteSize) checkMappings(t, pt, nil) @@ -99,10 +99,10 @@ func TestReadOnly(t *testing.T) { pt := New(reflectTranslater{}, Opts{}) // Map one entry. - pt.Map(0x400000, pteSize, true, usermem.Read, pteSize*42) + pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, false}, + {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.Read}}, }) pt.Release() } @@ -111,10 +111,10 @@ func TestReadWrite(t *testing.T) { pt := New(reflectTranslater{}, Opts{}) // Map one entry. - pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, true}, + {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, }) pt.Release() } @@ -123,12 +123,12 @@ func TestSerialEntries(t *testing.T) { pt := New(reflectTranslater{}, Opts{}) // Map two sequential entries. - pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) - pt.Map(0x401000, pteSize, true, usermem.ReadWrite, pteSize*47) + pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) + pt.Map(0x401000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*47) checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, true}, - {0x401000, pteSize, pteSize * 47, true}, + {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, + {0x401000, pteSize, pteSize * 47, MapOpts{AccessType: usermem.ReadWrite}}, }) pt.Release() } @@ -137,11 +137,11 @@ func TestSpanningEntries(t *testing.T) { pt := New(reflectTranslater{}, Opts{}) // Span a pgd with two pages. 
- pt.Map(0x00007efffffff000, 2*pteSize, true, usermem.Read, pteSize*42) + pt.Map(0x00007efffffff000, 2*pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) checkMappings(t, pt, []mapping{ - {0x00007efffffff000, pteSize, pteSize * 42, false}, - {0x00007f0000000000, pteSize, pteSize * 43, false}, + {0x00007efffffff000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.Read}}, + {0x00007f0000000000, pteSize, pteSize * 43, MapOpts{AccessType: usermem.Read}}, }) pt.Release() } @@ -150,12 +150,12 @@ func TestSparseEntries(t *testing.T) { pt := New(reflectTranslater{}, Opts{}) // Map two entries in different pgds. - pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) - pt.Map(0x00007f0000000000, pteSize, true, usermem.Read, pteSize*47) + pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) + pt.Map(0x00007f0000000000, pteSize, MapOpts{AccessType: usermem.Read}, pteSize*47) checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, true}, - {0x00007f0000000000, pteSize, pteSize * 47, false}, + {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, + {0x00007f0000000000, pteSize, pteSize * 47, MapOpts{AccessType: usermem.Read}}, }) pt.Release() } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index dac66373f..8ba78ed0d 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -16,6 +16,12 @@ package pagetables +import ( + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + // Opts are pagetable options. type Opts struct { EnablePCID bool @@ -77,3 +83,131 @@ func (p *PageTables) CR3() uint64 { func (p *PageTables) FlushCR3() uint64 { return uint64(p.root.physical) | uint64(p.pcid) } + +// Bits in page table entries. 
+const ( + present = 0x001 + writable = 0x002 + user = 0x004 + writeThrough = 0x008 + cacheDisable = 0x010 + accessed = 0x020 + dirty = 0x040 + super = 0x080 + global = 0x100 + optionMask = executeDisable | 0xfff +) + +// MapOpts are x86 options. +type MapOpts struct { + // AccessType defines permissions. + AccessType usermem.AccessType + + // Global indicates the page is globally accessible. + Global bool + + // User indicates the page is a user page. + User bool +} + +// PTE is a page table entry. +type PTE uintptr + +// Clear clears this PTE, including super page information. +// +//go:nosplit +func (p *PTE) Clear() { + atomic.StoreUintptr((*uintptr)(p), 0) +} + +// Valid returns true iff this entry is valid. +// +//go:nosplit +func (p *PTE) Valid() bool { + return atomic.LoadUintptr((*uintptr)(p))&present != 0 +} + +// Opts returns the PTE options. +// +// These are all options except Valid and Super. +// +//go:nosplit +func (p *PTE) Opts() MapOpts { + v := atomic.LoadUintptr((*uintptr)(p)) + return MapOpts{ + AccessType: usermem.AccessType{ + Read: v&present != 0, + Write: v&writable != 0, + Execute: v&executeDisable == 0, + }, + Global: v&global != 0, + User: v&user != 0, + } +} + +// SetSuper sets this page as a super page. +// +// The page must not be valid or a panic will result. +// +//go:nosplit +func (p *PTE) SetSuper() { + if p.Valid() { + // This is not allowed. + panic("SetSuper called on valid page!") + } + atomic.StoreUintptr((*uintptr)(p), super) +} + +// IsSuper returns true iff this page is a super page. +// +//go:nosplit +func (p *PTE) IsSuper() bool { + return atomic.LoadUintptr((*uintptr)(p))&super != 0 +} + +// Set sets this PTE value. +// +// This does not change the super page property. 
+// +//go:nosplit +func (p *PTE) Set(addr uintptr, opts MapOpts) { + if !opts.AccessType.Any() { + p.Clear() + return + } + v := (addr &^ optionMask) | present | accessed + if opts.User { + v |= user + } + if opts.Global { + v |= global + } + if !opts.AccessType.Execute { + v |= executeDisable + } + if opts.AccessType.Write { + v |= writable | dirty + } + if p.IsSuper() { + // Note that this is inherited from the previous instance. Set + // does not change the value of Super. See above. + v |= super + } + atomic.StoreUintptr((*uintptr)(p), v) +} + +// setPageTable sets this PTE value and forces the write bit and super bit to +// be cleared. This is used explicitly for breaking super pages. +// +//go:nosplit +func (p *PTE) setPageTable(addr uintptr) { + v := (addr &^ optionMask) | present | user | writable | accessed | dirty + atomic.StoreUintptr((*uintptr)(p), v) +} + +// Address extracts the address. This should only be used if Valid returns true. +// +//go:nosplit +func (p *PTE) Address() uintptr { + return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go deleted file mode 100644 index 1fc403c48..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build i386 amd64 - -package pagetables - -import ( - "testing" - - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -func Test2MAnd4K(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) - - // Map a small page and a huge page. - pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) - pt.Map(0x00007f0000000000, 1<<21, true, usermem.Read, pmdSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, true}, - {0x00007f0000000000, pmdSize, pmdSize * 47, false}, - }) - pt.Release() -} - -func Test1GAnd4K(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) - - // Map a small page and a super page. - pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) - pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, true}, - {0x00007f0000000000, pudSize, pudSize * 47, false}, - }) - pt.Release() -} - -func TestSplit1GPage(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) - - // Map a super page and knock out the middle. - pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*42) - pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x00007f0000000000, pteSize, pudSize * 42, false}, - {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, false}, - }) - pt.Release() -} - -func TestSplit2MPage(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) - - // Map a huge page and knock out the middle. 
- pt.Map(0x00007f0000000000, pmdSize, true, usermem.Read, pmdSize*42) - pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x00007f0000000000, pteSize, pmdSize * 42, false}, - {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, false}, - }) - pt.Release() -} -- cgit v1.2.3 From 57edd0ee199150d7e25c3f072f3779a761ce6b7d Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 30 May 2018 17:37:00 -0700 Subject: Restore FS on resume. Previously, the vCPU FS was always correct because it relied on the reset coming out of the switch. When that doesn't occur, for example, using bluepill directly, the FS value can be incorrect leading to strange corruption. This change is necessary for a subsequent change that enforces guest mode for page table modifications, and it may reduce test flakiness. (The problematic path may occur in tests, but does not occur in the actual platform.) PiperOrigin-RevId: 198648137 Change-Id: I513910a973dd8666c9a1d18cf78990964d6a644d --- pkg/sentry/platform/kvm/bluepill_amd64.go | 2 ++ pkg/sentry/platform/ring0/kernel_amd64.go | 10 +++++----- pkg/sentry/platform/ring0/lib_amd64.go | 16 ++++++++-------- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index a2baefb7d..b364e3ef7 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -98,6 +98,7 @@ func bluepillSyscall() { } ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) ring0.Halt() + ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) } @@ -114,6 +115,7 @@ func bluepillException(vector ring0.Vector) { } ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) ring0.Halt() + ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. 
ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) } diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 02d6d0de4..58ac4b4b2 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -200,8 +200,8 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { // Perform the switch. swapgs() // GS will be swapped on return. - wrfs(uintptr(regs.Fs_base)) // Set application FS. - wrgs(uintptr(regs.Gs_base)) // Set application GS. + WriteFS(uintptr(regs.Fs_base)) // Set application FS. + WriteGS(uintptr(regs.Gs_base)) // Set application GS. LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point. jumpToKernel() // Switch to upper half. writeCR3(uintptr(userCR3)) // Change to user address space. @@ -213,7 +213,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { writeCR3(uintptr(kernelCR3)) // Return to kernel address space. jumpToUser() // Return to lower half. SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point. - wrfs(uintptr(c.registers.Fs_base)) // Restore kernel FS. + WriteFS(uintptr(c.registers.Fs_base)) // Restore kernel FS. return } @@ -225,8 +225,8 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { //go:nosplit func start(c *CPU) { // Save per-cpu & FS segment. - wrgs(kernelAddr(c)) - wrfs(uintptr(c.Registers().Fs_base)) + WriteGS(kernelAddr(c)) + WriteFS(uintptr(c.Registers().Fs_base)) // Initialize floating point. // diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index f1ed5bfb4..de2842b5a 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -43,8 +43,8 @@ func xsave(*byte) // xsaveopt uses xsaveopt to save floating point state. func xsaveopt(*byte) -// wrfs sets the GS address (set by init). -var wrfs func(addr uintptr) +// WriteFS sets the GS address (set by init). 
+var WriteFS func(addr uintptr) // wrfsbase writes to the GS base address. func wrfsbase(addr uintptr) @@ -52,8 +52,8 @@ func wrfsbase(addr uintptr) // wrfsmsr writes to the GS_BASE MSR. func wrfsmsr(addr uintptr) -// wrgs sets the GS address (set by init). -var wrgs func(addr uintptr) +// WriteGS sets the GS address (set by init). +var WriteGS func(addr uintptr) // wrgsbase writes to the GS base address. func wrgsbase(addr uintptr) @@ -119,10 +119,10 @@ func Init(featureSet *cpuid.FeatureSet) { LoadFloatingPoint = fxrstor } if hasFSGSBASE { - wrfs = wrfsbase - wrgs = wrgsbase + WriteFS = wrfsbase + WriteGS = wrgsbase } else { - wrfs = wrfsmsr - wrgs = wrgsmsr + WriteFS = wrfsmsr + WriteGS = wrgsmsr } } -- cgit v1.2.3 From 659b10d1a6a236765be8b6e6dc0d72eaa55253ee Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 1 Jun 2018 13:50:17 -0700 Subject: Move page tables lock into the address space. This is necessary to prevent races with invalidation. It is currently possible that page tables are garbage collected while paging caches refer to them. We must ensure that pages are held until caches can be invalidated. This is not achieved by this goal alone, but moving locking to outside the page tables themselves is a requisite. 
PiperOrigin-RevId: 198920784 Change-Id: I66fffecd49cb14aa2e676a84a68cabfc0c8b3e9a --- pkg/sentry/platform/kvm/address_space.go | 29 ++++++++++++++++++---- pkg/sentry/platform/ring0/pagetables/pagetables.go | 8 ------ 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index a777533c5..4c76883ad 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -16,6 +16,7 @@ package kvm import ( "reflect" + "sync" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" @@ -27,6 +28,11 @@ import ( type addressSpace struct { platform.NoAddressSpaceIO + // mu is the lock for modifications to the address space. + // + // Note that the page tables themselves are not locked. + mu sync.Mutex + // filemem is the memory instance. filemem *filemem.FileMem @@ -49,8 +55,8 @@ type addressSpace struct { files hostMap } -// Invalidate interrupts all dirty contexts. -func (as *addressSpace) Invalidate() { +// invalidate is the implementation for Invalidate. +func (as *addressSpace) invalidate() { for i := 0; i < as.dirtySet.size(); i++ { if c := as.dirtySet.swap(i, nil); c != nil && c.active.get() == as { c.BounceToKernel() // Force a kernel transition. @@ -58,6 +64,13 @@ func (as *addressSpace) Invalidate() { } } +// Invalidate interrupts all dirty contexts. +func (as *addressSpace) Invalidate() { + as.mu.Lock() + defer as.mu.Unlock() + as.invalidate() +} + // Touch adds the given vCPU to the dirty list. // // The return value indicates whether a flush is required. 
@@ -120,7 +133,7 @@ func (as *addressSpace) mapHostFile(addr usermem.Addr, fd int, fr platform.FileR addr += usermem.Addr(m.length) } if inv { - as.Invalidate() + as.invalidate() } return nil @@ -169,7 +182,7 @@ func (as *addressSpace) mapFilemem(addr usermem.Addr, fr platform.FileRange, at addr += usermem.Addr(len(s)) } if inv { - as.Invalidate() + as.invalidate() as.files.DeleteMapping(orig) } @@ -178,6 +191,9 @@ func (as *addressSpace) mapFilemem(addr usermem.Addr, fr platform.FileRange, at // MapFile implements platform.AddressSpace.MapFile. func (as *addressSpace) MapFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + as.mu.Lock() + defer as.mu.Unlock() + // Create an appropriate mapping. If this is filemem, we don't create // custom mappings for each in-application mapping. For files however, // we create distinct mappings for each address space. Unfortunately, @@ -195,8 +211,11 @@ func (as *addressSpace) MapFile(addr usermem.Addr, fd int, fr platform.FileRange // Unmap unmaps the given range by calling pagetables.PageTables.Unmap. func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { + as.mu.Lock() + defer as.mu.Unlock() + if prev := as.pageTables.Unmap(addr, uintptr(length)); prev { - as.Invalidate() + as.invalidate() as.files.DeleteMapping(usermem.AddrRange{ Start: addr, End: addr + usermem.Addr(length), diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 2df6792f7..2a83bbff2 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -16,8 +16,6 @@ package pagetables import ( - "sync" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -39,8 +37,6 @@ type Node struct { // PageTables is a set of page tables. type PageTables struct { - mu sync.Mutex - // root is the pagetable root. 
root *Node @@ -122,7 +118,6 @@ func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physic return p.Unmap(addr, length) } prev := false - p.mu.Lock() end, ok := addr.AddLength(uint64(length)) if !ok { panic("pagetables.Map: overflow") @@ -139,7 +134,6 @@ func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physic } pte.Set(p, opts) }) - p.mu.Unlock() return prev } @@ -147,13 +141,11 @@ func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physic // // True is returned iff there was a previous mapping in the range. func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { - p.mu.Lock() count := 0 p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) { pte.Clear() count++ }) - p.mu.Unlock() return count > 0 } -- cgit v1.2.3 From 0212f222c74b9f88c5c74d920127e47e942dc376 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Fri, 1 Jun 2018 14:58:46 -0700 Subject: Fix refcount bug in rpcinet socketOperations.Accept. 
PiperOrigin-RevId: 198931222 Change-Id: I69ee12318e87b9a6a4a94b18a9bf0ae4e39d7eaf --- pkg/sentry/socket/rpcinet/socket.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index bca91ab5f..74cb84927 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -277,8 +277,10 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, file := fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonBlocking: flags&linux.SOCK_NONBLOCK != 0}, &socketOperations{ wq: &wq, fd: payload.Fd, + rpcConn: s.rpcConn, notifier: s.notifier, }) + defer file.DecRef() fdFlags := kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, -- cgit v1.2.3 From ff7b4a156f95a587b5df4de89a22c200fceabb96 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 5 Jun 2018 15:43:55 -0700 Subject: Add support for rpcinet owned procfs files. This change will add support for /proc/sys/net and /proc/net which will be managed and owned by rpcinet. This will allow these inodes to be forward as rpcs. 
PiperOrigin-RevId: 199370799 Change-Id: I2c876005d98fe55dd126145163bee5a645458ce4 --- pkg/sentry/fs/proc/BUILD | 2 + pkg/sentry/fs/proc/net.go | 17 +++ pkg/sentry/fs/proc/proc.go | 45 +++++++- pkg/sentry/fs/proc/rpcinet_proc.go | 193 +++++++++++++++++++++++++++++++++ pkg/sentry/fs/proc/sys.go | 10 +- pkg/sentry/fs/proc/sys_net.go | 66 ++++++++++- pkg/sentry/socket/rpcinet/conn/conn.go | 20 ++++ pkg/sentry/socket/rpcinet/stack.go | 90 +++++---------- 8 files changed, 374 insertions(+), 69 deletions(-) create mode 100644 pkg/sentry/fs/proc/rpcinet_proc.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 18372cfbf..21b5fc0c3 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -44,6 +44,7 @@ go_library( "net.go", "proc.go", "proc_state.go", + "rpcinet_proc.go", "stat.go", "sys.go", "sys_net.go", @@ -70,6 +71,7 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/mm", + "//pkg/sentry/socket/rpcinet", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 6e464857a..e6bd35f27 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -32,6 +32,23 @@ func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { if s := p.k.NetworkStack(); s != nil && s.SupportsIPv6() { d.AddChild(ctx, "dev", seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc)) d.AddChild(ctx, "if_inet6", seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)) + + // The following files are simple stubs until they are implemented in + // netstack, if the file contains a header the stub is just the header + // otherwise it is an empty file. 
+ d.AddChild(ctx, "arp", p.newStubProcFSFile(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device"))) + d.AddChild(ctx, "ipv6_route", p.newStubProcFSFile(ctx, msrc, []byte(""))) + d.AddChild(ctx, "netlink", p.newStubProcFSFile(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode"))) + d.AddChild(ctx, "netstat", p.newStubProcFSFile(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv 
TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess"))) + d.AddChild(ctx, "packet", p.newStubProcFSFile(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode"))) + d.AddChild(ctx, "protocols", p.newStubProcFSFile(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em"))) + d.AddChild(ctx, "psched", p.newStubProcFSFile(ctx, msrc, []byte(""))) + d.AddChild(ctx, "ptype", p.newStubProcFSFile(ctx, msrc, []byte("Type Device Function"))) + d.AddChild(ctx, "route", p.newStubProcFSFile(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT"))) + d.AddChild(ctx, "tcp", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) + d.AddChild(ctx, "tcp6", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) + d.AddChild(ctx, "udp", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops"))) + d.AddChild(ctx, "udp6", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) } return newFile(d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 459eb7e62..d727e1bc9 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -17,6 +17,7 @@ package proc import ( "fmt" + "io" "sort" "strconv" @@ -26,6 +27,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" 
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) // proc is a root proc node. @@ -40,6 +44,30 @@ type proc struct { pidns *kernel.PIDNamespace } +// stubProcFSFile is a file type that can be used to return file contents +// which are constant. This file is not writable and will always have mode +// 0444. +type stubProcFSFile struct { + ramfs.Entry + + // contents are the immutable file contents that will always be returned. + contents []byte +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (s *stubProcFSFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + if offset >= int64(len(s.contents)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, s.contents[offset:]) + return int64(n), err +} + // New returns the root node of a partial simple procfs. func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { k := kernel.KernelFromContext(ctx) @@ -83,6 +111,15 @@ func (p *proc) newSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { return newFile(s, msrc, fs.Symlink, nil) } +// newStubProcFsFile returns a procfs file with constant contents. +func (p *proc) newStubProcFSFile(ctx context.Context, msrc *fs.MountSource, c []byte) *fs.Inode { + u := &stubProcFSFile{ + contents: c, + } + u.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(u, msrc, fs.SpecialFile, nil) +} + // Readlink implements fs.InodeOperations.Readlink. func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if t := kernel.TaskFromContext(ctx); t != nil { @@ -107,7 +144,13 @@ func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dire // Is it a dynamic element? 
nfs := map[string]func() *fs.Inode{ - "net": func() *fs.Inode { return p.newNetDir(ctx, dir.MountSource) }, + "net": func() *fs.Inode { + // If we're using rpcinet we will let it manage /proc/net. + if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { + return newRPCInetProcNet(ctx, dir.MountSource) + } + return p.newNetDir(ctx, dir.MountSource) + }, "self": func() *fs.Inode { return p.newSelf(ctx, dir.MountSource) }, "sys": func() *fs.Inode { return p.newSysDir(ctx, dir.MountSource) }, } diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go new file mode 100644 index 000000000..50d0271f9 --- /dev/null +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -0,0 +1,193 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "io" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// rpcinetFile implments fs.InodeOperations as RPCs. +type rpcinetFile struct { + ramfs.Entry + + // filepath is the full path of this rpcinetFile. + filepath string + + k *kernel.Kernel +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. 
+// This method can panic if an rpcinetFile was created without an rpcinet +// stack. +func (r rpcinetFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + s, ok := r.k.NetworkStack().(*rpcinet.Stack) + if !ok { + panic("Network stack is not a rpcinet.") + } + + contents, se := s.RPCReadFile(r.filepath) + if se != nil || offset >= int64(len(contents)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, contents[offset:]) + return int64(n), err +} + +// Truncate implements fs.InodeOperations.Truncate. +func (r rpcinetFile) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +// This method can panic if an rpcinetFile was created without an rpcinet +// stack. +func (r rpcinetFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + s, ok := r.k.NetworkStack().(*rpcinet.Stack) + if !ok { + panic("Network stack is not a rpcinet.") + } + + if src.NumBytes() == 0 { + return 0, nil + } + + b := make([]byte, src.NumBytes(), src.NumBytes()) + n, err := src.CopyIn(ctx, b) + if err != nil { + return int64(n), err + } + + written, se := s.RPCWriteFile(r.filepath, b) + return int64(written), se.ToError() +} + +func newRPCProcFSFile(ctx context.Context, msrc *fs.MountSource, filepath string, mode linux.FileMode) *fs.Inode { + f := &rpcinetFile{ + filepath: filepath, + k: kernel.KernelFromContext(ctx), + } + f.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(mode)) + + fi := newFile(f, msrc, fs.SpecialFile, nil) + return fi +} + +// newRPCInetProcNet will build an inode for /proc/net. +func newRPCInetProcNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + + // Add all the files we want to forward for /proc/net. 
+ d.AddChild(ctx, "arp", newRPCProcFSFile(ctx, msrc, "/proc/net/arp", 0444)) + d.AddChild(ctx, "dev", newRPCProcFSFile(ctx, msrc, "/proc/net/dev", 0444)) + d.AddChild(ctx, "if_inet6", newRPCProcFSFile(ctx, msrc, "/proc/net/if_inet6", 0444)) + d.AddChild(ctx, "ipv6_route", newRPCProcFSFile(ctx, msrc, "/proc/net/ipv6_route", 0444)) + d.AddChild(ctx, "netlink", newRPCProcFSFile(ctx, msrc, "/proc/net/netlink", 0444)) + d.AddChild(ctx, "netstat", newRPCProcFSFile(ctx, msrc, "/proc/net/netstat", 0444)) + d.AddChild(ctx, "packet", newRPCProcFSFile(ctx, msrc, "/proc/net/packet", 0444)) + d.AddChild(ctx, "protocols", newRPCProcFSFile(ctx, msrc, "/proc/net/protocols", 0444)) + d.AddChild(ctx, "psched", newRPCProcFSFile(ctx, msrc, "/proc/net/psched", 0444)) + d.AddChild(ctx, "ptype", newRPCProcFSFile(ctx, msrc, "/proc/net/ptype", 0444)) + d.AddChild(ctx, "route", newRPCProcFSFile(ctx, msrc, "/proc/net/route", 0444)) + d.AddChild(ctx, "tcp", newRPCProcFSFile(ctx, msrc, "/proc/net/tcp", 0444)) + d.AddChild(ctx, "tcp6", newRPCProcFSFile(ctx, msrc, "/proc/net/tcp6", 0444)) + d.AddChild(ctx, "udp", newRPCProcFSFile(ctx, msrc, "/proc/net/udp", 0444)) + d.AddChild(ctx, "udp6", newRPCProcFSFile(ctx, msrc, "/proc/net/udp6", 0444)) + + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +// newRPCInetProcSysNet will build an inode for /proc/sys/net. +func newRPCInetProcSysNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + d.AddChild(ctx, "ipv4", newRPCInetSysNetIPv4Dir(ctx, msrc)) + d.AddChild(ctx, "core", newRPCInetSysNetCore(ctx, msrc)) + + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +// newRPCInetSysNetCore builds the /proc/sys/net/core directory. 
+func newRPCInetSysNetCore(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + + // Add all the files we want to forward over RPC for /proc/sys/net/core + d.AddChild(ctx, "default_qdisc", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/default_qdisc", 0444)) + d.AddChild(ctx, "message_burst", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/message_burst", 0444)) + d.AddChild(ctx, "message_cost", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/message_cost", 0444)) + d.AddChild(ctx, "optmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/optmem_max", 0444)) + d.AddChild(ctx, "rmem_default", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/rmem_default", 0444)) + d.AddChild(ctx, "rmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/rmem_max", 0444)) + d.AddChild(ctx, "somaxconn", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/somaxconn", 0444)) + d.AddChild(ctx, "wmem_default", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/wmem_default", 0444)) + d.AddChild(ctx, "wmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/wmem_max", 0444)) + + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +// newRPCInetSysNetIPv4Dir builds the /proc/sys/net/ipv4 directory. +func newRPCInetSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + + // Add all the files we want to forward over RPC for /proc/sys/net/ipv4. 
+ d.AddChild(ctx, "ip_local_port_range", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_local_port_range", 0444)) + d.AddChild(ctx, "ip_local_reserved_ports", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_local_reserved_ports", 0444)) + d.AddChild(ctx, "ipfrag_time", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ipfrag_time", 0444)) + d.AddChild(ctx, "ip_nonlocal_bind", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_nonlocal_bind", 0444)) + d.AddChild(ctx, "ip_no_pmtu_disc", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_no_pmtu_disc", 0444)) + + d.AddChild(ctx, "tcp_allowed_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_allowed_congestion_control", 0444)) + d.AddChild(ctx, "tcp_available_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_available_congestion_control", 0444)) + d.AddChild(ctx, "tcp_base_mss", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_base_mss", 0444)) + d.AddChild(ctx, "tcp_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_congestion_control", 0644)) + d.AddChild(ctx, "tcp_dsack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_dsack", 0644)) + d.AddChild(ctx, "tcp_early_retrans", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_early_retrans", 0644)) + d.AddChild(ctx, "tcp_fack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fack", 0644)) + d.AddChild(ctx, "tcp_fastopen", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen", 0644)) + d.AddChild(ctx, "tcp_fastopen_key", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen_key", 0444)) + d.AddChild(ctx, "tcp_fin_timeout", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fin_timeout", 0644)) + d.AddChild(ctx, "tcp_invalid_ratelimit", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_invalid_ratelimit", 0444)) + d.AddChild(ctx, "tcp_keepalive_intvl", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_intvl", 0644)) + d.AddChild(ctx, "tcp_keepalive_probes", 
newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_probes", 0644)) + d.AddChild(ctx, "tcp_keepalive_time", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_time", 0644)) + d.AddChild(ctx, "tcp_mem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_mem", 0444)) + d.AddChild(ctx, "tcp_mtu_probing", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_mtu_probing", 0644)) + d.AddChild(ctx, "tcp_no_metrics_save", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_no_metrics_save", 0444)) + d.AddChild(ctx, "tcp_probe_interval", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_interval", 0444)) + d.AddChild(ctx, "tcp_probe_threshold", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_threshold", 0444)) + d.AddChild(ctx, "tcp_retries1", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries1", 0644)) + d.AddChild(ctx, "tcp_retries2", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries2", 0644)) + d.AddChild(ctx, "tcp_rfc1337", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_rfc1337", 0444)) + d.AddChild(ctx, "tcp_rmem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_rmem", 0444)) + d.AddChild(ctx, "tcp_sack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_sack", 0644)) + d.AddChild(ctx, "tcp_slow_start_after_idle", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_slow_start_after_idle", 0644)) + d.AddChild(ctx, "tcp_synack_retries", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_synack_retries", 0644)) + d.AddChild(ctx, "tcp_syn_retries", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_syn_retries", 0644)) + d.AddChild(ctx, "tcp_timestamps", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_timestamps", 0644)) + d.AddChild(ctx, "tcp_wmem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_wmem", 0444)) + + return newFile(d, msrc, fs.SpecialDirectory, nil) +} diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 4323f3650..db9ec83b9 100644 --- 
a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -23,6 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -112,6 +113,13 @@ func (p *proc) newSysDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) d.AddChild(ctx, "kernel", p.newKernelDir(ctx, msrc)) d.AddChild(ctx, "vm", p.newVMDir(ctx, msrc)) - d.AddChild(ctx, "net", p.newSysNetDir(ctx, msrc)) + + // If we're using rpcinet we will let it manage /proc/sys/net. + if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { + d.AddChild(ctx, "net", newRPCInetProcSysNet(ctx, msrc)) + } else { + d.AddChild(ctx, "net", p.newSysNetDir(ctx, msrc)) + } + return newFile(d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index db44c95cb..2a108708c 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -158,7 +158,28 @@ func (s *tcpSack) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, return n, s.s.SetTCPSACKEnabled(v != 0) } -func newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { +func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + + // The following files are simple stubs until they are implemented in + // netstack, most of these files are configuration related. We use the + // value closest to the actual netstack behavior or any empty file, + // all of these files will have mode 0444 (read-only for all users). 
+ d.AddChild(ctx, "default_qdisc", p.newStubProcFSFile(ctx, msrc, []byte("pfifo_fast"))) + d.AddChild(ctx, "message_burst", p.newStubProcFSFile(ctx, msrc, []byte("10"))) + d.AddChild(ctx, "message_cost", p.newStubProcFSFile(ctx, msrc, []byte("5"))) + d.AddChild(ctx, "optmem_max", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "rmem_default", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) + d.AddChild(ctx, "rmem_max", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) + d.AddChild(ctx, "somaxconn", p.newStubProcFSFile(ctx, msrc, []byte("128"))) + d.AddChild(ctx, "wmem_default", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) + d.AddChild(ctx, "wmem_max", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) + + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { d := &ramfs.Dir{} d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) @@ -175,6 +196,46 @@ func newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) * // Add tcp_sack. d.AddChild(ctx, "tcp_sack", newTCPSackInode(ctx, msrc, s)) + // The following files are simple stubs until they are implemented in + // netstack, most of these files are configuration related. We use the + // value closest to the actual netstack behavior or any empty file, + // all of these files will have mode 0444 (read-only for all users). + d.AddChild(ctx, "ip_local_port_range", p.newStubProcFSFile(ctx, msrc, []byte("16000 65535"))) + d.AddChild(ctx, "ip_local_reserved_ports", p.newStubProcFSFile(ctx, msrc, []byte(""))) + d.AddChild(ctx, "ipfrag_time", p.newStubProcFSFile(ctx, msrc, []byte("30"))) + d.AddChild(ctx, "ip_nonlocal_bind", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "ip_no_pmtu_disc", p.newStubProcFSFile(ctx, msrc, []byte("1"))) + + // tcp_allowed_congestion_control tell the user what they are able to do as an + // unprivledged process so we leave it empty. 
+ d.AddChild(ctx, "tcp_allowed_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte(""))) + d.AddChild(ctx, "tcp_available_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte("reno"))) + d.AddChild(ctx, "tcp_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte("reno"))) + + // Many of the following stub files are features netstack doesn't support + // and are therefore "0" for disabled. + d.AddChild(ctx, "tcp_base_mss", p.newStubProcFSFile(ctx, msrc, []byte("1280"))) + d.AddChild(ctx, "tcp_dsack", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_early_retrans", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_fack", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_fastopen", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_fastopen_key", p.newStubProcFSFile(ctx, msrc, []byte(""))) + d.AddChild(ctx, "tcp_invalid_ratelimit", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_keepalive_intvl", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_keepalive_probes", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_keepalive_time", p.newStubProcFSFile(ctx, msrc, []byte("7200"))) + d.AddChild(ctx, "tcp_mtu_probing", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_no_metrics_save", p.newStubProcFSFile(ctx, msrc, []byte("1"))) + d.AddChild(ctx, "tcp_probe_interval", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_probe_threshold", p.newStubProcFSFile(ctx, msrc, []byte("0"))) + d.AddChild(ctx, "tcp_retries1", p.newStubProcFSFile(ctx, msrc, []byte("3"))) + d.AddChild(ctx, "tcp_retries2", p.newStubProcFSFile(ctx, msrc, []byte("15"))) + d.AddChild(ctx, "tcp_rfc1337", p.newStubProcFSFile(ctx, msrc, []byte("1"))) + d.AddChild(ctx, "tcp_slow_start_after_idle", p.newStubProcFSFile(ctx, msrc, []byte("1"))) + d.AddChild(ctx, "tcp_synack_retries", p.newStubProcFSFile(ctx, msrc, []byte("5"))) + 
d.AddChild(ctx, "tcp_syn_retries", p.newStubProcFSFile(ctx, msrc, []byte("3"))) + d.AddChild(ctx, "tcp_timestamps", p.newStubProcFSFile(ctx, msrc, []byte("1"))) + return newFile(d, msrc, fs.SpecialDirectory, nil) } @@ -182,7 +243,8 @@ func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode d := &ramfs.Dir{} d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) if s := p.k.NetworkStack(); s != nil { - d.AddChild(ctx, "ipv4", newSysNetIPv4Dir(ctx, msrc, s)) + d.AddChild(ctx, "ipv4", p.newSysNetIPv4Dir(ctx, msrc, s)) + d.AddChild(ctx, "core", p.newSysNetCore(ctx, msrc, s)) } return newFile(d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index ea6ec87ed..f4c8489b1 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -147,6 +147,26 @@ func (c *RPCConnection) RPCReadFile(path string) ([]byte, *syserr.Error) { return res.(*pb.ReadFileResponse_Data).Data, nil } +// RPCWriteFile will execute the WriteFile helper RPC method which avoids the +// common pattern of open(2), write(2), write(2), close(2) by doing all +// operations as a single RPC. +func (c *RPCConnection) RPCWriteFile(path string, data []byte) (int64, *syserr.Error) { + req := &pb.SyscallRequest_WriteFile{&pb.WriteFileRequest{ + Path: path, + Content: data, + }} + + id, ch := c.NewRequest(pb.SyscallRequest{Args: req}, false /* ignoreResult */) + <-ch + + res := c.Request(id).Result.(*pb.SyscallResponse_WriteFile).WriteFile + if e := res.ErrorNumber; e != 0 { + return int64(res.Written), syserr.FromHost(syscall.Errno(e)) + } + + return int64(res.Written), nil +} + // Request retrieves the request corresponding to the given request ID. 
// // The channel returned by NewRequest must have been closed before Request can diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go index 503e0e932..bcb89fb34 100644 --- a/pkg/sentry/socket/rpcinet/stack.go +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -16,50 +16,24 @@ package rpcinet import ( "fmt" - "strings" "syscall" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/unet" ) // Stack implements inet.Stack for RPC backed sockets. type Stack struct { - // We intentionally do not allow these values to be changed to remain - // consistent with the other networking stacks. 
interfaces map[int32]inet.Interface interfaceAddrs map[int32][]inet.InterfaceAddr - supportsIPv6 bool - tcpRecvBufSize inet.TCPBufferSize - tcpSendBufSize inet.TCPBufferSize - tcpSACKEnabled bool rpcConn *conn.RPCConnection notifier *notifier.Notifier } -func readTCPBufferSizeFile(conn *conn.RPCConnection, filename string) (inet.TCPBufferSize, error) { - contents, se := conn.RPCReadFile(filename) - if se != nil { - return inet.TCPBufferSize{}, fmt.Errorf("failed to read %s: %v", filename, se) - } - ioseq := usermem.BytesIOSequence(contents) - fields := make([]int32, 3) - if n, err := usermem.CopyInt32StringsInVec(context.Background(), ioseq.IO, ioseq.Addrs, fields, ioseq.Opts); n != ioseq.NumBytes() || err != nil { - return inet.TCPBufferSize{}, fmt.Errorf("failed to parse %s (%q): got %v after %d/%d bytes", filename, contents, err, n, ioseq.NumBytes()) - } - return inet.TCPBufferSize{ - Min: int(fields[0]), - Default: int(fields[1]), - Max: int(fields[2]), - }, nil -} - // NewStack returns a Stack containing the current state of the host network // stack. func NewStack(fd int32) (*Stack, error) { @@ -80,31 +54,6 @@ func NewStack(fd int32) (*Stack, error) { return nil, e } - // Load the configuration values from procfs. 
- tcpRMem, e := readTCPBufferSizeFile(stack.rpcConn, "/proc/sys/net/ipv4/tcp_rmem") - if e != nil { - return nil, e - } - stack.tcpRecvBufSize = tcpRMem - - tcpWMem, e := readTCPBufferSizeFile(stack.rpcConn, "/proc/sys/net/ipv4/tcp_wmem") - if e != nil { - return nil, e - } - stack.tcpSendBufSize = tcpWMem - - ipv6, se := stack.rpcConn.RPCReadFile("/proc/net/if_inet6") - if len(string(ipv6)) > 0 { - stack.supportsIPv6 = true - } - - sackFile := "/proc/sys/net/ipv4/tcp_sack" - sack, se := stack.rpcConn.RPCReadFile(sackFile) - if se != nil { - return nil, fmt.Errorf("failed to read %s: %v", sackFile, se) - } - stack.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" - links, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETLINK) if err != nil { return nil, fmt.Errorf("RTM_GETLINK failed: %v", err) @@ -123,6 +72,21 @@ func NewStack(fd int32) (*Stack, error) { return stack, nil } +// RPCReadFile will execute the ReadFile helper RPC method which avoids the +// common pattern of open(2), read(2), close(2) by doing all three operations +// as a single RPC. It will read the entire file or return EFBIG if the file +// was too large. +func (s *Stack) RPCReadFile(path string) ([]byte, *syserr.Error) { + return s.rpcConn.RPCReadFile(path) +} + +// RPCWriteFile will execute the WriteFile helper RPC method which avoids the +// common pattern of open(2), write(2), write(2), close(2) by doing all +// operations as a single RPC. +func (s *Stack) RPCWriteFile(path string, data []byte) (int64, *syserr.Error) { + return s.rpcConn.RPCWriteFile(path, data) +} + // Interfaces implements inet.Stack.Interfaces. func (s *Stack) Interfaces() map[int32]inet.Interface { return s.interfaces @@ -135,41 +99,37 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { // SupportsIPv6 implements inet.Stack.SupportsIPv6. 
func (s *Stack) SupportsIPv6() bool { - return s.supportsIPv6 + panic("rpcinet handles procfs directly this method should not be called") } // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { - return s.tcpRecvBufSize, nil + panic("rpcinet handles procfs directly this method should not be called") } // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { - // To keep all the supported stacks consistent we don't allow changing this - // value even though it would be possible via an RPC. - return syserror.EACCES + panic("rpcinet handles procfs directly this method should not be called") + } // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { - return s.tcpSendBufSize, nil + panic("rpcinet handles procfs directly this method should not be called") + } // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { - // To keep all the supported stacks consistent we don't allow changing this - // value even though it would be possible via an RPC. - return syserror.EACCES + panic("rpcinet handles procfs directly this method should not be called") } // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. func (s *Stack) TCPSACKEnabled() (bool, error) { - return s.tcpSACKEnabled, nil + panic("rpcinet handles procfs directly this method should not be called") } // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. func (s *Stack) SetTCPSACKEnabled(enabled bool) error { - // To keep all the supported stacks consistent we don't allow changing this - // value even though it would be possible via an RPC. 
- return syserror.EACCES + panic("rpcinet handles procfs directly this method should not be called") } -- cgit v1.2.3 From 722275c3d1a7b420915e6e6a3d623ae941c494cf Mon Sep 17 00:00:00 2001 From: Googler Date: Wed, 6 Jun 2018 11:43:01 -0700 Subject: Added a function to the controller to checkpoint a container. Functionality for checkpoint is not complete, more to come. PiperOrigin-RevId: 199500803 Change-Id: Iafb0fcde68c584270000fea898e6657a592466f7 --- pkg/sentry/control/BUILD | 4 +++ pkg/sentry/control/state.go | 73 +++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/controller.go | 19 +++++++++++- runsc/boot/loader.go | 6 ++-- 4 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 pkg/sentry/control/state.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 4d1d0d019..6169891f7 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "control.go", "proc.go", + "state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/control", visibility = [ @@ -14,6 +15,7 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/fs/host", "//pkg/sentry/kernel", @@ -21,7 +23,9 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", + "//pkg/sentry/state", "//pkg/sentry/usage", + "//pkg/sentry/watchdog", "//pkg/urpc", ], ) diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go new file mode 100644 index 000000000..cee4db636 --- /dev/null +++ b/pkg/sentry/control/state.go @@ -0,0 +1,73 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "errors" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/state" + "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +// ErrInvalidFiles is returned when the urpc call to Save does not include an +// appropriate file payload (e.g. there is no output file!). +var ErrInvalidFiles = errors.New("exactly one file must be provided") + +// State includes state-related functions. +type State struct { + Kernel *kernel.Kernel + Watchdog *watchdog.Watchdog +} + +// SaveOpts contains options for the Save RPC call. +type SaveOpts struct { + // Key is used for state integrity check. + Key []byte `json:"key"` + + // Metadata is the set of metadata to prepend to the state file. + Metadata map[string]string `json:"metadata"` + + // FilePayload contains the destination for the state. + urpc.FilePayload +} + +// Save saves the running system. +func (s *State) Save(o *SaveOpts, _ *struct{}) error { + // Create an output stream. + if len(o.FilePayload.Files) != 1 { + return ErrInvalidFiles + } + defer o.FilePayload.Files[0].Close() + + // Save to the first provided stream. 
+ saveOpts := state.SaveOpts{ + Destination: o.FilePayload.Files[0], + Key: o.Key, + Metadata: o.Metadata, + Callback: func(err error) { + if err == nil { + log.Infof("Save succeeded: exiting...") + } else { + log.Warningf("Save failed: exiting...") + s.Kernel.SetExitError(err) + } + s.Kernel.Kill(kernel.ExitStatus{}) + }, + } + return saveOpts.Save(s.Kernel, s.Watchdog) +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 8fc0a9076..095b0a9b9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -22,9 +22,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" + "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" ) const ( + // ContainerCheckpoint checkpoints a container. + ContainerCheckpoint = "containerManager.Checkpoint" + // ContainerEvent is the URPC endpoint for getting stats about the // container used by "runsc events". ContainerEvent = "containerManager.Event" @@ -69,7 +73,7 @@ type controller struct { } // newController creates a new controller and starts it listening. -func newController(fd int, k *kernel.Kernel) (*controller, error) { +func newController(fd int, k *kernel.Kernel, w *watchdog.Watchdog) (*controller, error) { srv, err := server.CreateFromFD(fd) if err != nil { return nil, err @@ -79,6 +83,7 @@ func newController(fd int, k *kernel.Kernel) (*controller, error) { startChan: make(chan struct{}), startResultChan: make(chan error), k: k, + watchdog: w, } srv.Register(manager) @@ -113,6 +118,9 @@ type containerManager struct { // k is the emulated linux kernel on which the sandboxed // containers run. k *kernel.Kernel + + // watchdog is the kernel watchdog. + watchdog *watchdog.Watchdog } // StartRoot will start the root container process. 
@@ -136,6 +144,15 @@ func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) err return nil } +// Checkpoint pauses a sandbox and saves its state. +func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { + state := control.State{ + Kernel: cm.k, + Watchdog: cm.watchdog, + } + return state.Save(o, nil) +} + // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { // TODO: Use the cid and wait on the init process in that diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 76edbb905..41d1ee50d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -186,6 +186,9 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console atomic.StoreUint32(&sniffer.LogPackets, 0) } + // Create a watchdog. + watchdog := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning) + // Create the control server using the provided FD. // // This must be done *after* we have initialized the kernel since the @@ -195,7 +198,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // misconfigured process will cause an error, and we want the control // server up before that so that we don't time out trying to connect to // it. - ctrl, err := newController(controllerFD, k) + ctrl, err := newController(controllerFD, k, watchdog) if err != nil { return nil, fmt.Errorf("error creating control server: %v", err) } @@ -254,7 +257,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // the emulated kernel. stopSignalForwarding := sighandling.StartForwarding(k) - watchdog := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning) return &Loader{ k: k, ctrl: ctrl, -- cgit v1.2.3 From 79fef54eb1b9e941e2c910f90b65f3cfe94e18c4 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Wed, 6 Jun 2018 15:52:29 -0700 Subject: Add support for rpcinet ioctl(2). 
This change will add support for ioctls that have previously been supported by netstack. LINE_LENGTH_IGNORE PiperOrigin-RevId: 199544114 Change-Id: I3769202c19502c3b7d05e06ea9552acfd9255893 --- pkg/sentry/socket/rpcinet/socket.go | 63 ++++++++++++++++++++++++++++- pkg/sentry/socket/rpcinet/syscall_rpc.proto | 14 +++---- 2 files changed, 68 insertions(+), 9 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 74cb84927..3356f7804 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -56,6 +56,10 @@ type socketOperations struct { // Verify that we actually implement socket.Socket. var _ = socket.Socket(&socketOperations{}) +const ( + sizeOfIfReq = 40 +) + // New creates a new RPC socket. func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, protocol int) (*fs.File, *syserr.Error) { id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */) @@ -290,7 +294,11 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, return 0, nil, 0, syserr.FromError(err) } - return fd, payload.Address.Address, payload.Address.Length, nil + if peerRequested { + return fd, payload.Address.Address, payload.Address.Length, nil + } + + return fd, nil, 0, nil } // Bind implements socket.Socket.Bind. 
@@ -385,9 +393,60 @@ func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *sy return addr.Address, addr.Length, nil } +func rpcIoctl(t *kernel.Task, fd, cmd uint32, arg []byte) ([]byte, error) { + stack := t.NetworkContext().(*Stack) + + id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Ioctl{&pb.IOCtlRequest{Fd: fd, Cmd: cmd, Arg: arg}}}, false /* ignoreResult */) + <-c + + res := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Ioctl).Ioctl.Result + if e, ok := res.(*pb.IOCtlResponse_ErrorNumber); ok { + return nil, syscall.Errno(e.ErrorNumber) + } + + return res.(*pb.IOCtlResponse_Value).Value, nil +} + // Ioctl implements fs.FileOperations.Ioctl. func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - return 0, syserror.ENOTTY + t := ctx.(*kernel.Task) + + cmd := uint32(args[1].Int()) + arg := args[2].Pointer() + + var buf []byte + switch cmd { + // The following ioctls take 4 byte argument parameters. + case syscall.TIOCINQ, syscall.TIOCOUTQ: + buf = make([]byte, 4) + // The following ioctls have args which are sizeof(struct ifreq). 
+ case syscall.SIOCGIFINDEX, syscall.SIOCGIFNETMASK, syscall.SIOCGIFHWADDR, syscall.SIOCGIFNAME, syscall.SIOCGIFFLAGS: + buf = make([]byte, sizeOfIfReq) + default: + return 0, syserror.ENOTTY + } + + _, err := io.CopyIn(ctx, arg, buf, usermem.IOOpts{ + AddressSpaceActive: true, + }) + + if err != nil { + return 0, err + } + + v, err := rpcIoctl(t, s.fd, cmd, buf) + if err != nil { + return 0, err + } + + if len(v) != len(buf) { + return 0, syserror.EINVAL + } + + _, err = io.CopyOut(ctx, arg, v, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err } func rpcRecvMsg(t *kernel.Task, req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResponse_ResultPayload, *syserr.Error) { diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto index b845b1bce..996962aae 100644 --- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto +++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto @@ -8,7 +8,7 @@ package syscall_rpc; message SendmsgRequest { uint32 fd = 1; - bytes data = 2; + bytes data = 2 [ctype = CORD]; bytes address = 3; bool more = 4; bool end_of_record = 5; @@ -24,13 +24,13 @@ message SendmsgResponse { message IOCtlRequest { uint32 fd = 1; uint32 cmd = 2; - uint64 arg = 3; + bytes arg = 3; } message IOCtlResponse { oneof result { uint32 error_number = 1; - uint64 value = 2; + bytes value = 2; } } @@ -63,7 +63,7 @@ message ReadRequest { message ReadResponse { oneof result { uint32 error_number = 1; - bytes data = 2; + bytes data = 2 [ctype = CORD]; } } @@ -74,13 +74,13 @@ message ReadFileRequest { message ReadFileResponse { oneof result { uint32 error_number = 1; - bytes data = 2; + bytes data = 2 [ctype = CORD]; } } message WriteRequest { uint32 fd = 1; - bytes data = 2; + bytes data = 2 [ctype = CORD]; } message WriteResponse { @@ -107,7 +107,7 @@ message AddressResponse { message RecvmsgResponse { message ResultPayload { - bytes data = 1; + bytes data = 1 [ctype = CORD]; AddressResponse address = 2; uint32 length = 3; } -- cgit 
v1.2.3 From 1b5062263b4a3ca3dc0271d9e06ad0113197344c Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 6 Jun 2018 21:47:39 -0700 Subject: Add allocator abstraction for page tables. In order to prevent possible garbage collection and reuse of page table pages prior to invalidation, introduce a former allocator abstraction that can ensure entries are held during a single traversal. This also cleans up the abstraction and splits it out of the machine itself. PiperOrigin-RevId: 199581636 Change-Id: I2257d5d7ffd9c36f9b7ecd42f769261baeaf115c --- pkg/sentry/platform/kvm/BUILD | 1 + pkg/sentry/platform/kvm/address_space.go | 5 +- pkg/sentry/platform/kvm/allocator.go | 69 +++++++++++++ pkg/sentry/platform/kvm/bluepill_fault.go | 4 +- pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/machine.go | 6 +- pkg/sentry/platform/kvm/machine_unsafe.go | 12 --- pkg/sentry/platform/kvm/physical_map.go | 14 +-- pkg/sentry/platform/ring0/pagetables/BUILD | 3 +- pkg/sentry/platform/ring0/pagetables/allocator.go | 109 +++++++++++++++++++++ .../platform/ring0/pagetables/allocator_unsafe.go | 53 ++++++++++ pkg/sentry/platform/ring0/pagetables/pagetables.go | 89 +++-------------- .../platform/ring0/pagetables/pagetables_amd64.go | 56 +++++++---- .../ring0/pagetables/pagetables_amd64_test.go | 8 +- .../platform/ring0/pagetables/pagetables_test.go | 21 ++-- .../platform/ring0/pagetables/pagetables_unsafe.go | 31 ------ .../platform/ring0/pagetables/pagetables_x86.go | 15 ++- 17 files changed, 326 insertions(+), 172 deletions(-) create mode 100644 pkg/sentry/platform/kvm/allocator.go create mode 100644 pkg/sentry/platform/ring0/pagetables/allocator.go create mode 100644 pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go delete mode 100644 pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 004938080..89d98c5c7 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ 
b/pkg/sentry/platform/kvm/BUILD @@ -28,6 +28,7 @@ go_library( srcs = [ "address_space.go", "address_space_unsafe.go", + "allocator.go", "bluepill.go", "bluepill_amd64.go", "bluepill_amd64.s", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 4c76883ad..15d45f5bc 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -84,7 +84,7 @@ func (as *addressSpace) Touch(c *vCPU) bool { func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { for m.length > 0 { - physical, length, ok := TranslateToPhysical(m.addr) + physical, length, ok := translateToPhysical(m.addr) if !ok { panic("unable to translate segment") } @@ -227,4 +227,7 @@ func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { func (as *addressSpace) Release() { as.Unmap(0, ^uint64(0)) as.pageTables.Release() + + // Free all pages from the allocator. + as.pageTables.Allocator.(allocator).base.Drain() } diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go new file mode 100644 index 000000000..80066bfc5 --- /dev/null +++ b/pkg/sentry/platform/kvm/allocator.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kvm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +type allocator struct { + base *pagetables.RuntimeAllocator +} + +// newAllocator is used to define the allocator. +func newAllocator() allocator { + return allocator{ + base: pagetables.NewRuntimeAllocator(), + } +} + +// NewPTEs implements pagetables.Allocator.NewPTEs. +// +//go:nosplit +func (a allocator) NewPTEs() *pagetables.PTEs { + return a.base.NewPTEs() +} + +// PhysicalFor returns the physical address for a set of PTEs. +// +//go:nosplit +func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr { + virtual := a.base.PhysicalFor(ptes) + physical, _, ok := translateToPhysical(virtual) + if !ok { + panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) + } + return physical +} + +// LookupPTEs implements pagetables.Allocator.LookupPTEs. +// +//go:nosplit +func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs { + virtualStart, physicalStart, _, ok := calculateBluepillFault(physical) + if !ok { + panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) + } + return a.base.LookupPTEs(virtualStart + (physical - physicalStart)) +} + +// FreePTEs implements pagetables.Allocator.FreePTEs. +// +//go:nosplit +func (a allocator) FreePTEs(ptes *pagetables.PTEs) { + a.base.FreePTEs(ptes) +} diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 7c8c7bc37..8650cd78f 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -46,7 +46,7 @@ func yield() { // calculateBluepillFault calculates the fault address range. 
// //go:nosplit -func calculateBluepillFault(m *machine, physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) { +func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) { alignedPhysical := physical &^ uintptr(usermem.PageSize-1) for _, pr := range physicalRegions { end := pr.physical + pr.length @@ -82,7 +82,7 @@ func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) { // fault. This all has to be done in this function because we're in a // signal handler context. (We can't call any functions that might // split the stack.) - virtualStart, physicalStart, length, ok := calculateBluepillFault(m, physical) + virtualStart, physicalStart, length, ok := calculateBluepillFault(physical) if !ok { return 0, false } diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 6defb1c46..13c363993 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -121,7 +121,7 @@ func (*KVM) MaxUserAddress() usermem.Addr { // NewAddressSpace returns a new pagetable root. func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { // Allocate page tables and install system mappings. - pageTables := k.machine.kernel.PageTables.New() + pageTables := k.machine.kernel.PageTables.New(newAllocator()) applyPhysicalRegions(func(pr physicalRegion) bool { // Map the kernel in the upper half. pageTables.Map( diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 5a6109ced..949abd838 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -133,7 +133,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) { vCPUs = n } m.kernel = ring0.New(ring0.KernelOpts{ - PageTables: pagetables.New(m, pagetablesOpts), + PageTables: pagetables.New(newAllocator(), pagetablesOpts), }) // Initialize architecture state. 
@@ -211,7 +211,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) { return // skip region. } for virtual := vr.virtual; virtual < vr.virtual+vr.length; { - physical, length, ok := TranslateToPhysical(virtual) + physical, length, ok := translateToPhysical(virtual) if !ok { // This must be an invalid region that was // knocked out by creation of the physical map. @@ -239,7 +239,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) { // This panics on error. func (m *machine) mapPhysical(physical, length uintptr) { for end := physical + length; physical < end; { - _, physicalStart, length, ok := calculateBluepillFault(m, physical) + _, physicalStart, length, ok := calculateBluepillFault(physical) if !ok { // Should never happen. panic("mapPhysical on unknown physical address") diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 516098a2b..86323c891 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -21,7 +21,6 @@ import ( "unsafe" "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" ) //go:linkname entersyscall runtime.entersyscall @@ -30,17 +29,6 @@ func entersyscall() //go:linkname exitsyscall runtime.exitsyscall func exitsyscall() -// TranslateToVirtual implements pagetables.Translater.TranslateToPhysical. -func (m *machine) TranslateToPhysical(ptes *pagetables.PTEs) uintptr { - // The length doesn't matter because all these translations require - // only a single page, which is guaranteed to be satisfied. - physical, _, ok := TranslateToPhysical(uintptr(unsafe.Pointer(ptes))) - if !ok { - panic("unable to translate pagetables.Node to physical address") - } - return physical -} - // mapRunData maps the vCPU run data. 
func mapRunData(fd int) (*runData, error) { r, _, errno := syscall.RawSyscall6( diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 5d55c9486..81a98656d 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -205,17 +205,19 @@ func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool { return true } -// TranslateToPhysical translates the given virtual address. +// translateToPhysical translates the given virtual address. // // Precondition: physicalInit must have been called. -func TranslateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) { - ok = !applyPhysicalRegions(func(pr physicalRegion) bool { +// +//go:nosplit +func translateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) { + for _, pr := range physicalRegions { if pr.virtual <= virtual && virtual < pr.virtual+pr.length { physical = pr.physical + (virtual - pr.virtual) length = pr.length - (virtual - pr.virtual) - return false + ok = true + return } - return true - }) + } return } diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 1a8b7931e..768f96678 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -5,9 +5,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "pagetables", srcs = [ + "allocator.go", + "allocator_unsafe.go", "pagetables.go", "pagetables_amd64.go", - "pagetables_unsafe.go", "pagetables_x86.go", "pcids_x86.go", ], diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go new file mode 100644 index 000000000..1499623fb --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -0,0 +1,109 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +// Allocator is used to allocate and map PTEs. +// +// Note that allocators may be called concurrently. +type Allocator interface { + // NewPTEs returns a new set of PTEs and their physical address. + NewPTEs() *PTEs + + // PhysicalFor gives the physical address for a set of PTEs. + PhysicalFor(ptes *PTEs) uintptr + + // LookupPTEs looks up PTEs by physical address. + LookupPTEs(physical uintptr) *PTEs + + // FreePTEs frees a set of PTEs. + FreePTEs(ptes *PTEs) +} + +// RuntimeAllocator is a trivial allocator. +type RuntimeAllocator struct { + // used is the set of PTEs that have been allocated. This includes any + // PTEs that may be in the pool below. PTEs are only freed from this + // map by the Drain call. + // + // This exists to prevent accidental garbage collection. + used map[*PTEs]struct{} + + // pool is the set of free-to-use PTEs. + pool []*PTEs +} + +// NewRuntimeAllocator returns an allocator that uses runtime allocation. +func NewRuntimeAllocator() *RuntimeAllocator { + return &RuntimeAllocator{ + used: make(map[*PTEs]struct{}), + } +} + +// Drain empties the pool. +func (r *RuntimeAllocator) Drain() { + for i, ptes := range r.pool { + // Zap the entry in the underlying array to ensure that it can + // be properly garbage collected. + r.pool[i] = nil + // Similarly, free the reference held by the used map (these + // also apply for the pool entries). 
+ delete(r.used, ptes) + } + r.pool = r.pool[:0] +} + +// NewPTEs implements Allocator.NewPTEs. +// +// Note that the "physical" address here is actually the virtual address of the +// PTEs structure. The entries are tracked only to avoid garbage collection. +// +// This is guaranteed not to split as long as the pool is sufficiently full. +// +//go:nosplit +func (r *RuntimeAllocator) NewPTEs() *PTEs { + // Pull from the pool if we can. + if len(r.pool) > 0 { + ptes := r.pool[len(r.pool)-1] + r.pool = r.pool[:len(r.pool)-1] + return ptes + } + + // Allocate a new entry. + ptes := newAlignedPTEs() + r.used[ptes] = struct{}{} + return ptes +} + +// PhysicalFor returns the physical address for the given PTEs. +// +//go:nosplit +func (r *RuntimeAllocator) PhysicalFor(ptes *PTEs) uintptr { + return physicalFor(ptes) +} + +// LookupPTEs implements Allocator.LookupPTEs. +// +//go:nosplit +func (r *RuntimeAllocator) LookupPTEs(physical uintptr) *PTEs { + return fromPhysical(physical) +} + +// FreePTEs implements Allocator.FreePTEs. +// +//go:nosplit +func (r *RuntimeAllocator) FreePTEs(ptes *PTEs) { + // Add to the pool. + r.pool = append(r.pool, ptes) +} diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go new file mode 100644 index 000000000..aca778913 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -0,0 +1,53 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// newAlignedPTEs returns a set of aligned PTEs. +func newAlignedPTEs() *PTEs { + ptes := new(PTEs) + offset := physicalFor(ptes) & (usermem.PageSize - 1) + if offset == 0 { + // Already aligned. + return ptes + } + + // Need to force an aligned allocation. + unaligned := make([]byte, (2*usermem.PageSize)-1) + offset = uintptr(unsafe.Pointer(&unaligned[0])) & (usermem.PageSize - 1) + if offset != 0 { + offset = usermem.PageSize - offset + } + return (*PTEs)(unsafe.Pointer(&unaligned[offset])) +} + +// physicalFor returns the "physical" address for PTEs. +// +//go:nosplit +func physicalFor(ptes *PTEs) uintptr { + return uintptr(unsafe.Pointer(ptes)) +} + +// fromPhysical returns the PTEs from the "physical" address. +// +//go:nosplit +func fromPhysical(physical uintptr) *PTEs { + return (*PTEs)(unsafe.Pointer(physical)) +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 2a83bbff2..929771cca 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -19,52 +19,28 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// Node is a single node within a set of page tables. -type Node struct { - // unalignedData has unaligned data. Unfortunately, we can't really - // rely on the allocator to give us what we want here. So we just throw - // it at the wall and use the portion that matches. Gross. This may be - // changed in the future to use a different allocation mechanism. - // - // Access must happen via functions found in pagetables_unsafe.go. - unalignedData [(2 * usermem.PageSize) - 1]byte - - // physical is the translated address of these entries. - // - // This is filled in at creation time. 
- physical uintptr -} - // PageTables is a set of page tables. type PageTables struct { + // Allocator is used to allocate nodes. + Allocator Allocator + // root is the pagetable root. - root *Node + root *PTEs - // translator is the translator passed at creation. - translator Translator + // rootPhysical is the cached physical address of the root. + // + // This is saved only to prevent constant translation. + rootPhysical uintptr // archPageTables includes architecture-specific features. archPageTables - - // allNodes is a set of nodes indexed by translator address. - allNodes map[uintptr]*Node -} - -// Translator translates to guest physical addresses. -type Translator interface { - // TranslateToPhysical translates the given pointer object into a - // "physical" address. We do not require that it translates back, the - // reverse mapping is maintained internally. - TranslateToPhysical(*PTEs) uintptr } // New returns new PageTables. -func New(t Translator, opts Opts) *PageTables { - p := &PageTables{ - translator: t, - allNodes: make(map[uintptr]*Node), - } - p.root = p.allocNode() +func New(a Allocator, opts Opts) *PageTables { + p := &PageTables{Allocator: a} + p.root = p.Allocator.NewPTEs() + p.rootPhysical = p.Allocator.PhysicalFor(p.root) p.init(opts) return p } @@ -74,40 +50,14 @@ func New(t Translator, opts Opts) *PageTables { // This function should always be preferred to New if there are existing // pagetables, as this function preserves architectural constraints relevant to // managing multiple sets of pagetables. -func (p *PageTables) New() *PageTables { - np := &PageTables{ - translator: p.translator, - allNodes: make(map[uintptr]*Node), - } - np.root = np.allocNode() +func (p *PageTables) New(a Allocator) *PageTables { + np := &PageTables{Allocator: a} + np.root = np.Allocator.NewPTEs() + np.rootPhysical = p.Allocator.PhysicalFor(np.root) np.initFrom(&p.archPageTables) return np } -// setPageTable sets the given index as a page table. 
-func (p *PageTables) setPageTable(n *Node, index int, child *Node) { - phys := p.translator.TranslateToPhysical(child.PTEs()) - p.allNodes[phys] = child - pte := &n.PTEs()[index] - pte.setPageTable(phys) -} - -// clearPageTable clears the given entry. -func (p *PageTables) clearPageTable(n *Node, index int) { - pte := &n.PTEs()[index] - physical := pte.Address() - pte.Clear() - delete(p.allNodes, physical) -} - -// getPageTable returns the page table entry. -func (p *PageTables) getPageTable(n *Node, index int) *Node { - pte := &n.PTEs()[index] - physical := pte.Address() - child := p.allNodes[physical] - return child -} - // Map installs a mapping with the given physical address. // // True is returned iff there was a previous mapping in the range. @@ -172,10 +122,3 @@ func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) }) return } - -// allocNode allocates a new page. -func (p *PageTables) allocNode() *Node { - n := new(Node) - n.physical = p.translator.TranslateToPhysical(n.PTEs()) - return n -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index 8dc50f9dd..6a724e4fd 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -121,7 +121,10 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun } for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { - pgdEntry := &p.root.PTEs()[pgdIndex] + var ( + pgdEntry = &p.root[pgdIndex] + pudEntries *PTEs + ) if !pgdEntry.Valid() { if !alloc { // Skip over this entry. @@ -130,15 +133,20 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun } // Allocate a new pgd. 
- p.setPageTable(p.root, pgdIndex, p.allocNode()) + pudEntries = p.Allocator.NewPTEs() + pgdEntry.setPageTable(p, pudEntries) + } else { + pudEntries = p.Allocator.LookupPTEs(pgdEntry.Address()) } // Map the next level. - pudNode := p.getPageTable(p.root, pgdIndex) clearPUDEntries := 0 for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { - pudEntry := &(pudNode.PTEs()[pudIndex]) + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) if !pudEntry.Valid() { if !alloc { // Skip over this entry. @@ -161,7 +169,8 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun } // Allocate a new pud. - p.setPageTable(pudNode, pudIndex, p.allocNode()) + pmdEntries = p.Allocator.NewPTEs() + pudEntry.setPageTable(p, pmdEntries) } else if pudEntry.IsSuper() { // Does this page need to be split? @@ -169,8 +178,7 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun currentAddr := uint64(pudEntry.Address()) // Install the relevant entries. - pmdNode := p.allocNode() - pmdEntries := pmdNode.PTEs() + pmdEntries = p.Allocator.NewPTEs() for index := 0; index < entriesPerPage; index++ { pmdEntry := &pmdEntries[index] pmdEntry.SetSuper() @@ -179,7 +187,7 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun } // Reset to point to the new page. - p.setPageTable(pudNode, pudIndex, pmdNode) + pudEntry.setPageTable(p, pmdEntries) } else { // A super page to be checked directly. fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) @@ -193,14 +201,18 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun start = next(start, pudSize) continue } + } else { + pmdEntries = p.Allocator.LookupPTEs(pudEntry.Address()) } // Map the next level, since this is valid. 
- pmdNode := p.getPageTable(pudNode, pudIndex) clearPMDEntries := 0 for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { - pmdEntry := &pmdNode.PTEs()[pmdIndex] + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) if !pmdEntry.Valid() { if !alloc { // Skip over this entry. @@ -222,7 +234,8 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun } // Allocate a new pmd. - p.setPageTable(pmdNode, pmdIndex, p.allocNode()) + pteEntries = p.Allocator.NewPTEs() + pmdEntry.setPageTable(p, pteEntries) } else if pmdEntry.IsSuper() { // Does this page need to be split? @@ -230,8 +243,7 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun currentAddr := uint64(pmdEntry.Address()) // Install the relevant entries. - pteNode := p.allocNode() - pteEntries := pteNode.PTEs() + pteEntries = p.Allocator.NewPTEs() for index := 0; index < entriesPerPage; index++ { pteEntry := &pteEntries[index] pteEntry.Set(uintptr(currentAddr), pmdEntry.Opts()) @@ -239,7 +251,7 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun } // Reset to point to the new page. - p.setPageTable(pmdNode, pmdIndex, pteNode) + pmdEntry.setPageTable(p, pteEntries) } else { // A huge page to be checked directly. fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) @@ -253,14 +265,17 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun start = next(start, pmdSize) continue } + } else { + pteEntries = p.Allocator.LookupPTEs(pmdEntry.Address()) } // Map the next level, since this is valid. 
- pteNode := p.getPageTable(pmdNode, pmdIndex) clearPTEEntries := 0 for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { - pteEntry := &pteNode.PTEs()[pteIndex] + var ( + pteEntry = &pteEntries[pteIndex] + ) if !pteEntry.Valid() && !alloc { clearPTEEntries++ start += pteSize @@ -283,21 +298,24 @@ func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn fun // Check if we no longer need this page. if clearPTEEntries == entriesPerPage { - p.clearPageTable(pmdNode, pmdIndex) + pmdEntry.Clear() + p.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } // Check if we no longer need this page. if clearPMDEntries == entriesPerPage { - p.clearPageTable(pudNode, pudIndex) + pudEntry.Clear() + p.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } // Check if we no longer need this page. if clearPUDEntries == entriesPerPage { - p.clearPageTable(p.root, pgdIndex) + pgdEntry.Clear() + p.Allocator.FreePTEs(pudEntries) } } } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go index 4f15c6b58..c81786133 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -23,7 +23,7 @@ import ( ) func Test2MAnd4K(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map a small page and a huge page. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -37,7 +37,7 @@ func Test2MAnd4K(t *testing.T) { } func Test1GAnd4K(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map a small page and a super page. 
pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -51,7 +51,7 @@ func Test1GAnd4K(t *testing.T) { } func TestSplit1GPage(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map a super page and knock out the middle. pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: usermem.Read}, pudSize*42) @@ -65,7 +65,7 @@ func TestSplit1GPage(t *testing.T) { } func TestSplit2MPage(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map a huge page and knock out the middle. pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: usermem.Read}, pmdSize*42) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index a4f684af2..dec8def7f 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -15,18 +15,11 @@ package pagetables import ( - "reflect" "testing" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -type reflectTranslater struct{} - -func (r reflectTranslater) TranslateToPhysical(ptes *PTEs) uintptr { - return reflect.ValueOf(ptes).Pointer() -} - type mapping struct { start uintptr length uintptr @@ -80,12 +73,12 @@ func checkMappings(t *testing.T, pt *PageTables, m []mapping) { } func TestAllocFree(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) pt.Release() } func TestUnmap(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map and unmap one entry. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -96,7 +89,7 @@ func TestUnmap(t *testing.T) { } func TestReadOnly(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map one entry. 
pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) @@ -108,7 +101,7 @@ func TestReadOnly(t *testing.T) { } func TestReadWrite(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map one entry. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -120,7 +113,7 @@ func TestReadWrite(t *testing.T) { } func TestSerialEntries(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map two sequential entries. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -134,7 +127,7 @@ func TestSerialEntries(t *testing.T) { } func TestSpanningEntries(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Span a pgd with two pages. pt.Map(0x00007efffffff000, 2*pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) @@ -147,7 +140,7 @@ func TestSpanningEntries(t *testing.T) { } func TestSparseEntries(t *testing.T) { - pt := New(reflectTranslater{}, Opts{}) + pt := New(NewRuntimeAllocator(), Opts{}) // Map two entries in different pgds. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go deleted file mode 100644 index a2b44fb79..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pagetables - -import ( - "unsafe" - - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -// PTEs returns aligned PTE entries. -func (n *Node) PTEs() *PTEs { - addr := uintptr(unsafe.Pointer(&n.unalignedData[0])) - offset := addr & (usermem.PageSize - 1) - if offset != 0 { - offset = usermem.PageSize - offset - } - return (*PTEs)(unsafe.Pointer(addr + offset)) -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index 8ba78ed0d..72a955d08 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -70,9 +70,9 @@ func (p *PageTables) CR3() uint64 { // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1). const noFlushBit uint64 = 0x8000000000000000 if p.pcid != 0 { - return noFlushBit | uint64(p.root.physical) | uint64(p.pcid) + return noFlushBit | uint64(p.rootPhysical) | uint64(p.pcid) } - return uint64(p.root.physical) + return uint64(p.rootPhysical) } // FlushCR3 returns the CR3 value that flushes the TLB. @@ -81,7 +81,7 @@ func (p *PageTables) CR3() uint64 { // //go:nosplit func (p *PageTables) FlushCR3() uint64 { - return uint64(p.root.physical) | uint64(p.pcid) + return uint64(p.rootPhysical) | uint64(p.pcid) } // Bits in page table entries. @@ -200,8 +200,13 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) { // be cleared. This is used explicitly for breaking super pages. 
// //go:nosplit -func (p *PTE) setPageTable(addr uintptr) { - v := (addr &^ optionMask) | present | user | writable | accessed | dirty +func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { + addr := pt.Allocator.PhysicalFor(ptes) + if addr&^optionMask != addr { + // This should never happen. + panic("unaligned physical address!") + } + v := addr | present | user | writable | accessed | dirty atomic.StoreUintptr((*uintptr)(p), v) } -- cgit v1.2.3 From 3374849cb553fab16e69d39cf6e49f843d94790b Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 6 Jun 2018 22:51:58 -0700 Subject: Split PCID implementation from page tables. Instead of associating a single PCID with each set of page tables (which will reach the maximum quickly), allow a dynamic pool for each vCPU. This is the same way that Linux operates. We also split management of PCIDs out of the page tables themselves for simplicity. PiperOrigin-RevId: 199585631 Change-Id: I42f3486ada3cb2a26f623c65ac279b473ae63201 --- pkg/sentry/platform/kvm/address_space.go | 4 +- pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 6 -- pkg/sentry/platform/kvm/kvm_test.go | 5 - pkg/sentry/platform/kvm/machine.go | 10 +- pkg/sentry/platform/kvm/machine_amd64.go | 53 ++++++++++- pkg/sentry/platform/ring0/defs.go | 3 + pkg/sentry/platform/ring0/defs_amd64.go | 15 +++ pkg/sentry/platform/ring0/kernel_amd64.go | 13 +-- pkg/sentry/platform/ring0/pagetables/BUILD | 1 - pkg/sentry/platform/ring0/pagetables/pagetables.go | 25 +---- .../ring0/pagetables/pagetables_amd64_test.go | 12 +-- .../platform/ring0/pagetables/pagetables_test.go | 23 ++--- .../platform/ring0/pagetables/pagetables_x86.go | 60 +++--------- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 102 ++++++++++++--------- .../platform/ring0/pagetables/pcids_x86_test.go | 65 ------------- 16 files changed, 162 insertions(+), 237 deletions(-) delete mode 100644 pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go (limited to 'pkg/sentry') 
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 15d45f5bc..c2f4559a0 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -226,8 +226,10 @@ func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { // Release releases the page tables. func (as *addressSpace) Release() { as.Unmap(0, ^uint64(0)) - as.pageTables.Release() // Free all pages from the allocator. as.pageTables.Allocator.(allocator).base.Drain() + + // Drop all cached machine references. + as.machine.dropPageTables(as.pageTables) } diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 13c363993..1a8e16ca0 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -121,7 +121,7 @@ func (*KVM) MaxUserAddress() usermem.Addr { // NewAddressSpace returns a new pagetable root. func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { // Allocate page tables and install system mappings. - pageTables := k.machine.kernel.PageTables.New(newAllocator()) + pageTables := pagetables.New(newAllocator()) applyPhysicalRegions(func(pr physicalRegion) bool { // Map the kernel in the upper half. pageTables.Map( diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index 834e6b96d..476e783a0 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -20,14 +20,11 @@ import ( "fmt" "syscall" "unsafe" - - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" ) var ( runDataSize int hasGuestPCID bool - pagetablesOpts pagetables.Opts cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES} ) @@ -75,9 +72,6 @@ func updateSystemValues(fd int) error { } } - // Set the pagetables to use PCID if it's available. - pagetablesOpts.EnablePCID = hasGuestPCID - // Success. 
return nil } diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 00919b214..71c5c856e 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -121,11 +121,6 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func pt *pagetables.PageTables ) testutil.SetTestTarget(®s, target) - defer func() { - if pt != nil { - pt.Release() - } - }() kvmTest(t, func(k *KVM) { // Create new page tables. diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 949abd838..3c1e01241 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -112,6 +112,9 @@ type vCPU struct { // active is the current addressSpace: this is set and read atomically, // it is used to elide unnecessary interrupts due to invalidations. active atomicAddressSpace + + // vCPUArchState is the architecture-specific state. + vCPUArchState } // newMachine returns a new VM context. @@ -133,7 +136,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) { vCPUs = n } m.kernel = ring0.New(ring0.KernelOpts{ - PageTables: pagetables.New(newAllocator(), pagetablesOpts), + PageTables: pagetables.New(newAllocator()), }) // Initialize architecture state. @@ -285,11 +288,6 @@ func (m *machine) Destroy() { } } - // Release host mappings. - if m.kernel.PageTables != nil { - m.kernel.PageTables.Release() - } - // vCPUs are gone: teardown machine state. 
if err := syscall.Close(m.fd); err != nil { panic(fmt.Sprintf("error closing VM fd: %v", err)) diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index ba7bbcb91..6afae5cae 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -24,6 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -41,6 +42,38 @@ func (m *machine) initArchState(vCPUs int) error { return nil } +type vCPUArchState struct { + // PCIDs is the set of PCIDs for this vCPU. + // + // This starts above fixedKernelPCID. + PCIDs *pagetables.PCIDs +} + +const ( + // fixedKernelPCID is a fixed kernel PCID used for the kernel page + // tables. We must start allocating user PCIDs above this in order to + // avoid any conflict (see below). + fixedKernelPCID = 1 + + // poolPCIDs is the number of PCIDs to record in the database. As this + // grows, assignment can take longer, since it is a simple linear scan. + // Beyond a relatively small number, there are likely few perform + // benefits, since the TLB has likely long since lost any translations + // from more than a few PCIDs past. + poolPCIDs = 8 +) + +// dropPageTables drops cached page table entries. +func (m *machine) dropPageTables(pt *pagetables.PageTables) { + m.mu.Lock() + defer m.mu.Unlock() + + // Clear from all PCIDs. + for _, c := range m.vCPUs { + c.PCIDs.Drop(pt) + } +} + // initArchState initializes architecture-specific state. func (c *vCPU) initArchState() error { var ( @@ -67,8 +100,16 @@ func (c *vCPU) initArchState() error { kernelSystemRegs.TR.base = tssBase kernelSystemRegs.TR.limit = uint32(tssLimit) - // Point to kernel page tables. 
- kernelSystemRegs.CR3 = c.machine.kernel.PageTables.FlushCR3() + // Point to kernel page tables, with no initial PCID. + kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0) + + // Initialize the PCID database. + if hasGuestPCID { + // Note that NewPCIDs may return a nil table here, in which + // case we simply don't use PCID support (see below). In + // practice, this should not happen, however. + c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs) + } // Set the CPUID; this is required before setting system registers, // since KVM will reject several CR4 bits if the CPUID does not @@ -121,6 +162,14 @@ func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, usermem.AccessType, error) { + // Assign PCIDs. + if c.PCIDs != nil { + var requireFlushPCID bool // Force a flush? + switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables) + switchOpts.KernelPCID = fixedKernelPCID + switchOpts.Flush = switchOpts.Flush || requireFlushPCID + } + // See below. var vector ring0.Vector diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 7b3bed1c7..f09d045eb 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -109,4 +109,7 @@ type SwitchOpts struct { // FullRestore indicates that an iret-based restore should be used. FullRestore bool + + // SwitchArchOpts are architecture-specific options. + SwitchArchOpts } diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index bb3420125..0d068c00a 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -104,6 +104,21 @@ func (c *CPU) ErrorCode() (value uintptr, user bool) { return c.errorCode, c.errorType != 0 } +// SwitchArchOpts are embedded in SwitchOpts. 
+type SwitchArchOpts struct { + // UserPCID indicates that the application PCID to be used on switch, + // assuming that PCIDs are supported. + // + // Per pagetables_x86.go, a zero PCID implies a flush. + UserPCID uint16 + + // KernelPCID indicates that the kernel PCID to be used on return, + // assuming that PCIDs are supported. + // + // Per pagetables_x86.go, a zero PCID implies a flush. + KernelPCID uint16 +} + func init() { KernelCodeSegment.setCode64(0, 0, 0) KernelDataSegment.setData(0, 0xffffffff, 0) diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 58ac4b4b2..37d5484e1 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -180,23 +180,14 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { if !IsCanonical(regs.Rip) || !IsCanonical(regs.Rsp) || !IsCanonical(regs.Fs_base) || !IsCanonical(regs.Gs_base) { return GeneralProtectionFault } - - var ( - userCR3 uint64 - kernelCR3 uint64 - ) + userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) + kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID) // Sanitize registers. - if switchOpts.Flush { - userCR3 = switchOpts.PageTables.FlushCR3() - } else { - userCR3 = switchOpts.PageTables.CR3() - } regs.Eflags &= ^uint64(UserFlagsClear) regs.Eflags |= UserFlagsSet regs.Cs = uint64(Ucode64) // Required for iret. regs.Ss = uint64(Udata) // Ditto. - kernelCR3 = c.kernel.PageTables.CR3() // Perform the switch. swapgs() // GS will be swapped on return. 
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 768f96678..08b73e87d 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -26,7 +26,6 @@ go_test( srcs = [ "pagetables_amd64_test.go", "pagetables_test.go", - "pcids_x86_test.go", ], embed = [":pagetables"], deps = ["//pkg/sentry/usermem"], diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 929771cca..6963ba62d 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -37,27 +37,13 @@ type PageTables struct { } // New returns new PageTables. -func New(a Allocator, opts Opts) *PageTables { +func New(a Allocator) *PageTables { p := &PageTables{Allocator: a} p.root = p.Allocator.NewPTEs() p.rootPhysical = p.Allocator.PhysicalFor(p.root) - p.init(opts) return p } -// New returns a new set of PageTables derived from the given one. -// -// This function should always be preferred to New if there are existing -// pagetables, as this function preserves architectural constraints relevant to -// managing multiple sets of pagetables. -func (p *PageTables) New(a Allocator) *PageTables { - np := &PageTables{Allocator: a} - np.root = np.Allocator.NewPTEs() - np.rootPhysical = p.Allocator.PhysicalFor(np.root) - np.initFrom(&p.archPageTables) - return np -} - // Map installs a mapping with the given physical address. // // True is returned iff there was a previous mapping in the range. @@ -99,15 +85,6 @@ func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { return count > 0 } -// Release releases this address space. -// -// This must be called to release the PCID. -func (p *PageTables) Release() { - // Clear all pages. - p.Unmap(0, ^uintptr(0)) - p.release() -} - // Lookup returns the physical address for the given virtual address. 
func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) { mask := uintptr(usermem.PageSize - 1) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go index c81786133..a7f2ad9a4 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -23,7 +23,7 @@ import ( ) func Test2MAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map a small page and a huge page. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -33,11 +33,10 @@ func Test2MAnd4K(t *testing.T) { {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, {0x00007f0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read}}, }) - pt.Release() } func Test1GAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map a small page and a super page. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -47,11 +46,10 @@ func Test1GAnd4K(t *testing.T) { {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, {0x00007f0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read}}, }) - pt.Release() } func TestSplit1GPage(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map a super page and knock out the middle. 
pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: usermem.Read}, pudSize*42) @@ -61,11 +59,10 @@ func TestSplit1GPage(t *testing.T) { {0x00007f0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read}}, {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read}}, }) - pt.Release() } func TestSplit2MPage(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map a huge page and knock out the middle. pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: usermem.Read}, pmdSize*42) @@ -75,5 +72,4 @@ func TestSplit2MPage(t *testing.T) { {0x00007f0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read}}, {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read}}, }) - pt.Release() } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index dec8def7f..28178f656 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -72,24 +72,18 @@ func checkMappings(t *testing.T, pt *PageTables, m []mapping) { } } -func TestAllocFree(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) - pt.Release() -} - func TestUnmap(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map and unmap one entry. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) pt.Unmap(0x400000, pteSize) checkMappings(t, pt, nil) - pt.Release() } func TestReadOnly(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map one entry. 
pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) @@ -97,11 +91,10 @@ func TestReadOnly(t *testing.T) { checkMappings(t, pt, []mapping{ {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.Read}}, }) - pt.Release() } func TestReadWrite(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map one entry. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -109,11 +102,10 @@ func TestReadWrite(t *testing.T) { checkMappings(t, pt, []mapping{ {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, }) - pt.Release() } func TestSerialEntries(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map two sequential entries. pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -123,11 +115,10 @@ func TestSerialEntries(t *testing.T) { {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, {0x401000, pteSize, pteSize * 47, MapOpts{AccessType: usermem.ReadWrite}}, }) - pt.Release() } func TestSpanningEntries(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Span a pgd with two pages. pt.Map(0x00007efffffff000, 2*pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) @@ -136,11 +127,10 @@ func TestSpanningEntries(t *testing.T) { {0x00007efffffff000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.Read}}, {0x00007f0000000000, pteSize, pteSize * 43, MapOpts{AccessType: usermem.Read}}, }) - pt.Release() } func TestSparseEntries(t *testing.T) { - pt := New(NewRuntimeAllocator(), Opts{}) + pt := New(NewRuntimeAllocator()) // Map two entries in different pgds. 
pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) @@ -150,5 +140,4 @@ func TestSparseEntries(t *testing.T) { {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, {0x00007f0000000000, pteSize, pteSize * 47, MapOpts{AccessType: usermem.Read}}, }) - pt.Release() } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index 72a955d08..ca49d20f8 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -22,66 +22,28 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// Opts are pagetable options. -type Opts struct { - EnablePCID bool -} - -// archPageTables has x86-specific features. +// archPageTables is architecture-specific data. type archPageTables struct { - // pcids is the PCID database. - pcids *PCIDs - - // pcid is the globally unique identifier, or zero if none were - // available or pcids is nil. + // pcid is the value assigned by PCIDs.Assign. + // + // Note that zero is a valid PCID. pcid uint16 } -// init initializes arch-specific features. -func (a *archPageTables) init(opts Opts) { - if opts.EnablePCID { - a.pcids = NewPCIDs() - a.pcid = a.pcids.allocate() - } -} - -// initFrom initializes arch-specific features from an existing entry.' -func (a *archPageTables) initFrom(other *archPageTables) { - a.pcids = other.pcids // Refer to the same PCID database. - if a.pcids != nil { - a.pcid = a.pcids.allocate() - } -} - -// release is called from Release. -func (a *archPageTables) release() { - // Return the PCID. - if a.pcids != nil { - a.pcids.free(a.pcid) - } -} - // CR3 returns the CR3 value for these tables. // -// This may be called in interrupt contexts. +// This may be called in interrupt contexts. A PCID of zero always implies a +// flush and should be passed when PCIDs are not enabled. See pcids_x86.go for +// more information. 
// //go:nosplit -func (p *PageTables) CR3() uint64 { +func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 { // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1). const noFlushBit uint64 = 0x8000000000000000 - if p.pcid != 0 { - return noFlushBit | uint64(p.rootPhysical) | uint64(p.pcid) + if noFlush && pcid != 0 { + return noFlushBit | uint64(p.rootPhysical) | uint64(pcid) } - return uint64(p.rootPhysical) -} - -// FlushCR3 returns the CR3 value that flushes the TLB. -// -// This may be called in interrupt contexts. -// -//go:nosplit -func (p *PageTables) FlushCR3() uint64 { - return uint64(p.rootPhysical) | uint64(p.pcid) + return uint64(p.rootPhysical) | uint64(pcid) } // Bits in page table entries. diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index 509e8c0d9..4296371e8 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -16,59 +16,79 @@ package pagetables -import ( - "sync" -) - -// maxPCID is the maximum allowed PCID. -const maxPCID = 4095 +// limitPCID is the number of valid PCIDs. +const limitPCID = 4096 // PCIDs is a simple PCID database. +// +// This is not protected by locks and is thus suitable for use only with a +// single CPU at a time. type PCIDs struct { - mu sync.Mutex + // cache are the assigned page tables. + cache map[*PageTables]uint16 - // last is the last fresh PCID given out (not including the available - // pool). If last >= maxPCID, then the only PCIDs available in the - // available pool below. - last uint16 - - // available are PCIDs that have been freed. - available map[uint16]struct{} + // avail are available PCIDs. + avail []uint16 } -// NewPCIDs returns a new PCID set. -func NewPCIDs() *PCIDs { - return &PCIDs{ - available: make(map[uint16]struct{}), +// NewPCIDs returns a new PCID database. +// +// start is the first index to assign. 
Typically this will be one, as the zero +// pcid will always be flushed on transition (see pagetables_x86.go). This may +// be more than one if specific PCIDs are reserved. +// +// Nil is returned iff the start and size are out of range. +func NewPCIDs(start, size uint16) *PCIDs { + if start+uint16(size) >= limitPCID { + return nil // See comment. + } + p := &PCIDs{ + cache: make(map[*PageTables]uint16), } + for pcid := start; pcid < start+size; pcid++ { + p.avail = append(p.avail, pcid) + } + return p } -// allocate returns an unused PCID, or zero if all are taken. -func (p *PCIDs) allocate() uint16 { - p.mu.Lock() - defer p.mu.Unlock() - if len(p.available) > 0 { - for id := range p.available { - delete(p.available, id) - return id - } +// Assign assigns a PCID to the given PageTables. +// +// This may overwrite any previous assignment provided. If this in the case, +// true is returned to indicate that the PCID should be flushed. +func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) { + if pcid, ok := p.cache[pt]; ok { + return pcid, false // No flush. } - if id := p.last + 1; id <= maxPCID { - p.last = id - return id + + // Is there something available? + if len(p.avail) > 0 { + pcid := p.avail[len(p.avail)-1] + p.avail = p.avail[:len(p.avail)-1] + + // We need to flush because while this is in the available + // pool, it may have been used previously. + return pcid, true } - // Nothing available. - return 0 + + // Evict an existing table. + for old, pcid := range p.cache { + delete(p.cache, old) + p.cache[pt] = pcid + + // A flush is definitely required in this case, these page + // tables may still be active. (They will just be assigned some + // other PCID if and when they hit the given CPU again.) + return pcid, true + } + + // No PCID. + return 0, false } -// free returns a PCID to the pool. -// -// It is safe to call free with a zero pcid. That is, you may always call free -// with anything returned by allocate. 
-func (p *PCIDs) free(id uint16) { - p.mu.Lock() - defer p.mu.Unlock() - if id != 0 { - p.available[id] = struct{}{} +// Drop drops references to a set of page tables. +func (p *PCIDs) Drop(pt *PageTables) { + if pcid, ok := p.cache[pt]; ok { + delete(p.cache, pt) + p.avail = append(p.avail, pcid) } } diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go deleted file mode 100644 index 0b555cd76..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build i386 amd64 - -package pagetables - -import ( - "testing" -) - -func TestMaxPCID(t *testing.T) { - p := NewPCIDs() - for i := 0; i < maxPCID; i++ { - if id := p.allocate(); id != uint16(i+1) { - t.Errorf("got %d, expected %d", id, i+1) - } - } - if id := p.allocate(); id != 0 { - if id != 0 { - t.Errorf("got %d, expected 0", id) - } - } -} - -func TestFirstPCID(t *testing.T) { - p := NewPCIDs() - if id := p.allocate(); id != 1 { - t.Errorf("got %d, expected 1", id) - } -} - -func TestFreePCID(t *testing.T) { - p := NewPCIDs() - p.free(0) - if id := p.allocate(); id != 1 { - t.Errorf("got %d, expected 1 (not zero)", id) - } -} - -func TestReusePCID(t *testing.T) { - p := NewPCIDs() - id := p.allocate() - if id != 1 { - t.Errorf("got %d, expected 1", id) - } - p.free(id) - if id := p.allocate(); id != 1 { - t.Errorf("got %d, expected 1", id) - } - if id := p.allocate(); id != 2 { - t.Errorf("got %d, expected 2", id) - } -} -- cgit v1.2.3 From d26984515900a2f88da047ee8a28ba1ca152aa58 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 6 Jun 2018 23:25:26 -0700 Subject: Ensure guest-mode for page table modifications. Because of the KVM shadow page table implementation, modifications made to guest page tables from host mode may not be syncronized correctly, resulting in undefined behavior. This is a KVM bug: page table pages should also be tracked for host modifications and resynced appropriately (e.g. the guest could "DMA" into a page table page in theory). However, since we can't rely on this being fixed everywhere, workaround the issue by forcing page table modifications to be in guest mode. This will generally be the case anyways, but now if an exit occurs during modifications, we will re-enter and perform the modifications again. 
PiperOrigin-RevId: 199587895 Change-Id: I83c20b4cf2a9f9fa56f59f34939601dd34538fb0 --- pkg/sentry/platform/kvm/address_space.go | 24 ++++++++++++++++++------ pkg/sentry/platform/kvm/machine_amd64.go | 23 +++++++++++++++++++++++ pkg/sentry/platform/ring0/defs_amd64.go | 8 ++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index c2f4559a0..f74c98dd0 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -102,11 +102,18 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac // important; if the pagetable mappings were installed before // ensuring the physical pages were available, then some other // thread could theoretically access them. - prev := as.pageTables.Map(addr, length, pagetables.MapOpts{ - AccessType: at, - User: true, - }, physical) - inv = inv || prev + // + // Due to the way KVM's shadow paging implementation works, + // modifications to the page tables while in host mode may not + // be trapped, leading to the shadow pages being out of sync. + // Therefore, we need to ensure that we are in guest mode for + // page table modifications. See the call to bluepill, below. + as.machine.retryInGuest(func() { + inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ + AccessType: at, + User: true, + }, physical) || inv + }) m.addr += length m.length -= length addr += usermem.Addr(length) @@ -214,7 +221,12 @@ func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { as.mu.Lock() defer as.mu.Unlock() - if prev := as.pageTables.Unmap(addr, uintptr(length)); prev { + // See above re: retryInGuest. 
+ var prev bool + as.machine.retryInGuest(func() { + prev = as.pageTables.Unmap(addr, uintptr(length)) || prev + }) + if prev { as.invalidate() as.files.DeleteMapping(usermem.AddrRange{ Start: addr, diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 6afae5cae..7fcb7451f 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -227,3 +227,26 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } } + +// retryInGuest runs the given function in guest mode. +// +// If the function does not complete in guest mode (due to execution of a +// system call due to a GC stall, for example), then it will be retried. The +// given function must be idempotent as a result of the retry mechanism. +func (m *machine) retryInGuest(fn func()) { + c := m.Get() + defer m.Put(c) + for { + c.ClearErrorCode() // See below. + bluepill(c) // Force guest mode. + fn() // Execute the given function. + _, user := c.ErrorCode() + if user { + // If user is set, then we haven't bailed back to host + // mode via a kernel exception or system call. We + // consider the full function to have executed in guest + // mode and we can return. + break + } + } +} diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 0d068c00a..84819f132 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -104,6 +104,14 @@ func (c *CPU) ErrorCode() (value uintptr, user bool) { return c.errorCode, c.errorType != 0 } +// ClearErrorCode resets the error code. +// +//go:nosplit +func (c *CPU) ClearErrorCode() { + c.errorCode = 0 // No code. + c.errorType = 1 // User mode. +} + // SwitchArchOpts are embedded in SwitchOpts. 
type SwitchArchOpts struct { // UserPCID indicates that the application PCID to be used on switch, -- cgit v1.2.3 From 9170303105bedbe7dda1d11b196e21abe9040cdf Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Thu, 7 Jun 2018 10:20:28 -0700 Subject: Sentry: very basic terminal echo support. Adds support for echo to terminals. Echoing is just copying input back out to the user, e.g. when I type "foo" into a terminal, I expect "foo" to be echoed back to my terminal. Also makes the transform function part of the queue, eliminating the need to pass them around together and the possibility of using the wrong transform for a queue. PiperOrigin-RevId: 199655147 Change-Id: I37c490d4fc1ee91da20ae58ba1f884a5c14fd0d8 --- pkg/sentry/fs/tty/line_discipline.go | 135 +++++++++++++++++++++++++---------- pkg/sentry/fs/tty/terminal.go | 4 +- pkg/sentry/fs/tty/tty_test.go | 2 +- 3 files changed, 99 insertions(+), 42 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index bdc4f5b92..a4012135c 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -38,8 +38,20 @@ const ( nonCanonMaxBytes = canonMaxBytes - 1 spacesPerTab = 8 + + // transformInputStateifyKey is used to save and restore queues. + transformInputStateifyKey = "transformInput" + + // transformOutputStateifyKey is used to save and restore queues. + transformOutputStateifyKey = "transformOutput" ) +// transformer is a helper interface to make it easier to stateify queue. +type transformer interface { + // transform functions require queue's mutex to be held. + transform(*lineDiscipline, *queue, []byte) int +} + // queue represents one of the input or output queues between a pty master and // slave. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. 
Bytes are @@ -62,6 +74,11 @@ type queue struct { // canonical mode, there can be an unterminated line in the read buffer, // so readable must be checked. readable bool + + // transform is the the queue's function for transforming bytes + // entering the queue. For example, transform might convert all '\r's + // entering the queue to '\n's. + transformer } // saveReadBuf is invoked by stateify. @@ -142,9 +159,9 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca // (outputQueueRead) +--------------+ (outputQueueWrite) // // Lock order: -// inMu -// outMu -// termiosMu +// termiosMu +// inMu +// outMu type lineDiscipline struct { // inMu protects inQueue. inMu sync.Mutex `state:"nosave"` @@ -159,7 +176,7 @@ type lineDiscipline struct { outQueue queue // termiosMu protects termios. - termiosMu sync.Mutex `state:"nosave"` + termiosMu sync.RWMutex `state:"nosave"` // termios is the terminal configuration used by the lineDiscipline. termios linux.KernelTermios @@ -169,10 +186,17 @@ type lineDiscipline struct { column int } +func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { + ld := lineDiscipline{termios: termios} + ld.inQueue.transformer = &inputQueueTransformer{} + ld.outQueue.transformer = &outputQueueTransformer{} + return &ld +} + // getTermios gets the linux.Termios for the tty. func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - l.termiosMu.Lock() - defer l.termiosMu.Unlock() + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() // We must copy a Termios struct, not KernelTermios. t := l.termios.ToTermios() _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ @@ -183,10 +207,10 @@ func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arc // setTermios sets a linux.Termios for the tty. 
func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - l.inMu.Lock() - defer l.inMu.Unlock() l.termiosMu.Lock() defer l.termiosMu.Unlock() + l.inMu.Lock() + defer l.inMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios @@ -199,7 +223,7 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc // buffer to its read buffer. Anything already in the read buffer is // now readable. if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { - l.pushWaitBuf(&l.inQueue, transformInput) + l.pushWaitBuf(&l.inQueue) } return 0, err @@ -216,12 +240,12 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask { } func (l *lineDiscipline) slaveReadiness() waiter.EventMask { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() l.inMu.Lock() defer l.inMu.Unlock() l.outMu.Lock() defer l.outMu.Unlock() - l.termiosMu.Lock() - defer l.termiosMu.Unlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } @@ -232,15 +256,19 @@ func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() l.inMu.Lock() defer l.inMu.Unlock() - return l.queueRead(ctx, dst, &l.inQueue, transformInput) + return l.queueRead(ctx, dst, &l.inQueue) } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() l.inMu.Lock() defer l.inMu.Unlock() - return l.queueWrite(ctx, src, &l.inQueue, transformInput) + return l.queueWrite(ctx, src, &l.inQueue) } func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { @@ -250,21 +278,27 @@ func (l *lineDiscipline) outputQueueReadSize(ctx 
context.Context, io usermem.IO, } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() l.outMu.Lock() defer l.outMu.Unlock() - return l.queueRead(ctx, dst, &l.outQueue, transformOutput) + return l.queueRead(ctx, dst, &l.outQueue) } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() l.outMu.Lock() defer l.outMu.Unlock() - return l.queueWrite(ctx, src, &l.outQueue, transformOutput) + return l.queueWrite(ctx, src, &l.outQueue) } // queueRead reads from q to userspace. // -// Preconditions: q's lock must be held. -func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, q *queue, f transform) (int64, error) { +// Preconditions: +// * l.termiosMu must be held for reading. +// * q's lock must be held. +func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, q *queue) (int64, error) { if !q.readable { return 0, syserror.ErrWouldBlock } @@ -290,9 +324,7 @@ func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, } // Move data from the queue's wait buffer to its read buffer. - l.termiosMu.Lock() - defer l.termiosMu.Unlock() - l.pushWaitBuf(q, f) + l.pushWaitBuf(q) // If state changed, notify any waiters. If nothing was available to // read, let the caller know we could block. @@ -304,11 +336,12 @@ func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, return int64(n), nil } -// queueWrite writes to q from userspace. f is the function used to perform -// processing on data being written and write it to the read buffer. +// queueWrite writes to q from userspace. // -// Precondition: q's lock must be held. 
-func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, q *queue, f transform) (int64, error) { +// Preconditions: +// * l.termiosMu must be held for reading. +// * q's lock must be held. +func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, q *queue) (int64, error) { // TODO: Use CopyInTo/safemem to avoid extra copying. // Copy in the bytes to write from user-space. b := make([]byte, src.NumBytes()) @@ -317,11 +350,17 @@ func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, return 0, err } b = b[:n] + return l.queueWriteBytes(b, q) +} +// queueWriteBytes writes to q from b. +// +// Precondition: +// * l.termiosMu must be held for reading. +// * q's lock must be held. +func (l *lineDiscipline) queueWriteBytes(b []byte, q *queue) (int64, error) { // Write as much as possible to the read buffer. - l.termiosMu.Lock() - defer l.termiosMu.Unlock() - n = f(l, q, b) + n := q.transform(l, q, b) // Write remaining data to the wait buffer. nWaiting, _ := q.waitBuf.Write(b[n:]) @@ -338,10 +377,12 @@ func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, // pushWaitBuf fills the queue's read buffer with data from the wait buffer. // -// Precondition: l.inMu and l.termiosMu must be held. -func (l *lineDiscipline) pushWaitBuf(q *queue, f transform) { +// Precondition: +// * l.termiosMu must be held for reading. +// * l.inMu must be held. +func (l *lineDiscipline) pushWaitBuf(q *queue) { // Remove bytes from the wait buffer and move them to the read buffer. - n := f(l, q, q.waitBuf.Bytes()) + n := q.transform(l, q, q.waitBuf.Bytes()) q.waitBuf.Next(n) // If state changed, notify any waiters. @@ -350,14 +391,16 @@ func (l *lineDiscipline) pushWaitBuf(q *queue, f transform) { } } -// transform functions require the passed in lineDiscipline's mutex to be held. -type transform func(*lineDiscipline, *queue, []byte) int +// outputQueueTransformer implements transformer. 
+type outputQueueTransformer struct{} -// transformOutput does output processing for one end of the pty. See +// transform does output processing for one end of the pty. See // drivers/tty/n_tty.c:do_output_char for an analogous kernel function. // -// Precondition: l.termiosMu and q's mutex must be held. -func transformOutput(l *lineDiscipline, q *queue, buf []byte) int { +// Precondition: +// * l.termiosMu must be held for reading. +// * q's mutex must be held. +func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { // transformOutput is effectively always in noncanonical mode, as the // master termios never has ICANON set. @@ -418,13 +461,18 @@ func transformOutput(l *lineDiscipline, q *queue, buf []byte) int { return ret } -// transformInput does input processing for one end of the pty. Characters read -// are transformed according to flags set in the termios struct. See +// inputQueueTransformer implements transformer. +type inputQueueTransformer struct{} + +// transform does input processing for one end of the pty. Characters read are +// transformed according to flags set in the termios struct. See // drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel // function. // -// Precondition: l.termiosMu and q's mutex must be held. -func transformInput(l *lineDiscipline, q *queue, buf []byte) int { +// Precondition: +// * l.termiosMu must be held for reading. +// * q's mutex must be held. +func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { // If there's a line waiting to be read in canonical mode, don't write // anything else to the read buffer. 
if l.termios.LEnabled(linux.ICANON) && q.readable { @@ -467,6 +515,7 @@ func transformInput(l *lineDiscipline, q *queue, buf []byte) int { if q.readBuf.Len()+size > maxBytes { break } + cBytes := buf[:size] buf = buf[size:] ret += size @@ -477,6 +526,14 @@ func transformInput(l *lineDiscipline, q *queue, buf []byte) int { } q.readBuf.WriteRune(c) + // Anything written to the readBuf will have to be echoed. + if l.termios.LEnabled(linux.ECHO) { + // We can't defer Unlock here because we may + // Lock/Unlock l.outMu multiple times in this loop. + l.outMu.Lock() + l.queueWriteBytes(cBytes, &l.outQueue) + l.outMu.Unlock() + } // If we finish a line, make it available for reading. if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(c) { diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 6ae713a32..fa5b00409 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -31,7 +31,7 @@ type Terminal struct { d *dirInodeOperations // ld is the line discipline of the terminal. 
- ld lineDiscipline + ld *lineDiscipline } func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { @@ -39,6 +39,6 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal return &Terminal{ d: d, n: n, - ld: lineDiscipline{termios: termios}, + ld: newLineDiscipline(termios), } } diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index 0c7560ed7..32e1b1556 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -23,7 +23,7 @@ import ( ) func TestSimpleMasterToSlave(t *testing.T) { - ld := lineDiscipline{termios: linux.DefaultSlaveTermios} + ld := newLineDiscipline(linux.DefaultSlaveTermios) ctx := contexttest.Context(t) inBytes := []byte("hello, tty\n") src := usermem.BytesIOSequence(inBytes) -- cgit v1.2.3 From 7e9893eeb500ab56dcab80471300df50c12288ae Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 7 Jun 2018 11:36:26 -0700 Subject: Add missing rpcinet ioctls. PiperOrigin-RevId: 199669120 Change-Id: I0be88cdbba29760f967e9a5bb4144ca62c1ed7aa --- pkg/sentry/socket/rpcinet/socket.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 3356f7804..29546b683 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -417,10 +417,22 @@ func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.S var buf []byte switch cmd { // The following ioctls take 4 byte argument parameters. - case syscall.TIOCINQ, syscall.TIOCOUTQ: + case syscall.TIOCINQ, + syscall.TIOCOUTQ: buf = make([]byte, 4) // The following ioctls have args which are sizeof(struct ifreq). 
- case syscall.SIOCGIFINDEX, syscall.SIOCGIFNETMASK, syscall.SIOCGIFHWADDR, syscall.SIOCGIFNAME, syscall.SIOCGIFFLAGS: + case syscall.SIOCGIFADDR, + syscall.SIOCGIFBRDADDR, + syscall.SIOCGIFDSTADDR, + syscall.SIOCGIFFLAGS, + syscall.SIOCGIFHWADDR, + syscall.SIOCGIFINDEX, + syscall.SIOCGIFMAP, + syscall.SIOCGIFMETRIC, + syscall.SIOCGIFMTU, + syscall.SIOCGIFNAME, + syscall.SIOCGIFNETMASK, + syscall.SIOCGIFTXQLEN: buf = make([]byte, sizeOfIfReq) default: return 0, syserror.ENOTTY -- cgit v1.2.3 From 5c37097e34a513845d77bb8b7240f0074aa1c1e9 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 7 Jun 2018 15:09:27 -0700 Subject: rpcinet should not block in read(2) rpcs. PiperOrigin-RevId: 199703609 Change-Id: I8153b0396b22a230a68d4b69c46652a5545f7630 --- pkg/sentry/socket/rpcinet/socket.go | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 29546b683..69cf604b7 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -145,29 +145,8 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS n, e := dst.CopyOut(ctx, res.Data) return int64(n), e } - if se != syserr.ErrWouldBlock { - return 0, se.ToError() - } - - // We'll have to block. Register for notifications and read again when ready. 
- e, ch := waiter.NewChannelEntry(nil) - s.EventRegister(&e, waiter.EventIn) - defer s.EventUnregister(&e) - for { - res, se := rpcRead(ctx.(*kernel.Task), req) - if se == nil { - n, e := dst.CopyOut(ctx, res.Data) - return int64(n), e - } - if se != syserr.ErrWouldBlock { - return 0, se.ToError() - } - - if err := ctx.(*kernel.Task).Block(ch); err != nil { - return 0, err - } - } + return 0, se.ToError() } func rpcWrite(t *kernel.Task, req *pb.SyscallRequest_Write) (uint32, *syserr.Error) { -- cgit v1.2.3 From 2f3895d6f7ad37915edcdd80706f880ce50c519c Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Fri, 8 Jun 2018 10:32:30 -0700 Subject: rpcinet is not correctly handling MSG_TRUNC on recvmsg(2). MSG_TRUNC can cause recvmsg(2) to return a value larger than the buffer size. In this situation it's an indication that the buffer was completely filled and that the msg was truncated. Previously in rpcinet we were returning the buffer size but we should actually be returning the payload length as returned by the syscall. 
PiperOrigin-RevId: 199814221 Change-Id: If09aa364219c1bf193603896fcc0dc5c55e85d21 --- pkg/sentry/socket/rpcinet/socket.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 69cf604b7..c4ecb30f5 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -465,8 +465,8 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags res, err := rpcRecvMsg(t, req) if err == nil { - n, e := dst.CopyOut(t, res.Data) - return int(n), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) + _, e := dst.CopyOut(t, res.Data) + return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return 0, nil, 0, socket.ControlMessages{}, err @@ -481,8 +481,8 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags for { res, err := rpcRecvMsg(t, req) if err == nil { - n, e := dst.CopyOut(t, res.Data) - return int(n), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) + _, e := dst.CopyOut(t, res.Data) + return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } if err != syserr.ErrWouldBlock { return 0, nil, 0, socket.ControlMessages{}, err -- cgit v1.2.3 From 6728f09910bd9f7633f277fafe6945cfaa2abf42 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 8 Jun 2018 15:00:29 -0700 Subject: Fix sigaltstack semantics. Walking off the bottom of the sigaltstack, for example with recursive faults, results in forced signal delivery, not resetting the stack or pushing signal stack to whatever happens to lie below the signal stack. 
PiperOrigin-RevId: 199856085 Change-Id: I0004d2523f0df35d18714de2685b3eaa147837e0 --- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/signal_amd64.go | 16 ++++++--- pkg/sentry/arch/signal_stack.go | 11 ++++-- pkg/sentry/kernel/task_signals.go | 60 ++++++++++++++++++++++----------- pkg/sentry/syscalls/linux/sys_signal.go | 16 ++++----- 5 files changed, 70 insertions(+), 35 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 021789e4b..0189e958d 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -158,7 +158,7 @@ type Context interface { // rt is true if SignalRestore is being entered from rt_sigreturn and // false if SignalRestore is being entered from sigreturn. // SignalRestore returns the thread's new signal mask. - SignalRestore(st *Stack, rt bool) (linux.SignalSet, error) + SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) // CPUIDEmulate emulates a CPUID instruction according to current register state. CPUIDEmulate(l log.Logger) diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 4040b530f..c1d743f38 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -377,6 +377,14 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt sp = frameBottom + usermem.Addr(frameSize) st.Bottom = sp + // Prior to proceeding, figure out if the frame will exhaust the range + // for the signal stack. This is not allowed, and should immediately + // force signal delivery (reverting to the default handler). + if act.IsOnStack() && alt.IsEnabled() && !alt.Contains(frameBottom) { + return syscall.EFAULT + } + + // Adjust the code. info.FixSignalCodeForUser() // Set up the stack frame. @@ -422,15 +430,15 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt // SignalRestore implements Context.SignalRestore. 
(Compare to Linux's // arch/x86/kernel/signal.c:sys_rt_sigreturn().) -func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, error) { +func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { // Copy out the stack frame. var uc UContext64 if _, err := st.Pop(&uc); err != nil { - return 0, err + return 0, SignalStack{}, err } var info SignalInfo if _, err := st.Pop(&info); err != nil { - return 0, err + return 0, SignalStack{}, err } // Restore registers. @@ -472,5 +480,5 @@ func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, error) { log.Infof("sigreturn unable to restore application fpstate") } - return uc.Sigset, nil + return uc.Sigset, uc.Stack, nil } diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index 7c6531d79..ba43dd1d4 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -39,12 +39,19 @@ func (s SignalStack) Top() usermem.Addr { return usermem.Addr(s.Addr + s.Size) } -// SetOnStack marks this signal stack as in use. (This is only called on copies -// sent to user applications, so there's no corresponding ClearOnStack.) +// SetOnStack marks this signal stack as in use. +// +// Note that there is no corresponding ClearOnStack, and that this should only +// be called on copies that are serialized to userspace. func (s *SignalStack) SetOnStack() { s.Flags |= SignalStackFlagOnStack } +// Contains checks if the stack pointer is within this stack. +func (s *SignalStack) Contains(sp usermem.Addr) bool { + return usermem.Addr(s.Addr) < sp && sp <= usermem.Addr(s.Addr+s.Size) +} + // NativeSignalStack is a type that is equivalent to stack_t in the guest // architecture. 
type NativeSignalStack interface { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index e4ef7fd67..91f6c0874 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -212,7 +212,9 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS // Try to deliver the signal to the user-configured handler. t.Debugf("Signal %d: delivering to handler", info.Signo) if err := t.deliverSignalToHandler(info, act); err != nil { - t.Warningf("Failed to deliver signal %+v to user handler: %v", info, err) + // This is not a warning, it can occur during normal operation. + t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err) + // Send a forced SIGSEGV. If the signal that couldn't be delivered // was a SIGSEGV, force the handler to SIG_DFL. t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) @@ -241,7 +243,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) alt := t.signalStack if act.IsOnStack() && alt.IsEnabled() { alt.SetOnStack() - if !t.OnSignalStack(alt) { + if !alt.Contains(sp) { sp = usermem.Addr(alt.Top()) } } @@ -275,18 +277,20 @@ var ctrlResume = &SyscallControl{ignoreReturn: true} // rt is true). func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { st := t.Stack() - sigset, err := t.Arch().SignalRestore(st, rt) + sigset, alt, err := t.Arch().SignalRestore(st, rt) if err != nil { return nil, err } + // Attempt to record the given signal stack. Note that we silently + // ignore failures here, as does Linux. Only an EFAULT may be + // generated, but SignalRestore has already deserialized the entire + // frame successfully. + t.SetSignalStack(alt) + // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. 
t.SetSignalMask(sigset &^ UnblockableSignals) - // TODO: sys_rt_sigreturn also calls restore_altstack from - // uc.stack, allowing the signal handler to implicitly mutate the signal - // stack. - return ctrlResume, nil } @@ -624,23 +628,41 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { // SignalStack returns the task-private signal stack. func (t *Task) SignalStack() arch.SignalStack { - return t.signalStack + alt := t.signalStack + if t.onSignalStack(alt) { + alt.Flags |= arch.SignalStackFlagOnStack + } + return alt } -// OnSignalStack returns true if, when the task resumes running, it will run on -// the task-private signal stack. -func (t *Task) OnSignalStack(s arch.SignalStack) bool { +// onSignalStack returns true if the task is executing on the given signal stack. +func (t *Task) onSignalStack(alt arch.SignalStack) bool { sp := usermem.Addr(t.Arch().Stack()) - return usermem.Addr(s.Addr) <= sp && sp < usermem.Addr(s.Addr+s.Size) + return alt.Contains(sp) } -// SetSignalStack sets the task-private signal stack and clears the -// SignalStackFlagDisable, since we have a signal stack. -func (t *Task) SetSignalStack(alt arch.SignalStack) error { - // Mask out irrelevant parts: only disable matters. - alt.Flags &= arch.SignalStackFlagDisable - t.signalStack = alt - return nil +// SetSignalStack sets the task-private signal stack. +// +// This value may not be changed if the task is currently executing on the +// signal stack, i.e. if t.onSignalStack returns true. In this case, this +// function will return false. Otherwise, true is returned. +func (t *Task) SetSignalStack(alt arch.SignalStack) bool { + // Check that we're not executing on the stack. + if t.onSignalStack(t.signalStack) { + return false + } + + if alt.Flags&arch.SignalStackFlagDisable != 0 { + // Don't record anything beyond the flags. + t.signalStack = arch.SignalStack{ + Flags: arch.SignalStackFlagDisable, + } + } else { + // Mask out irrelevant parts: only disable matters. 
+ alt.Flags &= arch.SignalStackFlagDisable + t.signalStack = alt + } + return true } // SetSignalAct atomically sets the thread group's signal action for signal sig diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 93b3f531a..66ecb1299 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -315,25 +315,23 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S setaddr := args[0].Pointer() oldaddr := args[1].Pointer() + alt := t.SignalStack() if oldaddr != 0 { - alt := t.SignalStack() - if t.OnSignalStack(alt) { - alt.Flags |= arch.SignalStackFlagOnStack - } if err := t.CopyOutSignalStack(oldaddr, &alt); err != nil { return 0, nil, err } } if setaddr != 0 { - if t.OnSignalStack(t.SignalStack()) { - return 0, nil, syserror.EPERM - } alt, err := t.CopyInSignalStack(setaddr) if err != nil { return 0, nil, err } - if err := t.SetSignalStack(alt); err != nil { - return 0, nil, err + // The signal stack cannot be changed if the task is currently + // on the stack. This is enforced at the lowest level because + // these semantics apply to changing the signal stack via a + // ucontext during a signal handler. 
+ if !t.SetSignalStack(alt) { + return 0, nil, syserror.EPERM } } -- cgit v1.2.3 From 2fbd1cf57cb06c5f0165a2d0e9225eed242a41f5 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Fri, 8 Jun 2018 15:57:33 -0700 Subject: Add checks for short CopyOut in rpcinet PiperOrigin-RevId: 199864753 Change-Id: Ibace6a1fdf99ee6ce368ac12c390aa8a02dbdfb7 --- pkg/sentry/socket/rpcinet/socket.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index c4ecb30f5..a9dd1780a 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -465,7 +465,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags res, err := rpcRecvMsg(t, req) if err == nil { - _, e := dst.CopyOut(t, res.Data) + n, e := dst.CopyOut(t, res.Data) + if e == nil && n != len(res.Data) { + panic("CopyOut failed to copy full buffer") + } return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { @@ -481,7 +484,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags for { res, err := rpcRecvMsg(t, req) if err == nil { - _, e := dst.CopyOut(t, res.Data) + n, e := dst.CopyOut(t, res.Data) + if e == nil && n != len(res.Data) { + panic("CopyOut failed to copy full buffer") + } return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } if err != syserr.ErrWouldBlock { -- cgit v1.2.3 From c0ab059e7b904197f52ade879711d7fb02ffa8c0 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 8 Jun 2018 17:50:55 -0700 Subject: Fix kernel flags handling and add missing vectors. 
PiperOrigin-RevId: 199877174 Change-Id: I9d19ea301608c2b989df0a6123abb1e779427853 --- pkg/sentry/platform/kvm/machine_amd64.go | 52 ++++++++++++++++++++++++------ pkg/sentry/platform/ring0/entry_amd64.s | 10 +++--- pkg/sentry/platform/ring0/kernel_amd64.go | 21 +++--------- pkg/sentry/platform/ring0/offsets_amd64.go | 3 +- pkg/sentry/platform/ring0/x86.go | 15 +++++++++ 5 files changed, 70 insertions(+), 31 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 7fcb7451f..7ac289756 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -150,13 +150,20 @@ func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) // the code provided here. We need to re-execute. return nil, usermem.NoAccess, platform.ErrContextInterrupt } - info := &arch.SignalInfo{Signo: signal} + info := &arch.SignalInfo{ + Signo: signal, + } info.SetAddr(uint64(faultAddr)) accessType := usermem.AccessType{ Read: code&(1<<1) == 0, Write: code&(1<<1) != 0, Execute: code&(1<<4) != 0, } + if !accessType.Write && !accessType.Execute { + info.Code = 1 // SEGV_MAPERR. + } else { + info.Code = 2 // SEGV_ACCERR. + } return info, accessType, platform.ErrContextSignal } @@ -191,30 +198,55 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user return c.fault(int32(syscall.SIGSEGV)) case ring0.Debug, ring0.Breakpoint: - info := &arch.SignalInfo{Signo: int32(syscall.SIGTRAP)} + info := &arch.SignalInfo{ + Signo: int32(syscall.SIGTRAP), + Code: 1, // TRAP_BRKPT (breakpoint). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. return info, usermem.AccessType{}, platform.ErrContextSignal case ring0.GeneralProtectionFault: - if !ring0.IsCanonical(switchOpts.Registers.Rip) { - // If the RIP is non-canonical, it's a SEGV. 
- info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)} - return info, usermem.AccessType{}, platform.ErrContextSignal + info := &arch.SignalInfo{ + Signo: int32(syscall.SIGSEGV), + Code: arch.SignalInfoKernel, } - // Otherwise, we deliver a SIGBUS. - info := &arch.SignalInfo{Signo: int32(syscall.SIGBUS)} + info.SetAddr(switchOpts.Registers.Rip) // Include address. return info, usermem.AccessType{}, platform.ErrContextSignal case ring0.InvalidOpcode: - info := &arch.SignalInfo{Signo: int32(syscall.SIGILL)} + info := &arch.SignalInfo{ + Signo: int32(syscall.SIGILL), + Code: 1, // ILL_ILLOPC (illegal opcode). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return info, usermem.AccessType{}, platform.ErrContextSignal + + case ring0.DivideByZero: + info := &arch.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 1, // FPE_INTDIV (divide by zero). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. return info, usermem.AccessType{}, platform.ErrContextSignal case ring0.X87FloatingPointException: - info := &arch.SignalInfo{Signo: int32(syscall.SIGFPE)} + info := &arch.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 7, // FPE_FLTINV (invalid operation). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. return info, usermem.AccessType{}, platform.ErrContextSignal case ring0.Vector(bounce): return nil, usermem.NoAccess, platform.ErrContextInterrupt + case ring0.AlignmentCheck: + info := &arch.SignalInfo{ + Signo: int32(syscall.SIGBUS), + Code: 2, // BUS_ADRERR (physical address does not exist). 
+ } + return info, usermem.NoAccess, platform.ErrContextSignal + case ring0.NMI: // An NMI is generated only when a fault is not servicable by // KVM itself, so we think some mapping is writeable but it's diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index e8638133b..08c15ad65 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -248,10 +248,12 @@ TEXT ·exception(SB),NOSPLIT,$0 user: SWAP_GS() - XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). - REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. - MOVQ BX, PTRACE_RAX(AX) // Save everything else. + ADDQ $-8, SP // Adjust for flags. + MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ). + XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs. + REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX. + MOVQ BX, PTRACE_RAX(AX) // Save it. MOVQ BX, PTRACE_ORIGRAX(AX) MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX) diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 37d5484e1..c828af654 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -20,20 +20,6 @@ import ( "encoding/binary" ) -const ( - // KernelFlagsSet should always be set in the kernel. - KernelFlagsSet = _RFLAGS_RESERVED - - // UserFlagsSet are always set in userspace. - UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF - - // KernelFlagsClear should always be clear in the kernel. - KernelFlagsClear = _RFLAGS_IF | _RFLAGS_NT | _RFLAGS_IOPL - - // UserFlagsClear are always cleared in userspace. - UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL -) - // init initializes architecture-specific state. func (k *Kernel) init(opts KernelOpts) { // Save the root page tables. 
@@ -85,6 +71,9 @@ func (c *CPU) init() { c.registers.Ss = uint64(Kdata) c.registers.Fs = uint64(Kdata) c.registers.Gs = uint64(Kdata) + + // Set mandatory flags. + c.registers.Eflags = KernelFlagsSet } // StackTop returns the kernel's stack address. @@ -119,7 +108,7 @@ func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { // //go:nosplit func (c *CPU) CR0() uint64 { - return _CR0_PE | _CR0_PG | _CR0_ET + return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET } // CR4 returns the CPU's CR4 value. @@ -240,7 +229,7 @@ func start(c *CPU) { // Set the syscall target. wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) - wrmsr(_MSR_SYSCALL_MASK, _RFLAGS_STEP|_RFLAGS_IF|_RFLAGS_DF|_RFLAGS_IOPL|_RFLAGS_AC|_RFLAGS_NT) + wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF) // NOTE: This depends on having the 64-bit segments immediately // following the 32-bit user segments. This is simply the way the diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index 9acd442ba..ca5fd456b 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -38,7 +38,8 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_KERNEL_SYSCALL 0x%02x\n", reflect.ValueOf(&c.KernelSyscall).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") - fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) fmt.Fprintf(w, "\n// Vectors.\n") fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero) diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index 74b140066..3d437a77c 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -24,6 +24,7 @@ import ( const ( _CR0_PE = 1 << 0 _CR0_ET = 1 << 4 + _CR0_AM = 1 << 18 _CR0_PG = 1 << 31 _CR4_PSE = 1 << 4 @@ -55,6 +56,20 @@ const ( _MSR_SYSCALL_MASK = 0xc0000084 ) +const ( + // 
KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _RFLAGS_RESERVED + + // UserFlagsSet are always set in userspace. + UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF + + // KernelFlagsClear should always be clear in the kernel. + KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT + + // UserFlagsClear are always cleared in userspace. + UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL +) + // Vector is an exception vector. type Vector uintptr -- cgit v1.2.3 From 032b0398a5a664c345c4868d5527846a1b6848db Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 11 Jun 2018 11:08:51 -0700 Subject: Sentry: split tty.queue into its own file. Minor refactor. line_discipline.go was home to 2 large structs (lineDiscipline and queue), and queue is now large enough IMO to get its own file. Also moves queue locks into the queue struct, making locking simpler. PiperOrigin-RevId: 200080301 Change-Id: Ia75a0e9b3d9ac8d7e5a0f0099a54e1f5b8bdea34 --- pkg/sentry/fs/tty/BUILD | 2 + pkg/sentry/fs/tty/line_discipline.go | 252 +++-------------------------------- pkg/sentry/fs/tty/queue.go | 218 ++++++++++++++++++++++++++++++ 3 files changed, 242 insertions(+), 230 deletions(-) create mode 100644 pkg/sentry/fs/tty/queue.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 90b350410..fce327dfe 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -11,6 +11,7 @@ go_stateify( "inode.go", "line_discipline.go", "master.go", + "queue.go", "slave.go", "terminal.go", ], @@ -26,6 +27,7 @@ go_library( "inode.go", "line_discipline.go", "master.go", + "queue.go", "slave.go", "terminal.go", "tty_state.go", diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index a4012135c..f094635f5 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -23,7 +23,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" 
"gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -38,97 +37,8 @@ const ( nonCanonMaxBytes = canonMaxBytes - 1 spacesPerTab = 8 - - // transformInputStateifyKey is used to save and restore queues. - transformInputStateifyKey = "transformInput" - - // transformOutputStateifyKey is used to save and restore queues. - transformOutputStateifyKey = "transformOutput" ) -// transformer is a helper interface to make it easier to stateify queue. -type transformer interface { - // transform functions require queue's mutex to be held. - transform(*lineDiscipline, *queue, []byte) int -} - -// queue represents one of the input or output queues between a pty master and -// slave. Bytes written to a queue are added to the read buffer until it is -// full, at which point they are written to the wait buffer. Bytes are -// processed (i.e. undergo termios transformations) as they are added to the -// read buffer. The read buffer is readable when its length is nonzero and -// readable is true. -type queue struct { - waiter.Queue `state:"nosave"` - - // readBuf is buffer of data ready to be read when readable is true. - // This data has been processed. - readBuf bytes.Buffer `state:".([]byte)"` - - // waitBuf contains data that can't fit into readBuf. It is put here - // until it can be loaded into the read buffer. waitBuf contains data - // that hasn't been processed. - waitBuf bytes.Buffer `state:".([]byte)"` - - // readable indicates whether the read buffer can be read from. In - // canonical mode, there can be an unterminated line in the read buffer, - // so readable must be checked. - readable bool - - // transform is the the queue's function for transforming bytes - // entering the queue. For example, transform might convert all '\r's - // entering the queue to '\n's. - transformer -} - -// saveReadBuf is invoked by stateify. 
-func (q *queue) saveReadBuf() []byte { - return append([]byte(nil), q.readBuf.Bytes()...) -} - -// loadReadBuf is invoked by stateify. -func (q *queue) loadReadBuf(b []byte) { - q.readBuf.Write(b) -} - -// saveWaitBuf is invoked by stateify. -func (q *queue) saveWaitBuf() []byte { - return append([]byte(nil), q.waitBuf.Bytes()...) -} - -// loadWaitBuf is invoked by stateify. -func (q *queue) loadWaitBuf(b []byte) { - q.waitBuf.Write(b) -} - -// readReadiness returns whether q is ready to be read from. -func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { - if q.readBuf.Len() > 0 && q.readable { - return waiter.EventIn - } - return waiter.EventMask(0) -} - -// writeReadiness returns whether q is ready to be written to. -func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { - // Like Linux, we don't impose a maximum size on what can be enqueued. - return waiter.EventOut -} - -// readableSize writes the number of readable bytes to userspace. -func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - var size int32 - if q.readable { - size = int32(q.readBuf.Len()) - } - - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return err - -} - // lineDiscipline dictates how input and output are handled between the // pseudoterminal (pty) master and slave. It can be configured to alter I/O, // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man @@ -160,18 +70,12 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca // // Lock order: // termiosMu -// inMu -// outMu +// inQueue.mu +// outQueue.mu type lineDiscipline struct { - // inMu protects inQueue. - inMu sync.Mutex `state:"nosave"` - // inQueue is the input queue of the terminal. inQueue queue - // outMu protects outQueue. - outMu sync.Mutex `state:"nosave"` - // outQueue is the output queue of the terminal. 
outQueue queue @@ -209,8 +113,6 @@ func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arc func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() defer l.termiosMu.Unlock() - l.inMu.Lock() - defer l.inMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios @@ -223,17 +125,13 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc // buffer to its read buffer. Anything already in the read buffer is // now readable. if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { - l.pushWaitBuf(&l.inQueue) + l.inQueue.pushWaitBuf(l) } return 0, err } func (l *lineDiscipline) masterReadiness() waiter.EventMask { - l.inMu.Lock() - defer l.inMu.Unlock() - l.outMu.Lock() - defer l.outMu.Unlock() // We don't have to lock a termios because the default master termios // is immutable. return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) @@ -242,156 +140,49 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask { func (l *lineDiscipline) slaveReadiness() waiter.EventMask { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - l.inMu.Lock() - defer l.inMu.Unlock() - l.outMu.Lock() - defer l.outMu.Unlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - l.inMu.Lock() - defer l.inMu.Unlock() return l.inQueue.readableSize(ctx, io, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - l.inMu.Lock() - defer l.inMu.Unlock() - return l.queueRead(ctx, dst, &l.inQueue) + return l.inQueue.read(ctx, dst, l) } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, 
src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - l.inMu.Lock() - defer l.inMu.Unlock() - return l.queueWrite(ctx, src, &l.inQueue) + return l.inQueue.write(ctx, src, l) } func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - l.outMu.Lock() - defer l.outMu.Unlock() return l.outQueue.readableSize(ctx, io, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - l.outMu.Lock() - defer l.outMu.Unlock() - return l.queueRead(ctx, dst, &l.outQueue) + return l.outQueue.read(ctx, dst, l) } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - l.outMu.Lock() - defer l.outMu.Unlock() - return l.queueWrite(ctx, src, &l.outQueue) + return l.outQueue.write(ctx, src, l) } -// queueRead reads from q to userspace. -// -// Preconditions: -// * l.termiosMu must be held for reading. -// * q's lock must be held. -func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, q *queue) (int64, error) { - if !q.readable { - return 0, syserror.ErrWouldBlock - } - - // Read out from the read buffer. - n := canonMaxBytes - if n > int(dst.NumBytes()) { - n = int(dst.NumBytes()) - } - if n > q.readBuf.Len() { - n = q.readBuf.Len() - } - n, err := dst.Writer(ctx).Write(q.readBuf.Bytes()[:n]) - if err != nil { - return 0, err - } - // Discard bytes read out. - q.readBuf.Next(n) - - // If we read everything, this queue is no longer readable. - if q.readBuf.Len() == 0 { - q.readable = false - } - - // Move data from the queue's wait buffer to its read buffer. - l.pushWaitBuf(q) - - // If state changed, notify any waiters. If nothing was available to - // read, let the caller know we could block. 
- if n > 0 { - q.Notify(waiter.EventOut) - } else { - return 0, syserror.ErrWouldBlock - } - return int64(n), nil -} - -// queueWrite writes to q from userspace. -// -// Preconditions: -// * l.termiosMu must be held for reading. -// * q's lock must be held. -func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, q *queue) (int64, error) { - // TODO: Use CopyInTo/safemem to avoid extra copying. - // Copy in the bytes to write from user-space. - b := make([]byte, src.NumBytes()) - n, err := src.CopyIn(ctx, b) - if err != nil { - return 0, err - } - b = b[:n] - return l.queueWriteBytes(b, q) -} - -// queueWriteBytes writes to q from b. -// -// Precondition: -// * l.termiosMu must be held for reading. -// * q's lock must be held. -func (l *lineDiscipline) queueWriteBytes(b []byte, q *queue) (int64, error) { - // Write as much as possible to the read buffer. - n := q.transform(l, q, b) - - // Write remaining data to the wait buffer. - nWaiting, _ := q.waitBuf.Write(b[n:]) - - // If state changed, notify any waiters. If we were unable to write - // anything, let the caller know we could block. - if n > 0 { - q.Notify(waiter.EventIn) - } else if nWaiting == 0 { - return 0, syserror.ErrWouldBlock - } - return int64(n + nWaiting), nil -} - -// pushWaitBuf fills the queue's read buffer with data from the wait buffer. -// -// Precondition: -// * l.termiosMu must be held for reading. -// * l.inMu must be held. -func (l *lineDiscipline) pushWaitBuf(q *queue) { - // Remove bytes from the wait buffer and move them to the read buffer. - n := q.transform(l, q, q.waitBuf.Bytes()) - q.waitBuf.Next(n) - - // If state changed, notify any waiters. - if n > 0 { - q.Notify(waiter.EventIn) - } +// transformer is a helper interface to make it easier to stateify queue. +type transformer interface { + // transform functions require queue's mutex to be held. + transform(*lineDiscipline, *queue, []byte) int } -// outputQueueTransformer implements transformer. 
+// outputQueueTransformer implements transformer. It performs line discipline +// transformations on the output queue. type outputQueueTransformer struct{} // transform does output processing for one end of the pty. See @@ -399,7 +190,7 @@ type outputQueueTransformer struct{} // // Precondition: // * l.termiosMu must be held for reading. -// * q's mutex must be held. +// * q.mu must be held. func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { // transformOutput is effectively always in noncanonical mode, as the // master termios never has ICANON set. @@ -461,7 +252,8 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte return ret } -// inputQueueTransformer implements transformer. +// inputQueueTransformer implements transformer. It performs line discipline +// transformations on the input queue. type inputQueueTransformer struct{} // transform does input processing for one end of the pty. Characters read are @@ -471,7 +263,7 @@ type inputQueueTransformer struct{} // // Precondition: // * l.termiosMu must be held for reading. -// * q's mutex must be held. +// * q.mu must be held. func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { // If there's a line waiting to be read in canonical mode, don't write // anything else to the read buffer. @@ -528,11 +320,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) q.readBuf.WriteRune(c) // Anything written to the readBuf will have to be echoed. if l.termios.LEnabled(linux.ECHO) { - // We can't defer Unlock here because we may - // Lock/Unlock l.outMu multiple times in this loop. - l.outMu.Lock() - l.queueWriteBytes(cBytes, &l.outQueue) - l.outMu.Unlock() + l.outQueue.writeBytes(cBytes, l) } // If we finish a line, make it available for reading. 
@@ -553,6 +341,10 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) // shouldDiscard returns whether c should be discarded. In canonical mode, if // too many bytes are enqueued, we keep reading input and discarding it until // we find a terminating character. Signal/echo processing still occurs. +// +// Precondition: +// * l.termiosMu must be held for reading. +// * q.mu must be held. func (l *lineDiscipline) shouldDiscard(q *queue, c rune) bool { return l.termios.LEnabled(linux.ICANON) && q.readBuf.Len()+utf8.RuneLen(c) >= canonMaxBytes && !l.termios.IsTerminating(c) } diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go new file mode 100644 index 000000000..026d5e077 --- /dev/null +++ b/pkg/sentry/fs/tty/queue.go @@ -0,0 +1,218 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "bytes" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// queue represents one of the input or output queues between a pty master and +// slave. Bytes written to a queue are added to the read buffer until it is +// full, at which point they are written to the wait buffer. Bytes are +// processed (i.e. 
 undergo termios transformations) as they are added to the +// read buffer. The read buffer is readable when its length is nonzero and +// readable is true. +type queue struct { + // mu protects everything in queue. + mu sync.Mutex `state:"nosave"` + + waiter.Queue `state:"nosave"` + + // readBuf is a buffer of data ready to be read when readable is true. + // This data has been processed. + readBuf bytes.Buffer `state:".([]byte)"` + + // waitBuf contains data that can't fit into readBuf. It is put here + // until it can be loaded into the read buffer. waitBuf contains data + // that hasn't been processed. + waitBuf bytes.Buffer `state:".([]byte)"` + + // readable indicates whether the read buffer can be read from. In + // canonical mode, there can be an unterminated line in the read buffer, + // so readable must be checked. + readable bool + + // transform is the queue's function for transforming bytes + // entering the queue. For example, transform might convert all '\r's + // entering the queue to '\n's. + transformer +} + +// saveReadBuf is invoked by stateify. +func (q *queue) saveReadBuf() []byte { + return append([]byte(nil), q.readBuf.Bytes()...) +} + +// loadReadBuf is invoked by stateify. +func (q *queue) loadReadBuf(b []byte) { + q.readBuf.Write(b) +} + +// saveWaitBuf is invoked by stateify. +func (q *queue) saveWaitBuf() []byte { + return append([]byte(nil), q.waitBuf.Bytes()...) +} + +// loadWaitBuf is invoked by stateify. +func (q *queue) loadWaitBuf(b []byte) { + q.waitBuf.Write(b) +} + +// readReadiness returns whether q is ready to be read from. +func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + if q.readBuf.Len() > 0 && q.readable { + return waiter.EventIn + } + return waiter.EventMask(0) + } + +// writeReadiness returns whether q is ready to be written to. 
+func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { + // Like Linux, we don't impose a maximum size on what can be enqueued. + return waiter.EventOut +} + +// readableSize writes the number of readable bytes to userspace. +func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + q.mu.Lock() + defer q.mu.Unlock() + var size int32 + if q.readable { + size = int32(q.readBuf.Len()) + } + + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err + +} + +// read reads from q to userspace. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, error) { + q.mu.Lock() + defer q.mu.Unlock() + if !q.readable { + return 0, syserror.ErrWouldBlock + } + + // Read out from the read buffer. + n := canonMaxBytes + if n > int(dst.NumBytes()) { + n = int(dst.NumBytes()) + } + if n > q.readBuf.Len() { + n = q.readBuf.Len() + } + n, err := dst.Writer(ctx).Write(q.readBuf.Bytes()[:n]) + if err != nil { + return 0, err + } + // Discard bytes read out. + q.readBuf.Next(n) + + // If we read everything, this queue is no longer readable. + if q.readBuf.Len() == 0 { + q.readable = false + } + + // Move data from the queue's wait buffer to its read buffer. + q.pushWaitBufLocked(l) + + // If state changed, notify any waiters. If nothing was available to + // read, let the caller know we could block. + if n > 0 { + q.Notify(waiter.EventOut) + } else { + return 0, syserror.ErrWouldBlock + } + return int64(n), nil +} + +// write writes to q from userspace. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { + // TODO: Use CopyInTo/safemem to avoid extra copying. + // Copy in the bytes to write from user-space. 
+ b := make([]byte, src.NumBytes()) + n, err := src.CopyIn(ctx, b) + if err != nil { + return 0, err + } + b = b[:n] + return q.writeBytes(b, l) +} + +// writeBytes writes to q from b. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) writeBytes(b []byte, l *lineDiscipline) (int64, error) { + q.mu.Lock() + defer q.mu.Unlock() + // Write as much as possible to the read buffer. + n := q.transform(l, q, b) + + // Write remaining data to the wait buffer. + nWaiting, _ := q.waitBuf.Write(b[n:]) + + // If state changed, notify any waiters. If we were unable to write + // anything, let the caller know we could block. + if n > 0 { + q.Notify(waiter.EventIn) + } else if nWaiting == 0 { + return 0, syserror.ErrWouldBlock + } + return int64(n + nWaiting), nil +} + +// pushWaitBuf fills the queue's read buffer with data from the wait buffer. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) pushWaitBuf(l *lineDiscipline) { + q.mu.Lock() + defer q.mu.Unlock() + q.pushWaitBufLocked(l) +} + +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be locked. +func (q *queue) pushWaitBufLocked(l *lineDiscipline) { + // Remove bytes from the wait buffer and move them to the read buffer. + n := q.transform(l, q, q.waitBuf.Bytes()) + q.waitBuf.Next(n) + + // If state changed, notify any waiters. 
+ if n > 0 { + q.Notify(waiter.EventIn) + } +} -- cgit v1.2.3 From 7260363751915d21538c13b08b5bb6a48d0f4f8e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 11 Jun 2018 13:34:27 -0700 Subject: Add O_TRUNC handling in openat PiperOrigin-RevId: 200103677 Change-Id: I3efb565c30c64d35f8fd7b5c05ed78dcc2990c51 --- pkg/sentry/syscalls/linux/sys_file.go | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index a2db9d4c9..e2980842f 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -147,21 +147,25 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u } fileFlags := linuxToFlags(flags) - isDir := fs.IsDir(d.Inode.StableAttr) - - // If O_DIRECTORY is set, but the file is not a directory, then fail. - if fileFlags.Directory && !isDir { - return syserror.ENOTDIR - } - - // If it's a directory, then make sure. - if dirPath && !isDir { - return syserror.ENOTDIR - } - - // Don't allow directories to be opened writable. - if isDir && fileFlags.Write { - return syserror.EISDIR + if fs.IsDir(d.Inode.StableAttr) { + // Don't allow directories to be opened writable. + if fileFlags.Write { + return syserror.EISDIR + } + } else { + // If O_DIRECTORY is set, but the file is not a directory, then fail. + if fileFlags.Directory { + return syserror.ENOTDIR + } + // If it's a directory, then make sure. + if dirPath { + return syserror.ENOTDIR + } + if fileFlags.Write && flags&syscall.O_TRUNC != 0 { + if err := d.Inode.Truncate(t, d, 0); err != nil { + return err + } + } } file, err := d.Inode.GetFile(t, d, fileFlags) -- cgit v1.2.3 From 0412f17e06670fb1f1d1d85ddd73bbadde40c087 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Mon, 11 Jun 2018 15:33:07 -0700 Subject: rpcinet is treating EAGAIN and EWOULDBLOCK as different errnos. 
PiperOrigin-RevId: 200124614 Change-Id: I38a7b083f1464a2a586fe24db648e624c455fec5 --- pkg/sentry/socket/rpcinet/socket.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index a9dd1780a..ffe947500 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -228,7 +228,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, payload, se := rpcAccept(t, s.fd, peerRequested) // Check if we need to block. - if blocking && se == syserr.ErrWouldBlock { + if blocking && se == syserr.ErrTryAgain { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -237,7 +237,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, // Try to accept the connection again; if it fails, then wait until we // get a notification. for { - if payload, se = rpcAccept(t, s.fd, peerRequested); se != syserr.ErrWouldBlock { + if payload, se = rpcAccept(t, s.fd, peerRequested); se != syserr.ErrTryAgain { break } @@ -471,7 +471,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } - if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 { return 0, nil, 0, socket.ControlMessages{}, err } @@ -490,7 +490,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } - if err != syserr.ErrWouldBlock { + if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain { return 0, nil, 0, socket.ControlMessages{}, err } @@ -546,7 +546,7 
@@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] }} n, err := rpcSendMsg(t, req) - if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 { return int(n), err } @@ -558,7 +558,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] for { n, err := rpcSendMsg(t, req) - if err != syserr.ErrWouldBlock { + if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain { return int(n), err } -- cgit v1.2.3 From ab2c2575d61266725ce13dff570663464a171342 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Mon, 11 Jun 2018 16:39:39 -0700 Subject: Rpcinet is incorrectly handling MSG_TRUNC with SOCK_STREAM SOCK_STREAM has special behavior with respect to MSG_TRUNC. Specifically, the data isn't actually copied back out to userspace when MSG_TRUNC is provided on a SOCK_STREAM. According to tcp(7): "Since version 2.4, Linux supports the use of MSG_TRUNC in the flags argument of recv(2) (and recvmsg(2)). This flag causes the received bytes of data to be discarded, rather than passed back in a caller-supplied buffer." 
PiperOrigin-RevId: 200134860 Change-Id: I70f17a5f60ffe7794c3f0cfafd131c069202e90d --- pkg/sentry/socket/rpcinet/socket.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index ffe947500..6f1a4fe01 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -465,9 +465,13 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags res, err := rpcRecvMsg(t, req) if err == nil { - n, e := dst.CopyOut(t, res.Data) - if e == nil && n != len(res.Data) { - panic("CopyOut failed to copy full buffer") + var e error + var n int + if len(res.Data) > 0 { + n, e = dst.CopyOut(t, res.Data) + if e == nil && n != len(res.Data) { + panic("CopyOut failed to copy full buffer") + } } return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } @@ -484,9 +488,13 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags for { res, err := rpcRecvMsg(t, req) if err == nil { - n, e := dst.CopyOut(t, res.Data) - if e == nil && n != len(res.Data) { - panic("CopyOut failed to copy full buffer") + var e error + var n int + if len(res.Data) > 0 { + n, e = dst.CopyOut(t, res.Data) + if e == nil && n != len(res.Data) { + panic("CopyOut failed to copy full buffer") + } } return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) } -- cgit v1.2.3 From ea4a468fbaacd55597ce89e3eabd2bb42746427b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 11 Jun 2018 16:44:56 -0700 Subject: Set CLOEXEC option to sockets hostinet/socket.go: the Sentry doesn't spawn new processes, but it doesn't hurt to protect the socket from leaking. unet/unet.go: should be setting closing on exec. The FD is explicitly donated to children when needed. 
PiperOrigin-RevId: 200135682 Change-Id: Ia8a45ced1e00a19420c8611b12e7a8ee770f89cb --- pkg/sentry/socket/hostinet/socket.go | 6 +++--- pkg/unet/unet.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 8f901df6c..d0f3054dc 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -193,7 +193,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, // Conservatively ignore all flags specified by the application and add // SOCK_NONBLOCK since socketOperations requires it. - fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK) + fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC) if blocking { var ch chan struct{} for syscallErr == syserror.ErrWouldBlock { @@ -207,7 +207,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, s.EventRegister(&e, waiter.EventIn) defer s.EventUnregister(&e) } - fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK) + fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC) } } @@ -545,7 +545,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protoc // Conservatively ignore all flags specified by the application and add // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. 
- fd, err := syscall.Socket(p.family, stype|syscall.SOCK_NONBLOCK, 0) + fd, err := syscall.Socket(p.family, stype|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { return nil, syserr.FromError(err) } diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index 59b6c5568..f4800e0d9 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -201,7 +201,7 @@ func (s *Socket) enterFD() (int, bool) { // SocketPair creates a pair of connected sockets. func SocketPair(packet bool) (*Socket, *Socket, error) { // Make a new pair. - fds, err := syscall.Socketpair(syscall.AF_UNIX, socketType(packet), 0) + fds, err := syscall.Socketpair(syscall.AF_UNIX, socketType(packet)|syscall.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err } -- cgit v1.2.3 From 09b0a9c320bd777bc52384bd0ec91ecfc61e481d Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 11 Jun 2018 17:56:18 -0700 Subject: Handle all exception vectors. PiperOrigin-RevId: 200144655 Change-Id: I5a753c74b75007b7714d6fe34aa0d2e845dc5c41 --- pkg/sentry/platform/kvm/machine_amd64.go | 48 +++++++++++++++++++++++++++++-- pkg/sentry/platform/ring0/kernel_amd64.go | 9 +++--- 2 files changed, 49 insertions(+), 8 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 7ac289756..52896eefe 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -137,6 +137,18 @@ func (c *vCPU) initArchState() error { return c.setSystemTime() } +// nonCanonical generates a canonical address return. +// +//go:nosplit +func nonCanonical(addr uint64, signal int32) (*arch.SignalInfo, usermem.AccessType, error) { + info := &arch.SignalInfo{ + Signo: signal, + Code: arch.SignalInfoKernel, + } + info.SetAddr(addr) // Include address. + return info, usermem.NoAccess, platform.ErrContextSignal +} + // fault generates an appropriate fault return. 
// //go:nosplit @@ -169,6 +181,17 @@ func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, usermem.AccessType, error) { + // Check for canonical addresses. + if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) { + return nonCanonical(regs.Rip, int32(syscall.SIGSEGV)) + } else if !ring0.IsCanonical(regs.Rsp) { + return nonCanonical(regs.Rsp, int32(syscall.SIGBUS)) + } else if !ring0.IsCanonical(regs.Fs_base) { + return nonCanonical(regs.Fs_base, int32(syscall.SIGBUS)) + } else if !ring0.IsCanonical(regs.Gs_base) { + return nonCanonical(regs.Gs_base, int32(syscall.SIGBUS)) + } + // Assign PCIDs. if c.PCIDs != nil { var requireFlushPCID bool // Force a flush? @@ -205,7 +228,11 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user info.SetAddr(switchOpts.Registers.Rip) // Include address. return info, usermem.AccessType{}, platform.ErrContextSignal - case ring0.GeneralProtectionFault: + case ring0.GeneralProtectionFault, + ring0.SegmentNotPresent, + ring0.BoundRangeExceeded, + ring0.InvalidTSS, + ring0.StackSegmentFault: info := &arch.SignalInfo{ Signo: int32(syscall.SIGSEGV), Code: arch.SignalInfoKernel, @@ -229,7 +256,16 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user info.SetAddr(switchOpts.Registers.Rip) // Include address. return info, usermem.AccessType{}, platform.ErrContextSignal - case ring0.X87FloatingPointException: + case ring0.Overflow: + info := &arch.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 1, // FPE_INTOVF (integer overflow). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. 
+ return info, usermem.AccessType{}, platform.ErrContextSignal + + case ring0.X87FloatingPointException, + ring0.SIMDFloatingPointException: info := &arch.SignalInfo{ Signo: int32(syscall.SIGFPE), Code: 7, // FPE_FLTINV (invalid operation). @@ -237,7 +273,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user info.SetAddr(switchOpts.Registers.Rip) // Include address. return info, usermem.AccessType{}, platform.ErrContextSignal - case ring0.Vector(bounce): + case ring0.Vector(bounce): // ring0.VirtualizationException return nil, usermem.NoAccess, platform.ErrContextInterrupt case ring0.AlignmentCheck: @@ -255,6 +291,12 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user // directly into the instance. return c.fault(int32(syscall.SIGBUS)) + case ring0.DeviceNotAvailable, + ring0.DoubleFault, + ring0.CoprocessorSegmentOverrun, + ring0.MachineCheck, + ring0.SecurityException: + fallthrough default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index c828af654..117e86104 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -161,18 +161,17 @@ func IsCanonical(addr uint64) bool { // Also note that this function transitively depends on the compiler generating // code that uses IP-relative addressing inside of absolute addresses. That's // the case for amd64, but may not be the case for other architectures. +// +// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical. + // //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { - // Check for canonical addresses. 
- regs := switchOpts.Registers - if !IsCanonical(regs.Rip) || !IsCanonical(regs.Rsp) || !IsCanonical(regs.Fs_base) || !IsCanonical(regs.Gs_base) { - return GeneralProtectionFault - } userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID) // Sanitize registers. + regs := switchOpts.Registers regs.Eflags &= ^uint64(UserFlagsClear) regs.Eflags |= UserFlagsSet regs.Cs = uint64(Ucode64) // Required for iret. -- cgit v1.2.3 From 1397a413b49d6036f2586e85c8074aa3d4d6c6fa Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 11 Jun 2018 18:14:22 -0700 Subject: Make page tables split-safe. In order to minimize the likelihood of exit during page table modifications, make the full set of page table functions split-safe. This is not strictly necessary (and you may still incur splits due to allocations from the allocator pool) but should make retries a very rare occurance. PiperOrigin-RevId: 200146688 Change-Id: I8fa36aa16b807beda2f0b057be60038258e8d597 --- pkg/sentry/platform/ring0/pagetables/BUILD | 72 +++++ pkg/sentry/platform/ring0/pagetables/pagetables.go | 192 ++++++++++--- .../platform/ring0/pagetables/pagetables_amd64.go | 276 ------------------ .../platform/ring0/pagetables/pagetables_test.go | 83 +++--- .../platform/ring0/pagetables/walker_amd64.go | 307 +++++++++++++++++++++ 5 files changed, 583 insertions(+), 347 deletions(-) create mode 100644 pkg/sentry/platform/ring0/pagetables/walker_amd64.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 08b73e87d..023e298a0 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,6 +1,73 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") + +go_template( + name = 
"generic_walker", + srcs = [ + "walker_amd64.go", + ], + opt_types = [ + "Visitor", + ], + visibility = [":__pkg__"], +) + +go_template_instance( + name = "walker_map", + out = "walker_map.go", + package = "pagetables", + prefix = "map", + template = ":generic_walker", + types = { + "Visitor": "mapVisitor", + }, +) + +go_template_instance( + name = "walker_unmap", + out = "walker_unmap.go", + package = "pagetables", + prefix = "unmap", + template = ":generic_walker", + types = { + "Visitor": "unmapVisitor", + }, +) + +go_template_instance( + name = "walker_lookup", + out = "walker_lookup.go", + package = "pagetables", + prefix = "lookup", + template = ":generic_walker", + types = { + "Visitor": "lookupVisitor", + }, +) + +go_template_instance( + name = "walker_empty", + out = "walker_empty.go", + package = "pagetables", + prefix = "empty", + template = ":generic_walker", + types = { + "Visitor": "emptyVisitor", + }, +) + +go_template_instance( + name = "walker_check", + out = "walker_check.go", + package = "pagetables", + prefix = "check", + template = ":generic_walker", + types = { + "Visitor": "checkVisitor", + }, +) go_library( name = "pagetables", @@ -11,6 +78,10 @@ go_library( "pagetables_amd64.go", "pagetables_x86.go", "pcids_x86.go", + "walker_empty.go", + "walker_lookup.go", + "walker_map.go", + "walker_unmap.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables", visibility = [ @@ -26,6 +97,7 @@ go_test( srcs = [ "pagetables_amd64_test.go", "pagetables_test.go", + "walker_check.go", ], embed = [":pagetables"], deps = ["//pkg/sentry/usermem"], diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 6963ba62d..ff5787f89 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -13,6 +13,11 @@ // limitations under the License. 
// Package pagetables provides a generic implementation of pagetables. +// +// The core functions must be safe to call from a nosplit context. Furthermore, +// this pagetables implementation goes to lengths to ensure that all functions +// are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made +// during walks, but these can be cached elsewhere if required. package pagetables import ( @@ -38,64 +43,179 @@ type PageTables struct { // New returns new PageTables. func New(a Allocator) *PageTables { - p := &PageTables{Allocator: a} + p := new(PageTables) + p.Init(a) + return p +} + +// Init initializes a set of PageTables. +// +//go:nosplit +func (p *PageTables) Init(allocator Allocator) { + p.Allocator = allocator p.root = p.Allocator.NewPTEs() p.rootPhysical = p.Allocator.PhysicalFor(p.root) - return p } +// mapVisitor is used for map. +type mapVisitor struct { + target uintptr // Input. + physical uintptr // Input. + opts MapOpts // Input. + prev bool // Output. +} + +// visit is used for map. +// +//go:nosplit +func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) { + p := v.physical + (start - uintptr(v.target)) + if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) { + v.prev = true + } + if p&align != 0 { + // We will install entries at a smaller granulaity if we don't + // install a valid entry here, however we must zap any existing + // entry to ensure this happens. + pte.Clear() + return + } + pte.Set(p, v.opts) +} + +//go:nosplit +func (*mapVisitor) requiresAlloc() bool { return true } + +//go:nosplit +func (*mapVisitor) requiresSplit() bool { return true } + // Map installs a mapping with the given physical address. // // True is returned iff there was a previous mapping in the range. // -// Precondition: addr & length must be aligned, their sum must not overflow. +// Precondition: addr & length must be page-aligned, their sum must not overflow. 
+// +//go:nosplit func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool { if !opts.AccessType.Any() { return p.Unmap(addr, length) } - prev := false - end, ok := addr.AddLength(uint64(length)) - if !ok { - panic("pagetables.Map: overflow") + w := mapWalker{ + pageTables: p, + visitor: mapVisitor{ + target: uintptr(addr), + physical: physical, + opts: opts, + }, } - p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) { - p := physical + (s - uintptr(addr)) - prev = prev || (pte.Valid() && (p != pte.Address() || opts != pte.Opts())) - if p&align != 0 { - // We will install entries at a smaller granulaity if - // we don't install a valid entry here, however we must - // zap any existing entry to ensure this happens. - pte.Clear() - return - } - pte.Set(p, opts) - }) - return prev + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.prev +} + +// unmapVisitor is used for unmap. +type unmapVisitor struct { + count int +} + +//go:nosplit +func (*unmapVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*unmapVisitor) requiresSplit() bool { return true } + +// visit unmaps the given entry. +// +//go:nosplit +func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) { + pte.Clear() + v.count++ } // Unmap unmaps the given range. // // True is returned iff there was a previous mapping in the range. +// +// Precondition: addr & length must be page-aligned. +// +//go:nosplit func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { - count := 0 - p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) { - pte.Clear() - count++ - }) - return count > 0 + w := unmapWalker{ + pageTables: p, + visitor: unmapVisitor{ + count: 0, + }, + } + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.count > 0 } +// emptyVisitor is used for emptiness checks. 
+type emptyVisitor struct { + count int +} + +//go:nosplit +func (*emptyVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*emptyVisitor) requiresSplit() bool { return false } + +// visit unmaps the given entry. +// +//go:nosplit +func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) { + v.count++ +} + +// IsEmpty checks if the given range is empty. +// +// Precondition: addr & length must be page-aligned. +// +//go:nosplit +func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool { + w := emptyWalker{ + pageTables: p, + } + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.count == 0 +} + +// lookupVisitor is used for lookup. +type lookupVisitor struct { + target uintptr // Input. + physical uintptr // Output. + opts MapOpts // Output. +} + +// visit matches the given address. +// +//go:nosplit +func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) { + if !pte.Valid() { + return + } + v.physical = pte.Address() + (start - uintptr(v.target)) + v.opts = pte.Opts() +} + +//go:nosplit +func (*lookupVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*lookupVisitor) requiresSplit() bool { return false } + // Lookup returns the physical address for the given virtual address. 
+// +//go:nosplit func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) { mask := uintptr(usermem.PageSize - 1) - off := uintptr(addr) & mask - addr = addr &^ usermem.Addr(mask) - p.iterateRange(uintptr(addr), uintptr(addr+usermem.PageSize), false, func(s, e uintptr, pte *PTE, align uintptr) { - if !pte.Valid() { - return - } - physical = pte.Address() + (s - uintptr(addr)) + off - opts = pte.Opts() - }) - return + offset := uintptr(addr) & mask + w := lookupWalker{ + pageTables: p, + visitor: lookupVisitor{ + target: uintptr(addr &^ usermem.Addr(mask)), + }, + } + w.iterateRange(uintptr(addr), uintptr(addr)+1) + return w.visitor.physical + offset, w.visitor.opts } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index 6a724e4fd..878463018 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build amd64 - package pagetables -import ( - "fmt" -) - // Address constraints. // // The lowerTop and upperBottom currently apply to four-level pagetables; @@ -49,273 +43,3 @@ const ( // PTEs is a collection of entries. type PTEs [entriesPerPage]PTE - -// next returns the next address quantized by the given size. -func next(start uint64, size uint64) uint64 { - start &= ^(size - 1) - start += size - return start -} - -// iterateRange iterates over all appropriate levels of page tables for the given range. -// -// If alloc is set, then Set _must_ be called on all given PTEs. The exception -// is super pages. If a valid super page cannot be installed, then the walk -// will continue to individual entries. -// -// This algorithm will attempt to maximize the use of super pages whenever -// possible. 
Whether a super page is provided will be clear through the range -// provided in the callback. -// -// Note that if alloc set, then no gaps will be present. However, if alloc is -// not set, then the iteration will likely be full of gaps. -// -// Note that this function should generally be avoided in favor of Map, Unmap, -// etc. when not necessary. -// -// Precondition: startAddr and endAddr must be page-aligned. -// -// Precondition: startStart must be less than endAddr. -// -// Precondition: If alloc is set, then startAddr and endAddr should not span -// non-canonical ranges. If they do, a panic will result. -func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) { - start := uint64(startAddr) - end := uint64(endAddr) - if start%pteSize != 0 { - panic(fmt.Sprintf("unaligned start: %v", start)) - } - if start > end { - panic(fmt.Sprintf("start > end (%v > %v))", start, end)) - } - - // Deal with cases where we traverse the "gap". - // - // These are all explicitly disallowed if alloc is set, and we must - // traverse an entry for each address explicitly. 
- switch { - case start < lowerTop && end > lowerTop && end < upperBottom: - if alloc { - panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) - } - p.iterateRange(startAddr, lowerTop, false, fn) - return - case start < lowerTop && end > lowerTop: - if alloc { - panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) - } - p.iterateRange(startAddr, lowerTop, false, fn) - p.iterateRange(upperBottom, endAddr, false, fn) - return - case start > lowerTop && end < upperBottom: - if alloc { - panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) - } - return - case start > lowerTop && start < upperBottom && end > upperBottom: - if alloc { - panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) - } - p.iterateRange(upperBottom, endAddr, false, fn) - return - } - - for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { - var ( - pgdEntry = &p.root[pgdIndex] - pudEntries *PTEs - ) - if !pgdEntry.Valid() { - if !alloc { - // Skip over this entry. - start = next(start, pgdSize) - continue - } - - // Allocate a new pgd. - pudEntries = p.Allocator.NewPTEs() - pgdEntry.setPageTable(p, pudEntries) - } else { - pudEntries = p.Allocator.LookupPTEs(pgdEntry.Address()) - } - - // Map the next level. - clearPUDEntries := 0 - - for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { - var ( - pudEntry = &pudEntries[pudIndex] - pmdEntries *PTEs - ) - if !pudEntry.Valid() { - if !alloc { - // Skip over this entry. - clearPUDEntries++ - start = next(start, pudSize) - continue - } - - // This level has 1-GB super pages. Is this - // entire region contained in a single PUD - // entry? If so, we can skip allocating a new - // page for the pmd. 
- if start&(pudSize-1) == 0 && end-start >= pudSize { - pudEntry.SetSuper() - fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) - if pudEntry.Valid() { - start = next(start, pudSize) - continue - } - } - - // Allocate a new pud. - pmdEntries = p.Allocator.NewPTEs() - pudEntry.setPageTable(p, pmdEntries) - - } else if pudEntry.IsSuper() { - // Does this page need to be split? - if start&(pudSize-1) != 0 || end < next(start, pudSize) { - currentAddr := uint64(pudEntry.Address()) - - // Install the relevant entries. - pmdEntries = p.Allocator.NewPTEs() - for index := 0; index < entriesPerPage; index++ { - pmdEntry := &pmdEntries[index] - pmdEntry.SetSuper() - pmdEntry.Set(uintptr(currentAddr), pudEntry.Opts()) - currentAddr += pmdSize - } - - // Reset to point to the new page. - pudEntry.setPageTable(p, pmdEntries) - } else { - // A super page to be checked directly. - fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) - - // Might have been cleared. - if !pudEntry.Valid() { - clearPUDEntries++ - } - - // Note that the super page was changed. - start = next(start, pudSize) - continue - } - } else { - pmdEntries = p.Allocator.LookupPTEs(pudEntry.Address()) - } - - // Map the next level, since this is valid. - clearPMDEntries := 0 - - for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { - var ( - pmdEntry = &pmdEntries[pmdIndex] - pteEntries *PTEs - ) - if !pmdEntry.Valid() { - if !alloc { - // Skip over this entry. - clearPMDEntries++ - start = next(start, pmdSize) - continue - } - - // This level has 2-MB huge pages. If this - // region is contained in a single PMD entry? - // As above, we can skip allocating a new page. - if start&(pmdSize-1) == 0 && end-start >= pmdSize { - pmdEntry.SetSuper() - fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) - if pmdEntry.Valid() { - start = next(start, pmdSize) - continue - } - } - - // Allocate a new pmd. 
- pteEntries = p.Allocator.NewPTEs() - pmdEntry.setPageTable(p, pteEntries) - - } else if pmdEntry.IsSuper() { - // Does this page need to be split? - if start&(pmdSize-1) != 0 || end < next(start, pmdSize) { - currentAddr := uint64(pmdEntry.Address()) - - // Install the relevant entries. - pteEntries = p.Allocator.NewPTEs() - for index := 0; index < entriesPerPage; index++ { - pteEntry := &pteEntries[index] - pteEntry.Set(uintptr(currentAddr), pmdEntry.Opts()) - currentAddr += pteSize - } - - // Reset to point to the new page. - pmdEntry.setPageTable(p, pteEntries) - } else { - // A huge page to be checked directly. - fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) - - // Might have been cleared. - if !pmdEntry.Valid() { - clearPMDEntries++ - } - - // Note that the huge page was changed. - start = next(start, pmdSize) - continue - } - } else { - pteEntries = p.Allocator.LookupPTEs(pmdEntry.Address()) - } - - // Map the next level, since this is valid. - clearPTEEntries := 0 - - for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { - var ( - pteEntry = &pteEntries[pteIndex] - ) - if !pteEntry.Valid() && !alloc { - clearPTEEntries++ - start += pteSize - continue - } - - // At this point, we are guaranteed that start%pteSize == 0. - fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1) - if !pteEntry.Valid() { - if alloc { - panic("PTE not set after iteration with alloc=true!") - } - clearPTEEntries++ - } - - // Note that the pte was changed. - start += pteSize - continue - } - - // Check if we no longer need this page. - if clearPTEEntries == entriesPerPage { - pmdEntry.Clear() - p.Allocator.FreePTEs(pteEntries) - clearPMDEntries++ - } - } - - // Check if we no longer need this page. - if clearPMDEntries == entriesPerPage { - pudEntry.Clear() - p.Allocator.FreePTEs(pmdEntries) - clearPUDEntries++ - } - } - - // Check if we no longer need this page. 
- if clearPUDEntries == entriesPerPage { - pgdEntry.Clear() - p.Allocator.FreePTEs(pudEntries) - } - } -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index 28178f656..dca3f69ef 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -27,48 +27,61 @@ type mapping struct { opts MapOpts } -func checkMappings(t *testing.T, pt *PageTables, m []mapping) { - var ( - current int - found []mapping - failed string - ) +type checkVisitor struct { + expected []mapping // Input. + current int // Temporary. + found []mapping // Output. + failed string // Output. +} - // Iterate over all the mappings. - pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) { - found = append(found, mapping{ - start: s, - length: e - s, - addr: pte.Address(), - opts: pte.Opts(), - }) - if failed != "" { - // Don't keep looking for errors. - return - } - - if current >= len(m) { - failed = "more mappings than expected" - } else if m[current].start != s { - failed = "start didn't match expected" - } else if m[current].length != (e - s) { - failed = "end didn't match expected" - } else if m[current].addr != pte.Address() { - failed = "address didn't match expected" - } else if m[current].opts != pte.Opts() { - failed = "opts didn't match" - } - current++ +func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) { + v.found = append(v.found, mapping{ + start: start, + length: align + 1, + addr: pte.Address(), + opts: pte.Opts(), }) + if v.failed != "" { + // Don't keep looking for errors. 
+ return + } + + if v.current >= len(v.expected) { + v.failed = "more mappings than expected" + } else if v.expected[v.current].start != start { + v.failed = "start didn't match expected" + } else if v.expected[v.current].length != (align + 1) { + v.failed = "end didn't match expected" + } else if v.expected[v.current].addr != pte.Address() { + v.failed = "address didn't match expected" + } else if v.expected[v.current].opts != pte.Opts() { + v.failed = "opts didn't match" + } + v.current++ +} + +func (*checkVisitor) requiresAlloc() bool { return false } + +func (*checkVisitor) requiresSplit() bool { return false } + +func checkMappings(t *testing.T, pt *PageTables, m []mapping) { + // Iterate over all the mappings. + w := checkWalker{ + pageTables: pt, + visitor: checkVisitor{ + expected: m, + }, + } + w.iterateRange(0, ^uintptr(0)) // Were we expected additional mappings? - if failed == "" && current != len(m) { - failed = "insufficient mappings found" + if w.visitor.failed == "" && w.visitor.current != len(w.visitor.expected) { + w.visitor.failed = "insufficient mappings found" } // Emit a meaningful error message on failure. - if failed != "" { - t.Errorf("%s; got %#v, wanted %#v", failed, found, m) + if w.visitor.failed != "" { + t.Errorf("%s; got %#v, wanted %#v", w.visitor.failed, w.visitor.found, w.visitor.expected) } } diff --git a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go new file mode 100644 index 000000000..afa4d473a --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go @@ -0,0 +1,307 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package pagetables + +// Visitor is a generic type. +type Visitor interface { + // visit is called on each PTE. + visit(start uintptr, pte *PTE, align uintptr) + + // requiresAlloc indicates that new entries should be allocated within + // the walked range. + requiresAlloc() bool + + // requiresSplit indicates that entries in the given range should be + // split if they are huge or jumbo pages. + requiresSplit() bool +} + +// Walker walks page tables. +type Walker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor Visitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. 
+// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *Walker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func next(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *Walker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + start = next(start, pgdSize) + continue + } + + // Allocate a new pgd. + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + // Map the next level. 
+ clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + clearPUDEntries++ + start = next(start, pudSize) + continue + } + + // This level has 1-GB super pages. Is this + // entire region at least as large as a single + // PUD entry? If so, we can skip allocating a + // new page for the pmd. + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = next(start, pudSize) + continue + } + } + + // Allocate a new pud. + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + // Does this page need to be split? + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) { + // Install the relevant entries. + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + // A super page to be checked directly. + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + // Might have been cleared. + if !pudEntry.Valid() { + clearPUDEntries++ + } + + // Note that the super page was changed. + start = next(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + // Map the next level, since this is valid. 
+ clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + clearPMDEntries++ + start = next(start, pmdSize) + continue + } + + // This level has 2-MB huge pages. If this + // region is contined in a single PMD entry? + // As above, we can skip allocating a new page. + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = next(start, pmdSize) + continue + } + } + + // Allocate a new pmd. + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + // Does this page need to be split? + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) { + // Install the relevant entries. + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + // A huge page to be checked directly. + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + // Might have been cleared. + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + // Note that the huge page was changed. + start = next(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + // Map the next level, since this is valid. 
+ clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + // At this point, we are guaranteed that start%pteSize == 0. + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + // Note that the pte was changed. + start += pteSize + continue + } + + // Check if we no longer need this page. + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + // Check if we no longer need this page. + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + // Check if we no longer need this page. + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} -- cgit v1.2.3 From 41f766893ab804cd2d3ccfd782d97c022e987f79 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 11 Jun 2018 18:16:13 -0700 Subject: Minor ring0 interface cleanup. - Remove unused methods. - Provide declaration for asm function. 
PiperOrigin-RevId: 200146850 Change-Id: Ic455c96ffe0d2e78ef15f824eb65d7de705b054a --- pkg/sentry/platform/kvm/machine.go | 6 +++--- pkg/sentry/platform/ring0/kernel.go | 23 ++++------------------- pkg/sentry/platform/ring0/lib_amd64.go | 3 +++ 3 files changed, 10 insertions(+), 22 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 3c1e01241..ab2ccc695 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -40,7 +40,7 @@ type machine struct { nextSlot uint32 // kernel is the set of global structures. - kernel *ring0.Kernel + kernel ring0.Kernel // mappingCache is used for mapPhysical. mappingCache sync.Map @@ -135,7 +135,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) { // issues when you've got > n active threads.) vCPUs = n } - m.kernel = ring0.New(ring0.KernelOpts{ + m.kernel.Init(ring0.KernelOpts{ PageTables: pagetables.New(newAllocator()), }) @@ -158,7 +158,7 @@ func newMachine(vm int, vCPUs int) (*machine, error) { fd: int(fd), machine: m, } - c.CPU.Init(m.kernel) + c.CPU.Init(&m.kernel) c.CPU.KernelSyscall = bluepillSyscall c.CPU.KernelException = bluepillException m.vCPUs[uint64(-id)] = c // See above. diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index b0471ab9a..62e67005e 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -14,27 +14,13 @@ package ring0 -// New creates a new kernel. +// Init initializes a new kernel. // // N.B. that constraints on KernelOpts must be satisfied. // -// Init must have been called. -func New(opts KernelOpts) *Kernel { - k := new(Kernel) +//go:nosplit +func (k *Kernel) Init(opts KernelOpts) { k.init(opts) - return k -} - -// NewCPU creates a new CPU associated with this Kernel. -// -// Note that execution of the new CPU must begin at Start, with constraints as -// documented. 
Initialization is not completed by this method alone. -// -// See also Init. -func (k *Kernel) NewCPU() *CPU { - c := new(CPU) - c.Init(k) - return c } // Halt halts execution. @@ -56,8 +42,7 @@ func defaultSyscall() { Halt() } //go:nosplit func defaultException(Vector) { Halt() } -// Init allows the initialization of a CPU from a kernel without allocation. -// The same constraints as NewCPU apply. +// Init initializes a new CPU. // // Init allows embedding in other objects. func (c *CPU) Init(k *Kernel) { diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index de2842b5a..989e3e383 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -64,6 +64,9 @@ func wrgsmsr(addr uintptr) // writeCR3 writes the CR3 value. func writeCR3(phys uintptr) +// readCR3 reads the current CR3 value. +func readCR3() uintptr + // readCR2 reads the current CR2 value. func readCR2() uintptr -- cgit v1.2.3 From 7a10df454b1c12b207f479cdda7338fff2875d5f Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 12 Jun 2018 12:37:06 -0700 Subject: Drop MMapOpts.MappingIdentity reference in loader.mapSegment. PiperOrigin-RevId: 200261995 Change-Id: I7e460b18ceab2c23096bdeb7416159d6e774aaf7 --- pkg/sentry/loader/elf.go | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 8579eeee4..d2f18cd4f 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -271,6 +271,11 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf. 
Perms: prot, MaxPerms: usermem.AnyAccess, } + defer func() { + if mopts.MappingIdentity != nil { + mopts.MappingIdentity.DecRef() + } + }() if err := f.ConfigureMMap(ctx, &mopts); err != nil { ctx.Infof("File is not memory-mappable: %v", err) return err -- cgit v1.2.3 From 711a9869e54743b05fc3478be5adce31d45cefe5 Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Tue, 12 Jun 2018 13:24:22 -0700 Subject: Runsc checkpoint works. This is the first iteration of checkpoint that actually saves to a file. Tests for checkpoint are included. Ran into an issue when private unix sockets are enabled. An error message was added for this case and the mutex state was set. PiperOrigin-RevId: 200269470 Change-Id: I28d29a9f92c44bf73dc4a4b12ae0509ee4070e93 --- pkg/sentry/fs/gofer/session.go | 5 ++-- pkg/sentry/fs/gofer/session_state.go | 9 ++++++ runsc/boot/loader.go | 4 ++- runsc/cmd/checkpoint.go | 17 +++++++++++- runsc/container/container.go | 5 ++-- runsc/container/container_test.go | 53 +++++++++++++++++++++++++++++++++++- runsc/sandbox/sandbox.go | 11 ++++++-- 7 files changed, 95 insertions(+), 9 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 1076e3e55..baf00d8e7 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -28,8 +28,9 @@ import ( ) type endpointMap struct { - mu sync.RWMutex - m map[device.MultiDeviceKey]unix.BoundEndpoint + mu sync.RWMutex `state:"nosave"` + // TODO: Make map with private unix sockets savable. + m map[device.MultiDeviceKey]unix.BoundEndpoint } // add adds the endpoint to the map. diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 4d993a219..0154810c8 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -22,6 +22,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) +// beforeSave is invoked by stateify. 
+// +// TODO: Make map with private unix sockets savable. +func (e *endpointMap) beforeSave() { + if len(e.m) != 0 { + panic("EndpointMap with existing private unix sockets cannot be saved") + } +} + // afterLoad is invoked by stateify. func (s *session) afterLoad() { // The restore environment contains the 9p connection of this mount. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 41d1ee50d..4a6528307 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -100,7 +100,9 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // Create VDSO. - vdso, err := loader.PrepareVDSO(p) + // + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("error creating vdso: %v", err) } diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 9b045da1c..927027c2b 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -15,6 +15,8 @@ package cmd import ( + "os" + "context" "flag" "github.com/google/subcommands" @@ -24,6 +26,7 @@ import ( // Checkpoint implements subcommands.Command for the "checkpoint" command. type Checkpoint struct { + imagePath string } // Name implements subcommands.Command.Name. @@ -44,6 +47,7 @@ func (*Checkpoint) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (c *Checkpoint) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.imagePath, "image-path", "", "path to saved container image") } // Execute implements subcommands.Command.Execute. @@ -62,7 +66,18 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("error loading container: %v", err) } - if err := cont.Checkpoint(); err != nil { + if c.imagePath == "" { + Fatalf("image-path flag must be provided") + } + + // Create the image file and open for writing. 
+ file, err := os.OpenFile(c.imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + Fatalf("os.OpenFile(%q) failed: %v", c.imagePath, err) + } + defer file.Close() + + if err := cont.Checkpoint(file); err != nil { Fatalf("checkpoint failed: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 66a2f27a1..d323388fb 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -339,13 +339,14 @@ func (c *Container) Signal(sig syscall.Signal) error { } // Checkpoint sends the checkpoint call to the container. -func (c *Container) Checkpoint() error { +// The statefile will be written to f, the file at the specified image-path. +func (c *Container) Checkpoint(f *os.File) error { log.Debugf("Checkpoint container %q", c.ID) if c.Status == Stopped { log.Warningf("container %q not running, not checkpointing", c.ID) return nil } - return c.Sandbox.Checkpoint(c.ID) + return c.Sandbox.Checkpoint(c.ID, f) } // State returns the metadata of the container. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 43cd177ce..b6d19bf33 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -408,6 +408,57 @@ func TestExec(t *testing.T) { } } +// TestCheckpoint verifies that calling checkpoint with an image-path flag succeeds. +// Since there is no current default image path, confirming that calling +// checkpoint without an image path fails. +// Checks that there is a file with the name and location given by image path. +func TestCheckpoint(t *testing.T) { + // Container will succeed. + spec := testutil.NewSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. 
+ cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Set the image path, which is where the checkpoint image will be saved. + imagePath := filepath.Join(os.TempDir(), "test-image-file") + + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() + + // Checkpoint running container; save state into new file. + if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + defer os.RemoveAll(imagePath) + + // Check to see if file exists and contains data. + fileInfo, err := os.Stat(imagePath) + if err != nil { + t.Fatalf("error checkpointing container: %v", err) + } + if size := fileInfo.Size(); size == 0 { + t.Fatalf("failed checkpoint, file still appears empty: %v", err) + } +} + // TestCapabilities verifies that: // - Running exec as non-root UID and GID will result in an error (because the // executable file can't be read). @@ -602,7 +653,7 @@ func TestSpecUnsupported(t *testing.T) { } // TestRunNonRoot checks that sandbox can be configured when running as -// non-priviledged user. +// non-privileged user. func TestRunNonRoot(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/true") spec.Process.User.UID = 343 diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 48388aa7f..c1efab7f5 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -441,7 +441,8 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { } // Checkpoint sends the checkpoint call for a container in the sandbox. 
-func (s *Sandbox) Checkpoint(cid string) error { +// The statefile will be written to f. +func (s *Sandbox) Checkpoint(cid string, f *os.File) error { log.Debugf("Checkpoint sandbox %q", s.ID) conn, err := s.connect() if err != nil { @@ -449,7 +450,13 @@ func (s *Sandbox) Checkpoint(cid string) error { } defer conn.Close() - if err := conn.Call(boot.ContainerCheckpoint, nil, nil); err != nil { + opt := control.SaveOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + + if err := conn.Call(boot.ContainerCheckpoint, &opt, nil); err != nil { return fmt.Errorf("err checkpointing container %q: %v", cid, err) } return nil -- cgit v1.2.3 From c2b3f04d1c7b5d376a3fa305fc5e309e9ec81d99 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 12 Jun 2018 16:15:21 -0700 Subject: Rpcinet doensn't handle SO_RCVTIMEO properly. Rpcinet already inherits socket.ReceiveTimeout; however, it's never set on setsockopt(2). The value is currently forwarded as an RPC and ignored as all sockets will be non-blocking on the RPC side. PiperOrigin-RevId: 200299260 Change-Id: I6c610ea22c808ff6420c63759dccfaeab17959dd --- pkg/sentry/socket/rpcinet/socket.go | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 6f1a4fe01..4ef8b91c3 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -18,6 +18,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -318,6 +319,15 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements socket.Socket.GetSockOpt. 
func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) { + // SO_RCVTIMEO is special because blocking is performed within the sentry. + if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO { + if outLen < linux.SizeOfTimeval { + return nil, syserr.ErrInvalidArgument + } + + return linux.NsecToTimeval(s.RecvTimeout()), nil + } + stack := t.NetworkContext().(*Stack) id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockOpt{&pb.GetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Length: uint32(outLen)}}}, false /* ignoreResult */) <-c @@ -332,6 +342,20 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLe // SetSockOpt implements socket.Socket.SetSockOpt. func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { + // Because blocking actually happens within the sentry we need to inspect + // this socket option to determine if it's a SO_RCVTIMEO, and if so, we will + // save it and use it as the deadline for recv(2) related syscalls. 
+ if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO { + if len(opt) < linux.SizeOfTimeval { + return syserr.ErrInvalidArgument + } + + var v linux.Timeval + binary.Unmarshal(opt[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + s.SetRecvTimeout(v.ToNsecCapped()) + return nil + } + stack := t.NetworkContext().(*Stack) id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_SetSockOpt{&pb.SetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Opt: opt}}}, false /* ignoreResult */) <-c -- cgit v1.2.3 From ba426f7782d35f971820a0193cfda58485b92cad Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 12 Jun 2018 17:03:31 -0700 Subject: Fix reference leak for negative dirents PiperOrigin-RevId: 200306715 Change-Id: I7c80059c77ebd3d9a5d7d48b05c8e7a597f10850 --- pkg/sentry/fs/dirent.go | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index a75c7ea7e..554aa30d8 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1257,6 +1257,15 @@ func (d *Dirent) destroy() { // Drop all weak references. for _, w := range d.children { + if c := w.Get(); c != nil { + if c.(*Dirent).IsNegative() { + // The parent holds both weak and strong refs in the case of + // negative dirents. + c.DecRef() + } + // Drop the reference we just acquired in WeakRef.Get. + c.DecRef() + } w.Drop() } d.children = nil -- cgit v1.2.3 From 55b905845650efc9a0a23066f8ffd25ce2565bbc Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 13 Jun 2018 10:03:06 -0700 Subject: Log filemem state when panicing due to invalid refcount. 
PiperOrigin-RevId: 200408305 Change-Id: I676ee49ec77697105723577928c7f82088cd378e --- pkg/sentry/platform/filemem/filemem.go | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index d79c3c7f1..45ef98eb0 100644 --- a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -156,17 +156,6 @@ type usageInfo struct { refs uint64 } -func (u *usageInfo) incRef() { - u.refs++ -} - -func (u *usageInfo) decRef() { - if u.refs == 0 { - panic("DecRef at 0 refs!") - } - u.refs-- -} - const ( chunkShift = 24 chunkSize = 1 << chunkShift // 16 MB @@ -506,7 +495,7 @@ func (f *FileMem) IncRef(fr platform.FileRange) { defer f.mu.Unlock() gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { - seg.ValuePtr().incRef() + seg.ValuePtr().refs++ }) if gap.Ok() { panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) @@ -527,7 +516,10 @@ func (f *FileMem) DecRef(fr platform.FileRange) { for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() { seg = f.usage.Isolate(seg, fr) val := seg.ValuePtr() - val.decRef() + if val.refs == 0 { + panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage)) + } + val.refs-- if val.refs == 0 { freed = true // Reclassify memory as System, until it's freed by the reclaim -- cgit v1.2.3 From 686093669eb094eb585009b08175a70928849134 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Wed, 13 Jun 2018 10:13:23 -0700 Subject: sentry: do not treat all save errors as state file errors. 
PiperOrigin-RevId: 200410220 Change-Id: I6a8745e33be949e335719083501f18b24f6ba471 --- pkg/sentry/state/state.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 393289926..43e88a713 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -78,10 +78,7 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { // Save the kernel. err = k.SaveTo(wc) if closeErr := wc.Close(); err == nil && closeErr != nil { - err = closeErr - } - if err != nil { - err = ErrStateFile{err} + err = ErrStateFile{closeErr} } } opts.Callback(err) -- cgit v1.2.3 From 717f2501c9c4cec4e4fb6c76d49779d899f024ae Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 13 Jun 2018 10:19:03 -0700 Subject: Fix failure to mount volume that sandbox process has no access Boot loader tries to stat mount to determine whether it's a file or not. This may file if the sandbox process doesn't have access to the file. Instead, add overlay on top of file, which is better anyway since we don't want to propagate changes to the host. PiperOrigin-RevId: 200411261 Change-Id: I14222410e8bc00ed037b779a1883d503843ffebb --- pkg/sentry/fs/overlay.go | 22 ++++++++++++++++++++++ runsc/boot/fs.go | 16 +++++++++------- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 40eed3feb..90d21642e 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -103,6 +103,28 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount return newOverlayInode(ctx, overlay, msrc), nil } +// NewOverlayRootFile produces the root of an overlay that points to a file. +// +// Preconditions: +// +// - lower must be non-nil. +// - lower should not expose character devices, pipes, or sockets, because +// copying up these types of files is not supported. 
Neither it can be a dir. +// - lower must not require that file objects be revalidated. +// - lower must not have dynamic file/directory content. +func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) { + if IsRegular(lower.StableAttr) { + return nil, fmt.Errorf("lower Inode is not a regular file") + } + msrc := newOverlayMountSource(upperMS, lower.MountSource, flags) + overlay, err := newOverlayEntry(ctx, nil, lower, true) + if err != nil { + msrc.DecRef() + return nil, err + } + return newOverlayInode(ctx, overlay, msrc), nil +} + // newOverlayInode creates a new Inode for an overlay. func newOverlayInode(ctx context.Context, o *overlayEntry, msrc *MountSource) *Inode { var inode *Inode diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 7243153f2..3113f1857 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "os" "path/filepath" "strings" @@ -209,6 +208,13 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags.ReadOnly = false tmpFS := mustFindFilesystem("tmpfs") + if !fs.IsDir(lower.StableAttr) { + // Create overlay on top of mount file, e.g. /etc/hostname. + msrc := fs.NewCachingMountSource(tmpFS, lowerFlags) + return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags) + } + + // Create overlay on top of mount dir. upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") if err != nil { return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err) @@ -248,13 +254,9 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. default: return fmt.Errorf("invalid file access type: %v", conf.FileAccess) } + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly - fi, err := os.Stat(m.Source) - if err != nil { - return err - } - // Add overlay to all writable mounts, except when mapping an individual file. 
- useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly && fi.Mode().IsDir() default: // TODO: Support all the mount types and make this a // fatal error. Most applications will "just work" without -- cgit v1.2.3 From 7b7b199ed0e282c42a753b1dc2ee16fe15aaa6d3 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 13 Jun 2018 13:04:36 -0700 Subject: Deflake kvm_test. PiperOrigin-RevId: 200439846 Change-Id: I9970fe0716cb02f0f41b754891d55db7e0729f56 --- pkg/sentry/platform/kvm/BUILD | 1 - pkg/sentry/platform/kvm/kvm_test.go | 153 +++++++++++++++++----------- pkg/sentry/platform/kvm/virtual_map_test.go | 37 +++++-- 3 files changed, 120 insertions(+), 71 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 89d98c5c7..135861368 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -71,7 +71,6 @@ go_library( go_test( name = "kvm_test", - size = "small", srcs = [ "kvm_test.go", "virtual_map_test.go", diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 71c5c856e..180bf7bb0 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -157,7 +157,9 @@ func TestApplicationSyscall(t *testing.T) { FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); err != nil { + }); err == platform.ErrContextInterrupt { + return true // Retry. + } else if err != nil { t.Errorf("application syscall with full restore failed: %v", err) } return false @@ -167,7 +169,9 @@ func TestApplicationSyscall(t *testing.T) { Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err != nil { + }); err == platform.ErrContextInterrupt { + return true // Retry. 
+ } else if err != nil { t.Errorf("application syscall with partial restore failed: %v", err) } return false @@ -182,7 +186,9 @@ func TestApplicationFault(t *testing.T) { FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + }); err == platform.ErrContextInterrupt { + return true // Retry. + } else if err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { t.Errorf("application fault with full restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) } return false @@ -193,7 +199,9 @@ func TestApplicationFault(t *testing.T) { Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + }); err == platform.ErrContextInterrupt { + return true // Retry. + } else if err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { t.Errorf("application fault with partial restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) } return false @@ -203,15 +211,20 @@ func TestApplicationFault(t *testing.T) { func TestRegistersSyscall(t *testing.T) { applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. 
- if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }); err != nil { - t.Errorf("application register check with partial restore got unexpected error: %v", err) - } - if err := testutil.CheckTestRegs(regs, false); err != nil { - t.Errorf("application register check with partial restore failed: %v", err) + for { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != nil { + t.Errorf("application register check with partial restore got unexpected error: %v", err) + } + if err := testutil.CheckTestRegs(regs, false); err != nil { + t.Errorf("application register check with partial restore failed: %v", err) + } + break // Done. } return false }) @@ -220,16 +233,21 @@ func TestRegistersSyscall(t *testing.T) { func TestRegistersFault(t *testing.T) { applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. - if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }); err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { - t.Errorf("application register check with full restore got unexpected error: %v", err) - } - if err := testutil.CheckTestRegs(regs, true); err != nil { - t.Errorf("application register check with full restore failed: %v", err) + for { + if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }); err == platform.ErrContextInterrupt { + continue // Retry. 
+ } else if err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { + t.Errorf("application register check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestRegs(regs, true); err != nil { + t.Errorf("application register check with full restore failed: %v", err) + } + break // Done. } return false }) @@ -238,16 +256,21 @@ func TestRegistersFault(t *testing.T) { func TestSegments(t *testing.T) { applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestSegments(regs) - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }); err != nil { - t.Errorf("application segment check with full restore got unexpected error: %v", err) - } - if err := testutil.CheckTestSegments(regs); err != nil { - t.Errorf("application segment check with full restore failed: %v", err) + for { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != nil { + t.Errorf("application segment check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestSegments(regs); err != nil { + t.Errorf("application segment check with full restore failed: %v", err) + } + break // Done. } return false }) @@ -323,22 +346,32 @@ func TestInvalidate(t *testing.T) { var data uintptr // Used below. applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, &data) // Read legitimate value. 
- if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }); err != nil { - t.Errorf("application partial restore: got %v, wanted nil", err) + for { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != nil { + t.Errorf("application partial restore: got %v, wanted nil", err) + } + break // Done. } // Unmap the page containing data & invalidate. pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize) - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - Flush: true, - }); err != platform.ErrContextSignal { - t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) + for { + if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + Flush: true, + }); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != platform.ErrContextSignal { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) + } + break // Success. } return false }) @@ -355,7 +388,9 @@ func TestEmptyAddressSpace(t *testing.T) { Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); !IsFault(err, si) { + }); err == platform.ErrContextInterrupt { + return true // Retry. + } else if !IsFault(err, si) { t.Errorf("first fault with partial restore failed got %v", err) t.Logf("registers: %#v", ®s) } @@ -367,7 +402,9 @@ func TestEmptyAddressSpace(t *testing.T) { FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); !IsFault(err, si) { + }); err == platform.ErrContextInterrupt { + return true // Retry. 
+ } else if !IsFault(err, si) { t.Errorf("first fault with full restore failed got %v", err) t.Logf("registers: %#v", ®s) } @@ -422,11 +459,10 @@ func BenchmarkApplicationSyscall(b *testing.B) { Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err != nil { - if err == platform.ErrContextInterrupt { - a++ - return true // Ignore. - } + }); err == platform.ErrContextInterrupt { + a++ + return true // Ignore. + } else if err != nil { b.Fatalf("benchmark failed: %v", err) } i++ @@ -459,11 +495,10 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err != nil { - if err == platform.ErrContextInterrupt { - a++ - return true // Ignore. - } + }); err == platform.ErrContextInterrupt { + a++ + return true // Ignore. + } else if err != nil { b.Fatalf("benchmark failed: %v", err) } // This will intentionally cause the world switch. By executing @@ -474,6 +509,6 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { return i < b.N }) if a != 0 { - b.Logf("EAGAIN occurred %d times (in %d iterations).", a, a+i) + b.Logf("ErrContextInterrupt occurred %d times (in %d iterations).", a, a+i) } } diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go index 31e5b0e61..7875bd3e9 100644 --- a/pkg/sentry/platform/kvm/virtual_map_test.go +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -22,14 +22,16 @@ import ( ) type checker struct { - ok bool + ok bool + accessType usermem.AccessType } -func (c *checker) Contains(addr uintptr) func(virtualRegion) { +func (c *checker) Containing(addr uintptr) func(virtualRegion) { c.ok = false // Reset for below calls. return func(vr virtualRegion) { if vr.virtual <= addr && addr < vr.virtual+vr.length { c.ok = true + c.accessType = vr.accessType } } } @@ -38,7 +40,7 @@ func TestParseMaps(t *testing.T) { c := new(checker) // Simple test. 
- if err := applyVirtualRegions(c.Contains(0)); err != nil { + if err := applyVirtualRegions(c.Containing(0)); err != nil { t.Fatalf("unexpected error: %v", err) } @@ -52,7 +54,7 @@ func TestParseMaps(t *testing.T) { } // Re-parse maps. - if err := applyVirtualRegions(c.Contains(addr)); err != nil { + if err := applyVirtualRegions(c.Containing(addr)); err != nil { syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) t.Fatalf("unexpected error: %v", err) } @@ -63,16 +65,29 @@ func TestParseMaps(t *testing.T) { t.Fatalf("updated map does not contain 0x%08x, expected true", addr) } - // Unmap the region. - syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) + // Map the region as PROT_NONE. + newAddr, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, addr, usermem.PageSize, + syscall.PROT_NONE, + syscall.MAP_ANONYMOUS|syscall.MAP_FIXED|syscall.MAP_PRIVATE, 0, 0) + if errno != 0 { + t.Fatalf("unexpected map error: %v", errno) + } + if newAddr != addr { + t.Fatalf("unable to remap address: got 0x%08x, wanted 0x%08x", newAddr, addr) + } // Re-parse maps. - if err := applyVirtualRegions(c.Contains(addr)); err != nil { + if err := applyVirtualRegions(c.Containing(addr)); err != nil { t.Fatalf("unexpected error: %v", err) } - - // Assert that it once again does _not_ contain the region. - if c.ok { - t.Fatalf("final map does contain 0x%08x, expected false", addr) + if !c.ok { + t.Fatalf("final map does not contain 0x%08x, expected true", addr) + } + if c.accessType.Read || c.accessType.Write || c.accessType.Execute { + t.Fatalf("final map has incorrect permissions for 0x%08x", addr) } + + // Unmap the region. + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) } -- cgit v1.2.3 From 1170039e788db368615451a0a1f5cfccb1d28d41 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Wed, 13 Jun 2018 16:20:30 -0700 Subject: Fix missing returns in rpcinet. 
PiperOrigin-RevId: 200472634 Change-Id: I3f0fb9e3b2f8616e6aa1569188258f330bf1ed31 --- pkg/sentry/socket/rpcinet/socket.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 4ef8b91c3..d8c1f2c1a 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -288,7 +288,7 @@ func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { <-c if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Bind).Bind.ErrorNumber; e != 0 { - syserr.FromHost(syscall.Errno(e)) + return syserr.FromHost(syscall.Errno(e)) } return nil } @@ -300,7 +300,7 @@ func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { <-c if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Listen).Listen.ErrorNumber; e != 0 { - syserr.FromHost(syscall.Errno(e)) + return syserr.FromHost(syscall.Errno(e)) } return nil } @@ -361,7 +361,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ <-c if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_SetSockOpt).SetSockOpt.ErrorNumber; e != 0 { - syserr.FromHost(syscall.Errno(e)) + return syserr.FromHost(syscall.Errno(e)) } return nil } -- cgit v1.2.3 From f5d0c59f5c736f5f7fceb566e134f41b03229c22 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 13 Jun 2018 20:00:00 -0700 Subject: Fix reference leak in VDSO validation PiperOrigin-RevId: 200496070 Change-Id: I33adb717c44e5b4bcadece882be3ab1ee3920556 --- pkg/sentry/fs/dirent.go | 5 +++++ pkg/sentry/loader/BUILD | 1 + pkg/sentry/loader/vdso.go | 20 +++++++++++++++++++- 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 554aa30d8..b56437b3c 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -213,7 +213,12 @@ func NewDirent(inode *Inode, name string) 
*Dirent { // NewTransientDirent creates a transient Dirent that shouldn't actually be // visible to users. +// +// An Inode is required. func NewTransientDirent(inode *Inode) *Dirent { + if inode == nil { + panic("an inode is required") + } return newDirent(inode, "transient") } diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 917ec8cc8..08cb3a777 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/limits", "//pkg/sentry/memmap", diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index ce4f6f5d9..037576e41 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -20,10 +20,12 @@ import ( "io" "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" @@ -63,8 +65,23 @@ func (f *fileContext) Value(key interface{}) interface{} { } } +// newByteReaderFile creates a fake file to read data from. func newByteReaderFile(data []byte) *fs.File { - dirent := fs.NewTransientDirent(nil) + // Create a fake inode. + inode := fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ + FSType: linux.ANON_INODE_FS_MAGIC, + }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.Anonymous, + DeviceID: anon.PseudoDevice.DeviceID(), + InodeID: anon.PseudoDevice.NextIno(), + BlockSize: usermem.PageSize, + }) + + // Use the fake inode to create a fake dirent. 
+ dirent := fs.NewTransientDirent(inode) + defer dirent.DecRef() + + // Use the fake dirent to make a fake file. flags := fs.FileFlags{Read: true, Pread: true} return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{ data: data, @@ -202,6 +219,7 @@ func PrepareVDSO(p platform.Platform) (*VDSO, error) { // First make sure the VDSO is valid. vdsoFile does not use ctx, so a // nil context can be passed. info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin))) + vdsoFile.DecRef() if err != nil { return nil, err } -- cgit v1.2.3 From 657db692b2241d89a324acc246b3c5230d8bd6ac Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 14 Jun 2018 11:34:15 -0700 Subject: Ignore expiration count in kernelCPUClockListener.Notify. PiperOrigin-RevId: 200590832 Change-Id: I35b817ecccc9414a742dee4815dfc67d0c7d0496 --- pkg/sentry/kernel/kernel.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 536461bbd..a17148af1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -960,7 +960,13 @@ type kernelCPUClockListener struct { // Notify implements ktime.TimerListener.Notify. func (l kernelCPUClockListener) Notify(exp uint64) { - atomic.AddUint64(&l.k.cpuClock, exp) + // Only increment cpuClock by 1 regardless of the number of expirations. + // This approximately compensates for cases where thread throttling or bad + // Go runtime scheduling prevents the cpuClockTicker goroutine, and + // presumably task goroutines as well, from executing for a long period of + // time. It's also necessary to prevent CPU clocks from seeing large + // discontinuous jumps. + atomic.AddUint64(&l.k.cpuClock, 1) } // Destroy implements ktime.TimerListener.Destroy. 
-- cgit v1.2.3 From 119a302ceb070243cc2d3d3b4dcf5f4d57809479 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 15 Jun 2018 09:17:08 -0700 Subject: Implement /proc/thread-self Closes #68 PiperOrigin-RevId: 200725401 Change-Id: I4827009b8aee89d22887c3af67291ccf7058d420 --- pkg/sentry/fs/proc/proc.go | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index d727e1bc9..b2a8d639c 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -111,6 +111,13 @@ func (p *proc) newSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { return newFile(s, msrc, fs.Symlink, nil) } +// newThreadSelf returns a new "threadSelf" node. +func (p *proc) newThreadSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + s := &threadSelf{pidns: p.pidns} + s.InitSymlink(ctx, fs.RootOwner, "") + return newFile(s, msrc, fs.Symlink, nil) +} + // newStubProcFsFile returns a procfs file with constant contents. func (p *proc) newStubProcFSFile(ctx context.Context, msrc *fs.MountSource, c []byte) *fs.Inode { u := &stubProcFSFile{ @@ -134,6 +141,28 @@ func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { return "", ramfs.ErrInvalidOp } +// threadSelf is more magical than "self" link. +type threadSelf struct { + ramfs.Symlink + + pidns *kernel.PIDNamespace +} + +// Readlink implements fs.InodeOperations.Readlink. +func (s *threadSelf) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if t := kernel.TaskFromContext(ctx); t != nil { + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + tid := s.pidns.IDOfTask(t) + if tid == 0 || tgid == 0 { + return "", ramfs.ErrNotFound + } + return fmt.Sprintf("%d/task/%d", tgid, tid), nil + } + + // Who is reading this link? + return "", ramfs.ErrInvalidOp +} + // Lookup loads an Inode at name into a Dirent. 
func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { // Is it one of the static ones? @@ -151,8 +180,9 @@ func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dire } return p.newNetDir(ctx, dir.MountSource) }, - "self": func() *fs.Inode { return p.newSelf(ctx, dir.MountSource) }, - "sys": func() *fs.Inode { return p.newSysDir(ctx, dir.MountSource) }, + "self": func() *fs.Inode { return p.newSelf(ctx, dir.MountSource) }, + "sys": func() *fs.Inode { return p.newSysDir(ctx, dir.MountSource) }, + "thread-self": func() *fs.Inode { return p.newThreadSelf(ctx, dir.MountSource) }, } if nf, ok := nfs[name]; ok { return fs.NewDirent(nf(), name), nil -- cgit v1.2.3 From b31ac4e1dfc0eef688e2d8e85df965292690726e Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 15 Jun 2018 09:29:19 -0700 Subject: Use notify explicitly on unlock path. There are circumstances under which the redpill call will not generate the appropriate action and notification. Replace this call with an explicit notification, which is guaranteed to transition as well as perform the futex wake. PiperOrigin-RevId: 200726934 Change-Id: Ie19e008a6007692dd7335a31a8b59f0af6e54aaa --- pkg/sentry/platform/kvm/machine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index ab2ccc695..f045345d5 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -397,7 +397,7 @@ func (c *vCPU) unlock() { case vCPUUser | vCPUGuest | vCPUWaiter: // Force a transition: this must trigger a notification when we // return from guest mode. - redpill() + c.notify() case vCPUUser | vCPUWaiter: // Waiting for the lock to be released; the responsibility is // on us to notify the waiter and clear the associated bit. 
-- cgit v1.2.3 From fa6db05e0ce828f2500651ca1226babbbf5edc80 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Fri, 15 Jun 2018 12:54:38 -0700 Subject: FIFOs should support O_TRUNC as a no-op. PiperOrigin-RevId: 200759323 Change-Id: I683b2edcc2188304c4ca563e46af457e23625905 --- pkg/sentry/kernel/pipe/node.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 5b47427ef..e418cf174 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -162,6 +162,18 @@ func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Slee } } +// Truncate implements fs.InodeOperations.Truncate +// +// This method is required to override the default i.InodeOperations.Truncate +// which may return ErrInvalidOperation, this allows open related +// syscalls to set the O_TRUNC flag without returning an error by +// calling Truncate directly during openat. The ftruncate and truncate +// system calls will check that the file is an actual file and return +// EINVAL because it's a PIPE, making this behavior consistent with linux. +func (i *inodeOperations) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + // newHandleLocked signals a new pipe reader or writer depending on where // 'wakeupChan' points. This unblocks any corresponding reader or writer // waiting for the other end of the channel to be opened, see Fifo.waitFor. -- cgit v1.2.3 From fc8ca72a32bb4cb348ece3033c84696ea3502068 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Fri, 15 Jun 2018 13:37:21 -0700 Subject: sentry: do not start delivering external signal immediately. 
PiperOrigin-RevId: 200765756 Change-Id: Ie4266f32e4e977df3925eb29f3fbb756e0337606 --- pkg/sentry/sighandling/sighandling.go | 50 ++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 1a94b535b..0c3a14da5 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -29,23 +29,31 @@ import ( // numSignals is the number of normal (non-realtime) signals on Linux. const numSignals = 32 -// forwardSignals listens for incoming signals and delivers them to k. It stops -// when the stop channel is closed. -func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, stop chan struct{}) { +// forwardSignals listens for incoming signals and delivers them to k. It starts +// when the start channel is closed and stops when the stop channel is closed. +func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop chan struct{}) { // Build a select case. - sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}} + sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}} for _, sigchan := range sigchans { sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)}) } + started := false for { // Wait for a notification. index, _, ok := reflect.Select(sc) - // Was it the stop channel? + // Was it the start / stop channel? if index == 0 { if !ok { - break + if started { + // stop channel + break + } else { + // start channel + started = true + sc[0] = reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)} + } } continue } @@ -57,18 +65,18 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, stop chan struc // Otherwise, it was a signal on channel N. Index 0 represents the stop // channel, so index N represents the channel for signal N. 
- if !k.SendExternalSignal(&arch.SignalInfo{Signo: int32(index)}, "sentry") { + if !started || !k.SendExternalSignal(&arch.SignalInfo{Signo: int32(index)}, "sentry") { // Kernel is not ready to receive signals. // // Kill ourselves if this signal would have killed the - // process before StartForwarding was called. i.e., all + // process before PrepareForwarding was called. i.e., all // _SigKill signals; see Go // src/runtime/sigtab_linux_generic.go. // // Otherwise ignore the signal. // // TODO: Convert Go's runtime.raise from - // tkill to tgkill so StartForwarding doesn't need to + // tkill to tgkill so PrepareForwarding doesn't need to // be called until after filter installation. switch linux.Signal(index) { case linux.SIGHUP, linux.SIGINT, linux.SIGTERM: @@ -84,9 +92,11 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, stop chan struc } } -// StartForwarding ensures that synchronous signals are forwarded to k and -// returns a callback that stops signal forwarding. -func StartForwarding(k *kernel.Kernel) func() { +// PrepareForwarding ensures that synchronous signals are forwarded to k and +// returns a callback that starts signal delivery, which itself returns a +// callback that stops signal forwarding. +func PrepareForwarding(k *kernel.Kernel) func() func() { + start := make(chan struct{}) stop := make(chan struct{}) // Register individual channels. One channel per standard signal is @@ -109,8 +119,18 @@ func StartForwarding(k *kernel.Kernel) func() { signal.Notify(sigchan, syscall.Signal(sig)) } // Start up our listener. - go forwardSignals(k, sigchans, stop) // S/R-SAFE: synchronized by Kernel.extMu + go forwardSignals(k, sigchans, start, stop) // S/R-SAFE: synchronized by Kernel.extMu + + return func() func() { + close(start) + return func() { + close(stop) + } + } +} - // ... shouldn't this wait until the forwardSignals goroutine returns? 
- return func() { close(stop) } +// StartForwarding ensures that synchronous signals are forwarded to k and +// returns a callback that stops signal forwarding. +func StartForwarding(k *kernel.Kernel) func() { + return PrepareForwarding(k)() } -- cgit v1.2.3 From bd2d1aaa16474202b1a2c1edbf62e6782fa2dc36 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 15 Jun 2018 15:35:09 -0700 Subject: Replace crypto/rand with internal rand package PiperOrigin-RevId: 200784607 Change-Id: I39aa6ee632936dcbb00fc298adccffa606e9f4c0 --- pkg/dhcp/BUILD | 1 + pkg/dhcp/client.go | 2 +- pkg/rand/BUILD | 11 ++++++++++ pkg/rand/rand.go | 39 +++++++++++++++++++++++++++++++++ pkg/sentry/fs/dev/BUILD | 1 + pkg/sentry/fs/dev/random.go | 3 +-- pkg/sentry/loader/BUILD | 1 + pkg/sentry/loader/loader.go | 2 +- pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/sys_random.go | 2 +- pkg/tcpip/network/hash/BUILD | 5 ++++- pkg/tcpip/network/hash/hash.go | 2 +- pkg/tcpip/transport/tcp/BUILD | 1 + pkg/tcpip/transport/tcp/accept.go | 2 +- pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- 16 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 pkg/rand/BUILD create mode 100644 pkg/rand/rand.go (limited to 'pkg/sentry') diff --git a/pkg/dhcp/BUILD b/pkg/dhcp/BUILD index b40860aac..3564da7e7 100644 --- a/pkg/dhcp/BUILD +++ b/pkg/dhcp/BUILD @@ -11,6 +11,7 @@ go_library( ], importpath = "gvisor.googlesource.com/gvisor/pkg/dhcp", deps = [ + "//pkg/rand", "//pkg/tcpip", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/stack", diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 37deb69ff..09b724b48 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -7,12 +7,12 @@ package dhcp import ( "bytes" "context" - "crypto/rand" "fmt" "log" "sync" "time" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" diff 
--git a/pkg/rand/BUILD b/pkg/rand/BUILD new file mode 100644 index 000000000..2bb59f895 --- /dev/null +++ b/pkg/rand/BUILD @@ -0,0 +1,11 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "rand", + srcs = ["rand.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/rand", + visibility = ["//:sandbox"], + deps = ["@org_golang_x_sys//unix:go_default_library"], +) diff --git a/pkg/rand/rand.go b/pkg/rand/rand.go new file mode 100644 index 000000000..37ac07620 --- /dev/null +++ b/pkg/rand/rand.go @@ -0,0 +1,39 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package rand implements a cryptographically secure pseudorandom number +// generator. +package rand + +import ( + "io" + + "golang.org/x/sys/unix" +) + +// reader implements an io.Reader that returns pseudorandom bytes. +type reader struct{} + +// Read implements io.Reader.Read. +func (reader) Read(p []byte) (int, error) { + return unix.Getrandom(p, 0) +} + +// Reader is the default reader. +var Reader io.Reader = reader{} + +// Read reads from the default reader. 
+func Read(b []byte) (int, error) { + return io.ReadFull(Reader, b) +} diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 42049ecb5..d33a19c2f 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -33,6 +33,7 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/log", + "//pkg/rand", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 0402f9355..33a045a05 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -15,9 +15,8 @@ package dev import ( - "crypto/rand" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 08cb3a777..b7aebd9ec 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -39,6 +39,7 @@ go_library( "//pkg/binary", "//pkg/cpuid", "//pkg/log", + "//pkg/rand", "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index a68ab33e7..3cda0fe6f 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -17,13 +17,13 @@ package loader import ( "bytes" - "crypto/rand" "io" "path" "gvisor.googlesource.com/gvisor/pkg/abi" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index f9e0a4be3..7cfd37fb1 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -70,6 +70,7 @@ go_library( "//pkg/eventchannel", "//pkg/log", 
"//pkg/metric", + "//pkg/rand", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index 2dd59b1c3..be31e6b17 100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -15,10 +15,10 @@ package linux import ( - "crypto/rand" "io" "math" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD index 96805c690..1e76fed36 100644 --- a/pkg/tcpip/network/hash/BUILD +++ b/pkg/tcpip/network/hash/BUILD @@ -7,5 +7,8 @@ go_library( srcs = ["hash.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/network/hash", visibility = ["//visibility:public"], - deps = ["//pkg/tcpip/header"], + deps = [ + "//pkg/rand", + "//pkg/tcpip/header", + ], ) diff --git a/pkg/tcpip/network/hash/hash.go b/pkg/tcpip/network/hash/hash.go index e5a696158..60227d515 100644 --- a/pkg/tcpip/network/hash/hash.go +++ b/pkg/tcpip/network/hash/hash.go @@ -6,9 +6,9 @@ package hash import ( - "crypto/rand" "encoding/binary" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" ) diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index d0eb8b8bd..f38f58e87 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -51,6 +51,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp", visibility = ["//visibility:public"], deps = [ + "//pkg/rand", "//pkg/sleep", "//pkg/state", "//pkg/tcpip", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index e78a56cf5..85adeef0e 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -5,7 +5,6 @@ package tcp import ( - 
"crypto/rand" "crypto/sha1" "encoding/binary" "hash" @@ -13,6 +12,7 @@ import ( "sync" "time" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sleep" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 0571ceaa5..9aaabe0b1 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -5,11 +5,11 @@ package tcp import ( - "crypto/rand" "sync" "sync/atomic" "time" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sleep" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 3f87c4cac..b21c2b4ab 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -5,12 +5,12 @@ package tcp import ( - "crypto/rand" "math" "sync" "sync/atomic" "time" + "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sleep" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" -- cgit v1.2.3 From 563a71ef243360bc20db0e481b3adbfb07cd8702 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Sun, 17 Jun 2018 17:05:36 -0700 Subject: Add rpcinet support for control messages. Add support for control messages, but at this time the only control message that the sentry will support here is SO_TIMESTAMP. 
PiperOrigin-RevId: 200922230 Change-Id: I63a852d9305255625d9df1d989bd46a66e93c446 --- pkg/sentry/socket/rpcinet/socket.go | 37 +++++++++++++++++++++++++++-- pkg/sentry/socket/rpcinet/syscall_rpc.proto | 2 ++ 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index d8c1f2c1a..b4b380ac6 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -477,6 +477,37 @@ func rpcRecvMsg(t *kernel.Task, req *pb.SyscallRequest_Recvmsg) (*pb.RecvmsgResp return res.(*pb.RecvmsgResponse_Payload).Payload, nil } +// Because we only support SO_TIMESTAMP we will search control messages for +// that value and set it if so, all other control messages will be ignored. +func (s *socketOperations) extractControlMessages(payload *pb.RecvmsgResponse_ResultPayload) socket.ControlMessages { + c := socket.ControlMessages{} + if len(payload.GetCmsgData()) > 0 { + // Parse the control messages looking for SO_TIMESTAMP. + msgs, e := syscall.ParseSocketControlMessage(payload.GetCmsgData()) + if e != nil { + return socket.ControlMessages{} + } + for _, m := range msgs { + if m.Header.Level != linux.SOL_SOCKET || m.Header.Type != linux.SO_TIMESTAMP { + continue + } + + // Let's parse the time stamp and set it. + if len(m.Data) < linux.SizeOfTimeval { + // Give up on locating the SO_TIMESTAMP option. + return socket.ControlMessages{} + } + + var v linux.Timeval + binary.Unmarshal(m.Data[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + c.IP.HasTimestamp = true + c.IP.Timestamp = v.ToNsecCapped() + break + } + } + return c +} + // RecvMsg implements socket.Socket.RecvMsg. 
func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ @@ -497,7 +528,8 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags panic("CopyOut failed to copy full buffer") } } - return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) + c := s.extractControlMessages(res) + return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e) } if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 { return 0, nil, 0, socket.ControlMessages{}, err @@ -520,7 +552,8 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags panic("CopyOut failed to copy full buffer") } } - return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), socket.ControlMessages{}, syserr.FromError(e) + c := s.extractControlMessages(res) + return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e) } if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain { return 0, nil, 0, socket.ControlMessages{}, err diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto index 996962aae..c056e4c9d 100644 --- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto +++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto @@ -40,6 +40,7 @@ message RecvmsgRequest { bool sender = 3; bool peek = 4; bool trunc = 5; + uint32 cmsg_length = 6; } message OpenRequest { @@ -110,6 +111,7 @@ message RecvmsgResponse { bytes data = 1 [ctype = CORD]; AddressResponse address = 2; uint32 length = 3; + bytes cmsg_data = 4; } oneof result { uint32 error_number = 1; -- cgit v1.2.3 From 4fd1d40e1d874ef4eb2f6cb13de66f1b756aa92c 
Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 19 Jun 2018 10:42:39 -0700 Subject: Rpcinet needs to track shutdown state for blocking sockets. Because rpcinet will emulate a blocking socket backed by an rpc based non-blocking socket. In the event of a shutdown(SHUT_RD) followed by a read a non-blocking socket is allowed to return an EWOULDBLOCK however since a blocking socket knows it cannot receive anymore data it would block indefinitely and in this situation linux returns 0. We have to track this on the rpcinet sentry side so we can emulate that behavior because the remote side has no way to know if the socket is actually blocking within the sentry. PiperOrigin-RevId: 201201618 Change-Id: I4ac3a7b74b5dae471ab97c2e7d33b83f425aedac --- pkg/sentry/socket/rpcinet/BUILD | 1 + pkg/sentry/socket/rpcinet/socket.go | 64 +++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index b0351b363..8973453f9 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", + "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/transport/unix", "//pkg/unet", diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index b4b380ac6..f641f25df 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -33,6 +33,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -52,6 +53,11 @@ type socketOperations struct { wq *waiter.Queue rpcConn *conn.RPCConnection notifier 
*notifier.Notifier + + // shState is the state of the connection with respect to shutdown. Because + // we're mixing non-blocking semantics on the other side we have to adapt for + // some strange differences between blocking and non-blocking sockets. + shState tcpip.ShutdownFlags } // Verify that we actually implement socket.Socket. @@ -96,6 +102,31 @@ func translateIOSyscallError(err error) error { return err } +// setShutdownFlags will set the shutdown flag so we can handle blocking reads +// after a read shutdown. +func (s *socketOperations) setShutdownFlags(how int) { + switch how { + case linux.SHUT_RD: + s.shState |= tcpip.ShutdownRead + case linux.SHUT_WR: + s.shState |= tcpip.ShutdownWrite + case linux.SHUT_RDWR: + s.shState |= tcpip.ShutdownWrite | tcpip.ShutdownRead + } +} + +func (s *socketOperations) resetShutdownFlags() { + s.shState = 0 +} + +func (s *socketOperations) isShutRdSet() bool { + return s.shState&tcpip.ShutdownRead != 0 +} + +func (s *socketOperations) isShutWrSet() bool { + return s.shState&tcpip.ShutdownWrite != 0 +} + // Release implements fs.FileOperations.Release. func (s *socketOperations) Release() { s.notifier.RemoveFD(s.fd) @@ -191,7 +222,12 @@ func rpcConnect(t *kernel.Task, fd uint32, sockaddr []byte) *syserr.Error { // Connect implements socket.Socket.Connect. func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { if !blocking { - return rpcConnect(t, s.fd, sockaddr) + e := rpcConnect(t, s.fd, sockaddr) + if e == nil { + // Reset the shutdown state on new connects. 
+ s.resetShutdownFlags() + } + return e } // Register for notification when the endpoint becomes writable, then @@ -201,6 +237,10 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo defer s.EventUnregister(&e) for { if err := rpcConnect(t, s.fd, sockaddr); err == nil || err != syserr.ErrInProgress && err != syserr.ErrAlreadyInProgress { + if err == nil { + // Reset the shutdown state on new connects. + s.resetShutdownFlags() + } return err } @@ -314,6 +354,11 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { if e := stack.rpcConn.Request(id).Result.(*pb.SyscallResponse_Shutdown).Shutdown.ErrorNumber; e != 0 { return syserr.FromHost(syscall.Errno(e)) } + + // We save the shutdown state because of strange differences on linux + // related to recvs on blocking vs. non-blocking sockets after a SHUT_RD. + // We need to emulate that behavior on the blocking side. + s.setShutdownFlags(how) return nil } @@ -511,11 +556,12 @@ func (s *socketOperations) extractControlMessages(payload *pb.RecvmsgResponse_Re // RecvMsg implements socket.Socket.RecvMsg. 
func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ - Fd: s.fd, - Length: uint32(dst.NumBytes()), - Sender: senderRequested, - Trunc: flags&linux.MSG_TRUNC != 0, - Peek: flags&linux.MSG_PEEK != 0, + Fd: s.fd, + Length: uint32(dst.NumBytes()), + Sender: senderRequested, + Trunc: flags&linux.MSG_TRUNC != 0, + Peek: flags&linux.MSG_PEEK != 0, + CmsgLength: uint32(controlDataLen), }} res, err := rpcRecvMsg(t, req) @@ -559,6 +605,12 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags return 0, nil, 0, socket.ControlMessages{}, err } + if s.isShutRdSet() { + // Blocking would have caused us to block indefinitely so we return 0, + // this is the same behavior as Linux. + return 0, nil, 0, socket.ControlMessages{}, nil + } + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain -- cgit v1.2.3 From 9db7cfad93abff181c59d61892d32b9b05f4234f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 19 Jun 2018 11:09:20 -0700 Subject: Add a new cache policy FSCACHE_WRITETHROUGH. The new policy is identical to FSCACHE (which caches everything in memory), but it also flushes writes to the backing fs agent immediately. All gofer cache policy decisions have been moved into the cachePolicy type. Previously they were sprinkled around the codebase. There are many different things that we cache (page cache, negative dirents, dirent LRU, unstable attrs, readdir results....), and I don't think we should have individual flags to control each of these. Instead, we should have a few high-level cache policies that are consistent and useful to users. This refactoring makes it easy to add more such policies. 
PiperOrigin-RevId: 201206937 Change-Id: I6e225c382b2e5e1b0ad4ccf8ca229873f4cd389d --- pkg/sentry/fs/gofer/BUILD | 1 + pkg/sentry/fs/gofer/cache_policy.go | 103 ++++++++++++++++++++++++++++++++++++ pkg/sentry/fs/gofer/file.go | 31 ++++++----- pkg/sentry/fs/gofer/fs.go | 21 -------- pkg/sentry/fs/gofer/inode.go | 28 ++++------ pkg/sentry/fs/gofer/path.go | 50 +++++++++-------- pkg/sentry/fs/gofer/session.go | 11 ++-- 7 files changed, 163 insertions(+), 82 deletions(-) create mode 100644 pkg/sentry/fs/gofer/cache_policy.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index ca42b0a54..e6f659c53 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -22,6 +22,7 @@ go_library( name = "gofer", srcs = [ "attr.go", + "cache_policy.go", "context_file.go", "device.go", "file.go", diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go new file mode 100644 index 000000000..eec8c07cb --- /dev/null +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -0,0 +1,103 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// cachePolicy is a 9p cache policy. It has methods that determine what to +// cache (if anything) for a given inode. +type cachePolicy int + +const ( + // Cache nothing. 
+ cacheNone cachePolicy = iota + + // Use virtual file system cache for everything. + cacheAll + + // Use virtual file system cache for everything, but send writes to the + // fs agent immediately. + cacheAllWritethrough +) + +func parseCachePolicy(policy string) (cachePolicy, error) { + switch policy { + case "fscache": + return cacheAll, nil + case "none": + return cacheNone, nil + case "fscache_writethrough": + return cacheAllWritethrough, nil + } + return cacheNone, fmt.Errorf("unsupported cache mode: %s", policy) +} + +// cacheUAtters determines whether unstable attributes should be cached for the +// given inode. +func (cp cachePolicy) cacheUAttrs(inode *fs.Inode) bool { + if !fs.IsFile(inode.StableAttr) && !fs.IsDir(inode.StableAttr) { + return false + } + return cp == cacheAll || cp == cacheAllWritethrough +} + +// cacheReaddir determines whether readdir results should be cached. +func (cp cachePolicy) cacheReaddir() bool { + return cp == cacheAll || cp == cacheAllWritethrough +} + +// usePageCache determines whether the page cache should be used for the given +// inode. +func (cp cachePolicy) usePageCache(inode *fs.Inode) bool { + // Do cached IO for regular files only. Some "character devices" expect + // no caching. + if !fs.IsFile(inode.StableAttr) { + return false + } + return cp == cacheAll || cp == cacheAllWritethrough +} + +// writeThough indicates whether writes to the file should be synced to the +// gofer immediately. +func (cp cachePolicy) writeThrough(inode *fs.Inode) bool { + return cp == cacheNone || cp == cacheAllWritethrough +} + +// revalidateDirent indicates that dirents should be revalidated after they are +// looked up. +func (cp cachePolicy) revalidateDirent() bool { + return cp == cacheNone +} + +// keepDirent indicates that dirents should be kept pinned in the dirent tree +// even if there are no application references on the file. 
+func (cp cachePolicy) keepDirent(inode *fs.Inode) bool { + if cp == cacheNone { + return false + } + sattr := inode.StableAttr + // NOTE: Only cache files, directories, and symlinks. + return fs.IsFile(sattr) || fs.IsDir(sattr) || fs.IsSymlink(sattr) +} + +// cacheNegativeDirents indicates that negative dirents should be held in the +// dirent tree. +func (cp cachePolicy) cacheNegativeDirents() bool { + return cp == cacheAll || cp == cacheAllWritethrough +} diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 07c9bf01d..69cee7026 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -93,7 +93,7 @@ func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer DirCursor: &f.dirCursor, } n, err := fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) - if f.inodeOperations.session().cachePolicy != cacheNone { + if f.inodeOperations.session().cachePolicy.cacheUAttrs(file.Dirent.Inode) { f.inodeOperations.cachingInodeOps.TouchAccessTime(ctx, file.Dirent.Inode) } return n, err @@ -105,7 +105,7 @@ func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offs defer f.inodeOperations.readdirMu.Unlock() // Fetch directory entries if needed. - if f.inodeOperations.readdirCache == nil || f.inodeOperations.session().cachePolicy == cacheNone { + if !f.inodeOperations.session().cachePolicy.cacheReaddir() || f.inodeOperations.readdirCache == nil { entries, err := f.readdirAll(ctx) if err != nil { return offset, err @@ -183,13 +183,20 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I // Not all remote file systems enforce this so this client does. return 0, syserror.EISDIR } + cp := f.inodeOperations.session().cachePolicy + if cp.usePageCache(file.Dirent.Inode) { + n, err := f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) + if err != nil { + return n, err + } + if cp.writeThrough(file.Dirent.Inode) { + // Write out the file. 
+ err = f.inodeOperations.cachingInodeOps.WriteOut(ctx, file.Dirent.Inode) + } + return n, err - // Do cached IO for regular files only. Some character devices expect no caching. - isFile := fs.IsFile(file.Dirent.Inode.StableAttr) - if f.inodeOperations.session().cachePolicy == cacheNone || !isFile { - return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) } - return f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) + return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) } // Read implements fs.FileOperations.Read. @@ -199,12 +206,10 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO return 0, syserror.EISDIR } - // Do cached IO for regular files only. Some character devices expect no caching. - isFile := fs.IsFile(file.Dirent.Inode.StableAttr) - if f.inodeOperations.session().cachePolicy == cacheNone || !isFile { - return dst.CopyOutFrom(ctx, f.handles.readWriterAt(ctx, offset)) + if f.inodeOperations.session().cachePolicy.usePageCache(file.Dirent.Inode) { + return f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) } - return f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) + return dst.CopyOutFrom(ctx, f.handles.readWriterAt(ctx, offset)) } // Fsync implements fs.FileOperations.Fsync. @@ -243,7 +248,7 @@ func (f *fileOperations) Flush(ctx context.Context, file *fs.File) error { // ConfigureMMap implements fs.FileOperations.ConfigureMMap. 
func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - if !isFileCachable(f.inodeOperations.session(), file.Dirent.Inode) { + if !f.inodeOperations.session().cachePolicy.usePageCache(file.Dirent.Inode) { return syserror.ENODEV } return fsutil.GenericConfigureMMap(file, f.inodeOperations.cachingInodeOps, opts) diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index a8a3ec19d..e041074d2 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -56,27 +56,6 @@ const ( privateUnixSocketKey = "privateunixsocket" ) -// cachePolicy is a 9p cache policy. -type cachePolicy int - -const ( - // TODO: fully support cache=none. - cacheNone cachePolicy = iota - - // Use virtual file system cache. - cacheAll -) - -func parseCachePolicy(policy string) (cachePolicy, error) { - switch policy { - case "fscache": - return cacheAll, nil - case "none": - return cacheNone, nil - } - return cacheNone, fmt.Errorf("unsupported cache mode: %s", policy) -} - // defaultAname is the default attach name. const defaultAname = "/" diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index c00da5fec..fa9013b75 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -335,23 +335,15 @@ func (i *inodeOperations) Release(ctx context.Context) { // Mappable implements fs.InodeOperations.Mappable. 
func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { - if i.session().cachePolicy == cacheNone || !fs.IsFile(inode.StableAttr) { - return nil + if i.session().cachePolicy.usePageCache(inode) { + return i.cachingInodeOps } - return i.cachingInodeOps -} - -func isCachable(session *session, inode *fs.Inode) bool { - return session.cachePolicy != cacheNone && (fs.IsFile(inode.StableAttr) || fs.IsDir(inode.StableAttr)) -} - -func isFileCachable(session *session, inode *fs.Inode) bool { - return session.cachePolicy != cacheNone && fs.IsFile(inode.StableAttr) + return nil } // UnstableAttr implements fs.InodeOperations.UnstableAttr. func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - if isCachable(i.session(), inode) { + if i.session().cachePolicy.cacheUAttrs(inode) { return i.cachingInodeOps.UnstableAttr(ctx, inode) } return i.fileState.unstableAttr(ctx) @@ -433,7 +425,7 @@ func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (* } func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - if !isFileCachable(i.session(), d.Inode) { + if !i.session().cachePolicy.usePageCache(d.Inode) { h, err := newHandles(ctx, i.fileState.file, flags) if err != nil { return nil, err @@ -456,7 +448,7 @@ func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flag // SetPermissions implements fs.InodeOperations.SetPermissions. 
func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { - if isCachable(i.session(), inode) { + if i.session().cachePolicy.cacheUAttrs(inode) { return i.cachingInodeOps.SetPermissions(ctx, inode, p) } @@ -473,7 +465,7 @@ func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner f return nil } - if isCachable(i.session(), inode) { + if i.session().cachePolicy.cacheUAttrs(inode) { return i.cachingInodeOps.SetOwner(ctx, inode, owner) } @@ -492,7 +484,7 @@ func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner f // SetTimestamps implements fs.InodeOperations.SetTimestamps. func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if isCachable(i.session(), inode) { + if i.session().cachePolicy.cacheUAttrs(inode) { return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) } @@ -502,7 +494,7 @@ func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts // Truncate implements fs.InodeOperations.Truncate. func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length int64) error { // This can only be called for files anyway. - if isFileCachable(i.session(), inode) { + if i.session().cachePolicy.usePageCache(inode) { return i.cachingInodeOps.Truncate(ctx, inode, length) } @@ -511,7 +503,7 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length // WriteOut implements fs.InodeOperations.WriteOut. func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { - if !isCachable(i.session(), inode) { + if !i.session().cachePolicy.cacheUAttrs(inode) { return nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 6c4c2eed9..e78172bda 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -29,7 +29,7 @@ import ( // Lookup loads an Inode at name into a Dirent based on the session's cache // policy. 
func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - if i.session().cachePolicy != cacheNone { + if i.session().cachePolicy.cacheReaddir() { // Check to see if we have readdirCache that indicates the // child does not exist. Avoid holding readdirMu longer than // we need to. @@ -46,7 +46,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { if err == syscall.ENOENT { - if i.session().cachePolicy != cacheNone { + if i.session().cachePolicy.cacheNegativeDirents() { // Return a negative Dirent. It will stay cached until something // is created over it. return fs.NewNegativeDirent(name), nil @@ -95,7 +95,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string return nil, err } - i.touchModificationTime(ctx) + i.touchModificationTime(ctx, dir) // Get the attributes of the file. qid, mask, p9attr, err := getattr(ctx, newFile) @@ -124,7 +124,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string File: newFile, Host: hostFile, } - if isFileCachable(iops.session(), d.Inode) { + if iops.session().cachePolicy.usePageCache(d.Inode) { iops.fileState.setHandlesForCachedIO(flags, h) } return NewFile(ctx, d, flags, iops, h), nil @@ -136,12 +136,12 @@ func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname if _, err := i.fileState.file.symlink(ctx, oldname, newname, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err } - i.touchModificationTime(ctx) + i.touchModificationTime(ctx, dir) return nil } // CreateHardLink implements InodeOperations.CreateHardLink. 
-func (i *inodeOperations) CreateHardLink(ctx context.Context, _ *fs.Inode, target *fs.Inode, newName string) error { +func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, target *fs.Inode, newName string) error { targetOpts, ok := target.InodeOperations.(*inodeOperations) if !ok { return syscall.EXDEV @@ -150,11 +150,11 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, _ *fs.Inode, targe if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { return err } - if i.session().cachePolicy == cacheAll { + if i.session().cachePolicy.cacheUAttrs(inode) { // Increase link count. targetOpts.cachingInodeOps.IncLinks(ctx) } - i.touchModificationTime(ctx) + i.touchModificationTime(ctx, inode) return nil } @@ -164,10 +164,11 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err } - if i.session().cachePolicy == cacheAll { + if i.session().cachePolicy.cacheUAttrs(dir) { // Increase link count. i.cachingInodeOps.IncLinks(ctx) - + } + if i.session().cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -202,7 +203,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // We're not going to use this file. hostFile.Close() - i.touchModificationTime(ctx) + i.touchModificationTime(ctx, dir) // Get the attributes of the file to create inode key. 
qid, _, attr, err := getattr(ctx, newFile) @@ -254,7 +255,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string if removeSocket { i.session().endpoints.remove(key) } - i.touchModificationTime(ctx) + i.touchModificationTime(ctx, dir) return nil } @@ -265,10 +266,11 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na if err := i.fileState.file.unlinkAt(ctx, name, 0x200); err != nil { return err } - if i.session().cachePolicy == cacheAll { + if i.session().cachePolicy.cacheUAttrs(dir) { // Decrease link count and updates atime. i.cachingInodeOps.DecLinks(ctx) - + } + if i.session().cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } @@ -294,14 +296,17 @@ func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldNa return err } - // Update cached state. - if i.session().cachePolicy == cacheAll { - // Is the renamed entity a directory? Fix link counts. - if fs.IsDir(i.fileState.sattr) { + // Is the renamed entity a directory? Fix link counts. + if fs.IsDir(i.fileState.sattr) { + // Update cached state. + if i.session().cachePolicy.cacheUAttrs(oldParent) { oldParentInodeOperations.cachingInodeOps.DecLinks(ctx) + } + if i.session().cachePolicy.cacheUAttrs(newParent) { newParentInodeOperations.cachingInodeOps.IncLinks(ctx) } - + } + if i.session().cachePolicy.cacheReaddir() { // Mark old directory dirty. 
oldParentInodeOperations.markDirectoryDirty() if oldParent != newParent { @@ -312,10 +317,11 @@ func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldNa return nil } -func (i *inodeOperations) touchModificationTime(ctx context.Context) { - if i.session().cachePolicy == cacheAll { +func (i *inodeOperations) touchModificationTime(ctx context.Context, inode *fs.Inode) { + if i.session().cachePolicy.cacheUAttrs(inode) { i.cachingInodeOps.TouchModificationTime(ctx) - + } + if i.session().cachePolicy.cacheReaddir() { // Invalidate readdir cache. i.markDirectoryDirty() } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index baf00d8e7..21dc5e08d 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -122,17 +122,12 @@ func (s *session) Destroy() { // Revalidate returns true if the cache policy is does not allow for VFS caching. func (s *session) Revalidate(*fs.Dirent) bool { - return s.cachePolicy == cacheNone + return s.cachePolicy.revalidateDirent() } // TakeRefs takes an extra reference on dirent if possible. -func (s *session) Keep(dirent *fs.Dirent) bool { - sattr := dirent.Inode.StableAttr - if s.cachePolicy == cacheNone { - return false - } - // NOTE: Only cache files, directories, and symlinks. - return fs.IsFile(sattr) || fs.IsDir(sattr) || fs.IsSymlink(sattr) +func (s *session) Keep(d *fs.Dirent) bool { + return s.cachePolicy.keepDirent(d.Inode) } // ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. -- cgit v1.2.3 From bda2a1ed3503699b8cb814bb3cc7ad0b9694155b Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 19 Jun 2018 14:11:58 -0700 Subject: Rpcinet is racy around shutdown flags. Correct a data race in rpcinet where a shutdown and recvmsg can race around shutown flags. 
PiperOrigin-RevId: 201238366 Change-Id: I5eb06df4a2b4eba331eeb5de19076213081d581f --- pkg/sentry/socket/rpcinet/socket.go | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index f641f25df..207123d6f 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -15,6 +15,7 @@ package rpcinet import ( + "sync/atomic" "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -57,7 +58,7 @@ type socketOperations struct { // shState is the state of the connection with respect to shutdown. Because // we're mixing non-blocking semantics on the other side we have to adapt for // some strange differences between blocking and non-blocking sockets. - shState tcpip.ShutdownFlags + shState int32 } // Verify that we actually implement socket.Socket. @@ -105,26 +106,35 @@ func translateIOSyscallError(err error) error { // setShutdownFlags will set the shutdown flag so we can handle blocking reads // after a read shutdown. func (s *socketOperations) setShutdownFlags(how int) { + var f tcpip.ShutdownFlags switch how { case linux.SHUT_RD: - s.shState |= tcpip.ShutdownRead + f = tcpip.ShutdownRead case linux.SHUT_WR: - s.shState |= tcpip.ShutdownWrite + f = tcpip.ShutdownWrite case linux.SHUT_RDWR: - s.shState |= tcpip.ShutdownWrite | tcpip.ShutdownRead + f = tcpip.ShutdownWrite | tcpip.ShutdownRead + } + + // Atomically update the flags. 
+ for { + old := atomic.LoadInt32(&s.shState) + if atomic.CompareAndSwapInt32(&s.shState, old, old|int32(f)) { + break + } } } func (s *socketOperations) resetShutdownFlags() { - s.shState = 0 + atomic.StoreInt32(&s.shState, 0) } func (s *socketOperations) isShutRdSet() bool { - return s.shState&tcpip.ShutdownRead != 0 + return atomic.LoadInt32(&s.shState)&int32(tcpip.ShutdownRead) != 0 } func (s *socketOperations) isShutWrSet() bool { - return s.shState&tcpip.ShutdownWrite != 0 + return atomic.LoadInt32(&s.shState)&int32(tcpip.ShutdownWrite) != 0 } // Release implements fs.FileOperations.Release. -- cgit v1.2.3 From a6dbef045ff684e92f472280eb6f7f688b9bc87a Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Tue, 19 Jun 2018 15:22:23 -0700 Subject: Added a resume command to unpause a paused container. Resume checks the status of the container and unpauses the kernel if its status is paused. Otherwise nothing happens. Tests were added to ensure that the process is in the correct state after various commands. 
PiperOrigin-RevId: 201251234 Change-Id: Ifd11b336c33b654fea6238738f864fcf2bf81e19 --- pkg/sentry/control/proc.go | 2 + runsc/boot/controller.go | 11 ++- runsc/cmd/BUILD | 1 + runsc/cmd/resume.go | 68 +++++++++++++++++ runsc/container/container.go | 21 +++++- runsc/container/container_test.go | 152 ++++++++++++++++++++++++++++++++++++-- runsc/main.go | 2 + runsc/sandbox/sandbox.go | 15 ++++ 8 files changed, 262 insertions(+), 10 deletions(-) create mode 100644 runsc/cmd/resume.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index d77b30c90..d94ae560f 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -18,6 +18,7 @@ import ( "bytes" "encoding/json" "fmt" + "sort" "syscall" "text/tabwriter" "time" @@ -245,6 +246,7 @@ func Processes(k *kernel.Kernel, out *[]*Process) error { Cmd: tg.Leader().Name(), }) } + sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) return nil } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 564f2d271..ae727f144 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -44,6 +44,9 @@ const ( // processes running in a container. ContainerProcesses = "containerManager.Processes" + // ContainerResume unpauses the paused container. + ContainerResume = "containerManager.Resume" + // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" @@ -156,12 +159,18 @@ func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { return state.Save(o, nil) } -// Pause suspends the process in a container. +// Pause suspends a container. func (cm *containerManager) Pause(_, _ *struct{}) error { cm.k.Pause() return nil } +// Resume unpauses a container. +func (cm *containerManager) Resume(_, _ *struct{}) error { + cm.k.Unpause() + return nil +} + // Wait waits for the init process in the given container. 
func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { // TODO: Use the cid and wait on the init process in that diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 8fbce294f..fffb6f359 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -20,6 +20,7 @@ go_library( "pause.go", "ps.go", "restore.go", + "resume.go", "run.go", "start.go", "state.go", diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go new file mode 100644 index 000000000..a12adf1a3 --- /dev/null +++ b/runsc/cmd/resume.go @@ -0,0 +1,68 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/container" +) + +// Resume implements subcommands.Command for the "resume" command. +type Resume struct{} + +// Name implements subcommands.Command.Name. +func (*Resume) Name() string { + return "resume" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Resume) Synopsis() string { + return "Resume unpauses a paused container" +} + +// Usage implements subcommands.Command.Usage. +func (*Resume) Usage() string { + return `resume - resume a paused container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (r *Resume) SetFlags(f *flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. 
+func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + cont, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading container: %v", err) + } + + if err := cont.Resume(); err != nil { + Fatalf("resume failed: %v", err) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/container/container.go b/runsc/container/container.go index dc7fccdee..571784e07 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -361,8 +361,23 @@ func (c *Container) Pause() error { c.Status = Paused return c.save() default: - log.Warningf("container %q not created or running, not pausing", c.ID) - return nil + return fmt.Errorf("container %q not created or running, not pausing", c.ID) + } +} + +// Resume unpauses the container and its kernel. +// The call only succeeds if the container's status is paused. +func (c *Container) Resume() error { + log.Debugf("Resuming container %q", c.ID) + switch c.Status { + case Paused: + if err := c.Sandbox.Resume(c.ID); err != nil { + return fmt.Errorf("error resuming container: %v", err) + } + c.Status = Running + return c.save() + default: + return fmt.Errorf("container %q not paused, not resuming", c.ID) } } @@ -380,7 +395,7 @@ func (c *Container) State() specs.State { // Processes retrieves the list of processes and associated metadata inside a // container. func (c *Container) Processes() ([]*control.Process, error) { - if c.Status != Running { + if c.Status != Running && c.Status != Paused { return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. 
It is in state %v", c.ID, c.Status) } return c.Sandbox.Processes(c.ID) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 5659abab3..7818990a7 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -84,6 +84,19 @@ func procListsEqual(got, want []*control.Process) bool { return true } +// getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the +// test for equality. This is because we already confirmed that exec occurred. +func getAndCheckProcLists(cont *container.Container, want []*control.Process) error { + got, err := cont.Processes() + if err != nil { + return fmt.Errorf("error getting process data from container: %v", err) + } + if procListsEqual(got, want) { + return nil + } + return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) +} + func procListToString(pl []*control.Process) string { strs := make([]string, 0, len(pl)) for _, p := range pl { @@ -459,11 +472,14 @@ func TestCheckpoint(t *testing.T) { } } -// TestPause tests that calling pause successfully pauses the container. -// It checks that no errors are returned and that the state of the container -// is in fact 'Paused.' -func TestPause(t *testing.T) { - spec := testutil.NewSpecWithArgs("sleep", "100") +// TestPauseResume tests that we can successfully pause and resume a container. +// It checks starts running sleep and executes another sleep. It pauses and checks +// that both processes are still running: sleep will be paused and still exist. +// It will then unpause and confirm that both processes are running. Then it will +// wait until one sleep completes and check to make sure the other is running. 
+func TestPauseResume(t *testing.T) { + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "20") rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { @@ -482,15 +498,139 @@ func TestPause(t *testing.T) { t.Fatalf("error starting container: %v", err) } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + } + + // First, start running exec (whick blocks). + go cont.Execute(&execArgs) + + // Verify that "sleep 5" is running. + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatal(err) + } + // Pause the running container. if err := cont.Pause(); err != nil { t.Errorf("error pausing container: %v", err) } + if got, want := cont.Status, container.Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + time.Sleep(10 * time.Second) + + // Verify that the two processes still exist. Sleep 5 is paused so + // it should still be in the process list after 10 seconds. + if err := getAndCheckProcLists(cont, expectedPL); err != nil { + t.Fatal(err) + } - // Confirm the status of the container is paused. + // Resume the running container. 
+ if err := cont.Resume(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + if err := getAndCheckProcLists(cont, expectedPL); err != nil { + t.Fatal(err) + } + + expectedPL2 := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Verify there is only one process left since we waited 10 at most seconds for + // sleep 5 to end. + if err := waitForProcessList(cont, expectedPL2); err != nil { + t.Fatal(err) + } +} + +// TestPauseResumeStatus makes sure that the statuses are set correctly +// with calls to pause and resume and that pausing and resuming only +// occurs given the correct state. +func TestPauseResumeStatus(t *testing.T) { + spec := testutil.NewSpecWithArgs("sleep", "20") + + rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Pause the running container. + if err := cont.Pause(); err != nil { + t.Errorf("error pausing container: %v", err) + } if got, want := cont.Status, container.Paused; got != want { t.Errorf("container status got %v, want %v", got, want) } + + // Try to Pause again. Should cause error. + if err := cont.Pause(); err == nil { + t.Errorf("error pausing container that was already paused: %v", err) + } + if got, want := cont.Status, container.Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Resume the running container. 
+ if err := cont.Resume(); err != nil { + t.Errorf("error resuming container: %v", err) + } + if got, want := cont.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Try to resume again. Should cause error. + if err := cont.Resume(); err == nil { + t.Errorf("error resuming container already running: %v", err) + } + if got, want := cont.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } } // TestCapabilities verifies that: diff --git a/runsc/main.go b/runsc/main.go index 42c8ee315..4d69f5803 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -76,7 +76,9 @@ func main() { subcommands.Register(new(cmd.Gofer), "") subcommands.Register(new(cmd.Kill), "") subcommands.Register(new(cmd.List), "") + subcommands.Register(new(cmd.Pause), "") subcommands.Register(new(cmd.PS), "") + subcommands.Register(new(cmd.Resume), "") subcommands.Register(new(cmd.Run), "") subcommands.Register(new(cmd.Start), "") subcommands.Register(new(cmd.State), "") diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index b008eba1e..0181dc9d4 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -477,6 +477,21 @@ func (s *Sandbox) Pause(cid string) error { return nil } +// Resume sends the resume call for a container in the sandbox. +func (s *Sandbox) Resume(cid string) error { + log.Debugf("Resume sandbox %q", s.ID) + conn, err := s.connect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ContainerResume, nil, nil); err != nil { + return fmt.Errorf("err resuming container %q: %v", cid, err) + } + return nil +} + // IsRunning returns true if the sandbox or gofer process is running. func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { -- cgit v1.2.3 From aa14a2c1be7f705927e9558f0e46ceca159e23e6 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Tue, 19 Jun 2018 16:07:08 -0700 Subject: sentry: futex S/R optimization. 
No need to save thousands of zerovalue buckets. PiperOrigin-RevId: 201258598 Change-Id: I5d3ea7b6a5345117ab4f610332d5288ca550be33 --- pkg/sentry/kernel/futex/futex.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index b3ba57a2c..15e3e5e2c 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -197,7 +197,7 @@ func bucketIndexForAddr(addr uintptr) uintptr { // Manager holds futex state for a single virtual address space. type Manager struct { - buckets [bucketCount]bucket + buckets [bucketCount]bucket `state:"zerovalue"` } // NewManager returns an initialized futex manager. -- cgit v1.2.3 From be76cad5bccd4091393e523b57960a4107101ca9 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 19 Jun 2018 16:59:25 -0700 Subject: Make KVM more scalable by removing CPU cap. Instead, CPUs will be created dynamically. We also allow a relatively efficient mechanism for stealing and notifying when a vCPU becomes available via unlock. Since the number of vCPUs is no longer fixed at machine creation time, we make the dirtySet packing more efficient. This has the pleasant side effect of cutting out the unsafe address space code. 
PiperOrigin-RevId: 201266691 Change-Id: I275c73525a4f38e3714b9ac0fd88731c26adfe66 --- pkg/sentry/platform/kvm/BUILD | 1 - pkg/sentry/platform/kvm/address_space.go | 67 +++++++++--- pkg/sentry/platform/kvm/address_space_unsafe.go | 44 -------- pkg/sentry/platform/kvm/kvm.go | 4 +- pkg/sentry/platform/kvm/machine.go | 131 ++++++++++++------------ pkg/sentry/platform/kvm/machine_amd64.go | 2 +- pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 4 +- 7 files changed, 123 insertions(+), 130 deletions(-) delete mode 100644 pkg/sentry/platform/kvm/address_space_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 135861368..673393fad 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -27,7 +27,6 @@ go_library( name = "kvm", srcs = [ "address_space.go", - "address_space_unsafe.go", "allocator.go", "bluepill.go", "bluepill_amd64.go", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index f74c98dd0..fbd11ed71 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -17,13 +17,62 @@ package kvm import ( "reflect" "sync" + "sync/atomic" + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +type vCPUBitArray [(_KVM_NR_VCPUS + 63) / 64]uint64 + +// dirtySet tracks vCPUs for invalidation. +type dirtySet struct { + vCPUs vCPUBitArray +} + +// forEach iterates over all CPUs in the dirty set. +func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { + var localSet vCPUBitArray + for index := 0; index < len(ds.vCPUs); index++ { + // Clear the dirty set, copy to the local one. 
+ localSet[index] = atomic.SwapUint64(&ds.vCPUs[index], 0) + } + + m.mu.RLock() + defer m.mu.RUnlock() + + for _, c := range m.vCPUs { + index := uint64(c.id) / 64 + bit := uint64(1) << uint(c.id%64) + + // Call the function if it was set. + if localSet[index]&bit != 0 { + fn(c) + } + } +} + +// mark marks the given vCPU as dirty and returns whether it was previously +// clean. Being previously clean implies that a flush is needed on entry. +func (ds *dirtySet) mark(c *vCPU) bool { + index := uint64(c.id) / 64 + bit := uint64(1) << uint(c.id%64) + + oldValue := atomic.LoadUint64(&ds.vCPUs[index]) + if oldValue&bit != 0 { + return false // Not clean. + } + + // Set the bit unilaterally, and ensure that a flush takes place. Note + // that it's possible for races to occur here, but since the flush is + // taking place long after these lines there's no race in practice. + atomicbitops.OrUint64(&ds.vCPUs[index], bit) + return true // Previously clean. +} + // addressSpace is a wrapper for PageTables. type addressSpace struct { platform.NoAddressSpaceIO @@ -43,10 +92,6 @@ type addressSpace struct { pageTables *pagetables.PageTables // dirtySet is the set of dirty vCPUs. - // - // These are actually vCPU pointers that are stored iff the vCPU is - // dirty. If the vCPU is not dirty and requires invalidation, then a - // nil value is stored here instead. dirtySet dirtySet // files contains files mapped in the host address space. @@ -57,11 +102,11 @@ type addressSpace struct { // invalidate is the implementation for Invalidate. func (as *addressSpace) invalidate() { - for i := 0; i < as.dirtySet.size(); i++ { - if c := as.dirtySet.swap(i, nil); c != nil && c.active.get() == as { - c.BounceToKernel() // Force a kernel transition. + as.dirtySet.forEach(as.machine, func(c *vCPU) { + if c.active.get() == as { // If this happens to be active, + c.BounceToKernel() // ... force a kernel transition. } - } + }) } // Invalidate interrupts all dirty contexts. 
@@ -75,11 +120,7 @@ func (as *addressSpace) Invalidate() { // // The return value indicates whether a flush is required. func (as *addressSpace) Touch(c *vCPU) bool { - if old := as.dirtySet.swap(c.id, c); old == nil { - return true // Flush is required. - } - // Already dirty: no flush required. - return false + return as.dirtySet.mark(c) } func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { diff --git a/pkg/sentry/platform/kvm/address_space_unsafe.go b/pkg/sentry/platform/kvm/address_space_unsafe.go deleted file mode 100644 index b6c31ce10..000000000 --- a/pkg/sentry/platform/kvm/address_space_unsafe.go +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kvm - -import ( - "sync/atomic" - "unsafe" -) - -// dirtySet tracks vCPUs for invalidation. -type dirtySet struct { - vCPUs []unsafe.Pointer -} - -// makeDirtySet makes a new dirtySet. -func makeDirtySet(size int) dirtySet { - return dirtySet{ - vCPUs: make([]unsafe.Pointer, size), - } -} - -// size is the size of the set. -func (ds *dirtySet) size() int { - return len(ds.vCPUs) -} - -// swap sets the given index and returns the previous value. -// -// The index is typically the id for a non-nil vCPU. 
-func (ds *dirtySet) swap(index int, c *vCPU) *vCPU { - return (*vCPU)(atomic.SwapPointer(&ds.vCPUs[index], unsafe.Pointer(c))) -} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 1a8e16ca0..3ed057881 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -17,7 +17,6 @@ package kvm import ( "fmt" - "runtime" "sync" "syscall" @@ -77,7 +76,7 @@ func New() (*KVM, error) { } // Create a VM context. - machine, err := newMachine(int(vm), runtime.NumCPU()) + machine, err := newMachine(int(vm)) if err != nil { return nil, err } @@ -137,7 +136,6 @@ func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan stru filemem: k.FileMem, machine: k.machine, pageTables: pageTables, - dirtySet: makeDirtySet(len(k.machine.vCPUs)), }, nil, nil } diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index f045345d5..abdc51431 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -46,16 +46,14 @@ type machine struct { mappingCache sync.Map // mu protects vCPUs. - mu sync.Mutex + mu sync.RWMutex // available is notified when vCPUs are available. available sync.Cond // vCPUs are the machine vCPUs. // - // This is eventually keyed by system TID, but is initially indexed by - // the negative vCPU id. This is merely an optimization, so while - // collisions here are not possible, it wouldn't matter anyways. + // These are populated dynamically. vCPUs map[uint64]*vCPU } @@ -117,73 +115,65 @@ type vCPU struct { vCPUArchState } +// newVCPU creates a returns a new vCPU. +// +// Precondtion: mu must be held. +func (m *machine) newVCPU() *vCPU { + id := len(m.vCPUs) + + // Create the vCPU. 
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) + if errno != 0 { + panic(fmt.Sprintf("error creating new vCPU: %v", errno)) + } + + c := &vCPU{ + id: id, + fd: int(fd), + machine: m, + } + c.CPU.Init(&m.kernel) + c.CPU.KernelSyscall = bluepillSyscall + c.CPU.KernelException = bluepillException + + // Ensure the signal mask is correct. + if err := c.setSignalMask(); err != nil { + panic(fmt.Sprintf("error setting signal mask: %v", err)) + } + + // Initialize architecture state. + if err := c.initArchState(); err != nil { + panic(fmt.Sprintf("error initialization vCPU state: %v", err)) + } + + // Map the run data. + runData, err := mapRunData(int(fd)) + if err != nil { + panic(fmt.Sprintf("error mapping run data: %v", err)) + } + c.runData = runData + + return c // Done. +} + // newMachine returns a new VM context. -func newMachine(vm int, vCPUs int) (*machine, error) { +func newMachine(vm int) (*machine, error) { // Create the machine. m := &machine{ fd: vm, vCPUs: make(map[uint64]*vCPU), } m.available.L = &m.mu - if vCPUs > _KVM_NR_VCPUS { - // Hard cap at KVM's limit. - vCPUs = _KVM_NR_VCPUS - } - if n := 2 * runtime.NumCPU(); vCPUs > n { - // Cap at twice the number of physical cores. Otherwise we're - // just wasting memory and thrashing. (There may be scheduling - // issues when you've got > n active threads.) - vCPUs = n - } m.kernel.Init(ring0.KernelOpts{ PageTables: pagetables.New(newAllocator()), }) // Initialize architecture state. - if err := m.initArchState(vCPUs); err != nil { + if err := m.initArchState(); err != nil { m.Destroy() return nil, err } - // Create all the vCPUs. - for id := 0; id < vCPUs; id++ { - // Create the vCPU. 
- fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id)) - if errno != 0 { - m.Destroy() - return nil, fmt.Errorf("error creating VCPU: %v", errno) - } - c := &vCPU{ - id: id, - fd: int(fd), - machine: m, - } - c.CPU.Init(&m.kernel) - c.CPU.KernelSyscall = bluepillSyscall - c.CPU.KernelException = bluepillException - m.vCPUs[uint64(-id)] = c // See above. - - // Ensure the signal mask is correct. - if err := c.setSignalMask(); err != nil { - m.Destroy() - return nil, err - } - - // Initialize architecture state. - if err := c.initArchState(); err != nil { - m.Destroy() - return nil, err - } - - // Map the run data. - runData, err := mapRunData(int(fd)) - if err != nil { - m.Destroy() - return nil, err - } - c.runData = runData - } - // Apply the physical mappings. Note that these mappings may point to // guest physical addresses that are not actually available. These // physical pages are mapped on demand, see kernel_unsafe.go. @@ -298,15 +288,20 @@ func (m *machine) Destroy() { func (m *machine) Get() *vCPU { runtime.LockOSThread() tid := procid.Current() - m.mu.Lock() + m.mu.RLock() // Check for an exact match. if c := m.vCPUs[tid]; c != nil { c.lock() - m.mu.Unlock() + m.mu.RUnlock() return c } + // The happy path failed. We now proceed to acquire an exclusive lock + // (because the vCPU map may change), and scan all available vCPUs. + m.mu.RUnlock() + m.mu.Lock() + for { // Scan for an available vCPU. for origTID, c := range m.vCPUs { @@ -314,16 +309,21 @@ func (m *machine) Get() *vCPU { delete(m.vCPUs, origTID) m.vCPUs[tid] = c m.mu.Unlock() - - // We need to reload thread-local segments as - // we have origTID != tid and the vCPU state - // may be stale. - c.loadSegments() - atomic.StoreUint64(&c.tid, tid) + c.loadSegments(tid) return c } } + // Create a new vCPU (maybe). 
+ if len(m.vCPUs) < _KVM_NR_VCPUS { + c := m.newVCPU() + c.lock() + m.vCPUs[tid] = c + m.mu.Unlock() + c.loadSegments(tid) + return c + } + // Scan for something not in user mode. for origTID, c := range m.vCPUs { if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) { @@ -346,10 +346,7 @@ func (m *machine) Get() *vCPU { delete(m.vCPUs, origTID) m.vCPUs[tid] = c m.mu.Unlock() - - // See above. - c.loadSegments() - atomic.StoreUint64(&c.tid, tid) + c.loadSegments(tid) return c } diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 52896eefe..9af4f3f3d 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -29,7 +29,7 @@ import ( ) // initArchState initializes architecture-specific state. -func (m *machine) initArchState(vCPUs int) error { +func (m *machine) initArchState() error { // Set the legacy TSS address. This address is covered by the reserved // range (up to 4GB). In fact, this is a main reason it exists. if _, _, errno := syscall.RawSyscall( diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index c2bcb3a47..8b9041f13 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -18,6 +18,7 @@ package kvm import ( "fmt" + "sync/atomic" "syscall" "unsafe" @@ -54,7 +55,7 @@ func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) s // This may be called from within the signal context and throws on error. // //go:nosplit -func (c *vCPU) loadSegments() { +func (c *vCPU) loadSegments(tid uint64) { if _, _, errno := syscall.RawSyscall( syscall.SYS_ARCH_PRCTL, linux.ARCH_GET_FS, @@ -69,6 +70,7 @@ func (c *vCPU) loadSegments() { 0); errno != 0 { throw("getting GS segment") } + atomic.StoreUint64(&c.tid, tid) } // setUserRegisters sets user registers in the vCPU. 
-- cgit v1.2.3 From db66e383c33228c43efbe16ad3b14ae9833879dc Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 19 Jun 2018 17:28:19 -0700 Subject: Epsocket has incorrect recv(2) behavior after SHUT_RD. After shutdown(SHUT_RD) calls to recv /w MSG_DONTWAIT or with O_NONBLOCK should result in a EAGAIN and not 0. Blocking sockets should return 0 as they would have otherwise blocked indefinitely. PiperOrigin-RevId: 201271123 Change-Id: If589b69c17fa5b9ff05bcf9e44024da9588c8876 --- pkg/sentry/socket/epsocket/epsocket.go | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 9ff9af0bc..a1bb265c0 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -952,6 +952,12 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags senderRequested = false } n, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + + if err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { + // In this situation we should return EAGAIN. + return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + } + if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return } -- cgit v1.2.3 From 4e9f0e91d724b547e1ecaeeb210017f4c0b3fd0d Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Wed, 20 Jun 2018 11:01:32 -0700 Subject: sentry: pending signals S/R optimization. Almost all of the hundreds of pending signal queues are empty upon save. 
PiperOrigin-RevId: 201380318 Change-Id: I40747072435299de681d646e0862efac0637e172 --- pkg/sentry/kernel/BUILD | 8 +++++-- pkg/sentry/kernel/pending_signals.go | 4 ++-- pkg/sentry/kernel/pending_signals_state.go | 37 ++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 pkg/sentry/kernel/pending_signals_state.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 377c94e4c..b2a55ddff 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -13,7 +13,7 @@ go_stateify( "ipc_namespace.go", "kernel.go", "pending_signals.go", - "pending_signals_list.go", + "pending_signals_state.go", "process_group_list.go", "ptrace.go", "rseq.go", @@ -46,7 +46,10 @@ go_stateify( "version.go", ], out = "kernel_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"], + imports = [ + "gvisor.googlesource.com/gvisor/pkg/sentry/arch", + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", + ], package = "kernel", ) @@ -117,6 +120,7 @@ go_library( "kernel_state.go", "pending_signals.go", "pending_signals_list.go", + "pending_signals_state.go", "process_group_list.go", "ptrace.go", "rseq.go", diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index d8701f47a..5dc0f266c 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -44,11 +44,11 @@ type pendingSignals struct { // Note that signals is zero-indexed, but signal 1 is the first valid // signal, so signals[0] contains signals with signo 1 etc. This offset is // usually handled by using Signal.index(). - signals [linux.SignalMaximum]pendingSignalQueue + signals [linux.SignalMaximum]pendingSignalQueue `state:".([]*arch.SignalInfo)"` // Bit i of pendingSet is set iff there is at least one signal with signo // i+1 pending. 
- pendingSet linux.SignalSet + pendingSet linux.SignalSet `state:"manual"` } // pendingSignalQueue holds a pendingSignalList for a single signal number. diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go new file mode 100644 index 000000000..af61f6e8e --- /dev/null +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +// saveSignals is invoked by stateify. +func (p *pendingSignals) saveSignals() []*arch.SignalInfo { + var pending []*arch.SignalInfo + for _, q := range p.signals { + for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { + pending = append(pending, ps.SignalInfo) + } + } + return pending +} + +// loadSignals is invoked by stateify. +func (p *pendingSignals) loadSignals(pending []*arch.SignalInfo) { + for _, si := range pending { + p.enqueue(si) + } +} -- cgit v1.2.3 From d93f55e863c598de9126a0316a813f872b11e29f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 20 Jun 2018 13:05:00 -0700 Subject: Remove some defers in hot paths in the filesystem code. 
PiperOrigin-RevId: 201401727 Change-Id: Ia5589882ba58a00efb522ab372e206b7e8e62aee --- pkg/sentry/fs/fsutil/inode_cached.go | 89 +++++++++++++++++++---------- pkg/sentry/fs/inode_overlay.go | 105 ++++++++++++++++++++++++----------- pkg/sentry/fs/ramfs/ramfs.go | 52 +++++++++-------- 3 files changed, 160 insertions(+), 86 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 484668735..7c0f96ac2 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -179,8 +179,9 @@ func (c *CachingInodeOperations) Release() { // UnstableAttr implements fs.InodeOperations.UnstableAttr. func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { c.attrMu.Lock() - defer c.attrMu.Unlock() - return c.attr, nil + attr := c.attr + c.attrMu.Unlock() + return attr, nil } // SetPermissions implements fs.InodeOperations.SetPermissions. @@ -463,15 +464,17 @@ func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst us // // If Write partially fills src, a non-nil error is returned. func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // Hot path. Avoid defers. if src.NumBytes() == 0 { return 0, nil } c.attrMu.Lock() - defer c.attrMu.Unlock() // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). c.touchModificationTimeLocked(ctx) - return src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) + n, err := src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) + c.attrMu.Unlock() + return n, err } type inodeReadWriter struct { @@ -482,15 +485,17 @@ type inodeReadWriter struct { // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + // Hot path. Avoid defers. rw.c.dataMu.RLock() - defer rw.c.dataMu.RUnlock() // Compute the range to read. 
if rw.offset >= rw.c.attr.Size { + rw.c.dataMu.RUnlock() return 0, io.EOF } end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size) if end == rw.offset { // dsts.NumBytes() == 0? + rw.c.dataMu.RUnlock() return 0, nil } @@ -504,6 +509,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { // Get internal mappings from the cache. ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) if err != nil { + rw.c.dataMu.RUnlock() return done, err } @@ -513,6 +519,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { rw.offset += int64(n) dsts = dsts.DropFirst64(n) if err != nil { + rw.c.dataMu.RUnlock() return done, err } @@ -529,6 +536,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { dsts = dsts.DropFirst64(n) // Partial reads are fine. But we must stop reading. if n != dst.NumBytes() || err != nil { + rw.c.dataMu.RUnlock() return done, err } @@ -539,38 +547,44 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { break } } + rw.c.dataMu.RUnlock() return done, nil } +// maybeGrowFile grows the file's size if data has been written past the old +// size. +// +// Preconditions: rw.c.attrMu and rw.c.dataMu bust be locked. +func (rw *inodeReadWriter) maybeGrowFile() { + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.offset > rw.c.attr.Size { + rw.c.attr.Size = rw.offset + rw.c.dirtyAttr.Size = true + } + if rw.offset > rw.c.attr.Usage { + // This is incorrect if CachingInodeOperations is caching a sparse + // file. (In Linux, keeping inode::i_blocks up to date is the + // filesystem's responsibility.) + rw.c.attr.Usage = rw.offset + rw.c.dirtyAttr.Usage = true + } +} + // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // // Preconditions: rw.c.attrMu must be locked. 
func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + // Hot path. Avoid defers. rw.c.dataMu.Lock() - defer rw.c.dataMu.Unlock() // Compute the range to write. end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) if end == rw.offset { // srcs.NumBytes() == 0? + rw.c.dataMu.Unlock() return 0, nil } - defer func() { - // If the write ends beyond the file's previous size, it causes the - // file to grow. - if rw.offset > rw.c.attr.Size { - rw.c.attr.Size = rw.offset - rw.c.dirtyAttr.Size = true - } - if rw.offset > rw.c.attr.Usage { - // This is incorrect if CachingInodeOperations is caching a sparse - // file. (In Linux, keeping inode::i_blocks up to date is the - // filesystem's responsibility.) - rw.c.attr.Usage = rw.offset - rw.c.dirtyAttr.Usage = true - } - }() - mem := rw.c.platform.Memory() var done uint64 seg, gap := rw.c.cache.Find(uint64(rw.offset)) @@ -582,6 +596,8 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error segMR := seg.Range().Intersect(mr) ims, err := mem.MapInternal(seg.FileRangeOf(segMR), usermem.Write) if err != nil { + rw.maybeGrowFile() + rw.c.dataMu.Unlock() return done, err } @@ -592,6 +608,8 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error srcs = srcs.DropFirst64(n) rw.c.dirty.MarkDirty(segMR) if err != nil { + rw.maybeGrowFile() + rw.c.dataMu.Unlock() return done, err } @@ -608,6 +626,8 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error srcs = srcs.DropFirst64(n) // Partial writes are fine. But we must stop writing. if n != src.NumBytes() || err != nil { + rw.maybeGrowFile() + rw.c.dataMu.Unlock() return done, err } @@ -618,13 +638,15 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error break } } + rw.maybeGrowFile() + rw.c.dataMu.Unlock() return done, nil } // AddMapping implements memmap.Mappable.AddMapping. 
func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + // Hot path. Avoid defers. c.mapsMu.Lock() - defer c.mapsMu.Unlock() mapped := c.mappings.AddMapping(ms, ar, offset) // Do this unconditionally since whether we have c.backingFile.FD() >= 0 // can change across save/restore. @@ -636,13 +658,14 @@ func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.Mappi usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) } } + c.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + // Hot path. Avoid defers. c.mapsMu.Lock() - defer c.mapsMu.Unlock() unmapped := c.mappings.RemoveMapping(ms, ar, offset) for _, r := range unmapped { c.hostFileMapper.DecRefOn(r) @@ -653,6 +676,7 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) } } + c.mapsMu.Unlock() return } @@ -661,7 +685,6 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma // strategy. mem := c.platform.Memory() c.dataMu.Lock() - defer c.dataMu.Unlock() for _, r := range unmapped { if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", r, err) @@ -669,6 +692,8 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma c.cache.Drop(r, mem) c.dirty.KeepClean(r) } + c.dataMu.Unlock() + c.mapsMu.Unlock() } // CopyMapping implements memmap.Mappable.CopyMapping. @@ -678,6 +703,7 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp // Translate implements memmap.Mappable.Translate. 
func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + // Hot path. Avoid defer. if !c.forcePageCache && c.backingFile.FD() >= 0 { return []memmap.Translation{ { @@ -689,7 +715,6 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option } c.dataMu.Lock() - defer c.dataMu.Unlock() // Constrain translations to c.attr.Size (rounded up) to prevent // translation to pages that may be concurrently truncated. @@ -697,6 +722,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option var beyondEOF bool if required.End > pgend { if required.Start >= pgend { + c.dataMu.Unlock() return nil, &memmap.BusError{io.EOF} } beyondEOF = true @@ -726,6 +752,8 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option translatedEnd = segMR.End } + c.dataMu.Unlock() + // Don't return the error returned by c.cache.Fill if it occurred outside // of required. if translatedEnd < required.End && cerr != nil { @@ -797,9 +825,8 @@ func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.A // underlying host fd and CachingInodeOperations is used as the platform.File // during translation. func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { + // Hot path. Avoid defers. c.dataMu.Lock() - defer c.dataMu.Unlock() - seg, gap := c.refs.Find(fr.Start) for { switch { @@ -815,6 +842,7 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() default: c.refs.MergeAdjacent(fr) + c.dataMu.Unlock() return } } @@ -824,9 +852,8 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { // underlying host fd and CachingInodeOperations is used as the platform.File // during translation. func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { + // Hot path. Avoid defers. 
c.dataMu.Lock() - defer c.dataMu.Unlock() - seg := c.refs.FindSegment(fr.Start) for seg.Ok() && seg.Start() < fr.End { @@ -842,4 +869,6 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { } } c.refs.MergeAdjacent(fr) + c.dataMu.Unlock() + } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 343150bb8..53fbd1481 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -34,20 +34,23 @@ func overlayCreateWhiteout(parent *Inode, name string) error { } func overlayWriteOut(ctx context.Context, o *overlayEntry) error { + // Hot path. Avoid defers. + var err error o.copyMu.RLock() - defer o.copyMu.RUnlock() - if o.upper == nil { - return nil + if o.upper != nil { + err = o.upper.InodeOperations.WriteOut(ctx, o.upper) } - return o.upper.InodeOperations.WriteOut(ctx, o.upper) + o.copyMu.RUnlock() + return err } func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name string) (*Dirent, error) { + // Hot path. Avoid defers. parent.copyMu.RLock() - defer parent.copyMu.RUnlock() // Assert that there is at least one upper or lower entry. if parent.upper == nil && parent.lower == nil { + parent.copyMu.RUnlock() panic("invalid overlayEntry, needs at least one Inode") } @@ -63,30 +66,33 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if err != nil && err != syserror.ENOENT { // We encountered an error that an overlay cannot handle, // we must propagate it to the caller. + parent.copyMu.RUnlock() return nil, err } if child != nil { - defer child.DecRef() - - // Is the child non-negative? if !child.IsNegative() { upperInode = child.Inode upperInode.IncRef() } + child.DecRef() } // Are we done? if overlayHasWhiteout(parent.upper, name) { if upperInode == nil { + parent.copyMu.RUnlock() return NewNegativeDirent(name), nil } entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { // Don't leak resources. 
upperInode.DecRef() + parent.copyMu.RUnlock() return nil, err } - return NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil + d, err := NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil + parent.copyMu.RUnlock() + return d, err } } @@ -103,12 +109,10 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if upperInode != nil { upperInode.DecRef() } + parent.copyMu.RUnlock() return nil, err } if child != nil { - defer child.DecRef() - - // Is the child negative? if !child.IsNegative() { // Did we find something in the upper filesystem? We can // only use it if the types match. @@ -117,12 +121,14 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name lowerInode.IncRef() } } + child.DecRef() } } // Was all of this for naught? if upperInode == nil && lowerInode == nil { // Return a negative Dirent indicating that nothing was found. + parent.copyMu.RUnlock() return NewNegativeDirent(name), nil } @@ -157,9 +163,12 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if lowerInode != nil { lowerInode.DecRef() } + parent.copyMu.RUnlock() return nil, err } - return NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil + d, err := NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil + parent.copyMu.RUnlock() + return d, err } func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name string, flags FileFlags, perm FilePermissions) (*File, error) { @@ -349,6 +358,7 @@ func overlayBoundEndpoint(o *overlayEntry, path string) unix.BoundEndpoint { } func overlayGetFile(ctx context.Context, o *overlayEntry, d *Dirent, flags FileFlags) (*File, error) { + // Hot path. Avoid defers. 
if flags.Write { if err := copyUp(ctx, d); err != nil { return nil, err @@ -356,48 +366,69 @@ func overlayGetFile(ctx context.Context, o *overlayEntry, d *Dirent, flags FileF } o.copyMu.RLock() - defer o.copyMu.RUnlock() if o.upper != nil { upper, err := overlayFile(ctx, o.upper, flags) if err != nil { + o.copyMu.RUnlock() return nil, err } flags.Pread = upper.Flags().Pread flags.Pwrite = upper.Flags().Pwrite - return NewFile(ctx, d, flags, &overlayFileOperations{upper: upper}), nil + f, err := NewFile(ctx, d, flags, &overlayFileOperations{upper: upper}), nil + o.copyMu.RUnlock() + return f, err } lower, err := overlayFile(ctx, o.lower, flags) if err != nil { + o.copyMu.RUnlock() return nil, err } flags.Pread = lower.Flags().Pread flags.Pwrite = lower.Flags().Pwrite + o.copyMu.RUnlock() return NewFile(ctx, d, flags, &overlayFileOperations{lower: lower}), nil } func overlayUnstableAttr(ctx context.Context, o *overlayEntry) (UnstableAttr, error) { + // Hot path. Avoid defers. + var ( + attr UnstableAttr + err error + ) o.copyMu.RLock() - defer o.copyMu.RUnlock() if o.upper != nil { - return o.upper.UnstableAttr(ctx) + attr, err = o.upper.UnstableAttr(ctx) + } else { + attr, err = o.lower.UnstableAttr(ctx) } - return o.lower.UnstableAttr(ctx) + o.copyMu.RUnlock() + return attr, err } func overlayGetxattr(o *overlayEntry, name string) ([]byte, error) { + // Hot path. This is how the overlay checks for whiteout files. + // Avoid defers. + var ( + b []byte + err error + ) + // Don't forward the value of the extended attribute if it would // unexpectedly change the behavior of a wrapping overlay layer. 
if strings.HasPrefix(XattrOverlayPrefix, name) { return nil, syserror.ENODATA } + o.copyMu.RLock() - defer o.copyMu.RUnlock() if o.upper != nil { - return o.upper.Getxattr(name) + b, err = o.upper.Getxattr(name) + } else { + b, err = o.lower.Getxattr(name) } - return o.lower.Getxattr(name) + o.copyMu.RUnlock() + return b, err } func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) { @@ -422,17 +453,21 @@ func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) { func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { o.copyMu.RLock() - defer o.copyMu.RUnlock() + // Hot path. Avoid defers. + var err error if o.upper != nil { - return o.upper.check(ctx, p) - } - if p.Write { - // Since writes will be redirected to the upper filesystem, the lower - // filesystem need not be writable, but must be readable for copy-up. - p.Write = false - p.Read = true + err = o.upper.check(ctx, p) + } else { + if p.Write { + // Since writes will be redirected to the upper filesystem, the lower + // filesystem need not be writable, but must be readable for copy-up. + p.Write = false + p.Read = true + } + err = o.lower.check(ctx, p) } - return o.lower.check(ctx, p) + o.copyMu.RUnlock() + return err } func overlaySetPermissions(ctx context.Context, o *overlayEntry, d *Dirent, f FilePermissions) bool { @@ -520,12 +555,16 @@ func overlayStatFS(ctx context.Context, o *overlayEntry) (Info, error) { } func overlayHandleOps(o *overlayEntry) HandleOperations { + // Hot path. Avoid defers. + var hops HandleOperations o.copyMu.RLock() - defer o.copyMu.RUnlock() if o.upper != nil { - return o.upper.HandleOps() + hops = o.upper.HandleOps() + } else { + hops = o.lower.HandleOps() } - return o.lower.HandleOps() + o.copyMu.RUnlock() + return hops } // NewTestOverlayDir returns an overlay Inode for tests. 
diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index 04f2d38de..90b6c9a4f 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -95,8 +95,9 @@ func (e *Entry) InitEntryWithAttr(ctx context.Context, uattr fs.UnstableAttr) { // UnstableAttr implements fs.InodeOperations.UnstableAttr. func (e *Entry) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { e.mu.Lock() - defer e.mu.Unlock() - return e.unstable, nil + attr := e.unstable + e.mu.Unlock() + return attr, nil } // Check implements fs.InodeOperations.Check. @@ -106,9 +107,11 @@ func (*Entry) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { // Getxattr implements fs.InodeOperations.Getxattr. func (e *Entry) Getxattr(inode *fs.Inode, name string) ([]byte, error) { + // Hot path. Avoid defers. e.mu.Lock() - defer e.mu.Unlock() - if value, ok := e.xattrs[name]; ok { + value, ok := e.xattrs[name] + e.mu.Unlock() + if ok { return value, nil } return nil, syserror.ENOATTR @@ -117,19 +120,19 @@ func (e *Entry) Getxattr(inode *fs.Inode, name string) ([]byte, error) { // Setxattr implements fs.InodeOperations.Setxattr. func (e *Entry) Setxattr(inode *fs.Inode, name string, value []byte) error { e.mu.Lock() - defer e.mu.Unlock() e.xattrs[name] = value + e.mu.Unlock() return nil } // Listxattr implements fs.InodeOperations.Listxattr. func (e *Entry) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { e.mu.Lock() - defer e.mu.Unlock() names := make(map[string]struct{}, len(e.xattrs)) for name := range e.xattrs { names[name] = struct{}{} } + e.mu.Unlock() return names, nil } @@ -141,22 +144,22 @@ func (*Entry) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*f // SetPermissions always sets the permissions. 
func (e *Entry) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { e.mu.Lock() - defer e.mu.Unlock() e.unstable.Perms = p e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) + e.mu.Unlock() return true } // SetOwner always sets ownership. func (e *Entry) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { e.mu.Lock() - defer e.mu.Unlock() if owner.UID.Ok() { e.unstable.Owner.UID = owner.UID } if owner.GID.Ok() { e.unstable.Owner.GID = owner.GID } + e.mu.Unlock() return nil } @@ -167,8 +170,6 @@ func (e *Entry) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSp } e.mu.Lock() - defer e.mu.Unlock() - now := ktime.NowFromContext(ctx) if !ts.ATimeOmit { if ts.ATimeSetSystemTime { @@ -185,59 +186,64 @@ func (e *Entry) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSp } } e.unstable.StatusChangeTime = now + e.mu.Unlock() return nil } // NotifyStatusChange updates the status change time (ctime). func (e *Entry) NotifyStatusChange(ctx context.Context) { e.mu.Lock() - defer e.mu.Unlock() e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) + e.mu.Unlock() } // StatusChangeTime returns the last status change time for this node. func (e *Entry) StatusChangeTime() ktime.Time { e.mu.Lock() - defer e.mu.Unlock() - return e.unstable.StatusChangeTime + t := e.unstable.StatusChangeTime + e.mu.Unlock() + return t } // NotifyModification updates the modification time and the status change time. func (e *Entry) NotifyModification(ctx context.Context) { e.mu.Lock() - defer e.mu.Unlock() now := ktime.NowFromContext(ctx) e.unstable.ModificationTime = now e.unstable.StatusChangeTime = now + e.mu.Unlock() } // ModificationTime returns the last modification time for this node. func (e *Entry) ModificationTime() ktime.Time { e.mu.Lock() - defer e.mu.Unlock() - return e.unstable.ModificationTime + t := e.unstable.ModificationTime + e.mu.Unlock() + return t } // NotifyAccess updates the access time. 
func (e *Entry) NotifyAccess(ctx context.Context) { e.mu.Lock() - defer e.mu.Unlock() now := ktime.NowFromContext(ctx) e.unstable.AccessTime = now + e.mu.Unlock() } // AccessTime returns the last access time for this node. func (e *Entry) AccessTime() ktime.Time { e.mu.Lock() - defer e.mu.Unlock() - return e.unstable.AccessTime + t := e.unstable.AccessTime + e.mu.Unlock() + return t } // Permissions returns permissions on this entry. func (e *Entry) Permissions() fs.FilePermissions { e.mu.Lock() - defer e.mu.Unlock() - return e.unstable.Perms + p := e.unstable.Perms + e.mu.Unlock() + return p } // Lookup is not supported by default. @@ -379,15 +385,15 @@ func (e *Entry) Release(context.Context) {} // AddLink implements InodeOperationss.AddLink. func (e *Entry) AddLink() { e.mu.Lock() - defer e.mu.Unlock() e.unstable.Links++ + e.mu.Unlock() } // DropLink implements InodeOperationss.DropLink. func (e *Entry) DropLink() { e.mu.Lock() - defer e.mu.Unlock() e.unstable.Links-- + e.mu.Unlock() } // DeprecatedReaddir is not supported by default. -- cgit v1.2.3 From d571a4359cebbcf8a9b201bb125f1cdc9fb126e4 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 21 Jun 2018 10:52:33 -0700 Subject: Implement ioctl(FIOASYNC) FIOASYNC and friends are used to send signals when a file is ready for IO. This may or may not be needed by Nginx. While Nginx does use it, it is unclear if the code that uses it has any effect. 
PiperOrigin-RevId: 201550828 Change-Id: I7ba05a7db4eb2dfffde11e9bd9a35b65b98d7f50 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/fcntl.go | 29 +++++++ pkg/abi/linux/ioctl.go | 5 ++ pkg/abi/linux/signal.go | 34 ++++++++ pkg/sentry/fs/file.go | 62 +++++++++++++-- pkg/sentry/fs/file_state.go | 10 --- pkg/sentry/fs/flags.go | 7 ++ pkg/sentry/kernel/fasync/BUILD | 18 +++++ pkg/sentry/kernel/fasync/fasync.go | 145 ++++++++++++++++++++++++++++++++++ pkg/sentry/kernel/sessions.go | 5 ++ pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/flags.go | 5 ++ pkg/sentry/syscalls/linux/sys_file.go | 78 ++++++++++++++++-- 13 files changed, 376 insertions(+), 24 deletions(-) create mode 100644 pkg/abi/linux/fcntl.go create mode 100644 pkg/sentry/kernel/fasync/BUILD create mode 100644 pkg/sentry/kernel/fasync/fasync.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 693ce0fdd..5d00b66cc 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -31,6 +31,7 @@ go_library( "elf.go", "errors.go", "exec.go", + "fcntl.go", "file.go", "fs.go", "futex.go", diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go new file mode 100644 index 000000000..f5dbe5199 --- /dev/null +++ b/pkg/abi/linux/fcntl.go @@ -0,0 +1,29 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Commands from linux/fcntl.h. 
+const ( + F_DUPFD = 0 + F_DUPFD_CLOEXEC = 1030 + F_GETFD = 1 + F_GETFL = 3 + F_GETOWN = 9 + F_SETFD = 2 + F_SETFL = 4 + F_SETLK = 6 + F_SETLKW = 7 + F_SETOWN = 8 +) diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 35cefbdfc..3ef046562 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -29,6 +29,11 @@ const ( TIOCSPTLCK = 0x40045431 FIONCLEX = 0x00005450 FIOCLEX = 0x00005451 + FIOASYNC = 0x00005452 + FIOSETOWN = 0x00008901 + SIOCSPGRP = 0x00008902 + FIOGETOWN = 0x00008903 + SIOCGPGRP = 0x00008904 ) // ioctl(2) requests provided by uapi/linux/android/binder.h diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index cd09008b5..fed2a159f 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -175,3 +175,37 @@ const ( SA_NOMASK = SA_NODEFER SA_ONESHOT = SA_RESTARTHAND ) + +// Signal info types. +const ( + SI_MASK = 0xffff0000 + SI_KILL = 0 << 16 + SI_TIMER = 1 << 16 + SI_POLL = 2 << 16 + SI_FAULT = 3 << 16 + SI_CHLD = 4 << 16 + SI_RT = 5 << 16 + SI_MESGQ = 6 << 16 + SI_SYS = 7 << 16 +) + +// SIGPOLL si_codes. +const ( + // POLL_IN indicates that data input available. + POLL_IN = SI_POLL | 1 + + // POLL_OUT indicates that output buffers available. + POLL_OUT = SI_POLL | 2 + + // POLL_MSG indicates that an input message available. + POLL_MSG = SI_POLL | 3 + + // POLL_ERR indicates that there was an i/o error. + POLL_ERR = SI_POLL | 4 + + // POLL_PRI indicates that a high priority input available. + POLL_PRI = SI_POLL | 5 + + // POLL_HUP indicates that a device disconnected. + POLL_HUP = SI_POLL | 6 +) diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index f2683bbd2..6d93ef760 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -16,6 +16,7 @@ package fs import ( "math" + "sync" "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/amutex" @@ -72,9 +73,15 @@ type File struct { // other files via the Dirent cache. Dirent *Dirent + // flagsMu protects flags and async below. 
+ flagsMu sync.Mutex `state:"nosave"` + // flags are the File's flags. Setting or getting flags is fully atomic // and is not protected by mu (below). - flags atomic.Value `state:".(FileFlags)"` + flags FileFlags + + // async handles O_ASYNC notifications. + async FileAsync // mu is dual-purpose: first, to make read(2) and write(2) thread-safe // in conformity with POSIX, and second, to cancel operations before they @@ -99,8 +106,8 @@ func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOper UniqueID: uniqueid.GlobalFromContext(ctx), Dirent: dirent, FileOperations: fops, + flags: flags, } - f.flags.Store(flags) f.mu.Init() return f } @@ -117,22 +124,40 @@ func (f *File) DecRef() { // Release a reference on the Dirent. f.Dirent.DecRef() + + f.flagsMu.Lock() + if f.flags.Async && f.async != nil { + f.async.Unregister(f) + } + f.flagsMu.Unlock() }) } // Flags atomically loads the File's flags. func (f *File) Flags() FileFlags { - return f.flags.Load().(FileFlags) + f.flagsMu.Lock() + flags := f.flags + f.flagsMu.Unlock() + return flags } // SetFlags atomically changes the File's flags to the values contained // in newFlags. See SettableFileFlags for values that can be set. func (f *File) SetFlags(newFlags SettableFileFlags) { - flags := f.flags.Load().(FileFlags) - flags.Direct = newFlags.Direct - flags.NonBlocking = newFlags.NonBlocking - flags.Append = newFlags.Append - f.flags.Store(flags) + f.flagsMu.Lock() + f.flags.Direct = newFlags.Direct + f.flags.NonBlocking = newFlags.NonBlocking + f.flags.Append = newFlags.Append + if f.async != nil { + if newFlags.Async && !f.flags.Async { + f.async.Register(f) + } + if !newFlags.Async && f.flags.Async { + f.async.Unregister(f) + } + } + f.flags.Async = newFlags.Async + f.flagsMu.Unlock() } // Offset atomically loads the File's offset. 
@@ -361,6 +386,27 @@ func (f *File) Msync(ctx context.Context, mr memmap.MappableRange) error { return f.Fsync(ctx, int64(mr.Start), int64(mr.End-1), SyncData) } +// A FileAsync sends signals to its owner when w is ready for IO. +type FileAsync interface { + Register(w waiter.Waitable) + Unregister(w waiter.Waitable) +} + +// Async gets the stored FileAsync or creates a new one with the supplied +// function. If the supplied function is nil, no FileAsync is created and the +// current value is returned. +func (f *File) Async(newAsync func() FileAsync) FileAsync { + f.flagsMu.Lock() + defer f.flagsMu.Unlock() + if f.async == nil && newAsync != nil { + f.async = newAsync() + if f.flags.Async { + f.async.Register(f) + } + } + return f.async +} + // FileReader implements io.Reader and io.ReaderAt. type FileReader struct { // Ctx is the context for the file reader. diff --git a/pkg/sentry/fs/file_state.go b/pkg/sentry/fs/file_state.go index 341cbda0b..3384737ab 100644 --- a/pkg/sentry/fs/file_state.go +++ b/pkg/sentry/fs/file_state.go @@ -18,13 +18,3 @@ package fs func (f *File) afterLoad() { f.mu.Init() } - -// saveFlags is invoked by stateify. -func (f *File) saveFlags() FileFlags { - return f.flags.Load().(FileFlags) -} - -// loadFlags is invoked by stateify. -func (f *File) loadFlags(flags FileFlags) { - f.flags.Store(flags) -} diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index dfa6a3d62..7a8eefd02 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -42,6 +42,9 @@ type FileFlags struct { // Directory indicates that this file must be a directory. Directory bool + + // Async indicates that this file sends signals on IO events. + Async bool } // SettableFileFlags is a subset of FileFlags above that can be changed @@ -55,6 +58,9 @@ type SettableFileFlags struct { // Append indicates this file is append only. Append bool + + // Async indicates that this file sends signals on IO events. 
+ Async bool } // Settable returns the subset of f that are settable. @@ -63,5 +69,6 @@ func (f FileFlags) Settable() SettableFileFlags { Direct: f.Direct, NonBlocking: f.NonBlocking, Append: f.Append, + Async: f.Async, } } diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD new file mode 100644 index 000000000..8d06e1182 --- /dev/null +++ b/pkg/sentry/kernel/fasync/BUILD @@ -0,0 +1,18 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "fasync", + srcs = ["fasync.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/fasync", + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go new file mode 100644 index 000000000..028d6766f --- /dev/null +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -0,0 +1,145 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fasync provides FIOASYNC related functionality. 
+package fasync + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// New creates a new FileAsync. +func New() fs.FileAsync { + return &FileAsync{} +} + +// FileAsync sends signals when the registered file is ready for IO. +type FileAsync struct { + mu sync.Mutex + e waiter.Entry + requester auth.Credentials + + // Only one of the following is allowed to be non-nil. + recipientPG *kernel.ProcessGroup + recipientTG *kernel.ThreadGroup + recipientT *kernel.Task +} + +// Callback sends a signal. +func (a *FileAsync) Callback(e *waiter.Entry) { + a.mu.Lock() + if a.e.Callback == nil { + return + } + t := a.recipientT + tg := a.recipientTG + if a.recipientPG != nil { + tg = a.recipientPG.Originator() + } + if tg != nil { + t = tg.Leader() + } + c := t.Credentials() + // Logic from sigio_perm in fs/fcntl.c. + if a.requester.EffectiveKUID == 0 || + a.requester.EffectiveKUID == c.SavedKUID || + a.requester.EffectiveKUID == c.RealKUID || + a.requester.RealKUID == c.SavedKUID || + a.requester.RealKUID == c.RealKUID { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGIO), + // SEND_SIG_PRIV + Code: arch.SignalInfoKernel, + }) + } + a.mu.Unlock() +} + +// Register sets the file which will be monitored for IO events. +// +// The file must not be currently registered. +func (a *FileAsync) Register(w waiter.Waitable) { + a.mu.Lock() + defer a.mu.Unlock() + + if a.e.Callback != nil { + panic("registering already registered file") + } + + a.e.Callback = a + w.EventRegister(&a.e, waiter.EventIn|waiter.EventOut|waiter.EventErr|waiter.EventHUp) +} + +// Unregister stops monitoring a file. +// +// The file must be currently registered. 
+func (a *FileAsync) Unregister(w waiter.Waitable) { + a.mu.Lock() + defer a.mu.Unlock() + + if a.e.Callback == nil { + panic("unregistering unregistered file") + } + + w.EventUnregister(&a.e) + a.e.Callback = nil +} + +// Owner returns who is currently getting signals. All return values will be +// nil if no one is set to receive signals. +func (a *FileAsync) Owner() (*kernel.Task, *kernel.ThreadGroup, *kernel.ProcessGroup) { + a.mu.Lock() + defer a.mu.Unlock() + return a.recipientT, a.recipientTG, a.recipientPG +} + +// SetOwnerTask sets the owner (who will receive signals) to a specified task. +// Only this owner will receive signals. +func (a *FileAsync) SetOwnerTask(requester *kernel.Task, recipient *kernel.Task) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = recipient + a.recipientTG = nil + a.recipientPG = nil +} + +// SetOwnerThreadGroup sets the owner (who will receive signals) to a specified +// thread group. Only this owner will receive signals. +func (a *FileAsync) SetOwnerThreadGroup(requester *kernel.Task, recipient *kernel.ThreadGroup) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = nil + a.recipientTG = recipient + a.recipientPG = nil +} + +// SetOwnerProcessGroup sets the owner (who will receive signals) to a +// specified process group. Only this owner will receive signals. +func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kernel.ProcessGroup) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = recipient +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 53d8fb844..fa4c7b8f6 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -110,6 +110,11 @@ type ProcessGroup struct { processGroupEntry } +// Originator returns the originator of the process group. 
+func (pg *ProcessGroup) Originator() *ThreadGroup { + return pg.originator +} + // incRefWithParent grabs a reference. // // This function is called when this ProcessGroup is being associated with some diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 7cfd37fb1..d3f3cc459 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -82,6 +82,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/eventfd", + "//pkg/sentry/kernel/fasync", "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index 82bfd7c2a..3d39a20f4 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -61,6 +61,9 @@ func flagsToLinux(flags fs.FileFlags) (mask uint) { if flags.Directory { mask |= syscall.O_DIRECTORY } + if flags.Async { + mask |= syscall.O_ASYNC + } switch { case flags.Read && flags.Write: mask |= syscall.O_RDWR @@ -82,6 +85,7 @@ func linuxToFlags(mask uint) (flags fs.FileFlags) { Write: (mask & syscall.O_ACCMODE) != syscall.O_RDONLY, Append: mask&syscall.O_APPEND != 0, Directory: mask&syscall.O_DIRECTORY != 0, + Async: mask&syscall.O_ASYNC != 0, } } @@ -91,5 +95,6 @@ func linuxToSettableFlags(mask uint) fs.SettableFileFlags { Direct: mask&syscall.O_DIRECT != 0, NonBlocking: mask&syscall.O_NONBLOCK != 0, Append: mask&syscall.O_APPEND != 0, + Async: mask&syscall.O_ASYNC != 0, } } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index e2980842f..490649f87 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/fasync" 
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" @@ -528,6 +529,33 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall file.SetFlags(flags.Settable()) return 0, nil, nil + case linux.FIOASYNC: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.Flags() + if set != 0 { + flags.Async = true + } else { + flags.Async = false + } + file.SetFlags(flags.Settable()) + return 0, nil, nil + + case linux.FIOSETOWN, linux.SIOCSPGRP: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + fSetOwn(t, file, set) + return 0, nil, nil + + case linux.FIOGETOWN, linux.SIOCGPGRP: + who := fGetOwn(t, file) + _, err := t.CopyOut(args[2].Pointer(), &who) + return 0, nil, err + default: ret, err := file.FileOperations.Ioctl(t, t.MemoryManager(), args) if err != nil { @@ -725,6 +753,39 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(newfd), nil, nil } +func fGetOwn(t *kernel.Task, file *fs.File) int32 { + ma := file.Async(nil) + if ma == nil { + return 0 + } + a := ma.(*fasync.FileAsync) + ot, otg, opg := a.Owner() + switch { + case ot != nil: + return int32(t.PIDNamespace().IDOfTask(ot)) + case otg != nil: + return int32(t.PIDNamespace().IDOfThreadGroup(otg)) + case opg != nil: + return int32(-t.PIDNamespace().IDOfProcessGroup(opg)) + default: + return 0 + } +} + +// fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux. +// +// If who is positive, it represents a PID. If negative, it represents a PGID. +// If the PID or PGID is invalid, the owner is silently unset. 
+func fSetOwn(t *kernel.Task, file *fs.File, who int32) { + a := file.Async(fasync.New).(*fasync.FileAsync) + if who < 0 { + pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who)) + a.SetOwnerProcessGroup(t, pg) + } + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who)) + a.SetOwnerThreadGroup(t, tg) +} + // Fcntl implements linux syscall fcntl(2). func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) @@ -737,7 +798,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall defer file.DecRef() switch cmd { - case syscall.F_DUPFD, syscall.F_DUPFD_CLOEXEC: + case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: from := kdefs.FD(args[2].Int()) fdFlags := kernel.FDFlags{CloseOnExec: cmd == syscall.F_DUPFD_CLOEXEC} fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits()) @@ -745,19 +806,19 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } return uintptr(fd), nil, nil - case syscall.F_GETFD: + case linux.F_GETFD: return uintptr(fdFlagsToLinux(flags)), nil, nil - case syscall.F_SETFD: + case linux.F_SETFD: flags := args[2].Uint() t.FDMap().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&syscall.FD_CLOEXEC != 0, }) - case syscall.F_GETFL: + case linux.F_GETFL: return uintptr(flagsToLinux(file.Flags())), nil, nil - case syscall.F_SETFL: + case linux.F_SETFL: flags := uint(args[2].Uint()) file.SetFlags(linuxToSettableFlags(flags)) - case syscall.F_SETLK, syscall.F_SETLKW: + case linux.F_SETLK, linux.F_SETLKW: // In Linux the file system can choose to provide lock operations for an inode. // Normally pipe and socket types lack lock operations. We diverge and use a heavy // hammer by only allowing locks on files and directories. 
@@ -854,6 +915,11 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall default: return 0, nil, syserror.EINVAL } + case linux.F_GETOWN: + return uintptr(fGetOwn(t, file)), nil, nil + case linux.F_SETOWN: + fSetOwn(t, file, args[2].Int()) + return 0, nil, nil default: // Everything else is not yet supported. return 0, nil, syserror.EINVAL -- cgit v1.2.3 From f6be5fe6193163ad46722bc36209572da4a15ad0 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 21 Jun 2018 13:21:25 -0700 Subject: Forward SIGUSR2 to the sandbox too SIGUSR2 was being masked out to be used as a way to dump sentry stacks. This could cause compatibility problems in cases anyone uses SIGUSR2 to communicate with the container init process. PiperOrigin-RevId: 201575374 Change-Id: I312246e828f38ad059139bb45b8addc2ed055d74 --- pkg/sentry/sighandling/sighandling.go | 10 ++-------- runsc/boot/loader.go | 5 ++--- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 0c3a14da5..ef6f7f617 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -95,7 +95,7 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha // PrepareForwarding ensures that synchronous signals are forwarded to k and // returns a callback that starts signal delivery, which itself returns a // callback that stops signal forwarding. -func PrepareForwarding(k *kernel.Kernel) func() func() { +func PrepareForwarding(k *kernel.Kernel, enablePanicSignal bool) func() func() { start := make(chan struct{}) stop := make(chan struct{}) @@ -112,7 +112,7 @@ func PrepareForwarding(k *kernel.Kernel) func() func() { sigchans = append(sigchans, sigchan) // SignalPanic is handled by Run. 
- if linux.Signal(sig) == kernel.SignalPanic { + if enablePanicSignal && linux.Signal(sig) == kernel.SignalPanic { continue } @@ -128,9 +128,3 @@ func PrepareForwarding(k *kernel.Kernel) func() func() { } } } - -// StartForwarding ensures that synchronous signals are forwarded to k and -// returns a callback that stops signal forwarding. -func StartForwarding(k *kernel.Kernel) func() { - return PrepareForwarding(k)() -} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index e1194bd03..a0a28dc43 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -215,9 +215,8 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in if err := sighandling.IgnoreChildStop(); err != nil { return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) } - // Ensure that most signals received in sentry context are forwarded to - // the emulated kernel. - stopSignalForwarding := sighandling.StartForwarding(k) + // Ensure that signals received are forwarded to the emulated kernel. + stopSignalForwarding := sighandling.PrepareForwarding(k, false)() procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) if err != nil { -- cgit v1.2.3 From 2dedbc7211fb6b7f8b86148e6627054e781eaa87 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 21 Jun 2018 14:53:05 -0700 Subject: Drop return from SendExternalSignal SendExternalSignal is no longer called before CreateProcess, so it can enforce this simplified precondition. StartForwarding, and after Kernel.Start. 
PiperOrigin-RevId: 201591170 Change-Id: Ib7022ef7895612d7d82a00942ab59fa433c4d6e9 --- pkg/sentry/kernel/kernel.go | 7 +++---- pkg/sentry/kernel/signal.go | 13 +++++-------- pkg/sentry/sighandling/sighandling.go | 14 +++++++++++--- 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index a17148af1..5662b8f08 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -760,12 +760,11 @@ func (k *Kernel) Unpause() { // // context is used only for debugging to describe how the signal was received. // -// Returns false if signal could not be sent because the Kernel is not fully -// initialized yet. -func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool { +// Preconditions: Kernel must have an init process. +func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { k.extMu.Lock() defer k.extMu.Unlock() - return k.sendExternalSignal(info, context) + k.sendExternalSignal(info, context) } // FeatureSet returns the FeatureSet. diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index 8edd05cdf..e3a2a777a 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -15,6 +15,8 @@ package kernel import ( + "fmt" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" @@ -33,13 +35,11 @@ const SignalPanic = linux.SIGUSR2 // // context is used only for debugging to differentiate these cases. // -// Returns false if signal could not be sent because the Kernel is not fully -// initialized yet. -func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) bool { +// Preconditions: Kernel must have an init process. 
+func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) { switch linux.Signal(info.Signo) { case platform.SignalInterrupt: // Assume that a call to platform.Context.Interrupt() misfired. - return true case SignalPanic: // SignalPanic is also specially handled in sentry setup to ensure that @@ -50,13 +50,10 @@ func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) bool default: log.Infof("Received external signal %d in %s context", info.Signo, context) if k.globalInit == nil { - log.Warningf("Received external signal %d before init created", info.Signo) - return false + panic(fmt.Sprintf("Received external signal %d before init created", info.Signo)) } k.globalInit.SendSignal(info) } - - return true } // sigPriv returns a SignalInfo representing a signal sent by the sentry. (The diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index ef6f7f617..25295440c 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -16,6 +16,7 @@ package sighandling import ( + "fmt" "os" "os/signal" "reflect" @@ -65,7 +66,9 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha // Otherwise, it was a signal on channel N. Index 0 represents the stop // channel, so index N represents the channel for signal N. - if !started || !k.SendExternalSignal(&arch.SignalInfo{Signo: int32(index)}, "sentry") { + signal := linux.Signal(index) + + if !started { // Kernel is not ready to receive signals. // // Kill ourselves if this signal would have killed the @@ -78,11 +81,16 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha // TODO: Convert Go's runtime.raise from // tkill to tgkill so PrepareForwarding doesn't need to // be called until after filter installation. 
- switch linux.Signal(index) { + switch signal { case linux.SIGHUP, linux.SIGINT, linux.SIGTERM: - dieFromSignal(linux.Signal(index)) + dieFromSignal(signal) + panic(fmt.Sprintf("Failed to die from signal %d", signal)) + default: + continue } } + + k.SendExternalSignal(&arch.SignalInfo{Signo: int32(signal)}, "sentry") } // Close all individual channels. -- cgit v1.2.3 From 0e434b66a625b937d90e4ebe632de4546101be5a Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Thu, 21 Jun 2018 15:18:47 -0700 Subject: netstack: tcp socket connected state S/R support. PiperOrigin-RevId: 201596247 Change-Id: Id22f47b2cdcbe14aa0d930f7807ba75f91a56724 --- pkg/sentry/kernel/BUILD | 5 +- pkg/sentry/kernel/kernel.go | 6 + pkg/sentry/kernel/kernel_state.go | 31 +++++ pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/tcpip.go | 36 ++++++ pkg/tcpip/transport/tcp/BUILD | 7 ++ pkg/tcpip/transport/tcp/accept.go | 11 +- pkg/tcpip/transport/tcp/connect.go | 43 ++++++- pkg/tcpip/transport/tcp/endpoint.go | 72 +++++++++--- pkg/tcpip/transport/tcp/endpoint_state.go | 185 ++++++++++++++++++++---------- pkg/tcpip/transport/tcp/segment.go | 8 +- pkg/tcpip/transport/tcp/segment_queue.go | 4 +- pkg/tcpip/transport/tcp/segment_state.go | 41 +++++++ pkg/tcpip/transport/tcp/snd.go | 4 +- pkg/tcpip/transport/tcp/snd_state.go | 39 +++++++ 15 files changed, 403 insertions(+), 91 deletions(-) create mode 100644 pkg/sentry/kernel/kernel_state.go create mode 100644 pkg/tcpip/transport/tcp/segment_state.go create mode 100644 pkg/tcpip/transport/tcp/snd_state.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index b2a55ddff..07568b47c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -12,6 +12,7 @@ go_stateify( "fs_context.go", "ipc_namespace.go", "kernel.go", + "kernel_state.go", "pending_signals.go", "pending_signals_state.go", "process_group_list.go", @@ -45,10 +46,11 @@ go_stateify( "vdso.go", "version.go", ], - out = "kernel_state.go", + out = 
"kernel_autogen_state.go", imports = [ "gvisor.googlesource.com/gvisor/pkg/sentry/arch", "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", + "gvisor.googlesource.com/gvisor/pkg/tcpip", ], package = "kernel", ) @@ -117,6 +119,7 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", + "kernel_autogen_state.go", "kernel_state.go", "pending_signals.go", "pending_signals_list.go", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 5662b8f08..64439cd9d 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -57,6 +57,7 @@ import ( sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/state" + "gvisor.googlesource.com/gvisor/pkg/tcpip" ) // Kernel represents an emulated Linux kernel. It must be initialized by calling @@ -158,6 +159,9 @@ type Kernel struct { // exitErr is the error causing the sandbox to exit, if any. It is // protected by extMu. exitErr error + + // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. + danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` } // InitKernelArgs holds arguments to Init. @@ -422,6 +426,8 @@ func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) erro return err } + tcpip.AsyncLoading.Wait() + log.Infof("Overall load took [%s]", time.Since(loadStart)) // Applications may size per-cpu structures based on k.applicationCores, so diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go new file mode 100644 index 000000000..bb2d5102d --- /dev/null +++ b/pkg/sentry/kernel/kernel_state.go @@ -0,0 +1,31 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip" +) + +// saveDanglingEndpoints is invoked by stateify. +func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint { + return tcpip.GetDanglingEndpoints() +} + +// loadDanglingEndpoints is invoked by stateify. +func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) { + for _, e := range es { + tcpip.AddDanglingEndpoint(e) + } +} diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index 030ae98d1..260d7d05c 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -5,5 +5,5 @@ package stack // StackFromEnv is the global stack created in restore run. -// FIXME: remove this variable once tcpip S/R is fully supported. +// FIXME var StackFromEnv *Stack diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index cf25a086d..17fa0efb7 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -23,6 +23,7 @@ import ( "fmt" "strconv" "strings" + "sync" "time" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" @@ -552,3 +553,38 @@ type ProtocolAddress struct { // Address is a network address. Address Address } + +// danglingEndpointsMu protects access to danglingEndpoints. +var danglingEndpointsMu sync.Mutex + +// danglingEndpoints tracks all dangling endpoints no longer owned by the app. +var danglingEndpoints = make(map[Endpoint]struct{}) + +// GetDanglingEndpoints returns all dangling endpoints. 
+func GetDanglingEndpoints() []Endpoint { + es := make([]Endpoint, 0, len(danglingEndpoints)) + danglingEndpointsMu.Lock() + for e, _ := range danglingEndpoints { + es = append(es, e) + } + danglingEndpointsMu.Unlock() + return es +} + +// AddDanglingEndpoint adds a dangling endpoint. +func AddDanglingEndpoint(e Endpoint) { + danglingEndpointsMu.Lock() + danglingEndpoints[e] = struct{}{} + danglingEndpointsMu.Unlock() +} + +// DeleteDanglingEndpoint removes a dangling endpoint. +func DeleteDanglingEndpoint(e Endpoint) { + danglingEndpointsMu.Lock() + delete(danglingEndpoints, e) + danglingEndpointsMu.Unlock() +} + +// AsyncLoading is the global barrier for asynchronous endpoint loading +// activities. +var AsyncLoading sync.WaitGroup diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index f38f58e87..d129aa285 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -10,11 +10,16 @@ go_stateify( "endpoint.go", "endpoint_state.go", "rcv.go", + "segment.go", "segment_heap.go", + "segment_queue.go", + "segment_state.go", "snd.go", + "snd_state.go", "tcp_segment_list.go", ], out = "tcp_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], package = "tcp", ) @@ -43,7 +48,9 @@ go_library( "segment.go", "segment_heap.go", "segment_queue.go", + "segment_state.go", "snd.go", + "snd_state.go", "tcp_segment_list.go", "tcp_state.go", "timer.go", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 85adeef0e..410dfdad4 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -68,7 +68,8 @@ func encodeMSS(mss uint16) uint32 { // to go above a threshold. 
var synRcvdCount struct { sync.Mutex - value uint64 + value uint64 + pending sync.WaitGroup } // listenContext is used by a listening endpoint to store state used while @@ -102,6 +103,7 @@ func incSynRcvdCount() bool { return false } + synRcvdCount.pending.Add(1) synRcvdCount.value++ return true @@ -115,6 +117,7 @@ func decSynRcvdCount() { defer synRcvdCount.Unlock() synRcvdCount.value-- + synRcvdCount.pending.Done() } // newListenContext creates a new listen context. @@ -292,7 +295,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { opts := parseSynSegmentOptions(s) if incSynRcvdCount() { s.incRef() - go e.handleSynSegment(ctx, s, &opts) // S/R-FIXME + go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier. } else { cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) // Send SYN with window scaling because we currently @@ -381,10 +384,12 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { return nil } if n¬ifyDrain != 0 { - for s := e.segmentQueue.dequeue(); s != nil; s = e.segmentQueue.dequeue() { + for !e.segmentQueue.empty() { + s := e.segmentQueue.dequeue() e.handleListenSegment(ctx, s) s.decRef() } + synRcvdCount.pending.Wait() close(e.drainDone) <-e.undrain } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 9aaabe0b1..d9f87c793 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -443,7 +443,8 @@ func (h *handshake) execute() *tcpip.Error { return tcpip.ErrAborted } if n¬ifyDrain != 0 { - for s := h.ep.segmentQueue.dequeue(); s != nil; s = h.ep.segmentQueue.dequeue() { + for !h.ep.segmentQueue.empty() { + s := h.ep.segmentQueue.dequeue() err := h.handleSegment(s) s.decRef() if err != nil { @@ -813,15 +814,13 @@ func (e *endpoint) handleSegments() *tcpip.Error { // protocolMainLoop is the main loop of the TCP protocol. 
It runs in its own // goroutine and is responsible for sending segments and handling received // segments. -func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { +func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { var closeTimer *time.Timer var closeWaker sleep.Waker defer func() { // e.mu is expected to be hold upon entering this section. - e.completeWorkerLocked() - if e.snd != nil { e.snd.resendTimer.cleanup() } @@ -830,6 +829,8 @@ func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { closeTimer.Stop() } + e.completeWorkerLocked() + if e.drainDone != nil { close(e.drainDone) } @@ -840,7 +841,7 @@ func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) }() - if !passive { + if handshake { // This is an active connection, so we must initiate the 3-way // handshake, and then inform potential waiters about its // completion. @@ -945,6 +946,17 @@ func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { closeWaker.Assert() }) } + + if n¬ifyDrain != 0 { + for !e.segmentQueue.empty() { + if err := e.handleSegments(); err != nil { + return err + } + } + close(e.drainDone) + <-e.undrain + } + return nil }, }, @@ -956,6 +968,27 @@ func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { s.AddWaker(funcs[i].w, i) } + // The following assertions and notifications are needed for restored + // endpoints. Fresh newly created endpoints have empty states and should + // not invoke any. + e.segmentQueue.mu.Lock() + if !e.segmentQueue.list.Empty() { + e.newSegmentWaker.Assert() + } + e.segmentQueue.mu.Unlock() + + e.rcvListMu.Lock() + if !e.rcvList.Empty() { + e.waiterQueue.Notify(waiter.EventIn) + } + e.rcvListMu.Unlock() + + e.mu.RLock() + if e.workerCleanup { + e.notifyProtocolGoroutine(notifyClose) + } + e.mu.RUnlock() + // Main loop. Handle segments until both send and receive ends of the // connection have completed. 
for !e.rcv.closed || !e.snd.closed || e.snd.sndUna != e.snd.sndNxtList { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index b21c2b4ab..706977618 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -69,7 +69,7 @@ type endpoint struct { // change throughout the lifetime of the endpoint. stack *stack.Stack `state:"manual"` netProto tcpip.NetworkProtocolNumber - waiterQueue *waiter.Queue + waiterQueue *waiter.Queue `state:"wait"` // lastError represents the last error that the endpoint reported; // access to it is protected by the following mutex. @@ -82,8 +82,8 @@ type endpoint struct { // // Once the peer has closed its send side, rcvClosed is set to true // to indicate to users that no more data is coming. - rcvListMu sync.Mutex `state:"nosave"` - rcvList segmentList + rcvListMu sync.Mutex `state:"nosave"` + rcvList segmentList `state:"wait"` rcvClosed bool rcvBufSize int rcvBufUsed int @@ -91,8 +91,8 @@ type endpoint struct { // The following fields are protected by the mutex. mu sync.RWMutex `state:"nosave"` id stack.TransportEndpointID - state endpointState - isPortReserved bool `state:"manual"` + state endpointState `state:".(endpointState)"` + isPortReserved bool `state:"manual"` isRegistered bool boundNICID tcpip.NICID `state:"manual"` route stack.Route `state:"manual"` @@ -118,7 +118,7 @@ type endpoint struct { // workerCleanup specifies if the worker goroutine must perform cleanup // before exitting. This can only be set to true when workerRunning is // also true, and they're both protected by the mutex. - workerCleanup bool `state:"zerovalue"` + workerCleanup bool // sendTSOk is used to indicate when the TS Option has been negotiated. // When sendTSOk is true every non-RST segment should carry a TS as per @@ -153,7 +153,7 @@ type endpoint struct { // segmentQueue is used to hand received segments to the protocol // goroutine. 
Segments are queued as long as the queue is not full, // and dropped when it is. - segmentQueue segmentQueue `state:"zerovalue"` + segmentQueue segmentQueue `state:"wait"` // The following fields are used to manage the send buffer. When // segments are ready to be sent, they are added to sndQueue and the @@ -166,7 +166,7 @@ type endpoint struct { sndBufUsed int sndClosed bool sndBufInQueue seqnum.Size - sndQueue segmentList + sndQueue segmentList `state:"wait"` sndWaker sleep.Waker `state:"manual"` sndCloseWaker sleep.Waker `state:"manual"` @@ -188,17 +188,21 @@ type endpoint struct { // notifyFlags is a bitmask of flags used to indicate to the protocol // goroutine what it was notified; this is only accessed atomically. - notifyFlags uint32 `state:"zerovalue"` + notifyFlags uint32 `state:"nosave"` // acceptedChan is used by a listening endpoint protocol goroutine to // send newly accepted connections to the endpoint so that they can be // read by Accept() calls. - acceptedChan chan *endpoint `state:".(endpointChan)"` + acceptedChan chan *endpoint `state:"manual"` + + // acceptedEndpoints is only used to save / restore the channel buffer. + // FIXME + acceptedEndpoints []*endpoint // The following are only used from the protocol goroutine, and // therefore don't need locks to protect them. - rcv *receiver - snd *sender + rcv *receiver `state:"wait"` + snd *sender `state:"wait"` // The goroutine drain completion notification channel. drainDone chan struct{} `state:"nosave"` @@ -211,6 +215,7 @@ type endpoint struct { probe stack.TCPProbeFunc `state:"nosave"` // The following are only used to assist the restore run to re-connect. + bindAddress tcpip.Address connectingAddress tcpip.Address } @@ -344,6 +349,7 @@ func (e *endpoint) Close() { // Either perform the local cleanup or kick the worker to make sure it // knows it needs to cleanup. 
+ tcpip.AddDanglingEndpoint(e) if !e.workerRunning { e.cleanupLocked() } else { @@ -363,9 +369,12 @@ func (e *endpoint) cleanupLocked() { if e.acceptedChan != nil { close(e.acceptedChan) for n := range e.acceptedChan { + n.mu.Lock() n.resetConnectionLocked(tcpip.ErrConnectionAborted) + n.mu.Unlock() n.Close() } + e.acceptedChan = nil } e.workerCleanup = false @@ -374,6 +383,7 @@ func (e *endpoint) cleanupLocked() { } e.route.Release() + tcpip.DeleteDanglingEndpoint(e) } // Read reads data from the endpoint. @@ -786,6 +796,16 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocol // Connect connects the endpoint to its peer. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + return e.connect(addr, true, true) +} + +// connect connects the endpoint to its peer. In the normal non-S/R case, the +// new connection is expected to run the main goroutine and perform handshake. +// In restore of previously connected endpoints, both ends will be passively +// created (so no new handshaking is done); for stack-accepted connections not +// yet accepted by the app, they are restored without running the main goroutine +// here. +func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() @@ -897,9 +917,27 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { e.boundNICID = nicid e.effectiveNetProtos = netProtos e.connectingAddress = connectingAddr - e.workerRunning = true - go e.protocolMainLoop(false) // S/R-SAFE: will be drained before save. + // Connect in the restore phase does not perform handshake. Restore its + // connection setting here. 
+ if !handshake { + e.segmentQueue.mu.Lock() + for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} { + for s := l.Front(); s != nil; s = s.Next() { + s.id = e.id + s.route = r.Clone() + e.sndWaker.Assert() + } + } + e.segmentQueue.mu.Unlock() + e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) + e.state = stateConnected + } + + if run { + e.workerRunning = true + go e.protocolMainLoop(handshake) // S/R-SAFE: will be drained before save. + } return tcpip.ErrConnectStarted } @@ -971,6 +1009,9 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error { if len(e.acceptedChan) > backlog { return tcpip.ErrInvalidEndpointState } + if cap(e.acceptedChan) == backlog { + return nil + } origChan := e.acceptedChan e.acceptedChan = make(chan *endpoint, backlog) close(origChan) @@ -1008,7 +1049,7 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error { func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) { e.waiterQueue = waiterQueue e.workerRunning = true - go e.protocolMainLoop(true) // S/R-FIXME + go e.protocolMainLoop(false) // S/R-SAFE: drained on save. 
} // Accept returns a new endpoint if a peer has established a connection @@ -1049,6 +1090,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) (ret return tcpip.ErrAlreadyBound } + e.bindAddress = addr.Addr netProto, err := e.checkV4Mapped(&addr) if err != nil { return err diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index b1e249bff..38c97c796 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -9,6 +9,7 @@ import ( "sync" "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" ) @@ -22,7 +23,7 @@ func (e *endpoint) drainSegmentLocked() { e.undrain = make(chan struct{}) e.mu.Unlock() - e.notificationWaker.Assert() + e.notifyProtocolGoroutine(notifyDrain) <-e.drainDone e.mu.Lock() @@ -38,37 +39,98 @@ func (e *endpoint) beforeSave() { switch e.state { case stateInitial, stateBound: - case stateListen: - if !e.segmentQueue.empty() { - e.drainSegmentLocked() + case stateListen, stateConnecting, stateConnected: + if e.state == stateConnected && !e.workerRunning { + // The endpoint must be in acceptedChan. 
+ break } - case stateConnecting: e.drainSegmentLocked() - if e.state != stateConnected { + if e.state != stateClosed && e.state != stateError { + if !e.workerRunning { + panic("endpoint has no worker running in listen, connecting, or connected state") + } break } fallthrough - case stateConnected: - // FIXME - panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%v, remote %v:%v", e.id.LocalAddress, e.id.LocalPort, e.id.RemoteAddress, e.id.RemotePort)}) case stateClosed, stateError: if e.workerRunning { - panic(fmt.Sprintf("endpoint still has worker running in closed or error state")) + panic("endpoint still has worker running in closed or error state") } default: panic(fmt.Sprintf("endpoint in unknown state %v", e.state)) } + + if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() { + panic("endpoint still has waiters upon save") + } + + if !((e.state == stateBound || e.state == stateListen) == e.isPortReserved) { + panic("endpoint port must and must only be reserved in bound or listen state") + } + + if e.acceptedChan != nil { + close(e.acceptedChan) + e.acceptedEndpoints = make([]*endpoint, len(e.acceptedChan), cap(e.acceptedChan)) + i := 0 + for ep := range e.acceptedChan { + e.acceptedEndpoints[i] = ep + i++ + } + if i != len(e.acceptedEndpoints) { + panic("endpoint acceptedChan buffer got consumed by background context") + } + } +} + +// saveState is invoked by stateify. +func (e *endpoint) saveState() endpointState { + return e.state +} + +// Endpoint loading must be done in the following ordering by their state, to +// avoid dangling connecting w/o listening peer, and to avoid conflicts in port +// reservation. +var connectedLoading sync.WaitGroup +var listenLoading sync.WaitGroup +var connectingLoading sync.WaitGroup + +// Bound endpoint loading happens last. + +// loadState is invoked by stateify. 
+func (e *endpoint) loadState(state endpointState) { + // This is to ensure that the loading wait groups include all applicable + // endpoints before any asynchronous calls to the Wait() methods. + switch state { + case stateConnected: + connectedLoading.Add(1) + case stateListen: + listenLoading.Add(1) + case stateConnecting: + connectingLoading.Add(1) + } + e.state = state } // afterLoad is invoked by stateify. func (e *endpoint) afterLoad() { + // We load acceptedChan buffer indirectly here. Note that closed + // endpoints might not need to allocate the channel. + // FIXME + if cap(e.acceptedEndpoints) > 0 { + e.acceptedChan = make(chan *endpoint, cap(e.acceptedEndpoints)) + for _, ep := range e.acceptedEndpoints { + e.acceptedChan <- ep + } + e.acceptedEndpoints = nil + } + e.stack = stack.StackFromEnv e.segmentQueue.setLimit(2 * e.rcvBufSize) e.workMu.Init() state := e.state switch state { - case stateInitial, stateBound, stateListen, stateConnecting: + case stateInitial, stateBound, stateListen, stateConnecting, stateConnected: var ss SendBufferSizeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil { if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max { @@ -80,65 +142,72 @@ func (e *endpoint) afterLoad() { } } - switch state { - case stateBound, stateListen, stateConnecting: + bind := func() { e.state = stateInitial - if err := e.Bind(tcpip.FullAddress{Addr: e.id.LocalAddress, Port: e.id.LocalPort}, nil); err != nil { + if len(e.bindAddress) == 0 { + e.bindAddress = e.id.LocalAddress + } + if err := e.Bind(tcpip.FullAddress{Addr: e.bindAddress, Port: e.id.LocalPort}, nil); err != nil { panic("endpoint binding failed: " + err.String()) } } switch state { - case stateListen: - backlog := cap(e.acceptedChan) - e.acceptedChan = nil - if err := e.Listen(backlog); err != nil { - panic("endpoint listening failed: " + err.String()) + case stateConnected: + bind() + if len(e.connectingAddress) == 0 { + // This endpoint is accepted by 
netstack but not yet by + // the app. If the endpoint is IPv6 but the remote + // address is IPv4, we need to connect as IPv6 so that + // dual-stack mode can be properly activated. + if e.netProto == header.IPv6ProtocolNumber && len(e.id.RemoteAddress) != header.IPv6AddressSize { + e.connectingAddress = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" + e.id.RemoteAddress + } else { + e.connectingAddress = e.id.RemoteAddress + } } - } - - switch state { - case stateConnecting: - if err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}); err != tcpip.ErrConnectStarted { + if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted { panic("endpoint connecting failed: " + err.String()) } + connectedLoading.Done() + case stateListen: + tcpip.AsyncLoading.Add(1) + go func() { + connectedLoading.Wait() + bind() + backlog := cap(e.acceptedChan) + if err := e.Listen(backlog); err != nil { + panic("endpoint listening failed: " + err.String()) + } + listenLoading.Done() + tcpip.AsyncLoading.Done() + }() + case stateConnecting: + tcpip.AsyncLoading.Add(1) + go func() { + connectedLoading.Wait() + listenLoading.Wait() + bind() + if err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}); err != tcpip.ErrConnectStarted { + panic("endpoint connecting failed: " + err.String()) + } + connectingLoading.Done() + tcpip.AsyncLoading.Done() + }() + case stateBound: + tcpip.AsyncLoading.Add(1) + go func() { + connectedLoading.Wait() + listenLoading.Wait() + connectingLoading.Wait() + bind() + tcpip.AsyncLoading.Done() + }() + case stateClosed, stateError: + tcpip.DeleteDanglingEndpoint(e) } } -// saveAcceptedChan is invoked by stateify. 
-func (e *endpoint) saveAcceptedChan() endpointChan { - if e.acceptedChan == nil { - return endpointChan{} - } - close(e.acceptedChan) - buffer := make([]*endpoint, 0, len(e.acceptedChan)) - for ep := range e.acceptedChan { - buffer = append(buffer, ep) - } - if len(buffer) != cap(buffer) { - panic("endpoint.acceptedChan buffer got consumed by background context") - } - c := cap(e.acceptedChan) - e.acceptedChan = nil - return endpointChan{buffer: buffer, cap: c} -} - -// loadAcceptedChan is invoked by stateify. -func (e *endpoint) loadAcceptedChan(c endpointChan) { - if c.cap == 0 { - return - } - e.acceptedChan = make(chan *endpoint, c.cap) - for _, ep := range c.buffer { - e.acceptedChan <- ep - } -} - -type endpointChan struct { - buffer []*endpoint - cap int -} - // saveLastError is invoked by stateify. func (e *endpoint) saveLastError() string { if e.lastError == nil { diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 07e4bfd73..c5bff5f4f 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -29,9 +29,9 @@ const ( type segment struct { segmentEntry refCnt int32 - id stack.TransportEndpointID - route stack.Route `state:"manual"` - data buffer.VectorisedView + id stack.TransportEndpointID `state:"manual"` + route stack.Route `state:"manual"` + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` // views is used as buffer for data when its length is large // enough to store a VectorisedView. views [8]buffer.View @@ -45,7 +45,7 @@ type segment struct { // parsedOptions stores the parsed values from the options in the segment. 
parsedOptions header.TCPOptions - options []byte + options []byte `state:".([]byte)"` } func newSegment(r *stack.Route, id stack.TransportEndpointID, vv *buffer.VectorisedView) *segment { diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index c4a7f7d5b..a5e7b2ebf 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -12,8 +12,8 @@ import ( // segmentQueue is a bounded, thread-safe queue of TCP segments. type segmentQueue struct { - mu sync.Mutex - list segmentList + mu sync.Mutex `state:"nosave"` + list segmentList `state:"wait"` limit int used int } diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go new file mode 100644 index 000000000..e5243200b --- /dev/null +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -0,0 +1,41 @@ +// Copyright 2018 The Netstack Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package tcp + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" +) + +// saveData is invoked by stateify. +func (s *segment) saveData() buffer.VectorisedView { + // We cannot save s.data directly as s.data.views may alias to s.views, + // which is not allowed by state framework (in-struct pointer). + return s.data.Clone(nil) +} + +// loadData is invoked by stateify. +func (s *segment) loadData(data buffer.VectorisedView) { + // NOTE: We cannot do the s.data = data.Clone(s.views[:]) optimization + // here because data.views is not guaranteed to be loaded by now. Plus, + // data.views will be allocated anyway so there really is little point + // of utilizing s.views for data.views. + s.data = data +} + +// saveOptions is invoked by stateify. +func (s *segment) saveOptions() []byte { + // We cannot save s.options directly as it may point to s.data's trimmed + // tail, which is not allowed by state framework (in-struct pointer). 
+ b := make([]byte, 0, cap(s.options)) + return append(b, s.options...) +} + +// loadOptions is invoked by stateify. +func (s *segment) loadOptions(options []byte) { + // NOTE: We cannot point s.options back into s.data's trimmed tail. But + // it is OK as they do not need to aliased. Plus, options is already + // allocated so there is no cost here. + s.options = options +} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 95bea4d88..a98aca293 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -28,7 +28,7 @@ type sender struct { ep *endpoint // lastSendTime is the timestamp when the last packet was sent. - lastSendTime time.Time + lastSendTime time.Time `state:".(unixTime)"` // dupAckCount is the number of duplicated acks received. It is used for // fast retransmit. @@ -71,7 +71,7 @@ type sender struct { rttMeasureSeqNum seqnum.Value // rttMeasureTime is the time when the rttMeasureSeqNum was sent. - rttMeasureTime time.Time + rttMeasureTime time.Time `state:".(unixTime)"` closed bool writeNext *segment diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go new file mode 100644 index 000000000..d68773a7c --- /dev/null +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -0,0 +1,39 @@ +// Copyright 2018 The Netstack Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package tcp + +import ( + "time" +) + +type unixTime struct { + second int64 + nano int64 +} + +// saveLastSendTime is invoked by stateify. +func (s *sender) saveLastSendTime() unixTime { + return unixTime{s.lastSendTime.Unix(), s.lastSendTime.UnixNano()} +} + +// loadLastSendTime is invoked by stateify. +func (s *sender) loadLastSendTime(unix unixTime) { + s.lastSendTime = time.Unix(unix.second, unix.nano) +} + +// saveRttMeasureTime is invoked by stateify. 
+func (s *sender) saveRttMeasureTime() unixTime { + return unixTime{s.rttMeasureTime.Unix(), s.rttMeasureTime.UnixNano()} +} + +// loadRttMeasureTime is invoked by stateify. +func (s *sender) loadRttMeasureTime(unix unixTime) { + s.rttMeasureTime = time.Unix(unix.second, unix.nano) +} + +// afterLoad is invoked by stateify. +func (s *sender) afterLoad() { + s.resendTimer.init(&s.resendWaker) +} -- cgit v1.2.3 From 5d45f88f2c2840123e2f5ec2e45ac6d5b5a5729f Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Fri, 22 Jun 2018 10:18:19 -0700 Subject: Netstack should return EOF on closed read. The shutdown behavior where we return EAGAIN for sockets which are non-blocking is only correct for packet based sockets. SOCK_STREAM sockets should return EOF. PiperOrigin-RevId: 201703055 Change-Id: I20b25ceca7286c37766936475855959706fc5397 --- pkg/sentry/socket/epsocket/epsocket.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index a1bb265c0..a2927e1b9 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -945,7 +945,6 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // tcpip.Endpoint. func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 - peek := flags&linux.MSG_PEEK != 0 if senderRequested && !s.isPacketBased() { // Stream sockets ignore the sender address. 
@@ -953,7 +952,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } n, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) - if err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { + if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { // In this situation we should return EAGAIN. return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } -- cgit v1.2.3 From fe3fc44da3ca47fa27d55294e6c31d51b6b5dc14 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 22 Jun 2018 13:07:21 -0700 Subject: Handle mremap(old_size=0). PiperOrigin-RevId: 201729703 Change-Id: I486900b0c6ec59533b88da225a5829c474e35a70 --- pkg/sentry/memmap/memmap.go | 7 ++++--- pkg/sentry/mm/syscalls.go | 50 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 72986cbb9..cdc5f2b27 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -49,13 +49,14 @@ type Mappable interface { // CopyMapping notifies the Mappable of an attempt to copy a mapping in ms // from srcAR to dstAR. For most Mappables, this is equivalent to - // AddMapping. + // AddMapping. Note that it is possible that srcAR.Length() != dstAR.Length(), + // and also that srcAR.Length() == 0. // // CopyMapping is only called when a mapping is copied within a given // MappingSpace; it is analogous to Linux's vm_operations_struct::mremap. // - // Preconditions: offset+dstAR.Length() does not overflow. The mapping at - // srcAR must exist. + // Preconditions: offset+srcAR.Length() and offset+dstAR.Length() do not + // overflow. The mapping at srcAR must exist. 
CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error // Translate returns the Mappable's current mappings for at least the range diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 0730be65b..21aeabde8 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -320,8 +320,21 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi return 0, syserror.EFAULT } + // Behavior matrix: + // + // Move | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize + // ---------+-------------+-------------------+-------------------+------------------ + // NoMove | ENOMEM [1] | Grow in-place | No-op | Shrink in-place + // MayMove | Copy [1] | Grow in-place or | No-op | Shrink in-place + // | | move | | + // MustMove | Copy | Move and grow | Move | Shrink and move + // + // [1] In-place growth is impossible because the vma at oldAddr already + // occupies at least part of the destination. Thus the NoMove case always + // fails and the MayMove case always falls back to copying. + if opts.Move != MRemapMustMove { - // Handle noops and in-place shrinking. These cases don't care if + // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all // (aside from oldAddr). if newSize <= oldSize { @@ -363,15 +376,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi return oldAddr, nil } // In-place growth failed. In the MRemapMayMove case, fall through to - // moving below. + // copying/moving below. if opts.Move == MRemapNoMove { return 0, err } } - // Handle moving, which is the only remaining case. - - // Find a destination for the move. + // Find a location for the new mapping. 
var newAR usermem.AddrRange switch opts.Move { case MRemapMayMove: @@ -399,7 +410,9 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi mm.unmapLocked(ctx, newAR) // If the sizes specify shrinking, unmap everything between the new and - // old sizes at the source. + // old sizes at the source. Unmapping before the following checks is + // correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(), + // vma_to_resize(). if newSize < oldSize { oldNewEnd := oldAddr + usermem.Addr(newSize) mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd}) @@ -412,9 +425,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi oldAR := usermem.AddrRange{oldAddr, oldEnd} - // In the MRemapMustMove case, these checks happen after unmapping: - // mm/mremap.c:mremap_to() => do_munmap(), vma_to_resize(). - // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { return 0, syserror.EFAULT @@ -431,12 +441,32 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.off+uint64(newAR.Length()) < vma.off { return 0, syserror.EINVAL } - // Inform the Mappable, if any, of the copied mapping. + // Inform the Mappable, if any, of the new mapping. if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start)); err != nil { return 0, err } } + if oldSize == 0 { + // Handle copying. + // + // We can't use createVMALocked because it calls Mappable.AddMapping, + // whereas we've already called Mappable.CopyMapping (which is + // consistent with Linux). Call vseg.Value() (rather than + // vseg.ValuePtr()) to make a copy of the vma. + vma := vseg.Value() + if vma.mappable != nil { + vma.off = vseg.mappableOffsetAt(oldAR.Start) + } + if vma.id != nil { + vma.id.IncRef() + } + mm.vmas.Add(newAR, vma) + return newAR.Start, nil + } + + // Handle moving. + // // Remove the existing vma before inserting the new one to minimize // iterator invalidation. 
We do this directly (instead of calling // removeVMAsLocked) because: -- cgit v1.2.3 From e0e640981282ece051c33700f4e272047fa4e5b6 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 22 Jun 2018 14:09:34 -0700 Subject: Simplify some handle logic. PiperOrigin-RevId: 201738936 Change-Id: Ib75136415e28e8df0c742acd6b9512d4809fe3a8 --- pkg/sentry/fs/gofer/handles.go | 45 ++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index a660c9230..a3e52aad6 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -62,43 +62,28 @@ func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*han return nil, err } + var p9flags p9.OpenFlags switch { case flags.Read && flags.Write: - hostFile, _, _, err := newFile.open(ctx, p9.ReadWrite) - if err != nil { - newFile.close(ctx) - return nil, err - } - h := &handles{ - File: newFile, - Host: hostFile, - } - return h, nil + p9flags = p9.ReadWrite case flags.Read && !flags.Write: - hostFile, _, _, err := newFile.open(ctx, p9.ReadOnly) - if err != nil { - newFile.close(ctx) - return nil, err - } - h := &handles{ - File: newFile, - Host: hostFile, - } - return h, nil + p9flags = p9.ReadOnly case !flags.Read && flags.Write: - hostFile, _, _, err := newFile.open(ctx, p9.WriteOnly) - if err != nil { - newFile.close(ctx) - return nil, err - } - h := &handles{ - File: newFile, - Host: hostFile, - } - return h, nil + p9flags = p9.WriteOnly default: panic("impossible fs.FileFlags") } + + hostFile, _, _, err := newFile.open(ctx, p9flags) + if err != nil { + newFile.close(ctx) + return nil, err + } + h := &handles{ + File: newFile, + Host: hostFile, + } + return h, nil } type handleReadWriter struct { -- cgit v1.2.3 From 7c645ac27355a9d7016e0d5c74ce70eed2add600 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Fri, 22 Jun 2018 14:47:15 -0700 Subject: Add 
rpcinet support for SIOCGIFCONF. The interfaces and their addresses are already available via the stack Intefaces and InterfaceAddrs. Also add some tests as we had no tests around SIOCGIFCONF. I also added the socket_netgofer lifecycle for IOCTL tests. PiperOrigin-RevId: 201744863 Change-Id: Ie0a285a2a2f859fa0cafada13201d5941b95499a --- pkg/sentry/socket/rpcinet/socket.go | 73 ++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 207123d6f..72fa1ca8f 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -64,10 +64,6 @@ type socketOperations struct { // Verify that we actually implement socket.Socket. var _ = socket.Socket(&socketOperations{}) -const ( - sizeOfIfReq = 40 -) - // New creates a new RPC socket. func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, protocol int) (*fs.File, *syserr.Error) { id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */) @@ -465,6 +461,55 @@ func rpcIoctl(t *kernel.Task, fd, cmd uint32, arg []byte) ([]byte, error) { return res.(*pb.IOCtlResponse_Value).Value, nil } +// ifconfIoctlFromStack populates a struct ifconf for the SIOCGIFCONF ioctl. +func ifconfIoctlFromStack(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { + // If Ptr is NULL, return the necessary buffer size via Len. + // Otherwise, write up to Len bytes starting at Ptr containing ifreq + // structs. 
+ t := ctx.(*kernel.Task) + s := t.NetworkContext().(*Stack) + if s == nil { + return syserr.ErrNoDevice.ToError() + } + + if ifc.Ptr == 0 { + ifc.Len = int32(len(s.Interfaces())) * int32(linux.SizeOfIFReq) + return nil + } + + max := ifc.Len + ifc.Len = 0 + for key, ifaceAddrs := range s.InterfaceAddrs() { + iface := s.Interfaces()[key] + for _, ifaceAddr := range ifaceAddrs { + // Don't write past the end of the buffer. + if ifc.Len+int32(linux.SizeOfIFReq) > max { + break + } + if ifaceAddr.Family != linux.AF_INET { + continue + } + + // Populate ifr.ifr_addr. + ifr := linux.IFReq{} + ifr.SetName(iface.Name) + usermem.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) + usermem.ByteOrder.PutUint16(ifr.Data[2:4], 0) + copy(ifr.Data[4:8], ifaceAddr.Addr[:4]) + + // Copy the ifr to userspace. + dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) + ifc.Len += int32(linux.SizeOfIFReq) + if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return err + } + } + } + return nil +} + // Ioctl implements fs.FileOperations.Ioctl. func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { t := ctx.(*kernel.Task) @@ -491,7 +536,25 @@ func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.S syscall.SIOCGIFNAME, syscall.SIOCGIFNETMASK, syscall.SIOCGIFTXQLEN: - buf = make([]byte, sizeOfIfReq) + buf = make([]byte, linux.SizeOfIFReq) + case syscall.SIOCGIFCONF: + // SIOCGIFCONF has slightly different behavior than the others, in that it + // will need to populate the array of ifreqs. 
+ var ifc linux.IFConf + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + if err := ifconfIoctlFromStack(ctx, io, &ifc); err != nil { + return 0, err + } + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{ + AddressSpaceActive: true, + }) + + return 0, err default: return 0, syserror.ENOTTY } -- cgit v1.2.3 From 478f0ac0038afda267814fa154bcd32feb07c3b3 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 25 Jun 2018 15:22:04 -0700 Subject: Don't read FSContext.root without holding FSContext.mu IsChrooted still has the opportunity to race with another thread entering the FSContext into a chroot, but that is unchanged (and fine, AFAIK). PiperOrigin-RevId: 202029117 Change-Id: I38bce763b3a7715fa6ae98aa200a19d51a0235f1 --- pkg/sentry/kernel/task_resources.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go index e529f0c2d..0389989e8 100644 --- a/pkg/sentry/kernel/task_resources.go +++ b/pkg/sentry/kernel/task_resources.go @@ -122,5 +122,7 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace { func (t *Task) IsChrooted() bool { realRoot := t.k.mounts.Root() defer realRoot.DecRef() - return t.tr.FSContext.root != realRoot + root := t.tr.FSContext.RootDirectory() + defer root.DecRef() + return root != realRoot } -- cgit v1.2.3 From 1a9917d14d66250fdc9a3781ef65df4413340a2f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 25 Jun 2018 16:16:23 -0700 Subject: MountSource.Root() should return a refernce on the dirent. 
PiperOrigin-RevId: 202038397 Change-Id: I074d525f2e2d9bcd43b247b62f86f9129c101b78 --- pkg/sentry/fs/mount.go | 4 +++- pkg/sentry/fs/mount_test.go | 4 +++- pkg/sentry/fs/proc/mounts.go | 8 ++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index a2943b097..1d05a36a7 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -198,10 +198,12 @@ func (msrc *MountSource) Submounts() []*MountSource { return ms } -// Root returns the root dirent of this mount. +// Root returns the root dirent of this mount. Callers must call DecRef on the +// returned dirent. func (msrc *MountSource) Root() *Dirent { msrc.mu.Lock() defer msrc.mu.Unlock() + msrc.root.IncRef() return msrc.root } diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 3a053c154..968b435ab 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -204,7 +204,9 @@ func mountPathsAre(root *Dirent, got []*MountSource, want ...string) error { } gotPaths := make(map[string]struct{}, len(got)) for _, g := range got { - n, _ := g.Root().FullName(root) + groot := g.Root() + n, _ := groot.FullName(root) + groot.DecRef() gotPaths[n] = struct{}{} } for _, w := range want { diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 76092567d..3d276dfa5 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -44,7 +44,9 @@ func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { return ms[i].ID() < ms[j].ID() }) for _, m := range ms { - mountPath, desc := m.Root().FullName(rootDir) + mroot := m.Root() + mountPath, desc := mroot.FullName(rootDir) + mroot.DecRef() if !desc { // MountSources that are not descendants of the chroot jail are ignored. continue @@ -88,7 +90,9 @@ func (mif *mountInfoFile) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.S // (3) Major:Minor device ID. 
We don't have a superblock, so we // just use the root inode device number. - sa := m.Root().Inode.StableAttr + mroot := m.Root() + sa := mroot.Inode.StableAttr + mroot.DecRef() fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) // (4) Root: the pathname of the directory in the filesystem -- cgit v1.2.3 From 4ac79312b093f2831079d0d71846747a4996d9ad Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 25 Jun 2018 16:45:31 -0700 Subject: Don't read cwd or root without holding mu PiperOrigin-RevId: 202043090 Change-Id: I3c47fb3413ca8615d50d8a0503d72fcce9b09421 --- pkg/sentry/kernel/fs_context.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 9aa6fa951..d1ca9c09d 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -114,11 +114,14 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") } + + f.mu.Lock() + defer f.mu.Unlock() + if f.cwd == nil { panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d)) } - f.mu.Lock() - defer f.mu.Unlock() + old := f.cwd f.cwd = d d.IncRef() @@ -144,11 +147,14 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") } + + f.mu.Lock() + defer f.mu.Unlock() + if f.root == nil { panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d)) } - f.mu.Lock() - defer f.mu.Unlock() + old := f.root f.root = d d.IncRef() -- cgit v1.2.3 From 16882484f96f9d75348904bd5a4e2a53acb67378 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 25 Jun 2018 16:49:47 -0700 Subject: Check for empty applicationAddrRange in MM.DecUsers. 
PiperOrigin-RevId: 202043776 Change-Id: I4373abbcf735dc1cf4bebbbbb0c7124df36e9e78 --- pkg/sentry/mm/lifecycle.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index de7f29b04..a4b5cb443 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -214,5 +214,9 @@ func (mm *MemoryManager) DecUsers(ctx context.Context) { mm.mappingMu.Lock() defer mm.mappingMu.Unlock() - mm.unmapLocked(ctx, mm.applicationAddrRange()) + // If mm is being dropped before mm.SetMmapLayout was called, + // mm.applicationAddrRange() will be empty. + if ar := mm.applicationAddrRange(); ar.Length() != 0 { + mm.unmapLocked(ctx, ar) + } } -- cgit v1.2.3 From db94befb634b05aab0255214cd8c5eab0f5daaf2 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 25 Jun 2018 18:16:20 -0700 Subject: Fix panic message The arguments are backwards from the message. PiperOrigin-RevId: 202054887 Change-Id: Id5750a84ca091f8b8fbe15be8c648d4fa3e31eb2 --- pkg/sentry/fs/dirent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index b56437b3c..739ae1d2d 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -964,7 +964,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, offset -= 2 newOffset, err := it.IterateDir(ctx, dirCtx, int(offset)) if int64(newOffset) < offset { - panic(fmt.Sprintf("node.Readdir returned offset %v less that input offset %v", offset, newOffset)) + panic(fmt.Sprintf("node.Readdir returned offset %v less than input offset %v", newOffset, offset)) } // Add the initial nodes back to the offset count. 
newOffset += 2 -- cgit v1.2.3 From 51c1e510ab79607d80d6b81c2ae8ab308c323a58 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 26 Jun 2018 10:32:22 -0700 Subject: Automated rollback of changelist 201596247 PiperOrigin-RevId: 202151720 Change-Id: I0491172c436bbb32b977f557953ba0bc41cfe299 --- pkg/sentry/kernel/BUILD | 5 +- pkg/sentry/kernel/kernel.go | 6 - pkg/sentry/kernel/kernel_state.go | 31 ----- pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/tcpip.go | 36 ------ pkg/tcpip/transport/tcp/BUILD | 7 -- pkg/tcpip/transport/tcp/accept.go | 11 +- pkg/tcpip/transport/tcp/connect.go | 43 +------ pkg/tcpip/transport/tcp/endpoint.go | 72 +++--------- pkg/tcpip/transport/tcp/endpoint_state.go | 185 ++++++++++-------------------- pkg/tcpip/transport/tcp/segment.go | 8 +- pkg/tcpip/transport/tcp/segment_queue.go | 4 +- pkg/tcpip/transport/tcp/segment_state.go | 41 ------- pkg/tcpip/transport/tcp/snd.go | 4 +- pkg/tcpip/transport/tcp/snd_state.go | 39 ------- 15 files changed, 91 insertions(+), 403 deletions(-) delete mode 100644 pkg/sentry/kernel/kernel_state.go delete mode 100644 pkg/tcpip/transport/tcp/segment_state.go delete mode 100644 pkg/tcpip/transport/tcp/snd_state.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 07568b47c..b2a55ddff 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -12,7 +12,6 @@ go_stateify( "fs_context.go", "ipc_namespace.go", "kernel.go", - "kernel_state.go", "pending_signals.go", "pending_signals_state.go", "process_group_list.go", @@ -46,11 +45,10 @@ go_stateify( "vdso.go", "version.go", ], - out = "kernel_autogen_state.go", + out = "kernel_state.go", imports = [ "gvisor.googlesource.com/gvisor/pkg/sentry/arch", "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", - "gvisor.googlesource.com/gvisor/pkg/tcpip", ], package = "kernel", ) @@ -119,7 +117,6 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", - "kernel_autogen_state.go", "kernel_state.go", 
"pending_signals.go", "pending_signals_list.go", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 64439cd9d..5662b8f08 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -57,7 +57,6 @@ import ( sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/state" - "gvisor.googlesource.com/gvisor/pkg/tcpip" ) // Kernel represents an emulated Linux kernel. It must be initialized by calling @@ -159,9 +158,6 @@ type Kernel struct { // exitErr is the error causing the sandbox to exit, if any. It is // protected by extMu. exitErr error - - // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. - danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` } // InitKernelArgs holds arguments to Init. @@ -426,8 +422,6 @@ func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) erro return err } - tcpip.AsyncLoading.Wait() - log.Infof("Overall load took [%s]", time.Since(loadStart)) // Applications may size per-cpu structures based on k.applicationCores, so diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go deleted file mode 100644 index bb2d5102d..000000000 --- a/pkg/sentry/kernel/kernel_state.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package kernel - -import ( - "gvisor.googlesource.com/gvisor/pkg/tcpip" -) - -// saveDanglingEndpoints is invoked by stateify. -func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint { - return tcpip.GetDanglingEndpoints() -} - -// loadDanglingEndpoints is invoked by stateify. -func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) { - for _, e := range es { - tcpip.AddDanglingEndpoint(e) - } -} diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index 260d7d05c..030ae98d1 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -5,5 +5,5 @@ package stack // StackFromEnv is the global stack created in restore run. -// FIXME +// FIXME: remove this variable once tcpip S/R is fully supported. var StackFromEnv *Stack diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 17fa0efb7..cf25a086d 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -23,7 +23,6 @@ import ( "fmt" "strconv" "strings" - "sync" "time" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" @@ -553,38 +552,3 @@ type ProtocolAddress struct { // Address is a network address. Address Address } - -// danglingEndpointsMu protects access to danglingEndpoints. -var danglingEndpointsMu sync.Mutex - -// danglingEndpoints tracks all dangling endpoints no longer owned by the app. -var danglingEndpoints = make(map[Endpoint]struct{}) - -// GetDanglingEndpoints returns all dangling endpoints. -func GetDanglingEndpoints() []Endpoint { - es := make([]Endpoint, 0, len(danglingEndpoints)) - danglingEndpointsMu.Lock() - for e, _ := range danglingEndpoints { - es = append(es, e) - } - danglingEndpointsMu.Unlock() - return es -} - -// AddDanglingEndpoint adds a dangling endpoint. -func AddDanglingEndpoint(e Endpoint) { - danglingEndpointsMu.Lock() - danglingEndpoints[e] = struct{}{} - danglingEndpointsMu.Unlock() -} - -// DeleteDanglingEndpoint removes a dangling endpoint. 
-func DeleteDanglingEndpoint(e Endpoint) { - danglingEndpointsMu.Lock() - delete(danglingEndpoints, e) - danglingEndpointsMu.Unlock() -} - -// AsyncLoading is the global barrier for asynchronous endpoint loading -// activities. -var AsyncLoading sync.WaitGroup diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index d129aa285..f38f58e87 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -10,16 +10,11 @@ go_stateify( "endpoint.go", "endpoint_state.go", "rcv.go", - "segment.go", "segment_heap.go", - "segment_queue.go", - "segment_state.go", "snd.go", - "snd_state.go", "tcp_segment_list.go", ], out = "tcp_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], package = "tcp", ) @@ -48,9 +43,7 @@ go_library( "segment.go", "segment_heap.go", "segment_queue.go", - "segment_state.go", "snd.go", - "snd_state.go", "tcp_segment_list.go", "tcp_state.go", "timer.go", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 410dfdad4..85adeef0e 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -68,8 +68,7 @@ func encodeMSS(mss uint16) uint32 { // to go above a threshold. var synRcvdCount struct { sync.Mutex - value uint64 - pending sync.WaitGroup + value uint64 } // listenContext is used by a listening endpoint to store state used while @@ -103,7 +102,6 @@ func incSynRcvdCount() bool { return false } - synRcvdCount.pending.Add(1) synRcvdCount.value++ return true @@ -117,7 +115,6 @@ func decSynRcvdCount() { defer synRcvdCount.Unlock() synRcvdCount.value-- - synRcvdCount.pending.Done() } // newListenContext creates a new listen context. @@ -295,7 +292,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { opts := parseSynSegmentOptions(s) if incSynRcvdCount() { s.incRef() - go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier. 
+ go e.handleSynSegment(ctx, s, &opts) // S/R-FIXME } else { cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) // Send SYN with window scaling because we currently @@ -384,12 +381,10 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { return nil } if n¬ifyDrain != 0 { - for !e.segmentQueue.empty() { - s := e.segmentQueue.dequeue() + for s := e.segmentQueue.dequeue(); s != nil; s = e.segmentQueue.dequeue() { e.handleListenSegment(ctx, s) s.decRef() } - synRcvdCount.pending.Wait() close(e.drainDone) <-e.undrain } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index d9f87c793..9aaabe0b1 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -443,8 +443,7 @@ func (h *handshake) execute() *tcpip.Error { return tcpip.ErrAborted } if n¬ifyDrain != 0 { - for !h.ep.segmentQueue.empty() { - s := h.ep.segmentQueue.dequeue() + for s := h.ep.segmentQueue.dequeue(); s != nil; s = h.ep.segmentQueue.dequeue() { err := h.handleSegment(s) s.decRef() if err != nil { @@ -814,13 +813,15 @@ func (e *endpoint) handleSegments() *tcpip.Error { // protocolMainLoop is the main loop of the TCP protocol. It runs in its own // goroutine and is responsible for sending segments and handling received // segments. -func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { +func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { var closeTimer *time.Timer var closeWaker sleep.Waker defer func() { // e.mu is expected to be hold upon entering this section. 
+ e.completeWorkerLocked() + if e.snd != nil { e.snd.resendTimer.cleanup() } @@ -829,8 +830,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { closeTimer.Stop() } - e.completeWorkerLocked() - if e.drainDone != nil { close(e.drainDone) } @@ -841,7 +840,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) }() - if handshake { + if !passive { // This is an active connection, so we must initiate the 3-way // handshake, and then inform potential waiters about its // completion. @@ -946,17 +945,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { closeWaker.Assert() }) } - - if n&notifyDrain != 0 { - for !e.segmentQueue.empty() { - if err := e.handleSegments(); err != nil { - return err - } - } - close(e.drainDone) - <-e.undrain - } - return nil }, }, @@ -968,27 +956,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { s.AddWaker(funcs[i].w, i) } - // The following assertions and notifications are needed for restored - // endpoints. Fresh newly created endpoints have empty states and should - // not invoke any. - e.segmentQueue.mu.Lock() - if !e.segmentQueue.list.Empty() { - e.newSegmentWaker.Assert() - } - e.segmentQueue.mu.Unlock() - - e.rcvListMu.Lock() - if !e.rcvList.Empty() { - e.waiterQueue.Notify(waiter.EventIn) - } - e.rcvListMu.Unlock() - - e.mu.RLock() - if e.workerCleanup { - e.notifyProtocolGoroutine(notifyClose) - } - e.mu.RUnlock() - // Main loop. Handle segments until both send and receive ends of the // connection have completed. 
for !e.rcv.closed || !e.snd.closed || e.snd.sndUna != e.snd.sndNxtList { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 706977618..b21c2b4ab 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -69,7 +69,7 @@ type endpoint struct { // change throughout the lifetime of the endpoint. stack *stack.Stack `state:"manual"` netProto tcpip.NetworkProtocolNumber - waiterQueue *waiter.Queue `state:"wait"` + waiterQueue *waiter.Queue // lastError represents the last error that the endpoint reported; // access to it is protected by the following mutex. @@ -82,8 +82,8 @@ type endpoint struct { // // Once the peer has closed its send side, rcvClosed is set to true // to indicate to users that no more data is coming. - rcvListMu sync.Mutex `state:"nosave"` - rcvList segmentList `state:"wait"` + rcvListMu sync.Mutex `state:"nosave"` + rcvList segmentList rcvClosed bool rcvBufSize int rcvBufUsed int @@ -91,8 +91,8 @@ type endpoint struct { // The following fields are protected by the mutex. mu sync.RWMutex `state:"nosave"` id stack.TransportEndpointID - state endpointState `state:".(endpointState)"` - isPortReserved bool `state:"manual"` + state endpointState + isPortReserved bool `state:"manual"` isRegistered bool boundNICID tcpip.NICID `state:"manual"` route stack.Route `state:"manual"` @@ -118,7 +118,7 @@ type endpoint struct { // workerCleanup specifies if the worker goroutine must perform cleanup // before exitting. This can only be set to true when workerRunning is // also true, and they're both protected by the mutex. - workerCleanup bool + workerCleanup bool `state:"zerovalue"` // sendTSOk is used to indicate when the TS Option has been negotiated. // When sendTSOk is true every non-RST segment should carry a TS as per @@ -153,7 +153,7 @@ type endpoint struct { // segmentQueue is used to hand received segments to the protocol // goroutine. 
Segments are queued as long as the queue is not full, // and dropped when it is. - segmentQueue segmentQueue `state:"wait"` + segmentQueue segmentQueue `state:"zerovalue"` // The following fields are used to manage the send buffer. When // segments are ready to be sent, they are added to sndQueue and the @@ -166,7 +166,7 @@ type endpoint struct { sndBufUsed int sndClosed bool sndBufInQueue seqnum.Size - sndQueue segmentList `state:"wait"` + sndQueue segmentList sndWaker sleep.Waker `state:"manual"` sndCloseWaker sleep.Waker `state:"manual"` @@ -188,21 +188,17 @@ type endpoint struct { // notifyFlags is a bitmask of flags used to indicate to the protocol // goroutine what it was notified; this is only accessed atomically. - notifyFlags uint32 `state:"nosave"` + notifyFlags uint32 `state:"zerovalue"` // acceptedChan is used by a listening endpoint protocol goroutine to // send newly accepted connections to the endpoint so that they can be // read by Accept() calls. - acceptedChan chan *endpoint `state:"manual"` - - // acceptedEndpoints is only used to save / restore the channel buffer. - // FIXME - acceptedEndpoints []*endpoint + acceptedChan chan *endpoint `state:".(endpointChan)"` // The following are only used from the protocol goroutine, and // therefore don't need locks to protect them. - rcv *receiver `state:"wait"` - snd *sender `state:"wait"` + rcv *receiver + snd *sender // The goroutine drain completion notification channel. drainDone chan struct{} `state:"nosave"` @@ -215,7 +211,6 @@ type endpoint struct { probe stack.TCPProbeFunc `state:"nosave"` // The following are only used to assist the restore run to re-connect. - bindAddress tcpip.Address connectingAddress tcpip.Address } @@ -349,7 +344,6 @@ func (e *endpoint) Close() { // Either perform the local cleanup or kick the worker to make sure it // knows it needs to cleanup. 
- tcpip.AddDanglingEndpoint(e) if !e.workerRunning { e.cleanupLocked() } else { @@ -369,12 +363,9 @@ func (e *endpoint) cleanupLocked() { if e.acceptedChan != nil { close(e.acceptedChan) for n := range e.acceptedChan { - n.mu.Lock() n.resetConnectionLocked(tcpip.ErrConnectionAborted) - n.mu.Unlock() n.Close() } - e.acceptedChan = nil } e.workerCleanup = false @@ -383,7 +374,6 @@ func (e *endpoint) cleanupLocked() { } e.route.Release() - tcpip.DeleteDanglingEndpoint(e) } // Read reads data from the endpoint. @@ -796,16 +786,6 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocol // Connect connects the endpoint to its peer. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { - return e.connect(addr, true, true) -} - -// connect connects the endpoint to its peer. In the normal non-S/R case, the -// new connection is expected to run the main goroutine and perform handshake. -// In restore of previously connected endpoints, both ends will be passively -// created (so no new handshaking is done); for stack-accepted connections not -// yet accepted by the app, they are restored without running the main goroutine -// here. -func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() @@ -917,27 +897,9 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc e.boundNICID = nicid e.effectiveNetProtos = netProtos e.connectingAddress = connectingAddr + e.workerRunning = true - // Connect in the restore phase does not perform handshake. Restore its - // connection setting here. 
- if !handshake { - e.segmentQueue.mu.Lock() - for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} { - for s := l.Front(); s != nil; s = s.Next() { - s.id = e.id - s.route = r.Clone() - e.sndWaker.Assert() - } - } - e.segmentQueue.mu.Unlock() - e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) - e.state = stateConnected - } - - if run { - e.workerRunning = true - go e.protocolMainLoop(handshake) // S/R-SAFE: will be drained before save. - } + go e.protocolMainLoop(false) // S/R-SAFE: will be drained before save. return tcpip.ErrConnectStarted } @@ -1009,9 +971,6 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error { if len(e.acceptedChan) > backlog { return tcpip.ErrInvalidEndpointState } - if cap(e.acceptedChan) == backlog { - return nil - } origChan := e.acceptedChan e.acceptedChan = make(chan *endpoint, backlog) close(origChan) @@ -1049,7 +1008,7 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error { func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) { e.waiterQueue = waiterQueue e.workerRunning = true - go e.protocolMainLoop(false) // S/R-SAFE: drained on save. 
+ go e.protocolMainLoop(true) // S/R-FIXME } // Accept returns a new endpoint if a peer has established a connection @@ -1090,7 +1049,6 @@ func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) (ret return tcpip.ErrAlreadyBound } - e.bindAddress = addr.Addr netProto, err := e.checkV4Mapped(&addr) if err != nil { return err diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 38c97c796..b1e249bff 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -9,7 +9,6 @@ import ( "sync" "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" ) @@ -23,7 +22,7 @@ func (e *endpoint) drainSegmentLocked() { e.undrain = make(chan struct{}) e.mu.Unlock() - e.notifyProtocolGoroutine(notifyDrain) + e.notificationWaker.Assert() <-e.drainDone e.mu.Lock() @@ -39,98 +38,37 @@ func (e *endpoint) beforeSave() { switch e.state { case stateInitial, stateBound: - case stateListen, stateConnecting, stateConnected: - if e.state == stateConnected && !e.workerRunning { - // The endpoint must be in acceptedChan. 
- break + case stateListen: + if !e.segmentQueue.empty() { + e.drainSegmentLocked() } + case stateConnecting: e.drainSegmentLocked() - if e.state != stateClosed && e.state != stateError { - if !e.workerRunning { - panic("endpoint has no worker running in listen, connecting, or connected state") - } + if e.state != stateConnected { break } fallthrough + case stateConnected: + // FIXME + panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%v, remote %v:%v", e.id.LocalAddress, e.id.LocalPort, e.id.RemoteAddress, e.id.RemotePort)}) case stateClosed, stateError: if e.workerRunning { - panic("endpoint still has worker running in closed or error state") + panic(fmt.Sprintf("endpoint still has worker running in closed or error state")) } default: panic(fmt.Sprintf("endpoint in unknown state %v", e.state)) } - - if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() { - panic("endpoint still has waiters upon save") - } - - if !((e.state == stateBound || e.state == stateListen) == e.isPortReserved) { - panic("endpoint port must and must only be reserved in bound or listen state") - } - - if e.acceptedChan != nil { - close(e.acceptedChan) - e.acceptedEndpoints = make([]*endpoint, len(e.acceptedChan), cap(e.acceptedChan)) - i := 0 - for ep := range e.acceptedChan { - e.acceptedEndpoints[i] = ep - i++ - } - if i != len(e.acceptedEndpoints) { - panic("endpoint acceptedChan buffer got consumed by background context") - } - } -} - -// saveState is invoked by stateify. -func (e *endpoint) saveState() endpointState { - return e.state -} - -// Endpoint loading must be done in the following ordering by their state, to -// avoid dangling connecting w/o listening peer, and to avoid conflicts in port -// reservation. -var connectedLoading sync.WaitGroup -var listenLoading sync.WaitGroup -var connectingLoading sync.WaitGroup - -// Bound endpoint loading happens last. - -// loadState is invoked by stateify. 
-func (e *endpoint) loadState(state endpointState) { - // This is to ensure that the loading wait groups include all applicable - // endpoints before any asynchronous calls to the Wait() methods. - switch state { - case stateConnected: - connectedLoading.Add(1) - case stateListen: - listenLoading.Add(1) - case stateConnecting: - connectingLoading.Add(1) - } - e.state = state } // afterLoad is invoked by stateify. func (e *endpoint) afterLoad() { - // We load acceptedChan buffer indirectly here. Note that closed - // endpoints might not need to allocate the channel. - // FIXME - if cap(e.acceptedEndpoints) > 0 { - e.acceptedChan = make(chan *endpoint, cap(e.acceptedEndpoints)) - for _, ep := range e.acceptedEndpoints { - e.acceptedChan <- ep - } - e.acceptedEndpoints = nil - } - e.stack = stack.StackFromEnv e.segmentQueue.setLimit(2 * e.rcvBufSize) e.workMu.Init() state := e.state switch state { - case stateInitial, stateBound, stateListen, stateConnecting, stateConnected: + case stateInitial, stateBound, stateListen, stateConnecting: var ss SendBufferSizeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil { if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max { @@ -142,72 +80,65 @@ func (e *endpoint) afterLoad() { } } - bind := func() { + switch state { + case stateBound, stateListen, stateConnecting: e.state = stateInitial - if len(e.bindAddress) == 0 { - e.bindAddress = e.id.LocalAddress - } - if err := e.Bind(tcpip.FullAddress{Addr: e.bindAddress, Port: e.id.LocalPort}, nil); err != nil { + if err := e.Bind(tcpip.FullAddress{Addr: e.id.LocalAddress, Port: e.id.LocalPort}, nil); err != nil { panic("endpoint binding failed: " + err.String()) } } switch state { - case stateConnected: - bind() - if len(e.connectingAddress) == 0 { - // This endpoint is accepted by netstack but not yet by - // the app. 
If the endpoint is IPv6 but the remote - // address is IPv4, we need to connect as IPv6 so that - // dual-stack mode can be properly activated. - if e.netProto == header.IPv6ProtocolNumber && len(e.id.RemoteAddress) != header.IPv6AddressSize { - e.connectingAddress = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" + e.id.RemoteAddress - } else { - e.connectingAddress = e.id.RemoteAddress - } + case stateListen: + backlog := cap(e.acceptedChan) + e.acceptedChan = nil + if err := e.Listen(backlog); err != nil { + panic("endpoint listening failed: " + err.String()) } - if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted { + } + + switch state { + case stateConnecting: + if err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}); err != tcpip.ErrConnectStarted { panic("endpoint connecting failed: " + err.String()) } - connectedLoading.Done() - case stateListen: - tcpip.AsyncLoading.Add(1) - go func() { - connectedLoading.Wait() - bind() - backlog := cap(e.acceptedChan) - if err := e.Listen(backlog); err != nil { - panic("endpoint listening failed: " + err.String()) - } - listenLoading.Done() - tcpip.AsyncLoading.Done() - }() - case stateConnecting: - tcpip.AsyncLoading.Add(1) - go func() { - connectedLoading.Wait() - listenLoading.Wait() - bind() - if err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}); err != tcpip.ErrConnectStarted { - panic("endpoint connecting failed: " + err.String()) - } - connectingLoading.Done() - tcpip.AsyncLoading.Done() - }() - case stateBound: - tcpip.AsyncLoading.Add(1) - go func() { - connectedLoading.Wait() - listenLoading.Wait() - connectingLoading.Wait() - bind() - tcpip.AsyncLoading.Done() - }() - case stateClosed, stateError: - tcpip.DeleteDanglingEndpoint(e) } } +// saveAcceptedChan is invoked by stateify. 
+func (e *endpoint) saveAcceptedChan() endpointChan { + if e.acceptedChan == nil { + return endpointChan{} + } + close(e.acceptedChan) + buffer := make([]*endpoint, 0, len(e.acceptedChan)) + for ep := range e.acceptedChan { + buffer = append(buffer, ep) + } + if len(buffer) != cap(buffer) { + panic("endpoint.acceptedChan buffer got consumed by background context") + } + c := cap(e.acceptedChan) + e.acceptedChan = nil + return endpointChan{buffer: buffer, cap: c} +} + +// loadAcceptedChan is invoked by stateify. +func (e *endpoint) loadAcceptedChan(c endpointChan) { + if c.cap == 0 { + return + } + e.acceptedChan = make(chan *endpoint, c.cap) + for _, ep := range c.buffer { + e.acceptedChan <- ep + } +} + +type endpointChan struct { + buffer []*endpoint + cap int +} + // saveLastError is invoked by stateify. func (e *endpoint) saveLastError() string { if e.lastError == nil { diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index c5bff5f4f..07e4bfd73 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -29,9 +29,9 @@ const ( type segment struct { segmentEntry refCnt int32 - id stack.TransportEndpointID `state:"manual"` - route stack.Route `state:"manual"` - data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + id stack.TransportEndpointID + route stack.Route `state:"manual"` + data buffer.VectorisedView // views is used as buffer for data when its length is large // enough to store a VectorisedView. views [8]buffer.View @@ -45,7 +45,7 @@ type segment struct { // parsedOptions stores the parsed values from the options in the segment. 
parsedOptions header.TCPOptions - options []byte `state:".([]byte)"` + options []byte } func newSegment(r *stack.Route, id stack.TransportEndpointID, vv *buffer.VectorisedView) *segment { diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index a5e7b2ebf..c4a7f7d5b 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -12,8 +12,8 @@ import ( // segmentQueue is a bounded, thread-safe queue of TCP segments. type segmentQueue struct { - mu sync.Mutex `state:"nosave"` - list segmentList `state:"wait"` + mu sync.Mutex + list segmentList limit int used int } diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go deleted file mode 100644 index e5243200b..000000000 --- a/pkg/tcpip/transport/tcp/segment_state.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2018 The Netstack Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package tcp - -import ( - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" -) - -// saveData is invoked by stateify. -func (s *segment) saveData() buffer.VectorisedView { - // We cannot save s.data directly as s.data.views may alias to s.views, - // which is not allowed by state framework (in-struct pointer). - return s.data.Clone(nil) -} - -// loadData is invoked by stateify. -func (s *segment) loadData(data buffer.VectorisedView) { - // NOTE: We cannot do the s.data = data.Clone(s.views[:]) optimization - // here because data.views is not guaranteed to be loaded by now. Plus, - // data.views will be allocated anyway so there really is little point - // of utilizing s.views for data.views. - s.data = data -} - -// saveOptions is invoked by stateify. 
-func (s *segment) saveOptions() []byte { - // We cannot save s.options directly as it may point to s.data's trimmed - // tail, which is not allowed by state framework (in-struct pointer). - b := make([]byte, 0, cap(s.options)) - return append(b, s.options...) -} - -// loadOptions is invoked by stateify. -func (s *segment) loadOptions(options []byte) { - // NOTE: We cannot point s.options back into s.data's trimmed tail. But - // it is OK as they do not need to aliased. Plus, options is already - // allocated so there is no cost here. - s.options = options -} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index a98aca293..95bea4d88 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -28,7 +28,7 @@ type sender struct { ep *endpoint // lastSendTime is the timestamp when the last packet was sent. - lastSendTime time.Time `state:".(unixTime)"` + lastSendTime time.Time // dupAckCount is the number of duplicated acks received. It is used for // fast retransmit. @@ -71,7 +71,7 @@ type sender struct { rttMeasureSeqNum seqnum.Value // rttMeasureTime is the time when the rttMeasureSeqNum was sent. - rttMeasureTime time.Time `state:".(unixTime)"` + rttMeasureTime time.Time closed bool writeNext *segment diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go deleted file mode 100644 index d68773a7c..000000000 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 The Netstack Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package tcp - -import ( - "time" -) - -type unixTime struct { - second int64 - nano int64 -} - -// saveLastSendTime is invoked by stateify. -func (s *sender) saveLastSendTime() unixTime { - return unixTime{s.lastSendTime.Unix(), s.lastSendTime.UnixNano()} -} - -// loadLastSendTime is invoked by stateify. 
-func (s *sender) loadLastSendTime(unix unixTime) { - s.lastSendTime = time.Unix(unix.second, unix.nano) -} - -// saveRttMeasureTime is invoked by stateify. -func (s *sender) saveRttMeasureTime() unixTime { - return unixTime{s.rttMeasureTime.Unix(), s.rttMeasureTime.UnixNano()} -} - -// loadRttMeasureTime is invoked by stateify. -func (s *sender) loadRttMeasureTime(unix unixTime) { - s.rttMeasureTime = time.Unix(unix.second, unix.nano) -} - -// afterLoad is invoked by stateify. -func (s *sender) afterLoad() { - s.resendTimer.init(&s.resendWaker) -} -- cgit v1.2.3 From 33041b36cb7e8e9795545837355e4576ff2be4da Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 26 Jun 2018 11:34:16 -0700 Subject: Add Context to seqfile.SeqSource.ReadSeqFileData. PiperOrigin-RevId: 202163895 Change-Id: Ib9942fcff80c0834216f4f10780662bef5b52270 --- pkg/sentry/fs/proc/filesystems.go | 3 ++- pkg/sentry/fs/proc/loadavg.go | 4 +++- pkg/sentry/fs/proc/meminfo.go | 3 ++- pkg/sentry/fs/proc/mounts.go | 5 +++-- pkg/sentry/fs/proc/net.go | 4 ++-- pkg/sentry/fs/proc/seqfile/seqfile.go | 10 +++++----- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 2 +- pkg/sentry/fs/proc/stat.go | 3 ++- pkg/sentry/fs/proc/sys.go | 4 ++-- pkg/sentry/fs/proc/task.go | 10 +++++----- pkg/sentry/fs/proc/uid_gid_map.go | 2 +- pkg/sentry/fs/proc/version.go | 3 ++- 12 files changed, 30 insertions(+), 23 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index fe4de18ba..aa2c4db10 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" ) @@ -33,7 +34,7 @@ func (*filesystemsData) NeedsUpdate(generation int64) bool { // ReadSeqFileData returns data for the SeqFile reader. 
// SeqData, the current generation and where in the file the handle corresponds to. -func (*filesystemsData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (*filesystemsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { // We don't ever expect to see a non-nil SeqHandle. if h != nil { return nil, 0 diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 694cde656..7583b6ccd 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" ) @@ -29,7 +30,8 @@ func (*loadavgData) NeedsUpdate(generation int64) bool { return true } -func (d *loadavgData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (d *loadavgData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 489f796e5..49cb0faed 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -36,7 +37,7 @@ func (*meminfoData) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
-func (d *meminfoData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 3d276dfa5..b9988625e 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -19,6 +19,7 @@ import ( "fmt" "sort" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -67,7 +68,7 @@ func (mif *mountInfoFile) NeedsUpdate(_ int64) bool { } // ReadSeqFileData implements SeqSource.ReadSeqFileData. -func (mif *mountInfoFile) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if handle != nil { return nil, 0 } @@ -148,7 +149,7 @@ func (mf *mountsFile) NeedsUpdate(_ int64) bool { } // ReadSeqFileData implements SeqSource.ReadSeqFileData. -func (mf *mountsFile) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if handle != nil { return nil, 0 } diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index e6bd35f27..ad94c475a 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -94,7 +94,7 @@ func (*ifinet6) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
-func (n *ifinet6) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (n *ifinet6) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } @@ -119,7 +119,7 @@ func (n *netDev) NeedsUpdate(generation int64) bool { // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. See Linux's // net/core/net-procfs.c:dev_seq_show. -func (n *netDev) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (n *netDev) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index e37a85869..c08565f8a 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -49,7 +49,7 @@ type SeqSource interface { // generation. The first entry in the slice is greater than the handle. // If handle is nil then all known records are returned. Generation // must always be greater than 0. - ReadSeqFileData(handle SeqHandle) ([]SeqData, int64) + ReadSeqFileData(ctx context.Context, handle SeqHandle) ([]SeqData, int64) } // SeqGenerationCounter is a counter to keep track if the SeqSource should be @@ -155,7 +155,7 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, return 0, io.EOF } oldLen := len(s.source) - s.updateSourceLocked(len(s.source)) + s.updateSourceLocked(ctx, len(s.source)) updated = true // We know that we had consumed everything up until this point // so we search in the new slice instead of starting over. @@ -187,7 +187,7 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, // check to see if we've seeked backwards and if so always update our // data source. 
if !updated && (s.SeqSource.NeedsUpdate(s.generation) || s.lastRead > offset) { - s.updateSourceLocked(i) + s.updateSourceLocked(ctx, i) // recordOffset is 0 here and we won't update records behind the // current one so recordOffset is still 0 even though source // just got updated. Just read the next record. @@ -212,7 +212,7 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, } // updateSourceLocked requires that s.mu is held. -func (s *SeqFile) updateSourceLocked(record int) { +func (s *SeqFile) updateSourceLocked(ctx context.Context, record int) { var h SeqHandle if record == 0 { h = nil @@ -222,7 +222,7 @@ func (s *SeqFile) updateSourceLocked(record int) { // Save what we have previously read. s.source = s.source[:record] var newSource []SeqData - newSource, s.generation = s.SeqSource.ReadSeqFileData(h) + newSource, s.generation = s.SeqSource.ReadSeqFileData(ctx, h) s.source = append(s.source, newSource...) } diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index 0bf39ad82..d90e3e736 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -55,7 +55,7 @@ func (s *seqTest) NeedsUpdate(int64) bool { // ReadSeqFiledata returns a slice of SeqData which contains elements // greater than the handle. 
-func (s *seqTest) ReadSeqFileData(handle SeqHandle) ([]SeqData, int64) { +func (s *seqTest) ReadSeqFileData(ctx context.Context, handle SeqHandle) ([]SeqData, int64) { if handle == nil { return s.actual, 0 } diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index dee836a05..284f3e52b 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -19,6 +19,7 @@ import ( "fmt" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" ) @@ -73,7 +74,7 @@ func (c cpuStats) String() string { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. -func (s *statData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index db9ec83b9..aab891c53 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -62,7 +62,7 @@ func (*mmapMinAddrData) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. -func (d *mmapMinAddrData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (d *mmapMinAddrData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } @@ -81,7 +81,7 @@ func (*overcommitMemory) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource. 
-func (*overcommitMemory) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (*overcommitMemory) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 147d57a8f..e2d2ce3ba 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -304,7 +304,7 @@ func (md *mapsData) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. -func (md *mapsData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if mm := md.mm(); mm != nil { return mm.ReadSeqFileData(md.t.AsyncContext(), h) } @@ -334,7 +334,7 @@ func (s *taskStatData) NeedsUpdate(generation int64) bool { // ReadSeqFileData returns data for the SeqFile reader. // SeqData, the current generation and where in the file the handle corresponds to. -func (s *taskStatData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } @@ -405,7 +405,7 @@ func (s *statmData) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. -func (s *statmData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (s *statmData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } @@ -440,7 +440,7 @@ func (s *statusData) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
-func (s *statusData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } @@ -505,7 +505,7 @@ func (i *ioData) NeedsUpdate(generation int64) bool { // ReadSeqFileData returns data for the SeqFile reader. // SeqData, the current generation and where in the file the handle corresponds to. -func (i *ioData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index a2a070bdd..85acb5163 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -40,7 +40,7 @@ func (imss *idMapSeqSource) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. -func (imss *idMapSeqSource) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (imss *idMapSeqSource) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { var start int if handle != nil { start = handle.(*idMapSeqHandle).value diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index df3040d37..c0f2e87e3 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -17,6 +17,7 @@ package proc import ( "fmt" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" ) @@ -33,7 +34,7 @@ func (*versionData) NeedsUpdate(generation int64) bool { } // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
-func (v *versionData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { +func (v *versionData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if h != nil { return nil, 0 } -- cgit v1.2.3 From ea10949a0036cdef95a1397ccad8fcc138ce3c0d Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 26 Jun 2018 13:09:02 -0700 Subject: Use the correct Context for /proc/[pid]/maps. PiperOrigin-RevId: 202180487 Change-Id: I95cce41a4842ab731a4821b387b32008bfbdcb08 --- pkg/sentry/fs/proc/task.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index e2d2ce3ba..efc635946 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -306,7 +306,7 @@ func (md *mapsData) NeedsUpdate(generation int64) bool { // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if mm := md.mm(); mm != nil { - return mm.ReadSeqFileData(md.t.AsyncContext(), h) + return mm.ReadSeqFileData(ctx, h) } return []seqfile.SeqData{}, 0 } -- cgit v1.2.3 From dc33d71f8cf11bede2b224f4be730916f7faac81 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 26 Jun 2018 16:53:48 -0700 Subject: Change SIGCHLD to SIGKILL in ptrace stubs. If the child stubs are killed by any unmaskable signal (e.g. SIGKILL), then the parent process will similarly be killed, resulting in the death of all other stubs. The effect of this is that if the OOM killer selects and kills a stub, the effect is the same as though the OOM killer selected and killed the sentry. 
PiperOrigin-RevId: 202219984 Change-Id: I0b638ce7e59e0a0f4d5cde12a7d05242673049d7 --- pkg/sentry/platform/ptrace/subprocess_linux.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 227dd4882..b3f2ebb20 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -41,7 +41,15 @@ func createStub() (*thread, error) { // Among other things, beforeFork masks all signals. beforeFork() - pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|syscall.CLONE_FILES, 0, 0, 0, 0, 0) + + // When creating the new child process, we specify SIGKILL as the + // signal to deliver when the child exits. We never expect a subprocess + // to exit; they are pooled and reused. This is done to ensure that if + // a subprocess is OOM-killed, this process (and all other stubs, + // transitively) will be killed as well. It's simply not possible to + // safely handle a single stub getting killed: the exact state of + // execution is unknown and not recoverable. + pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0) if errno != 0 { afterFork() return nil, errno @@ -103,10 +111,12 @@ func (s *subprocess) createStub() (*thread, error) { // // Instead, we create the child untraced, which will do the PDEATHSIG // setup and then SIGSTOP itself for our attach below. + // + // See above re: SIGKILL. 
pid, err := t.syscallIgnoreInterrupt( ®s, syscall.SYS_CLONE, - arch.SyscallArgument{Value: uintptr(syscall.SIGCHLD | syscall.CLONE_FILES)}, + arch.SyscallArgument{Value: uintptr(syscall.SIGKILL | syscall.CLONE_FILES)}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, @@ -127,7 +137,7 @@ func (s *subprocess) createStub() (*thread, error) { syscall.SYS_WAIT4, arch.SyscallArgument{Value: uintptr(pid)}, arch.SyscallArgument{Value: 0}, - arch.SyscallArgument{Value: syscall.WUNTRACED}, + arch.SyscallArgument{Value: syscall.WALL | syscall.WUNTRACED}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) -- cgit v1.2.3 From c186e408cc61cbefd6d72c2ff3e9d629572570db Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 26 Jun 2018 19:04:51 -0700 Subject: Add KVM, overlay and host network to image tests PiperOrigin-RevId: 202236006 Change-Id: I4ea964a70fc49e8b51c9da27d77301c4eadaae71 --- kokoro/gcp_ubuntu/run_tests.sh | 3 +++ pkg/sentry/fs/overlay.go | 2 +- runsc/test/image/install.sh | 9 ++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/kokoro/gcp_ubuntu/run_tests.sh b/kokoro/gcp_ubuntu/run_tests.sh index 2f5e375eb..5554350da 100755 --- a/kokoro/gcp_ubuntu/run_tests.sh +++ b/kokoro/gcp_ubuntu/run_tests.sh @@ -46,6 +46,9 @@ exit_code=${?} if [[ ${exit_code} -eq 0 ]]; then # image_test is tagged manual bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime} //runsc/test/image:image_test + bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-kvm //runsc/test/image:image_test + bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-nethost //runsc/test/image:image_test + bazel test --test_output=errors --test_env=RUNSC_RUNTIME=${runtime}-overlay //runsc/test/image:image_test exit_code=${?} fi diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 90d21642e..a63f00e0e 100644 --- 
a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -113,7 +113,7 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount // - lower must not require that file objects be revalidated. // - lower must not have dynamic file/directory content. func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) { - if IsRegular(lower.StableAttr) { + if !IsRegular(lower.StableAttr) { return nil, fmt.Errorf("lower Inode is not a regular file") } msrc := newOverlayMountSource(upperMS, lower.MountSource, flags) diff --git a/runsc/test/image/install.sh b/runsc/test/image/install.sh index 94832dbe4..c110d96f9 100755 --- a/runsc/test/image/install.sh +++ b/runsc/test/image/install.sh @@ -75,10 +75,17 @@ if [[ ${uninstall} == 0 ]]; then mkdir -p "${logdir}" sudo -n chmod a+wx "${logdir}" - sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" --debug-log-dir "${logdir}" --debug --strace --log-packets + declare -r args="--debug-log-dir "${logdir}" --debug --strace --log-packets" + sudo -n "${dockercfg}" runtime-add "${runtime}" "${runsc}" ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-kvm "${runsc}" --platform=kvm ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-hostnet "${runsc}" --network=host ${args} + sudo -n "${dockercfg}" runtime-add "${runtime}"-overlay "${runsc}" --overlay ${args} else sudo -n "${dockercfg}" runtime-rm "${runtime}" + sudo -n "${dockercfg}" runtime-rm "${runtime}"-kvm + sudo -n "${dockercfg}" runtime-rm "${runtime}"-hostnet + sudo -n "${dockercfg}" runtime-rm "${runtime}"-overlay fi echo "Restarting docker service..." -- cgit v1.2.3 From 4215e059e24c5ed6298060769444b0eeaa03da8a Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 27 Jun 2018 13:41:50 -0700 Subject: Ignore MADV_DONTDUMP and MADV_DODUMP. 
PiperOrigin-RevId: 202361912 Change-Id: I1d0ee529073954d467b870872f494cebbf8ea61a --- pkg/sentry/syscalls/linux/sys_mmap.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index bfa23f6a8..1a98328dc 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -181,6 +181,10 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca fallthrough case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: fallthrough + case linux.MADV_DONTDUMP, linux.MADV_DODUMP: + // TODO: Core dumping isn't implemented, so these are + // no-ops. + fallthrough case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: // Do nothing, we totally ignore the suggestions above. return 0, nil, nil -- cgit v1.2.3 From 99afc982f1f0e40059e1446ea6f3cb725b1fbde7 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 27 Jun 2018 14:30:45 -0700 Subject: Call mm.CheckIORange() when copying in IOVecs. CheckIORange is analogous to Linux's access_ok() method, which is checked when copying in IOVecs in both lib/iov_iter.c:import_single_range() and lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(). gVisor copies in IOVecs via Task.SingleIOSequence() and Task.CopyInIovecs(). We were checking the address range bounds, but not whether the address is valid. To conform with Linux, we should also check that the address is valid. For usual preadv/pwritev syscalls, the effect of this change is not noticeable, since we find out that the address is invalid before the syscall completes. For vectorized async-IO operations, however, this change is necessary because Linux returns EFAULT when the operation is submitted, but before it executes. Thus, we must validate the iovecs when copying them in. 
PiperOrigin-RevId: 202370092 Change-Id: I8759a63ccf7e6b90d90d30f78ab8935a0fcf4936 --- pkg/sentry/kernel/task_usermem.go | 17 +++++++++++------ pkg/sentry/mm/io.go | 18 +++++++++--------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 54964dd0d..2b4954869 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -184,6 +184,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error // - If the length of any AddrRange would cause its end to overflow, // CopyInIovecs returns EFAULT. // +// - If any AddrRange would include addresses outside the application address +// range, CopyInIovecs returns EFAULT. +// // - The combined length of all AddrRanges is limited to _MAX_RW_COUNT. If the // combined length of all AddrRanges would otherwise exceed this amount, ranges // beyond _MAX_RW_COUNT are silently truncated. @@ -218,7 +221,7 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange if length > math.MaxInt64 { return usermem.AddrRangeSeq{}, syserror.EINVAL } - ar, ok := base.ToRange(length) + ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) if !ok { return usermem.AddrRangeSeq{}, syserror.EFAULT } @@ -251,18 +254,20 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange } // SingleIOSequence returns a usermem.IOSequence representing [addr, -// addr+length) in t's address space. If length exceeds _MAX_RW_COUNT, it is -// silently truncated. +// addr+length) in t's address space. If this contains addresses outside the +// application address range, it returns EFAULT. If length exceeds +// _MAX_RW_COUNT, the range is silently truncated. // // SingleIOSequence is analogous to Linux's // lib/iov_iter.c:import_single_range(). 
(Note that the non-vectorized read and -// write syscalls in Linux do not use import_single_range(), but are still -// truncated to _MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) +// write syscalls in Linux do not use import_single_range(). However they check +// access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address +// ranges are truncated to _MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { if length > _MAX_RW_COUNT { length = _MAX_RW_COUNT } - ar, ok := addr.ToRange(uint64(length)) + ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) if !ok { return usermem.IOSequence{}, syserror.EFAULT } diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index cac81a59d..6741db594 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -60,11 +60,11 @@ const ( rwMapMinBytes = 512 ) -// checkIORange is similar to usermem.Addr.ToRange, but applies bounds checks +// CheckIORange is similar to usermem.Addr.ToRange, but applies bounds checks // consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok(). // // Preconditions: length >= 0. -func (mm *MemoryManager) checkIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) { +func (mm *MemoryManager) CheckIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) { // Note that access_ok() constrains end even if length == 0. ar, ok := addr.ToRange(uint64(length)) return ar, (ok && ar.End <= mm.layout.MaxAddr) @@ -75,7 +75,7 @@ func (mm *MemoryManager) checkIORange(addr usermem.Addr, length int64) (usermem. 
func (mm *MemoryManager) checkIOVec(ars usermem.AddrRangeSeq) bool { for !ars.IsEmpty() { ar := ars.Head() - if _, ok := mm.checkIORange(ar.Start, int64(ar.Length())); !ok { + if _, ok := mm.CheckIORange(ar.Start, int64(ar.Length())); !ok { return false } ars = ars.Tail() @@ -101,7 +101,7 @@ func translateIOError(ctx context.Context, err error) error { // CopyOut implements usermem.IO.CopyOut. func (mm *MemoryManager) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { - ar, ok := mm.checkIORange(addr, int64(len(src))) + ar, ok := mm.CheckIORange(addr, int64(len(src))) if !ok { return 0, syserror.EFAULT } @@ -144,7 +144,7 @@ func (mm *MemoryManager) asCopyOut(ctx context.Context, addr usermem.Addr, src [ // CopyIn implements usermem.IO.CopyIn. func (mm *MemoryManager) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { - ar, ok := mm.checkIORange(addr, int64(len(dst))) + ar, ok := mm.CheckIORange(addr, int64(len(dst))) if !ok { return 0, syserror.EFAULT } @@ -187,7 +187,7 @@ func (mm *MemoryManager) asCopyIn(ctx context.Context, addr usermem.Addr, dst [] // ZeroOut implements usermem.IO.ZeroOut. func (mm *MemoryManager) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { - ar, ok := mm.checkIORange(addr, toZero) + ar, ok := mm.CheckIORange(addr, toZero) if !ok { return 0, syserror.EFAULT } @@ -311,7 +311,7 @@ func (mm *MemoryManager) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, // SwapUint32 implements usermem.IO.SwapUint32. 
func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { - ar, ok := mm.checkIORange(addr, 4) + ar, ok := mm.CheckIORange(addr, 4) if !ok { return 0, syserror.EFAULT } @@ -353,7 +353,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new // CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { - ar, ok := mm.checkIORange(addr, 4) + ar, ok := mm.CheckIORange(addr, 4) if !ok { return 0, syserror.EFAULT } @@ -399,7 +399,7 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem. // Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr). func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error { // Try to map all remaining pages in the I/O operation. This RoundUp can't - // overflow because otherwise it would have been caught by checkIORange. + // overflow because otherwise it would have been caught by CheckIORange. 
end, _ := ioar.End.RoundUp() ar := usermem.AddrRange{addr.RoundDown(), end} -- cgit v1.2.3 From 6b6852bceb12900f27a541682ddfe47893911c6e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 27 Jun 2018 14:40:37 -0700 Subject: Fix semaphore data races PiperOrigin-RevId: 202371908 Change-Id: I72603b1d321878cae6404987c49e64732b676331 --- pkg/sentry/kernel/semaphore/semaphore.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index e9027dc14..a1ee83ce5 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -118,6 +118,9 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu if !private { // Look up an existing semaphore. if set := r.findByKey(key); set != nil { + set.mu.Lock() + defer set.mu.Unlock() + // Check that caller can access semaphore set. creds := auth.CredentialsFromContext(ctx) if !set.checkPerms(creds, fs.PermsFromMode(mode)) { @@ -170,6 +173,9 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { return syserror.EINVAL } + set.mu.Lock() + defer set.mu.Unlock() + // "The effective user ID of the calling process must match the creator or // owner of the semaphore set, or the caller must be privileged." if !set.checkCredentials(creds) && !set.checkCapability(creds) { @@ -444,11 +450,9 @@ func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { return s.checkCapability(creds) } +// destroy destroys the set. Caller must hold 's.mu'. func (s *Set) destroy() { - s.mu.Lock() - defer s.mu.Unlock() - - // Notify all waiters. Tney will fail on the next attempt to execute + // Notify all waiters. They will fail on the next attempt to execute // operations and return error. 
s.dead = true for _, s := range s.sems { -- cgit v1.2.3 From 1ceed49ba94c139be274fe5eaf367201ab0042a6 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 28 Jun 2018 12:54:14 -0700 Subject: Check for invalid offset when submitting an AIO read/write request. PiperOrigin-RevId: 202528335 Change-Id: Ic32312cf4337bcb40a7155cb2174e5cd89a280f7 --- pkg/sentry/syscalls/linux/sys_aio.go | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 80407a082..470027206 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -319,6 +319,14 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad return err } + // Check offset for reads/writes. + switch cb.OpCode { + case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV, _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: + if cb.Offset < 0 { + return syserror.EINVAL + } + } + // Prepare the request. ctx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { -- cgit v1.2.3 From f93bd2cbe66817c300114630bb52702466e52129 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 28 Jun 2018 16:10:17 -0700 Subject: Hold t.mu while calling t.FSContext(). PiperOrigin-RevId: 202562686 Change-Id: I0f5be7cc9098e86fa31d016251c127cb91084b05 --- pkg/sentry/fs/proc/mounts.go | 15 ++++++++++++++- pkg/sentry/kernel/fs_context.go | 17 ++++++++++++----- pkg/sentry/kernel/task_resources.go | 4 +++- 3 files changed, 29 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index b9988625e..108432f4e 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -28,9 +28,22 @@ import ( // forEachMountSource runs f for the process root mount and each mount that is a // descendant of the root. 
func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { + var fsctx *kernel.FSContext + t.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() + }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return + } + // All mount points must be relative to the rootDir, and mounts outside // will be excluded. - rootDir := t.FSContext().RootDirectory() + rootDir := fsctx.RootDirectory() + if rootDir == nil { + // The task has been destroyed. Nothing to show here. + return + } defer rootDir.DecRef() if rootDir.Inode == nil { diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index d1ca9c09d..dbc097696 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -66,6 +66,11 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. func (f *FSContext) destroy() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + f.root.DecRef() f.root = nil @@ -94,9 +99,9 @@ func (f *FSContext) Fork() *FSContext { } // WorkingDirectory returns the current working directory. -// You should call DecRef on the returned Dirent when finished. // -// This will return nil if called after destroy(). +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -129,13 +134,15 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { } // RootDirectory returns the current filesystem root. -// You should call DecRef on the returned Dirent when finished. // -// This will return nil if called after destroy(). +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. 
func (f *FSContext) RootDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() - f.root.IncRef() + if f.root != nil { + f.root.IncRef() + } return f.root } diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go index 0389989e8..4ca25664a 100644 --- a/pkg/sentry/kernel/task_resources.go +++ b/pkg/sentry/kernel/task_resources.go @@ -123,6 +123,8 @@ func (t *Task) IsChrooted() bool { realRoot := t.k.mounts.Root() defer realRoot.DecRef() root := t.tr.FSContext.RootDirectory() - defer root.DecRef() + if root != nil { + defer root.DecRef() + } return root != realRoot } -- cgit v1.2.3 From 1b5e09f968bf923f5583e8bc7627691b7c62770a Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 29 Jun 2018 10:46:49 -0700 Subject: aio: Return EINVAL if the number of events is negative. PiperOrigin-RevId: 202671065 Change-Id: I248b74544d47ddde9cd59d89aa6ccb7dad2b6f89 --- pkg/sentry/syscalls/linux/sys_aio.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 470027206..345ef9bec 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -132,7 +132,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S timespecAddr := args[4].Pointer() // Sanity check arguments. - if minEvents > events { + if minEvents < 0 || minEvents > events { return 0, nil, syserror.EINVAL } @@ -359,6 +359,10 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc nrEvents := args[1].Int() addr := args[2].Pointer() + if nrEvents < 0 { + return 0, nil, syserror.EINVAL + } + for i := int32(0); i < nrEvents; i++ { // Copy in the address. 
cbAddrNative := t.Arch().Native(0) -- cgit v1.2.3 From 80bdf8a4068de3ac4a73b6b61a0cdcfe3e3571af Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Fri, 29 Jun 2018 14:46:45 -0700 Subject: Sets the restore environment for restoring a container. Updated how restoring occurs through boot.go with a separate Restore function. This prevents a new process and new mounts from being created. Added tests to ensure the container is restored. Registered checkpoint and restore commands so they can be used. Docker support for these commands is still limited. Working on #80. PiperOrigin-RevId: 202710950 Change-Id: I2b893ceaef6b9442b1ce3743bd112383cb92af0c --- pkg/sentry/fs/gofer/inode_state.go | 27 ++++++- runsc/boot/fs.go | 103 +++++++++++++++---------- runsc/boot/loader.go | 146 ++++++++++++++++++++++-------------- runsc/boot/loader_test.go | 74 +++++++++++++++++- runsc/cmd/boot.go | 2 + runsc/cmd/checkpoint.go | 2 +- runsc/cmd/restore.go | 2 +- runsc/container/container_test.go | 150 ++++++++++++++++++++++++++++++++++--- runsc/main.go | 2 + runsc/specutils/specutils.go | 11 --- 10 files changed, 391 insertions(+), 128 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 82d1dd4da..33ec33364 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -17,6 +17,7 @@ package gofer import ( "errors" "fmt" + "path/filepath" "strings" "gvisor.googlesource.com/gvisor/pkg/p9" @@ -77,6 +78,29 @@ func (i *inodeFileState) saveLoading() struct{} { return struct{}{} } +// splitAbsolutePath splits the path on slashes ignoring the leading slash. 
+func splitAbsolutePath(path string) []string { + if len(path) == 0 { + panic("There is no path!") + } + if path != filepath.Clean(path) { + panic(fmt.Sprintf("path %q is not clean", path)) + } + // This case is to return {} rather than {""} + if path == "/" { + return []string{} + } + if path[0] != '/' { + panic(fmt.Sprintf("path %q is not absolute", path)) + } + + s := strings.Split(path, "/") + + // Since p is absolute, the first component of s + // is an empty string. We must remove that. + return s[1:] +} + // loadLoading is invoked by stateify. func (i *inodeFileState) loadLoading(_ struct{}) { i.loading.Lock() @@ -98,7 +122,8 @@ func (i *inodeFileState) afterLoad() { // TODO: Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} var err error - _, i.file, err = i.s.attach.walk(ctx, strings.Split(name, "/")) + + _, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name)) if err != nil { return fmt.Errorf("failed to walk to %q: %v", name, err) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e0d7fc769..a9b2f225a 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -38,6 +38,14 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) +const ( + // Filesystem name for 9p gofer mounts. + rootFsName = "9p" + + // Device name for root mount. 
+ rootDevice = "9pfs-/" +) + type fdDispenser struct { fds []int } @@ -64,7 +72,8 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec if err != nil { return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } - if err := configureMounts(rootCtx, spec, conf, mns, fds); err != nil { + mounts := compileMounts(spec) + if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil { return nil, fmt.Errorf("failed to configure mounts: %v", err) } if !fds.empty() { @@ -73,27 +82,23 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec return mns, nil } -// configureMounts iterates over Spec.Mounts and mounts them in the specified -// mount namespace. -func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser) error { +// compileMounts returns the supported mounts from the mount spec, adding any +// additional mounts that are required by the OCI specification. +func compileMounts(spec *specs.Spec) []specs.Mount { // Keep track of whether proc, sys, and tmp were mounted. var procMounted, sysMounted, tmpMounted bool + var mounts []specs.Mount // Always mount /dev. - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "devtmpfs", Destination: "/dev", - }); err != nil { - return err - } + }) - // Always mount /dev/pts. - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "devpts", Destination: "/dev/pts", - }); err != nil { - return err - } + }) // Mount all submounts from the spec. 
for _, m := range spec.Mounts { @@ -101,6 +106,7 @@ func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *f log.Warningf("ignoring dev mount at %q", m.Destination) continue } + mounts = append(mounts, m) switch filepath.Clean(m.Destination) { case "/proc": procMounted = true @@ -109,43 +115,45 @@ func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *f case "/tmp": tmpMounted = true } - - if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil { - return err - } } // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. if !procMounted { - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "proc", Destination: "/proc", - }); err != nil { - return err - } + }) } if !sysMounted { - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "sysfs", Destination: "/sys", - }); err != nil { - return err - } + }) } // Technically we don't have to mount tmpfs at /tmp, as we could just // rely on the host /tmp, but this is a nice optimization, and fixes // some apps that call mknod in /tmp. if !tmpMounted { - if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + mounts = append(mounts, specs.Mount{ Type: "tmpfs", Destination: "/tmp", - }); err != nil { + }) + } + return mounts +} + +// setMounts iterates over mounts and mounts them in the specified +// mount namespace. +func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { + + // Mount all submounts from mounts. 
+ for _, m := range mounts { + if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { return err } } - return nil } @@ -158,19 +166,20 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f rootInode *fs.Inode err error ) + switch conf.FileAccess { case FileAccessProxy: fd := fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) hostFS := mustFindFilesystem("9p") - rootInode, err = hostFS.Mount(ctx, "root", mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) + rootInode, err = hostFS.Mount(ctx, rootDevice, mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } case FileAccessDirect: hostFS := mustFindFilesystem("whitelistfs") - rootInode, err = hostFS.Mount(ctx, "root", mf, "root="+spec.Root.Path+",dont_translate_ownership=true") + rootInode, err = hostFS.Mount(ctx, rootDevice, mf, "root="+spec.Root.Path+",dont_translate_ownership=true") if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } @@ -263,7 +272,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, data, useOverlay, err } -func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error { +func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, data, useOverlay, err := getMountNameAndOptions(conf, m, fds) @@ -285,14 +294,13 @@ func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs. 
mf.ReadOnly = true } - inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ",")) + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(data, ",")) if err != nil { return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) } // If there are submounts, we need to overlay the mount on top of a // ramfs with stub directories for submount paths. - mounts := specutils.SupportedMounts(spec.Mounts) submounts := subtargets(m.Destination, mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) @@ -406,7 +414,7 @@ func mountDevice(m specs.Mount) string { if m.Type == "bind" { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. - return "p9fs-" + m.Destination + return "9pfs-" + m.Destination } // All other fs types use device "none". return "none" @@ -417,14 +425,24 @@ func mountDevice(m specs.Mount) string { func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { fsName, data, _, err := getMountNameAndOptions(conf, m, fds) dataString := strings.Join(data, ",") + + // Return the error or nil that corresponds to the default case in getMountNameAndOptions. if err != nil { return err } - renv.MountSources[fsName] = append(renv.MountSources[fsName], fs.MountArgs{ + // TODO: Fix this when we support all the mount types and make this a + // fatal error. + if fsName == "" { + return nil + } + + newMount := fs.MountArgs{ Dev: mountDevice(m), Flags: mountFlags(m.Options), Data: dataString, - }) + } + renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) + log.Infof("Added mount at %q: %+v", fsName, newMount) return nil } @@ -438,6 +456,8 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) MountSources: make(map[string][]fs.MountArgs), } + mounts := compileMounts(spec) + // Add root mount. 
fd := fds.remove() dataString := strings.Join([]string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"}, ",") @@ -445,15 +465,16 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) if spec.Root.Readonly { mf.ReadOnly = true } - const rootFSName = "9p" - renv.MountSources[rootFSName] = append(renv.MountSources[rootFSName], fs.MountArgs{ - Dev: "p9fs-/", + + rootMount := fs.MountArgs{ + Dev: rootDevice, Flags: mf, Data: dataString, - }) + } + renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) // Add submounts - for _, m := range spec.Mounts { + for _, m := range mounts { if err := addRestoreMount(conf, renv, m, fds); err != nil { return nil, err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 014908179..6fcfba5cb 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -80,6 +81,9 @@ type Loader struct { // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() + // restore is set to true if we are restoring a container. + restore bool + // rootProcArgs refers to the root sandbox init task. rootProcArgs kernel.CreateProcessArgs @@ -106,7 +110,17 @@ func init() { } // New initializes a new kernel loader configured by spec. +// New also handles setting up a kernel for restoring a container. 
func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []int, console bool) (*Loader, error) { + var ( + tk *kernel.Timekeeper + creds *auth.Credentials + vdso *loader.VDSO + utsns *kernel.UTSNamespace + ipcns *kernel.IPCNamespace + restoreFile *os.File + procArgs kernel.CreateProcessArgs + ) // Create kernel and platform. p, err := createPlatform(conf) if err != nil { @@ -116,47 +130,60 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in Platform: p, } - // Create VDSO. - // - // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err := loader.PrepareVDSO(k) - if err != nil { - return nil, fmt.Errorf("error creating vdso: %v", err) - } + if restoreFD == -1 { + // Create VDSO. + // + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) + if err != nil { + return nil, fmt.Errorf("error creating vdso: %v", err) + } - // Create timekeeper. - tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) - if err != nil { - return nil, fmt.Errorf("error creating timekeeper: %v", err) - } - tk.SetClocks(time.NewCalibratedClocks()) + // Create timekeeper. + tk, err = kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("error creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) - // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) - if err != nil { - return nil, fmt.Errorf("error creating capabilities: %v", err) - } + // Create capabilities. + caps, err := specutils.Capabilities(spec.Process.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } - // Convert the spec's additional GIDs to KGIDs. 
- extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) - for _, GID := range spec.Process.User.AdditionalGids { - extraKGIDs = append(extraKGIDs, auth.KGID(GID)) - } + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } - // Create credentials. - creds := auth.NewUserCredentials( - auth.KUID(spec.Process.User.UID), - auth.KGID(spec.Process.User.GID), - extraKGIDs, - caps, - auth.NewRootUserNamespace()) + // Create credentials. + creds = auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) - // Create user namespace. - // TODO: Not clear what domain name should be here. It is - // not configurable from runtime spec. - utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) + // Create user namespace. + // TODO: Not clear what domain name should be here. It is + // not configurable from runtime spec. + utsns = kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) - ipcns := kernel.NewIPCNamespace(creds.UserNamespace) + ipcns = kernel.NewIPCNamespace(creds.UserNamespace) + } else { + // Create and set RestoreEnvironment + fds := &fdDispenser{fds: ioFDs} + renv, err := createRestoreEnvironment(spec, conf, fds) + if err != nil { + return nil, fmt.Errorf("error creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) + + restoreFile = os.NewFile(uintptr(restoreFD), "restore_file") + defer restoreFile.Close() + } if err := enableStrace(conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) @@ -168,19 +195,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Run(). 
networkStack := newEmptyNetworkStack(conf, k) - // Check if we need to restore the kernel - if restoreFD != -1 { - restoreFile := os.NewFile(uintptr(restoreFD), "restore_file") - defer restoreFile.Close() - - // Load the state. - loadOpts := state.LoadOpts{ - Source: restoreFile, - } - if err := loadOpts.Load(k, p, networkStack); err != nil { - return nil, err - } - } else { + if restoreFile == nil { // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. if err = k.Init(kernel.InitKernelArgs{ @@ -196,6 +211,17 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in }); err != nil { return nil, fmt.Errorf("error initializing kernel: %v", err) } + } else { + // Load the state. + loadOpts := state.LoadOpts{ + Source: restoreFile, + } + if err := loadOpts.Load(k, p, networkStack); err != nil { + return nil, err + } + + // Set timekeeper. + k.Timekeeper().SetClocks(time.NewCalibratedClocks()) } // Turn on packet logging if enabled. @@ -232,9 +258,11 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Ensure that signals received are forwarded to the emulated kernel. 
stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) - if err != nil { - return nil, fmt.Errorf("failed to create root process: %v", err) + if restoreFile == nil { + procArgs, err = newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + if err != nil { + return nil, fmt.Errorf("failed to create root process: %v", err) + } } l := &Loader{ @@ -245,6 +273,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in watchdog: watchdog, stopSignalForwarding: stopSignalForwarding, rootProcArgs: procArgs, + restore: restoreFile != nil, } ctrl.manager.l = l return l, nil @@ -378,13 +407,16 @@ func (l *Loader) run() error { } } - // Create the root container init task. - if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { - return fmt.Errorf("failed to create init process: %v", err) - } + // If we are restoring, we do not want to create a process. + if !l.restore { + // Create the root container init task. + if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { + return fmt.Errorf("failed to create init process: %v", err) + } - // CreateProcess takes a reference on FDMap if successful. - l.rootProcArgs.FDMap.DecRef() + // CreateProcess takes a reference on FDMap if successful. 
+ l.rootProcArgs.FDMap.DecRef() + } l.watchdog.Start() return l.k.Start() diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 15ced0601..28d45b54b 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -364,7 +364,7 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "p9fs-/", + Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, @@ -376,6 +376,24 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "none", }, + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, }, }, }, @@ -406,15 +424,40 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "p9fs-/", + Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, { - Dev: "p9fs-/dev/fd-foo", + Dev: "9pfs-/dev/fd-foo", Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true", }, }, + "tmpfs": { + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "proc": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, + }, }, }, }, @@ -445,7 +488,7 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "p9fs-/", + Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, @@ -456,6 +499,29 @@ func TestRestoreEnvironment(t *testing.T) { Flags: fs.MountSourceFlags{NoAtime: true}, Data: "uid=1022", }, + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "proc": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, }, }, }, diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 685cb6f00..b19da315f 
100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -142,7 +142,9 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. + l, err := boot.New(spec, conf, b.controllerFD, b.restoreFD, b.ioFDs.GetArray(), b.console) + if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 9348289ca..94efc3517 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -44,7 +44,7 @@ func (*Checkpoint) Name() string { // Synopsis implements subcommands.Command.Synopsis. func (*Checkpoint) Synopsis() string { - return "checkpoint current state of container" + return "checkpoint current state of container (experimental)" } // Usage implements subcommands.Command.Usage. diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index cc55beeaf..69cdb35c1 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -42,7 +42,7 @@ func (*Restore) Name() string { // Synopsis implements subcommands.Command.Synopsis. func (*Restore) Synopsis() string { - return "restore a saved state of container" + return "restore a saved state of container (experimental)" } // Usage implements subcommands.Command.Usage. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index ae500e7d0..a6bb39c5d 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -22,6 +22,7 @@ import ( "path" "path/filepath" "reflect" + "strconv" "strings" "sync" "syscall" @@ -106,6 +107,56 @@ func procListToString(pl []*control.Process) string { return fmt.Sprintf("[%s]", strings.Join(strs, ",")) } +// createWriteableOutputFile creates an output file that can be read and written to in the sandbox. 
+func createWriteableOutputFile(path string) (*os.File, error) { + outputFile, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) + if err != nil { + return nil, fmt.Errorf("error creating file: %q, %v", path, err) + } + + // Chmod to allow writing after umask. + if err := outputFile.Chmod(0666); err != nil { + return nil, fmt.Errorf("error chmoding file: %q, %v", path, err) + } + return outputFile, nil +} + +func readOutputNum(outputFile *os.File, path string, first bool) (int, error) { + var num int + time.Sleep(1 * time.Second) + + // Check that outputFile exists and contains counting data. + fileInfo, err := os.Stat(path) + if err != nil { + return 0, fmt.Errorf("error creating output file: %v", err) + } + + if fileInfo.Size() == 0 { + return 0, fmt.Errorf("failed to write to file, file still appears empty") + } + + // Read the first number in the new file + outputFileContent, err := ioutil.ReadAll(outputFile) + if err != nil { + return 0, fmt.Errorf("error reading file: %v", err) + } + if len(outputFileContent) == 0 { + return 0, fmt.Errorf("error no content was read") + } + + nums := strings.Split(string(outputFileContent), "\n") + + if first { + num, err = strconv.Atoi(nums[0]) + } else { + num, err = strconv.Atoi(nums[len(nums)-2]) + } + if err != nil { + return 0, fmt.Errorf("error getting number from file: %v", err) + } + return num, nil +} + // run starts the sandbox and waits for it to exit, checking that the // application succeeded. func run(spec *specs.Spec) error { @@ -429,13 +480,28 @@ func TestExec(t *testing.T) { } } -// TestCheckpoint verifies that calling checkpoint with an image-path flag succeeds. -// Since there is no current default image path, confirming that calling -// checkpoint without an image path fails. -// Checks that there is a file with the name and location given by image path. -func TestCheckpoint(t *testing.T) { - // Container will succeed. 
- spec := testutil.NewSpecWithArgs("sleep", "100") +// TestCheckpointRestore creates a container that continuously writes successive integers +// to a file. To test checkpoint and restore functionality, the container is +// checkpointed and the last number printed to the file is recorded. Then, it is restored in two +// new containers and the first number printed from these containers is checked. Both should +// be the next consecutive number after the last number from the checkpointed container. +func TestCheckpointRestore(t *testing.T) { + outputPath := filepath.Join(os.TempDir(), "output") + outputFile, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() + + outputFileSandbox := strings.Replace(outputPath, os.TempDir(), "/tmp2", -1) + + script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %s; sleep 1; done", outputFileSandbox) + spec := testutil.NewSpecWithArgs("bash", "-c", script) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: "/tmp2", + Source: os.TempDir(), + }) rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) if err != nil { @@ -464,20 +530,80 @@ func TestCheckpoint(t *testing.T) { } defer file.Close() + time.Sleep(1 * time.Second) + // Checkpoint running container; save state into new file. if err := cont.Checkpoint(file); err != nil { t.Fatalf("error checkpointing container to empty file: %v", err) } defer os.RemoveAll(imagePath) - // Check to see if file exists and contains data. - fileInfo, err := os.Stat(imagePath) + lastNum, err := readOutputNum(outputFile, outputPath, false) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Delete and recreate file before restoring. 
+ if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() + + // Restore into a new container. + cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", imagePath) if err != nil { - t.Fatalf("error checkpointing container: %v", err) + t.Fatalf("error creating container: %v", err) + } + defer cont2.Destroy() + if err := cont2.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + firstNum, err := readOutputNum(outputFile2, outputPath, true) + if err != nil { + t.Fatalf("error with outputFile: %v", err) } - if size := fileInfo.Size(); size == 0 { - t.Fatalf("failed checkpoint, file still appears empty: %v", err) + + // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + if lastNum+1 != firstNum { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) } + + // Restore into another container! + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile3, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile3.Close() + + // Restore into a new container. + cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", imagePath) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont3.Destroy() + if err := cont3.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + firstNum2, err := readOutputNum(outputFile3, outputPath, true) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Check that lastNum is one less than firstNum and that the container picks up from where it left off. 
+ if lastNum+1 != firstNum2 { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) + } + } // TestPauseResume tests that we can successfully pause and resume a container. diff --git a/runsc/main.go b/runsc/main.go index dfb338b0f..10ae44b5e 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -71,6 +71,7 @@ func main() { subcommands.Register(subcommands.FlagsCommand(), "") // Register user-facing runsc commands. + subcommands.Register(new(cmd.Checkpoint), "") subcommands.Register(new(cmd.Create), "") subcommands.Register(new(cmd.Delete), "") subcommands.Register(new(cmd.Events), "") @@ -80,6 +81,7 @@ func main() { subcommands.Register(new(cmd.List), "") subcommands.Register(new(cmd.Pause), "") subcommands.Register(new(cmd.PS), "") + subcommands.Register(new(cmd.Restore), "") subcommands.Register(new(cmd.Resume), "") subcommands.Register(new(cmd.Run), "") subcommands.Register(new(cmd.Start), "") diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 0d9e09e9d..34243e623 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -266,17 +266,6 @@ func IsSupportedDevMount(m specs.Mount) bool { return true } -// SupportedMounts filters out unsupported mounts. -func SupportedMounts(mounts []specs.Mount) []specs.Mount { - var newMounts []specs.Mount - for _, m := range mounts { - if IsSupportedDevMount(m) { - newMounts = append(newMounts, m) - } - } - return newMounts -} - // BinPath returns the real path to self, resolving symbolink links. This is done // to make the process name appears as 'runsc', instead of 'exe'. 
func BinPath() (string, error) { -- cgit v1.2.3 From 2821dfe6ce95ad32bb0084cb3b2335bf7b31de7a Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 2 Jul 2018 17:38:01 -0700 Subject: Hold d.parent.mu when reading d.name PiperOrigin-RevId: 203041657 Change-Id: I120783d91712818e600505454c9276f8d9877f37 --- pkg/sentry/fs/dirent.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 739ae1d2d..410f93b13 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1342,7 +1342,15 @@ func (d *Dirent) InotifyEvent(events, cookie uint32) { // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - d.parent.Inode.Watches.Notify(d.name, events, cookie) + // name is immediately stale w.r.t. renames (renameMu doesn't + // protect against renames in the same directory). Holding + // d.parent.mu around Notify() wouldn't matter since Notify + // doesn't provide a synchronous mechanism for reading the name + // anyway. + d.parent.mu.Lock() + name := d.name + d.parent.mu.Unlock() + d.parent.Inode.Watches.Notify(name, events, cookie) } d.Inode.Watches.Notify("", events, cookie) -- cgit v1.2.3 From 062a6f6ec5f4bf2ce46790a22d8e7278d51e6836 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 3 Jul 2018 11:27:29 -0700 Subject: Handle NUL-only paths in exec The path in execve(2), interpreter script, and ELF interpreter may all be no more than a NUL-byte. Handle each of those cases. 
PiperOrigin-RevId: 203155745 Change-Id: I1c8b1b387924b23b2cf942341dfc76c9003da959 --- pkg/sentry/loader/elf.go | 26 ++++++++++++++++++++++++-- pkg/sentry/loader/interpreter.go | 5 +++++ pkg/sentry/loader/loader.go | 5 +++++ 3 files changed, 34 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index d2f18cd4f..0462a1788 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -405,6 +405,10 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el } case elf.PT_INTERP: + if phdr.Filesz < 2 { + ctx.Infof("PT_INTERP path too small: %v", phdr.Filesz) + return loadedELF{}, syserror.ENOEXEC + } if phdr.Filesz > syscall.PathMax { ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz) return loadedELF{}, syserror.ENOEXEC @@ -423,8 +427,26 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el return loadedELF{}, syserror.ENOEXEC } - // Strip NUL-terminator from string. - interpreter = string(path[:len(path)-1]) + // Strip NUL-terminator and everything beyond from + // string. Note that there may be a NUL-terminator + // before len(path)-1. + interpreter = string(path[:bytes.IndexByte(path, '\x00')]) + if interpreter == "" { + // Linux actually attempts to open_exec("\0"). + // open_exec -> do_open_execat fails to check + // that name != '\0' before calling + // do_filp_open, which thus opens the working + // directory. do_open_execat returns EACCES + // because the directory is not a regular file. + // + // We bypass that nonsense and simply + // short-circuit with EACCES. Those this does + // mean that there may be some edge cases where + // the open path would return a different + // error. 
+ ctx.Infof("PT_INTERP path is empty: %v", path) + return loadedELF{}, syserror.EACCES + } } } diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index b8ecbe92f..7249b8f30 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -82,6 +82,11 @@ func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, ar } } + if string(interp) == "" { + ctx.Infof("Interpreter script contains no interpreter: %v", line) + return "", []string{}, syserror.ENOEXEC + } + // Build the new argument list: // // 1. The interpreter. diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 3cda0fe6f..1b2e9f183 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -55,6 +55,11 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // // name must be a readable, executable, regular file. func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, name string) (*fs.Dirent, *fs.File, error) { + if name == "" { + ctx.Infof("cannot open empty name") + return nil, nil, syserror.ENOENT + } + d, err := mm.FindInode(ctx, root, wd, name, maxTraversals) if err != nil { return nil, nil, err -- cgit v1.2.3 From 660f1203ff1949a7b7869b801f4aa2133d30b91f Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 3 Jul 2018 12:52:39 -0700 Subject: Fix runsc VDSO mapping 80bdf8a4068de3ac4a73b6b61a0cdcfe3e3571af accidentally moved vdso into an inner scope, never assigning the vdso variable passed to the Kernel and thus skipping VDSO mappings. Fix this and remove the ability for loadVDSO to skip VDSO mappings, since tests that do so are gone. 
PiperOrigin-RevId: 203169135 Change-Id: Ifd8cadcbaf82f959223c501edcc4d83d05327eba --- pkg/sentry/loader/vdso.go | 6 ------ runsc/boot/loader.go | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 037576e41..2e8693f8e 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -278,12 +278,6 @@ func PrepareVDSO(p platform.Platform) (*VDSO, error) { // // loadVDSO takes a reference on the VDSO and parameter page FrameRegions. func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (usermem.Addr, error) { - if v == nil { - // Should be used only by tests. - ctx.Warningf("No VDSO provided, skipping VDSO mapping") - return 0, nil - } - if v.os != bin.os { ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os) return 0, syserror.ENOEXEC diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 6fcfba5cb..a3cc0e4a4 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -134,7 +134,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err := loader.PrepareVDSO(k) + vdso, err = loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("error creating vdso: %v", err) } -- cgit v1.2.3 From 34af9a61741f26be403231ec302b4e0795147906 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Tue, 3 Jul 2018 14:07:43 -0700 Subject: Fix data race on inotify.Watch.mask. 
PiperOrigin-RevId: 203180463 Change-Id: Ief50988c1c028f81ec07a26e704d893e86985bf0 --- pkg/sentry/fs/inotify.go | 7 ++++--- pkg/sentry/fs/inotify_watch.go | 12 +++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index a87be8590..6f5e8ce5e 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -16,6 +16,7 @@ package fs import ( "sync" + "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/ilist" @@ -279,13 +280,13 @@ func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 { // same inode. Obtain an extra reference if necessary. existing.Pin(target) + newmask := mask if mergeMask := mask&linux.IN_MASK_ADD != 0; mergeMask { // "Add (OR) events to watch mask for this pathname if it already // exists (instead of replacing mask)." -- inotify(7) - existing.mask |= mask - } else { - existing.mask = mask + newmask |= atomic.LoadUint32(&existing.mask) } + atomic.StoreUint32(&existing.mask, newmask) return existing.wd } diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index ff6ec6e3e..8904ef544 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -16,6 +16,7 @@ package fs import ( "sync" + "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/abi/linux" ) @@ -33,9 +34,6 @@ type Watch struct { // Descriptor for this watch. This is unique across an inotify instance. wd int32 - // Events being monitored via this watch. - mask uint32 - // The inode being watched. Note that we don't directly hold a reference on // this inode. Instead we hold a reference on the dirent(s) containing the // inode, which we record in pins. @@ -48,6 +46,10 @@ type Watch struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` + // Events being monitored via this watch. Must be accessed atomically, + // writes are protected by mu. 
+ mask uint32 + // pins is the set of dirents this watch is currently pinning in memory by // holding a reference to them. See Pin()/Unpin(). pins map[*Dirent]bool @@ -62,7 +64,7 @@ func (w *Watch) ID() uint64 { // should continue to be be notified of events after the target has been // unlinked. func (w *Watch) NotifyParentAfterUnlink() bool { - return w.mask&linux.IN_EXCL_UNLINK == 0 + return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK == 0 } // isRenameEvent returns true if eventMask describes a rename event. @@ -73,7 +75,7 @@ func isRenameEvent(eventMask uint32) bool { // Notify queues a new event on this watch. func (w *Watch) Notify(name string, events uint32, cookie uint32) { unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS - effectiveMask := unmaskableBits | w.mask + effectiveMask := unmaskableBits | atomic.LoadUint32(&w.mask) matchedEvents := effectiveMask & events if matchedEvents == 0 { -- cgit v1.2.3 From 0dedac637ff9f6f7a0556d42d90787584a4051da Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 9 Jul 2018 11:43:06 -0700 Subject: Trim all whitespace between interpreter and arg Multiple whitespace characters are allowed. This fixes Ubuntu's /usr/sbin/invoke-rc.d, which has trailing whitespace after the interpreter which we were treating as an arg. PiperOrigin-RevId: 203802278 Change-Id: I0a6cdb0af4b139cf8abb22fa70351fe3697a5c6b --- pkg/sentry/loader/interpreter.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 7249b8f30..54534952b 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -66,7 +66,7 @@ func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, ar // Skip any whitespace before the interpeter. 
line = bytes.TrimLeft(line, " \t") - // Linux only looks for a space or tab delimiting the interpreter and + // Linux only looks for spaces or tabs delimiting the interpreter and // arg. // // execve(2): "On Linux, the entire string following the interpreter @@ -77,9 +77,7 @@ func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, ar i = bytes.IndexAny(line, " \t") if i >= 0 { interp = line[:i] - if i+1 < len(line) { - arg = line[i+1:] - } + arg = bytes.TrimLeft(line[i:], " \t") } if string(interp) == "" { -- cgit v1.2.3 From 41aeb680b1882c9416e25e100b5ff5eebead36de Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 9 Jul 2018 16:15:14 -0700 Subject: Inherit parent in clone(CLONE_THREAD) under TaskSet.mu. PiperOrigin-RevId: 203849534 Change-Id: I4d81513bfd32e0b7fc40c8a4c194eba7abc35a83 --- pkg/sentry/kernel/task_clone.go | 8 +++++--- pkg/sentry/kernel/task_start.go | 7 +++++++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 0c2427952..a61283267 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -220,18 +220,15 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { pidns = pidns.NewChild(userns) } tg := t.tg - parent := t.parent if opts.NewThreadGroup { sh := t.tg.signalHandlers if opts.NewSignalHandlers { sh = sh.Fork() } tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) - parent = t } cfg := &TaskConfig{ Kernel: t.k, - Parent: parent, ThreadGroup: tg, TaskContext: tc, TaskResources: t.tr.Fork(!opts.NewFiles, !opts.NewFSContext), @@ -242,6 +239,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { UTSNamespace: utsns, IPCNamespace: ipcns, } + if opts.NewThreadGroup { + cfg.Parent = t + } else { + cfg.InheritParent = t + } if opts.NewNetworkNamespace { cfg.NetworkNamespaced = true } diff --git 
a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 801cb3395..c97dee8fc 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -31,6 +31,10 @@ type TaskConfig struct { // Parent is the new task's parent. Parent may be nil. Parent *Task + // If InheritParent is not nil, use InheritParent's parent as the new + // task's parent. + InheritParent *Task + // ThreadGroup is the ThreadGroup the new task belongs to. *ThreadGroup @@ -133,6 +137,9 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { // IDs). t.updateLogPrefixLocked() + if cfg.InheritParent != nil { + t.parent = cfg.InheritParent.parent + } if t.parent != nil { t.parent.children[t] = struct{}{} } -- cgit v1.2.3 From b1683df90bf81974e9e309ed66edaff30537c1be Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Tue, 10 Jul 2018 09:22:37 -0700 Subject: netstack: tcp socket connected state S/R support. PiperOrigin-RevId: 203958972 Change-Id: Ia6fe16547539296d48e2c6731edacdd96bd6e93c --- pkg/sentry/kernel/BUILD | 5 +- pkg/sentry/kernel/kernel.go | 6 + pkg/sentry/kernel/kernel_state.go | 31 +++++ pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/tcpip.go | 36 ++++++ pkg/tcpip/transport/tcp/BUILD | 7 ++ pkg/tcpip/transport/tcp/accept.go | 11 +- pkg/tcpip/transport/tcp/connect.go | 43 ++++++- pkg/tcpip/transport/tcp/endpoint.go | 72 ++++++++--- pkg/tcpip/transport/tcp/endpoint_state.go | 193 +++++++++++++++++++++--------- pkg/tcpip/transport/tcp/segment.go | 8 +- pkg/tcpip/transport/tcp/segment_queue.go | 4 +- pkg/tcpip/transport/tcp/segment_state.go | 51 ++++++++ pkg/tcpip/transport/tcp/snd.go | 4 +- pkg/tcpip/transport/tcp/snd_state.go | 49 ++++++++ 15 files changed, 430 insertions(+), 92 deletions(-) create mode 100644 pkg/sentry/kernel/kernel_state.go create mode 100644 pkg/tcpip/transport/tcp/segment_state.go create mode 100644 pkg/tcpip/transport/tcp/snd_state.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD 
b/pkg/sentry/kernel/BUILD index b2a55ddff..07568b47c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -12,6 +12,7 @@ go_stateify( "fs_context.go", "ipc_namespace.go", "kernel.go", + "kernel_state.go", "pending_signals.go", "pending_signals_state.go", "process_group_list.go", @@ -45,10 +46,11 @@ go_stateify( "vdso.go", "version.go", ], - out = "kernel_state.go", + out = "kernel_autogen_state.go", imports = [ "gvisor.googlesource.com/gvisor/pkg/sentry/arch", "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", + "gvisor.googlesource.com/gvisor/pkg/tcpip", ], package = "kernel", ) @@ -117,6 +119,7 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", + "kernel_autogen_state.go", "kernel_state.go", "pending_signals.go", "pending_signals_list.go", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 5662b8f08..64439cd9d 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -57,6 +57,7 @@ import ( sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/state" + "gvisor.googlesource.com/gvisor/pkg/tcpip" ) // Kernel represents an emulated Linux kernel. It must be initialized by calling @@ -158,6 +159,9 @@ type Kernel struct { // exitErr is the error causing the sandbox to exit, if any. It is // protected by extMu. exitErr error + + // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. + danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` } // InitKernelArgs holds arguments to Init. 
@@ -422,6 +426,8 @@ func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) erro return err } + tcpip.AsyncLoading.Wait() + log.Infof("Overall load took [%s]", time.Since(loadStart)) // Applications may size per-cpu structures based on k.applicationCores, so diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go new file mode 100644 index 000000000..bb2d5102d --- /dev/null +++ b/pkg/sentry/kernel/kernel_state.go @@ -0,0 +1,31 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip" +) + +// saveDanglingEndpoints is invoked by stateify. +func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint { + return tcpip.GetDanglingEndpoints() +} + +// loadDanglingEndpoints is invoked by stateify. +func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) { + for _, e := range es { + tcpip.AddDanglingEndpoint(e) + } +} diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index 6d261ce96..b6c095efb 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -15,5 +15,5 @@ package stack // StackFromEnv is the global stack created in restore run. -// FIXME: remove this variable once tcpip S/R is fully supported. 
+// FIXME var StackFromEnv *Stack diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 4107c0f78..eb1e4645d 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -33,6 +33,7 @@ import ( "fmt" "strconv" "strings" + "sync" "time" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" @@ -562,3 +563,38 @@ type ProtocolAddress struct { // Address is a network address. Address Address } + +// danglingEndpointsMu protects access to danglingEndpoints. +var danglingEndpointsMu sync.Mutex + +// danglingEndpoints tracks all dangling endpoints no longer owned by the app. +var danglingEndpoints = make(map[Endpoint]struct{}) + +// GetDanglingEndpoints returns all dangling endpoints. +func GetDanglingEndpoints() []Endpoint { + es := make([]Endpoint, 0, len(danglingEndpoints)) + danglingEndpointsMu.Lock() + for e, _ := range danglingEndpoints { + es = append(es, e) + } + danglingEndpointsMu.Unlock() + return es +} + +// AddDanglingEndpoint adds a dangling endpoint. +func AddDanglingEndpoint(e Endpoint) { + danglingEndpointsMu.Lock() + danglingEndpoints[e] = struct{}{} + danglingEndpointsMu.Unlock() +} + +// DeleteDanglingEndpoint removes a dangling endpoint. +func DeleteDanglingEndpoint(e Endpoint) { + danglingEndpointsMu.Lock() + delete(danglingEndpoints, e) + danglingEndpointsMu.Unlock() +} + +// AsyncLoading is the global barrier for asynchronous endpoint loading +// activities. 
+var AsyncLoading sync.WaitGroup diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 6cb0ebab2..6a2f42a12 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -10,11 +10,16 @@ go_stateify( "endpoint.go", "endpoint_state.go", "rcv.go", + "segment.go", "segment_heap.go", + "segment_queue.go", + "segment_state.go", "snd.go", + "snd_state.go", "tcp_segment_list.go", ], out = "tcp_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], package = "tcp", ) @@ -43,7 +48,9 @@ go_library( "segment.go", "segment_heap.go", "segment_queue.go", + "segment_state.go", "snd.go", + "snd_state.go", "tcp_segment_list.go", "tcp_state.go", "timer.go", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index ae4359ff4..d6d2b4555 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -78,7 +78,8 @@ func encodeMSS(mss uint16) uint32 { // to go above a threshold. var synRcvdCount struct { sync.Mutex - value uint64 + value uint64 + pending sync.WaitGroup } // listenContext is used by a listening endpoint to store state used while @@ -112,6 +113,7 @@ func incSynRcvdCount() bool { return false } + synRcvdCount.pending.Add(1) synRcvdCount.value++ return true @@ -125,6 +127,7 @@ func decSynRcvdCount() { defer synRcvdCount.Unlock() synRcvdCount.value-- + synRcvdCount.pending.Done() } // newListenContext creates a new listen context. @@ -302,7 +305,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { opts := parseSynSegmentOptions(s) if incSynRcvdCount() { s.incRef() - go e.handleSynSegment(ctx, s, &opts) // S/R-FIXME + go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier. 
} else { cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) // Send SYN with window scaling because we currently @@ -391,10 +394,12 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { return nil } if n&notifyDrain != 0 { - for s := e.segmentQueue.dequeue(); s != nil; s = e.segmentQueue.dequeue() { + for !e.segmentQueue.empty() { + s := e.segmentQueue.dequeue() e.handleListenSegment(ctx, s) s.decRef() } + synRcvdCount.pending.Wait() close(e.drainDone) <-e.undrain } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index afdea2b53..33bf4fc0b 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -453,7 +453,8 @@ func (h *handshake) execute() *tcpip.Error { return tcpip.ErrAborted } if n&notifyDrain != 0 { - for s := h.ep.segmentQueue.dequeue(); s != nil; s = h.ep.segmentQueue.dequeue() { + for !h.ep.segmentQueue.empty() { + s := h.ep.segmentQueue.dequeue() err := h.handleSegment(s) s.decRef() if err != nil { @@ -823,15 +824,13 @@ func (e *endpoint) handleSegments() *tcpip.Error { // protocolMainLoop is the main loop of the TCP protocol. It runs in its own // goroutine and is responsible for sending segments and handling received // segments. -func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { +func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error { var closeTimer *time.Timer var closeWaker sleep.Waker defer func() { // e.mu is expected to be hold upon entering this section.
- e.completeWorkerLocked() - if e.snd != nil { e.snd.resendTimer.cleanup() } @@ -840,6 +839,8 @@ func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { closeTimer.Stop() } + e.completeWorkerLocked() + if e.drainDone != nil { close(e.drainDone) } @@ -850,7 +851,7 @@ func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) }() - if !passive { + if handshake { // This is an active connection, so we must initiate the 3-way // handshake, and then inform potential waiters about its // completion. @@ -960,6 +961,17 @@ func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { closeWaker.Assert() }) } + + if n&notifyDrain != 0 { + for !e.segmentQueue.empty() { + if err := e.handleSegments(); err != nil { + return err + } + } + close(e.drainDone) + <-e.undrain + } + return nil }, }, @@ -971,6 +983,27 @@ func (e *endpoint) protocolMainLoop(passive bool) *tcpip.Error { s.AddWaker(funcs[i].w, i) } + // The following assertions and notifications are needed for restored + // endpoints. Fresh newly created endpoints have empty states and should + // not invoke any. + e.segmentQueue.mu.Lock() + if !e.segmentQueue.list.Empty() { + e.newSegmentWaker.Assert() + } + e.segmentQueue.mu.Unlock() + + e.rcvListMu.Lock() + if !e.rcvList.Empty() { + e.waiterQueue.Notify(waiter.EventIn) + } + e.rcvListMu.Unlock() + + e.mu.RLock() + if e.workerCleanup { + e.notifyProtocolGoroutine(notifyClose) + } + e.mu.RUnlock() + // Main loop. Handle segments until both send and receive ends of the // connection have completed. for !e.rcv.closed || !e.snd.closed || e.snd.sndUna != e.snd.sndNxtList { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index cb105b863..8b9a81f6a 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -80,7 +80,7 @@ type endpoint struct { // change throughout the lifetime of the endpoint.
stack *stack.Stack `state:"manual"` netProto tcpip.NetworkProtocolNumber - waiterQueue *waiter.Queue + waiterQueue *waiter.Queue `state:"wait"` // lastError represents the last error that the endpoint reported; // access to it is protected by the following mutex. @@ -95,8 +95,8 @@ type endpoint struct { // to indicate to users that no more data is coming. // // rcvListMu can be taken after the endpoint mu below. - rcvListMu sync.Mutex `state:"nosave"` - rcvList segmentList + rcvListMu sync.Mutex `state:"nosave"` + rcvList segmentList `state:"wait"` rcvClosed bool rcvBufSize int rcvBufUsed int @@ -104,8 +104,8 @@ type endpoint struct { // The following fields are protected by the mutex. mu sync.RWMutex `state:"nosave"` id stack.TransportEndpointID - state endpointState - isPortReserved bool `state:"manual"` + state endpointState `state:".(endpointState)"` + isPortReserved bool `state:"manual"` isRegistered bool boundNICID tcpip.NICID `state:"manual"` route stack.Route `state:"manual"` @@ -131,7 +131,7 @@ type endpoint struct { // workerCleanup specifies if the worker goroutine must perform cleanup // before exitting. This can only be set to true when workerRunning is // also true, and they're both protected by the mutex. - workerCleanup bool `state:"zerovalue"` + workerCleanup bool // sendTSOk is used to indicate when the TS Option has been negotiated. // When sendTSOk is true every non-RST segment should carry a TS as per @@ -166,7 +166,7 @@ type endpoint struct { // segmentQueue is used to hand received segments to the protocol // goroutine. Segments are queued as long as the queue is not full, // and dropped when it is. - segmentQueue segmentQueue `state:"zerovalue"` + segmentQueue segmentQueue `state:"wait"` // The following fields are used to manage the send buffer. 
When // segments are ready to be sent, they are added to sndQueue and the @@ -179,7 +179,7 @@ type endpoint struct { sndBufUsed int sndClosed bool sndBufInQueue seqnum.Size - sndQueue segmentList + sndQueue segmentList `state:"wait"` sndWaker sleep.Waker `state:"manual"` sndCloseWaker sleep.Waker `state:"manual"` @@ -201,17 +201,21 @@ type endpoint struct { // notifyFlags is a bitmask of flags used to indicate to the protocol // goroutine what it was notified; this is only accessed atomically. - notifyFlags uint32 `state:"zerovalue"` + notifyFlags uint32 `state:"nosave"` // acceptedChan is used by a listening endpoint protocol goroutine to // send newly accepted connections to the endpoint so that they can be // read by Accept() calls. - acceptedChan chan *endpoint `state:".(endpointChan)"` + acceptedChan chan *endpoint `state:"manual"` + + // acceptedEndpoints is only used to save / restore the channel buffer. + // FIXME + acceptedEndpoints []*endpoint // The following are only used from the protocol goroutine, and // therefore don't need locks to protect them. - rcv *receiver - snd *sender + rcv *receiver `state:"wait"` + snd *sender `state:"wait"` // The goroutine drain completion notification channel. drainDone chan struct{} `state:"nosave"` @@ -224,6 +228,7 @@ type endpoint struct { probe stack.TCPProbeFunc `state:"nosave"` // The following are only used to assist the restore run to re-connect. + bindAddress tcpip.Address connectingAddress tcpip.Address } @@ -357,6 +362,7 @@ func (e *endpoint) Close() { // Either perform the local cleanup or kick the worker to make sure it // knows it needs to cleanup. 
+ tcpip.AddDanglingEndpoint(e) if !e.workerRunning { e.cleanupLocked() } else { @@ -376,9 +382,12 @@ func (e *endpoint) cleanupLocked() { if e.acceptedChan != nil { close(e.acceptedChan) for n := range e.acceptedChan { + n.mu.Lock() n.resetConnectionLocked(tcpip.ErrConnectionAborted) + n.mu.Unlock() n.Close() } + e.acceptedChan = nil } e.workerCleanup = false @@ -387,6 +396,7 @@ func (e *endpoint) cleanupLocked() { } e.route.Release() + tcpip.DeleteDanglingEndpoint(e) } // Read reads data from the endpoint. @@ -801,6 +811,16 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocol // Connect connects the endpoint to its peer. func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + return e.connect(addr, true, true) +} + +// connect connects the endpoint to its peer. In the normal non-S/R case, the +// new connection is expected to run the main goroutine and perform handshake. +// In restore of previously connected endpoints, both ends will be passively +// created (so no new handshaking is done); for stack-accepted connections not +// yet accepted by the app, they are restored without running the main goroutine +// here. +func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() @@ -912,9 +932,27 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { e.boundNICID = nicid e.effectiveNetProtos = netProtos e.connectingAddress = connectingAddr - e.workerRunning = true - go e.protocolMainLoop(false) // S/R-SAFE: will be drained before save. + // Connect in the restore phase does not perform handshake. Restore its + // connection setting here. 
+ if !handshake { + e.segmentQueue.mu.Lock() + for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} { + for s := l.Front(); s != nil; s = s.Next() { + s.id = e.id + s.route = r.Clone() + e.sndWaker.Assert() + } + } + e.segmentQueue.mu.Unlock() + e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) + e.state = stateConnected + } + + if run { + e.workerRunning = true + go e.protocolMainLoop(handshake) // S/R-SAFE: will be drained before save. + } return tcpip.ErrConnectStarted } @@ -999,6 +1037,9 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error { if len(e.acceptedChan) > backlog { return tcpip.ErrInvalidEndpointState } + if cap(e.acceptedChan) == backlog { + return nil + } origChan := e.acceptedChan e.acceptedChan = make(chan *endpoint, backlog) close(origChan) @@ -1036,7 +1077,7 @@ func (e *endpoint) Listen(backlog int) *tcpip.Error { func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) { e.waiterQueue = waiterQueue e.workerRunning = true - go e.protocolMainLoop(true) // S/R-FIXME + go e.protocolMainLoop(false) // S/R-SAFE: drained on save. 
} // Accept returns a new endpoint if a peer has established a connection @@ -1077,6 +1118,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) (ret return tcpip.ErrAlreadyBound } + e.bindAddress = addr.Addr netProto, err := e.checkV4Mapped(&addr) if err != nil { return err diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index aa4ccea75..43765d425 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -17,8 +17,10 @@ package tcp import ( "fmt" "sync" + "time" "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" ) @@ -32,7 +34,7 @@ func (e *endpoint) drainSegmentLocked() { e.undrain = make(chan struct{}) e.mu.Unlock() - e.notificationWaker.Assert() + e.notifyProtocolGoroutine(notifyDrain) <-e.drainDone e.mu.Lock() @@ -48,37 +50,103 @@ func (e *endpoint) beforeSave() { switch e.state { case stateInitial, stateBound: - case stateListen: - if !e.segmentQueue.empty() { - e.drainSegmentLocked() + case stateListen, stateConnecting, stateConnected: + if e.state == stateConnected && !e.workerRunning { + // The endpoint must be in acceptedChan. 
+ break } - case stateConnecting: e.drainSegmentLocked() - if e.state != stateConnected { + if e.state != stateClosed && e.state != stateError { + if !e.workerRunning { + panic("endpoint has no worker running in listen, connecting, or connected state") + } break } fallthrough - case stateConnected: - // FIXME - panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%v, remote %v:%v", e.id.LocalAddress, e.id.LocalPort, e.id.RemoteAddress, e.id.RemotePort)}) - case stateClosed, stateError: + case stateError, stateClosed: + for e.state == stateError && e.workerRunning { + e.mu.Unlock() + time.Sleep(100 * time.Millisecond) + e.mu.Lock() + } if e.workerRunning { - panic(fmt.Sprintf("endpoint still has worker running in closed or error state")) + panic("endpoint still has worker running in closed or error state") } default: panic(fmt.Sprintf("endpoint in unknown state %v", e.state)) } + + if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() { + panic("endpoint still has waiters upon save") + } + + if !((e.state == stateBound || e.state == stateListen) == e.isPortReserved) { + panic("endpoint port must and must only be reserved in bound or listen state") + } + + if e.acceptedChan != nil { + close(e.acceptedChan) + e.acceptedEndpoints = make([]*endpoint, len(e.acceptedChan), cap(e.acceptedChan)) + i := 0 + for ep := range e.acceptedChan { + e.acceptedEndpoints[i] = ep + i++ + } + if i != len(e.acceptedEndpoints) { + panic("endpoint acceptedChan buffer got consumed by background context") + } + } +} + +// saveState is invoked by stateify. +func (e *endpoint) saveState() endpointState { + return e.state +} + +// Endpoint loading must be done in the following ordering by their state, to +// avoid dangling connecting w/o listening peer, and to avoid conflicts in port +// reservation. +var connectedLoading sync.WaitGroup +var listenLoading sync.WaitGroup +var connectingLoading sync.WaitGroup + +// Bound endpoint loading happens last. 
+ +// loadState is invoked by stateify. +func (e *endpoint) loadState(state endpointState) { + // This is to ensure that the loading wait groups include all applicable + // endpoints before any asynchronous calls to the Wait() methods. + switch state { + case stateConnected: + connectedLoading.Add(1) + case stateListen: + listenLoading.Add(1) + case stateConnecting: + connectingLoading.Add(1) + } + e.state = state } // afterLoad is invoked by stateify. func (e *endpoint) afterLoad() { + // We load acceptedChan buffer indirectly here. Note that closed + // endpoints might not need to allocate the channel. + // FIXME + if cap(e.acceptedEndpoints) > 0 { + e.acceptedChan = make(chan *endpoint, cap(e.acceptedEndpoints)) + for _, ep := range e.acceptedEndpoints { + e.acceptedChan <- ep + } + e.acceptedEndpoints = nil + } + e.stack = stack.StackFromEnv e.segmentQueue.setLimit(2 * e.rcvBufSize) e.workMu.Init() state := e.state switch state { - case stateInitial, stateBound, stateListen, stateConnecting: + case stateInitial, stateBound, stateListen, stateConnecting, stateConnected: var ss SendBufferSizeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil { if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max { @@ -90,65 +158,72 @@ func (e *endpoint) afterLoad() { } } - switch state { - case stateBound, stateListen, stateConnecting: + bind := func() { e.state = stateInitial - if err := e.Bind(tcpip.FullAddress{Addr: e.id.LocalAddress, Port: e.id.LocalPort}, nil); err != nil { + if len(e.bindAddress) == 0 { + e.bindAddress = e.id.LocalAddress + } + if err := e.Bind(tcpip.FullAddress{Addr: e.bindAddress, Port: e.id.LocalPort}, nil); err != nil { panic("endpoint binding failed: " + err.String()) } } switch state { - case stateListen: - backlog := cap(e.acceptedChan) - e.acceptedChan = nil - if err := e.Listen(backlog); err != nil { - panic("endpoint listening failed: " + err.String()) + case stateConnected: + bind() + if len(e.connectingAddress) == 0 { 
+ // This endpoint is accepted by netstack but not yet by + // the app. If the endpoint is IPv6 but the remote + // address is IPv4, we need to connect as IPv6 so that + // dual-stack mode can be properly activated. + if e.netProto == header.IPv6ProtocolNumber && len(e.id.RemoteAddress) != header.IPv6AddressSize { + e.connectingAddress = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" + e.id.RemoteAddress + } else { + e.connectingAddress = e.id.RemoteAddress + } } - } - - switch state { - case stateConnecting: - if err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}); err != tcpip.ErrConnectStarted { + if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted { panic("endpoint connecting failed: " + err.String()) } + connectedLoading.Done() + case stateListen: + tcpip.AsyncLoading.Add(1) + go func() { + connectedLoading.Wait() + bind() + backlog := cap(e.acceptedChan) + if err := e.Listen(backlog); err != nil { + panic("endpoint listening failed: " + err.String()) + } + listenLoading.Done() + tcpip.AsyncLoading.Done() + }() + case stateConnecting: + tcpip.AsyncLoading.Add(1) + go func() { + connectedLoading.Wait() + listenLoading.Wait() + bind() + if err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.id.RemotePort}); err != tcpip.ErrConnectStarted { + panic("endpoint connecting failed: " + err.String()) + } + connectingLoading.Done() + tcpip.AsyncLoading.Done() + }() + case stateBound: + tcpip.AsyncLoading.Add(1) + go func() { + connectedLoading.Wait() + listenLoading.Wait() + connectingLoading.Wait() + bind() + tcpip.AsyncLoading.Done() + }() + case stateClosed, stateError: + tcpip.DeleteDanglingEndpoint(e) } } -// saveAcceptedChan is invoked by stateify. 
-func (e *endpoint) saveAcceptedChan() endpointChan { - if e.acceptedChan == nil { - return endpointChan{} - } - close(e.acceptedChan) - buffer := make([]*endpoint, 0, len(e.acceptedChan)) - for ep := range e.acceptedChan { - buffer = append(buffer, ep) - } - if len(buffer) != cap(buffer) { - panic("endpoint.acceptedChan buffer got consumed by background context") - } - c := cap(e.acceptedChan) - e.acceptedChan = nil - return endpointChan{buffer: buffer, cap: c} -} - -// loadAcceptedChan is invoked by stateify. -func (e *endpoint) loadAcceptedChan(c endpointChan) { - if c.cap == 0 { - return - } - e.acceptedChan = make(chan *endpoint, c.cap) - for _, ep := range c.buffer { - e.acceptedChan <- ep - } -} - -type endpointChan struct { - buffer []*endpoint - cap int -} - // saveLastError is invoked by stateify. func (e *endpoint) saveLastError() string { if e.lastError == nil { diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index a90f6661d..40928ba2c 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -39,9 +39,9 @@ const ( type segment struct { segmentEntry refCnt int32 - id stack.TransportEndpointID - route stack.Route `state:"manual"` - data buffer.VectorisedView + id stack.TransportEndpointID `state:"manual"` + route stack.Route `state:"manual"` + data buffer.VectorisedView `state:".(buffer.VectorisedView)"` // views is used as buffer for data when its length is large // enough to store a VectorisedView. views [8]buffer.View @@ -55,7 +55,7 @@ type segment struct { // parsedOptions stores the parsed values from the options in the segment. 
parsedOptions header.TCPOptions - options []byte + options []byte `state:".([]byte)"` } func newSegment(r *stack.Route, id stack.TransportEndpointID, vv *buffer.VectorisedView) *segment { diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index 83f554ebd..2ddcf5f10 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -22,8 +22,8 @@ import ( // segmentQueue is a bounded, thread-safe queue of TCP segments. type segmentQueue struct { - mu sync.Mutex - list segmentList + mu sync.Mutex `state:"nosave"` + list segmentList `state:"wait"` limit int used int } diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go new file mode 100644 index 000000000..22f0bbf18 --- /dev/null +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -0,0 +1,51 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" +) + +// saveData is invoked by stateify. +func (s *segment) saveData() buffer.VectorisedView { + // We cannot save s.data directly as s.data.views may alias to s.views, + // which is not allowed by state framework (in-struct pointer). + return s.data.Clone(nil) +} + +// loadData is invoked by stateify. 
+func (s *segment) loadData(data buffer.VectorisedView) { + // NOTE: We cannot do the s.data = data.Clone(s.views[:]) optimization + // here because data.views is not guaranteed to be loaded by now. Plus, + // data.views will be allocated anyway so there really is little point + // of utilizing s.views for data.views. + s.data = data +} + +// saveOptions is invoked by stateify. +func (s *segment) saveOptions() []byte { + // We cannot save s.options directly as it may point to s.data's trimmed + // tail, which is not allowed by state framework (in-struct pointer). + b := make([]byte, 0, cap(s.options)) + return append(b, s.options...) +} + +// loadOptions is invoked by stateify. +func (s *segment) loadOptions(options []byte) { + // NOTE: We cannot point s.options back into s.data's trimmed tail. But + // it is OK as they do not need to aliased. Plus, options is already + // allocated so there is no cost here. + s.options = options +} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index a9892eb64..7dfbf6384 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -38,7 +38,7 @@ type sender struct { ep *endpoint // lastSendTime is the timestamp when the last packet was sent. - lastSendTime time.Time + lastSendTime time.Time `state:".(unixTime)"` // dupAckCount is the number of duplicated acks received. It is used for // fast retransmit. @@ -81,7 +81,7 @@ type sender struct { rttMeasureSeqNum seqnum.Value // rttMeasureTime is the time when the rttMeasureSeqNum was sent. - rttMeasureTime time.Time + rttMeasureTime time.Time `state:".(unixTime)"` closed bool writeNext *segment diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go new file mode 100644 index 000000000..33c8867f4 --- /dev/null +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -0,0 +1,49 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "time" +) + +type unixTime struct { + second int64 + nano int64 +} + +// saveLastSendTime is invoked by stateify. +func (s *sender) saveLastSendTime() unixTime { + return unixTime{s.lastSendTime.Unix(), s.lastSendTime.UnixNano()} +} + +// loadLastSendTime is invoked by stateify. +func (s *sender) loadLastSendTime(unix unixTime) { + s.lastSendTime = time.Unix(unix.second, unix.nano) +} + +// saveRttMeasureTime is invoked by stateify. +func (s *sender) saveRttMeasureTime() unixTime { + return unixTime{s.rttMeasureTime.Unix(), s.rttMeasureTime.UnixNano()} +} + +// loadRttMeasureTime is invoked by stateify. +func (s *sender) loadRttMeasureTime(unix unixTime) { + s.rttMeasureTime = time.Unix(unix.second, unix.nano) +} + +// afterLoad is invoked by stateify. +func (s *sender) afterLoad() { + s.resendTimer.init(&s.resendWaker) +} -- cgit v1.2.3 From 06920b3d1bb6346a20aa0e154b14e68116919dbc Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 10 Jul 2018 13:58:00 -0700 Subject: Exit tmpfs.fileInodeOperations.Translate early if required.Start >= EOF. Otherwise required and optional can be empty or have negative length. 
PiperOrigin-RevId: 204007079 Change-Id: I59e472a87a8caac11ffb9a914b8d79bf0cd70995 --- pkg/sentry/fs/tmpfs/inode_file.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 66bc934ae..4e803c9ff 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -451,9 +451,12 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional // Constrain translations to f.attr.Unstable.Size (rounded up) to prevent // translation to pages that may be concurrently truncated. pgend := fs.OffsetPageEnd(f.attr.Unstable.Size) - var buserr error + var beyondEOF bool if required.End > pgend { - buserr = &memmap.BusError{io.EOF} + if required.Start >= pgend { + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true required.End = pgend } if optional.End > pgend { @@ -481,9 +484,12 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional // Don't return the error returned by f.data.Fill if it occurred outside of // required. if translatedEnd < required.End && cerr != nil { - return ts, cerr + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} } - return ts, buserr + return ts, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. -- cgit v1.2.3 From ee0ef506d4060eaf0736997a56fd8490e2434495 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 11 Jul 2018 11:51:05 -0700 Subject: Add MemoryManager.Pin. 
PiperOrigin-RevId: 204162313 Change-Id: Ib0593dde88ac33e222c12d0dca6733ef1f1035dc --- pkg/sentry/mm/pma.go | 90 +++++++++++++++++++++++++++++++++++++++++ pkg/sentry/platform/platform.go | 4 +- 2 files changed, 92 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 35e873762..9febb25ac 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -578,6 +578,96 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat } } +// Pin returns the platform.File ranges currently mapped by addresses in ar in +// mm, acquiring a reference on the returned ranges which the caller must +// release by calling Unpin. If not all addresses are mapped, Pin returns a +// non-nil error. Note that Pin may return both a non-empty slice of +// PinnedRanges and a non-nil error. +// +// Pin does not prevent mapped ranges from changing, making it unsuitable for +// most I/O. It should only be used in contexts that would use get_user_pages() +// in the Linux kernel. +// +// Preconditions: ar.Length() != 0. ar must be page-aligned. +func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // Ensure that we have usable vmas. + mm.mappingMu.RLock() + vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return nil, verr + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. 
+ mm.activeMu.Lock() + pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ + breakCOW: at.Write, + }) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return nil, perr + } + ar.End = pendaddr + } + + // Gather pmas. + var prs []PinnedRange + for pseg.Ok() && pseg.Start() < ar.End { + psar := pseg.Range().Intersect(ar) + f := pseg.ValuePtr().file + fr := pseg.fileRangeOf(psar) + f.IncRef(fr) + prs = append(prs, PinnedRange{ + Source: psar, + File: f, + Offset: fr.Start, + }) + pseg = pseg.NextSegment() + } + mm.activeMu.Unlock() + + // Return the first error in order of progress through ar. + if perr != nil { + return prs, perr + } + return prs, verr +} + +// PinnedRanges are returned by MemoryManager.Pin. +type PinnedRange struct { + // Source is the corresponding range of addresses. + Source usermem.AddrRange + + // File is the mapped file. + File platform.File + + // Offset is the offset into File at which this PinnedRange begins. + Offset uint64 +} + +// FileRange returns the platform.File offsets mapped by pr. +func (pr PinnedRange) FileRange() platform.FileRange { + return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} +} + +// Unpin releases the reference held by prs. +func Unpin(prs []PinnedRange) { + for i := range prs { + prs[i].File.DecRef(prs[i].FileRange()) + } +} + // movePMAsLocked moves all pmas in oldAR to newAR. // // Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0. 
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 1c385bc5a..f2fe163e8 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -305,8 +305,8 @@ type File interface { MapInto(as AddressSpace, addr usermem.Addr, fr FileRange, at usermem.AccessType, precommit bool) error // MapInternal returns a mapping of the given file offsets in the invoking - // process' address space for reading and writing. The lifetime of the - // returned mapping is implementation-defined. + // process' address space for reading and writing. The returned mapping is + // valid as long as a reference is held on the mapped range. // // Note that fr.Start and fr.End need not be page-aligned. // -- cgit v1.2.3 From b9c469f37282129031a6036cfe43028faaeb1a96 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 11 Jul 2018 14:23:17 -0700 Subject: Move ptrace constants to abi/linux. PiperOrigin-RevId: 204188763 Change-Id: I5596ab7abb3ec9e210a7f57b3fc420e836fa43f3 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/ptrace.go | 89 ++++++++++++++++++++++++++++++++++++ pkg/sentry/kernel/ptrace.go | 108 ++++++++++++++++++++------------------------ pkg/sentry/strace/ptrace.go | 80 ++++++++++++++++---------------- 4 files changed, 179 insertions(+), 99 deletions(-) create mode 100644 pkg/abi/linux/ptrace.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 5d00b66cc..e164945cf 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -48,6 +48,7 @@ go_library( "netlink_route.go", "poll.go", "prctl.go", + "ptrace.go", "rusage.go", "sched.go", "seccomp.go", diff --git a/pkg/abi/linux/ptrace.go b/pkg/abi/linux/ptrace.go new file mode 100644 index 000000000..ba48d4d6d --- /dev/null +++ b/pkg/abi/linux/ptrace.go @@ -0,0 +1,89 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// ptrace commands from include/uapi/linux/ptrace.h. +const ( + PTRACE_TRACEME = 0 + PTRACE_PEEKTEXT = 1 + PTRACE_PEEKDATA = 2 + PTRACE_PEEKUSR = 3 + PTRACE_POKETEXT = 4 + PTRACE_POKEDATA = 5 + PTRACE_POKEUSR = 6 + PTRACE_CONT = 7 + PTRACE_KILL = 8 + PTRACE_SINGLESTEP = 9 + PTRACE_ATTACH = 16 + PTRACE_DETACH = 17 + PTRACE_SYSCALL = 24 + PTRACE_SETOPTIONS = 0x4200 + PTRACE_GETEVENTMSG = 0x4201 + PTRACE_GETSIGINFO = 0x4202 + PTRACE_SETSIGINFO = 0x4203 + PTRACE_GETREGSET = 0x4204 + PTRACE_SETREGSET = 0x4205 + PTRACE_SEIZE = 0x4206 + PTRACE_INTERRUPT = 0x4207 + PTRACE_LISTEN = 0x4208 + PTRACE_PEEKSIGINFO = 0x4209 + PTRACE_GETSIGMASK = 0x420a + PTRACE_SETSIGMASK = 0x420b + PTRACE_SECCOMP_GET_FILTER = 0x420c + PTRACE_SECCOMP_GET_METADATA = 0x420d +) + +// ptrace commands from arch/x86/include/uapi/asm/ptrace-abi.h. +const ( + PTRACE_GETREGS = 12 + PTRACE_SETREGS = 13 + PTRACE_GETFPREGS = 14 + PTRACE_SETFPREGS = 15 + PTRACE_GETFPXREGS = 18 + PTRACE_SETFPXREGS = 19 + PTRACE_OLDSETOPTIONS = 21 + PTRACE_GET_THREAD_AREA = 25 + PTRACE_SET_THREAD_AREA = 26 + PTRACE_ARCH_PRCTL = 30 + PTRACE_SYSEMU = 31 + PTRACE_SYSEMU_SINGLESTEP = 32 + PTRACE_SINGLEBLOCK = 33 +) + +// ptrace event codes from include/uapi/linux/ptrace.h. +const ( + PTRACE_EVENT_FORK = 1 + PTRACE_EVENT_VFORK = 2 + PTRACE_EVENT_CLONE = 3 + PTRACE_EVENT_EXEC = 4 + PTRACE_EVENT_VFORK_DONE = 5 + PTRACE_EVENT_EXIT = 6 + PTRACE_EVENT_SECCOMP = 7 + PTRACE_EVENT_STOP = 128 +) + +// PTRACE_SETOPTIONS options from include/uapi/linux/ptrace.h. 
+const ( + PTRACE_O_TRACESYSGOOD = 1 + PTRACE_O_TRACEFORK = 1 << PTRACE_EVENT_FORK + PTRACE_O_TRACEVFORK = 1 << PTRACE_EVENT_VFORK + PTRACE_O_TRACECLONE = 1 << PTRACE_EVENT_CLONE + PTRACE_O_TRACEEXEC = 1 << PTRACE_EVENT_EXEC + PTRACE_O_TRACEVFORKDONE = 1 << PTRACE_EVENT_VFORK_DONE + PTRACE_O_TRACEEXIT = 1 << PTRACE_EVENT_EXIT + PTRACE_O_TRACESECCOMP = 1 << PTRACE_EVENT_SECCOMP + PTRACE_O_EXITKILL = 1 << 20 + PTRACE_O_SUSPEND_SECCOMP = 1 << 21 +) diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 20b1c4cd4..f1c2c4bf0 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -16,7 +16,6 @@ package kernel import ( "fmt" - "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" @@ -24,19 +23,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" ) -// ptrace constants from Linux's include/uapi/linux/ptrace.h. -const ( - _PTRACE_EVENT_SECCOMP = 7 - PTRACE_SEIZE = 0x4206 - PTRACE_INTERRUPT = 0x4207 - PTRACE_LISTEN = 0x4208 - PTRACE_PEEKSIGINFO = 0x4209 - PTRACE_GETSIGMASK = 0x420a - PTRACE_SETSIGMASK = 0x420b - _PTRACE_O_EXITKILL = 1 << 20 - _PTRACE_O_TRACESECCOMP = 1 << _PTRACE_EVENT_SECCOMP -) - // ptraceOptions are the subset of options controlling a task's ptrace behavior // that are set by ptrace(PTRACE_SETOPTIONS). 
type ptraceOptions struct { @@ -505,7 +491,7 @@ func (t *Task) ptraceSeccomp(data uint16) bool { return false } t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") - t.ptraceEventLocked(_PTRACE_EVENT_SECCOMP, uint64(data)) + t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data)) return true } @@ -587,19 +573,19 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions case ptraceCloneKindClone: if t.ptraceOpts.TraceClone { t.Debugf("Entering PTRACE_EVENT_CLONE stop") - t.ptraceEventLocked(syscall.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) + t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) event = true } case ptraceCloneKindFork: if t.ptraceOpts.TraceFork { t.Debugf("Entering PTRACE_EVENT_FORK stop") - t.ptraceEventLocked(syscall.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) + t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) event = true } case ptraceCloneKindVfork: if t.ptraceOpts.TraceVfork { t.Debugf("Entering PTRACE_EVENT_VFORK stop") - t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) event = true } default: @@ -657,7 +643,7 @@ func (t *Task) ptraceVforkDone(child ThreadID) bool { return false } t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") - t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK_DONE, uint64(child)) + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child)) return true } @@ -680,7 +666,7 @@ func (t *Task) ptraceExec(oldTID ThreadID) { } if t.ptraceOpts.TraceExec { t.Debugf("Entering PTRACE_EVENT_EXEC stop") - t.ptraceEventLocked(syscall.PTRACE_EVENT_EXEC, uint64(oldTID)) + t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID)) return } // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing @@ -714,7 +700,7 @@ func (t *Task) ptraceExit() { status := t.exitStatus.Status() t.tg.signalHandlers.mu.Unlock() 
t.Debugf("Entering PTRACE_EVENT_EXIT stop") - t.ptraceEventLocked(syscall.PTRACE_EVENT_EXIT, uint64(status)) + t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) } // Preconditions: The TaskSet mutex must be locked. @@ -762,7 +748,7 @@ func (t *Task) ptraceKill(target *Task) error { // Ptrace implements the ptrace system call. func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // PTRACE_TRACEME ignores all other arguments. - if req == syscall.PTRACE_TRACEME { + if req == linux.PTRACE_TRACEME { return t.ptraceTraceme() } // All other ptrace requests operate on a current or future tracee @@ -774,12 +760,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // PTRACE_ATTACH (and PTRACE_SEIZE, which is unimplemented) do not require // that target is not already a tracee. - if req == syscall.PTRACE_ATTACH { + if req == linux.PTRACE_ATTACH { return t.ptraceAttach(target) } // PTRACE_KILL (and PTRACE_INTERRUPT, which is unimplemented) require that // the target is a tracee, but does not require that it is ptrace-stopped. - if req == syscall.PTRACE_KILL { + if req == linux.PTRACE_KILL { return t.ptraceKill(target) } // All other ptrace requests require that the target is a ptrace-stopped @@ -812,37 +798,37 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // Resuming commands end the ptrace stop, but only if successful. 
switch req { - case syscall.PTRACE_DETACH: + case linux.PTRACE_DETACH: if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil - case syscall.PTRACE_CONT: + case linux.PTRACE_CONT: if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil - case syscall.PTRACE_SYSCALL: + case linux.PTRACE_SYSCALL: if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil - case syscall.PTRACE_SINGLESTEP: + case linux.PTRACE_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil - case syscall.PTRACE_SYSEMU: + case linux.PTRACE_SYSEMU: if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil - case syscall.PTRACE_SYSEMU_SINGLESTEP: + case linux.PTRACE_SYSEMU_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err @@ -853,7 +839,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { defer target.ptraceUnfreeze() switch req { - case syscall.PTRACE_PEEKTEXT, syscall.PTRACE_PEEKDATA: + case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA: // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and // PTRACE_PEEKUSER requests have a different API: they store the result // at the address specified by the data parameter, and the return value @@ -867,13 +853,13 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { _, err := t.CopyOut(data, word) return err - case syscall.PTRACE_POKETEXT, syscall.PTRACE_POKEDATA: + case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, 
t.Arch().Native(uintptr(data)), usermem.IOOpts{ IgnorePermissions: true, }) return err - case syscall.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER + case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER n, err := target.Arch().PtracePeekUser(uintptr(addr)) if err != nil { return err @@ -881,10 +867,10 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { _, err = t.CopyOut(data, n) return err - case syscall.PTRACE_POKEUSR: // aka PTRACE_POKEUSER + case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) - case syscall.PTRACE_GETREGS: + case linux.PTRACE_GETREGS: // "Copy the tracee's general-purpose ... registers ... to the address // data in the tracer. ... (addr is ignored.) Note that SPARC systems // have the meaning of data and addr reversed ..." @@ -898,7 +884,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { }) return err - case syscall.PTRACE_GETFPREGS: + case linux.PTRACE_GETFPREGS: _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), @@ -909,7 +895,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { }) return err - case syscall.PTRACE_GETREGSET: + case linux.PTRACE_GETREGSET: // "Read the tracee's registers. addr specifies, in an // architecture-dependent way, the type of registers to be read. ... 
// data points to a struct iovec, which describes the destination @@ -934,7 +920,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { ar.End -= usermem.Addr(n) return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) - case syscall.PTRACE_SETREGS: + case linux.PTRACE_SETREGS: _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), @@ -945,7 +931,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { }) return err - case syscall.PTRACE_SETFPREGS: + case linux.PTRACE_SETFPREGS: _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), @@ -956,7 +942,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { }) return err - case syscall.PTRACE_SETREGSET: + case linux.PTRACE_SETREGSET: ars, err := t.CopyInIovecs(data, 1) if err != nil { return err @@ -976,7 +962,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { ar.End -= usermem.Addr(n) return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) - case syscall.PTRACE_GETSIGINFO: + case linux.PTRACE_GETSIGINFO: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { @@ -985,7 +971,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { _, err := t.CopyOut(data, target.ptraceSiginfo) return err - case syscall.PTRACE_SETSIGINFO: + case linux.PTRACE_SETSIGINFO: var info arch.SignalInfo if _, err := t.CopyIn(data, &info); err != nil { return err @@ -998,7 +984,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { target.ptraceSiginfo = &info return nil - case PTRACE_GETSIGMASK: + case linux.PTRACE_GETSIGMASK: if addr != linux.SignalSetSize { return syserror.EINVAL } @@ -1007,7 +993,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { _, err := t.CopyOut(data, target.tr.SignalMask) return err - case 
PTRACE_SETSIGMASK: + case linux.PTRACE_SETSIGMASK: if addr != linux.SignalSetSize { return syserror.EINVAL } @@ -1019,29 +1005,35 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { target.SetSignalMask(mask &^ UnblockableSignals) return nil - case syscall.PTRACE_SETOPTIONS: + case linux.PTRACE_SETOPTIONS: t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() - validOpts := uintptr(_PTRACE_O_EXITKILL | syscall.PTRACE_O_TRACESYSGOOD | syscall.PTRACE_O_TRACECLONE | - syscall.PTRACE_O_TRACEEXEC | syscall.PTRACE_O_TRACEEXIT | syscall.PTRACE_O_TRACEFORK | - _PTRACE_O_TRACESECCOMP | syscall.PTRACE_O_TRACEVFORK | syscall.PTRACE_O_TRACEVFORKDONE) + validOpts := uintptr(linux.PTRACE_O_EXITKILL | + linux.PTRACE_O_TRACESYSGOOD | + linux.PTRACE_O_TRACECLONE | + linux.PTRACE_O_TRACEEXEC | + linux.PTRACE_O_TRACEEXIT | + linux.PTRACE_O_TRACEFORK | + linux.PTRACE_O_TRACESECCOMP | + linux.PTRACE_O_TRACEVFORK | + linux.PTRACE_O_TRACEVFORKDONE) if uintptr(data)&^validOpts != 0 { return syserror.EINVAL } target.ptraceOpts = ptraceOptions{ - ExitKill: data&_PTRACE_O_EXITKILL != 0, - SysGood: data&syscall.PTRACE_O_TRACESYSGOOD != 0, - TraceClone: data&syscall.PTRACE_O_TRACECLONE != 0, - TraceExec: data&syscall.PTRACE_O_TRACEEXEC != 0, - TraceExit: data&syscall.PTRACE_O_TRACEEXIT != 0, - TraceFork: data&syscall.PTRACE_O_TRACEFORK != 0, - TraceSeccomp: data&_PTRACE_O_TRACESECCOMP != 0, - TraceVfork: data&syscall.PTRACE_O_TRACEVFORK != 0, - TraceVforkDone: data&syscall.PTRACE_O_TRACEVFORKDONE != 0, + ExitKill: data&linux.PTRACE_O_EXITKILL != 0, + SysGood: data&linux.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: data&linux.PTRACE_O_TRACECLONE != 0, + TraceExec: data&linux.PTRACE_O_TRACEEXEC != 0, + TraceExit: data&linux.PTRACE_O_TRACEEXIT != 0, + TraceFork: data&linux.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: data&linux.PTRACE_O_TRACESECCOMP != 0, + TraceVfork: data&linux.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: data&linux.PTRACE_O_TRACEVFORKDONE != 0, } 
return nil - case syscall.PTRACE_GETEVENTMSG: + case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) diff --git a/pkg/sentry/strace/ptrace.go b/pkg/sentry/strace/ptrace.go index a0dabb27a..fcdb7e9f4 100644 --- a/pkg/sentry/strace/ptrace.go +++ b/pkg/sentry/strace/ptrace.go @@ -15,164 +15,162 @@ package strace import ( - "syscall" - "gvisor.googlesource.com/gvisor/pkg/abi" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" ) // PtraceRequestSet are the possible ptrace(2) requests. var PtraceRequestSet = abi.ValueSet{ { - Value: syscall.PTRACE_TRACEME, + Value: linux.PTRACE_TRACEME, Name: "PTRACE_TRACEME", }, { - Value: syscall.PTRACE_PEEKTEXT, + Value: linux.PTRACE_PEEKTEXT, Name: "PTRACE_PEEKTEXT", }, { - Value: syscall.PTRACE_PEEKDATA, + Value: linux.PTRACE_PEEKDATA, Name: "PTRACE_PEEKDATA", }, { - Value: syscall.PTRACE_PEEKUSR, + Value: linux.PTRACE_PEEKUSR, Name: "PTRACE_PEEKUSR", }, { - Value: syscall.PTRACE_POKETEXT, + Value: linux.PTRACE_POKETEXT, Name: "PTRACE_POKETEXT", }, { - Value: syscall.PTRACE_POKEDATA, + Value: linux.PTRACE_POKEDATA, Name: "PTRACE_POKEDATA", }, { - Value: syscall.PTRACE_POKEUSR, + Value: linux.PTRACE_POKEUSR, Name: "PTRACE_POKEUSR", }, { - Value: syscall.PTRACE_CONT, + Value: linux.PTRACE_CONT, Name: "PTRACE_CONT", }, { - Value: syscall.PTRACE_KILL, + Value: linux.PTRACE_KILL, Name: "PTRACE_KILL", }, { - Value: syscall.PTRACE_SINGLESTEP, + Value: linux.PTRACE_SINGLESTEP, Name: "PTRACE_SINGLESTEP", }, { - Value: syscall.PTRACE_ATTACH, + Value: linux.PTRACE_ATTACH, Name: "PTRACE_ATTACH", }, { - Value: syscall.PTRACE_DETACH, + Value: linux.PTRACE_DETACH, Name: "PTRACE_DETACH", }, { - Value: syscall.PTRACE_SYSCALL, + Value: linux.PTRACE_SYSCALL, Name: "PTRACE_SYSCALL", }, { - Value: syscall.PTRACE_SETOPTIONS, + Value: linux.PTRACE_SETOPTIONS, Name: "PTRACE_SETOPTIONS", }, { - Value: 
syscall.PTRACE_GETEVENTMSG, + Value: linux.PTRACE_GETEVENTMSG, Name: "PTRACE_GETEVENTMSG", }, { - Value: syscall.PTRACE_GETSIGINFO, + Value: linux.PTRACE_GETSIGINFO, Name: "PTRACE_GETSIGINFO", }, { - Value: syscall.PTRACE_SETSIGINFO, + Value: linux.PTRACE_SETSIGINFO, Name: "PTRACE_SETSIGINFO", }, { - Value: syscall.PTRACE_GETREGSET, + Value: linux.PTRACE_GETREGSET, Name: "PTRACE_GETREGSET", }, { - Value: syscall.PTRACE_SETREGSET, + Value: linux.PTRACE_SETREGSET, Name: "PTRACE_SETREGSET", }, { - Value: kernel.PTRACE_SEIZE, + Value: linux.PTRACE_SEIZE, Name: "PTRACE_SEIZE", }, { - Value: kernel.PTRACE_INTERRUPT, + Value: linux.PTRACE_INTERRUPT, Name: "PTRACE_INTERRUPT", }, { - Value: kernel.PTRACE_LISTEN, + Value: linux.PTRACE_LISTEN, Name: "PTRACE_LISTEN", }, { - Value: kernel.PTRACE_PEEKSIGINFO, + Value: linux.PTRACE_PEEKSIGINFO, Name: "PTRACE_PEEKSIGINFO", }, { - Value: kernel.PTRACE_GETSIGMASK, + Value: linux.PTRACE_GETSIGMASK, Name: "PTRACE_GETSIGMASK", }, { - Value: kernel.PTRACE_SETSIGMASK, + Value: linux.PTRACE_SETSIGMASK, Name: "PTRACE_SETSIGMASK", }, { - Value: syscall.PTRACE_GETREGS, + Value: linux.PTRACE_GETREGS, Name: "PTRACE_GETREGS", }, { - Value: syscall.PTRACE_SETREGS, + Value: linux.PTRACE_SETREGS, Name: "PTRACE_SETREGS", }, { - Value: syscall.PTRACE_GETFPREGS, + Value: linux.PTRACE_GETFPREGS, Name: "PTRACE_GETFPREGS", }, { - Value: syscall.PTRACE_SETFPREGS, + Value: linux.PTRACE_SETFPREGS, Name: "PTRACE_SETFPREGS", }, { - Value: syscall.PTRACE_GETFPXREGS, + Value: linux.PTRACE_GETFPXREGS, Name: "PTRACE_GETFPXREGS", }, { - Value: syscall.PTRACE_SETFPXREGS, + Value: linux.PTRACE_SETFPXREGS, Name: "PTRACE_SETFPXREGS", }, { - Value: syscall.PTRACE_OLDSETOPTIONS, + Value: linux.PTRACE_OLDSETOPTIONS, Name: "PTRACE_OLDSETOPTIONS", }, { - Value: syscall.PTRACE_GET_THREAD_AREA, + Value: linux.PTRACE_GET_THREAD_AREA, Name: "PTRACE_GET_THREAD_AREA", }, { - Value: syscall.PTRACE_SET_THREAD_AREA, + Value: linux.PTRACE_SET_THREAD_AREA, Name: 
"PTRACE_SET_THREAD_AREA", }, { - Value: syscall.PTRACE_ARCH_PRCTL, + Value: linux.PTRACE_ARCH_PRCTL, Name: "PTRACE_ARCH_PRCTL", }, { - Value: syscall.PTRACE_SYSEMU, + Value: linux.PTRACE_SYSEMU, Name: "PTRACE_SYSEMU", }, { - Value: syscall.PTRACE_SYSEMU_SINGLESTEP, + Value: linux.PTRACE_SYSEMU_SINGLESTEP, Name: "PTRACE_SYSEMU_SINGLESTEP", }, { - Value: syscall.PTRACE_SINGLEBLOCK, + Value: linux.PTRACE_SINGLEBLOCK, Name: "PTRACE_SINGLEBLOCK", }, } -- cgit v1.2.3 From 41e0b977e5ffc667750c0f706bb70173c5de2161 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 12 Jul 2018 10:36:16 -0700 Subject: Format documentation PiperOrigin-RevId: 204323728 Change-Id: I1ff9aa062ffa12583b2e38ec94c87db7a3711971 --- CONTRIBUTING.md | 74 +++++++++++--------- README.md | 119 ++++++++++++++++---------------- pkg/sentry/fs/README.md | 8 +-- pkg/sentry/fs/proc/README.md | 5 +- pkg/sentry/kernel/README.md | 80 ++++++++++----------- pkg/sentry/mm/README.md | 161 ++++++++++++++++++++++--------------------- pkg/sentry/usermem/README.md | 42 +++++------ 7 files changed, 252 insertions(+), 237 deletions(-) (limited to 'pkg/sentry') diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fa607113c..7ad19fb02 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,15 +7,16 @@ Before we can use your code, you must sign the online. The CLA is necessary mainly because you own the copyright to your changes, even after your contribution becomes part of our codebase, so we need your permission to use and distribute your code. We also need to be sure of -various other things—for instance that you'll tell us if you know that your -code infringes on other people's patents. You don't have to sign the CLA until -after you've submitted your code for review and a member has approved it, but -you must do it before we can put your code into our codebase. 
Before you start -working on a larger contribution, you should get in touch with us first through -the issue tracker with your idea so that we can help out and possibly guide you. +various other things—for instance that you'll tell us if you know that your code +infringes on other people's patents. You don't have to sign the CLA until after +you've submitted your code for review and a member has approved it, but you must +do it before we can put your code into our codebase. Before you start working on +a larger contribution, you should get in touch with us first through the issue +tracker with your idea so that we can help out and possibly guide you. Coordinating up front makes it much easier to avoid frustration later on. ### Coding Guidelines + All code should conform to the [Go style guidelines][gostyle]. As a secure runtime, we need to maintain the safety of all of code included in @@ -25,34 +26,41 @@ Definitions for the rules below: `core`: - * `//pkg/sentry/...` - * Transitive dependencies in `//pkg/...` +* `//pkg/sentry/...` +* Transitive dependencies in `//pkg/...` `runsc`: - * `//runsc/...` +* `//runsc/...` Rules: - * No cgo in `core` or `runsc`. The final binary must be a statically-linked +* No cgo in `core` or `runsc`. The final binary must be a statically-linked pure Go binary. - * Any files importing "unsafe" must have a name ending in `_unsafe.go`. +* Any files importing "unsafe" must have a name ending in `_unsafe.go`. + +* `core` may only depend on the following packages: + + * Itself. + * Go standard library. + * Except (transitively) package "net" (this will result in a non-cgo + binary). Use `//pkg/unet` instead. + * `@org_golang_x_sys//unix:go_default_library` (Go import + `golang.org/x/sys/unix`). + * Generated Go protobuf packages. + * `@com_github_golang_protobuf//proto:go_default_library` (Go import + `github.com/golang/protobuf/proto`). 
+ * `@com_github_golang_protobuf//ptypes:go_default_library` (Go import + `github.com/golang/protobuf/ptypes`). - * `core` may only depend on the following packages: - * Itself. - * Go standard library. - * Except (transitively) package "net" (this will result in a non-cgo - binary). Use `//pkg/unet` instead. - * `@org_golang_x_sys//unix:go_default_library` (Go import `golang.org/x/sys/unix`). - * Generated Go protobuf packages. - * `@com_github_golang_protobuf//proto:go_default_library` (Go import `github.com/golang/protobuf/proto`). - * `@com_github_golang_protobuf//ptypes:go_default_library` (Go import `github.com/golang/protobuf/ptypes`). +* `runsc` may only depend on the following packages: - * `runsc` may only depend on the following packages: - * All packages allowed for `core`. - * `@com_github_google_subcommands//:go_default_library` (Go import `github.com/google/subcommands`). - * `@com_github_opencontainers_runtime_spec//specs_go:go_default_library` (Go import `github.com/opencontainers/runtime-spec/specs_go`). + * All packages allowed for `core`. + * `@com_github_google_subcommands//:go_default_library` (Go import + `github.com/google/subcommands`). + * `@com_github_opencontainers_runtime_spec//specs_go:go_default_library` + (Go import `github.com/opencontainers/runtime-spec/specs_go`). ### Code reviews @@ -66,8 +74,8 @@ To submit a patch, first clone the canonical repository. git clone https://gvisor.googlesource.com/gvisor ``` -From within the cloned directory, install the commit hooks (optional, but if -you don't you will need to generate Change-Ids manually in your commits). +From within the cloned directory, install the commit hooks (optional, but if you +don't you will need to generate Change-Ids manually in your commits). ``` curl -Lo `git rev-parse --git-dir`/hooks/commit-msg https://gerrit-review.googlesource.com/tools/hooks/commit-msg @@ -79,8 +87,8 @@ changes, remember to organize commits logically. 
Changes are not reviewed per branch (as with a pull request), they are reviewed per commit. Before posting a new patch, you will need to generate an appropriate -authentication cookie. Visit the [repository][repo] and click the -"Generate Password" link at the top of the page for instructions. +authentication cookie. Visit the [repository][repo] and click the "Generate +Password" link at the top of the page for instructions. To post a patch for review, push to a special "for" reference. @@ -90,17 +98,17 @@ git push origin HEAD:refs/for/master A change link will be generated for the commit, and a team member will review your change request, provide feedback (and submit when appropriate). To address -feedback, you may be required to amend your commit and repush (don't change -the Commit-Id in the commit message). This will generate a new version of -the change. +feedback, you may be required to amend your commit and repush (don't change the +Commit-Id in the commit message). This will generate a new version of the +change. When approved, the change will be submitted by a team member and automatically merged into the repository. ### The small print -Contributions made by corporations are covered by a different agreement than -the one above, the +Contributions made by corporations are covered by a different agreement than the +one above, the [Software Grant and Corporate Contributor License Agreement][gccla]. [gcla]: https://cla.developers.google.com/about/google-individual diff --git a/README.md b/README.md index ed989b0ed..624b8c062 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # gVisor gVisor is a user-space kernel, written in Go, that implements a substantial -portion of the Linux system surface. It includes an [Open Container Initiative -(OCI)][oci] runtime called `runsc` that provides an isolation boundary between -the application and the host kernel. 
The `runsc` runtime integrates with Docker -and Kubernetes, making it simple to run sandboxed containers. +portion of the Linux system surface. It includes an +[Open Container Initiative (OCI)][oci] runtime called `runsc` that provides an +isolation boundary between the application and the host kernel. The `runsc` +runtime integrates with Docker and Kubernetes, making it simple to run sandboxed +containers. gVisor takes a distinct approach to container sandboxing and makes a different set of technical trade-offs compared to existing sandbox technologies, thus @@ -51,11 +52,11 @@ require a larger resource footprint and slower start-up times. [AppArmor][apparmor], allows the specification of a fine-grained security policy for an application or container. These schemes typically rely on hooks implemented inside the host kernel to enforce the rules. If the surface can be -made small enough (i.e. a sufficiently complete policy defined), then this is -an excellent way to sandbox applications and maintain native performance. -However, in practice it can be extremely difficult (if not impossible) to -reliably define a policy for arbitrary, previously unknown applications, -making this approach challenging to apply universally. +made small enough (i.e. a sufficiently complete policy defined), then this is an +excellent way to sandbox applications and maintain native performance. However, +in practice it can be extremely difficult (if not impossible) to reliably define +a policy for arbitrary, previously unknown applications, making this approach +challenging to apply universally. ![Rule-based execution](g3doc/Rule-Based-Execution.png "Rule-based execution") @@ -109,9 +110,9 @@ application to directly control the system calls it makes. In order to provide defense-in-depth and limit the host system surface, the gVisor container runtime is normally split into two separate processes. 
First, the *Sentry* process includes the kernel and is responsible for executing user -code and handling system calls. Second, file system operations that extend beyond -the sandbox (not internal proc or tmp files, pipes, etc.) are sent to a proxy, -called a *Gofer*, via a 9P connection. +code and handling system calls. Second, file system operations that extend +beyond the sandbox (not internal proc or tmp files, pipes, etc.) are sent to a +proxy, called a *Gofer*, via a 9P connection. ![Sentry](g3doc/Sentry-Gofer.png "Sentry and Gofer") @@ -138,17 +139,17 @@ isolation (see below). The Sentry requires a *platform* to implement basic context switching and memory mapping functionality. Today, gVisor supports two platforms: -* The **Ptrace** platform uses SYSEMU functionality to execute user code without - executing host system calls. This platform can run anywhere that `ptrace` - works (even VMs without nested virtualization). +* The **Ptrace** platform uses SYSEMU functionality to execute user code + without executing host system calls. This platform can run anywhere that + `ptrace` works (even VMs without nested virtualization). -* The **KVM** platform (experimental) allows the Sentry to act as both guest OS - and VMM, switching back and forth between the two worlds seamlessly. The KVM - platform can run on bare-metal or on a VM with nested virtualization enabled. - While there is no virtualized hardware layer -- the sandbox retains a process - model -- gVisor leverages virtualization extensions available on modern - processors in order to improve isolation and performance of address space - switches. +* The **KVM** platform (experimental) allows the Sentry to act as both guest + OS and VMM, switching back and forth between the two worlds seamlessly. The + KVM platform can run on bare-metal or on a VM with nested virtualization + enabled. 
While there is no virtualized hardware layer -- the sandbox retains + a process model -- gVisor leverages virtualization extensions available on + modern processors in order to improve isolation and performance of address + space switches. ### Performance @@ -172,8 +173,8 @@ binaries). The easiest way to get `runsc` is from the [latest nightly build][runsc-nightly]. After you download the binary, check it -against the SHA512 [checksum file][runsc-nightly-sha]. Older builds can be -found here: +against the SHA512 [checksum file][runsc-nightly-sha]. Older builds can be found +here: `https://storage.googleapis.com/gvisor/releases/nightly/${yyyy-mm-dd}/runsc` and `https://storage.googleapis.com/gvisor/releases/nightly/${yyyy-mm-dd}/runsc.sha512` @@ -193,8 +194,8 @@ sudo mv runsc /usr/local/bin Next, configure Docker to use `runsc` by adding a runtime entry to your Docker configuration (`/etc/docker/daemon.json`). You may have to create this file if -it does not exist. Also, some Docker versions also require you to [specify the -`storage-driver` field][docker-storage-driver]. +it does not exist. Also, some Docker versions also require you to +[specify the `storage-driver` field][docker-storage-driver]. In the end, the file should look something like: @@ -208,7 +209,8 @@ In the end, the file should look something like: } ``` -You must restart the Docker daemon after making changes to this file, typically this is done via: +You must restart the Docker daemon after making changes to this file, typically +this is done via: ``` sudo systemctl restart docker @@ -229,8 +231,8 @@ docker run --runtime=runsc -it ubuntu /bin/bash ### Kubernetes Support (Experimental) gVisor can run sandboxed containers in a Kubernetes cluster with cri-o, although -this is not recommended for production environments yet. Follow [these -instructions][cri-o-k8s] to run [cri-o][cri-o] on a node in a Kubernetes +this is not recommended for production environments yet. 
Follow +[these instructions][cri-o-k8s] to run [cri-o][cri-o] on a node in a Kubernetes cluster. Build `runsc` and put it on the node, and set it as the `runtime_untrusted_workload` in `/etc/crio/crio.conf`. @@ -251,11 +253,11 @@ gVisor currently requires x86\_64 Linux to build. Make sure the following dependencies are installed: -* [git][git] -* [Bazel][bazel] -* [Python][python] -* [Docker version 17.09.0 or greater][docker] -* Gold linker (e.g. `binutils-gold` package on Ubuntu) +* [git][git] +* [Bazel][bazel] +* [Python][python] +* [Docker version 17.09.0 or greater][docker] +* Gold linker (e.g. `binutils-gold` package on Ubuntu) #### Getting the source @@ -275,7 +277,6 @@ bazel build runsc sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin ``` - ### Testing The gVisor test suite can be run with Bazel: @@ -366,33 +367,33 @@ Then restart the Docker daemon. gVisor implements a large portion of the Linux surface and while we strive to make it broadly compatible, there are (and always will be) unimplemented features and bugs. The only real way to know if it will work is to try. If you -find a container that doesn’t work and there is no known issue, please [file a -bug][bug] indicating the full command you used to run the image. Providing the -debug logs is also helpful. +find a container that doesn’t work and there is no known issue, please +[file a bug][bug] indicating the full command you used to run the image. +Providing the debug logs is also helpful. ### What works? 
The following applications/images have been tested: -* elasticsearch -* golang -* httpd -* java8 -* jenkins -* mariadb -* memcached -* mongo -* mysql -* nginx -* node -* php -* postgres -* prometheus -* python -* redis -* registry -* tomcat -* wordpress +* elasticsearch +* golang +* httpd +* java8 +* jenkins +* mariadb +* memcached +* mongo +* mysql +* nginx +* node +* php +* postgres +* prometheus +* python +* redis +* registry +* tomcat +* wordpress ### My container runs fine with *runc* but fails with *runsc*. @@ -416,8 +417,8 @@ This bug is tracked in [bug #4](https://github.com/google/gvisor/issues/4). ## Technical details -We plan to release a full paper with technical details and will include it -here when available. +We plan to release a full paper with technical details and will include it here +when available. ## Community diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index 898271ee8..76638cdae 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -149,10 +149,10 @@ An `fs.File` references the following filesystem objects: fs.File -> fs.Dirent -> fs.Inode -> fs.MountedFilesystem ``` -The `fs.Inode` is restored using its `fs.MountedFilesystem`. The [Mount -points](#mount-points) section above describes how this happens in detail. The -`fs.Dirent` restores its pointer to an `fs.Inode`, pointers to parent and -children `fs.Dirents`, and the basename of the file. +The `fs.Inode` is restored using its `fs.MountedFilesystem`. The +[Mount points](#mount-points) section above describes how this happens in +detail. The `fs.Dirent` restores its pointer to an `fs.Inode`, pointers to +parent and children `fs.Dirents`, and the basename of the file. Otherwise an `fs.File` restores flags, an offset, and a unique identifier (only used internally). 
diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index 6ad7297d2..cec842403 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -6,6 +6,7 @@ procfs generally. inconsistency, please file a bug. [TOC] + ## Kernel data The following files are implemented: @@ -91,6 +92,7 @@ Num currently running processes | Always zero Total num processes | Always zero TODO: Populate the columns with accurate statistics. + ### meminfo ```bash @@ -122,7 +124,7 @@ Shmem: 0 kB Notable divergences: Field name | Notes -:---------------- | :-------------------------------------------------------- +:---------------- | :----------------------------------------------------- Buffers | Always zero, no block devices SwapCache | Always zero, no swap Inactive(anon) | Always zero, see SwapCache @@ -182,6 +184,7 @@ softirq 0 0 0 0 0 0 0 0 0 0 0 ``` All fields except for `btime` are always zero. + TODO: Populate with accurate fields. ### sys diff --git a/pkg/sentry/kernel/README.md b/pkg/sentry/kernel/README.md index 88760a9bb..427311be8 100644 --- a/pkg/sentry/kernel/README.md +++ b/pkg/sentry/kernel/README.md @@ -1,12 +1,12 @@ This package contains: -- A (partial) emulation of the "core Linux kernel", which governs task - execution and scheduling, system call dispatch, and signal handling. See - below for details. +- A (partial) emulation of the "core Linux kernel", which governs task + execution and scheduling, system call dispatch, and signal handling. See + below for details. -- The top-level interface for the sentry's Linux kernel emulation in general, - used by the `main` function of all versions of the sentry. This interface - revolves around the `Env` type (defined in `kernel.go`). +- The top-level interface for the sentry's Linux kernel emulation in general, + used by the `main` function of all versions of the sentry. This interface + revolves around the `Env` type (defined in `kernel.go`). 
# Background @@ -20,15 +20,15 @@ sentry's notion of a task unless otherwise specified.) At a high level, Linux application threads can be thought of as repeating a "run loop": -- Some amount of application code is executed in userspace. +- Some amount of application code is executed in userspace. -- A trap (explicit syscall invocation, hardware interrupt or exception, etc.) - causes control flow to switch to the kernel. +- A trap (explicit syscall invocation, hardware interrupt or exception, etc.) + causes control flow to switch to the kernel. -- Some amount of kernel code is executed in kernelspace, e.g. to handle the - cause of the trap. +- Some amount of kernel code is executed in kernelspace, e.g. to handle the + cause of the trap. -- The kernel "returns from the trap" into application code. +- The kernel "returns from the trap" into application code. Analogously, each task in the sentry is associated with a *task goroutine* that executes that task's run loop (`Task.run` in `task_run.go`). However, the @@ -38,24 +38,25 @@ state to, and resuming execution from, checkpoints. While in kernelspace, a Linux thread can be descheduled (cease execution) in a variety of ways: -- It can yield or be preempted, becoming temporarily descheduled but still - runnable. At present, the sentry delegates scheduling of runnable threads to - the Go runtime. +- It can yield or be preempted, becoming temporarily descheduled but still + runnable. At present, the sentry delegates scheduling of runnable threads to + the Go runtime. -- It can exit, becoming permanently descheduled. The sentry's equivalent is - returning from `Task.run`, terminating the task goroutine. +- It can exit, becoming permanently descheduled. The sentry's equivalent is + returning from `Task.run`, terminating the task goroutine. -- It can enter interruptible sleep, a state in which it can be woken by a - caller-defined wakeup or the receipt of a signal. 
In the sentry, interruptible - sleep (which is ambiguously referred to as *blocking*) is implemented by - making all events that can end blocking (including signal notifications) - communicated via Go channels and using `select` to multiplex wakeup sources; - see `task_block.go`. +- It can enter interruptible sleep, a state in which it can be woken by a + caller-defined wakeup or the receipt of a signal. In the sentry, + interruptible sleep (which is ambiguously referred to as *blocking*) is + implemented by making all events that can end blocking (including signal + notifications) communicated via Go channels and using `select` to multiplex + wakeup sources; see `task_block.go`. -- It can enter uninterruptible sleep, a state in which it can only be woken by a - caller-defined wakeup. Killable sleep is a closely related variant in which - the task can also be woken by SIGKILL. (These definitions also include Linux's - "group-stopped" (`TASK_STOPPED`) and "ptrace-stopped" (`TASK_TRACED`) states.) +- It can enter uninterruptible sleep, a state in which it can only be woken by + a caller-defined wakeup. Killable sleep is a closely related variant in + which the task can also be woken by SIGKILL. (These definitions also include + Linux's "group-stopped" (`TASK_STOPPED`) and "ptrace-stopped" + (`TASK_TRACED`) states.) To maximize compatibility with Linux, sentry checkpointing appears as a spurious signal-delivery interrupt on all tasks; interrupted system calls return `EINTR` @@ -71,21 +72,22 @@ through sleeping operations. We break the task's control flow graph into *states*, delimited by: -1. Points where uninterruptible and killable sleeps may occur. For example, -there exists a state boundary between signal dequeueing and signal delivery -because there may be an intervening ptrace signal-delivery-stop. +1. Points where uninterruptible and killable sleeps may occur. 
For example, + there exists a state boundary between signal dequeueing and signal delivery + because there may be an intervening ptrace signal-delivery-stop. -2. Points where sleep-induced branches may "rejoin" normal execution. For -example, the syscall exit state exists because it can be reached immediately -following a synchronous syscall, or after a task that is sleeping in `execve()` -or `vfork()` resumes execution. +2. Points where sleep-induced branches may "rejoin" normal execution. For + example, the syscall exit state exists because it can be reached immediately + following a synchronous syscall, or after a task that is sleeping in + `execve()` or `vfork()` resumes execution. -3. Points containing large branches. This is strictly for organizational -purposes. For example, the state that processes interrupt-signaled conditions is -kept separate from the main "app" state to reduce the size of the latter. +3. Points containing large branches. This is strictly for organizational + purposes. For example, the state that processes interrupt-signaled + conditions is kept separate from the main "app" state to reduce the size of + the latter. -4. `SyscallReinvoke`, which does not correspond to anything in Linux, and exists -solely to serve the autosave feature. +4. `SyscallReinvoke`, which does not correspond to anything in Linux, and + exists solely to serve the autosave feature. ![dot -Tpng -Goverlap=false -orun_states.png run_states.dot](g3doc/run_states.png "Task control flow graph") diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md index 067733475..e485a5ca5 100644 --- a/pkg/sentry/mm/README.md +++ b/pkg/sentry/mm/README.md @@ -38,50 +38,50 @@ forces the kernel to create such a mapping to service the read. For a file, doing so consists of several logical phases: -1. The kernel allocates physical memory to store the contents of the required - part of the file, and copies file contents to the allocated memory. 
Supposing - that the kernel chooses the physical memory at physical address (PA) - 0x2fb000, the resulting state of the system is: +1. The kernel allocates physical memory to store the contents of the required + part of the file, and copies file contents to the allocated memory. + Supposing that the kernel chooses the physical memory at physical address + (PA) 0x2fb000, the resulting state of the system is: VMA: VA:0x400000 -> /tmp/foo:0x0 Filemap: /tmp/foo:0x0 -> PA:0x2fb000 - (In Linux the state of the mapping from file offset to physical memory is - stored in `struct address_space`, but to avoid confusion with other notions - of address space we will refer to this system as filemap, named after Linux - kernel source file `mm/filemap.c`.) + (In Linux the state of the mapping from file offset to physical memory is + stored in `struct address_space`, but to avoid confusion with other notions + of address space we will refer to this system as filemap, named after Linux + kernel source file `mm/filemap.c`.) -2. The kernel stores the effective mapping from virtual to physical address in a - *page table entry* (PTE) in the application's *page tables*, which are used - by the CPU's virtual memory hardware to perform address translation. The - resulting state of the system is: +2. The kernel stores the effective mapping from virtual to physical address in + a *page table entry* (PTE) in the application's *page tables*, which are + used by the CPU's virtual memory hardware to perform address translation. + The resulting state of the system is: VMA: VA:0x400000 -> /tmp/foo:0x0 Filemap: /tmp/foo:0x0 -> PA:0x2fb000 PTE: VA:0x400000 -----------------> PA:0x2fb000 - The PTE is required for the application to actually use the contents of the - mapped file as virtual memory. However, the PTE is derived from the VMA and - filemap state, both of which are independently mutable, such that mutations - to either will affect the PTE. 
For example: - - - The application may remove the VMA using the `munmap` system call. This - breaks the mapping from VA:0x400000 to /tmp/foo:0x0, and consequently the - mapping from VA:0x400000 to PA:0x2fb000. However, it does not necessarily - break the mapping from /tmp/foo:0x0 to PA:0x2fb000, so a future mapping of - the same file offset may reuse this physical memory. - - - The application may invalidate the file's contents by passing a length of 0 - to the `ftruncate` system call. This breaks the mapping from /tmp/foo:0x0 - to PA:0x2fb000, and consequently the mapping from VA:0x400000 to - PA:0x2fb000. However, it does not break the mapping from VA:0x400000 to - /tmp/foo:0x0, so future changes to the file's contents may again be made - visible at VA:0x400000 after another page fault results in the allocation - of a new physical address. - - Note that, in order to correctly break the mapping from VA:0x400000 to - PA:0x2fb000 in the latter case, filemap must also store a *reverse mapping* - from /tmp/foo:0x0 to VA:0x400000 so that it can locate and remove the PTE. + The PTE is required for the application to actually use the contents of the + mapped file as virtual memory. However, the PTE is derived from the VMA and + filemap state, both of which are independently mutable, such that mutations + to either will affect the PTE. For example: + + - The application may remove the VMA using the `munmap` system call. This + breaks the mapping from VA:0x400000 to /tmp/foo:0x0, and consequently + the mapping from VA:0x400000 to PA:0x2fb000. However, it does not + necessarily break the mapping from /tmp/foo:0x0 to PA:0x2fb000, so a + future mapping of the same file offset may reuse this physical memory. + + - The application may invalidate the file's contents by passing a length + of 0 to the `ftruncate` system call. This breaks the mapping from + /tmp/foo:0x0 to PA:0x2fb000, and consequently the mapping from + VA:0x400000 to PA:0x2fb000. 
However, it does not break the mapping from + VA:0x400000 to /tmp/foo:0x0, so future changes to the file's contents + may again be made visible at VA:0x400000 after another page fault + results in the allocation of a new physical address. + + Note that, in order to correctly break the mapping from VA:0x400000 to + PA:0x2fb000 in the latter case, filemap must also store a *reverse mapping* + from /tmp/foo:0x0 to VA:0x400000 so that it can locate and remove the PTE. [^mmap-anon]: Memory mappings to non-files are discussed in later sections. @@ -146,30 +146,30 @@ When the application first incurs a page fault on this address, the host kernel delivers information about the page fault to the sentry in a platform-dependent manner, and the sentry handles the fault: -1. The sentry allocates memory to store the contents of the required part of the - file, and copies file contents to the allocated memory. However, since the - sentry is implemented atop a host kernel, it does not configure mappings to - physical memory directly. Instead, mappable "memory" in the sentry is - represented by a host file descriptor and offset, since (as noted in - "Background") this is the memory mapping primitive provided by the host - kernel. In general, memory is allocated from a temporary host file using the - `filemem` package. Supposing that the sentry allocates offset 0x3000 from - host file "memory-file", the resulting state is: +1. The sentry allocates memory to store the contents of the required part of + the file, and copies file contents to the allocated memory. However, since + the sentry is implemented atop a host kernel, it does not configure mappings + to physical memory directly. Instead, mappable "memory" in the sentry is + represented by a host file descriptor and offset, since (as noted in + "Background") this is the memory mapping primitive provided by the host + kernel. In general, memory is allocated from a temporary host file using the + `filemem` package. 
Supposing that the sentry allocates offset 0x3000 from + host file "memory-file", the resulting state is: Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 -2. The sentry stores the effective mapping from virtual address to host file in - a host VMA by invoking the `mmap` system call: +2. The sentry stores the effective mapping from virtual address to host file in + a host VMA by invoking the `mmap` system call: Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 Host VMA: VA:0x400000 -----------------> host:memory-file:0x3000 -3. The sentry returns control to the application, which immediately incurs the - page fault again.[^mmap-populate] However, since a host VMA now exists for - the faulting virtual address, the host kernel now handles the page fault as - described in "Background": +3. The sentry returns control to the application, which immediately incurs the + page fault again.[^mmap-populate] However, since a host VMA now exists for + the faulting virtual address, the host kernel now handles the page fault as + described in "Background": Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 Sentry filemap: /tmp/foo:0x0 -> host:memory-file:0x3000 @@ -183,12 +183,12 @@ independently mutable, and the desired state of host VMAs is derived from that state. [^mmap-populate]: The sentry could force the host kernel to establish PTEs when - it creates the host VMA by passing the `MAP_POPULATE` flag to - the `mmap` system call, but usually does not. This is because, - to reduce the number of page faults that require handling by - the sentry and (correspondingly) the number of host `mmap` - system calls, the sentry usually creates host VMAs that are - much larger than the single faulting page. + it creates the host VMA by passing the `MAP_POPULATE` flag to + the `mmap` system call, but usually does not. 
This is because, + to reduce the number of page faults that require handling by + the sentry and (correspondingly) the number of host `mmap` + system calls, the sentry usually creates host VMAs that are + much larger than the single faulting page. ## Private Mappings @@ -233,45 +233,46 @@ there is no shared zero page. In Linux: -- A virtual address space is represented by `struct mm_struct`. +- A virtual address space is represented by `struct mm_struct`. -- VMAs are represented by `struct vm_area_struct`, stored in `struct - mm_struct::mmap`. +- VMAs are represented by `struct vm_area_struct`, stored in `struct + mm_struct::mmap`. -- Mappings from file offsets to physical memory are stored in `struct - address_space`. +- Mappings from file offsets to physical memory are stored in `struct + address_space`. -- Reverse mappings from file offsets to virtual mappings are stored in `struct - address_space::i_mmap`. +- Reverse mappings from file offsets to virtual mappings are stored in `struct + address_space::i_mmap`. -- Physical memory pages are represented by a pointer to `struct page` or an - index called a *page frame number* (PFN), represented by `pfn_t`. +- Physical memory pages are represented by a pointer to `struct page` or an + index called a *page frame number* (PFN), represented by `pfn_t`. -- PTEs are represented by architecture-dependent type `pte_t`, stored in a table - hierarchy rooted at `struct mm_struct::pgd`. +- PTEs are represented by architecture-dependent type `pte_t`, stored in a + table hierarchy rooted at `struct mm_struct::pgd`. In the sentry: -- A virtual address space is represented by type [`mm.MemoryManager`][mm]. +- A virtual address space is represented by type [`mm.MemoryManager`][mm]. -- Sentry VMAs are represented by type [`mm.vma`][mm], stored in - `mm.MemoryManager.vmas`. +- Sentry VMAs are represented by type [`mm.vma`][mm], stored in + `mm.MemoryManager.vmas`. 
-- Mappings from sentry file offsets to host file offsets are abstracted through - interface method [`memmap.Mappable.Translate`][memmap]. +- Mappings from sentry file offsets to host file offsets are abstracted + through interface method [`memmap.Mappable.Translate`][memmap]. -- Reverse mappings from sentry file offsets to virtual mappings are abstracted - through interface methods [`memmap.Mappable.AddMapping` and - `memmap.Mappable.RemoveMapping`][memmap]. +- Reverse mappings from sentry file offsets to virtual mappings are abstracted + through interface methods + [`memmap.Mappable.AddMapping` and `memmap.Mappable.RemoveMapping`][memmap]. -- Host files that may be mapped into host VMAs are represented by type - [`platform.File`][platform]. +- Host files that may be mapped into host VMAs are represented by type + [`platform.File`][platform]. -- Host VMAs are represented in the sentry by type [`mm.pma`][mm] ("platform - mapping area"), stored in `mm.MemoryManager.pmas`. +- Host VMAs are represented in the sentry by type [`mm.pma`][mm] ("platform + mapping area"), stored in `mm.MemoryManager.pmas`. -- Creation and destruction of host VMAs is abstracted through interface methods - [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. +- Creation and destruction of host VMAs is abstracted through interface + methods + [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. [filemem]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/filemem/filemem.go [memmap]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/memmap/memmap.go diff --git a/pkg/sentry/usermem/README.md b/pkg/sentry/usermem/README.md index 2ebd3bcc1..f6d2137eb 100644 --- a/pkg/sentry/usermem/README.md +++ b/pkg/sentry/usermem/README.md @@ -2,30 +2,30 @@ This package defines primitives for sentry access to application memory. 
Major types: -- The `IO` interface represents a virtual address space and provides I/O methods - on that address space. `IO` is the lowest-level primitive. The primary - implementation of the `IO` interface is `mm.MemoryManager`. +- The `IO` interface represents a virtual address space and provides I/O + methods on that address space. `IO` is the lowest-level primitive. The + primary implementation of the `IO` interface is `mm.MemoryManager`. -- `IOSequence` represents a collection of individually-contiguous address ranges - in a `IO` that is operated on sequentially, analogous to Linux's `struct - iov_iter`. +- `IOSequence` represents a collection of individually-contiguous address + ranges in a `IO` that is operated on sequentially, analogous to Linux's + `struct iov_iter`. Major usage patterns: -- Access to a task's virtual memory, subject to the application's memory - protections and while running on that task's goroutine, from a context that is - at or above the level of the `kernel` package (e.g. most syscall - implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers - defined in `kernel/task_usermem.go`. +- Access to a task's virtual memory, subject to the application's memory + protections and while running on that task's goroutine, from a context that + is at or above the level of the `kernel` package (e.g. most syscall + implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers + defined in `kernel/task_usermem.go`. -- Access to a task's virtual memory, from a context that is at or above the - level of the `kernel` package, but where any of the above constraints does not - hold (e.g. `PTRACE_POKEDATA`, which ignores application memory protections); - obtain the task's `mm.MemoryManager` by calling `kernel.Task.MemoryManager`, - and call its `IO` methods directly. 
+- Access to a task's virtual memory, from a context that is at or above the + level of the `kernel` package, but where any of the above constraints does + not hold (e.g. `PTRACE_POKEDATA`, which ignores application memory + protections); obtain the task's `mm.MemoryManager` by calling + `kernel.Task.MemoryManager`, and call its `IO` methods directly. -- Access to a task's virtual memory, from a context that is below the level of - the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments - from higher layers, usually in the form of an `IOSequence`. The - `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions in - `kernel/task_usermem.go` are convenience functions for doing so. +- Access to a task's virtual memory, from a context that is below the level of + the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments + from higher layers, usually in the form of an `IOSequence`. The + `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions + in `kernel/task_usermem.go` are convenience functions for doing so. -- cgit v1.2.3 From bb41ad808a75b8a945d82df51f0e322d98edf951 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Thu, 12 Jul 2018 14:18:11 -0700 Subject: sentry: save inet stacks in proc files. PiperOrigin-RevId: 204362791 Change-Id: If85ea7442741e299f0d7cddbc3d6b415e285da81 --- pkg/sentry/fs/proc/net.go | 2 +- pkg/sentry/fs/proc/sys_net.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index ad94c475a..ee0c825e8 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -55,7 +55,7 @@ func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. 
type ifinet6 struct { - s inet.Stack `state:"nosave"` // S/R-FIXME + s inet.Stack } func (n *ifinet6) contents() []string { diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 2a108708c..f3a5043f8 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -102,7 +102,7 @@ func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, type tcpSack struct { ramfs.Entry - s inet.Stack `state:"nosave"` // S/R-FIXME + s inet.Stack } func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { -- cgit v1.2.3 From 1cd46c8dd1a92dd0ad3eeb60a763278f2e98d0b4 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Thu, 12 Jul 2018 15:07:59 -0700 Subject: sentry: wait for restore clock instead of panicking in Timekeeper. PiperOrigin-RevId: 204372296 Change-Id: If1ed9843b93039806e0c65521f30177dc8036979 --- pkg/sentry/kernel/timekeeper.go | 25 ++++++++++++++++--------- pkg/sentry/kernel/timekeeper_state.go | 2 +- pkg/sentry/kernel/timekeeper_test.go | 4 ++-- 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 3f16c1676..4de8ac13b 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -44,14 +44,14 @@ type Timekeeper struct { // It is set only once, by SetClocks. monotonicOffset int64 `state:"nosave"` - // restored indicates that this Timekeeper was restored from a state - // file. - restored bool `state:"nosave"` + // restored, if non-nil, indicates that this Timekeeper was restored + // from a state file. The clocks are not set until restored is closed. + restored chan struct{} `state:"nosave"` // saveMonotonic is the (offset) value of the monotonic clock at the // time of save. // - // It is only valid if restored is true. + // It is only valid if restored is non-nil. 
// // It is only used in SetClocks after restore to compute the new // monotonicOffset. @@ -59,7 +59,7 @@ type Timekeeper struct { // saveRealtime is the value of the realtime clock at the time of save. // - // It is only valid if restored is true. + // It is only valid if restored is non-nil. // // It is only used in SetClocks after restore to compute the new // monotonicOffset. @@ -98,7 +98,7 @@ func NewTimekeeper(platform platform.Platform, paramPage platform.FileRange) (*T func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { // Update the params, marking them "not ready", as we may need to // restart calibration on this new machine. - if t.restored { + if t.restored != nil { if err := t.params.Write(func() vdsoParams { return vdsoParams{} }); err != nil { @@ -135,7 +135,7 @@ func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { panic("Unable to get current realtime: " + err.Error()) } - if t.restored { + if t.restored != nil { wantMonotonic = t.saveMonotonic elapsed := nowRealtime - t.saveRealtime if elapsed > 0 { @@ -145,7 +145,7 @@ func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { t.monotonicOffset = wantMonotonic - nowMonotonic - if !t.restored { + if t.restored == nil { // Hold on to the initial "boot" time. t.bootTime = ktime.FromNanoseconds(nowRealtime) } @@ -153,6 +153,10 @@ func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { t.mu.Lock() defer t.mu.Unlock() t.startUpdater() + + if t.restored != nil { + close(t.restored) + } } // startUpdater starts an update goroutine that keeps the clocks updated. @@ -255,7 +259,10 @@ func (t *Timekeeper) ResumeUpdates() { // GetTime returns the current time in nanoseconds. 
func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { if t.clocks == nil { - panic("Timekeeper used before initialized with SetClocks") + if t.restored == nil { + panic("Timekeeper used before initialized with SetClocks") + } + <-t.restored } now, err := t.clocks.GetTime(c) if err == nil && c == sentrytime.Monotonic { diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go index aee983ac7..2e7fed4d8 100644 --- a/pkg/sentry/kernel/timekeeper_state.go +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -37,5 +37,5 @@ func (t *Timekeeper) beforeSave() { // afterLoad is invoked by stateify. func (t *Timekeeper) afterLoad() { - t.restored = true + t.restored = make(chan struct{}) } diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 08bacba4f..34a5cec27 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -107,7 +107,7 @@ func TestTimekeeperMonotonicForward(t *testing.T) { } tk := stateTestClocklessTimekeeper(t) - tk.restored = true + tk.restored = make(chan struct{}) tk.saveMonotonic = 100000 tk.saveRealtime = 400000 tk.SetClocks(c) @@ -135,7 +135,7 @@ func TestTimekeeperMonotonicJumpBackwards(t *testing.T) { } tk := stateTestClocklessTimekeeper(t) - tk.restored = true + tk.restored = make(chan struct{}) tk.saveMonotonic = 100000 tk.saveRealtime = 600000 tk.SetClocks(c) -- cgit v1.2.3 From a28b274abb3ac0ce652ee395d5a48e7b7fdfb3ad Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 12 Jul 2018 17:13:41 -0700 Subject: Fix aio eventfd lookup We're failing to set eventFile in the outer scope. 
PiperOrigin-RevId: 204392995 Change-Id: Ib9b04f839599ef552d7b5951d08223e2b1d5f6ad --- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 345ef9bec..fc3397081 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -300,7 +300,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Was there an eventFD? Extract it. var eventFile *fs.File if cb.Flags&_IOCB_FLAG_RESFD != 0 { - eventFile := t.FDMap().GetFile(kdefs.FD(cb.ResFD)) + eventFile = t.FDMap().GetFile(kdefs.FD(cb.ResFD)) if eventFile == nil { // Bad FD. return syserror.EBADF -- cgit v1.2.3 From f09ebd9c71eecdfb79f64b6abb26db3b66b8156b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 13 Jul 2018 10:23:16 -0700 Subject: Note that Mount errors do not require translations PiperOrigin-RevId: 204490639 Change-Id: I0fe26306bae9320c6aa4f854fe0ef25eebd93233 --- pkg/sentry/fs/filesystems.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index 7cd76dfe9..e2c255be6 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -48,6 +48,8 @@ type Filesystem interface { // Mount generates a mountable Inode backed by device and configured // using file system independent flags and file system dependent // data options. + // + // Mount may return arbitrary errors. They do not need syserr translations. Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) // AllowUserMount determines whether mount(2) is allowed to mount a -- cgit v1.2.3 From 5b09ec3b890141959aa6a6a73b1ee4e26490c5cc Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Fri, 13 Jul 2018 12:10:01 -0700 Subject: Allow a filesystem to control its visibility in /proc/filesystems. 
PiperOrigin-RevId: 204508520 Change-Id: I09e5f8b6e69413370e1a0d39dbb7dc1ee0b6192d --- pkg/sentry/fs/dev/fs.go | 5 +++++ pkg/sentry/fs/filesystems.go | 4 ++++ pkg/sentry/fs/gofer/fs.go | 5 +++++ pkg/sentry/fs/host/fs.go | 5 +++++ pkg/sentry/fs/mount_overlay.go | 5 +++++ pkg/sentry/fs/proc/filesystems.go | 3 +++ pkg/sentry/fs/proc/fs.go | 5 +++++ pkg/sentry/fs/sys/fs.go | 5 +++++ pkg/sentry/fs/tmpfs/fs.go | 5 +++++ pkg/sentry/fs/tty/fs.go | 5 +++++ 10 files changed, 47 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 4945ac962..3c79f3782 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -49,6 +49,11 @@ func (*filesystem) AllowUserMount() bool { return true } +// AllowUserList allows this filesystem to be listed in /proc/filesystems. +func (*filesystem) AllowUserList() bool { + return true +} + // Flags returns that there is nothing special about this file system. // // In Linux, devtmpfs does the same thing. diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index e2c255be6..200e792f4 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -55,6 +55,10 @@ type Filesystem interface { // AllowUserMount determines whether mount(2) is allowed to mount a // file system of this type. AllowUserMount() bool + + // AllowUserList determines whether this filesystem is listed in + // /proc/filesystems + AllowUserList() bool } // filesystems is the global set of registered file systems. It does not need diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index e041074d2..dd5d43c47 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -103,6 +103,11 @@ func (*filesystem) AllowUserMount() bool { return false } +// AllowUserList allows this filesystem to be listed in /proc/filesystems. +func (*filesystem) AllowUserList() bool { + return true +} + // Flags returns that there is nothing special about this file system. 
// // The 9p Linux client returns FS_RENAME_DOES_D_MOVE, see fs/9p/vfs_super.c. diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index ffd55a5ab..974700636 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -66,6 +66,11 @@ func (*Filesystem) AllowUserMount() bool { return false } +// AllowUserList allows this filesystem to be listed in /proc/filesystems. +func (*Filesystem) AllowUserList() bool { + return true +} + // Flags returns that there is nothing special about this file system. func (*Filesystem) Flags() fs.FilesystemFlags { return 0 diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 16c25e46c..343202400 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -89,6 +89,11 @@ func (ofs *overlayFilesystem) AllowUserMount() bool { return false } +// AllowUserList implements Filesystem.AllowUserList. +func (*overlayFilesystem) AllowUserList() bool { + return true +} + // Mount implements Filesystem.Mount. func (ofs *overlayFilesystem) Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) { panic("overlayFilesystem.Mount should not be called!") diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index aa2c4db10..37db9cf9c 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -43,6 +43,9 @@ func (*filesystemsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle // Generate the file contents. 
var buf bytes.Buffer for _, sys := range fs.GetFilesystems() { + if !sys.AllowUserList() { + continue + } nodev := "nodev" if sys.Flags()&fs.FilesystemRequiresDev != 0 { nodev = "" diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 072d00beb..3aadd6ac4 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -42,6 +42,11 @@ func (*filesystem) AllowUserMount() bool { return true } +// AllowUserList allows this filesystem to be listed in /proc/filesystems. +func (*filesystem) AllowUserList() bool { + return true +} + // Flags returns that there is nothing special about this file system. // // In Linux, proc returns FS_USERNS_VISIBLE | FS_USERNS_MOUNT, see fs/proc/root.c. diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index f25f648c3..c6d5f7fd8 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -40,6 +40,11 @@ func (*filesystem) AllowUserMount() bool { return true } +// AllowUserList allows this filesystem to be listed in /proc/filesystems. +func (*filesystem) AllowUserList() bool { + return true +} + // Flags returns that there is nothing special about this file system. // // In Linux, sysfs returns FS_USERNS_VISIBLE | FS_USERNS_MOUNT, see fs/sysfs/mount.c. diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 639a19b0d..5bd9ade52 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -67,6 +67,11 @@ func (*Filesystem) AllowUserMount() bool { return true } +// AllowUserList allows this filesystem to be listed in /proc/filesystems. +func (*Filesystem) AllowUserList() bool { + return true +} + // Flags returns that there is nothing special about this file system. // // In Linux, tmpfs returns FS_USERNS_MOUNT, see mm/shmem.c. 
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index f5e7a3162..1ef1a85e3 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -46,6 +46,11 @@ func (*filesystem) AllowUserMount() bool { return false } +// AllowUserList allows this filesystem to be listed in /proc/filesystems. +func (*filesystem) AllowUserList() bool { + return true +} + // Flags returns that there is nothing special about this file system. func (*filesystem) Flags() fs.FilesystemFlags { return 0 -- cgit v1.2.3 From 8f21c0bb2807888d812318def43c2405c9b13f5a Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Mon, 16 Jul 2018 12:19:02 -0700 Subject: Add EventOperations.HostFD() This method allows an eventfd inside the Sentry to be registered with the host kernel. Update comment about memory mapping host fds via CachingInodeOperations. PiperOrigin-RevId: 204784859 Change-Id: I55823321e2d84c17ae0f7efaabc6b55b852ae257 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/eventfd.go | 22 +++++++ pkg/sentry/fs/fsutil/inode_cached.go | 3 +- pkg/sentry/kernel/eventfd/BUILD | 2 + pkg/sentry/kernel/eventfd/eventfd.go | 119 +++++++++++++++++++++++++++++++++-- 5 files changed, 139 insertions(+), 8 deletions(-) create mode 100644 pkg/abi/linux/eventfd.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index e164945cf..ae7e4378c 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -30,6 +30,7 @@ go_library( "dev.go", "elf.go", "errors.go", + "eventfd.go", "exec.go", "fcntl.go", "file.go", diff --git a/pkg/abi/linux/eventfd.go b/pkg/abi/linux/eventfd.go new file mode 100644 index 000000000..bc0fb44d2 --- /dev/null +++ b/pkg/abi/linux/eventfd.go @@ -0,0 +1,22 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Constants for eventfd2(2). +const ( + EFD_SEMAPHORE = 0x1 + EFD_CLOEXEC = O_CLOEXEC + EFD_NONBLOCK = O_NONBLOCK +) diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 7c0f96ac2..cba642a8f 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -44,8 +44,7 @@ import ( // // CachingInodeOperations implements Mappable for the CachedFileObject: // -// - If CachedFileObject.FD returns a value >= 0 and the current platform shares -// a host fd table with the sentry, then the value of CachedFileObject.FD +// - If CachedFileObject.FD returns a value >= 0 then the file descriptor // will be memory mapped on the host. 
// // - Otherwise, the contents of CachedFileObject are buffered into memory diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 2d5a3c693..561ced852 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -21,6 +21,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -30,6 +31,7 @@ go_library( "//pkg/state", "//pkg/syserror", "//pkg/waiter", + "//pkg/waiter/fdnotifier", ], ) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index c9333719e..bd50bd9fe 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -21,6 +21,7 @@ import ( "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" @@ -28,10 +29,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) // EventOperations represents an event with the semantics of Linux's file-based event -// notification (eventfd). +// notification (eventfd). Eventfds are usually internal to the Sentry but in certain +// situations they may be converted into a host-backed eventfd. type EventOperations struct { fsutil.NoopRelease `state:"nosave"` fsutil.PipeSeek `state:"nosave"` @@ -46,13 +49,16 @@ type EventOperations struct { // Queue is used to notify interested parties when the event object // becomes readable or writable. - waiter.Queue `state:"nosave"` + wq waiter.Queue `state:"nosave"` // val is the current value of the event counter. val uint64 // semMode specifies whether the event is in "semaphore" mode. 
semMode bool + + // hostfd indicates whether this eventfd is passed through to the host. + hostfd int } // New creates a new event object with the supplied initial value and mode. @@ -62,9 +68,48 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ val: initVal, semMode: semMode, + hostfd: -1, }) } +// HostFD returns the host eventfd associated with this event. +func (e *EventOperations) HostFD() (int, error) { + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + return e.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if e.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, err := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(e.val), uintptr(flags), 0) + if err != 0 { + return -1, err + } + + if err := fdnotifier.AddFD(int32(fd), &e.wq); err != nil { + syscall.Close(int(fd)) + return -1, err + } + + e.hostfd = int(fd) + return e.hostfd, nil +} + +// Release implements fs.FileOperations.Release. +func (e *EventOperations) Release() { + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.RemoveFD(int32(e.hostfd)) + syscall.Close(e.hostfd) + e.hostfd = -1 + } +} + // Read implements fs.FileOperations.Read. func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { if dst.NumBytes() < 8 { @@ -87,9 +132,29 @@ func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOS return 8, nil } +// Must be called with e.mu locked. 
+func (e *EventOperations) hostRead(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + + if _, err := syscall.Read(e.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error { e.mu.Lock() + if e.hostfd >= 0 { + defer e.mu.Unlock() + return e.hostRead(ctx, dst) + } + // We can't complete the read if the value is currently zero. if e.val == 0 { e.mu.Unlock() @@ -112,7 +177,7 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro // Notify writers. We do this even if we were already writable because // it is possible that a writer is waiting to write the maximum value // to the event. - e.Notify(waiter.EventOut) + e.wq.Notify(waiter.EventOut) var buf [8]byte usermem.ByteOrder.PutUint64(buf[:], val) @@ -120,6 +185,17 @@ func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) erro return err } +// Must be called with e.mu locked. +func (e *EventOperations) hostWrite(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(e.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error { var buf [8]byte if _, err := src.CopyIn(ctx, buf[:]); err != nil { @@ -138,6 +214,11 @@ func (e *EventOperations) Signal(val uint64) error { e.mu.Lock() + if e.hostfd >= 0 { + defer e.mu.Unlock() + return e.hostWrite(val) + } + // We only allow writes that won't cause the value to go over the max // uint64 minus 1. if val > math.MaxUint64-1-e.val { @@ -149,16 +230,20 @@ func (e *EventOperations) Signal(val uint64) error { e.mu.Unlock() // Always trigger a notification. 
- e.Notify(waiter.EventIn) + e.wq.Notify(waiter.EventIn) return nil } // Readiness returns the ready events for the event fd. func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask { - ready := waiter.EventMask(0) - e.mu.Lock() + if e.hostfd >= 0 { + defer e.mu.Unlock() + return fdnotifier.NonBlockingPoll(int32(e.hostfd), mask) + } + + ready := waiter.EventMask(0) if e.val > 0 { ready |= waiter.EventIn } @@ -170,3 +255,25 @@ func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask { return mask & ready } + +// EventRegister implements waiter.Waitable.EventRegister. +func (e *EventOperations) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + e.wq.EventRegister(entry, mask) + + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.UpdateFD(int32(e.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (e *EventOperations) EventUnregister(entry *waiter.Entry) { + e.wq.EventUnregister(entry) + + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.UpdateFD(int32(e.hostfd)) + } +} -- cgit v1.2.3 From 14d06064d26b1cd9e2ccad08ebe997e704092eb8 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 16 Jul 2018 18:18:06 -0700 Subject: Start allocation and reclaim scans only where they may find a match If usageSet is heavily fragmented, findUnallocatedRange and findReclaimable can spend excessive cycles linearly scanning the set for unallocated/free pages. Improve common cases by beginning the scan only at the first page that could possibly contain an unallocated/free page. This metadata only guarantees that there is no lower unallocated/free page, but a scan may still be required (especially for multi-page allocations). That said, this heuristic can still provide significant performance improvements for certain applications. 
PiperOrigin-RevId: 204841833 Change-Id: Ic41ad33bf9537ecd673a6f5852ab353bf63ea1e6 --- pkg/sentry/platform/filemem/filemem.go | 65 +++++++++++++++-- pkg/sentry/platform/filemem/filemem_test.go | 106 ++++++++++++++++++++-------- 2 files changed, 134 insertions(+), 37 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index 45ef98eb0..6c8b95578 100644 --- a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -105,6 +105,12 @@ type FileMem struct { usageSwapped uint64 usageLast time.Time + // minUnallocatedPage is the minimum page that may be unallocated. + // i.e., there are no unallocated pages below minUnallocatedPage. + // + // minUnallocatedPage is protected by mu. + minUnallocatedPage uint64 + // fileSize is the size of the backing memory file in bytes. fileSize is // always a power-of-two multiple of chunkSize. // @@ -119,6 +125,12 @@ type FileMem struct { // is protected by mu. reclaimable bool + // minReclaimablePage is the minimum page that may be reclaimable. + // i.e., all reclaimable pages are >= minReclaimablePage. + // + // minReclaimablePage is protected by mu. + minReclaimablePage uint64 + // reclaimCond is signaled (with mu locked) when reclaimable or destroyed // transitions from false to true. reclaimCond sync.Cond @@ -162,6 +174,9 @@ const ( chunkMask = chunkSize - 1 initialSize = chunkSize + + // maxPage is the highest 64-bit page. + maxPage = math.MaxUint64 &^ (usermem.PageSize - 1) ) // newFromFile creates a FileMem backed by the given file. @@ -172,6 +187,9 @@ func newFromFile(file *os.File) (*FileMem, error) { f := &FileMem{ fileSize: initialSize, file: file, + // No pages are reclaimable. DecRef will always be able to + // decrease minReclaimablePage from this point. 
+ minReclaimablePage: maxPage, } f.reclaimCond.L = &f.mu f.mappings.Store(make([]uintptr, initialSize/chunkSize)) @@ -242,7 +260,7 @@ func (f *FileMem) Allocate(length uint64, kind usage.MemoryKind) (platform.FileR alignment = usermem.HugePageSize } - start := findUnallocatedRange(&f.usage, length, alignment) + start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment) end := start + length // File offsets are int64s. Since length must be strictly positive, end // cannot legitimately be 0. @@ -281,17 +299,36 @@ func (f *FileMem) Allocate(length uint64, kind usage.MemoryKind) (platform.FileR }) { panic(fmt.Sprintf("allocating %v: failed to insert into f.usage:\n%v", fr, &f.usage)) } + + if minUnallocatedPage < start { + f.minUnallocatedPage = minUnallocatedPage + } else { + // start was the first unallocated page. The next must be + // somewhere beyond end. + f.minUnallocatedPage = end + } + return fr, nil } -func findUnallocatedRange(usage *usageSet, length, alignment uint64) uint64 { +// findUnallocatedRange returns the first unallocated page in usage of the +// specified length and alignment beginning at page start and the first single +// unallocated page. +func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) { + // Only searched until the first page is found. + firstPage := start + foundFirstPage := false alignMask := alignment - 1 - var start uint64 - for seg := usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() { r := seg.Range() + + if !foundFirstPage && r.Start > firstPage { + foundFirstPage = true + } + if start >= r.End { // start was rounded up to an alignment boundary from the end - // of a previous segment. + // of a previous segment and is now beyond r.End. 
continue } // This segment represents allocated or reclaimable pages; only the @@ -301,8 +338,11 @@ func findUnallocatedRange(usage *usageSet, length, alignment uint64) uint64 { break } start = (r.End + alignMask) &^ alignMask + if !foundFirstPage { + firstPage = r.End + } } - return start + return start, firstPage } // fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. @@ -418,12 +458,15 @@ func (f *FileMem) findReclaimable() (platform.FileRange, bool) { // Allocate returns the first usable range in offset order and is // currently a linear scan, so reclaiming from the beginning of the // file minimizes the expected latency of Allocate. - for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { if seg.ValuePtr().refs == 0 { + f.minReclaimablePage = seg.End() return seg.Range(), true } } f.reclaimable = false + // No pages are reclaimable. + f.minReclaimablePage = maxPage } } @@ -450,6 +493,10 @@ func (f *FileMem) markReclaimed(fr platform.FileRange) { // caller of markReclaimed may not have decommitted it, so we can only mark // fr as reclaimed. f.usage.Remove(f.usage.Isolate(seg, fr)) + if fr.Start < f.minUnallocatedPage { + // We've deallocated at least one lower page. + f.minUnallocatedPage = fr.Start + } } // MapInto implements platform.File.MapInto. @@ -533,6 +580,10 @@ func (f *FileMem) DecRef(fr platform.FileRange) { f.usage.MergeAdjacent(fr) if freed { + if fr.Start < f.minReclaimablePage { + // We've freed at least one lower page. 
+ f.minReclaimablePage = fr.Start + } f.reclaimable = true f.reclaimCond.Signal() } diff --git a/pkg/sentry/platform/filemem/filemem_test.go b/pkg/sentry/platform/filemem/filemem_test.go index 46ffcf116..4b165dc48 100644 --- a/pkg/sentry/platform/filemem/filemem_test.go +++ b/pkg/sentry/platform/filemem/filemem_test.go @@ -27,18 +27,22 @@ const ( func TestFindUnallocatedRange(t *testing.T) { for _, test := range []struct { - desc string - usage *usageSegmentDataSlices - length uint64 - alignment uint64 - start uint64 + desc string + usage *usageSegmentDataSlices + start uint64 + length uint64 + alignment uint64 + unallocated uint64 + minUnallocated uint64 }{ { - desc: "Initial allocation succeeds", - usage: &usageSegmentDataSlices{}, - length: page, - alignment: page, - start: 0, + desc: "Initial allocation succeeds", + usage: &usageSegmentDataSlices{}, + start: 0, + length: page, + alignment: page, + unallocated: 0, + minUnallocated: 0, }, { desc: "Allocation begins at start of file", @@ -47,9 +51,11 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{2 * page}, Values: []usageInfo{{refs: 1}}, }, - length: page, - alignment: page, - start: 0, + start: 0, + length: page, + alignment: page, + unallocated: 0, + minUnallocated: 0, }, { desc: "In-use frames are not allocatable", @@ -58,9 +64,11 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, 2 * page}, Values: []usageInfo{{refs: 1}, {refs: 2}}, }, - length: page, - alignment: page, - start: 2 * page, + start: 0, + length: page, + alignment: page, + unallocated: 2 * page, + minUnallocated: 2 * page, }, { desc: "Reclaimable frames are not allocatable", @@ -69,9 +77,11 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, 2 * page, 3 * page}, Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}}, }, - length: page, - alignment: page, - start: 3 * page, + start: 0, + length: page, + alignment: page, + unallocated: 3 * page, + minUnallocated: 3 * page, }, { desc: "Gaps between 
in-use frames are allocatable", @@ -80,9 +90,11 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, 3 * page}, Values: []usageInfo{{refs: 1}, {refs: 1}}, }, - length: page, - alignment: page, - start: page, + start: 0, + length: page, + alignment: page, + unallocated: page, + minUnallocated: page, }, { desc: "Inadequately-sized gaps are rejected", @@ -91,9 +103,11 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, 3 * page}, Values: []usageInfo{{refs: 1}, {refs: 1}}, }, - length: 2 * page, - alignment: page, - start: 3 * page, + start: 0, + length: 2 * page, + alignment: page, + unallocated: 3 * page, + minUnallocated: page, }, { desc: "Hugepage alignment is honored", @@ -104,9 +118,37 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, hugepage + 2*page}, Values: []usageInfo{{refs: 1}, {refs: 1}}, }, - length: hugepage, - alignment: hugepage, - start: 2 * hugepage, + start: 0, + length: hugepage, + alignment: hugepage, + unallocated: 2 * hugepage, + minUnallocated: page, + }, + { + desc: "Pages before start ignored", + usage: &usageSegmentDataSlices{ + Start: []uint64{page, 3 * page}, + End: []uint64{2 * page, 4 * page}, + Values: []usageInfo{{refs: 1}, {refs: 2}}, + }, + start: page, + length: page, + alignment: page, + unallocated: 2 * page, + minUnallocated: 2 * page, + }, + { + desc: "start may be in the middle of segment", + usage: &usageSegmentDataSlices{ + Start: []uint64{0, 3 * page}, + End: []uint64{2 * page, 4 * page}, + Values: []usageInfo{{refs: 1}, {refs: 2}}, + }, + start: page, + length: page, + alignment: page, + unallocated: 2 * page, + minUnallocated: 2 * page, }, } { t.Run(test.desc, func(t *testing.T) { @@ -114,8 +156,12 @@ func TestFindUnallocatedRange(t *testing.T) { if err := usage.ImportSortedSlices(test.usage); err != nil { t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err) } - if got, want := findUnallocatedRange(&usage, test.length, test.alignment), test.start; got != 
want { - t.Errorf("findUnallocatedRange(%v, %d, %d): got %d, wanted %d", test.usage, test.length, test.alignment, got, want) + unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment) + if unallocated != test.unallocated { + t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated) + } + if minUnallocated != test.minUnallocated { + t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated) } }) } -- cgit v1.2.3 From 29e00c943a61dfcfd4ac8d3f6f526eab641c44a6 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 16 Jul 2018 22:02:03 -0700 Subject: Add CPUID faulting for ptrace and KVM. PiperOrigin-RevId: 204858314 Change-Id: I8252bf8de3232a7a27af51076139b585e73276d4 --- pkg/abi/linux/prctl.go | 9 +++--- pkg/sentry/kernel/task_run.go | 41 ++++++++++++-------------- pkg/sentry/platform/kvm/machine.go | 22 +++++++------- pkg/sentry/platform/kvm/machine_amd64.go | 22 ++++++++++++++ pkg/sentry/platform/platform.go | 7 +++++ pkg/sentry/platform/ptrace/ptrace.go | 15 +++++++--- pkg/sentry/platform/ptrace/subprocess_linux.go | 5 ++++ pkg/sentry/platform/ring0/kernel_amd64.go | 22 +++++++++++++- pkg/sentry/platform/ring0/x86.go | 14 ++++++--- 9 files changed, 111 insertions(+), 46 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go index 6c93601de..074ec03f0 100644 --- a/pkg/abi/linux/prctl.go +++ b/pkg/abi/linux/prctl.go @@ -65,8 +65,9 @@ const ( // From // Flags are used in syscall arch_prctl(2). 
const ( - ARCH_SET_GS = 0x1001 - ARCH_SET_FS = 0x1002 - ARCH_GET_FS = 0x1003 - ARCH_GET_GS = 0x1004 + ARCH_SET_GS = 0x1001 + ARCH_SET_FS = 0x1002 + ARCH_GET_FS = 0x1003 + ARCH_GET_GS = 0x1004 + ARCH_SET_CPUID = 0x1012 ) diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 94ce5582b..a03fa6ac0 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -221,6 +221,24 @@ func (*runApp) execute(t *Task) taskRunState { // loop to figure out why. return (*runApp)(nil) + case platform.ErrContextSignalCPUID: + // Is this a CPUID instruction? + expected := arch.CPUIDInstruction[:] + found := make([]byte, len(expected)) + _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + if err == nil && bytes.Equal(expected, found) { + // Skip the cpuid instruction. + t.Arch().CPUIDEmulate(t) + t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + + // Resume execution. + return (*runApp)(nil) + } + + // The instruction at the given RIP was not a CPUID, and we + // fallthrough to the default signal deliver behavior below. + fallthrough + case platform.ErrContextSignal: // Looks like a signal has been delivered to us. If it's a synchronous // signal (SEGV, SIGBUS, etc.), it should be sent to the application @@ -266,28 +284,7 @@ func (*runApp) execute(t *Task) taskRunState { } switch sig { - case linux.SIGILL: - // N.B. The debug stuff here is arguably - // expensive. Don't fret. This gets called - // about 5 times for a typical application, if - // that. - t.Debugf("SIGILL @ %x", t.Arch().IP()) - - // Is this a CPUID instruction? - expected := arch.CPUIDInstruction[:] - found := make([]byte, len(expected)) - _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) - if err == nil && bytes.Equal(expected, found) { - // Skip the cpuid instruction. - t.Arch().CPUIDEmulate(t) - t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) - break - } - - // Treat it like any other synchronous signal. 
- fallthrough - - case linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: + case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: // Synchronous signal. Send it to ourselves. Assume the signal is // legitimate and force it (work around the signal being ignored or // blocked) like Linux does. Conveniently, this is even the correct diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index abdc51431..68e099d1b 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -141,11 +141,6 @@ func (m *machine) newVCPU() *vCPU { panic(fmt.Sprintf("error setting signal mask: %v", err)) } - // Initialize architecture state. - if err := c.initArchState(); err != nil { - panic(fmt.Sprintf("error initialization vCPU state: %v", err)) - } - // Map the run data. runData, err := mapRunData(int(fd)) if err != nil { @@ -153,6 +148,11 @@ func (m *machine) newVCPU() *vCPU { } c.runData = runData + // Initialize architecture state. + if err := c.initArchState(); err != nil { + panic(fmt.Sprintf("error initialization vCPU state: %v", err)) + } + return c // Done. } @@ -168,12 +168,6 @@ func newMachine(vm int) (*machine, error) { PageTables: pagetables.New(newAllocator()), }) - // Initialize architecture state. - if err := m.initArchState(); err != nil { - m.Destroy() - return nil, err - } - // Apply the physical mappings. Note that these mappings may point to // guest physical addresses that are not actually available. These // physical pages are mapped on demand, see kernel_unsafe.go. @@ -221,6 +215,12 @@ func newMachine(vm int) (*machine, error) { } }) + // Initialize architecture state. + if err := m.initArchState(); err != nil { + m.Destroy() + return nil, err + } + // Ensure the machine is cleaned up properly. 
runtime.SetFinalizer(m, (*machine).Destroy) return m, nil diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 9af4f3f3d..bcd29a947 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -19,6 +19,7 @@ package kvm import ( "fmt" "reflect" + "runtime/debug" "syscall" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" @@ -39,6 +40,21 @@ func (m *machine) initArchState() error { uintptr(reservedMemory-(3*usermem.PageSize))); errno != 0 { return errno } + + // Enable CPUID faulting, if possible. Note that this also serves as a + // basic platform sanity tests, since we will enter guest mode for the + // first time here. The recovery is necessary, since if we fail to read + // the platform info register, we will retry to host mode and + // ultimately need to handle a segmentation fault. + old := debug.SetPanicOnFault(true) + defer func() { + recover() + debug.SetPanicOnFault(old) + }() + m.retryInGuest(func() { + ring0.SetCPUIDFaulting(true) + }) + return nil } @@ -238,6 +254,12 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user Code: arch.SignalInfoKernel, } info.SetAddr(switchOpts.Registers.Rip) // Include address. + if vector == ring0.GeneralProtectionFault { + // When CPUID faulting is enabled, we will generate a #GP(0) when + // userspace executes a CPUID instruction. This is handled above, + // because we need to be able to map and read user memory. + return info, usermem.AccessType{}, platform.ErrContextSignalCPUID + } return info, usermem.AccessType{}, platform.ErrContextSignal case ring0.InvalidOpcode: diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index f2fe163e8..6eb2acbd7 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -154,6 +154,13 @@ var ( // Context was interrupted by a signal. 
ErrContextSignal = fmt.Errorf("interrupted by signal") + // ErrContextSignalCPUID is equivalent to ErrContextSignal, except that + // a check should be done for execution of the CPUID instruction. If + // the current instruction pointer is a CPUID instruction, then this + // should be emulated appropriately. If not, then the given signal + // should be handled per above. + ErrContextSignalCPUID = fmt.Errorf("interrupted by signal, possible CPUID") + // ErrContextInterrupt is returned by Context.Switch() to indicate that the // Context was interrupted by a call to Context.Interrupt(). ErrContextInterrupt = fmt.Errorf("interrupted by platform.Context.Interrupt()") diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 05f8b1d05..a44f549a2 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -101,9 +101,11 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) ( s := as.(*subprocess) isSyscall := s.switchToApp(c, ac) - var faultSP *subprocess - var faultAddr usermem.Addr - var faultIP usermem.Addr + var ( + faultSP *subprocess + faultAddr usermem.Addr + faultIP usermem.Addr + ) if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV { faultSP = s faultAddr = usermem.Addr(c.signalInfo.Addr()) @@ -161,7 +163,12 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) ( lastFaultIP == faultIP { at.Write = true } - return &c.signalInfo, at, platform.ErrContextSignal + + // Unfortunately, we have to unilaterally return ErrContextSignalCPUID + // here, in case this fault was generated by a CPUID exception. There + // is no way to distinguish between CPUID-generated faults and regular + // page faults. + return &c.signalInfo, at, platform.ErrContextSignalCPUID } // Interrupt interrupts the running guest application associated with this context. 
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index b3f2ebb20..b212bbdfe 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -20,6 +20,7 @@ import ( "fmt" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" ) @@ -85,6 +86,10 @@ func createStub() (*thread, error) { syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) } + // Enable cpuid-faulting; this may fail on older kernels or hardware, + // so we just disregard the result. Host CPUID will be enabled. + syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0) + // Call the stub; should not return. stubCall(stubStart, ppid) panic("unreachable") diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 117e86104..0d2b0f7dc 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -163,7 +163,6 @@ func IsCanonical(addr uint64) bool { // the case for amd64, but may not be the case for other architectures. // // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical. - // //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { @@ -237,6 +236,27 @@ func start(c *CPU) { wrmsr(_MSR_CSTAR, kernelFunc(sysenter)) } +// SetCPUIDFaulting sets CPUID faulting per the boolean value. +// +// True is returned if faulting could be set. +// +//go:nosplit +func SetCPUIDFaulting(on bool) bool { + // Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support + // for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR. 
+ if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 { + features := rdmsr(_MSR_MISC_FEATURES) + if on { + features |= _MISC_FEATURE_CPUID_TRAP + } else { + features &^= _MISC_FEATURE_CPUID_TRAP + } + wrmsr(_MSR_MISC_FEATURES, features) + return true // Setting successful. + } + return false +} + // ReadCR2 reads the current CR2 value. // //go:nosplit diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index 3d437a77c..f489fcecb 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -50,10 +50,16 @@ const ( _EFER_LMA = 0x400 _EFER_NX = 0x800 - _MSR_STAR = 0xc0000081 - _MSR_LSTAR = 0xc0000082 - _MSR_CSTAR = 0xc0000083 - _MSR_SYSCALL_MASK = 0xc0000084 + _MSR_STAR = 0xc0000081 + _MSR_LSTAR = 0xc0000082 + _MSR_CSTAR = 0xc0000083 + _MSR_SYSCALL_MASK = 0xc0000084 + _MSR_PLATFORM_INFO = 0xce + _MSR_MISC_FEATURES = 0x140 + + _PLATFORM_INFO_CPUID_FAULT = 1 << 31 + + _MISC_FEATURE_CPUID_TRAP = 0x1 ) const ( -- cgit v1.2.3 From 733ebe7c09404ea2e443e12143edc768a81cd415 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 17 Jul 2018 13:03:03 -0700 Subject: Merge FileMem.usage in IncRef Per the doc, usage must be kept maximally merged. Beyond that, it is simply a good idea to keep fragmentation in usage to a minimum. The glibc malloc allocator allocates one page at a time, potentially causing lots of fragmentation. However, those pages are likely to have the same number of references, often making it possible to merge ranges. PiperOrigin-RevId: 204960339 Change-Id: I03a050cf771c29a4f05b36eaf75b1a09c9465e14 --- pkg/sentry/fs/proc/README.md | 2 ++ pkg/sentry/platform/filemem/filemem.go | 2 ++ 2 files changed, 4 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index cec842403..e1ed88512 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -11,6 +11,7 @@ inconsistency, please file a bug. 
The following files are implemented: + | File /proc/ | Content | | :------------------------ | :---------------------------------------------------- | | [cpuinfo](#cpuinfo) | Info about the CPU | @@ -22,6 +23,7 @@ The following files are implemented: | [uptime](#uptime) | Wall clock since boot, combined idle time of all cpus | | [version](#version) | Kernel version | + ### cpuinfo ```bash diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index 6c8b95578..870274ae1 100644 --- a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -547,6 +547,8 @@ func (f *FileMem) IncRef(fr platform.FileRange) { if gap.Ok() { panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) } + + f.usage.MergeAdjacent(fr) } // DecRef implements platform.File.DecRef. -- cgit v1.2.3 From 63e2820f7bc5b15eacd406ac10b8e83b3bc87fa4 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 18 Jul 2018 11:48:56 -0700 Subject: Fix lock-ordering violation in Create by logging BaseName instead of FullName. Dirent.FullName takes the global renameMu, but can be called during Create, which itself takes dirent.mu and dirent.dirMu, which is a lock-order violation: Dirent.Create d.dirMu.Lock d.mu.Lock Inode.Create gofer.inodeOperations.Create gofer.NewFile Dirent.FullName d.renameMu.RLock We only use the FullName here for logging, and in this case we can get by with logging only the BaseName. A `BaseName` method was added to Dirent, which simply returns the name, taking d.parent.mu as required. In the Create pathway, we can't call d.BaseName() because taking d.parent.mu after d.mu violates the lock order. But we already know the base name of the file we just created, so that's OK. In the Open/GetFile pathway, we are free to call d.BaseName() because the other dirent locks are not held. 
PiperOrigin-RevId: 205112278 Change-Id: Ib45c734081aecc9b225249a65fa8093eb4995f10 --- pkg/sentry/fs/dirent.go | 11 +++++++++++ pkg/sentry/fs/gofer/file.go | 10 ++++++++-- pkg/sentry/fs/gofer/gofer_test.go | 2 ++ pkg/sentry/fs/gofer/inode.go | 6 +++--- pkg/sentry/fs/gofer/path.go | 2 +- 5 files changed, 25 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 410f93b13..5eaa2189a 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -334,6 +334,17 @@ func (d *Dirent) SyncAll(ctx context.Context) { } } +// BaseName returns the base name of the dirent. +func (d *Dirent) BaseName() string { + p := d.parent + if p == nil { + return d.name + } + p.mu.Lock() + defer p.mu.Unlock() + return d.name +} + // FullName returns the fully-qualified name and a boolean value representing // whether this Dirent was a descendant of root. // If the root argument is nil it is assumed to be the root of the Dirent tree. diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 69cee7026..039618808 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -57,7 +57,14 @@ type fileOperations struct { var _ fs.FileOperations = (*fileOperations)(nil) // NewFile returns a file. NewFile is not appropriate with host pipes and sockets. -func NewFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, i *inodeOperations, handles *handles) *fs.File { +// +// The `name` argument is only used to log a warning if we are returning a +// writeable+executable file. (A metric counter is incremented in this case as +// well.) Note that we cannot call d.BaseName() directly in this function, +// because that would lead to a lock order violation, since this is called in +// d.Create which holds d.mu, while d.BaseName() takes d.parent.mu, and the two +// locks must be taken in the opposite order. 
+func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileFlags, i *inodeOperations, handles *handles) *fs.File { // Remote file systems enforce readability/writability at an offset, // see fs/9p/vfs_inode.c:v9fs_vfs_atomic_open -> fs/open.c:finish_open. flags.Pread = true @@ -70,7 +77,6 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, i *inod } if flags.Write { if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Execute: true}); err == nil { - name, _ := dirent.FullName(fs.RootFromContext(ctx)) openedWX.Increment() log.Warningf("Opened a writable executable: %q", name) } diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 58a2e2ef5..3df72dd37 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -545,6 +545,7 @@ func TestPreadv(t *testing.T) { f := NewFile( ctx, fs.NewDirent(rootInode, ""), + "", fs.FileFlags{Read: true}, rootInode.InodeOperations.(*inodeOperations), &handles{File: contextFile{file: openFile}}, @@ -751,6 +752,7 @@ func TestPwritev(t *testing.T) { f := NewFile( ctx, fs.NewDirent(rootInode, ""), + "", fs.FileFlags{Write: true}, rootInode.InodeOperations.(*inodeOperations), &handles{File: contextFile{file: openFile}}, diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index fa9013b75..df584c382 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -391,7 +391,7 @@ func (i *inodeOperations) getFilePipe(ctx context.Context, d *fs.Dirent, flags f if err != nil { return nil, err } - return NewFile(ctx, d, flags, i, h), nil + return NewFile(ctx, d, d.BaseName(), flags, i, h), nil } // errNotHostFile indicates that the file is not a host file. 
@@ -430,7 +430,7 @@ func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flag if err != nil { return nil, err } - return NewFile(ctx, d, flags, i, h), nil + return NewFile(ctx, d, d.BaseName(), flags, i, h), nil } h, ok := i.fileState.getCachedHandles(ctx, flags, d.Inode.MountSource) @@ -443,7 +443,7 @@ func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flag } i.fileState.setHandlesForCachedIO(flags, h) - return NewFile(ctx, d, flags, i, h), nil + return NewFile(ctx, d, d.BaseName(), flags, i, h), nil } // SetPermissions implements fs.InodeOperations.SetPermissions. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index e78172bda..bfeab3833 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -127,7 +127,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string if iops.session().cachePolicy.usePageCache(d.Inode) { iops.fileState.setHandlesForCachedIO(flags, h) } - return NewFile(ctx, d, flags, iops, h), nil + return NewFile(ctx, d, name, flags, iops, h), nil } // CreateLink uses Create to create a symlink between oldname and newname. -- cgit v1.2.3 From a95640b1e9fb8c3751c54c80f6c04f0dff233aed Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Thu, 19 Jul 2018 09:36:34 -0700 Subject: sentry: save stack in proc net dev. PiperOrigin-RevId: 205253858 Change-Id: Iccdc493b66d1b4d39de44afb1184952183b1283f --- pkg/sentry/fs/proc/net.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index ee0c825e8..8cd6fe9d3 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -109,7 +109,7 @@ func (n *ifinet6) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s // netDev implements seqfile.SeqSource for /proc/net/dev. 
type netDev struct { - s inet.Stack `state:"nosave"` // S/R-FIXME + s inet.Stack } // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. -- cgit v1.2.3 From df5a5d388e1fc3349ee70c3476fdffb195fbce9c Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 19 Jul 2018 12:41:00 -0700 Subject: Add AT_UID, AT_EUID, AT_GID, AT_EGID to aux vector. With musl libc when these entries are missing from the aux vector it's forcing libc.secure (effectively AT_SECURE). This mode prevents RPATH and LD_LIBRARY_PATH from working. https://git.musl-libc.org/cgit/musl/tree/ldso/dynlink.c#n1488 As the first entry is a mask of all the aux fields set: https://git.musl-libc.org/cgit/musl/tree/ldso/dynlink.c#n187 PiperOrigin-RevId: 205284684 Change-Id: I04de7bab241043306b4f732306a81d74edfdff26 --- pkg/sentry/loader/BUILD | 1 + pkg/sentry/loader/loader.go | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index b7aebd9ec..01a0ec426 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -46,6 +46,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/mm", diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 1b2e9f183..62b39e52b 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -27,6 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -247,8 +248,14 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r return 0, nil, "", err } - // Add generic auxv entries + c := 
If there is no lower file, however, there will never be a copy-up, so the overlay can delegate directly to the upper file in that case.
PiperOrigin-RevId: 205306743 Change-Id: I92331649aa648340ef6e65411c2b42c12fa69631 --- pkg/sentry/fs/file_overlay.go | 68 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index c27c5946e..36b2cf75e 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -18,6 +18,7 @@ import ( "sync" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -263,6 +264,34 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt o.copyMu.RLock() defer o.copyMu.RUnlock() + // If there is no lower inode, the overlay will never need to do a + // copy-up, and thus will never need to invalidate any mappings. We can + // call ConfigureMMap directly on the upper file. + if o.lower == nil { + f := file.FileOperations.(*overlayFileOperations) + if err := f.upper.ConfigureMMap(ctx, opts); err != nil { + return err + } + + // ConfigureMMap will set the MappableIdentity to the upper + // file and take a reference on it, but we must also hold a + // reference to the overlay file during the lifetime of the + // Mappable. If we do not do this, the overlay file can be + // Released before the upper file is Released, and we will be + // unable to traverse to the upper file during Save, thus + // preventing us from saving a proper inode mapping for the + // file. + file.IncRef() + id := &overlayMappingIdentity{ + id: opts.MappingIdentity, + overlayFile: file, + } + + // Swap out the old MappingIdentity for the wrapped one. 
+ opts.MappingIdentity = id + return nil + } + if !o.isMappableLocked() { return syserror.ENODEV } @@ -343,3 +372,42 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { delete(stubSerializer.Entries, "..") return stubSerializer.Entries, nil } + +// overlayMappingIdentity wraps a MappingIdentity, and also holds a reference +// on a file during its lifetime. +type overlayMappingIdentity struct { + refs.AtomicRefCount + id memmap.MappingIdentity + overlayFile *File +} + +// DecRef implements AtomicRefCount.DecRef. +func (omi *overlayMappingIdentity) DecRef() { + omi.AtomicRefCount.DecRefWithDestructor(func() { + omi.overlayFile.DecRef() + omi.id.DecRef() + }) +} + +// DeviceID implements MappingIdentity.DeviceID using the device id from the +// overlayFile. +func (omi *overlayMappingIdentity) DeviceID() uint64 { + return omi.overlayFile.Dirent.Inode.StableAttr.DeviceID +} + +// DeviceID implements MappingIdentity.InodeID using the inode id from the +// overlayFile. +func (omi *overlayMappingIdentity) InodeID() uint64 { + return omi.overlayFile.Dirent.Inode.StableAttr.InodeID +} + +// MappedName implements MappingIdentity.MappedName. +func (omi *overlayMappingIdentity) MappedName(ctx context.Context) string { + name, _ := omi.overlayFile.Dirent.FullName(RootFromContext(ctx)) + return name +} + +// Msync implements MappingIdentity.Msync. +func (omi *overlayMappingIdentity) Msync(ctx context.Context, mr memmap.MappableRange) error { + return omi.id.Msync(ctx, mr) +} -- cgit v1.2.3 From be431d0934b8d33dcb1909527e0f9ed7eb504b6f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 19 Jul 2018 14:56:42 -0700 Subject: fs: Pass context to Revalidate() function. The current revalidation logic is very simple and does not do much introspection of the dirent being revalidated (other than looking at the type of file). 
Fancier revalidation logic is coming soon, and we need to be able to look at the cached and uncached attributes of a given dirent, and we need a context to perform some of these operations. PiperOrigin-RevId: 205307351 Change-Id: If17ea1c631d8f9489c0e05a263e23d7a8a3bf159 --- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 14 ++++++++------ pkg/sentry/fs/mount_overlay.go | 4 ++-- pkg/sentry/fs/tty/fs.go | 2 +- 6 files changed, 14 insertions(+), 12 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 5eaa2189a..f9bf2fba6 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -488,7 +488,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // // We never allow the file system to revalidate mounts, that could cause them // to unexpectedly drop out before umount. - if cd.mounted || !cd.Inode.MountSource.Revalidate(cd) { + if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, cd) { // Good to go. This is the fast-path. return cd, nil } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 21dc5e08d..b6841526a 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -121,7 +121,7 @@ func (s *session) Destroy() { } // Revalidate returns true if the cache policy is does not allow for VFS caching. -func (s *session) Revalidate(*fs.Dirent) bool { +func (s *session) Revalidate(ctx context.Context, d *fs.Dirent) bool { return s.cachePolicy.revalidateDirent() } diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index b3bfa5268..dc82a2002 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -68,7 +68,7 @@ func NewMockMountSource(cache *DirentCache) *MountSource { } // Revalidate implements fs.MountSourceOperations.Revalidate. 
-func (n *MockMountSourceOps) Revalidate(*Dirent) bool { +func (n *MockMountSourceOps) Revalidate(context.Context, *Dirent) bool { return n.revalidate } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 1d05a36a7..eb1897174 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -21,17 +21,19 @@ import ( "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" ) // DirentOperations provide file systems greater control over how long a Dirent stays pinned // in core. Implementations must not take Dirent.mu. type DirentOperations interface { - // Revalidate returns true if the Dirent is stale and its InodeOperations needs to be reloaded. Revalidate - // will never be called on a Dirent that is mounted. - Revalidate(dirent *Dirent) bool + // Revalidate returns true if the Dirent is stale and its + // InodeOperations needs to be reloaded. Revalidate will never be + // called on a Dirent that is mounted. + Revalidate(ctx context.Context, dirent *Dirent) bool - // Keep returns true if the Dirent should be kept in memory for as long as possible - // beyond any active references. + // Keep returns true if the Dirent should be kept in memory for as long + // as possible beyond any active references. Keep(dirent *Dirent) bool } @@ -263,7 +265,7 @@ type SimpleMountSourceOperations struct { } // Revalidate implements MountSourceOperations.Revalidate. -func (*SimpleMountSourceOperations) Revalidate(*Dirent) bool { +func (*SimpleMountSourceOperations) Revalidate(context.Context, *Dirent) bool { return false } diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 343202400..1be81e3a1 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -34,8 +34,8 @@ func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *M // Revalidate panics if the upper or lower MountSource require that dirent be // revalidated. 
Otherwise always returns false. -func (o *overlayMountSourceOperations) Revalidate(dirent *Dirent) bool { - if o.upper.Revalidate(dirent) || o.lower.Revalidate(dirent) { +func (o *overlayMountSourceOperations) Revalidate(ctx context.Context, dirent *Dirent) bool { + if o.upper.Revalidate(ctx, dirent) || o.lower.Revalidate(ctx, dirent) { panic("an overlay cannot revalidate file objects") } return false diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 1ef1a85e3..dbaffe95e 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -78,7 +78,7 @@ type superOperations struct{} // Slave entries are dropped from dir when their master is closed, so an // existing slave Dirent in the tree is not sufficient to guarantee that it // still exists on the filesystem. -func (superOperations) Revalidate(*fs.Dirent) bool { +func (superOperations) Revalidate(context.Context, *fs.Dirent) bool { return true } -- cgit v1.2.3 From 8b8aad91d581ee5f600f5ec0b7fb407b36d07db1 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 19 Jul 2018 15:48:08 -0700 Subject: kernel: mutations on creds now require a copy. PiperOrigin-RevId: 205315612 Change-Id: I9a0a1e32c8abfb7467a38743b82449cc92830316 --- pkg/sentry/kernel/fasync/fasync.go | 2 +- pkg/sentry/kernel/task.go | 4 +++- pkg/sentry/kernel/task_identity.go | 16 +++++++++++++--- pkg/sentry/syscalls/linux/sys_file.go | 4 ++-- 4 files changed, 19 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 028d6766f..15218fb5a 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -35,7 +35,7 @@ func New() fs.FileAsync { type FileAsync struct { mu sync.Mutex e waiter.Entry - requester auth.Credentials + requester *auth.Credentials // Only one of the following is allowed to be non-nil. 
recipientPG *kernel.ProcessGroup diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 7763050a5..7f6735320 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -334,7 +334,9 @@ type Task struct { // creds is the task's credentials. // - // creds is protected by mu. + // creds is protected by mu, however the value itself is immutable and + // can only be changed by a copy. After reading the pointer, access + // will proceed outside the scope of mu. creds *auth.Credentials // utsns is the task's UTS namespace. diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index a51fa9d7e..b0921b2eb 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -20,11 +20,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" ) -// Credentials returns t's credentials by value. -func (t *Task) Credentials() auth.Credentials { +// Credentials returns t's credentials. +// +// This value must be considered immutable. +func (t *Task) Credentials() *auth.Credentials { t.mu.Lock() defer t.mu.Unlock() - return *t.creds // Copy out with lock held. + return t.creds } // UserNamespace returns the user namespace associated with the task. @@ -162,6 +164,7 @@ func (t *Task) SetRESUID(r, e, s auth.UID) error { func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { root := t.creds.UserNamespace.MapToKUID(auth.RootUID) oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID + t.creds = t.creds.Fork() // See doc for creds. t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS // "1. If one or more of the real, effective or saved set user IDs was @@ -297,6 +300,7 @@ func (t *Task) SetRESGID(r, e, s auth.GID) error { func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { oldE := t.creds.EffectiveKGID + t.creds = t.creds.Fork() // See doc for creds. 
t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS // Not documented, but compare Linux's kernel/cred.c:commit_creds(). @@ -321,6 +325,7 @@ func (t *Task) SetExtraGIDs(gids []auth.GID) error { } kgids[i] = kgid } + t.creds = t.creds.Fork() // See doc for creds. t.creds.ExtraKGIDs = kgids return nil } @@ -352,6 +357,7 @@ func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.Capabili if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 { return syserror.EPERM } + t.creds = t.creds.Fork() // See doc for creds. t.creds.PermittedCaps = permitted t.creds.InheritableCaps = inheritable t.creds.EffectiveCaps = effective @@ -384,6 +390,7 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { return syserror.EPERM } + t.creds = t.creds.Fork() // See doc for creds. t.creds.UserNamespace = ns // "The child process created by clone(2) with the CLONE_NEWUSER flag // starts out with a complete set of capabilities in the new user @@ -407,6 +414,7 @@ func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { func (t *Task) SetKeepCaps(k bool) { t.mu.Lock() defer t.mu.Unlock() + t.creds = t.creds.Fork() // See doc for creds. t.creds.KeepCaps = k } @@ -491,6 +499,8 @@ func (t *Task) updateCredsForExecLocked() { } } + t.creds = t.creds.Fork() // See doc for creds. + // Now we enter poorly-documented, somewhat confusing territory. (The // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds // is not very helpful.) My reading of it is: diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 490649f87..66e6fd9d4 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -415,14 +415,14 @@ func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // accessContext should only be used for access(2). 
type accessContext struct { context.Context - creds auth.Credentials + creds *auth.Credentials } // Value implements context.Context. func (ac accessContext) Value(key interface{}) interface{} { switch key { case auth.CtxCredentials: - return &ac.creds + return ac.creds default: return ac.Context.Value(key) } -- cgit v1.2.3 From 5f134b3c0a08c0e170aa50ad3342df59832b4356 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 20 Jul 2018 12:58:59 -0700 Subject: Format getcwd path PiperOrigin-RevId: 205440332 Change-Id: I2a838f363e079164c83da88e1b0b8769844fe79b --- pkg/sentry/strace/linux64.go | 2 +- pkg/sentry/strace/strace.go | 2 ++ pkg/sentry/strace/syscalls.go | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 90ea8c36f..85e1e1f83 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -96,7 +96,7 @@ var linuxAMD64 = SyscallMap{ 76: makeSyscallInfo("truncate", Path, Hex), 77: makeSyscallInfo("ftruncate", Hex, Hex), 78: makeSyscallInfo("getdents", Hex, Hex, Hex), - 79: makeSyscallInfo("getcwd", Hex, Hex), + 79: makeSyscallInfo("getcwd", PostPath, Hex), 80: makeSyscallInfo("chdir", Path), 81: makeSyscallInfo("fchdir", Hex), 82: makeSyscallInfo("rename", Path, Path), diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 4cd16d2f8..03b4a350a 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -346,6 +346,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint output[arg] = msghdr(t, args[arg].Pointer(), false /* content */, uint64(maximumBlobSize)) case RecvMsgHdr: output[arg] = msghdr(t, args[arg].Pointer(), true /* content */, uint64(maximumBlobSize)) + case PostPath: + output[arg] = path(t, args[arg].Pointer()) case PipeFDs: output[arg] = fdpair(t, args[arg].Pointer()) case Uname: diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go 
index eccee733e..4513d1ba6 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -75,6 +75,10 @@ const ( // Path is a pointer to a char* path. Path + // PostPath is a pointer to a char* path, formatted after syscall + // execution. + PostPath + // ExecveStringVector is a NULL-terminated array of strings. Enforces // the maximum execve array length. ExecveStringVector -- cgit v1.2.3 From d7a34790a0cc3cfdef9d9e54f17c4bc0a6819900 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 23 Jul 2018 13:30:29 -0700 Subject: Add KVM and overlay dimensions to container_test PiperOrigin-RevId: 205714667 Change-Id: I317a2ca98ac3bdad97c4790fcc61b004757d99ef --- pkg/sentry/platform/kvm/kvm.go | 2 +- runsc/container/BUILD | 6 +- runsc/container/container_test.go | 1475 ++++++++++++++++++---------------- runsc/sandbox/sandbox_test.go | 3 +- runsc/test/testutil/BUILD | 1 + runsc/test/testutil/testutil.go | 42 +- runsc/test/testutil/testutil_race.go | 21 + 7 files changed, 831 insertions(+), 719 deletions(-) create mode 100644 runsc/test/testutil/testutil_race.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 3ed057881..2dc3239a5 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -66,7 +66,7 @@ func New() (*KVM, error) { ring0.Init(cpuid.HostFeatureSet()) }) if globalErr != nil { - return nil, err + return nil, globalErr } // Create a new VM fd. 
diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 679d7e097..7ec68f573 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -26,17 +26,21 @@ go_library( go_test( name = "container_test", - size = "small", + size = "medium", srcs = ["container_test.go"], data = [ "//runsc", ], + tags = [ + "requires-kvm", + ], deps = [ "//pkg/abi/linux", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", "//pkg/unet", + "//runsc/boot", "//runsc/container", "//runsc/specutils", "//runsc/test/testutil", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 62a681ac2..34febe038 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -36,6 +36,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/container" "gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/test/testutil" @@ -159,8 +160,8 @@ func readOutputNum(f *os.File, first bool) (int, error) { // run starts the sandbox and waits for it to exit, checking that the // application succeeded. -func run(spec *specs.Spec) error { - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) +func run(spec *specs.Spec, conf *boot.Config) error { + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { return fmt.Errorf("error setting up container: %v", err) } @@ -186,173 +187,207 @@ func run(spec *specs.Spec) error { return nil } -// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. -// It verifies after each step that the container can be loaded from disk, and -// has the correct status. -func TestLifecycle(t *testing.T) { - // The container will just sleep for a long time. We will kill it before - // it finishes sleeping. 
- spec := testutil.NewSpecWithArgs("sleep", "100") +type configOptions int - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) +const ( + overlay configOptions = 1 << iota + kvm +) +const all = overlay | kvm - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } - // Create the container. - id := testutil.UniqueContainerID() - if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { - t.Fatalf("error creating container: %v", err) - } +// configs generates different configurations to run tests. +func configs(opts configOptions) []*boot.Config { + cs := []*boot.Config{testutil.TestConfig()} - // Load the container from disk and check the status. - s, err := container.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := s.Status, container.Created; got != want { - t.Errorf("container status got %v, want %v", got, want) + if opts&overlay != 0 { + c := testutil.TestConfig() + c.Overlay = true + cs = append(cs, c) } - // List should return the container id. - ids, err := container.List(rootDir) - if err != nil { - t.Fatalf("error listing containers: %v", err) - } - if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) { - t.Errorf("container list got %v, want %v", got, want) + // TODO: KVM doesn't work with --race. + if !testutil.RaceEnabled && opts&kvm != 0 { + c := testutil.TestConfig() + c.Platform = boot.PlatformKVM + cs = append(cs, c) } - // Start the container. - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - // Load the container from disk and check the status. 
- s, err = container.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := s.Status, container.Running; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + return cs +} - // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL); err != nil { - t.Error(err) - } +// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. +// It verifies after each step that the container can be loaded from disk, and +// has the correct status. +func TestLifecycle(t *testing.T) { + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) + // The container will just sleep for a long time. We will kill it before + // it finishes sleeping. + spec := testutil.NewSpecWithArgs("sleep", "100") - // Wait on the container. - var wg sync.WaitGroup - wg.Add(1) - ch := make(chan struct{}) - go func() { - ch <- struct{}{} - ws, err := s.Wait() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { - t.Fatalf("error waiting on container: %v", err) + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, } - if got, want := ws.Signal(), syscall.SIGTERM; got != want { - t.Fatalf("got signal %v, want %v", got, want) + // Create the container. + id := testutil.UniqueContainerID() + if _, err := container.Create(id, spec, conf, bundleDir, "", ""); err != nil { + t.Fatalf("error creating container: %v", err) } - wg.Done() - }() - // Wait a bit to ensure that we've started waiting on the container - // before we signal. - <-ch - time.Sleep(100 * time.Millisecond) - // Send the container a SIGTERM which will cause it to stop. 
- if err := s.Signal(syscall.SIGTERM); err != nil { - t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) - } - // Wait for it to die. - wg.Wait() + // Load the container from disk and check the status. + s, err := container.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Created; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - // The sandbox process should have exited by now, but it is a zombie. - // In normal runsc usage, it will be parented to init, and init will - // reap the sandbox. However, in this case the test runner is the - // parent and will not reap the sandbox process, so we must do it - // ourselves. - p, _ := os.FindProcess(s.Sandbox.Pid) - p.Wait() - g, _ := os.FindProcess(s.Sandbox.GoferPid) - g.Wait() - - // Load the container from disk and check the status. - s, err = container.Load(rootDir, id) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := s.Status, container.Stopped; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + // List should return the container id. + ids, err := container.List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) { + t.Errorf("container list got %v, want %v", got, want) + } - // Destroy the container. - if err := s.Destroy(); err != nil { - t.Fatalf("error destroying container: %v", err) - } + // Start the container. + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + // Load the container from disk and check the status. + s, err = container.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - // List should not return the container id. 
- ids, err = container.List(rootDir) - if err != nil { - t.Fatalf("error listing containers: %v", err) - } - if len(ids) != 0 { - t.Errorf("expected container list to be empty, but got %v", ids) - } + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL); err != nil { + t.Error(err) + } - // Loading the container by id should fail. - if _, err = container.Load(rootDir, id); err == nil { - t.Errorf("expected loading destroyed container to fail, but it did not") - } -} + // Wait on the container. + var wg sync.WaitGroup + wg.Add(1) + ch := make(chan struct{}) + go func() { + ch <- struct{}{} + ws, err := s.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if got, want := ws.Signal(), syscall.SIGTERM; got != want { + t.Fatalf("got signal %v, want %v", got, want) + } + wg.Done() + }() -// Test the we can execute the application with different path formats. -func TestExePath(t *testing.T) { - for _, test := range []struct { - path string - success bool - }{ - {path: "true", success: true}, - {path: "bin/true", success: true}, - {path: "/bin/true", success: true}, - {path: "thisfiledoesntexit", success: false}, - {path: "bin/thisfiledoesntexit", success: false}, - {path: "/bin/thisfiledoesntexit", success: false}, - } { - spec := testutil.NewSpecWithArgs(test.path) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + // Wait a bit to ensure that we've started waiting on the container + // before we signal. + <-ch + time.Sleep(100 * time.Millisecond) + // Send the container a SIGTERM which will cause it to stop. + if err := s.Signal(syscall.SIGTERM); err != nil { + t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) + } + // Wait for it to die. + wg.Wait() + + // The sandbox process should have exited by now, but it is a zombie. + // In normal runsc usage, it will be parented to init, and init will + // reap the sandbox. 
However, in this case the test runner is the + // parent and will not reap the sandbox process, so we must do it + // ourselves. + p, _ := os.FindProcess(s.Sandbox.Pid) + p.Wait() + g, _ := os.FindProcess(s.Sandbox.GoferPid) + g.Wait() + + // Load the container from disk and check the status. + s, err = container.Load(rootDir, id) if err != nil { - t.Fatalf("exec: %s, error setting up container: %v", test.path, err) + t.Fatalf("error loading container: %v", err) + } + if got, want := s.Status, container.Stopped; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Destroy the container. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) } - ws, err := container.Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + // List should not return the container id. + ids, err = container.List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if len(ids) != 0 { + t.Errorf("expected container list to be empty, but got %v", ids) + } - os.RemoveAll(rootDir) - os.RemoveAll(bundleDir) + // Loading the container by id should fail. + if _, err = container.Load(rootDir, id); err == nil { + t.Errorf("expected loading destroyed container to fail, but it did not") + } + } +} - if test.success { +// Test the we can execute the application with different path formats. 
+func TestExePath(t *testing.T) { + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) + for _, test := range []struct { + path string + success bool + }{ + {path: "true", success: true}, + {path: "bin/true", success: true}, + {path: "/bin/true", success: true}, + {path: "thisfiledoesntexit", success: false}, + {path: "bin/thisfiledoesntexit", success: false}, + {path: "/bin/thisfiledoesntexit", success: false}, + } { + spec := testutil.NewSpecWithArgs(test.path) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { - t.Errorf("exec: %s, error running container: %v", test.path, err) + t.Fatalf("exec: %s, error setting up container: %v", test.path, err) } - if ws.ExitStatus() != 0 { - t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) - } - } else { - if err == nil { - t.Errorf("exec: %s, got: no error, want: error", test.path) + + ws, err := container.Run(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + + os.RemoveAll(rootDir) + os.RemoveAll(bundleDir) + + if test.success { + if err != nil { + t.Errorf("exec: %s, error running container: %v", test.path, err) + } + if ws.ExitStatus() != 0 { + t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) + } + } else { + if err == nil { + t.Errorf("exec: %s, got: no error, want: error", test.path) + } } } } @@ -362,8 +397,8 @@ func TestExePath(t *testing.T) { func TestAppExitStatus(t *testing.T) { // First container will succeed. 
succSpec := testutil.NewSpecWithArgs("true") - - rootDir, bundleDir, conf, err := testutil.SetupContainer(succSpec) + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -382,7 +417,7 @@ func TestAppExitStatus(t *testing.T) { wantStatus := 123 errSpec := testutil.NewSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) - rootDir2, bundleDir2, conf, err := testutil.SetupContainer(errSpec) + rootDir2, bundleDir2, err := testutil.SetupContainer(errSpec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -400,82 +435,86 @@ func TestAppExitStatus(t *testing.T) { // TestExec verifies that a container can exec a new program. func TestExec(t *testing.T) { - const uid = 343 - spec := testutil.NewSpecWithArgs("sleep", "100") + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "100") - // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // expectedPL lists the expected process state of the container. 
- expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } + // Create and start the container. + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL[:1]); err != nil { - t.Error(err) - } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } - execArgs := control.ExecArgs{ - Filename: "/bin/sleep", - Argv: []string{"sleep", "5"}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, - WorkingDirectory: "/", - KUID: uid, - } + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Error(err) + } - // Verify that "sleep 100" and "sleep 5" are running after exec. - // First, start running exec (whick blocks). - status := make(chan error, 1) - go func() { - exitStatus, err := s.Execute(&execArgs) - if err != nil { - status <- err - } else if exitStatus != 0 { - status <- fmt.Errorf("failed with exit status: %v", exitStatus) - } else { - status <- nil + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, } - }() - if err := waitForProcessList(s, expectedPL); err != nil { - t.Fatal(err) - } + // Verify that "sleep 100" and "sleep 5" are running after exec. + // First, start running exec (whick blocks). 
+ status := make(chan error, 1) + go func() { + exitStatus, err := s.Execute(&execArgs) + if err != nil { + status <- err + } else if exitStatus != 0 { + status <- fmt.Errorf("failed with exit status: %v", exitStatus) + } else { + status <- nil + } + }() - // Ensure that exec finished without error. - select { - case <-time.After(10 * time.Second): - t.Fatalf("container timed out waiting for exec to finish.") - case st := <-status: - if st != nil { - t.Errorf("container failed to exec %v: %v", execArgs, err) + if err := waitForProcessList(s, expectedPL); err != nil { + t.Fatal(err) + } + + // Ensure that exec finished without error. + select { + case <-time.After(10 * time.Second): + t.Fatalf("container timed out waiting for exec to finish.") + case st := <-status: + if st != nil { + t.Errorf("container failed to exec %v: %v", execArgs, err) + } } } } @@ -486,129 +525,136 @@ func TestExec(t *testing.T) { // new containers and the first number printed from these containers is checked. Both should // be the next consecutive number after the last number from the checkpointed container. func TestCheckpointRestore(t *testing.T) { - outputPath := filepath.Join(os.TempDir(), "output") - // Make sure it does not already exist. - os.Remove(outputPath) + // Skip overlay because test requires writing to host file. 
+ for _, conf := range configs(kvm) { + t.Logf("Running test with conf: %+v", conf) - outputFile, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile.Close() + dir, err := ioutil.TempDir("", "checkpoint-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + if err := os.Chmod(dir, 0777); err != nil { + t.Fatalf("error chmoding file: %q, %v", dir, err) + } - outputFileSandbox := strings.Replace(outputPath, os.TempDir(), "/tmp2", -1) + outputPath := filepath.Join(dir, "output") + outputFile, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() - script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %s; sleep 1; done", outputFileSandbox) - spec := testutil.NewSpecWithArgs("bash", "-c", script) - spec.Mounts = append(spec.Mounts, specs.Mount{ - Type: "bind", - Destination: "/tmp2", - Source: os.TempDir(), - }) + script := "for ((i=0; ;i++)); do echo $i >> /tmp2/output; sleep 1; done" + spec := testutil.NewSpecWithArgs("bash", "-c", script) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: "/tmp2", + Source: dir, + }) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // Create and start the container. 
- cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Set the image path, which is where the checkpoint image will be saved. - imagePath := filepath.Join(os.TempDir(), "test-image-file") + // Set the image path, which is where the checkpoint image will be saved. + imagePath := filepath.Join(dir, "test-image-file") - // Create the image file and open for writing. - file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) - if err != nil { - t.Fatalf("error opening new file at imagePath: %v", err) - } - defer file.Close() + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() - time.Sleep(1 * time.Second) + time.Sleep(1 * time.Second) - // Checkpoint running container; save state into new file. - if err := cont.Checkpoint(file); err != nil { - t.Fatalf("error checkpointing container to empty file: %v", err) - } - defer os.RemoveAll(imagePath) + // Checkpoint running container; save state into new file. 
+ if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + defer os.RemoveAll(imagePath) - lastNum, err := readOutputNum(outputFile, false) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + lastNum, err := readOutputNum(outputFile, false) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Delete and recreate file before restoring. - if err := os.Remove(outputPath); err != nil { - t.Fatalf("error removing file") - } - outputFile2, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile2.Close() + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() - // Restore into a new container. - cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont2.Destroy() + // Restore into a new container. + cont2, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont2.Destroy() - if err := cont2.Restore(spec, conf, imagePath); err != nil { - t.Fatalf("error restoring container: %v", err) - } + if err := cont2.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } - firstNum, err := readOutputNum(outputFile2, true) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + firstNum, err := readOutputNum(outputFile2, true) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Check that lastNum is one less than firstNum and that the container picks up from where it left off. 
- if lastNum+1 != firstNum { - t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) - } + // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + if lastNum+1 != firstNum { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) + } - // Restore into another container! - // Delete and recreate file before restoring. - if err := os.Remove(outputPath); err != nil { - t.Fatalf("error removing file") - } - outputFile3, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile3.Close() + // Restore into another container! + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile3, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile3.Close() - // Restore into a new container. - cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont3.Destroy() + // Restore into a new container. 
+ cont3, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont3.Destroy() - if err := cont3.Restore(spec, conf, imagePath); err != nil { - t.Fatalf("error restoring container: %v", err) - } + if err := cont3.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } - firstNum2, err := readOutputNum(outputFile3, true) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + firstNum2, err := readOutputNum(outputFile3, true) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Check that lastNum is one less than firstNum and that the container picks up from where it left off. - if lastNum+1 != firstNum2 { - t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) + // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + if lastNum+1 != firstNum2 { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) + } } - } // TestPauseResume tests that we can successfully pause and resume a container. @@ -617,102 +663,105 @@ func TestCheckpointRestore(t *testing.T) { // It will then unpause and confirm that both processes are running. Then it will // wait until one sleep completes and check to make sure the other is running. func TestPauseResume(t *testing.T) { - const uid = 343 - spec := testutil.NewSpecWithArgs("sleep", "20") - - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "20") - // Create and start the container. 
- cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - execArgs := control.ExecArgs{ - Filename: "/bin/sleep", - Argv: []string{"sleep", "5"}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, - WorkingDirectory: "/", - KUID: uid, - } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } - // First, start running exec (whick blocks). - go cont.Execute(&execArgs) + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + } - // Verify that "sleep 5" is running. - if err := waitForProcessList(cont, expectedPL); err != nil { - t.Fatal(err) - } + // First, start running exec (whick blocks). + go cont.Execute(&execArgs) - // Pause the running container. 
- if err := cont.Pause(); err != nil { - t.Errorf("error pausing container: %v", err) - } - if got, want := cont.Status, container.Paused; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + // Verify that "sleep 5" is running. + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatal(err) + } - time.Sleep(10 * time.Second) + // Pause the running container. + if err := cont.Pause(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, container.Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - // Verify that the two processes still exist. Sleep 5 is paused so - // it should still be in the process list after 10 seconds. - if err := getAndCheckProcLists(cont, expectedPL); err != nil { - t.Fatal(err) - } + time.Sleep(10 * time.Second) - // Resume the running container. - if err := cont.Resume(); err != nil { - t.Errorf("error pausing container: %v", err) - } - if got, want := cont.Status, container.Running; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + // Verify that the two processes still exist. Sleep 5 is paused so + // it should still be in the process list after 10 seconds. + if err := getAndCheckProcLists(cont, expectedPL); err != nil { + t.Fatal(err) + } - if err := getAndCheckProcLists(cont, expectedPL); err != nil { - t.Fatal(err) - } + // Resume the running container. 
+ if err := cont.Resume(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, container.Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - expectedPL2 := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } + if err := getAndCheckProcLists(cont, expectedPL); err != nil { + t.Fatal(err) + } + + expectedPL2 := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } - // Verify there is only one process left since we waited 10 at most seconds for - // sleep 5 to end. - if err := waitForProcessList(cont, expectedPL2); err != nil { - t.Fatal(err) + // Verify there is only one process left since we waited 10 at most seconds for + // sleep 5 to end. + if err := waitForProcessList(cont, expectedPL2); err != nil { + t.Fatal(err) + } } } @@ -721,8 +770,8 @@ func TestPauseResume(t *testing.T) { // occurs given the correct state. func TestPauseResumeStatus(t *testing.T) { spec := testutil.NewSpecWithArgs("sleep", "20") - - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -780,297 +829,321 @@ func TestPauseResumeStatus(t *testing.T) { func TestCapabilities(t *testing.T) { const uid = 343 const gid = 2401 - spec := testutil.NewSpecWithArgs("sleep", "100") - // We generate files in the host temporary directory. 
- spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: os.TempDir(), - Source: os.TempDir(), - Type: "bind", - }) + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + spec := testutil.NewSpecWithArgs("sleep", "100") - // Create and start the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // We generate files in the host temporary directory. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: os.TempDir(), + Source: os.TempDir(), + Type: "bind", + }) - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "exe", - }, - } - if err := waitForProcessList(s, expectedPL[:1]); err != nil { - t.Fatalf("Failed to wait for sleep to start, err: %v", err) - } + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // Create an executable that can't be run with the specified UID:GID. - // This shouldn't be callable within the container until we add the - // CAP_DAC_OVERRIDE capability to skip the access check. - exePath := filepath.Join(rootDir, "exe") - if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { - t.Fatalf("couldn't create executable: %v", err) - } - defer os.Remove(exePath) + // Create and start the container. 
+ s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Need to traverse the intermediate directory. - os.Chmod(rootDir, 0755) + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "exe", + }, + } + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Fatalf("Failed to wait for sleep to start, err: %v", err) + } - execArgs := control.ExecArgs{ - Filename: exePath, - Argv: []string{exePath}, - Envv: []string{"PATH=" + os.Getenv("PATH")}, - WorkingDirectory: "/", - KUID: uid, - KGID: gid, - Capabilities: &auth.TaskCapabilities{}, - } + // Create an executable that can't be run with the specified UID:GID. + // This shouldn't be callable within the container until we add the + // CAP_DAC_OVERRIDE capability to skip the access check. + exePath := filepath.Join(rootDir, "exe") + if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { + t.Fatalf("couldn't create executable: %v", err) + } + defer os.Remove(exePath) + + // Need to traverse the intermediate directory. + os.Chmod(rootDir, 0755) + + execArgs := control.ExecArgs{ + Filename: exePath, + Argv: []string{exePath}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + KGID: gid, + Capabilities: &auth.TaskCapabilities{}, + } - // "exe" should fail because we don't have the necessary permissions. - if _, err := s.Execute(&execArgs); err == nil { - t.Fatalf("container executed without error, but an error was expected") - } + // "exe" should fail because we don't have the necessary permissions. 
+ if _, err := s.Execute(&execArgs); err == nil { + t.Fatalf("container executed without error, but an error was expected") + } - // Now we run with the capability enabled and should succeed. - execArgs.Capabilities = &auth.TaskCapabilities{ - EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), - } - // "exe" should not fail this time. - if _, err := s.Execute(&execArgs); err != nil { - t.Fatalf("container failed to exec %v: %v", execArgs, err) + // Now we run with the capability enabled and should succeed. + execArgs.Capabilities = &auth.TaskCapabilities{ + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + } + // "exe" should not fail this time. + if _, err := s.Execute(&execArgs); err != nil { + t.Fatalf("container failed to exec %v: %v", execArgs, err) + } } } // Test that an tty FD is sent over the console socket if one is provided. func TestConsoleSocket(t *testing.T) { - spec := testutil.NewSpecWithArgs("true") - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) + spec := testutil.NewSpecWithArgs("true") + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - // Create a named socket and start listening. We use a relative path - // to avoid overflowing the unix path length limit (108 chars). 
- socketPath := filepath.Join(bundleDir, "socket") - cwd, err := os.Getwd() - if err != nil { - t.Fatalf("error getting cwd: %v", err) - } - socketRelPath, err := filepath.Rel(cwd, socketPath) - if err != nil { - t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) - } - if len(socketRelPath) > len(socketPath) { - socketRelPath = socketPath - } - srv, err := unet.BindAndListen(socketRelPath, false) - if err != nil { - t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) - } - defer os.Remove(socketPath) + // Create a named socket and start listening. We use a relative path + // to avoid overflowing the unix path length limit (108 chars). + socketPath := filepath.Join(bundleDir, "socket") + cwd, err := os.Getwd() + if err != nil { + t.Fatalf("error getting cwd: %v", err) + } + socketRelPath, err := filepath.Rel(cwd, socketPath) + if err != nil { + t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) + } + if len(socketRelPath) > len(socketPath) { + socketRelPath = socketPath + } + srv, err := unet.BindAndListen(socketRelPath, false) + if err != nil { + t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) + } + defer os.Remove(socketPath) - // Create the container and pass the socket name. - id := testutil.UniqueContainerID() - s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } + // Create the container and pass the socket name. + id := testutil.UniqueContainerID() + s, err := container.Create(id, spec, conf, bundleDir, socketRelPath, "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } - // Open the othe end of the socket. - sock, err := srv.Accept() - if err != nil { - t.Fatalf("error accepting socket connection: %v", err) - } + // Open the othe end of the socket. 
+ sock, err := srv.Accept() + if err != nil { + t.Fatalf("error accepting socket connection: %v", err) + } - // Allow 3 fds to be received. We only expect 1. - r := sock.Reader(true /* blocking */) - r.EnableFDs(1) + // Allow 3 fds to be received. We only expect 1. + r := sock.Reader(true /* blocking */) + r.EnableFDs(1) - // The socket is closed right after sending the FD, so EOF is - // an allowed error. - b := [][]byte{{}} - if _, err := r.ReadVec(b); err != nil && err != io.EOF { - t.Fatalf("error reading from socket connection: %v", err) - } + // The socket is closed right after sending the FD, so EOF is + // an allowed error. + b := [][]byte{{}} + if _, err := r.ReadVec(b); err != nil && err != io.EOF { + t.Fatalf("error reading from socket connection: %v", err) + } - // We should have gotten a control message. - fds, err := r.ExtractFDs() - if err != nil { - t.Fatalf("error extracting fds from socket connection: %v", err) - } - if len(fds) != 1 { - t.Fatalf("got %d fds from socket, wanted 1", len(fds)) - } + // We should have gotten a control message. + fds, err := r.ExtractFDs() + if err != nil { + t.Fatalf("error extracting fds from socket connection: %v", err) + } + if len(fds) != 1 { + t.Fatalf("got %d fds from socket, wanted 1", len(fds)) + } - // Verify that the fd is a terminal. - if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { - t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) - } + // Verify that the fd is a terminal. + if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { + t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) + } - // Shut it down. - if err := s.Destroy(); err != nil { - t.Fatalf("error destroying container: %v", err) - } + // Shut it down. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } - // Close socket. - if err := srv.Close(); err != nil { - t.Fatalf("error destroying container: %v", err) + // Close socket. 
+ if err := srv.Close(); err != nil { + t.Fatalf("error destroying container: %v", err) + } } } // TestRunNonRoot checks that sandbox can be configured when running as // non-privileged user. func TestRunNonRoot(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/true") - spec.Process.User.UID = 343 - spec.Process.User.GID = 2401 + for _, conf := range configs(kvm) { + t.Logf("Running test with conf: %+v", conf) - // User that container runs as can't list '$TMP/blocked' and would fail to - // mount it. - dir, err := ioutil.TempDir("", "blocked") - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - if err := os.Chmod(dir, 0700); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", dir, err) - } - dir = path.Join(dir, "test") - if err := os.Mkdir(dir, 0755); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", dir, err) - } + spec := testutil.NewSpecWithArgs("/bin/true") + spec.Process.User.UID = 343 + spec.Process.User.GID = 2401 + + // User that container runs as can't list '$TMP/blocked' and would fail to + // mount it. + dir, err := ioutil.TempDir("", "blocked") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + if err := os.Chmod(dir, 0700); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } + dir = path.Join(dir, "test") + if err := os.Mkdir(dir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } - // We generate files in the host temporary directory. - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: dir, - Source: dir, - Type: "bind", - }) + // We generate files in the host temporary directory. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + }) - if err := run(spec); err != nil { - t.Fatalf("error running sadbox: %v", err) + if err := run(spec, conf); err != nil { + t.Fatalf("error running sadbox: %v", err) + } } } // TestMountNewDir checks that runsc will create destination directory if it // doesn't exit. 
func TestMountNewDir(t *testing.T) { - srcDir := path.Join(os.TempDir(), "src", "newdir", "anotherdir") - if err := os.MkdirAll(srcDir, 0755); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) - } + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) - // Attempt to remove dir to ensure it doesn't exist. - mountDir := path.Join(os.TempDir(), "newdir") - if err := os.RemoveAll(mountDir); err != nil { - t.Fatalf("os.RemoveAll(%q) failed: %v", mountDir, err) - } - mountDir = path.Join(mountDir, "anotherdir") + srcDir := path.Join(os.TempDir(), "src", "newdir", "anotherdir") + if err := os.MkdirAll(srcDir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) + } + + // Attempt to remove dir to ensure it doesn't exist. + mountDir := path.Join(os.TempDir(), "newdir") + if err := os.RemoveAll(mountDir); err != nil { + t.Fatalf("os.RemoveAll(%q) failed: %v", mountDir, err) + } + mountDir = path.Join(mountDir, "anotherdir") - spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: mountDir, - Source: srcDir, - Type: "bind", - }) + spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: mountDir, + Source: srcDir, + Type: "bind", + }) - if err := run(spec); err != nil { - t.Fatalf("error running sadbox: %v", err) + if err := run(spec, conf); err != nil { + t.Fatalf("error running sadbox: %v", err) + } } } func TestReadonlyRoot(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") - spec.Root.Readonly = true - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) + + spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") + spec.Root.Readonly = 
true + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - conf.Overlay = true + conf.Overlay = true - // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - ws, err := s.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) + // Create, start and wait for the container. + s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + ws, err := s.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } } } func TestReadonlyMount(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo/file") - dir, err := ioutil.TempDir("", "ro-mount") - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: "/foo", - Source: dir, - Type: "bind", - Options: []string{"ro"}, - }) - spec.Root.Readonly = false + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + spec 
:= testutil.NewSpecWithArgs("/bin/touch", "/foo/file") + dir, err := ioutil.TempDir("", "ro-mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: "/foo", + Source: dir, + Type: "bind", + Options: []string{"ro"}, + }) + spec.Root.Readonly = false + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) - conf.Overlay = true + conf.Overlay = true - // Create, start and wait for the container. - s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer s.Destroy() - if err := s.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - ws, err := s.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) + // Create, start and wait for the container. 
+ s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + ws, err := s.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } } } @@ -1089,7 +1162,8 @@ func TestAbbreviatedIDs(t *testing.T) { } for _, cid := range cids { spec := testutil.NewSpecWithArgs("sleep", "100") - bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } @@ -1134,70 +1208,74 @@ func TestAbbreviatedIDs(t *testing.T) { // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. func TestMultiContainerSanity(t *testing.T) { - containerIDs := []string{ - testutil.UniqueContainerID(), - testutil.UniqueContainerID(), - } - containerAnnotations := []map[string]string{ - // The first container creates a sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, - }, - // The second container creates a container within the first - // container's sandbox. - map[string]string{ - specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, - specutils.ContainerdSandboxIDAnnotation: containerIDs[0], - }, - } - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) + for _, conf := range configs(all) { + t.Logf("Running test with conf: %+v", conf) - // Setup the containers. 
- containers := make([]*container.Container, 0, len(containerIDs)) - for i, annotations := range containerAnnotations { - spec := testutil.NewSpecWithArgs("sleep", "100") - spec.Annotations = annotations - bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) - if err != nil { - t.Fatalf("error setting up container: %v", err) + containerIDs := []string{ + testutil.UniqueContainerID(), + testutil.UniqueContainerID(), } - defer os.RemoveAll(bundleDir) - cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") + containerAnnotations := []map[string]string{ + // The first container creates a sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + }, + // The second container creates a container within the first + // container's sandbox. + map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: containerIDs[0], + }, + } + + rootDir, err := testutil.SetupRootDir() if err != nil { - t.Fatalf("error creating container: %v", err) + t.Fatalf("error creating root dir: %v", err) } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) + defer os.RemoveAll(rootDir) + + // Setup the containers. 
+ containers := make([]*container.Container, 0, len(containerIDs)) + for i, annotations := range containerAnnotations { + spec := testutil.NewSpecWithArgs("sleep", "100") + spec.Annotations = annotations + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) } - containers = append(containers, cont) - } - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - { - UID: 0, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - }, - } + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: 0, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } - // Check via ps that multiple processes are running. - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) + // Check via ps that multiple processes are running. 
+ if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } } } @@ -1238,7 +1316,8 @@ func TestMultiContainerWait(t *testing.T) { for i, annotations := range containerAnnotations { spec := testutil.NewSpecWithArgs(args[i][0], args[i][1]) spec.Annotations = annotations - bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec) + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go index fee2de283..40337bc53 100644 --- a/runsc/sandbox/sandbox_test.go +++ b/runsc/sandbox/sandbox_test.go @@ -31,7 +31,8 @@ func init() { func TestGoferExits(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") - rootDir, bundleDir, conf, err := testutil.SetupContainer(spec) + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 6aec54abe..3ebcc1362 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "docker.go", "testutil.go", + "testutil_race.go", ], importpath = "gvisor.googlesource.com/gvisor/runsc/test/testutil", visibility = [ diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 9d70d29f2..c7cef9c75 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -29,6 +29,9 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) +// RaceEnabled is set to true if it was built with '--race' option. +var RaceEnabled = false + // ConfigureExePath configures the executable for runsc in the test environment. 
func ConfigureExePath() error { @@ -66,6 +69,18 @@ func ConfigureExePath() error { return nil } +// TestConfig return the default configuration to use in tests. +func TestConfig() *boot.Config { + return &boot.Config{ + Debug: true, + LogFormat: "text", + LogPackets: true, + Network: boot.NetworkNone, + Strace: true, + MultiContainer: true, + } +} + // NewSpecWithArgs creates a simple spec with the given args suitable for use // in tests. func NewSpecWithArgs(args ...string) *specs.Spec { @@ -96,38 +111,29 @@ func SetupRootDir() (string, error) { // SetupContainer creates a bundle and root dir for the container, generates a // test config, and writes the spec to config.json in the bundle dir. -func SetupContainer(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { +func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) { rootDir, err = SetupRootDir() if err != nil { - return "", "", nil, err + return "", "", err } - bundleDir, conf, err = SetupContainerInRoot(rootDir, spec) - return rootDir, bundleDir, conf, err + bundleDir, err = SetupContainerInRoot(rootDir, spec, conf) + return rootDir, bundleDir, err } // SetupContainerInRoot creates a bundle for the container, generates a test // config, and writes the spec to config.json in the bundle dir. 
-func SetupContainerInRoot(rootDir string, spec *specs.Spec) (bundleDir string, conf *boot.Config, err error) { +func SetupContainerInRoot(rootDir string, spec *specs.Spec, conf *boot.Config) (bundleDir string, err error) { bundleDir, err = ioutil.TempDir("", "bundle") if err != nil { - return "", nil, fmt.Errorf("error creating bundle dir: %v", err) + return "", fmt.Errorf("error creating bundle dir: %v", err) } if err = writeSpec(bundleDir, spec); err != nil { - return "", nil, fmt.Errorf("error writing spec: %v", err) - } - - conf = &boot.Config{ - Debug: true, - LogFormat: "text", - LogPackets: true, - Network: boot.NetworkNone, - RootDir: rootDir, - Strace: true, - MultiContainer: true, + return "", fmt.Errorf("error writing spec: %v", err) } - return bundleDir, conf, nil + conf.RootDir = rootDir + return bundleDir, nil } // writeSpec writes the spec to disk in the given directory. diff --git a/runsc/test/testutil/testutil_race.go b/runsc/test/testutil/testutil_race.go new file mode 100644 index 000000000..59cfdaa7b --- /dev/null +++ b/runsc/test/testutil/testutil_race.go @@ -0,0 +1,21 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package testutil + +func init() { + RaceEnabled = true +} -- cgit v1.2.3 From 32aa0f5465832c437a9de83c1c1a04b615d68122 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 24 Jul 2018 13:24:51 -0700 Subject: Typo fix. 
PiperOrigin-RevId: 205880843 Change-Id: If2272b25f08a18ebe9b6309a1032dd5cdaa59866 --- pkg/sentry/fs/ramfs/ramfs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index 90b6c9a4f..d6cfaf753 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -36,7 +36,7 @@ var ( // ErrInvalidOp indicates the operation is not valid. ErrInvalidOp = errors.New("invalid operation") - // ErrDenied indicates the operation was denid. + // ErrDenied indicates the operation was denied. ErrDenied = errors.New("operation denied") // ErrNotFound indicates that a node was not found on a walk. -- cgit v1.2.3 From 7cd9405b9cc112ebe352af0e5f13b7b57628001b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 25 Jul 2018 11:05:59 -0700 Subject: Format openat flags PiperOrigin-RevId: 206021774 Change-Id: I447b6c751c28a8d8d4d78468b756b6ad8c61e169 --- pkg/sentry/strace/linux64.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 85e1e1f83..7a1eb581d 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -274,7 +274,7 @@ var linuxAMD64 = SyscallMap{ 254: makeSyscallInfo("inotify_add_watch", Hex, Hex, Hex), 255: makeSyscallInfo("inotify_rm_watch", Hex, Hex), 256: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex), - 257: makeSyscallInfo("openat", Hex, Path, Hex, Mode), + 257: makeSyscallInfo("openat", Hex, Path, OpenFlags, Mode), 258: makeSyscallInfo("mkdirat", Hex, Path, Hex), 259: makeSyscallInfo("mknodat", Hex, Path, Mode, Hex), 260: makeSyscallInfo("fchownat", Hex, Path, Hex, Hex, Hex), -- cgit v1.2.3 From 127c977ab04d56de78c5caf16a8e6446eda340d4 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 26 Jul 2018 15:54:55 -0700 Subject: Don't copy-up extended attributes that specifically configure a lower overlay. 
When copying-up files from a lower fs to an upper, we also copy the extended attributes on the file. If there is a (nested) overlay inside the lower, some of these extended attributes configure the lower overlay, and should not be copied-up to the upper. In particular, whiteout attributes in the lower fs overlay should not be copied-up, since the upper fs may actually contain the file. PiperOrigin-RevId: 206236010 Change-Id: Ia0454ac7b99d0e11383f732a529cb195ed364062 --- pkg/sentry/fs/copy_up.go | 5 +++++ pkg/sentry/fs/overlay.go | 7 +++++++ 2 files changed, 12 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ea74d0efd..8c949b176 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -402,6 +402,11 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error return err } for name := range lowerXattr { + // Don't copy-up attributes that configure an overlay in the + // lower. + if isXattrOverlay(name) { + continue + } value, err := lower.Getxattr(name) if err != nil { return err diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index a63f00e0e..7357d6401 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -16,6 +16,7 @@ package fs import ( "fmt" + "strings" "sync" "gvisor.googlesource.com/gvisor/pkg/log" @@ -76,6 +77,12 @@ func XattrOverlayWhiteout(name string) string { return XattrOverlayWhiteoutPrefix + name } +// isXattrOverlay returns whether the given extended attribute configures the +// overlay. +func isXattrOverlay(name string) bool { + return strings.HasPrefix(name, XattrOverlayPrefix) +} + // NewOverlayRoot produces the root of an overlay. // // Preconditions: -- cgit v1.2.3 From be7fcbc5582fe831b5ec63f773d867d7591e27a1 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Fri, 27 Jul 2018 10:16:27 -0700 Subject: stateify: support explicit annotation mode; convert refs and stack packages. 
We have been unnecessarily creating too many savable types implicitly. PiperOrigin-RevId: 206334201 Change-Id: Idc5a3a14bfb7ee125c4f2bb2b1c53164e46f29a8 --- pkg/abi/BUILD | 3 +- pkg/abi/linux/BUILD | 3 +- pkg/amutex/BUILD | 2 +- pkg/atomicbitops/BUILD | 2 +- pkg/binary/BUILD | 2 +- pkg/bits/BUILD | 2 +- pkg/bpf/BUILD | 3 +- pkg/compressio/BUILD | 2 +- pkg/control/client/BUILD | 2 +- pkg/control/server/BUILD | 2 +- pkg/cpuid/BUILD | 3 +- pkg/dhcp/BUILD | 2 +- pkg/eventchannel/BUILD | 2 +- pkg/fd/BUILD | 2 +- pkg/gate/BUILD | 2 +- pkg/hashio/BUILD | 2 +- pkg/ilist/BUILD | 3 +- pkg/linewriter/BUILD | 2 +- pkg/log/BUILD | 2 +- pkg/metric/BUILD | 2 +- pkg/p9/BUILD | 2 +- pkg/p9/p9test/BUILD | 2 +- pkg/rand/BUILD | 2 +- pkg/refs/BUILD | 20 +------- pkg/refs/refcounter.go | 4 ++ pkg/refs/refcounter_state.go | 1 + pkg/seccomp/BUILD | 3 +- pkg/secio/BUILD | 2 +- pkg/segment/test/BUILD | 2 +- pkg/sentry/arch/BUILD | 3 +- pkg/sentry/context/BUILD | 2 +- pkg/sentry/context/contexttest/BUILD | 3 +- pkg/sentry/control/BUILD | 2 +- pkg/sentry/device/BUILD | 2 +- pkg/sentry/fs/BUILD | 3 +- pkg/sentry/fs/anon/BUILD | 2 +- pkg/sentry/fs/ashmem/BUILD | 3 +- pkg/sentry/fs/binder/BUILD | 3 +- pkg/sentry/fs/dev/BUILD | 3 +- pkg/sentry/fs/fdpipe/BUILD | 3 +- pkg/sentry/fs/filetest/BUILD | 3 +- pkg/sentry/fs/fsutil/BUILD | 3 +- pkg/sentry/fs/gofer/BUILD | 3 +- pkg/sentry/fs/host/BUILD | 3 +- pkg/sentry/fs/lock/BUILD | 3 +- pkg/sentry/fs/proc/BUILD | 3 +- pkg/sentry/fs/proc/device/BUILD | 2 +- pkg/sentry/fs/proc/seqfile/BUILD | 3 +- pkg/sentry/fs/ramfs/BUILD | 3 +- pkg/sentry/fs/ramfs/test/BUILD | 3 +- pkg/sentry/fs/sys/BUILD | 3 +- pkg/sentry/fs/timerfd/BUILD | 3 +- pkg/sentry/fs/tmpfs/BUILD | 3 +- pkg/sentry/fs/tty/BUILD | 3 +- pkg/sentry/hostcpu/BUILD | 2 +- pkg/sentry/inet/BUILD | 4 +- pkg/sentry/kernel/BUILD | 3 +- pkg/sentry/kernel/auth/BUILD | 3 +- pkg/sentry/kernel/epoll/BUILD | 3 +- pkg/sentry/kernel/eventfd/BUILD | 3 +- pkg/sentry/kernel/fasync/BUILD | 2 +- 
pkg/sentry/kernel/futex/BUILD | 3 +- pkg/sentry/kernel/kdefs/BUILD | 2 +- pkg/sentry/kernel/memevent/BUILD | 2 +- pkg/sentry/kernel/pipe/BUILD | 3 +- pkg/sentry/kernel/sched/BUILD | 2 +- pkg/sentry/kernel/semaphore/BUILD | 3 +- pkg/sentry/kernel/shm/BUILD | 3 +- pkg/sentry/kernel/time/BUILD | 3 +- pkg/sentry/limits/BUILD | 3 +- pkg/sentry/loader/BUILD | 4 +- pkg/sentry/memmap/BUILD | 3 +- pkg/sentry/memutil/BUILD | 2 +- pkg/sentry/mm/BUILD | 3 +- pkg/sentry/platform/BUILD | 3 +- pkg/sentry/platform/filemem/BUILD | 3 +- pkg/sentry/platform/interrupt/BUILD | 2 +- pkg/sentry/platform/kvm/BUILD | 2 +- pkg/sentry/platform/kvm/testutil/BUILD | 2 +- pkg/sentry/platform/procid/BUILD | 2 +- pkg/sentry/platform/ptrace/BUILD | 2 +- pkg/sentry/platform/ring0/BUILD | 2 +- pkg/sentry/platform/ring0/pagetables/BUILD | 2 +- pkg/sentry/platform/safecopy/BUILD | 2 +- pkg/sentry/safemem/BUILD | 2 +- pkg/sentry/sighandling/BUILD | 2 +- pkg/sentry/socket/BUILD | 3 +- pkg/sentry/socket/control/BUILD | 3 +- pkg/sentry/socket/epsocket/BUILD | 3 +- pkg/sentry/socket/hostinet/BUILD | 3 +- pkg/sentry/socket/netlink/BUILD | 3 +- pkg/sentry/socket/netlink/port/BUILD | 3 +- pkg/sentry/socket/netlink/route/BUILD | 3 +- pkg/sentry/socket/rpcinet/BUILD | 2 +- pkg/sentry/socket/rpcinet/conn/BUILD | 2 +- pkg/sentry/socket/rpcinet/notifier/BUILD | 2 +- pkg/sentry/socket/unix/BUILD | 3 +- pkg/sentry/state/BUILD | 2 +- pkg/sentry/strace/BUILD | 2 +- pkg/sentry/syscalls/BUILD | 2 +- pkg/sentry/syscalls/linux/BUILD | 3 +- pkg/sentry/time/BUILD | 2 +- pkg/sentry/uniqueid/BUILD | 2 +- pkg/sentry/usage/BUILD | 3 +- pkg/sentry/usermem/BUILD | 3 +- pkg/sentry/watchdog/BUILD | 2 +- pkg/sleep/BUILD | 2 +- pkg/state/BUILD | 2 +- pkg/state/statefile/BUILD | 2 +- pkg/sync/BUILD | 2 +- pkg/sync/seqatomictest/BUILD | 2 +- pkg/syserr/BUILD | 2 +- pkg/syserror/BUILD | 2 +- pkg/tcpip/BUILD | 3 +- pkg/tcpip/adapters/gonet/BUILD | 2 +- pkg/tcpip/buffer/BUILD | 3 +- pkg/tcpip/checker/BUILD | 2 +- pkg/tcpip/header/BUILD | 
3 +- pkg/tcpip/link/channel/BUILD | 2 +- pkg/tcpip/link/fdbased/BUILD | 2 +- pkg/tcpip/link/loopback/BUILD | 2 +- pkg/tcpip/link/rawfile/BUILD | 2 +- pkg/tcpip/link/sharedmem/BUILD | 2 +- pkg/tcpip/link/sharedmem/pipe/BUILD | 2 +- pkg/tcpip/link/sharedmem/queue/BUILD | 2 +- pkg/tcpip/link/sniffer/BUILD | 2 +- pkg/tcpip/link/tun/BUILD | 2 +- pkg/tcpip/link/waitable/BUILD | 2 +- pkg/tcpip/network/BUILD | 2 +- pkg/tcpip/network/arp/BUILD | 2 +- pkg/tcpip/network/fragmentation/BUILD | 3 +- pkg/tcpip/network/hash/BUILD | 2 +- pkg/tcpip/network/ipv4/BUILD | 2 +- pkg/tcpip/network/ipv6/BUILD | 2 +- pkg/tcpip/ports/BUILD | 2 +- pkg/tcpip/seqnum/BUILD | 3 +- pkg/tcpip/stack/BUILD | 15 +----- pkg/tcpip/stack/registration.go | 2 + pkg/tcpip/transport/ping/BUILD | 3 +- pkg/tcpip/transport/queue/BUILD | 3 +- pkg/tcpip/transport/tcp/BUILD | 3 +- pkg/tcpip/transport/tcp/testing/context/BUILD | 2 +- pkg/tcpip/transport/tcpconntrack/BUILD | 2 +- pkg/tcpip/transport/udp/BUILD | 3 +- pkg/tcpip/transport/unix/BUILD | 3 +- pkg/tmutex/BUILD | 2 +- pkg/unet/BUILD | 2 +- pkg/urpc/BUILD | 2 +- pkg/waiter/BUILD | 3 +- pkg/waiter/fdnotifier/BUILD | 2 +- tools/go_stateify/defs.bzl | 58 ++++++++++++++++++----- tools/go_stateify/main.go | 66 +++++++++++++++++++++------ 152 files changed, 255 insertions(+), 267 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index 4d507161f..f1e6bac67 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "abi_state", diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index ae7e4378c..38b4829c9 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -4,8 +4,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") 
-load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "linux_state", diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 442096319..84e6b79a5 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "amutex", diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index f20a9f855..a8dd17825 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "atomicbitops", diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD index 16f08b13f..586d05634 100644 --- a/pkg/binary/BUILD +++ b/pkg/binary/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "binary", diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD index 9897e5dc3..8c943b615 100644 --- a/pkg/bits/BUILD +++ b/pkg/bits/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_library( diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD index d4f12f13a..403270049 100644 --- a/pkg/bpf/BUILD +++ b/pkg/bpf/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") 
go_stateify( name = "bpf_state", diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index 721b2d983..d70f982c1 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "compressio", diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD index 9e1c058e4..d58cd1b71 100644 --- a/pkg/control/client/BUILD +++ b/pkg/control/client/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "client", diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index 2d0fdd8b8..c3f74a532 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "server", diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index a503b7ae8..9a0ca1b33 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "cpuid_state", diff --git a/pkg/dhcp/BUILD b/pkg/dhcp/BUILD index f56969ad8..bd9f592b4 100644 --- a/pkg/dhcp/BUILD +++ b/pkg/dhcp/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "dhcp", diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index ea0c587be..ac2ea869d 100644 --- a/pkg/eventchannel/BUILD +++ 
b/pkg/eventchannel/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 +load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "eventchannel", diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD index e69d83d06..435b6fa34 100644 --- a/pkg/fd/BUILD +++ b/pkg/fd/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "fd", diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index 0b8b01da8..872eff531 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "gate", diff --git a/pkg/hashio/BUILD b/pkg/hashio/BUILD index aaa58b58f..5736e2e73 100644 --- a/pkg/hashio/BUILD +++ b/pkg/hashio/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "hashio", diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index 16a738e89..e32f26ffa 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "list_state", diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index 4a96c6f1d..6c3795432 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "linewriter", diff --git a/pkg/log/BUILD b/pkg/log/BUILD index 2530cfd18..fc9281079 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "log", diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index e3f50d528..c0cd40c7b 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "metric", diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index f348ff2e9..1cf5c6458 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//visibility:public"], diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index 339c86089..d6f428e11 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_test( name = "p9test_test", diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index 2bb59f895..12e6cf25a 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "rand", diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 4b7c9345d..3ea877ccf 100644 --- a/pkg/refs/BUILD +++ 
b/pkg/refs/BUILD @@ -1,32 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") - -go_stateify( - name = "refs_state", - srcs = [ - "refcounter.go", - "refcounter_state.go", - ], - out = "refs_state.go", - package = "refs", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "refs", srcs = [ "refcounter.go", "refcounter_state.go", - "refs_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/refs", visibility = ["//:sandbox"], - deps = [ - "//pkg/ilist", - "//pkg/log", - "//pkg/state", - ], + deps = ["//pkg/ilist"], ) go_test( diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 3162001e1..0d44c2499 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -58,6 +58,8 @@ type WeakRefUser interface { } // WeakRef is a weak reference. +// +// +stateify savable type WeakRef struct { ilist.Entry `state:"nosave"` @@ -177,6 +179,8 @@ func (w *WeakRef) zap() { // // N.B. To allow the zero-object to be initialized, the count is offset by // 1, that is, when refCount is n, there are really n+1 references. 
+// +// +stateify savable type AtomicRefCount struct { // refCount is composed of two fields: // diff --git a/pkg/refs/refcounter_state.go b/pkg/refs/refcounter_state.go index 1be67f951..093eae785 100644 --- a/pkg/refs/refcounter_state.go +++ b/pkg/refs/refcounter_state.go @@ -14,6 +14,7 @@ package refs +// +stateify savable type savedReference struct { obj interface{} } diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index cadd24505..b3e2f0b38 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -1,6 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data") go_binary( name = "victim", diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD index 9a28d2c1f..0ed38c64a 100644 --- a/pkg/secio/BUILD +++ b/pkg/secio/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "secio", diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD index 9d398d71a..bdf53e24e 100644 --- a/pkg/segment/test/BUILD +++ b/pkg/segment/test/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//visibility:private"], diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index a88f57ac7..0a2a35400 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "arch_state", 
diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD index ff39f94ba..2a7a6df23 100644 --- a/pkg/sentry/context/BUILD +++ b/pkg/sentry/context/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "context", diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 5977344de..591b11a4d 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "contexttest_state", diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 6169891f7..fbdde0721 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "control", diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 1a8b461ba..69c99b0b3 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "device", diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 9b7264753..e3c9a9b70 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") 
+load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "fs_state", diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index 6b18aee47..ff4ab850a 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "anon", diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index e20e22a0f..9f166799a 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_stateify( diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index 15f91699f..ec3928baf 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "binder_state", diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index d33a19c2f..ea41615fd 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "dev_state", diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 9e1f65d3e..4fcb06f1f 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ 
-1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "pipe_state", diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index 51a390d77..f481c57fb 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "filetest_state", diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 4fa6395f7..6eea64298 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "fsutil_state", diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index e6f659c53..1277379e7 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "gofer_state", diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 97b64daed..23ec66f50 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") 
-load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "host_state", diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index c15dde800..2607d7ed3 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "lock_state", diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 21b5fc0c3..870df47b2 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "proc_state", diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD index b62062bd7..34582f275 100644 --- a/pkg/sentry/fs/proc/device/BUILD +++ b/pkg/sentry/fs/proc/device/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "device", diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 48dd25e5b..c84f7e20d 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "seqfile_state", 
diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 663a1aeb9..d84f2c624 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "ramfs_state", diff --git a/pkg/sentry/fs/ramfs/test/BUILD b/pkg/sentry/fs/ramfs/test/BUILD index 074b0f5ad..57fee45e2 100644 --- a/pkg/sentry/fs/ramfs/test/BUILD +++ b/pkg/sentry/fs/ramfs/test/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "test_state", diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 0ae2cbac8..095ff1f25 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "sys_state", diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index 7fddc29f4..8b1b7872e 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "timerfd_state", diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index be4e695d3..473ab4296 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 
2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "tmpfs_state", diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index fce327dfe..363897b2c 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "tty_state", diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD index 9457618d8..f362d15c8 100644 --- a/pkg/sentry/hostcpu/BUILD +++ b/pkg/sentry/hostcpu/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "hostcpu", diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 1150ced57..eaf8f15b2 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -1,11 +1,9 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") - package( default_visibility = ["//:sandbox"], licenses = ["notice"], # Apache 2.0 ) -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "inet_state", diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 07568b47c..c4a7dacb2 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") 
go_stateify( name = "kernel_state", diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 7f0680b88..5b7b30557 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "auth_state", diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 04651d961..7d491efbc 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "epoll_autogen_state", diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 561ced852..7ec179bd8 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "eventfd_state", diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 8d06e1182..17749c0de 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "fasync", diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index de9897c58..a97a43549 100644 --- 
a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_template_instance( name = "waiter_list", diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD index b6c00042a..fe6fa2260 100644 --- a/pkg/sentry/kernel/kdefs/BUILD +++ b/pkg/sentry/kernel/kdefs/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "kdefs", diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index c7779e1d5..66899910c 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 +load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "memevent", diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index ca9825f9d..4600d19bd 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "pipe_state", diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index b533c51c4..125792f39 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "sched", diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 1656ad126..969145fe1 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_template_instance( name = "waiter_list", diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 182cc1c76..0f88eb0ac 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "shm_state", diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 84f31b2dc..b3ed42aa4 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "time_state", diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 06c3e72b0..3ce41cacc 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", 
"go_stateify", "go_test") go_stateify( name = "limits_state", diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 01a0ec426..e63052c6d 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_embed_data", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_embed_data( name = "vdso_bin", diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index 7525fea45..2e367e189 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "memmap_state", diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD index a387a0c9f..341b30b98 100644 --- a/pkg/sentry/memutil/BUILD +++ b/pkg/sentry/memutil/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "memutil", diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 258389bb2..3f396986a 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "mm_state", diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 
d5be81f8d..15a7fbbc3 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "platform_state", diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD index 3c4d5b0b6..dadba1d38 100644 --- a/pkg/sentry/platform/filemem/BUILD +++ b/pkg/sentry/platform/filemem/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "filemem_autogen_state", diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index 33dde2a31..35121321a 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "interrupt", diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 673393fad..4ef9e20d7 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index 8533a8d89..e779e3893 100644 --- 
a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "testutil", diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/sentry/platform/procid/BUILD index 5db4f6261..ba68d48f4 100644 --- a/pkg/sentry/platform/procid/BUILD +++ b/pkg/sentry/platform/procid/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "procid", diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 16b0b3c69..ceee895dc 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "ptrace", diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 2df232a64..2485eb2eb 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_template( diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 023e298a0..7a86e2234 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", 
"go_template_instance") go_template( diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 8b9f29403..7dcf6e561 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "safecopy", diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD index dc4cfce41..e96509ce1 100644 --- a/pkg/sentry/safemem/BUILD +++ b/pkg/sentry/safemem/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "safemem", diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index daaad7c90..f480f0735 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sentry/sighandling/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "sighandling", diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 5500a676e..929787aa0 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "socket_state", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 25de2f655..faf2b4c27 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") 
+load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "control_state", diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 8430886cb..7ad5e88c5 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "epsocket_state", diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 60ec265ba..227ca3926 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "hostinet_state", diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 9df3ab17c..b23a243f7 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "netlink_state", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 7340b95c9..ba6f686e4 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "port_state", diff 
--git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index ff3f7b7a4..726469fc9 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "route_state", diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 8973453f9..288199779 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 +load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "rpcinet", diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD index 4923dee4b..c51ca14b1 100644 --- a/pkg/sentry/socket/rpcinet/conn/BUILD +++ b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # BSD -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "conn", diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD index 6f3b06a05..2ae902b3f 100644 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # BSD -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "notifier", diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 1ec6eb7ed..7d04d6b6b 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 
-load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "unix_state", diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 9bd98f445..a57a8298e 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "state", diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index c5946a564..e1c8db67a 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 +load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "strace", diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index d667b42c8..22a757095 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 +load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "syscalls", diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index d3f3cc459..574621ad2 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "linux_state", diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index cbcd699d5..9452787fb 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD 
@@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index c8ab03c3d..8eba3609e 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "uniqueid", diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index a0fe0aa07..edee44d96 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "usage_state", diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index 36c0760dd..9dd1cd2b5 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "usermem_state", diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD index 28fae4490..13bc33eb1 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "watchdog", diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD index 
f2b69b225..05e4ca540 100644 --- a/pkg/sleep/BUILD +++ b/pkg/sleep/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "sleep", diff --git a/pkg/state/BUILD b/pkg/state/BUILD index bb6415d9b..012b0484e 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD index df2c6a578..16abe1930 100644 --- a/pkg/state/statefile/BUILD +++ b/pkg/state/statefile/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "statefile", diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 1fc0c25b5..3959fea36 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//:sandbox"], diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD index 9d6ee2dfb..07b4f85ab 100644 --- a/pkg/sync/seqatomictest/BUILD +++ b/pkg/sync/seqatomictest/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index e5ce48412..c0850f3d9 100644 --- a/pkg/syserr/BUILD +++ 
b/pkg/syserr/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "syserr", diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD index 68ddec786..e050c2043 100644 --- a/pkg/syserror/BUILD +++ b/pkg/syserror/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "syserror", diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index 186a0d3bf..391d801d0 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "tcpip_state", diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index 97da46776..bf618831a 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "gonet", diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index 08adf18cd..efeb6a448 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "buffer_state", diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD index 5447cfbf4..e8a524918 100644 --- a/pkg/tcpip/checker/BUILD +++ b/pkg/tcpip/checker/BUILD @@ -1,6 
+1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "checker", diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index 859c2a106..3aa2cfb24 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "tcp_header_state", diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index f2f0c8b6f..9a6f49c45 100644 --- a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "channel", diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index aca3b14ca..6e75e9f47 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "fdbased", diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD index 9714e93db..cc4247ffd 100644 --- a/pkg/tcpip/link/loopback/BUILD +++ b/pkg/tcpip/link/loopback/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "loopback", diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD index 4b30c7c1c..10b35a37e 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -1,6 +1,6 @@ package(licenses = 
["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "rawfile", diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index 1bd79a3f4..5390257c5 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "sharedmem", diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index e6c658071..ff798ae6f 100644 --- a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "pipe", diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD index 80cedade1..c4a7879c4 100644 --- a/pkg/tcpip/link/sharedmem/queue/BUILD +++ b/pkg/tcpip/link/sharedmem/queue/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "queue", diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index d14f150d1..1e844f949 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "sniffer", diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index 21da7d57e..a8bb03661 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # 
Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "tun", diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index 3b513383a..7582df32e 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "waitable", diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index 963857f51..9a26b46c4 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_stateify:defs.bzl", "go_test") go_test( name = "ip_test", diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index 689f66d6e..44f2b66e5 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "arp", diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index a173f87fb..ac97ebe43 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "fragmentation_state", diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD index e1b5f26c4..1c22c52fc 100644 --- a/pkg/tcpip/network/hash/BUILD +++ 
b/pkg/tcpip/network/hash/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "hash", diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index ae42b662f..19314e9bd 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "ipv4", diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index d008ac7fb..1c3eccae0 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "ipv6", diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index 710c283f7..3c3374275 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "ports", diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD index 6d28dbc3f..a75869dac 100644 --- a/pkg/tcpip/seqnum/BUILD +++ b/pkg/tcpip/seqnum/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "seqnum_state", diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 6d201d0a2..5e7355135 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -1,17 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") - -go_stateify( - name = "stack_state", - srcs = [ - "registration.go", - "stack.go", - ], - out = "stack_state.go", - package = "stack", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "stack", @@ -22,7 +11,6 @@ go_library( "route.go", "stack.go", "stack_global_state.go", - "stack_state.go", "transport_demuxer.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/stack", @@ -32,7 +20,6 @@ go_library( deps = [ "//pkg/ilist", "//pkg/sleep", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index e9550a062..c66f925a8 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -31,6 +31,8 @@ type NetworkEndpointID struct { } // TransportEndpointID is the identifier of a transport layer protocol endpoint. +// +// +stateify savable type TransportEndpointID struct { // LocalPort is the local port associated with the endpoint. 
LocalPort uint16 diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD index 1febbf7f5..28e3e1700 100644 --- a/pkg/tcpip/transport/ping/BUILD +++ b/pkg/tcpip/transport/ping/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "ping_state", diff --git a/pkg/tcpip/transport/queue/BUILD b/pkg/tcpip/transport/queue/BUILD index 7e8ee1f66..fb878ad36 100644 --- a/pkg/tcpip/transport/queue/BUILD +++ b/pkg/tcpip/transport/queue/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "queue_state", diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 53623787d..6a7153e4d 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "tcp_state", diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD index 3caa38bcb..7a95594ef 100644 --- a/pkg/tcpip/transport/tcp/testing/context/BUILD +++ b/pkg/tcpip/transport/tcp/testing/context/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "context", diff --git 
a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD index 3d748528e..46da3e6f1 100644 --- a/pkg/tcpip/transport/tcpconntrack/BUILD +++ b/pkg/tcpip/transport/tcpconntrack/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tcpconntrack", diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 4f7a47973..790dd55a3 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -1,8 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "udp_state", diff --git a/pkg/tcpip/transport/unix/BUILD b/pkg/tcpip/transport/unix/BUILD index d58f06544..676f2cf92 100644 --- a/pkg/tcpip/transport/unix/BUILD +++ b/pkg/tcpip/transport/unix/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_stateify( name = "unix_state", diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index d9a2c5ae5..d18338fff 100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tmutex", diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index e8e40315a..acdfd7cb6 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", 
"go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "unet", diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD index b29b25637..d32c57d1a 100644 --- a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "urpc", diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 032ec3237..8256acdb4 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_stateify( name = "waiter_state", diff --git a/pkg/waiter/fdnotifier/BUILD b/pkg/waiter/fdnotifier/BUILD index d5b5ee82d..4e582755d 100644 --- a/pkg/waiter/fdnotifier/BUILD +++ b/pkg/waiter/fdnotifier/BUILD @@ -1,6 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "fdnotifier", diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl index 60a9895ff..2b2582b7a 100644 --- a/tools/go_stateify/defs.bzl +++ b/tools/go_stateify/defs.bzl @@ -22,6 +22,8 @@ go_library( ) """ +load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library", _go_test = "go_test") + def _go_stateify_impl(ctx): """Implementation for the stateify tool.""" output = ctx.outputs.out @@ -33,6 +35,8 @@ def _go_stateify_impl(ctx): args += ["-statepkg=%s" % ctx.attr._statepkg] if ctx.attr.imports: args += ["-imports=%s" % ",".join(ctx.attr.imports)] + if ctx.attr.explicit: + args += ["-explicit=true"] args += ["--"] for src in ctx.attr.srcs: args += [f.path for f in src.files] @@ -45,17 +49,15 @@ def _go_stateify_impl(ctx): executable = 
ctx.executable._tool, ) -""" -Generates save and restore logic from a set of Go files. - - -Args: - name: the name of the rule. - srcs: the input source files. These files should include all structs in the package that need to be saved. - imports: an optional list of extra non-aliased, Go-style absolute import paths. - out: the name of the generated file output. This must not conflict with any other files and must be added to the srcs of the relevant go_library. - package: the package name for the input sources. -""" +# Generates save and restore logic from a set of Go files. +# +# Args: +# name: the name of the rule. +# srcs: the input source files. These files should include all structs in the package that need to be saved. +# imports: an optional list of extra non-aliased, Go-style absolute import paths. +# out: the name of the generated file output. This must not conflict with any other files and must be added to the srcs of the relevant go_library. +# package: the package name for the input sources. +# explicit: only generate for types explicitly annotated as savable. go_stateify = rule( implementation = _go_stateify_impl, attrs = { @@ -63,7 +65,41 @@ go_stateify = rule( "imports": attr.string_list(mandatory = False), "package": attr.string(mandatory = True), "out": attr.output(mandatory = True), + "explicit": attr.bool(default = False), "_tool": attr.label(executable = True, cfg = "host", default = Label("//tools/go_stateify:stateify")), "_statepkg": attr.string(default = "gvisor.googlesource.com/gvisor/pkg/state"), }, ) + +def go_library(name, srcs, deps = [], imports = [], **kwargs): + """wraps the standard go_library and does stateification.""" + if "encode_unsafe.go" not in srcs and (name + "_state_autogen.go") not in srcs: + # Only do stateification for non-state packages without manual autogen. 
+ go_stateify( + name = name + "_state_autogen", + srcs = [src for src in srcs if src.endswith(".go")], + imports = imports, + package = name, + out = name + "_state_autogen.go", + explicit = True, + ) + all_srcs = srcs + [name + "_state_autogen.go"] + if "//pkg/state" not in deps: + all_deps = deps + ["//pkg/state"] + else: + all_deps = deps + else: + all_deps = deps + all_srcs = srcs + _go_library( + name = name, + srcs = all_srcs, + deps = all_deps, + **kwargs + ) + +def go_test(**kwargs): + """Wraps the standard go_test.""" + _go_test( + **kwargs + ) diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 6c3583c62..231c6d80b 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -25,6 +25,7 @@ import ( "os" "reflect" "strings" + "sync" ) var ( @@ -32,6 +33,7 @@ var ( imports = flag.String("imports", "", "extra imports for the output file") output = flag.String("output", "", "output file") statePkg = flag.String("statepkg", "", "state import package; defaults to empty") + explicit = flag.Bool("explicit", false, "only generate for types explicitly tagged '// +stateify savable'") ) // resolveTypeName returns a qualified type name. @@ -224,16 +226,24 @@ func main() { // Emit the package name. fmt.Fprint(outputFile, "// automatically generated by stateify.\n\n") fmt.Fprintf(outputFile, "package %s\n\n", *pkg) - fmt.Fprint(outputFile, "import (\n") - if *statePkg != "" { - fmt.Fprintf(outputFile, " \"%s\"\n", *statePkg) - } - if *imports != "" { - for _, i := range strings.Split(*imports, ",") { - fmt.Fprintf(outputFile, " \"%s\"\n", i) - } + + // Emit the imports lazily. + var once sync.Once + maybeEmitImports := func() { + once.Do(func() { + // Emit the imports. 
+ fmt.Fprint(outputFile, "import (\n") + if *statePkg != "" { + fmt.Fprintf(outputFile, " \"%s\"\n", *statePkg) + } + if *imports != "" { + for _, i := range strings.Split(*imports, ",") { + fmt.Fprintf(outputFile, " \"%s\"\n", i) + } + } + fmt.Fprint(outputFile, ")\n\n") + }) } - fmt.Fprint(outputFile, ")\n\n") files := make([]*ast.File, 0, len(flag.Args())) @@ -241,7 +251,7 @@ func main() { for _, filename := range flag.Args() { // Parse the file. fset := token.NewFileSet() - f, err := parser.ParseFile(fset, filename, nil, 0) + f, err := parser.ParseFile(fset, filename, nil, parser.ParseComments) if err != nil { // Not a valid input file? fmt.Fprintf(os.Stderr, "Input %q can't be parsed: %v\n", filename, err) @@ -308,6 +318,26 @@ func main() { continue } + if *explicit { + // In explicit mode, only generate code for + // types explicitly marked + // "// +stateify savable" in one of the + // proceeding comment lines. + if d.Doc == nil { + continue + } + savable := false + for _, l := range d.Doc.List { + if l.Text == "// +stateify savable" { + savable = true + break + } + } + if !savable { + continue + } + } + for _, gs := range d.Specs { ts := gs.(*ast.TypeSpec) switch ts.Type.(type) { @@ -315,6 +345,8 @@ func main() { // Don't register. break case *ast.StructType: + maybeEmitImports() + ss := ts.Type.(*ast.StructType) // Define beforeSave if a definition was not found. This @@ -360,6 +392,8 @@ func main() { // Add to our registration. emitRegister(ts.Name.Name) case *ast.Ident, *ast.SelectorExpr, *ast.ArrayType: + maybeEmitImports() + _, val := resolveTypeName(ts.Name.Name, ts.Type) // Dispatch directly. @@ -377,10 +411,12 @@ func main() { } } - // Emit the init() function. - fmt.Fprintf(outputFile, "func init() {\n") - for _, ic := range initCalls { - fmt.Fprintf(outputFile, " %s\n", ic) + if len(initCalls) > 0 { + // Emit the init() function. 
+ fmt.Fprintf(outputFile, "func init() {\n") + for _, ic := range initCalls { + fmt.Fprintf(outputFile, " %s\n", ic) + } + fmt.Fprintf(outputFile, "}\n") } - fmt.Fprintf(outputFile, "}\n") } -- cgit v1.2.3 From 2793f7ac5f96b474decfff68cfde86bb5c2ed0a4 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Fri, 27 Jul 2018 12:26:42 -0700 Subject: Added the O_LARGEFILE flag. This flag will always be true for gVisor files. PiperOrigin-RevId: 206355963 Change-Id: I2f03d2412e2609042df43b06d1318cba674574d0 --- pkg/abi/linux/fcntl.go | 5 +++ pkg/abi/linux/file.go | 16 ++++++++-- pkg/sentry/fs/flags.go | 6 ++++ pkg/sentry/syscalls/linux/flags.go | 59 ++++++++++++++++++----------------- pkg/sentry/syscalls/linux/sys_file.go | 2 ++ 5 files changed, 57 insertions(+), 31 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index f5dbe5199..2a5ad6ed7 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -27,3 +27,8 @@ const ( F_SETLKW = 7 F_SETOWN = 8 ) + +// Flags for fcntl. +const ( + FD_CLOEXEC = 00000001 +) diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 44672647b..f2b7e26ca 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -23,9 +23,19 @@ import ( // Constants for open(2). const ( - O_NONBLOCK = 00004000 - O_CLOEXEC = 02000000 - O_PATH = 010000000 + O_ACCMODE = 00000003 + O_RDONLY = 00000000 + O_WRONLY = 00000001 + O_RDWR = 00000002 + O_APPEND = 00002000 + O_NONBLOCK = 00004000 + O_ASYNC = 00020000 + O_DIRECT = 00040000 + O_LARGEFILE = 00100000 + O_DIRECTORY = 00200000 + O_CLOEXEC = 02000000 + O_SYNC = 04010000 + O_PATH = 010000000 ) // Constants for fstatat(2). diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index 7a8eefd02..810452584 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -45,6 +45,12 @@ type FileFlags struct { // Async indicates that this file sends signals on IO events. 
Async bool + + // LargeFile indicates that this file should be opened even if it has + // size greater than linux's off_t. When running in 64-bit mode, + // Linux sets this flag for all files. Since gVisor is only compatible + // with 64-bit Linux, it also sets this flag for all files. + LargeFile bool } // SettableFileFlags is a subset of FileFlags above that can be changed diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index 3d39a20f4..b2e173f3e 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -15,8 +15,7 @@ package linux import ( - "syscall" - + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" ) @@ -24,13 +23,13 @@ import ( // flagsToPermissions returns a Permissions object from Linux flags. // This includes truncate permission if O_TRUNC is set in the mask. func flagsToPermissions(mask uint) (p fs.PermMask) { - switch mask & syscall.O_ACCMODE { - case syscall.O_WRONLY: + switch mask & linux.O_ACCMODE { + case linux.O_WRONLY: p.Write = true - case syscall.O_RDWR: + case linux.O_RDWR: p.Write = true p.Read = true - case syscall.O_RDONLY: + case linux.O_RDONLY: p.Read = true } return @@ -39,7 +38,7 @@ func flagsToPermissions(mask uint) (p fs.PermMask) { // fdFlagsToLinux converts a kernel.FDFlags object to a Linux representation. func fdFlagsToLinux(flags kernel.FDFlags) (mask uint) { if flags.CloseOnExec { - mask |= syscall.FD_CLOEXEC + mask |= linux.FD_CLOEXEC } return } @@ -47,30 +46,33 @@ func fdFlagsToLinux(flags kernel.FDFlags) (mask uint) { // flagsToLinux converts a FileFlags object to a Linux representation. 
func flagsToLinux(flags fs.FileFlags) (mask uint) { if flags.Direct { - mask |= syscall.O_DIRECT + mask |= linux.O_DIRECT } if flags.NonBlocking { - mask |= syscall.O_NONBLOCK + mask |= linux.O_NONBLOCK } if flags.Sync { - mask |= syscall.O_SYNC + mask |= linux.O_SYNC } if flags.Append { - mask |= syscall.O_APPEND + mask |= linux.O_APPEND } if flags.Directory { - mask |= syscall.O_DIRECTORY + mask |= linux.O_DIRECTORY } if flags.Async { - mask |= syscall.O_ASYNC + mask |= linux.O_ASYNC + } + if flags.LargeFile { + mask |= linux.O_LARGEFILE } switch { case flags.Read && flags.Write: - mask |= syscall.O_RDWR + mask |= linux.O_RDWR case flags.Write: - mask |= syscall.O_WRONLY + mask |= linux.O_WRONLY case flags.Read: - mask |= syscall.O_RDONLY + mask |= linux.O_RDONLY } return } @@ -78,23 +80,24 @@ func flagsToLinux(flags fs.FileFlags) (mask uint) { // linuxToFlags converts linux file flags to a FileFlags object. func linuxToFlags(mask uint) (flags fs.FileFlags) { return fs.FileFlags{ - Direct: mask&syscall.O_DIRECT != 0, - Sync: mask&syscall.O_SYNC != 0, - NonBlocking: mask&syscall.O_NONBLOCK != 0, - Read: (mask & syscall.O_ACCMODE) != syscall.O_WRONLY, - Write: (mask & syscall.O_ACCMODE) != syscall.O_RDONLY, - Append: mask&syscall.O_APPEND != 0, - Directory: mask&syscall.O_DIRECTORY != 0, - Async: mask&syscall.O_ASYNC != 0, + Direct: mask&linux.O_DIRECT != 0, + Sync: mask&linux.O_SYNC != 0, + NonBlocking: mask&linux.O_NONBLOCK != 0, + Read: (mask & linux.O_ACCMODE) != linux.O_WRONLY, + Write: (mask & linux.O_ACCMODE) != linux.O_RDONLY, + Append: mask&linux.O_APPEND != 0, + Directory: mask&linux.O_DIRECTORY != 0, + Async: mask&linux.O_ASYNC != 0, + LargeFile: mask&linux.O_LARGEFILE != 0, } } // linuxToSettableFlags converts linux file flags to a SettableFileFlags object. 
func linuxToSettableFlags(mask uint) fs.SettableFileFlags { return fs.SettableFileFlags{ - Direct: mask&syscall.O_DIRECT != 0, - NonBlocking: mask&syscall.O_NONBLOCK != 0, - Append: mask&syscall.O_APPEND != 0, - Async: mask&syscall.O_ASYNC != 0, + Direct: mask&linux.O_DIRECT != 0, + NonBlocking: mask&linux.O_NONBLOCK != 0, + Append: mask&linux.O_APPEND != 0, + Async: mask&linux.O_ASYNC != 0, } } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 66e6fd9d4..2f28fbea6 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -148,6 +148,8 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u } fileFlags := linuxToFlags(flags) + // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. + fileFlags.LargeFile = true if fs.IsDir(d.Inode.StableAttr) { // Don't allow directories to be opened writable. if fileFlags.Write { -- cgit v1.2.3 From a7a0167716d71895919021692b15bd000f63b24f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 31 Jul 2018 11:18:02 -0700 Subject: proc: show file flags in fdinfo Currently, there is an attempt to print FD flags, but they are not decoded into a number, so we see something like this: /criu # cat /proc/self/fdinfo/0 flags: {%!o(bool=000false)} Actually, fdinfo has to contain file flags. 
Change-Id: Idcbb7db908067447eb9ae6f2c3cfb861f2be1a97 PiperOrigin-RevId: 206794498 --- pkg/sentry/fs/flags.go | 39 +++++++++++++++++++++++++++++++ pkg/sentry/fs/proc/fds.go | 21 +++++++++-------- pkg/sentry/kernel/fd_map.go | 16 +++++++++++++ pkg/sentry/syscalls/linux/flags.go | 43 ----------------------------------- pkg/sentry/syscalls/linux/sys_file.go | 4 ++-- 5 files changed, 69 insertions(+), 54 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index 810452584..da0ff58af 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -14,6 +14,10 @@ package fs +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + // FileFlags encodes file flags. type FileFlags struct { // Direct indicates that I/O should be done directly. @@ -78,3 +82,38 @@ func (f FileFlags) Settable() SettableFileFlags { Async: f.Async, } } + +// ToLinux converts a FileFlags object to a Linux representation. +func (f FileFlags) ToLinux() (mask uint) { + if f.Direct { + mask |= linux.O_DIRECT + } + if f.NonBlocking { + mask |= linux.O_NONBLOCK + } + if f.Sync { + mask |= linux.O_SYNC + } + if f.Append { + mask |= linux.O_APPEND + } + if f.Directory { + mask |= linux.O_DIRECTORY + } + if f.Async { + mask |= linux.O_ASYNC + } + if f.LargeFile { + mask |= linux.O_LARGEFILE + } + + switch { + case f.Read && f.Write: + mask |= linux.O_RDWR + case f.Write: + mask |= linux.O_WRONLY + case f.Read: + mask |= linux.O_RDONLY + } + return +} diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 2eca9ac31..194a9c12a 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -40,16 +40,16 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF } var file *fs.File - var flags kernel.FDFlags + var fdFlags kernel.FDFlags t.WithMuLocked(func(t *kernel.Task) { if fdm := t.FDMap(); fdm != nil { - file, flags = fdm.GetDescriptor(kdefs.FD(n)) + file, fdFlags = fdm.GetDescriptor(kdefs.FD(n)) 
} }) if file == nil { return nil, syserror.ENOENT } - return toInode(file, flags), nil + return toInode(file, fdFlags), nil } // readDescriptors reads fds in the task starting at offset, and calls the @@ -200,17 +200,20 @@ func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset type fdInfo struct { ramfs.File - flags kernel.FDFlags + flags fs.FileFlags + fdFlags kernel.FDFlags } // newFdInfo returns a new fdInfo based on an existing file. -func newFdInfo(t *kernel.Task, _ *fs.File, flags kernel.FDFlags, msrc *fs.MountSource) *fs.Inode { - fdi := &fdInfo{flags: flags} +func newFdInfo(t *kernel.Task, file *fs.File, fdFlags kernel.FDFlags, msrc *fs.MountSource) *fs.Inode { + fdi := &fdInfo{flags: file.Flags(), fdFlags: fdFlags} fdi.InitFile(t, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true}}) // TODO: Get pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt - fdi.Append([]byte(fmt.Sprintf("flags: %08o\n", flags))) + + flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() + fdi.Append([]byte(fmt.Sprintf("flags:\t0%o\n", flags))) return newFile(fdi, msrc, fs.SpecialFile, t) } @@ -241,8 +244,8 @@ func newFdInfoDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { // Lookup loads an fd in /proc/TID/fdinfo into a Dirent. 
func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { - n, err := walkDescriptors(fdid.t, p, func(file *fs.File, flags kernel.FDFlags) *fs.Inode { - return newFdInfo(fdid.t, file, flags, dir.MountSource) + n, err := walkDescriptors(fdid.t, p, func(file *fs.File, fdFlags kernel.FDFlags) *fs.Inode { + return newFdInfo(fdid.t, file, fdFlags, dir.MountSource) }) if err != nil { return nil, err diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index ef73125fd..299506330 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -51,6 +51,22 @@ type FDFlags struct { CloseOnExec bool } +// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags representation. +func (f FDFlags) ToLinuxFileFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.O_CLOEXEC + } + return +} + +// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags representation. +func (f FDFlags) ToLinuxFDFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.FD_CLOEXEC + } + return +} + // descriptor holds the details about a file descriptor, namely a pointer the // file itself and the descriptor flags. type descriptor struct { diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index b2e173f3e..e8db3e0de 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -17,7 +17,6 @@ package linux import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" ) // flagsToPermissions returns a Permissions object from Linux flags. @@ -35,48 +34,6 @@ func flagsToPermissions(mask uint) (p fs.PermMask) { return } -// fdFlagsToLinux converts a kernel.FDFlags object to a Linux representation. 
-func fdFlagsToLinux(flags kernel.FDFlags) (mask uint) { - if flags.CloseOnExec { - mask |= linux.FD_CLOEXEC - } - return -} - -// flagsToLinux converts a FileFlags object to a Linux representation. -func flagsToLinux(flags fs.FileFlags) (mask uint) { - if flags.Direct { - mask |= linux.O_DIRECT - } - if flags.NonBlocking { - mask |= linux.O_NONBLOCK - } - if flags.Sync { - mask |= linux.O_SYNC - } - if flags.Append { - mask |= linux.O_APPEND - } - if flags.Directory { - mask |= linux.O_DIRECTORY - } - if flags.Async { - mask |= linux.O_ASYNC - } - if flags.LargeFile { - mask |= linux.O_LARGEFILE - } - switch { - case flags.Read && flags.Write: - mask |= linux.O_RDWR - case flags.Write: - mask |= linux.O_WRONLY - case flags.Read: - mask |= linux.O_RDONLY - } - return -} - // linuxToFlags converts linux file flags to a FileFlags object. func linuxToFlags(mask uint) (flags fs.FileFlags) { return fs.FileFlags{ diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 2f28fbea6..692f11ed7 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -809,14 +809,14 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return uintptr(fd), nil, nil case linux.F_GETFD: - return uintptr(fdFlagsToLinux(flags)), nil, nil + return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() t.FDMap().SetFlags(fd, kernel.FDFlags{ CloseOnExec: flags&syscall.FD_CLOEXEC != 0, }) case linux.F_GETFL: - return uintptr(flagsToLinux(file.Flags())), nil, nil + return uintptr(file.Flags().ToLinux()), nil, nil case linux.F_SETFL: flags := uint(args[2].Uint()) file.SetFlags(linuxToSettableFlags(flags)) -- cgit v1.2.3 From 6b87378634e1575cf590b7558f19b40b012849c2 Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Wed, 1 Aug 2018 09:43:47 -0700 Subject: New conditional for adding key/value pairs to maps. 
When adding MultiDeviceKeys and their values into MultiDevice maps, make sure the keys and values have not already been added. This ensures that preexisting key/value pairs are not overridden. PiperOrigin-RevId: 206942766 Change-Id: I9d85f38eb59ba59f0305e6614a52690608944981 --- pkg/sentry/device/device.go | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index a5514c72f..21fee8f8a 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -183,6 +183,14 @@ func (m *MultiDevice) Load(key MultiDeviceKey, value uint64) bool { m.rcache = make(map[uint64]MultiDeviceKey) } + if val, exists := m.cache[key]; exists && val != value { + return false + } + if k, exists := m.rcache[value]; exists && k != key { + // Should never happen. + panic("MultiDevice's caches are inconsistent") + } + // Cache value at key. m.cache[key] = value -- cgit v1.2.3 From b9e1cf8404ce1263176643dee1a1cc835c9d1448 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Wed, 1 Aug 2018 15:42:07 -0700 Subject: stateify: convert all packages to use explicit mode. 
PiperOrigin-RevId: 207007153 Change-Id: Ifedf1cc3758dc18be16647a4ece9c840c1c636c9 --- pkg/abi/BUILD | 13 +----- pkg/abi/linux/BUILD | 16 +------ pkg/abi/linux/bpf.go | 2 + pkg/abi/linux/tty.go | 2 + pkg/bpf/BUILD | 17 +------- pkg/bpf/interpreter.go | 2 + pkg/cpuid/BUILD | 15 +------ pkg/cpuid/cpuid.go | 2 + pkg/ilist/BUILD | 15 +------ pkg/ilist/list.go | 4 ++ pkg/segment/range.go | 2 + pkg/segment/set.go | 5 +++ pkg/sentry/arch/BUILD | 18 +------- pkg/sentry/arch/arch.go | 2 + pkg/sentry/arch/arch_amd64.go | 2 + pkg/sentry/arch/arch_state_x86.go | 1 + pkg/sentry/arch/arch_x86.go | 2 + pkg/sentry/arch/auxv.go | 2 + pkg/sentry/arch/signal_amd64.go | 6 +++ pkg/sentry/context/contexttest/BUILD | 17 +------- pkg/sentry/fs/BUILD | 36 +--------------- pkg/sentry/fs/ashmem/BUILD | 17 +------- pkg/sentry/fs/ashmem/area.go | 8 ++-- pkg/sentry/fs/ashmem/device.go | 22 +++++----- pkg/sentry/fs/ashmem/pin_board.go | 2 + pkg/sentry/fs/attr.go | 12 ++++++ pkg/sentry/fs/binder/BUILD | 13 +----- pkg/sentry/fs/binder/binder.go | 26 +++++++----- pkg/sentry/fs/dentry.go | 4 ++ pkg/sentry/fs/dev/BUILD | 20 +-------- pkg/sentry/fs/dev/dev.go | 2 + pkg/sentry/fs/dev/fs.go | 2 + pkg/sentry/fs/dev/full.go | 2 + pkg/sentry/fs/dev/null.go | 3 ++ pkg/sentry/fs/dev/random.go | 1 + pkg/sentry/fs/dirent.go | 2 + pkg/sentry/fs/dirent_cache.go | 2 + pkg/sentry/fs/fdpipe/BUILD | 31 +------------- pkg/sentry/fs/fdpipe/pipe.go | 2 + pkg/sentry/fs/file.go | 2 + pkg/sentry/fs/file_overlay.go | 4 ++ pkg/sentry/fs/filesystems.go | 2 + pkg/sentry/fs/filetest/BUILD | 18 +------- pkg/sentry/fs/flags.go | 2 + pkg/sentry/fs/fsutil/BUILD | 20 +-------- pkg/sentry/fs/fsutil/dirty_set.go | 2 + pkg/sentry/fs/fsutil/handle.go | 2 + pkg/sentry/fs/fsutil/host_file_mapper.go | 2 + pkg/sentry/fs/fsutil/inode.go | 6 +++ pkg/sentry/fs/fsutil/inode_cached.go | 2 + pkg/sentry/fs/gofer/BUILD | 23 +--------- pkg/sentry/fs/gofer/file.go | 2 + pkg/sentry/fs/gofer/fs.go | 2 + pkg/sentry/fs/gofer/inode.go | 4 ++ 
pkg/sentry/fs/gofer/session.go | 3 ++ pkg/sentry/fs/host/BUILD | 27 +----------- pkg/sentry/fs/host/descriptor.go | 2 + pkg/sentry/fs/host/file.go | 2 + pkg/sentry/fs/host/fs.go | 6 ++- pkg/sentry/fs/host/inode.go | 4 ++ pkg/sentry/fs/inode.go | 4 ++ pkg/sentry/fs/inode_inotify.go | 2 + pkg/sentry/fs/inotify.go | 2 + pkg/sentry/fs/inotify_event.go | 2 + pkg/sentry/fs/inotify_watch.go | 2 + pkg/sentry/fs/lock/BUILD | 15 +------ pkg/sentry/fs/lock/lock.go | 6 ++- pkg/sentry/fs/mount.go | 4 ++ pkg/sentry/fs/mount_overlay.go | 4 ++ pkg/sentry/fs/mounts.go | 2 + pkg/sentry/fs/overlay.go | 2 + pkg/sentry/fs/proc/BUILD | 34 +-------------- pkg/sentry/fs/proc/cpuinfo.go | 2 + pkg/sentry/fs/proc/exec_args.go | 2 + pkg/sentry/fs/proc/fds.go | 6 +++ pkg/sentry/fs/proc/file.go | 1 + pkg/sentry/fs/proc/filesystems.go | 2 + pkg/sentry/fs/proc/fs.go | 2 + pkg/sentry/fs/proc/loadavg.go | 2 + pkg/sentry/fs/proc/meminfo.go | 2 + pkg/sentry/fs/proc/mounts.go | 4 ++ pkg/sentry/fs/proc/proc.go | 4 ++ pkg/sentry/fs/proc/seqfile/BUILD | 30 ++----------- pkg/sentry/fs/proc/seqfile/seqfile.go | 4 ++ pkg/sentry/fs/proc/stat.go | 2 + pkg/sentry/fs/proc/sys.go | 5 +++ pkg/sentry/fs/proc/sys_net.go | 2 + pkg/sentry/fs/proc/task.go | 20 +++++++++ pkg/sentry/fs/proc/uid_gid_map.go | 3 ++ pkg/sentry/fs/proc/uptime.go | 2 + pkg/sentry/fs/proc/version.go | 2 + pkg/sentry/fs/ramfs/BUILD | 21 +-------- pkg/sentry/fs/ramfs/dir.go | 2 + pkg/sentry/fs/ramfs/ramfs.go | 2 + pkg/sentry/fs/ramfs/socket.go | 2 + pkg/sentry/fs/ramfs/symlink.go | 2 + pkg/sentry/fs/ramfs/test/BUILD | 18 +------- pkg/sentry/fs/sys/BUILD | 14 +----- pkg/sentry/fs/sys/fs.go | 2 + pkg/sentry/fs/sys/sys.go | 5 ++- pkg/sentry/fs/timerfd/BUILD | 18 +------- pkg/sentry/fs/timerfd/timerfd.go | 4 +- pkg/sentry/fs/tmpfs/BUILD | 17 +------- pkg/sentry/fs/tmpfs/file_regular.go | 2 + pkg/sentry/fs/tmpfs/fs.go | 2 + pkg/sentry/fs/tmpfs/inode_file.go | 2 + pkg/sentry/fs/tmpfs/tmpfs.go | 8 ++++ pkg/sentry/fs/tty/BUILD | 20 +-------- 
pkg/sentry/fs/tty/dir.go | 18 +++++--- pkg/sentry/fs/tty/fs.go | 4 ++ pkg/sentry/fs/tty/inode.go | 2 + pkg/sentry/fs/tty/line_discipline.go | 6 +++ pkg/sentry/fs/tty/master.go | 4 ++ pkg/sentry/fs/tty/queue.go | 4 +- pkg/sentry/fs/tty/slave.go | 4 ++ pkg/sentry/fs/tty/terminal.go | 2 + pkg/sentry/inet/BUILD | 15 +------ pkg/sentry/inet/inet.go | 2 + pkg/sentry/kernel/BUILD | 59 +++----------------------- pkg/sentry/kernel/abstract_socket_namespace.go | 3 ++ pkg/sentry/kernel/auth/BUILD | 17 +------- pkg/sentry/kernel/auth/credentials.go | 2 + pkg/sentry/kernel/auth/id_map.go | 2 + pkg/sentry/kernel/auth/user_namespace.go | 2 + pkg/sentry/kernel/epoll/BUILD | 15 +------ pkg/sentry/kernel/epoll/epoll.go | 8 +++- pkg/sentry/kernel/eventfd/BUILD | 18 +------- pkg/sentry/kernel/eventfd/eventfd.go | 4 +- pkg/sentry/kernel/fd_map.go | 6 +++ pkg/sentry/kernel/fs_context.go | 2 + pkg/sentry/kernel/futex/BUILD | 18 +------- pkg/sentry/kernel/futex/futex.go | 2 + pkg/sentry/kernel/ipc_namespace.go | 2 + pkg/sentry/kernel/kernel.go | 4 +- pkg/sentry/kernel/pending_signals.go | 5 +++ pkg/sentry/kernel/pipe/BUILD | 20 +-------- pkg/sentry/kernel/pipe/buffers.go | 2 + pkg/sentry/kernel/pipe/node.go | 2 + pkg/sentry/kernel/pipe/pipe.go | 2 + pkg/sentry/kernel/pipe/reader.go | 2 + pkg/sentry/kernel/pipe/reader_writer.go | 2 + pkg/sentry/kernel/pipe/writer.go | 2 + pkg/sentry/kernel/ptrace.go | 4 ++ pkg/sentry/kernel/rseq.go | 2 + pkg/sentry/kernel/semaphore/BUILD | 15 +------ pkg/sentry/kernel/semaphore/semaphore.go | 8 ++++ pkg/sentry/kernel/sessions.go | 4 ++ pkg/sentry/kernel/shm/BUILD | 13 +----- pkg/sentry/kernel/shm/shm.go | 4 ++ pkg/sentry/kernel/signal_handlers.go | 2 + pkg/sentry/kernel/syscalls.go | 2 + pkg/sentry/kernel/syslog.go | 2 + pkg/sentry/kernel/task.go | 2 + pkg/sentry/kernel/task_clone.go | 4 ++ pkg/sentry/kernel/task_context.go | 2 + pkg/sentry/kernel/task_exec.go | 4 ++ pkg/sentry/kernel/task_exit.go | 6 +++ pkg/sentry/kernel/task_resources.go | 2 + 
pkg/sentry/kernel/task_run.go | 2 + pkg/sentry/kernel/task_sched.go | 2 + pkg/sentry/kernel/task_signals.go | 5 +++ pkg/sentry/kernel/task_syscall.go | 4 ++ pkg/sentry/kernel/thread_group.go | 2 + pkg/sentry/kernel/threads.go | 8 ++++ pkg/sentry/kernel/time/BUILD | 14 +----- pkg/sentry/kernel/time/time.go | 6 +++ pkg/sentry/kernel/timekeeper.go | 2 + pkg/sentry/kernel/timer.go | 8 ++++ pkg/sentry/kernel/uts_namespace.go | 2 + pkg/sentry/kernel/vdso.go | 2 + pkg/sentry/limits/BUILD | 13 +----- pkg/sentry/limits/limits.go | 4 ++ pkg/sentry/loader/BUILD | 15 +------ pkg/sentry/loader/vdso.go | 2 + pkg/sentry/loader/vdso_state.go | 1 + pkg/sentry/memmap/BUILD | 15 +------ pkg/sentry/memmap/mapping_set.go | 2 + pkg/sentry/mm/BUILD | 21 +-------- pkg/sentry/mm/aio_context.go | 8 ++++ pkg/sentry/mm/mm.go | 7 +++ pkg/sentry/mm/special_mappable.go | 2 + pkg/sentry/platform/BUILD | 13 +----- pkg/sentry/platform/filemem/BUILD | 14 +----- pkg/sentry/platform/filemem/filemem.go | 2 + pkg/sentry/socket/BUILD | 17 +------- pkg/sentry/socket/control/BUILD | 23 +++------- pkg/sentry/socket/control/control.go | 4 ++ pkg/sentry/socket/epsocket/BUILD | 16 +------ pkg/sentry/socket/epsocket/epsocket.go | 2 + pkg/sentry/socket/epsocket/stack.go | 2 + pkg/sentry/socket/hostinet/BUILD | 15 +------ pkg/sentry/socket/netlink/BUILD | 13 +----- pkg/sentry/socket/netlink/port/BUILD | 15 +------ pkg/sentry/socket/netlink/port/port.go | 2 + pkg/sentry/socket/netlink/route/BUILD | 17 +------- pkg/sentry/socket/netlink/route/protocol.go | 2 + pkg/sentry/socket/netlink/socket.go | 2 + pkg/sentry/socket/socket.go | 2 + pkg/sentry/socket/unix/BUILD | 13 +----- pkg/sentry/socket/unix/unix.go | 2 + pkg/sentry/syscalls/linux/BUILD | 20 +-------- pkg/sentry/syscalls/linux/sys_aio.go | 2 + pkg/sentry/syscalls/linux/sys_futex.go | 2 + pkg/sentry/syscalls/linux/sys_poll.go | 2 + pkg/sentry/syscalls/linux/sys_time.go | 2 + pkg/sentry/usage/BUILD | 17 +------- pkg/sentry/usage/cpu.go | 2 + 
pkg/sentry/usage/io.go | 2 + pkg/sentry/usermem/BUILD | 16 +------ pkg/sentry/usermem/access_type.go | 2 + pkg/sentry/usermem/addr.go | 2 + pkg/tcpip/BUILD | 17 +------- pkg/tcpip/buffer/BUILD | 13 +----- pkg/tcpip/buffer/view.go | 2 + pkg/tcpip/header/BUILD | 13 +----- pkg/tcpip/header/tcp.go | 4 ++ pkg/tcpip/network/fragmentation/BUILD | 11 +---- pkg/tcpip/seqnum/BUILD | 17 +------- pkg/tcpip/tcpip.go | 4 ++ pkg/tcpip/transport/ping/BUILD | 17 +------- pkg/tcpip/transport/ping/endpoint.go | 1 + pkg/tcpip/transport/queue/BUILD | 17 +------- pkg/tcpip/transport/queue/queue.go | 2 + pkg/tcpip/transport/tcp/BUILD | 25 +---------- pkg/tcpip/transport/tcp/endpoint.go | 4 ++ pkg/tcpip/transport/tcp/rcv.go | 2 + pkg/tcpip/transport/tcp/reno.go | 2 + pkg/tcpip/transport/tcp/segment.go | 2 + pkg/tcpip/transport/tcp/segment_queue.go | 2 + pkg/tcpip/transport/tcp/snd.go | 4 ++ pkg/tcpip/transport/tcp/snd_state.go | 1 + pkg/tcpip/transport/udp/BUILD | 17 +------- pkg/tcpip/transport/udp/endpoint.go | 3 ++ pkg/tcpip/transport/unix/BUILD | 16 +------ pkg/tcpip/transport/unix/connectioned.go | 2 + pkg/tcpip/transport/unix/connectionless.go | 2 + pkg/tcpip/transport/unix/unix.go | 11 +++++ pkg/waiter/BUILD | 21 ++------- pkg/waiter/waiter.go | 2 + 239 files changed, 662 insertions(+), 1108 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index f1e6bac67..c014d2c4b 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -1,24 +1,13 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "abi_state", - srcs = [ - "abi.go", - ], - out = "abi_state.go", - package = "abi", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "abi", srcs = [ "abi.go", - "abi_state.go", "flag.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/abi", visibility = ["//:sandbox"], - deps = ["//pkg/state"], ) diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 
38b4829c9..ac4ceefbc 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -4,19 +4,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "linux_state", - srcs = [ - "binder.go", - "bpf.go", - "time.go", - "tty.go", - ], - out = "linux_state.go", - package = "linux", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "linux", @@ -41,7 +29,6 @@ go_library( "ipc.go", "limits.go", "linux.go", - "linux_state.go", "mm.go", "netdevice.go", "netlink.go", @@ -67,6 +54,5 @@ go_library( "//pkg/abi", "//pkg/binary", "//pkg/bits", - "//pkg/state", ], ) diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go index f597ef4f5..80e5b1af1 100644 --- a/pkg/abi/linux/bpf.go +++ b/pkg/abi/linux/bpf.go @@ -15,6 +15,8 @@ package linux // BPFInstruction is a raw BPF virtual machine instruction. +// +// +stateify savable type BPFInstruction struct { // OpCode is the operation to execute. OpCode uint16 diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index 84b6ccc87..b640f7627 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -38,6 +38,8 @@ type Termios struct { // KernelTermios is struct ktermios/struct termios2, defined in // uapi/asm-generic/termbits.h. 
+// +// +stateify savable type KernelTermios struct { InputFlags uint32 OutputFlags uint32 diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD index 403270049..564df3af5 100644 --- a/pkg/bpf/BUILD +++ b/pkg/bpf/BUILD @@ -1,21 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "bpf_state", - srcs = [ - "interpreter.go", - ], - out = "bpf_state.go", - package = "bpf", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "bpf", srcs = [ "bpf.go", - "bpf_state.go", "decoder.go", "input_bytes.go", "interpreter.go", @@ -23,10 +13,7 @@ go_library( ], importpath = "gvisor.googlesource.com/gvisor/pkg/bpf", visibility = ["//visibility:public"], - deps = [ - "//pkg/abi/linux", - "//pkg/state", - ], + deps = ["//pkg/abi/linux"], ) go_test( diff --git a/pkg/bpf/interpreter.go b/pkg/bpf/interpreter.go index b7dee86a8..111ada9d1 100644 --- a/pkg/bpf/interpreter.go +++ b/pkg/bpf/interpreter.go @@ -88,6 +88,8 @@ func (e Error) Error() string { } // Program is a BPF program that has been validated for consistency. 
+// +// +stateify savable type Program struct { instructions []linux.BPFInstruction } diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index 9a0ca1b33..46fc4703b 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -1,27 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "cpuid_state", - srcs = ["cpuid.go"], - out = "cpuid_state.go", - package = "cpuid", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "cpuid", srcs = [ "cpu_amd64.s", "cpuid.go", - "cpuid_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/cpuid", visibility = ["//:sandbox"], - deps = [ - "//pkg/log", - "//pkg/state", - ], + deps = ["//pkg/log"], ) go_test( diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index b486ab037..e91e34dc7 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -409,6 +409,8 @@ func (f Feature) flagString(cpuinfoOnly bool) string { } // FeatureSet is a set of Features for a cpu. +// +// +stateify savable type FeatureSet struct { // Set is the set of features that are enabled in this FeatureSet. 
Set map[Feature]bool diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index e32f26ffa..b26a39132 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,28 +1,15 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "list_state", - srcs = [ - "interface_list.go", - ], - out = "interface_list_state.go", - package = "ilist", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "ilist", srcs = [ "interface_list.go", - "interface_list_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/ilist", visibility = ["//visibility:public"], - deps = [ - "//pkg/state", - ], ) go_template_instance( diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index 5efb6c072..a88b82196 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -36,6 +36,8 @@ type Linker interface { // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } +// +// +stateify savable type List struct { head Linker tail Linker @@ -155,6 +157,8 @@ func (l *List) Remove(e Linker) { // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. +// +// +stateify savable type Entry struct { next Linker prev Linker diff --git a/pkg/segment/range.go b/pkg/segment/range.go index 5ff30d489..34c067265 100644 --- a/pkg/segment/range.go +++ b/pkg/segment/range.go @@ -18,6 +18,8 @@ package segment type T uint64 // A Range represents a contiguous range of T. +// +// +stateify savable type Range struct { // Start is the inclusive start of the range. 
Start T diff --git a/pkg/segment/set.go b/pkg/segment/set.go index 6eed1d930..cffec2a2c 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -88,6 +88,8 @@ const ( // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. +// +// +stateify savable type Set struct { root node `state:".(*SegmentDataSlices)"` } @@ -596,6 +598,7 @@ func (s *Set) ApplyContiguous(r Range, fn func(seg Iterator)) GapIterator { } } +// +stateify savable type node struct { // An internal binary tree node looks like: // @@ -1317,6 +1320,8 @@ func (n *node) writeDebugString(buf *bytes.Buffer, prefix string) { // SegmentDataSlices represents segments from a set as slices of start, end, and // values. SegmentDataSlices is primarily used as an intermediate representation // for save/restore and the layout here is optimized for that. +// +// +stateify savable type SegmentDataSlices struct { Start []Key End []Key diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 0a2a35400..314b3e962 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,21 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "arch_state", - srcs = [ - "arch.go", - "arch_amd64.go", - "arch_state_x86.go", - "arch_x86.go", - "auxv.go", - "signal_amd64.go", - ], - out = "arch_state.go", - package = "arch", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "arch", @@ -24,7 +10,6 @@ go_library( "arch.go", "arch_amd64.go", "arch_amd64.s", - "arch_state.go", "arch_state_x86.go", "arch_x86.go", "auxv.go", @@ -46,7 +31,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/limits", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/arch/arch.go 
b/pkg/sentry/arch/arch.go index 0189e958d..21cb84502 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -254,6 +254,8 @@ const ( // MemoryManager. // // Note that "highest address" below is always exclusive. +// +// +stateify savable type MmapLayout struct { // MinAddr is the lowest mappable address. MinAddr usermem.Addr diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 23526fe8e..f1e408af9 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -95,6 +95,8 @@ const ( ) // context64 represents an AMD64 context. +// +// +stateify savable type context64 struct { State sigFPState []x86FPState // fpstate to be restored on sigreturn. diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index cb38d098a..e9c23a06b 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -56,6 +56,7 @@ func (s *State) afterLoad() { copy(s.x86FPState, old) } +// +stateify savable type syscallPtraceRegs struct { R15 uint64 R14 uint64 diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 5cc4f8377..b35eec53c 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -153,6 +153,8 @@ func NewFloatingPointData() *FloatingPointData { // State contains the common architecture bits for X86 (the build tag of this // file ensures it's only built on x86). +// +// +stateify savable type State struct { // The system registers. Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 70e0e35b7..81cfb4a01 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -19,6 +19,8 @@ import ( ) // An AuxEntry represents an entry in an ELF auxiliary vector. 
+// +// +stateify savable type AuxEntry struct { Key uint64 Value usermem.Addr diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index c1d743f38..e81717e8b 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -28,6 +28,8 @@ import ( // SignalAct represents the action that should be taken when a signal is // delivered, and is equivalent to struct sigaction on 64-bit x86. +// +// +stateify savable type SignalAct struct { Handler uint64 Flags uint64 @@ -47,6 +49,8 @@ func (s *SignalAct) DeserializeTo(other *SignalAct) { // SignalStack represents information about a user stack, and is equivalent to // stack_t on 64-bit x86. +// +// +stateify savable type SignalStack struct { Addr uint64 Flags uint32 @@ -66,6 +70,8 @@ func (s *SignalStack) DeserializeTo(other *SignalStack) { // SignalInfo represents information about a signal being delivered, and is // equivalent to struct siginfo on 64-bit x86. +// +// +stateify savable type SignalInfo struct { Signo int32 // Signal number Errno int32 // Errno value diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 591b11a4d..01bb40b04 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,23 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "contexttest_state", - srcs = [ - "contexttest.go", - ], - out = "contexttest_state.go", - package = "contexttest", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "contexttest", testonly = 1, - srcs = [ - "contexttest.go", - "contexttest_state.go", - ], + srcs = ["contexttest.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ @@ -28,6 +16,5 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/platform/ptrace", "//pkg/sentry/uniqueid", - 
"//pkg/state", ], ) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index e3c9a9b70..18cd5ae8e 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,40 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "fs_state", - srcs = [ - "attr.go", - "dentry.go", - "dirent.go", - "dirent_cache.go", - "dirent_list.go", - "dirent_state.go", - "file.go", - "file_overlay.go", - "file_state.go", - "filesystems.go", - "flags.go", - "inode.go", - "inode_inotify.go", - "inode_operations.go", - "inode_overlay.go", - "inotify.go", - "inotify_event.go", - "inotify_watch.go", - "mock.go", - "mount.go", - "mount_overlay.go", - "mount_state.go", - "mounts.go", - "overlay.go", - "path.go", - ], - out = "fs_state.go", - package = "fs", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "fs", @@ -54,7 +21,6 @@ go_library( "filesystems.go", "flags.go", "fs.go", - "fs_state.go", "inode.go", "inode_inotify.go", "inode_operations.go", diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index 9f166799a..dc893d22f 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -1,26 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -go_stateify( - name = "ashmem_state", - srcs = [ - "area.go", - "device.go", - "pin_board.go", - "uint64_range.go", - "uint64_set.go", - ], - out = "ashmem_state.go", - package = "ashmem", -) - go_library( name = "ashmem", srcs = [ "area.go", - "ashmem_state.go", "device.go", "pin_board.go", "uint64_range.go", @@ -41,7 +27,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", 
"//pkg/syserror", "//pkg/tcpip/transport/unix", ], diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index e4f76f0d0..bfd7f2762 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -39,10 +39,12 @@ const ( ) // Area implements fs.FileOperations. +// +// +stateify savable type Area struct { - fsutil.NoFsync - fsutil.DeprecatedFileOperations - fsutil.NotDirReaddir + fsutil.NoFsync `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` ad *Device diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index c5b51d4a7..d0986fa11 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -27,17 +27,19 @@ import ( ) // Device implements fs.InodeOperations. +// +// +stateify savable type Device struct { - fsutil.DeprecatedFileOperations - fsutil.InodeNoExtendedAttributes - fsutil.InodeNotDirectory - fsutil.InodeNotRenameable - fsutil.InodeNotSocket - fsutil.InodeNotSymlink - fsutil.NoFsync - fsutil.NoMappable - fsutil.NoopWriteOut - fsutil.NotDirReaddir + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` mu sync.Mutex `state:"nosave"` unstable fs.UnstableAttr diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go index c7fb3822c..ecba395a0 100644 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ b/pkg/sentry/fs/ashmem/pin_board.go @@ -56,6 +56,8 @@ func (setFunctions) Split(Range, noValue, uint64) (noValue, noValue) { // segment.Set is used for implementation where segments represent // ranges of pinned bytes, while gaps represent ranges 
of unpinned // bytes. All ranges are page-aligned. +// +// +stateify savable type PinBoard struct { Set } diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 56a2ad6f7..4178f18b2 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -91,6 +91,8 @@ func (n InodeType) String() string { // StableAttr contains Inode attributes that will be stable throughout the // lifetime of the Inode. +// +// +stateify savable type StableAttr struct { // Type is the InodeType of a InodeOperations. Type InodeType @@ -150,6 +152,8 @@ func IsCharDevice(s StableAttr) bool { // UnstableAttr contains Inode attributes that may change over the lifetime // of the Inode. +// +// +stateify savable type UnstableAttr struct { // Size is the file size in bytes. Size int64 @@ -186,6 +190,8 @@ func WithCurrentTime(ctx context.Context, u UnstableAttr) UnstableAttr { } // AttrMask contains fields to mask StableAttr and UnstableAttr. +// +// +stateify savable type AttrMask struct { Type bool DeviceID bool @@ -227,6 +233,8 @@ func (a AttrMask) Union(b AttrMask) AttrMask { } // PermMask are file access permissions. +// +// +stateify savable type PermMask struct { // Read indicates reading is permitted. Read bool @@ -280,6 +288,8 @@ func (p PermMask) SupersetOf(other PermMask) bool { // FilePermissions represents the permissions of a file, with // Read/Write/Execute bits for user, group, and other. +// +// +stateify savable type FilePermissions struct { User PermMask Group PermMask @@ -370,6 +380,8 @@ func (f FilePermissions) AnyRead() bool { } // FileOwner represents ownership of a file. 
+// +// +stateify savable type FileOwner struct { UID auth.KUID GID auth.KGID diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index ec3928baf..a077b91d2 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -1,25 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "binder_state", - srcs = ["binder.go"], - out = "binder_state.go", - package = "binder", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "binder", srcs = [ "binder.go", - "binder_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/binder", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -30,8 +21,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", - "//pkg/tcpip/transport/unix", ], ) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 3f87b6b08..502a262dd 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -40,15 +40,17 @@ const ( ) // Device implements fs.InodeOperations. +// +// +stateify savable type Device struct { - fsutil.InodeNoExtendedAttributes - fsutil.InodeNotDirectory - fsutil.InodeNotRenameable - fsutil.InodeNotSocket - fsutil.InodeNotSymlink - fsutil.NoMappable - fsutil.NoopWriteOut - fsutil.DeprecatedFileOperations + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` // mu protects unstable. 
mu sync.Mutex `state:"nosave"` @@ -186,10 +188,12 @@ func (bd *Device) StatFS(context.Context) (fs.Info, error) { } // Proc implements fs.FileOperations and fs.IoctlGetter. +// +// +stateify savable type Proc struct { - fsutil.NoFsync - fsutil.DeprecatedFileOperations - fsutil.NotDirReaddir + fsutil.NoFsync `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` bd *Device task *kernel.Task diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index d42e8da81..b347468ff 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -21,6 +21,8 @@ import ( ) // DentAttr is the metadata of a directory entry. It is a subset of StableAttr. +// +// +stateify savable type DentAttr struct { // Type is the InodeType of an Inode. Type InodeType @@ -154,6 +156,8 @@ func GenericReaddir(ctx *DirCtx, s *SortedDentryMap) (int, error) { } // SortedDentryMap is a sorted map of names and fs.DentAttr entries. +// +// +stateify savable type SortedDentryMap struct { // names is always kept in sorted-order. 
names []string diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index ea41615fd..fc069bb5f 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,25 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "dev_state", - srcs = [ - "dev.go", - "fs.go", - "full.go", - "null.go", - "random.go", - ], - out = "dev_state.go", - package = "dev", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "dev", srcs = [ "dev.go", - "dev_state.go", "device.go", "fs.go", "full.go", @@ -30,8 +16,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/amutex", - "//pkg/log", "//pkg/rand", "//pkg/sentry/context", "//pkg/sentry/device", @@ -45,9 +29,7 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/platform", "//pkg/sentry/safemem", - "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 36c61bfc2..3f4f2a40a 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -27,6 +27,8 @@ import ( ) // Dev is the root node. +// +// +stateify savable type Dev struct { ramfs.Dir } diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 3c79f3782..2ae49be4e 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -29,6 +29,8 @@ const binderEnabledKey = "binder_enabled" const ashmemEnabledKey = "ashmem_enabled" // filesystem is a devtmpfs. +// +// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index e13eb6c03..492b8eb3a 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -26,6 +26,8 @@ import ( ) // fullDevice is used to implement /dev/full. 
+// +// +stateify savable type fullDevice struct { ramfs.Entry } diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 66b8ba967..2977c8670 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// +stateify savable type nullDevice struct { ramfs.Entry } @@ -54,6 +55,7 @@ func (n *nullDevice) Truncate(context.Context, *fs.Inode, int64) error { return nil } +// +stateify savable type zeroDevice struct { nullDevice } @@ -80,6 +82,7 @@ func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F }), nil } +// +stateify savable type zeroFileOperations struct { fs.FileOperations } diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 33a045a05..47b76218f 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -24,6 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// +stateify savable type randomDevice struct { ramfs.Entry } diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index f9bf2fba6..4658d044f 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -81,6 +81,8 @@ var renameMu sync.RWMutex // // Dirents currently do not attempt to free entries that lack application references under // memory pressure. +// +// +stateify savable type Dirent struct { // AtomicRefCount is our reference count. refs.AtomicRefCount diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index e786e4f65..c680e4828 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -25,6 +25,8 @@ import ( // // A nil DirentCache corresponds to a cache with size 0. All methods can be // called, but nothing is actually cached. +// +// +stateify savable type DirentCache struct { // Maximum size of the cache. This must be saved manually, to handle the case // when cache is nil. 
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 4fcb06f1f..ffe4204bc 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -1,54 +1,27 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "pipe_state", - srcs = [ - "pipe.go", - "pipe_state.go", - ], - out = "pipe_autogen_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/fs"], - package = "fdpipe", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "fdpipe", srcs = [ "pipe.go", - "pipe_autogen_state.go", "pipe_opener.go", "pipe_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe", + imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/fs"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/abi/linux", - "//pkg/amutex", "//pkg/fd", "//pkg/log", - "//pkg/metric", - "//pkg/p9", - "//pkg/refs", "//pkg/secio", "//pkg/sentry/context", - "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", - "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/safemem", - "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", - "//pkg/tcpip", - "//pkg/tcpip/transport/unix", - "//pkg/unet", "//pkg/waiter", "//pkg/waiter/fdnotifier", ], diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 7b318e35f..2e34604e6 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -34,6 +34,8 @@ import ( ) // pipeOperations are the fs.FileOperations of a host pipe. 
+// +// +stateify savable type pipeOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 6d93ef760..8e535a618 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -47,6 +47,8 @@ const FileMaxOffset = math.MaxInt64 // and write(2). // // FIXME: Split synchronization from cancellation. +// +// +stateify savable type File struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 36b2cf75e..113962368 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -60,6 +60,8 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err } // overlayFileOperations implements FileOperations for a file in an overlay. +// +// +stateify savable type overlayFileOperations struct { // upperMu protects upper below. In contrast lower is stable. upperMu sync.Mutex `state:"nosave"` @@ -375,6 +377,8 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { // overlayMappingIdentity wraps a MappingIdentity, and also holds a reference // on a file during its lifetime. +// +// +stateify savable type overlayMappingIdentity struct { refs.AtomicRefCount id memmap.MappingIdentity diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index 200e792f4..5a1e7a270 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -125,6 +125,8 @@ func GetFilesystems() []Filesystem { } // MountSourceFlags represents all mount option flags as a struct. +// +// +stateify savable type MountSourceFlags struct { // ReadOnly corresponds to mount(2)'s "MS_RDONLY" and indicates that // the filesystem should be mounted read-only. 
diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index f481c57fb..d137fee4c 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,34 +1,20 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "filetest_state", - srcs = [ - "filetest.go", - ], - out = "filetest_state.go", - package = "filetest", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "filetest", testonly = 1, - srcs = [ - "filetest.go", - "filetest_state.go", - ], + srcs = ["filetest.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index da0ff58af..1aa271560 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -19,6 +19,8 @@ import ( ) // FileFlags encodes file flags. +// +// +stateify savable type FileFlags struct { // Direct indicates that I/O should be done directly. 
Direct bool diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 6eea64298..3512bae6f 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,24 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "fsutil_state", - srcs = [ - "dirty_set_impl.go", - "file.go", - "file_range_set_impl.go", - "frame_ref_set_impl.go", - "handle.go", - "host_file_mapper.go", - "host_file_mapper_state.go", - "inode.go", - "inode_cached.go", - ], - out = "fsutil_state.go", - package = "fsutil", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "dirty_set_impl", @@ -84,7 +67,6 @@ go_library( "frame_ref_set.go", "frame_ref_set_impl.go", "fsutil.go", - "fsutil_state.go", "handle.go", "host_file_mapper.go", "host_file_mapper_state.go", diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 9c6c98542..8e31e48fd 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -32,6 +32,8 @@ import ( // DirtyInfo is the value type of DirtySet, and represents information about a // Mappable offset that is dirty (the cached data for that offset is newer than // its source). +// +// +stateify savable type DirtyInfo struct { // Keep is true if the represented offset is concurrently writable, such // that writing the data for that offset back to the source does not diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go index 149c0f84a..e7efd3c0f 100644 --- a/pkg/sentry/fs/fsutil/handle.go +++ b/pkg/sentry/fs/fsutil/handle.go @@ -27,6 +27,8 @@ import ( // // FIXME: Remove Handle entirely in favor of individual fs.File // implementations using simple generic utilities. 
+// +// +stateify savable type Handle struct { NoopRelease `state:"nosave"` NoIoctl `state:"nosave"` diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index d0a27fc1c..9c1e2f76f 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -29,6 +29,8 @@ import ( // HostFileMapper caches mappings of an arbitrary host file descriptor. It is // used by implementations of memmap.Mappable that represent a host file // descriptor. +// +// +stateify savable type HostFileMapper struct { // HostFile conceptually breaks the file into pieces called chunks, of // size and alignment chunkSize, and caches mappings of the file on a chunk diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index e1ad07df2..177396fdc 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -31,6 +31,8 @@ func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations { } // simpleInodeOperations is a simple implementation of Inode. +// +// +stateify savable type simpleInodeOperations struct { DeprecatedFileOperations `state:"nosave"` InodeNotDirectory `state:"nosave"` @@ -48,6 +50,8 @@ type simpleInodeOperations struct { // InodeSimpleAttributes implements a subset of the Inode interface. It provides // read-only access to attributes. +// +// +stateify savable type InodeSimpleAttributes struct { // FSType is the filesystem type reported by StatFS. FSType uint64 @@ -110,6 +114,8 @@ func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error // // Users need not initialize Xattrs to non-nil (it will be initialized // when the first extended attribute is set. 
+// +// +stateify savable type InMemoryAttributes struct { Unstable fs.UnstableAttr Xattrs map[string][]byte diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index cba642a8f..0a320e2d8 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -55,6 +55,8 @@ import ( // // Implementations of InodeOperations.WriteOut must call Sync to write out // in-memory modifications of data and metadata to the CachedFileObject. +// +// +stateify savable type CachingInodeOperations struct { // backingFile is a handle to a cached file object. backingFile CachedFileObject diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 1277379e7..cb17339c9 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,21 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "gofer_state", - srcs = [ - "file.go", - "file_state.go", - "fs.go", - "inode.go", - "inode_state.go", - "session.go", - "session_state.go", - ], - out = "gofer_state.go", - package = "gofer", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "gofer", @@ -27,7 +12,6 @@ go_library( "file.go", "file_state.go", "fs.go", - "gofer_state.go", "handles.go", "inode.go", "inode_state.go", @@ -41,7 +25,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/amutex", "//pkg/fd", "//pkg/log", "//pkg/metric", @@ -54,15 +37,11 @@ go_library( "//pkg/sentry/fs/fdpipe", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/host", - "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/safemem", - "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/fs/gofer/file.go 
b/pkg/sentry/fs/gofer/file.go index 039618808..46a6bbd5d 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -33,6 +33,8 @@ import ( var openedWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") // fileOperations implements fs.FileOperations for a remote file system. +// +// +stateify savable type fileOperations struct { fsutil.NoIoctl `state:"nosave"` waiter.AlwaysReady `state:"nosave"` diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index dd5d43c47..3ae93f059 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -83,6 +83,8 @@ var ( ) // filesystem is a 9p client. +// +// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index df584c382..7fc8f77b0 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -35,6 +35,8 @@ import ( ) // inodeOperations implements fs.InodeOperations. +// +// +stateify savable type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` @@ -68,6 +70,8 @@ type inodeOperations struct { // circular load dependency between it and inodeOperations). Even with // lazy loading, this approach defines the dependencies between objects // and the expected load behavior more concretely. +// +// +stateify savable type inodeFileState struct { // s is common file system state for Gofers. s *session `state:"wait"` diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index b6841526a..648a11435 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -27,6 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) +// +stateify savable type endpointMap struct { mu sync.RWMutex `state:"nosave"` // TODO: Make map with private unix sockets savable. 
@@ -63,6 +64,8 @@ func (e *endpointMap) get(key device.MultiDeviceKey) unix.BoundEndpoint { } // session holds state for each 9p session established during sys_mount. +// +// +stateify savable type session struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 23ec66f50..29c79284a 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,23 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "host_state", - srcs = [ - "control.go", - "descriptor.go", - "descriptor_state.go", - "file.go", - "fs.go", - "inode.go", - "inode_state.go", - "socket.go", - "socket_state.go", - ], - out = "host_state.go", - package = "host", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "host", @@ -28,7 +11,6 @@ go_library( "device.go", "file.go", "fs.go", - "host_state.go", "inode.go", "inode_state.go", "ioctl_unsafe.go", @@ -42,7 +24,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/amutex", "//pkg/fd", "//pkg/log", "//pkg/refs", @@ -52,20 +33,14 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/safemem", - "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", - "//pkg/sentry/uniqueid", - "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/link/rawfile", diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 613bd06e8..3aee4d11c 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -25,6 +25,8 @@ import ( ) // descriptor wraps a host fd. 
+// +// +stateify savable type descriptor struct { // donated is true if the host fd was donated by another process. donated bool diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index bdf844337..f9bef6d93 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -37,6 +37,8 @@ import ( ) // fileOperations implements fs.FileOperations for a host file descriptor. +// +// +stateify savable type fileOperations struct { fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index 974700636..e46ae433c 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -51,6 +51,8 @@ const maxTraversals = 10 // to lock down the configurations. This filesystem should only be mounted at root. // // Think twice before exposing this to applications. +// +// +stateify savable type Filesystem struct { // whitelist is a set of host paths to whitelist. paths []string @@ -266,8 +268,10 @@ func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, file } // superOperations implements fs.MountSourceOperations. +// +// +stateify savable type superOperations struct { - fs.SimpleMountSourceOperations `state:"nosave"` + fs.SimpleMountSourceOperations // root is the path of the mount point. All inode mappings // are relative to this root. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 226bc5164..761ccde33 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -34,6 +34,8 @@ import ( // inodeOperations implements fs.InodeOperations for an fs.Inodes backed // by a host file descriptor. +// +// +stateify savable type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` @@ -65,6 +67,8 @@ type inodeOperations struct { // circular load dependency between it and inodeOperations). 
Even with // lazy loading, this approach defines the dependencies between objects // and the expected load behavior more concretely. +// +// +stateify savable type inodeFileState struct { // Common file system state. mops *superOperations `state:"wait"` diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 6c8e6f188..d0dbce5dd 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -28,6 +28,8 @@ import ( // Inode is a file system object that can be simultaneously referenced by different // components of the VFS (Dirent, fs.File, etc). +// +// +stateify savable type Inode struct { // AtomicRefCount is our reference count. refs.AtomicRefCount @@ -58,6 +60,8 @@ type Inode struct { // Note that in Linux fcntl(2) and flock(2) locks are _not_ cooperative, because race and // deadlock conditions make merging them prohibitive. We do the same and keep them oblivious // to each other but provide a "context" as a convenient container. +// +// +stateify savable type LockCtx struct { // Posix is a set of POSIX-style regional advisory locks, see fcntl(2). Posix lock.Locks diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index 358bbecdf..683140afe 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -20,6 +20,8 @@ import ( ) // Watches is the collection of inotify watches on an inode. +// +// +stateify savable type Watches struct { // mu protects the fields below. mu sync.RWMutex `state:"nosave"` diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 6f5e8ce5e..2aabdded8 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -34,6 +34,8 @@ import ( // // Lock ordering: // Inotify.mu -> Inode.Watches.mu -> Watch.mu -> Inotify.evMu +// +// +stateify savable type Inotify struct { // Unique identifier for this inotify instance. We don't just reuse the // inotify fd because fds can be duped. 
These should not be exposed to the diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index 217915ba4..e9b5e0f56 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -28,6 +28,8 @@ import ( const inotifyEventBaseSize = 16 // Event represents a struct inotify_event from linux. +// +// +stateify savable type Event struct { ilist.Entry diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 8904ef544..3e1959e83 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -27,6 +27,8 @@ import ( // holding an extra ref on each dirent known (by inotify) to point to the // inode. These are known as pins. For a full discussion, see // fs/g3doc/inotify.md. +// +// +stateify savable type Watch struct { // Inotify instance which owns this watch. owner *Inotify diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 2607d7ed3..3159ff1da 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,18 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "lock_state", - srcs = [ - "lock.go", - "lock_range.go", - "lock_set.go", - ], - out = "lock_state.go", - package = "lock", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "lock_range", @@ -49,13 +38,11 @@ go_library( "lock_range.go", "lock_set.go", "lock_set_functions.go", - "lock_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", - "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 24d54c989..e9b376eb6 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -88,6 +88,8 @@ const LockEOF = math.MaxUint64 // // A Lock may 
be downgraded from a write lock to a read lock only if // the write lock's uid is the same as the read lock. +// +// +stateify savable type Lock struct { // Readers are the set of read lock holders identified by UniqueID. // If len(Readers) > 0 then HasWriter must be false. @@ -103,6 +105,8 @@ type Lock struct { } // Locks is a thread-safe wrapper around a LockSet. +// +// +stateify savable type Locks struct { // mu protects locks below. mu sync.Mutex `state:"nosave"` @@ -111,7 +115,7 @@ type Locks struct { locks LockSet // blockedQueue is the queue of waiters that are waiting on a lock. - blockedQueue waiter.Queue + blockedQueue waiter.Queue `state:"zerovalue"` } // Blocker is the interface used for blocking locks. Passing a nil Blocker diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index eb1897174..4ede767f9 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -101,6 +101,8 @@ func (i InodeMappings) String() string { // (e.g. cannot be mounted at different locations). // // TODO: Move mount-specific information out of MountSource. +// +// +stateify savable type MountSource struct { refs.AtomicRefCount @@ -260,6 +262,8 @@ func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *Mo } // SimpleMountSourceOperations implements MountSourceOperations. +// +// +stateify savable type SimpleMountSourceOperations struct { keep bool } diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 1be81e3a1..d135e8a37 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -18,6 +18,8 @@ import "gvisor.googlesource.com/gvisor/pkg/sentry/context" // overlayMountSourceOperations implements MountSourceOperations for an overlay // mount point. +// +// +stateify savable type overlayMountSourceOperations struct { upper *MountSource lower *MountSource @@ -72,6 +74,8 @@ func (o *overlayMountSourceOperations) Destroy() { } // type overlayFilesystem is the filesystem for overlay mounts. 
+// +// +stateify savable type overlayFilesystem struct{} // Name implements Filesystem.Name. diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 87da4ee0e..144d3427d 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -32,6 +32,8 @@ import ( const DefaultTraversalLimit = 10 // MountNamespace defines a collection of mounts. +// +// +stateify savable type MountNamespace struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 7357d6401..af13dc8c7 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -145,6 +145,8 @@ func newOverlayInode(ctx context.Context, o *overlayEntry, msrc *MountSource) *I } // overlayEntry is the overlay metadata of an Inode. It implements Mappable. +// +// +stateify savable type overlayEntry struct { // lowerExists is true if an Inode exists for this file in the lower // filesystem. If lowerExists is true, then the overlay must create diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 870df47b2..2d9f07f2f 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,32 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "proc_state", - srcs = [ - "cpuinfo.go", - "exec_args.go", - "fds.go", - "file.go", - "filesystems.go", - "fs.go", - "loadavg.go", - "meminfo.go", - "mounts.go", - "net.go", - "proc.go", - "stat.go", - "sys.go", - "sys_net.go", - "task.go", - "uid_gid_map.go", - "uptime.go", - "version.go", - ], - out = "proc_state.go", - package = "proc", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "proc", @@ -42,7 +16,6 @@ go_library( "mounts.go", "net.go", "proc.go", - "proc_state.go", "rpcinet_proc.go", "stat.go", "sys.go", @@ -56,9 +29,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/amutex", - "//pkg/log", - 
"//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/device", @@ -73,8 +43,6 @@ go_library( "//pkg/sentry/socket/rpcinet", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", - "//pkg/syserr", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index f80aaa5b1..4dfec03a4 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -27,6 +27,8 @@ import ( // cpuinfo is a file describing the CPU capabilities. // // Presently cpuinfo never changes, so it doesn't need to be a SeqFile. +// +// +stateify savable type cpuinfo struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 0e1523bf1..a69cbaa0e 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -37,6 +37,8 @@ const ( // execArgFile is a file containing the exec args (either cmdline or environ) // for a given task. +// +// +stateify savable type execArgFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 194a9c12a..cca8f874c 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -138,6 +138,8 @@ func (f *fd) Close() error { } // fdDir implements /proc/TID/fd. +// +// +stateify savable type fdDir struct { ramfs.Dir @@ -197,6 +199,8 @@ func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset } // fdInfo is a single file in /proc/TID/fdinfo/. +// +// +stateify savable type fdInfo struct { ramfs.File @@ -229,6 +233,8 @@ func (*fdInfo) Truncate(ctx context.Context, inode *fs.Inode, size int64) error // fdInfoDir implements /proc/TID/fdinfo. It embeds an fdDir, but overrides // Lookup and Readdir. 
+// +// +stateify savable type fdInfoDir struct { ramfs.Dir diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go index 9a433cdf8..4b2d08e75 100644 --- a/pkg/sentry/fs/proc/file.go +++ b/pkg/sentry/fs/proc/file.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// +stateify savable type file struct { fs.InodeOperations diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index 37db9cf9c..49b92fd8a 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -24,6 +24,8 @@ import ( ) // filesystemsData backs /proc/filesystems. +// +// +stateify savable type filesystemsData struct{} // NeedsUpdate returns true on the first generation. The set of registered file diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 3aadd6ac4..061824b8c 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -22,6 +22,8 @@ import ( ) // filesystem is a procfs. +// +// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 7583b6ccd..6fac251d2 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -23,6 +23,8 @@ import ( ) // loadavgData backs /proc/loadavg. +// +// +stateify savable type loadavgData struct{} // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 49cb0faed..53dfd59ef 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -26,6 +26,8 @@ import ( ) // meminfoData backs /proc/meminfo. +// +// +stateify savable type meminfoData struct { // k is the owning Kernel. 
k *kernel.Kernel diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 108432f4e..2b8167c28 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -71,6 +71,8 @@ func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { } // mountInfoFile is used to implement /proc/[pid]/mountinfo. +// +// +stateify savable type mountInfoFile struct { t *kernel.Task } @@ -152,6 +154,8 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se } // mountsFile is used to implement /proc/[pid]/mountinfo. +// +// +stateify savable type mountsFile struct { t *kernel.Task } diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index b2a8d639c..07029a7bb 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -33,6 +33,8 @@ import ( ) // proc is a root proc node. +// +// +stateify savable type proc struct { ramfs.Dir @@ -47,6 +49,8 @@ type proc struct { // stubProcFSFile is a file type that can be used to return file contents // which are constant. This file is not writable and will always have mode // 0444. 
+// +// +stateify savable type stubProcFSFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index c84f7e20d..53c475652 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,22 +1,10 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "seqfile_state", - srcs = [ - "seqfile.go", - ], - out = "seqfile_state.go", - package = "seqfile", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "seqfile", - srcs = [ - "seqfile.go", - "seqfile_state.go", - ], + srcs = ["seqfile.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile", visibility = ["//pkg/sentry:internal"], deps = [ @@ -26,26 +14,16 @@ go_library( "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", ], ) -go_stateify( - name = "seqfile_test_state", - srcs = ["seqfile_test.go"], - out = "seqfile_test_state.go", - package = "seqfile", -) - go_test( name = "seqfile_test", size = "small", - srcs = [ - "seqfile_test.go", - "seqfile_test_state.go", - ], + srcs = ["seqfile_test.go"], embed = [":seqfile"], deps = [ + "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs/test", diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index c08565f8a..51cae5e37 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -30,6 +30,8 @@ import ( type SeqHandle interface{} // SeqData holds the data for one unit in the file. +// +// +stateify savable type SeqData struct { // The data to be returned to the user. Buf []byte @@ -82,6 +84,8 @@ func (s *SeqGenerationCounter) IsCurrent(generation int64) bool { } // SeqFile is used to provide dynamic files that can be ordered by record. 
+// +// +stateify savable type SeqFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index 284f3e52b..bf7650211 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -25,6 +25,8 @@ import ( ) // statData backs /proc/stat. +// +// +stateify savable type statData struct { // k is the owning Kernel. k *kernel.Kernel diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index aab891c53..a2d36ca23 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -28,6 +28,8 @@ import ( ) // hostname is a file containing the system hostname. +// +// +stateify savable type hostname struct { ramfs.Entry } @@ -52,6 +54,8 @@ func (p *proc) newHostname(ctx context.Context, msrc *fs.MountSource) *fs.Inode } // mmapMinAddrData backs /proc/sys/vm/mmap_min_addr. +// +// +stateify savable type mmapMinAddrData struct { k *kernel.Kernel } @@ -74,6 +78,7 @@ func (d *mmapMinAddrData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHand }, 0 } +// +stateify savable type overcommitMemory struct{} func (*overcommitMemory) NeedsUpdate(generation int64) bool { diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index f3a5043f8..beb25be20 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -33,6 +33,7 @@ const ( tcpWMem ) +// +stateify savable type tcpMem struct { ramfs.Entry s inet.Stack @@ -100,6 +101,7 @@ func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, return n, cperr } +// +stateify savable type tcpSack struct { ramfs.Entry s inet.Stack diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index efc635946..748ca4320 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -52,6 +52,8 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { } // taskDir represents a task-level directory. 
+// +// +stateify savable type taskDir struct { ramfs.Dir @@ -92,6 +94,8 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace } // subtasks represents a /proc/TID/task directory. +// +// +stateify savable type subtasks struct { ramfs.Dir @@ -167,6 +171,8 @@ func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, off } // exe is an fs.InodeOperations symlink for the /proc/PID/exe file. +// +// +stateify savable type exe struct { ramfs.Symlink @@ -226,6 +232,8 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { // namespaceFile represents a file in the namespacefs, such as the files in // /proc//ns. +// +// +stateify savable type namespaceFile struct { ramfs.Symlink @@ -274,6 +282,8 @@ func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { } // mapsData implements seqfile.SeqSource for /proc/[pid]/maps. +// +// +stateify savable type mapsData struct { t *kernel.Task } @@ -311,6 +321,7 @@ func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([ return []seqfile.SeqData{}, 0 } +// +stateify savable type taskStatData struct { t *kernel.Task @@ -391,6 +402,8 @@ func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) } // statmData implements seqfile.SeqSource for /proc/[pid]/statm. +// +// +stateify savable type statmData struct { t *kernel.Task } @@ -425,6 +438,8 @@ func (s *statmData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([ } // statusData implements seqfile.SeqSource for /proc/[pid]/status. +// +// +stateify savable type statusData struct { t *kernel.Task pidns *kernel.PIDNamespace @@ -490,6 +505,7 @@ type ioUsage interface { IOUsage() *usage.IO } +// +stateify savable type ioData struct { ioUsage } @@ -530,6 +546,8 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se // On Linux, /proc/[pid]/comm is writable, and writing to the comm file changes // the thread name. 
We don't implement this yet as there are no known users of // this feature. +// +// +stateify savable type comm struct { ramfs.Entry @@ -559,6 +577,8 @@ func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, off } // auxvec is a file containing the auxiliary vector for a task. +// +// +stateify savable type auxvec struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 85acb5163..9811d9c9d 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -29,6 +29,8 @@ import ( // An idMapSeqSource is a seqfile.SeqSource that returns UID or GID mappings // from a task's user namespace. +// +// +stateify savable type idMapSeqSource struct { t *kernel.Task gids bool @@ -70,6 +72,7 @@ type idMapSeqHandle struct { value int } +// +stateify savable type idMapSeqFile struct { seqfile.SeqFile } diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index 4679d5821..f3a9b81df 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -27,6 +27,8 @@ import ( ) // uptime is a file containing the system uptime. +// +// +stateify savable type uptime struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index c0f2e87e3..00f6a2afd 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -23,6 +23,8 @@ import ( ) // versionData backs /proc/version. +// +// +stateify savable type versionData struct { // k is the owning Kernel. 
k *kernel.Kernel diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index d84f2c624..5230157fe 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,19 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "ramfs_state", - srcs = [ - "dir.go", - "file.go", - "ramfs.go", - "socket.go", - "symlink.go", - ], - out = "ramfs_state.go", - package = "ramfs", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "ramfs", @@ -21,7 +8,6 @@ go_library( "dir.go", "file.go", "ramfs.go", - "ramfs_state.go", "socket.go", "symlink.go", "tree.go", @@ -29,12 +15,8 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/amutex", - "//pkg/log", - "//pkg/refs", "//pkg/secio", "//pkg/sentry/context", - "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", @@ -42,7 +24,6 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/safemem", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 19d5612ed..04432f28c 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -44,6 +44,8 @@ type CreateOps struct { } // Dir represents a single directory in the filesystem. +// +// +stateify savable type Dir struct { Entry diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index d6cfaf753..13e72e775 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -60,6 +60,8 @@ var ( // Entry represents common internal state for file and directory nodes. // This may be used by other packages to easily create ramfs files. 
+// +// +stateify savable type Entry struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoMappable `state:"nosave"` diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index b0c79325f..93427a1ff 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -21,6 +21,8 @@ import ( ) // Socket represents a socket. +// +// +stateify savable type Socket struct { Entry diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 9bbf78619..1c54d9991 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -22,6 +22,8 @@ import ( ) // Symlink represents a symlink. +// +// +stateify savable type Symlink struct { Entry diff --git a/pkg/sentry/fs/ramfs/test/BUILD b/pkg/sentry/fs/ramfs/test/BUILD index 57fee45e2..187eac49d 100644 --- a/pkg/sentry/fs/ramfs/test/BUILD +++ b/pkg/sentry/fs/ramfs/test/BUILD @@ -1,30 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "test_state", - srcs = [ - "test.go", - ], - out = "test_state.go", - package = "test", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "test", testonly = 1, - srcs = [ - "test.go", - "test_state.go", - ], + srcs = ["test.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", - "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", - "//pkg/state", ], ) diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 095ff1f25..bc24e980e 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,16 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "sys_state", - srcs = [ - "fs.go", - "sys.go", - ], - out = "sys_state.go", - package = "sys", -) +load("//tools/go_stateify:defs.bzl", 
"go_library") go_library( name = "sys", @@ -18,7 +8,6 @@ go_library( "device.go", "fs.go", "sys.go", - "sys_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys", visibility = ["//pkg/sentry:internal"], @@ -28,6 +17,5 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", "//pkg/sentry/usermem", - "//pkg/state", ], ) diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index c6d5f7fd8..625525540 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -20,6 +20,8 @@ import ( ) // filesystem is a sysfs. +// +// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index ccf56f644..b9b2fb4a1 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -22,12 +22,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -type Dir struct { +// +stateify savable +type dir struct { ramfs.Dir } func newDir(ctx context.Context, msrc *fs.MountSource, contents map[string]*fs.Inode) *fs.Inode { - d := &Dir{} + d := &dir{} d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(d, msrc, fs.StableAttr{ DeviceID: sysfsDevice.DeviceID(), diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index 8b1b7872e..ffdd7e0dc 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,33 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "timerfd_state", - srcs = [ - "timerfd.go", - ], - out = "timerfd_state.go", - package = "timerfd", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "timerfd", - srcs = [ - "timerfd.go", - "timerfd_state.go", - ], + srcs = ["timerfd.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/fs", 
"//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index ae58f6fd7..767db95a0 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -30,6 +30,8 @@ import ( ) // TimerOperations implements fs.FileOperations for timerfds. +// +// +stateify savable type TimerOperations struct { fsutil.ZeroSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` @@ -38,7 +40,7 @@ type TimerOperations struct { fsutil.NoMMap `state:"nosave"` fsutil.NoIoctl `state:"nosave"` - events waiter.Queue `state:"nosave"` + events waiter.Queue `state:"zerovalue"` timer *ktime.Timer // val is the number of timer expirations since the last successful call to diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 473ab4296..cfe11ab02 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,18 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tmpfs_state", - srcs = [ - "file_regular.go", - "fs.go", - "inode_file.go", - "tmpfs.go", - ], - out = "tmpfs_state.go", - package = "tmpfs", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tmpfs", @@ -22,13 +10,11 @@ go_library( "fs.go", "inode_file.go", "tmpfs.go", - "tmpfs_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", @@ -41,7 +27,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/tcpip/transport/unix", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 
9811d90bc..342688f81 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -25,6 +25,8 @@ import ( // regularFileOperations implements fs.FileOperations for a regular // tmpfs file. +// +// +stateify savable type regularFileOperations struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 5bd9ade52..ca620e65e 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -47,6 +47,8 @@ const ( var modeRegexp = regexp.MustCompile("0[0-7][0-7][0-7]") // Filesystem is a tmpfs. +// +// +stateify savable type Filesystem struct{} func init() { diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 4e803c9ff..1e4fe47d2 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -43,6 +43,8 @@ import ( // include an InvalidatorRegion associated with that reference. When the // referenced portion of the file is removed (with Truncate), the associated // InvalidatorRegion is invalidated. +// +// +stateify savable type fileInodeOperations struct { fsutil.DeprecatedFileOperations `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 1cc7ae491..10cb5451d 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -49,6 +49,8 @@ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent } // Dir is a directory. +// +// +stateify savable type Dir struct { ramfs.Dir @@ -122,6 +124,8 @@ func (*Dir) StatFS(context.Context) (fs.Info, error) { } // Symlink is a symlink. +// +// +stateify savable type Symlink struct { ramfs.Symlink } @@ -149,6 +153,8 @@ func (s *Symlink) StatFS(context.Context) (fs.Info, error) { } // Socket is a socket. 
+// +// +stateify savable type Socket struct { ramfs.Socket } @@ -176,6 +182,8 @@ func (s *Socket) StatFS(context.Context) (fs.Info, error) { } // Fifo is a tmpfs named pipe. +// +// +stateify savable type Fifo struct { ramfs.Entry } diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 363897b2c..3c446eef4 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,22 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tty_state", - srcs = [ - "dir.go", - "fs.go", - "inode.go", - "line_discipline.go", - "master.go", - "queue.go", - "slave.go", - "terminal.go", - ], - out = "tty_state.go", - package = "tty", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tty", @@ -29,7 +13,6 @@ go_library( "queue.go", "slave.go", "terminal.go", - "tty_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty", visibility = ["//pkg/sentry:internal"], @@ -44,7 +27,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 2c5b2aed6..c91091db4 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -49,14 +49,16 @@ import ( // corresponding Dirents hold on their parent (this directory). // // dirInodeOperations implements fs.InodeOperations. 
+// +// +stateify savable type dirInodeOperations struct { - fsutil.DeprecatedFileOperations - fsutil.InodeNotSocket - fsutil.InodeNotRenameable - fsutil.InodeNotSymlink - fsutil.InodeNoExtendedAttributes - fsutil.NoMappable - fsutil.NoopWriteOut + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` // msrc is the super block this directory is on. // @@ -348,6 +350,8 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { // // This is nearly identical to fsutil.DirFileOperations, except that it takes // df.di.mu in IterateDir. +// +// +stateify savable type dirFileOperations struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index dbaffe95e..e28635607 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -28,6 +28,8 @@ var ptsDevice = device.NewAnonDevice() // // This devpts is always in the new "multi-instance" mode. i.e., it contains a // ptmx device tied to this mount. +// +// +stateify savable type filesystem struct{} func init() { @@ -69,6 +71,8 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou } // superOperations implements fs.MountSourceOperations, preventing caching. +// +// +stateify savable type superOperations struct{} // Revalidate implements fs.DirentOperations.Revalidate. 
diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go index 04b9a7727..c0fa2b407 100644 --- a/pkg/sentry/fs/tty/inode.go +++ b/pkg/sentry/fs/tty/inode.go @@ -31,6 +31,8 @@ import ( // // * fs.InodeOperations.Release // * fs.InodeOperations.GetFile +// +// +stateify savable type inodeOperations struct { fsutil.DeprecatedFileOperations `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index f094635f5..d243ee40e 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -72,6 +72,8 @@ const ( // termiosMu // inQueue.mu // outQueue.mu +// +// +stateify savable type lineDiscipline struct { // inQueue is the input queue of the terminal. inQueue queue @@ -183,6 +185,8 @@ type transformer interface { // outputQueueTransformer implements transformer. It performs line discipline // transformations on the output queue. +// +// +stateify savable type outputQueueTransformer struct{} // transform does output processing for one end of the pty. See @@ -254,6 +258,8 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte // inputQueueTransformer implements transformer. It performs line discipline // transformations on the input queue. +// +// +stateify savable type inputQueueTransformer struct{} // transform does input processing for one end of the pty. Characters read are diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 74cdbe874..c7198e218 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -27,6 +27,8 @@ import ( // masterInodeOperations are the fs.InodeOperations for the master end of the // Terminal (ptmx file). 
+// +// +stateify savable type masterInodeOperations struct { inodeOperations @@ -96,6 +98,8 @@ func (mi *masterInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flag } // masterFileOperations are the fs.FileOperations for the master end of a terminal. +// +// +stateify savable type masterFileOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 026d5e077..42c105abc 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -32,11 +32,13 @@ import ( // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and // readable is true. +// +// +stateify savable type queue struct { // mu protects everything in queue. mu sync.Mutex `state:"nosave"` - waiter.Queue `state:"nosave"` + waiter.Queue `state:"zerovalue"` // readBuf is buffer of data ready to be read when readable is true. // This data has been processed. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index f5eec726e..1c562b172 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -27,6 +27,8 @@ import ( // slaveInodeOperations are the fs.InodeOperations for the slave end of the // Terminal (pts file). +// +// +stateify savable type slaveInodeOperations struct { inodeOperations @@ -86,6 +88,8 @@ func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags } // slaveFileOperations are the fs.FileOperations for the slave end of a terminal. +// +// +stateify savable type slaveFileOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index fa5b00409..3cb135124 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -21,6 +21,8 @@ import ( ) // Terminal is a pseudoterminal. 
+// +// +stateify savable type Terminal struct { refs.AtomicRefCount diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index eaf8f15b2..159c50efb 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -3,26 +3,15 @@ package( licenses = ["notice"], # Apache 2.0 ) -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "inet_state", - srcs = ["inet.go"], - out = "inet_state.go", - package = "inet", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "inet", srcs = [ "context.go", "inet.go", - "inet_state.go", "test_stack.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/inet", - deps = [ - "//pkg/sentry/context", - "//pkg/state", - ], + deps = ["//pkg/sentry/context"], ) diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index e4b326993..e54a61196 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -87,6 +87,8 @@ type InterfaceAddr struct { } // TCPBufferSize contains settings controlling TCP buffer sizing. +// +// +stateify savable type TCPBufferSize struct { // Min is the minimum size. 
Min int diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c4a7dacb2..0ebacefa6 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,58 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "kernel_state", - srcs = [ - "abstract_socket_namespace.go", - "fd_map.go", - "fs_context.go", - "ipc_namespace.go", - "kernel.go", - "kernel_state.go", - "pending_signals.go", - "pending_signals_state.go", - "process_group_list.go", - "ptrace.go", - "rseq.go", - "session_list.go", - "sessions.go", - "signal.go", - "signal_handlers.go", - "syscalls.go", - "syscalls_state.go", - "syslog.go", - "task.go", - "task_clone.go", - "task_context.go", - "task_exec.go", - "task_exit.go", - "task_list.go", - "task_resources.go", - "task_run.go", - "task_sched.go", - "task_signals.go", - "task_start.go", - "task_syscall.go", - "thread_group.go", - "threads.go", - "timekeeper.go", - "timekeeper_state.go", - "timer.go", - "uts_namespace.go", - "vdso.go", - "version.go", - ], - out = "kernel_autogen_state.go", - imports = [ - "gvisor.googlesource.com/gvisor/pkg/sentry/arch", - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", - "gvisor.googlesource.com/gvisor/pkg/tcpip", - ], - package = "kernel", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "pending_signals_list", @@ -118,7 +67,6 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", - "kernel_autogen_state.go", "kernel_state.go", "pending_signals.go", "pending_signals_list.go", @@ -164,6 +112,11 @@ go_library( "version.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", + imports = [ + "gvisor.googlesource.com/gvisor/pkg/sentry/arch", + # "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", + "gvisor.googlesource.com/gvisor/pkg/tcpip", + ], visibility = 
["//:sandbox"], deps = [ "//pkg/abi", diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 014c4a3bf..d6d1d341d 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) +// +stateify savable type abstractEndpoint struct { ep unix.BoundEndpoint wr *refs.WeakRef @@ -39,6 +40,8 @@ func (e *abstractEndpoint) WeakRefGone() { } // AbstractSocketNamespace is used to implement the Linux abstract socket functionality. +// +// +stateify savable type AbstractSocketNamespace struct { mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 5b7b30557..a81085372 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,20 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "auth_state", - srcs = [ - "credentials.go", - "id.go", - "id_map_range.go", - "id_map_set.go", - "user_namespace.go", - ], - out = "auth_state.go", - package = "auth", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( name = "id_map_range", @@ -48,7 +35,6 @@ go_library( name = "auth", srcs = [ "auth.go", - "auth_state.go", "capability_set.go", "context.go", "credentials.go", @@ -66,7 +52,6 @@ go_library( "//pkg/bits", "//pkg/log", "//pkg/sentry/context", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index f6fb05285..f18f7dac9 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -21,6 +21,8 @@ import ( // Credentials contains information required to authorize privileged operations // in a user namespace. 
+// +// +stateify savable type Credentials struct { // Real/effective/saved user/group IDs in the root user namespace. None of // these should ever be NoID. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index 6adb33530..bd0090e0f 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -77,6 +77,8 @@ func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { // An IDMapEntry represents a mapping from a range of contiguous IDs in a user // namespace to an equally-sized range of contiguous IDs in the namespace's // parent. +// +// +stateify savable type IDMapEntry struct { // FirstID is the first ID in the range in the namespace. FirstID uint32 diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 0980aeadf..d359f3f31 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -23,6 +23,8 @@ import ( // A UserNamespace represents a user namespace. See user_namespaces(7) for // details. +// +// +stateify savable type UserNamespace struct { // parent is this namespace's parent. If this is the root namespace, parent // is nil. The parent pointer is immutable. 
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 7d491efbc..5e8b36ed6 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,22 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "epoll_autogen_state", - srcs = [ - "epoll.go", - "epoll_state.go", - ], - out = "epoll_autogen_state.go", - package = "epoll", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "epoll", srcs = [ "epoll.go", - "epoll_autogen_state.go", "epoll_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll", @@ -29,9 +18,7 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/kdefs", - "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index b572fcd7e..d87e64a1c 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -58,6 +58,8 @@ const ( // potentially be reassigned. We also cannot use just the file pointer because // it is possible to have multiple entries for the same file object as long as // they are created with different FDs (i.e., the FDs point to the same file). +// +// +stateify savable type FileIdentifier struct { File *fs.File Fd kdefs.FD @@ -65,6 +67,8 @@ type FileIdentifier struct { // pollEntry holds all the state associated with an event poll entry, that is, // a file being observed by an event poll object. +// +// +stateify savable type pollEntry struct { ilist.Entry file *refs.WeakRef `state:"manual"` @@ -92,6 +96,8 @@ func (p *pollEntry) WeakRefGone() { // EventPoll holds all the state associated with an event poll object, that is, // collection of files to observe and their current state. 
+// +// +stateify savable type EventPoll struct { fsutil.PipeSeek `state:"zerovalue"` fsutil.NotDirReaddir `state:"zerovalue"` @@ -102,7 +108,7 @@ type EventPoll struct { // Wait queue is used to notify interested parties when the event poll // object itself becomes readable or writable. - waiter.Queue + waiter.Queue `state:"zerovalue"` // files is the map of all the files currently being observed, it is // protected by mu. diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 7ec179bd8..cc1120b4f 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,33 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "eventfd_state", - srcs = [ - "eventfd.go", - ], - out = "eventfd_state.go", - package = "eventfd", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "eventfd", - srcs = [ - "eventfd.go", - "eventfd_state.go", - ], + srcs = ["eventfd.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", "//pkg/waiter/fdnotifier", diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index bd50bd9fe..a4ada0e78 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -35,6 +35,8 @@ import ( // EventOperations represents an event with the semantics of Linux's file-based event // notification (eventfd). Eventfds are usually internal to the Sentry but in certain // situations they may be converted into a host-backed eventfd. 
+// +// +stateify savable type EventOperations struct { fsutil.NoopRelease `state:"nosave"` fsutil.PipeSeek `state:"nosave"` @@ -49,7 +51,7 @@ type EventOperations struct { // Queue is used to notify interested parties when the event object // becomes readable or writable. - wq waiter.Queue `state:"nosave"` + wq waiter.Queue `state:"zerovalue"` // val is the current value of the event counter. val uint64 diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index 299506330..d5d4aaacb 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -46,6 +46,8 @@ func (f FDs) Less(i, j int) bool { } // FDFlags define flags for an individual descriptor. +// +// +stateify savable type FDFlags struct { // CloseOnExec indicates the descriptor should be closed on exec. CloseOnExec bool @@ -69,12 +71,16 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) { // descriptor holds the details about a file descriptor, namely a pointer the // file itself and the descriptor flags. +// +// +stateify savable type descriptor struct { file *fs.File flags FDFlags } // FDMap is used to manage File references and flags. +// +// +stateify savable type FDMap struct { refs.AtomicRefCount k *Kernel diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index dbc097696..f3f05e8f5 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -25,6 +25,8 @@ import ( // FSContext contains filesystem context. // // This includes umask and working directory. 
+// +// +stateify savable type FSContext struct { refs.AtomicRefCount diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index a97a43549..b44a26974 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "waiter_list", @@ -14,29 +14,15 @@ go_template_instance( }, ) -go_stateify( - name = "futex_state", - srcs = [ - "futex.go", - "waiter_list.go", - ], - out = "futex_state.go", - package = "futex", -) - go_library( name = "futex", srcs = [ "futex.go", - "futex_state.go", "waiter_list.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex", visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/state", - "//pkg/syserror", - ], + deps = ["//pkg/syserror"], ) go_test( diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 15e3e5e2c..4a1f2a0ef 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -196,6 +196,8 @@ func bucketIndexForAddr(addr uintptr) uintptr { } // Manager holds futex state for a single virtual address space. +// +// +stateify savable type Manager struct { buckets [bucketCount]bucket `state:"zerovalue"` } diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index a86bda77b..5eef49f59 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -21,6 +21,8 @@ import ( ) // IPCNamespace represents an IPC namespace. +// +// +stateify savable type IPCNamespace struct { // User namespace which owns this IPC namespace. Immutable. 
userNS *auth.UserNamespace diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 64439cd9d..419a1d473 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -62,6 +62,8 @@ import ( // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). +// +// +stateify savable type Kernel struct { // extMu serializes external changes to the Kernel with calls to // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel @@ -158,7 +160,7 @@ type Kernel struct { // exitErr is the error causing the sandbox to exit, if any. It is // protected by extMu. - exitErr error + exitErr error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index 5dc0f266c..06be5a7e1 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -38,6 +38,8 @@ const ( // pendingSignals holds a collection of pending signals. The zero value of // pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; // users must provide synchronization. +// +// +stateify savable type pendingSignals struct { // signals contains all pending signals. // @@ -52,11 +54,14 @@ type pendingSignals struct { } // pendingSignalQueue holds a pendingSignalList for a single signal number. +// +// +stateify savable type pendingSignalQueue struct { pendingSignalList length int } +// +stateify savable type pendingSignal struct { // pendingSignalEntry links into a pendingSignalList. 
pendingSignalEntry diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 4600d19bd..19b23c6d2 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,20 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "pipe_state", - srcs = [ - "buffers.go", - "node.go", - "pipe.go", - "reader.go", - "reader_writer.go", - "writer.go", - ], - out = "pipe_state.go", - package = "pipe", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "pipe", @@ -23,7 +9,6 @@ go_library( "device.go", "node.go", "pipe.go", - "pipe_state.go", "reader.go", "reader_writer.go", "writer.go", @@ -34,15 +19,12 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/ilist", - "//pkg/log", - "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go index f300537c5..a82e45c3f 100644 --- a/pkg/sentry/kernel/pipe/buffers.go +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -20,6 +20,8 @@ import ( // Buffer encapsulates a queueable byte buffer that can // easily be truncated. It is designed only for use with pipes. +// +// +stateify savable type Buffer struct { ilist.Entry data []byte diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index e418cf174..23d692da1 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -24,6 +24,8 @@ import ( ) // inodeOperations wraps fs.InodeOperations operations with common pipe opening semantics. 
+// +// +stateify savable type inodeOperations struct { fs.InodeOperations diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 9a21df5b4..ced2559a7 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -41,6 +41,8 @@ const DefaultPipeSize = 65536 // Pipe is an encapsulation of a platform-independent pipe. // It manages a buffered byte queue shared between a reader/writer // pair. +// +// +stateify savable type Pipe struct { waiter.Queue `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 40d5e4943..1fa5e9a32 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -20,6 +20,8 @@ import ( // Reader satisfies the fs.FileOperations interface for read-only pipes. // Reader should be used with !fs.FileFlags.Write to reject writes. +// +// +stateify savable type Reader struct { ReaderWriter } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index dc642a3a6..82607367b 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -31,6 +31,8 @@ import ( // read and write requests. This should only be used directly for named pipes. // pipe(2) and pipe2(2) only support unidirectional pipes and should use // either pipe.Reader or pipe.Writer. +// +// +stateify savable type ReaderWriter struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index fd13008ac..d93324b53 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -20,6 +20,8 @@ import ( // Writer satisfies the fs.FileOperations interface for write-only pipes. // Writer should be used with !fs.FileFlags.Read to reject reads. 
+// +// +stateify savable type Writer struct { ReaderWriter } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index f1c2c4bf0..e9e69004d 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -25,6 +25,8 @@ import ( // ptraceOptions are the subset of options controlling a task's ptrace behavior // that are set by ptrace(PTRACE_SETOPTIONS). +// +// +stateify savable type ptraceOptions struct { // ExitKill is true if the tracee should be sent SIGKILL when the tracer // exits. @@ -185,6 +187,8 @@ func (t *Task) hasTracer() bool { } // ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +// +// +stateify savable type ptraceStop struct { // If frozen is true, the stopped task's tracer is currently operating on // it, so Task.Kill should not remove the stop. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 635372993..1f3de58e3 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -23,6 +23,8 @@ import ( // Restartable sequences, as described in https://lwn.net/Articles/650333/. // RSEQCriticalRegion describes a restartable sequence critical region. 
+// +// +stateify savable type RSEQCriticalRegion struct { // When a task in this thread group has its CPU preempted (as defined by // platform.ErrContextCPUPreempted) or has a signal delivered to an diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 969145fe1..e7fa44e2c 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "waiter_list", @@ -14,21 +14,10 @@ go_template_instance( }, ) -go_stateify( - name = "semaphore_state", - srcs = [ - "semaphore.go", - "waiter_list.go", - ], - out = "semaphore_autogen_state.go", - package = "semaphore", -) - go_library( name = "semaphore", srcs = [ "semaphore.go", - "semaphore_autogen_state.go", "waiter_list.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore", @@ -40,8 +29,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", - "//pkg/state", - "//pkg/state/statefile", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index a1ee83ce5..aa07946cf 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -42,6 +42,8 @@ const ( ) // Registry maintains a set of semaphores that can be found by key or ID. +// +// +stateify savable type Registry struct { // userNS owning the ipc name this registry belongs to. Immutable. userNS *auth.UserNamespace @@ -52,6 +54,8 @@ type Registry struct { } // Set represents a set of semaphores that can be operated atomically. +// +// +stateify savable type Set struct { // registry owning this sem set. Immutable. 
registry *Registry @@ -79,6 +83,8 @@ type Set struct { } // sem represents a single semanphore from a set. +// +// +stateify savable type sem struct { value int16 waiters waiterList `state:"zerovalue"` @@ -86,6 +92,8 @@ type sem struct { // waiter represents a caller that is waiting for the semaphore value to // become positive or zero. +// +// +stateify savable type waiter struct { waiterEntry diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index fa4c7b8f6..cf4e18805 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -27,6 +27,8 @@ type SessionID ThreadID type ProcessGroupID ThreadID // Session contains a leader threadgroup and a list of ProcessGroups. +// +// +stateify savable type Session struct { refs refs.AtomicRefCount @@ -76,6 +78,8 @@ func (s *Session) decRef() { } // ProcessGroup contains an originator threadgroup and a parent Session. +// +// +stateify savable type ProcessGroup struct { refs refs.AtomicRefCount // not exported. 
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 0f88eb0ac..40e641355 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,22 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "shm_state", - srcs = [ - "shm.go", - ], - out = "shm_autogen_state.go", - package = "shm", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "shm", srcs = [ "device.go", "shm.go", - "shm_autogen_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm", visibility = ["//pkg/sentry:internal"], @@ -33,7 +23,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 7217e8103..1ac444094 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -72,6 +72,8 @@ const ( // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. +// +// +stateify savable type Registry struct { // userNS owns the IPC namespace this registry belong to. Immutable. userNS *auth.UserNamespace @@ -288,6 +290,8 @@ func (r *Registry) remove(s *Shm) { // shmctl(SHM_RMID). // // Shm implements memmap.Mappable and memmap.MappingIdentity. +// +// +stateify savable type Shm struct { // AtomicRefCount tracks the number of references to this segment from // maps. A segment always holds a reference to itself, until it's marked for diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 21ba4ee70..3649f5e4d 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -22,6 +22,8 @@ import ( ) // SignalHandlers holds information about signal actions. 
+// +// +stateify savable type SignalHandlers struct { // mu protects actions, as well as the signal state of all tasks and thread // groups using this SignalHandlers object. (See comment on diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index e20fa3eb6..4c7811b6c 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -176,6 +176,8 @@ type Stracer interface { // SyscallTable is a lookup table of system calls. Critically, a SyscallTable // is *immutable*. In order to make supporting suspend and resume sane, they // must be uniquely registered and may not change during operation. +// +// +stateify savable type SyscallTable struct { // OS is the operating system that this syscall table implements. OS abi.OS `state:"wait"` diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 31541749e..125312b6a 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -23,6 +23,8 @@ import ( // syslog represents a sentry-global kernel log. // // Currently, it contains only fun messages for a dmesg easter egg. +// +// +stateify savable type syslog struct { // mu protects the below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 7f6735320..ae9b3d175 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -52,6 +52,8 @@ import ( // All fields that are "exclusive to the task goroutine" can only be accessed // by the task goroutine while it is running. The task goroutine does not // require synchronization to read or write these fields. 
+// +// +stateify savable type Task struct { taskNode diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index a61283267..38f7826e2 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -349,6 +349,7 @@ func (t *Task) unstopVforkParent() { } } +// +stateify savable type runSyscallAfterPtraceEventClone struct { vforkChild *Task @@ -366,6 +367,7 @@ func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { return (*runSyscallExit)(nil) } +// +stateify savable type runSyscallAfterVforkStop struct { // childTID has the same meaning as // runSyscallAfterPtraceEventClone.vforkChildTID. @@ -471,6 +473,8 @@ func (t *Task) Unshare(opts *SharingOptions) error { // current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so // that the child and parent share mappings until the child execve()s into a // new process image or exits.) +// +// +stateify savable type vforkStop struct{} // StopIgnoresKill implements TaskStop.Killable. diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 5c563ba08..9a59cbd33 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -35,6 +35,8 @@ var ErrNoSyscalls = errors.New("no syscall table found") type Auxmap map[string]interface{} // TaskContext is the subset of a task's data that is provided by the loader. +// +// +stateify savable type TaskContext struct { // Name is the thread name set by the prctl(PR_SET_NAME) system call. Name string diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 2285847a2..385299b24 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -73,6 +73,8 @@ import ( // execStop is a TaskStop that a task sets on itself when it wants to execve // and is waiting for the other tasks in its thread group to exit first. +// +// +stateify savable type execStop struct{} // Killable implements TaskStop.Killable. 
@@ -119,6 +121,8 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { // The runSyscallAfterExecStop state continues execve(2) after all siblings of // a thread in the execve syscall have exited. +// +// +stateify savable type runSyscallAfterExecStop struct { tc *TaskContext } diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index d6604f37b..b16844e91 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -38,6 +38,8 @@ import ( // An ExitStatus is a value communicated from an exiting task or thread group // to the party that reaps it. +// +// +stateify savable type ExitStatus struct { // Code is the numeric value passed to the call to exit or exit_group that // caused the exit. If the exit was not caused by such a call, Code is 0. @@ -222,6 +224,8 @@ func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { } // runExit is the entry point into the task exit path. +// +// +stateify savable type runExit struct{} func (*runExit) execute(t *Task) taskRunState { @@ -229,6 +233,7 @@ func (*runExit) execute(t *Task) taskRunState { return (*runExitMain)(nil) } +// +stateify savable type runExitMain struct{} func (*runExitMain) execute(t *Task) taskRunState { @@ -531,6 +536,7 @@ func (t *Task) reparentLocked(parent *Task) { // tracer (if one exists) and reaps the leader immediately. In Linux, this is // in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). +// +stateify savable type runExitNotify struct{} func (*runExitNotify) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go index 4ca25664a..0832bf989 100644 --- a/pkg/sentry/kernel/task_resources.go +++ b/pkg/sentry/kernel/task_resources.go @@ -21,6 +21,8 @@ import ( // TaskResources is the subset of a task's data provided by its creator that is // not provided by the loader. 
+// +// +stateify savable type TaskResources struct { // SignalMask is the set of signals whose delivery is currently blocked. // diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index a03fa6ac0..8dd0ef6ea 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -131,6 +131,8 @@ func (t *Task) doStop() { // The runApp state checks for interrupts before executing untrusted // application code. +// +// +stateify savable type runApp struct{} func (*runApp) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index b50139077..49141ab74 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -65,6 +65,8 @@ const ( // TaskGoroutineSchedInfo contains task goroutine scheduling state which must // be read and updated atomically. +// +// +stateify savable type TaskGoroutineSchedInfo struct { // Timestamp was the value of Kernel.cpuClock when this // TaskGoroutineSchedInfo was last updated. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 91f6c0874..62ec530be 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -748,6 +748,8 @@ func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { // groupStop is a TaskStop placed on tasks that have received a stop signal // (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from // the ptrace man page.) +// +// +stateify savable type groupStop struct{} // Killable implements TaskStop.Killable. @@ -881,6 +883,8 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { } // The runInterrupt state handles conditions indicated by interrupts. 
+// +// +stateify savable type runInterrupt struct{} func (*runInterrupt) execute(t *Task) taskRunState { @@ -1020,6 +1024,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { return (*runApp)(nil) } +// +stateify savable type runInterruptAfterSignalDeliveryStop struct{} func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 79f4ff60c..3b9652504 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -241,6 +241,7 @@ func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRun return t.doSyscallInvoke(sysno, args) } +// +stateify savable type runSyscallAfterSyscallEnterStop struct{} func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { @@ -260,6 +261,7 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { return t.doSyscallInvoke(sysno, args) } +// +stateify savable type runSyscallAfterSysemuStop struct{} func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { @@ -294,6 +296,7 @@ func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRu return (*runSyscallExit)(nil).execute(t) } +// +stateify savable type runSyscallReinvoke struct{} func (*runSyscallReinvoke) execute(t *Task) taskRunState { @@ -310,6 +313,7 @@ func (*runSyscallReinvoke) execute(t *Task) taskRunState { return t.doSyscallInvoke(sysno, args) } +// +stateify savable type runSyscallExit struct{} func (*runSyscallExit) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 8fffd3446..441b8a822 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -28,6 +28,8 @@ import ( // groups" are usually called "processes" in userspace documentation.) // // ThreadGroup is a superset of Linux's struct signal_struct. 
+// +// +stateify savable type ThreadGroup struct { threadGroupNode diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 440da9dad..844213c35 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -50,6 +50,8 @@ func (tid ThreadID) String() string { const InitTID ThreadID = 1 // A TaskSet comprises all tasks in a system. +// +// +stateify savable type TaskSet struct { // mu protects all relationships betweens tasks and thread groups in the // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) @@ -110,6 +112,8 @@ func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { // // N.B. A task is said to be visible in a PID namespace if the PID namespace // contains a thread ID that maps to that task. +// +// +stateify savable type PIDNamespace struct { // owner is the TaskSet that this PID namespace belongs to. The owner // pointer is immutable. @@ -263,6 +267,8 @@ func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { // (threadGroupNode is an anonymous field in ThreadGroup; this is to expose // threadGroupEntry's methods on ThreadGroup to make it implement // threadGroupLinker.) +// +// +stateify savable type threadGroupNode struct { // pidns is the PID namespace containing the thread group and all of its // member tasks. The pidns pointer is immutable. @@ -382,6 +388,8 @@ func (tg *ThreadGroup) ID() ThreadID { // A taskNode defines the relationship between a task and the rest of the // system. The comments on threadGroupNode also apply to taskNode. +// +// +stateify savable type taskNode struct { // tg is the thread group that this task belongs to. The tg pointer is // immutable. 
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index b3ed42aa4..5d8db2273 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,30 +1,18 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "time_state", - srcs = [ - "time.go", - ], - out = "time_state.go", - package = "time", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "time", srcs = [ "context.go", "time.go", - "time_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/log", "//pkg/sentry/context", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index c223c2f19..6eadd2878 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -42,6 +42,8 @@ const ( // // Time may represent time with respect to any clock and may not have any // meaning in the real world. +// +// +stateify savable type Time struct { ns int64 } @@ -286,6 +288,8 @@ type TimerListener interface { } // Setting contains user-controlled mutable Timer properties. +// +// +stateify savable type Setting struct { // Enabled is true if the timer is running. Enabled bool @@ -371,6 +375,8 @@ func (s Setting) advancedTo(now Time) (Setting, uint64) { // // Timers should be created using NewTimer and must be cleaned up by calling // Timer.Destroy when no longer used. +// +// +stateify savable type Timer struct { // clock is the time source. clock is immutable. clock Clock diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 4de8ac13b..df5dbe128 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -25,6 +25,8 @@ import ( ) // Timekeeper manages all of the kernel clocks. 
+// +// +stateify savable type Timekeeper struct { // clocks are the clock sources. // diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go index 03a3310be..534d03d0f 100644 --- a/pkg/sentry/kernel/timer.go +++ b/pkg/sentry/kernel/timer.go @@ -26,6 +26,8 @@ import ( // timekeeperClock is a ktime.Clock that reads time from a // kernel.Timekeeper-managed clock. +// +// +stateify savable type timekeeperClock struct { tk *Timekeeper c sentrytime.ClockID @@ -49,6 +51,8 @@ func (tc *timekeeperClock) Now() ktime.Time { // tgClock is a ktime.Clock that measures the time a thread group has spent // executing. +// +// +stateify savable type tgClock struct { tg *ThreadGroup @@ -155,6 +159,8 @@ func (tc *taskClock) Now() ktime.Time { } // signalNotifier is a ktime.Listener that sends signals to a ThreadGroup. +// +// +stateify savable type signalNotifier struct { tg *ThreadGroup signal linux.Signal @@ -179,6 +185,8 @@ func (s *signalNotifier) Notify(exp uint64) { func (s *signalNotifier) Destroy() {} // TimerManager is a collection of supported process cpu timers. +// +// +stateify savable type TimerManager struct { // Clocks used to drive thread group execution time timers. virtClock *tgClock diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index 58e9b4d1b..7e0fe0d21 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -22,6 +22,8 @@ import ( // UTSNamespace represents a UTS namespace, a holder of two system identifiers: // the hostname and domain name. +// +// +stateify savable type UTSNamespace struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 0bacbea49..971e8bc59 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -52,6 +52,8 @@ type vdsoParams struct { // Everything in the struct is 8 bytes for easy alignment. 
// // It must be kept in sync with params in vdso/vdso_time.cc. +// +// +stateify savable type VDSOParamPage struct { // The parameter page is fr, allocated from platform.Memory(). platform platform.Platform diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 3ce41cacc..90f4395d4 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,22 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "limits_state", - srcs = [ - "limits.go", - ], - out = "limits_state.go", - package = "limits", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "limits", srcs = [ "context.go", "limits.go", - "limits_state.go", "linux.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/limits", @@ -24,7 +14,6 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/sentry/context", - "//pkg/state", ], ) diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index 4230ba958..02c8b60e3 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -47,6 +47,8 @@ const ( const Infinity = ^uint64(0) // Limit specifies a system limit. +// +// +stateify savable type Limit struct { // Cur specifies the current limit. Cur uint64 @@ -55,6 +57,8 @@ type Limit struct { } // LimitSet represents the Limits that correspond to each LimitType. 
+// +// +stateify savable type LimitSet struct { mu sync.Mutex `state:"nosave"` data map[LimitType]Limit diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index e63052c6d..0beb4561b 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library") go_embed_data( name = "vdso_bin", @@ -10,23 +10,12 @@ go_embed_data( var = "vdsoBin", ) -go_stateify( - name = "loader_state", - srcs = [ - "vdso.go", - "vdso_state.go", - ], - out = "loader_state.go", - package = "loader", -) - go_library( name = "loader", srcs = [ "elf.go", "interpreter.go", "loader.go", - "loader_state.go", "vdso.go", "vdso_state.go", ":vdso_bin", @@ -40,7 +29,6 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/rand", - "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -55,7 +43,6 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 2e8693f8e..a06e27ac9 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -193,6 +193,8 @@ func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) // // NOTE: to support multiple architectures or operating systems, this // would need to contain a VDSO for each. +// +// +stateify savable type VDSO struct { // ParamPage is the VDSO parameter page. This page should be updated to // inform the VDSO for timekeeping data. 
diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go index 92004ad9e..dc71e1c2d 100644 --- a/pkg/sentry/loader/vdso_state.go +++ b/pkg/sentry/loader/vdso_state.go @@ -18,6 +18,7 @@ import ( "debug/elf" ) +// +stateify savable type elfProgHeader struct { Type elf.ProgType Flags elf.ProgFlag diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index 2e367e189..c9e0b95a0 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,18 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "memmap_state", - srcs = [ - "mappable_range.go", - "mapping_set.go", - "mapping_set_impl.go", - ], - out = "memmap_state.go", - package = "memmap", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "mappable_range", @@ -46,7 +35,6 @@ go_library( "mapping_set.go", "mapping_set_impl.go", "memmap.go", - "memmap_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", visibility = ["//pkg/sentry:internal"], @@ -56,7 +44,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/platform", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index 0cd42ffbf..c9483905d 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -35,6 +35,8 @@ import ( type MappingsOfRange map[MappingOfRange]struct{} // MappingOfRange represents a mapping of a MappableRange. 
+// +// +stateify savable type MappingOfRange struct { MappingSpace MappingSpace AddrRange usermem.AddrRange diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 3f396986a..bbdfae247 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,24 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "mm_state", - srcs = [ - "aio_context.go", - "aio_context_state.go", - "file_refcount_set.go", - "io_list.go", - "mm.go", - "pma_set.go", - "save_restore.go", - "special_mappable.go", - "vma_set.go", - ], - out = "mm_state.go", - package = "mm", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "file_refcount_set", @@ -101,7 +84,6 @@ go_library( "lifecycle.go", "metadata.go", "mm.go", - "mm_state.go", "pma.go", "pma_set.go", "proc_pid_maps.go", @@ -131,7 +113,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/buffer", diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 992bde5a5..b42156d45 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -28,6 +28,8 @@ import ( ) // aioManager creates and manages asynchronous I/O contexts. +// +// +stateify savable type aioManager struct { // mu protects below. mu sync.Mutex `state:"nosave"` @@ -89,12 +91,16 @@ func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { } // ioResult is a completed I/O operation. +// +// +stateify savable type ioResult struct { data interface{} ioEntry } // AIOContext is a single asynchronous I/O context. +// +// +stateify savable type AIOContext struct { // done is the notification channel used for all requests. 
done chan struct{} `state:"nosave"` @@ -190,6 +196,8 @@ func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO // ring buffers. +// +// +stateify savable type aioMappable struct { refs.AtomicRefCount diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index ce8097b7f..3299ae164 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -46,6 +46,8 @@ import ( ) // MemoryManager implements a virtual address space. +// +// +stateify savable type MemoryManager struct { // p is the platform. // @@ -207,6 +209,8 @@ type MemoryManager struct { } // vma represents a virtual memory area. +// +// +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is // nil, the vma represents a private anonymous mapping. @@ -346,6 +350,8 @@ func (v *vma) loadRealPerms(b int) { } // pma represents a platform mapping area. +// +// +stateify savable type pma struct { // file is the file mapped by this pma. Only pmas for which file == // platform.Platform.Memory() may be saved. pmas hold a reference to the @@ -380,6 +386,7 @@ type pma struct { internalMappings safemem.BlockSeq `state:"nosave"` } +// +stateify savable type privateRefs struct { mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 9d3614034..aa2f87107 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -28,6 +28,8 @@ import ( // semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except // that SpecialMappable takes ownership of the memory that it represents // (_install_special_mapping() does not.) 
+// +// +stateify savable type SpecialMappable struct { refs.AtomicRefCount diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 15a7fbbc3..af9ba5394 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,16 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "platform_state", - srcs = [ - "file_range.go", - ], - out = "platform_state.go", - package = "platform", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( name = "file_range", @@ -30,7 +21,6 @@ go_library( "file_range.go", "mmap_min_addr.go", "platform.go", - "platform_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform", visibility = ["//pkg/sentry:internal"], @@ -44,7 +34,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD index dadba1d38..2a5982763 100644 --- a/pkg/sentry/platform/filemem/BUILD +++ b/pkg/sentry/platform/filemem/BUILD @@ -1,18 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "filemem_autogen_state", - srcs = [ - "filemem.go", - "filemem_state.go", - "usage_set.go", - ], - out = "filemem_autogen_state.go", - package = "filemem", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "usage_set", @@ -38,7 +27,6 @@ go_library( name = "filemem", srcs = [ "filemem.go", - "filemem_autogen_state.go", "filemem_state.go", "filemem_unsafe.go", "usage_set.go", diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index 870274ae1..feb020ef8 100644 --- 
a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -155,6 +155,8 @@ type FileMem struct { } // usage tracks usage information. +// +// +stateify savable type usageInfo struct { // kind is the usage kind. kind usage.MemoryKind diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 929787aa0..a320fca0b 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,22 +1,10 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "socket_state", - srcs = [ - "socket.go", - ], - out = "socket_state_autogen.go", - package = "socket", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "socket", - srcs = [ - "socket.go", - "socket_state_autogen.go", - ], + srcs = ["socket.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket", visibility = ["//pkg/sentry:internal"], deps = [ @@ -29,7 +17,6 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index faf2b4c27..c4874fdfb 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -1,26 +1,14 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "control_state", - srcs = [ - "control.go", - ], - out = "control_state.go", - imports = [ - "gvisor.googlesource.com/gvisor/pkg/sentry/fs", - ], - package = "control", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "control", - srcs = [ - "control.go", - "control_state.go", - ], + srcs = ["control.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control", + imports = [ + "gvisor.googlesource.com/gvisor/pkg/sentry/fs", + ], visibility = 
["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -31,7 +19,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/kdefs", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", ], diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 17ecdd11c..c31182e69 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -51,6 +51,8 @@ type SCMRights interface { // RightsFiles represents a SCM_RIGHTS socket control message. A reference is // maintained for each fs.File and is release either when an FD is created or // when the Release method is called. +// +// +stateify savable type RightsFiles []*fs.File // NewSCMRights creates a new SCM_RIGHTS socket control message representation @@ -128,6 +130,8 @@ func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte) []by } // scmCredentials represents an SCM_CREDENTIALS socket control message. +// +// +stateify savable type scmCredentials struct { t *kernel.Task kuid auth.KUID diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 7ad5e88c5..49af8db85 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -1,24 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "epsocket_state", - srcs = [ - "epsocket.go", - "save_restore.go", - "stack.go", - ], - out = "epsocket_state.go", - package = "epsocket", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "epsocket", srcs = [ "device.go", "epsocket.go", - "epsocket_state.go", "provider.go", "save_restore.go", "stack.go", @@ -31,7 +19,6 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/log", - "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", @@ -44,7 +31,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", 
"//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index a2927e1b9..f969a1d7c 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -95,6 +95,8 @@ type commonEndpoint interface { // SocketOperations encapsulates all the state needed to represent a network stack // endpoint in the kernel context. +// +// +stateify savable type SocketOperations struct { socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index ec1d96ccb..12b4b4767 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -26,6 +26,8 @@ import ( ) // Stack implements inet.Stack for netstack/tcpip/stack.Stack. +// +// +stateify savable type Stack struct { Stack *stack.Stack `state:"manual"` } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 227ca3926..d623718b3 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,24 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "hostinet_state", - srcs = [ - "save_restore.go", - "socket.go", - "stack.go", - ], - out = "hostinet_autogen_state.go", - package = "hostinet", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "hostinet", srcs = [ "device.go", "hostinet.go", - "hostinet_autogen_state.go", "save_restore.go", "socket.go", "socket_unsafe.go", @@ -42,7 +30,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index b23a243f7..b852165f7 100644 --- a/pkg/sentry/socket/netlink/BUILD 
+++ b/pkg/sentry/socket/netlink/BUILD @@ -1,21 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "netlink_state", - srcs = [ - "socket.go", - ], - out = "netlink_state.go", - package = "netlink", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "netlink", srcs = [ "message.go", - "netlink_state.go", "provider.go", "socket.go", ], @@ -36,7 +26,6 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index ba6f686e4..3a7dbc5ed 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,23 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "port_state", - srcs = ["port.go"], - out = "port_state.go", - package = "port", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "port", - srcs = [ - "port.go", - "port_state.go", - ], + srcs = ["port.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port", visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/state"], ) go_test( diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index 4ccf0b84c..1c5d4c3a5 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -32,6 +32,8 @@ import ( const maxPorts = 10000 // Manager allocates netlink port IDs. +// +// +stateify savable type Manager struct { // mu protects the fields below. 
mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 726469fc9..e1bcfe252 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,32 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "route_state", - srcs = ["protocol.go"], - out = "route_state.go", - package = "route", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "route", - srcs = [ - "protocol.go", - "route_state.go", - ], + srcs = ["protocol.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/sentry/context", - "//pkg/sentry/fs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/socket/netlink", - "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", ], ) diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index e8030c518..55a76e916 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -43,6 +43,8 @@ func typeKind(typ uint16) commandKind { } // Protocol implements netlink.Protocol. +// +// +stateify savable type Protocol struct{} var _ netlink.Protocol = (*Protocol)(nil) diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 0b8f528d0..e15d1546c 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -51,6 +51,8 @@ var netlinkSocketDevice = device.NewAnonDevice() // to/from the kernel. // // Socket implements socket.Socket. 
+// +// +stateify savable type Socket struct { socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index bd4858a34..54fe64595 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -195,6 +195,8 @@ func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { // // Care must be taken when copying ReceiveTimeout as it contains atomic // variables. +// +// +stateify savable type ReceiveTimeout struct { // ns is length of the timeout in nanoseconds. // diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 7d04d6b6b..9fe681e9a 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,15 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "unix_state", - srcs = [ - "unix.go", - ], - out = "unix_state.go", - package = "unix", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "unix", @@ -17,7 +8,6 @@ go_library( "device.go", "io.go", "unix.go", - "unix_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix", visibility = ["//pkg/sentry:internal"], @@ -37,7 +27,6 @@ go_library( "//pkg/sentry/socket/control", "//pkg/sentry/socket/epsocket", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 27bacbbc3..5b6411f97 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -42,6 +42,8 @@ import ( // SocketOperations is a Unix socket. It is similar to an epsocket, except it is backed // by a unix.Endpoint instead of a tcpip.Endpoint. 
+// +// +stateify savable type SocketOperations struct { refs.AtomicRefCount socket.ReceiveTimeout diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 574621ad2..e4450a093 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,18 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "linux_state", - srcs = [ - "sys_aio.go", - "sys_futex.go", - "sys_poll.go", - "sys_time.go", - ], - out = "linux_state.go", - package = "linux", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "linux", @@ -20,7 +8,6 @@ go_library( "error.go", "flags.go", "linux64.go", - "linux_state.go", "sigset.go", "sys_aio.go", "sys_capability.go", @@ -66,7 +53,6 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/bpf", - "//pkg/eventchannel", "//pkg/log", "//pkg/metric", "//pkg/rand", @@ -74,7 +60,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", - "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/kernel", @@ -85,7 +70,6 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", - "//pkg/sentry/kernel/semaphore", "//pkg/sentry/kernel/shm", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", @@ -97,8 +81,6 @@ go_library( "//pkg/sentry/syscalls", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", - "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index fc3397081..54e4afa9e 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -69,6 +69,8 @@ type ioCallback struct { } // ioEvent describes an I/O result. 
+// +// +stateify savable type ioEvent struct { Data uint64 Obj uint64 diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 57762d058..1a0e1f5fb 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -132,6 +132,8 @@ func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) { // futexWaitRestartBlock encapsulates the state required to restart futex(2) // via restart_syscall(2). +// +// +stateify savable type futexWaitRestartBlock struct { duration time.Duration diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index d4dbfd285..b9bdefadb 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -274,6 +274,8 @@ func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Du // pollRestartBlock encapsulates the state required to restart poll(2) via // restart_syscall(2). +// +// +stateify savable type pollRestartBlock struct { pfdAddr usermem.Addr nfds uint diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index dcee694b2..8e6683444 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -168,6 +168,8 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // clockNanosleepRestartBlock encapsulates the state required to restart // clock_nanosleep(2) via restart_syscall(2). 
+// +// +stateify savable type clockNanosleepRestartBlock struct { c ktime.Clock duration time.Duration diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index edee44d96..868dfd400 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -1,17 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "usage_state", - srcs = [ - "cpu.go", - "io.go", - "memory.go", - ], - out = "usage_state.go", - package = "usage", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "usage", @@ -21,7 +10,6 @@ go_library( "memory.go", "memory_unsafe.go", "usage.go", - "usage_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usage", visibility = [ @@ -29,9 +17,6 @@ go_library( ], deps = [ "//pkg/bits", - "//pkg/log", "//pkg/sentry/memutil", - "//pkg/state", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/usage/cpu.go b/pkg/sentry/usage/cpu.go index 1c2cc90e1..ed7b04b9e 100644 --- a/pkg/sentry/usage/cpu.go +++ b/pkg/sentry/usage/cpu.go @@ -20,6 +20,8 @@ import ( // CPUStats contains the subset of struct rusage fields that relate to CPU // scheduling. +// +// +stateify savable type CPUStats struct { // UserTime is the amount of time spent executing application code. UserTime time.Duration diff --git a/pkg/sentry/usage/io.go b/pkg/sentry/usage/io.go index a05053c32..49faa507d 100644 --- a/pkg/sentry/usage/io.go +++ b/pkg/sentry/usage/io.go @@ -19,6 +19,8 @@ import ( ) // IO contains I/O-related statistics. +// +// +stateify savable type IO struct { // CharsRead is the number of bytes read by read syscalls. 
CharsRead uint64 diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index 9dd1cd2b5..69ba919e0 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,19 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "usermem_state", - srcs = [ - "access_type.go", - "addr.go", - "addr_range.go", - "addr_range_seq_unsafe.go", - ], - out = "usermem_state.go", - package = "usermem", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "addr_range", @@ -36,7 +24,6 @@ go_library( "bytes_io.go", "bytes_io_unsafe.go", "usermem.go", - "usermem_state.go", "usermem_x86.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", @@ -47,7 +34,6 @@ go_library( "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/safemem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip/buffer", ], diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go index 7eabecf30..75346d854 100644 --- a/pkg/sentry/usermem/access_type.go +++ b/pkg/sentry/usermem/access_type.go @@ -20,6 +20,8 @@ import ( // AccessType specifies memory access types. This is used for // setting mapping permissions, as well as communicating faults. +// +// +stateify savable type AccessType struct { // Read is read access. Read bool diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go index d175fdc74..fc94bee80 100644 --- a/pkg/sentry/usermem/addr.go +++ b/pkg/sentry/usermem/addr.go @@ -19,6 +19,8 @@ import ( ) // Addr represents a generic virtual address. +// +// +stateify savable type Addr uintptr // AddLength adds the given length to start and returns the result. 
ok is true diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index 391d801d0..5153bd3b4 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -1,26 +1,13 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tcpip_state", - srcs = [ - "tcpip.go", - ], - out = "tcpip_state.go", - package = "tcpip", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tcpip", - srcs = [ - "tcpip.go", - "tcpip_state.go", - ], + srcs = ["tcpip.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip", visibility = ["//visibility:public"], deps = [ - "//pkg/state", "//pkg/tcpip/buffer", "//pkg/waiter", ], diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index efeb6a448..11a725423 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -1,26 +1,15 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "buffer_state", - srcs = [ - "view.go", - ], - out = "buffer_state.go", - package = "buffer", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "buffer", srcs = [ - "buffer_state.go", "prependable.go", "view.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer", visibility = ["//visibility:public"], - deps = ["//pkg/state"], ) go_test( diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index a5774a327..bbb4e1d24 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -54,6 +54,8 @@ func (v *View) ToVectorisedView(views [1]View) VectorisedView { // VectorisedView is a vectorised version of View using non contigous memory. // It supports all the convenience methods supported by View. 
+// +// +stateify savable type VectorisedView struct { views []View size int diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index 3aa2cfb24..8f22ba3a5 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -1,15 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tcp_header_state", - srcs = [ - "tcp.go", - ], - out = "tcp_header_state.go", - package = "header", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "header", @@ -25,13 +16,11 @@ go_library( "ipv6.go", "ipv6_fragment.go", "tcp.go", - "tcp_header_state.go", "udp.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/header", visibility = ["//visibility:public"], deps = [ - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/seqnum", ], diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index a95d282b0..6689a6dc5 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -120,6 +120,8 @@ type TCPSynOptions struct { } // SACKBlock represents a single contiguous SACK block. +// +// +stateify savable type SACKBlock struct { // Start indicates the lowest sequence number in the block. Start seqnum.Value @@ -131,6 +133,8 @@ type SACKBlock struct { // TCPOptions are used to parse and cache the TCP segment options for a non // syn/syn-ack segment. +// +// +stateify savable type TCPOptions struct { // TS is true if the TimeStamp option is enabled. 
TS bool diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index ac97ebe43..83b4d253f 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -1,14 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "fragmentation_state", - srcs = ["reassembler_list.go"], - out = "fragmentation_state.go", - package = "fragmentation", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "reassembler_list", @@ -26,7 +19,6 @@ go_library( srcs = [ "frag_heap.go", "fragmentation.go", - "fragmentation_state.go", "reassembler.go", "reassembler_list.go", ], @@ -34,7 +26,6 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", - "//pkg/state", "//pkg/tcpip/buffer", ], ) diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD index a75869dac..c5c889239 100644 --- a/pkg/tcpip/seqnum/BUILD +++ b/pkg/tcpip/seqnum/BUILD @@ -1,25 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "seqnum_state", - srcs = [ - "seqnum.go", - ], - out = "seqnum_state.go", - package = "seqnum", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "seqnum", - srcs = [ - "seqnum.go", - "seqnum_state.go", - ], + srcs = ["seqnum.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum", visibility = [ "//visibility:public", ], - deps = ["//pkg/state"], ) diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index eb1e4645d..af0aec85c 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -213,6 +213,8 @@ const ( // FullAddress represents a full transport node address, as required by the // Connect() and Bind() methods. 
+// +// +stateify savable type FullAddress struct { // NIC is the ID of the NIC this address refers to. // @@ -256,6 +258,8 @@ func (s SlicePayload) Size() int { } // A ControlMessages contains socket control messages for IP sockets. +// +// +stateify savable type ControlMessages struct { // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD index 28e3e1700..117532fea 100644 --- a/pkg/tcpip/transport/ping/BUILD +++ b/pkg/tcpip/transport/ping/BUILD @@ -1,19 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "ping_state", - srcs = [ - "endpoint.go", - "endpoint_state.go", - "ping_packet_list.go", - ], - out = "ping_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], - package = "ping", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( name = "ping_packet_list", @@ -32,14 +20,13 @@ go_library( "endpoint.go", "endpoint_state.go", "ping_packet_list.go", - "ping_state.go", "protocol.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/sleep", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index f15e44b61..a22684de9 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -26,6 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) +// +stateify savable type pingPacket struct { pingPacketEntry senderAddress tcpip.FullAddress diff --git a/pkg/tcpip/transport/queue/BUILD b/pkg/tcpip/transport/queue/BUILD index fb878ad36..6dcec312e 100644 --- 
a/pkg/tcpip/transport/queue/BUILD +++ b/pkg/tcpip/transport/queue/BUILD @@ -1,27 +1,14 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "queue_state", - srcs = [ - "queue.go", - ], - out = "queue_state.go", - package = "queue", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "queue", - srcs = [ - "queue.go", - "queue_state.go", - ], + srcs = ["queue.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/queue", visibility = ["//:sandbox"], deps = [ "//pkg/ilist", - "//pkg/state", "//pkg/tcpip", "//pkg/waiter", ], diff --git a/pkg/tcpip/transport/queue/queue.go b/pkg/tcpip/transport/queue/queue.go index 6a17441ae..eb9ee8a3f 100644 --- a/pkg/tcpip/transport/queue/queue.go +++ b/pkg/tcpip/transport/queue/queue.go @@ -33,6 +33,8 @@ type Entry interface { } // Queue is a buffer queue. +// +// +stateify savable type Queue struct { ReaderQueue *waiter.Queue WriterQueue *waiter.Queue diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 6a7153e4d..9ebae6cc7 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,27 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tcp_state", - srcs = [ - "endpoint.go", - "endpoint_state.go", - "rcv.go", - "reno.go", - "segment.go", - "segment_heap.go", - "segment_queue.go", - "segment_state.go", - "snd.go", - "snd_state.go", - "tcp_segment_list.go", - ], - out = "tcp_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], - package = "tcp", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "tcp_segment_list", @@ -53,15 +33,14 @@ go_library( "snd.go", "snd_state.go", "tcp_segment_list.go", - "tcp_state.go", "timer.go", ], 
importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/rand", "//pkg/sleep", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 5b8a1e20f..de1883d84 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -54,6 +54,8 @@ const ( ) // SACKInfo holds TCP SACK related information for a given endpoint. +// +// +stateify savable type SACKInfo struct { // Blocks is the maximum number of SACK blocks we track // per endpoint. @@ -69,6 +71,8 @@ type SACKInfo struct { // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. The protocol implementation, however, runs in a single // goroutine. +// +// +stateify savable type endpoint struct { // workMu is used to arbitrate which goroutine may perform protocol // work. Only the main protocol goroutine is expected to call Lock() on diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index b22a00ce1..92ef9c6f7 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -22,6 +22,8 @@ import ( // receiver holds the state necessary to receive TCP segments and turn them // into a stream of bytes. +// +// +stateify savable type receiver struct { ep *endpoint diff --git a/pkg/tcpip/transport/tcp/reno.go b/pkg/tcpip/transport/tcp/reno.go index 60f170a27..03ae8d747 100644 --- a/pkg/tcpip/transport/tcp/reno.go +++ b/pkg/tcpip/transport/tcp/reno.go @@ -16,6 +16,8 @@ package tcp // renoState stores the variables related to TCP New Reno congestion // control algorithm. 
+// +// +stateify savable type renoState struct { s *sender } diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 40928ba2c..8dccea2ba 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -36,6 +36,8 @@ const ( // segment represents a TCP segment. It holds the payload and parsed TCP segment // information, and can be added to intrusive lists. // segment is mostly immutable, the only field allowed to change is viewToDeliver. +// +// +stateify savable type segment struct { segmentEntry refCnt int32 diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index 2ddcf5f10..6a2d7bc0b 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -21,6 +21,8 @@ import ( ) // segmentQueue is a bounded, thread-safe queue of TCP segments. +// +// +stateify savable type segmentQueue struct { mu sync.Mutex `state:"nosave"` list segmentList `state:"wait"` diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index e38686e1b..376e81846 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -54,6 +54,8 @@ type congestionControl interface { } // sender holds the state necessary to send TCP segments. +// +// +stateify savable type sender struct { ep *endpoint @@ -133,6 +135,8 @@ type sender struct { } // fastRecovery holds information related to fast recovery from a packet loss. +// +// +stateify savable type fastRecovery struct { // active whether the endpoint is in fast recovery. The following fields // are only meaningful when active is true. 
diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go index 33c8867f4..d536839af 100644 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -18,6 +18,7 @@ import ( "time" ) +// +stateify savable type unixTime struct { second int64 nano int64 diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 790dd55a3..1a3a62d3d 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -1,19 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "udp_state", - srcs = [ - "endpoint.go", - "endpoint_state.go", - "udp_packet_list.go", - ], - out = "udp_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], - package = "udp", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "udp_packet_list", @@ -33,13 +21,12 @@ go_library( "endpoint_state.go", "protocol.go", "udp_packet_list.go", - "udp_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/sleep", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 2a32c3a87..03fb76f92 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) +// +stateify savable type udpPacket struct { udpPacketEntry senderAddress tcpip.FullAddress @@ -49,6 +50,8 @@ const ( // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. 
+// +// +stateify savable type endpoint struct { // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. diff --git a/pkg/tcpip/transport/unix/BUILD b/pkg/tcpip/transport/unix/BUILD index 676f2cf92..dae0bd079 100644 --- a/pkg/tcpip/transport/unix/BUILD +++ b/pkg/tcpip/transport/unix/BUILD @@ -1,17 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "unix_state", - srcs = [ - "connectioned.go", - "connectionless.go", - "unix.go", - ], - out = "unix_state.go", - package = "unix", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "unix", @@ -20,14 +9,11 @@ go_library( "connectioned_state.go", "connectionless.go", "unix.go", - "unix_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix", visibility = ["//:sandbox"], deps = [ "//pkg/ilist", - "//pkg/log", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/transport/queue", diff --git a/pkg/tcpip/transport/unix/connectioned.go b/pkg/tcpip/transport/unix/connectioned.go index 0e63186b2..dd7c03cf1 100644 --- a/pkg/tcpip/transport/unix/connectioned.go +++ b/pkg/tcpip/transport/unix/connectioned.go @@ -85,6 +85,8 @@ type ConnectingEndpoint interface { // path != "" && acceptedChan != nil => bound and listening. // // Only one of these will be true at any moment. +// +// +stateify savable type connectionedEndpoint struct { baseEndpoint diff --git a/pkg/tcpip/transport/unix/connectionless.go b/pkg/tcpip/transport/unix/connectionless.go index 3276ddcd0..2a6ec8b4b 100644 --- a/pkg/tcpip/transport/unix/connectionless.go +++ b/pkg/tcpip/transport/unix/connectionless.go @@ -25,6 +25,8 @@ import ( // // Specifically, this means datagram unix sockets not created with // socketpair(2). 
+// +// +stateify savable type connectionlessEndpoint struct { baseEndpoint } diff --git a/pkg/tcpip/transport/unix/unix.go b/pkg/tcpip/transport/unix/unix.go index 190a1ccdb..8e4af3139 100644 --- a/pkg/tcpip/transport/unix/unix.go +++ b/pkg/tcpip/transport/unix/unix.go @@ -60,6 +60,8 @@ type CredentialsControlMessage interface { } // A ControlMessages represents a collection of socket control messages. +// +// +stateify savable type ControlMessages struct { // Rights is a control message containing FDs. Rights RightsControlMessage @@ -235,6 +237,8 @@ type BoundEndpoint interface { } // message represents a message passed over a Unix domain socket. +// +// +stateify savable type message struct { ilist.Entry @@ -306,6 +310,8 @@ type Receiver interface { } // queueReceiver implements Receiver for datagram sockets. +// +// +stateify savable type queueReceiver struct { readQueue *queue.Queue } @@ -369,6 +375,8 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 { func (*queueReceiver) Release() {} // streamQueueReceiver implements Receiver for stream sockets. +// +// +stateify savable type streamQueueReceiver struct { queueReceiver @@ -579,6 +587,7 @@ type ConnectedEndpoint interface { Release() } +// +stateify savable type connectedEndpoint struct { // endpoint represents the subset of the Endpoint functionality needed by // the connectedEndpoint. It is implemented by both connectionedEndpoint @@ -671,6 +680,8 @@ func (*connectedEndpoint) Release() {} // unix domain socket Endpoint implementations. // // Not to be used on its own. 
+// +// +stateify savable type baseEndpoint struct { *waiter.Queue diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 8256acdb4..5e611c54f 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,28 +1,13 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "waiter_state", - srcs = [ - "waiter.go", - ], - out = "waiter_state.go", - package = "waiter", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "waiter", - srcs = [ - "waiter.go", - "waiter_state.go", - ], + srcs = ["waiter.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/waiter", visibility = ["//visibility:public"], - deps = [ - "//pkg/ilist", - "//pkg/state", - ], + deps = ["//pkg/ilist"], ) go_test( diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 9b189bb9e..9825880ca 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -157,6 +157,8 @@ func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) { // notifiers can notify them when events happen. // // The zero value for waiter.Queue is an empty queue ready for use. 
+// +// +stateify savable type Queue struct { list ilist.List `state:"zerovalue"` mu sync.RWMutex `state:"nosave"` -- cgit v1.2.3 From 60add78980737a7330100d98bf6a214892dee3c0 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 1 Aug 2018 19:56:12 -0700 Subject: Automated rollback of changelist 207007153 PiperOrigin-RevId: 207037226 Change-Id: I8b5f1a056d4f3eab17846f2e0193bb737ecb5428 --- pkg/abi/BUILD | 13 +++++- pkg/abi/linux/BUILD | 16 ++++++- pkg/abi/linux/bpf.go | 2 - pkg/abi/linux/tty.go | 2 - pkg/bpf/BUILD | 17 +++++++- pkg/bpf/interpreter.go | 2 - pkg/cpuid/BUILD | 15 ++++++- pkg/cpuid/cpuid.go | 2 - pkg/ilist/BUILD | 15 ++++++- pkg/ilist/list.go | 4 -- pkg/segment/range.go | 2 - pkg/segment/set.go | 5 --- pkg/sentry/arch/BUILD | 18 +++++++- pkg/sentry/arch/arch.go | 2 - pkg/sentry/arch/arch_amd64.go | 2 - pkg/sentry/arch/arch_state_x86.go | 1 - pkg/sentry/arch/arch_x86.go | 2 - pkg/sentry/arch/auxv.go | 2 - pkg/sentry/arch/signal_amd64.go | 6 --- pkg/sentry/context/contexttest/BUILD | 17 +++++++- pkg/sentry/fs/BUILD | 36 +++++++++++++++- pkg/sentry/fs/ashmem/BUILD | 17 +++++++- pkg/sentry/fs/ashmem/area.go | 8 ++-- pkg/sentry/fs/ashmem/device.go | 22 +++++----- pkg/sentry/fs/ashmem/pin_board.go | 2 - pkg/sentry/fs/attr.go | 12 ------ pkg/sentry/fs/binder/BUILD | 13 +++++- pkg/sentry/fs/binder/binder.go | 26 +++++------- pkg/sentry/fs/dentry.go | 4 -- pkg/sentry/fs/dev/BUILD | 20 ++++++++- pkg/sentry/fs/dev/dev.go | 2 - pkg/sentry/fs/dev/fs.go | 2 - pkg/sentry/fs/dev/full.go | 2 - pkg/sentry/fs/dev/null.go | 3 -- pkg/sentry/fs/dev/random.go | 1 - pkg/sentry/fs/dirent.go | 2 - pkg/sentry/fs/dirent_cache.go | 2 - pkg/sentry/fs/fdpipe/BUILD | 31 +++++++++++++- pkg/sentry/fs/fdpipe/pipe.go | 2 - pkg/sentry/fs/file.go | 2 - pkg/sentry/fs/file_overlay.go | 4 -- pkg/sentry/fs/filesystems.go | 2 - pkg/sentry/fs/filetest/BUILD | 18 +++++++- pkg/sentry/fs/flags.go | 2 - pkg/sentry/fs/fsutil/BUILD | 20 ++++++++- pkg/sentry/fs/fsutil/dirty_set.go | 2 - 
pkg/sentry/fs/fsutil/handle.go | 2 - pkg/sentry/fs/fsutil/host_file_mapper.go | 2 - pkg/sentry/fs/fsutil/inode.go | 6 --- pkg/sentry/fs/fsutil/inode_cached.go | 2 - pkg/sentry/fs/gofer/BUILD | 23 +++++++++- pkg/sentry/fs/gofer/file.go | 2 - pkg/sentry/fs/gofer/fs.go | 2 - pkg/sentry/fs/gofer/inode.go | 4 -- pkg/sentry/fs/gofer/session.go | 3 -- pkg/sentry/fs/host/BUILD | 27 +++++++++++- pkg/sentry/fs/host/descriptor.go | 2 - pkg/sentry/fs/host/file.go | 2 - pkg/sentry/fs/host/fs.go | 6 +-- pkg/sentry/fs/host/inode.go | 4 -- pkg/sentry/fs/inode.go | 4 -- pkg/sentry/fs/inode_inotify.go | 2 - pkg/sentry/fs/inotify.go | 2 - pkg/sentry/fs/inotify_event.go | 2 - pkg/sentry/fs/inotify_watch.go | 2 - pkg/sentry/fs/lock/BUILD | 15 ++++++- pkg/sentry/fs/lock/lock.go | 6 +-- pkg/sentry/fs/mount.go | 4 -- pkg/sentry/fs/mount_overlay.go | 4 -- pkg/sentry/fs/mounts.go | 2 - pkg/sentry/fs/overlay.go | 2 - pkg/sentry/fs/proc/BUILD | 34 ++++++++++++++- pkg/sentry/fs/proc/cpuinfo.go | 2 - pkg/sentry/fs/proc/exec_args.go | 2 - pkg/sentry/fs/proc/fds.go | 6 --- pkg/sentry/fs/proc/file.go | 1 - pkg/sentry/fs/proc/filesystems.go | 2 - pkg/sentry/fs/proc/fs.go | 2 - pkg/sentry/fs/proc/loadavg.go | 2 - pkg/sentry/fs/proc/meminfo.go | 2 - pkg/sentry/fs/proc/mounts.go | 4 -- pkg/sentry/fs/proc/proc.go | 4 -- pkg/sentry/fs/proc/seqfile/BUILD | 30 +++++++++++-- pkg/sentry/fs/proc/seqfile/seqfile.go | 4 -- pkg/sentry/fs/proc/stat.go | 2 - pkg/sentry/fs/proc/sys.go | 5 --- pkg/sentry/fs/proc/sys_net.go | 2 - pkg/sentry/fs/proc/task.go | 20 --------- pkg/sentry/fs/proc/uid_gid_map.go | 3 -- pkg/sentry/fs/proc/uptime.go | 2 - pkg/sentry/fs/proc/version.go | 2 - pkg/sentry/fs/ramfs/BUILD | 21 ++++++++- pkg/sentry/fs/ramfs/dir.go | 2 - pkg/sentry/fs/ramfs/ramfs.go | 2 - pkg/sentry/fs/ramfs/socket.go | 2 - pkg/sentry/fs/ramfs/symlink.go | 2 - pkg/sentry/fs/ramfs/test/BUILD | 18 +++++++- pkg/sentry/fs/sys/BUILD | 14 +++++- pkg/sentry/fs/sys/fs.go | 2 - pkg/sentry/fs/sys/sys.go | 5 +-- 
pkg/sentry/fs/timerfd/BUILD | 18 +++++++- pkg/sentry/fs/timerfd/timerfd.go | 4 +- pkg/sentry/fs/tmpfs/BUILD | 17 +++++++- pkg/sentry/fs/tmpfs/file_regular.go | 2 - pkg/sentry/fs/tmpfs/fs.go | 2 - pkg/sentry/fs/tmpfs/inode_file.go | 2 - pkg/sentry/fs/tmpfs/tmpfs.go | 8 ---- pkg/sentry/fs/tty/BUILD | 20 ++++++++- pkg/sentry/fs/tty/dir.go | 18 +++----- pkg/sentry/fs/tty/fs.go | 4 -- pkg/sentry/fs/tty/inode.go | 2 - pkg/sentry/fs/tty/line_discipline.go | 6 --- pkg/sentry/fs/tty/master.go | 4 -- pkg/sentry/fs/tty/queue.go | 4 +- pkg/sentry/fs/tty/slave.go | 4 -- pkg/sentry/fs/tty/terminal.go | 2 - pkg/sentry/inet/BUILD | 15 ++++++- pkg/sentry/inet/inet.go | 2 - pkg/sentry/kernel/BUILD | 59 +++++++++++++++++++++++--- pkg/sentry/kernel/abstract_socket_namespace.go | 3 -- pkg/sentry/kernel/auth/BUILD | 17 +++++++- pkg/sentry/kernel/auth/credentials.go | 2 - pkg/sentry/kernel/auth/id_map.go | 2 - pkg/sentry/kernel/auth/user_namespace.go | 2 - pkg/sentry/kernel/epoll/BUILD | 15 ++++++- pkg/sentry/kernel/epoll/epoll.go | 8 +--- pkg/sentry/kernel/eventfd/BUILD | 18 +++++++- pkg/sentry/kernel/eventfd/eventfd.go | 4 +- pkg/sentry/kernel/fd_map.go | 6 --- pkg/sentry/kernel/fs_context.go | 2 - pkg/sentry/kernel/futex/BUILD | 18 +++++++- pkg/sentry/kernel/futex/futex.go | 2 - pkg/sentry/kernel/ipc_namespace.go | 2 - pkg/sentry/kernel/kernel.go | 4 +- pkg/sentry/kernel/pending_signals.go | 5 --- pkg/sentry/kernel/pipe/BUILD | 20 ++++++++- pkg/sentry/kernel/pipe/buffers.go | 2 - pkg/sentry/kernel/pipe/node.go | 2 - pkg/sentry/kernel/pipe/pipe.go | 2 - pkg/sentry/kernel/pipe/reader.go | 2 - pkg/sentry/kernel/pipe/reader_writer.go | 2 - pkg/sentry/kernel/pipe/writer.go | 2 - pkg/sentry/kernel/ptrace.go | 4 -- pkg/sentry/kernel/rseq.go | 2 - pkg/sentry/kernel/semaphore/BUILD | 15 ++++++- pkg/sentry/kernel/semaphore/semaphore.go | 8 ---- pkg/sentry/kernel/sessions.go | 4 -- pkg/sentry/kernel/shm/BUILD | 13 +++++- pkg/sentry/kernel/shm/shm.go | 4 -- pkg/sentry/kernel/signal_handlers.go | 
2 - pkg/sentry/kernel/syscalls.go | 2 - pkg/sentry/kernel/syslog.go | 2 - pkg/sentry/kernel/task.go | 2 - pkg/sentry/kernel/task_clone.go | 4 -- pkg/sentry/kernel/task_context.go | 2 - pkg/sentry/kernel/task_exec.go | 4 -- pkg/sentry/kernel/task_exit.go | 6 --- pkg/sentry/kernel/task_resources.go | 2 - pkg/sentry/kernel/task_run.go | 2 - pkg/sentry/kernel/task_sched.go | 2 - pkg/sentry/kernel/task_signals.go | 5 --- pkg/sentry/kernel/task_syscall.go | 4 -- pkg/sentry/kernel/thread_group.go | 2 - pkg/sentry/kernel/threads.go | 8 ---- pkg/sentry/kernel/time/BUILD | 14 +++++- pkg/sentry/kernel/time/time.go | 6 --- pkg/sentry/kernel/timekeeper.go | 2 - pkg/sentry/kernel/timer.go | 8 ---- pkg/sentry/kernel/uts_namespace.go | 2 - pkg/sentry/kernel/vdso.go | 2 - pkg/sentry/limits/BUILD | 13 +++++- pkg/sentry/limits/limits.go | 4 -- pkg/sentry/loader/BUILD | 15 ++++++- pkg/sentry/loader/vdso.go | 2 - pkg/sentry/loader/vdso_state.go | 1 - pkg/sentry/memmap/BUILD | 15 ++++++- pkg/sentry/memmap/mapping_set.go | 2 - pkg/sentry/mm/BUILD | 21 ++++++++- pkg/sentry/mm/aio_context.go | 8 ---- pkg/sentry/mm/mm.go | 7 --- pkg/sentry/mm/special_mappable.go | 2 - pkg/sentry/platform/BUILD | 13 +++++- pkg/sentry/platform/filemem/BUILD | 14 +++++- pkg/sentry/platform/filemem/filemem.go | 2 - pkg/sentry/socket/BUILD | 17 +++++++- pkg/sentry/socket/control/BUILD | 23 +++++++--- pkg/sentry/socket/control/control.go | 4 -- pkg/sentry/socket/epsocket/BUILD | 16 ++++++- pkg/sentry/socket/epsocket/epsocket.go | 2 - pkg/sentry/socket/epsocket/stack.go | 2 - pkg/sentry/socket/hostinet/BUILD | 15 ++++++- pkg/sentry/socket/netlink/BUILD | 13 +++++- pkg/sentry/socket/netlink/port/BUILD | 15 ++++++- pkg/sentry/socket/netlink/port/port.go | 2 - pkg/sentry/socket/netlink/route/BUILD | 17 +++++++- pkg/sentry/socket/netlink/route/protocol.go | 2 - pkg/sentry/socket/netlink/socket.go | 2 - pkg/sentry/socket/socket.go | 2 - pkg/sentry/socket/unix/BUILD | 13 +++++- pkg/sentry/socket/unix/unix.go | 2 - 
pkg/sentry/syscalls/linux/BUILD | 20 ++++++++- pkg/sentry/syscalls/linux/sys_aio.go | 2 - pkg/sentry/syscalls/linux/sys_futex.go | 2 - pkg/sentry/syscalls/linux/sys_poll.go | 2 - pkg/sentry/syscalls/linux/sys_time.go | 2 - pkg/sentry/usage/BUILD | 17 +++++++- pkg/sentry/usage/cpu.go | 2 - pkg/sentry/usage/io.go | 2 - pkg/sentry/usermem/BUILD | 16 ++++++- pkg/sentry/usermem/access_type.go | 2 - pkg/sentry/usermem/addr.go | 2 - pkg/tcpip/BUILD | 17 +++++++- pkg/tcpip/buffer/BUILD | 13 +++++- pkg/tcpip/buffer/view.go | 2 - pkg/tcpip/header/BUILD | 13 +++++- pkg/tcpip/header/tcp.go | 4 -- pkg/tcpip/network/fragmentation/BUILD | 11 ++++- pkg/tcpip/seqnum/BUILD | 17 +++++++- pkg/tcpip/tcpip.go | 4 -- pkg/tcpip/transport/ping/BUILD | 17 +++++++- pkg/tcpip/transport/ping/endpoint.go | 1 - pkg/tcpip/transport/queue/BUILD | 17 +++++++- pkg/tcpip/transport/queue/queue.go | 2 - pkg/tcpip/transport/tcp/BUILD | 25 ++++++++++- pkg/tcpip/transport/tcp/endpoint.go | 4 -- pkg/tcpip/transport/tcp/rcv.go | 2 - pkg/tcpip/transport/tcp/reno.go | 2 - pkg/tcpip/transport/tcp/segment.go | 2 - pkg/tcpip/transport/tcp/segment_queue.go | 2 - pkg/tcpip/transport/tcp/snd.go | 4 -- pkg/tcpip/transport/tcp/snd_state.go | 1 - pkg/tcpip/transport/udp/BUILD | 17 +++++++- pkg/tcpip/transport/udp/endpoint.go | 3 -- pkg/tcpip/transport/unix/BUILD | 16 ++++++- pkg/tcpip/transport/unix/connectioned.go | 2 - pkg/tcpip/transport/unix/connectionless.go | 2 - pkg/tcpip/transport/unix/unix.go | 11 ----- pkg/waiter/BUILD | 21 +++++++-- pkg/waiter/waiter.go | 2 - 239 files changed, 1108 insertions(+), 662 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index c014d2c4b..f1e6bac67 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -1,13 +1,24 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "abi_state", + srcs = [ + "abi.go", + ], + out = 
"abi_state.go", + package = "abi", +) go_library( name = "abi", srcs = [ "abi.go", + "abi_state.go", "flag.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/abi", visibility = ["//:sandbox"], + deps = ["//pkg/state"], ) diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index ac4ceefbc..38b4829c9 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -4,7 +4,19 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "linux_state", + srcs = [ + "binder.go", + "bpf.go", + "time.go", + "tty.go", + ], + out = "linux_state.go", + package = "linux", +) go_library( name = "linux", @@ -29,6 +41,7 @@ go_library( "ipc.go", "limits.go", "linux.go", + "linux_state.go", "mm.go", "netdevice.go", "netlink.go", @@ -54,5 +67,6 @@ go_library( "//pkg/abi", "//pkg/binary", "//pkg/bits", + "//pkg/state", ], ) diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go index 80e5b1af1..f597ef4f5 100644 --- a/pkg/abi/linux/bpf.go +++ b/pkg/abi/linux/bpf.go @@ -15,8 +15,6 @@ package linux // BPFInstruction is a raw BPF virtual machine instruction. -// -// +stateify savable type BPFInstruction struct { // OpCode is the operation to execute. OpCode uint16 diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index b640f7627..84b6ccc87 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -38,8 +38,6 @@ type Termios struct { // KernelTermios is struct ktermios/struct termios2, defined in // uapi/asm-generic/termbits.h. 
-// -// +stateify savable type KernelTermios struct { InputFlags uint32 OutputFlags uint32 diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD index 564df3af5..403270049 100644 --- a/pkg/bpf/BUILD +++ b/pkg/bpf/BUILD @@ -1,11 +1,21 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "bpf_state", + srcs = [ + "interpreter.go", + ], + out = "bpf_state.go", + package = "bpf", +) go_library( name = "bpf", srcs = [ "bpf.go", + "bpf_state.go", "decoder.go", "input_bytes.go", "interpreter.go", @@ -13,7 +23,10 @@ go_library( ], importpath = "gvisor.googlesource.com/gvisor/pkg/bpf", visibility = ["//visibility:public"], - deps = ["//pkg/abi/linux"], + deps = [ + "//pkg/abi/linux", + "//pkg/state", + ], ) go_test( diff --git a/pkg/bpf/interpreter.go b/pkg/bpf/interpreter.go index 111ada9d1..b7dee86a8 100644 --- a/pkg/bpf/interpreter.go +++ b/pkg/bpf/interpreter.go @@ -88,8 +88,6 @@ func (e Error) Error() string { } // Program is a BPF program that has been validated for consistency. 
-// -// +stateify savable type Program struct { instructions []linux.BPFInstruction } diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index 46fc4703b..9a0ca1b33 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -1,16 +1,27 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "cpuid_state", + srcs = ["cpuid.go"], + out = "cpuid_state.go", + package = "cpuid", +) go_library( name = "cpuid", srcs = [ "cpu_amd64.s", "cpuid.go", + "cpuid_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/cpuid", visibility = ["//:sandbox"], - deps = ["//pkg/log"], + deps = [ + "//pkg/log", + "//pkg/state", + ], ) go_test( diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index e91e34dc7..b486ab037 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -409,8 +409,6 @@ func (f Feature) flagString(cpuinfoOnly bool) string { } // FeatureSet is a set of Features for a cpu. -// -// +stateify savable type FeatureSet struct { // Set is the set of features that are enabled in this FeatureSet. 
Set map[Feature]bool diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index b26a39132..e32f26ffa 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,15 +1,28 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "list_state", + srcs = [ + "interface_list.go", + ], + out = "interface_list_state.go", + package = "ilist", +) go_library( name = "ilist", srcs = [ "interface_list.go", + "interface_list_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/ilist", visibility = ["//visibility:public"], + deps = [ + "//pkg/state", + ], ) go_template_instance( diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index a88b82196..5efb6c072 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -36,8 +36,6 @@ type Linker interface { // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } -// -// +stateify savable type List struct { head Linker tail Linker @@ -157,8 +155,6 @@ func (l *List) Remove(e Linker) { // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. -// -// +stateify savable type Entry struct { next Linker prev Linker diff --git a/pkg/segment/range.go b/pkg/segment/range.go index 34c067265..5ff30d489 100644 --- a/pkg/segment/range.go +++ b/pkg/segment/range.go @@ -18,8 +18,6 @@ package segment type T uint64 // A Range represents a contiguous range of T. -// -// +stateify savable type Range struct { // Start is the inclusive start of the range. 
Start T diff --git a/pkg/segment/set.go b/pkg/segment/set.go index cffec2a2c..6eed1d930 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -88,8 +88,6 @@ const ( // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. -// -// +stateify savable type Set struct { root node `state:".(*SegmentDataSlices)"` } @@ -598,7 +596,6 @@ func (s *Set) ApplyContiguous(r Range, fn func(seg Iterator)) GapIterator { } } -// +stateify savable type node struct { // An internal binary tree node looks like: // @@ -1320,8 +1317,6 @@ func (n *node) writeDebugString(buf *bytes.Buffer, prefix string) { // SegmentDataSlices represents segments from a set as slices of start, end, and // values. SegmentDataSlices is primarily used as an intermediate representation // for save/restore and the layout here is optimized for that. -// -// +stateify savable type SegmentDataSlices struct { Start []Key End []Key diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 314b3e962..0a2a35400 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,7 +1,21 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "arch_state", + srcs = [ + "arch.go", + "arch_amd64.go", + "arch_state_x86.go", + "arch_x86.go", + "auxv.go", + "signal_amd64.go", + ], + out = "arch_state.go", + package = "arch", +) go_library( name = "arch", @@ -10,6 +24,7 @@ go_library( "arch.go", "arch_amd64.go", "arch_amd64.s", + "arch_state.go", "arch_state_x86.go", "arch_x86.go", "auxv.go", @@ -31,6 +46,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/limits", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/arch/arch.go 
b/pkg/sentry/arch/arch.go index 21cb84502..0189e958d 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -254,8 +254,6 @@ const ( // MemoryManager. // // Note that "highest address" below is always exclusive. -// -// +stateify savable type MmapLayout struct { // MinAddr is the lowest mappable address. MinAddr usermem.Addr diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index f1e408af9..23526fe8e 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -95,8 +95,6 @@ const ( ) // context64 represents an AMD64 context. -// -// +stateify savable type context64 struct { State sigFPState []x86FPState // fpstate to be restored on sigreturn. diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index e9c23a06b..cb38d098a 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -56,7 +56,6 @@ func (s *State) afterLoad() { copy(s.x86FPState, old) } -// +stateify savable type syscallPtraceRegs struct { R15 uint64 R14 uint64 diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index b35eec53c..5cc4f8377 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -153,8 +153,6 @@ func NewFloatingPointData() *FloatingPointData { // State contains the common architecture bits for X86 (the build tag of this // file ensures it's only built on x86). -// -// +stateify savable type State struct { // The system registers. Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 81cfb4a01..70e0e35b7 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -19,8 +19,6 @@ import ( ) // An AuxEntry represents an entry in an ELF auxiliary vector. 
-// -// +stateify savable type AuxEntry struct { Key uint64 Value usermem.Addr diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index e81717e8b..c1d743f38 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -28,8 +28,6 @@ import ( // SignalAct represents the action that should be taken when a signal is // delivered, and is equivalent to struct sigaction on 64-bit x86. -// -// +stateify savable type SignalAct struct { Handler uint64 Flags uint64 @@ -49,8 +47,6 @@ func (s *SignalAct) DeserializeTo(other *SignalAct) { // SignalStack represents information about a user stack, and is equivalent to // stack_t on 64-bit x86. -// -// +stateify savable type SignalStack struct { Addr uint64 Flags uint32 @@ -70,8 +66,6 @@ func (s *SignalStack) DeserializeTo(other *SignalStack) { // SignalInfo represents information about a signal being delivered, and is // equivalent to struct siginfo on 64-bit x86. -// -// +stateify savable type SignalInfo struct { Signo int32 // Signal number Errno int32 // Errno value diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 01bb40b04..591b11a4d 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,11 +1,23 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "contexttest_state", + srcs = [ + "contexttest.go", + ], + out = "contexttest_state.go", + package = "contexttest", +) go_library( name = "contexttest", testonly = 1, - srcs = ["contexttest.go"], + srcs = [ + "contexttest.go", + "contexttest_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ @@ -16,5 +28,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/platform/ptrace", "//pkg/sentry/uniqueid", + 
"//pkg/state", ], ) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 18cd5ae8e..e3c9a9b70 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,7 +1,40 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "fs_state", + srcs = [ + "attr.go", + "dentry.go", + "dirent.go", + "dirent_cache.go", + "dirent_list.go", + "dirent_state.go", + "file.go", + "file_overlay.go", + "file_state.go", + "filesystems.go", + "flags.go", + "inode.go", + "inode_inotify.go", + "inode_operations.go", + "inode_overlay.go", + "inotify.go", + "inotify_event.go", + "inotify_watch.go", + "mock.go", + "mount.go", + "mount_overlay.go", + "mount_state.go", + "mounts.go", + "overlay.go", + "path.go", + ], + out = "fs_state.go", + package = "fs", +) go_library( name = "fs", @@ -21,6 +54,7 @@ go_library( "filesystems.go", "flags.go", "fs.go", + "fs_state.go", "inode.go", "inode_inotify.go", "inode_operations.go", diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index dc893d22f..9f166799a 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -1,12 +1,26 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") +go_stateify( + name = "ashmem_state", + srcs = [ + "area.go", + "device.go", + "pin_board.go", + "uint64_range.go", + "uint64_set.go", + ], + out = "ashmem_state.go", + package = "ashmem", +) + go_library( name = "ashmem", srcs = [ "area.go", + "ashmem_state.go", "device.go", "pin_board.go", "uint64_range.go", @@ -27,6 +41,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", 
"//pkg/syserror", "//pkg/tcpip/transport/unix", ], diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index bfd7f2762..e4f76f0d0 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -39,12 +39,10 @@ const ( ) // Area implements fs.FileOperations. -// -// +stateify savable type Area struct { - fsutil.NoFsync `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync + fsutil.DeprecatedFileOperations + fsutil.NotDirReaddir ad *Device diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index d0986fa11..c5b51d4a7 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -27,19 +27,17 @@ import ( ) // Device implements fs.InodeOperations. -// -// +stateify savable type Device struct { - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + fsutil.DeprecatedFileOperations + fsutil.InodeNoExtendedAttributes + fsutil.InodeNotDirectory + fsutil.InodeNotRenameable + fsutil.InodeNotSocket + fsutil.InodeNotSymlink + fsutil.NoFsync + fsutil.NoMappable + fsutil.NoopWriteOut + fsutil.NotDirReaddir mu sync.Mutex `state:"nosave"` unstable fs.UnstableAttr diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go index ecba395a0..c7fb3822c 100644 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ b/pkg/sentry/fs/ashmem/pin_board.go @@ -56,8 +56,6 @@ func (setFunctions) Split(Range, noValue, uint64) (noValue, noValue) { // segment.Set is used for implementation where segments represent // ranges of pinned bytes, while gaps represent ranges 
of unpinned // bytes. All ranges are page-aligned. -// -// +stateify savable type PinBoard struct { Set } diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 4178f18b2..56a2ad6f7 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -91,8 +91,6 @@ func (n InodeType) String() string { // StableAttr contains Inode attributes that will be stable throughout the // lifetime of the Inode. -// -// +stateify savable type StableAttr struct { // Type is the InodeType of a InodeOperations. Type InodeType @@ -152,8 +150,6 @@ func IsCharDevice(s StableAttr) bool { // UnstableAttr contains Inode attributes that may change over the lifetime // of the Inode. -// -// +stateify savable type UnstableAttr struct { // Size is the file size in bytes. Size int64 @@ -190,8 +186,6 @@ func WithCurrentTime(ctx context.Context, u UnstableAttr) UnstableAttr { } // AttrMask contains fields to mask StableAttr and UnstableAttr. -// -// +stateify savable type AttrMask struct { Type bool DeviceID bool @@ -233,8 +227,6 @@ func (a AttrMask) Union(b AttrMask) AttrMask { } // PermMask are file access permissions. -// -// +stateify savable type PermMask struct { // Read indicates reading is permitted. Read bool @@ -288,8 +280,6 @@ func (p PermMask) SupersetOf(other PermMask) bool { // FilePermissions represents the permissions of a file, with // Read/Write/Execute bits for user, group, and other. -// -// +stateify savable type FilePermissions struct { User PermMask Group PermMask @@ -380,8 +370,6 @@ func (f FilePermissions) AnyRead() bool { } // FileOwner represents ownership of a file. 
-// -// +stateify savable type FileOwner struct { UID auth.KUID GID auth.KGID diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index a077b91d2..ec3928baf 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -1,16 +1,25 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "binder_state", + srcs = ["binder.go"], + out = "binder_state.go", + package = "binder", +) go_library( name = "binder", srcs = [ "binder.go", + "binder_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/binder", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -21,6 +30,8 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", + "//pkg/tcpip/transport/unix", ], ) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 502a262dd..3f87b6b08 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -40,17 +40,15 @@ const ( ) // Device implements fs.InodeOperations. -// -// +stateify savable type Device struct { - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeNoExtendedAttributes + fsutil.InodeNotDirectory + fsutil.InodeNotRenameable + fsutil.InodeNotSocket + fsutil.InodeNotSymlink + fsutil.NoMappable + fsutil.NoopWriteOut + fsutil.DeprecatedFileOperations // mu protects unstable. 
mu sync.Mutex `state:"nosave"` @@ -188,12 +186,10 @@ func (bd *Device) StatFS(context.Context) (fs.Info, error) { } // Proc implements fs.FileOperations and fs.IoctlGetter. -// -// +stateify savable type Proc struct { - fsutil.NoFsync `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync + fsutil.DeprecatedFileOperations + fsutil.NotDirReaddir bd *Device task *kernel.Task diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index b347468ff..d42e8da81 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -21,8 +21,6 @@ import ( ) // DentAttr is the metadata of a directory entry. It is a subset of StableAttr. -// -// +stateify savable type DentAttr struct { // Type is the InodeType of an Inode. Type InodeType @@ -156,8 +154,6 @@ func GenericReaddir(ctx *DirCtx, s *SortedDentryMap) (int, error) { } // SortedDentryMap is a sorted map of names and fs.DentAttr entries. -// -// +stateify savable type SortedDentryMap struct { // names is always kept in sorted-order. 
names []string diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index fc069bb5f..ea41615fd 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,11 +1,25 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "dev_state", + srcs = [ + "dev.go", + "fs.go", + "full.go", + "null.go", + "random.go", + ], + out = "dev_state.go", + package = "dev", +) go_library( name = "dev", srcs = [ "dev.go", + "dev_state.go", "device.go", "fs.go", "full.go", @@ -16,6 +30,8 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/log", "//pkg/rand", "//pkg/sentry/context", "//pkg/sentry/device", @@ -29,7 +45,9 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/platform", "//pkg/sentry/safemem", + "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 3f4f2a40a..36c61bfc2 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -27,8 +27,6 @@ import ( ) // Dev is the root node. -// -// +stateify savable type Dev struct { ramfs.Dir } diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 2ae49be4e..3c79f3782 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -29,8 +29,6 @@ const binderEnabledKey = "binder_enabled" const ashmemEnabledKey = "ashmem_enabled" // filesystem is a devtmpfs. -// -// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 492b8eb3a..e13eb6c03 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -26,8 +26,6 @@ import ( ) // fullDevice is used to implement /dev/full. 
-// -// +stateify savable type fullDevice struct { ramfs.Entry } diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 2977c8670..66b8ba967 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -29,7 +29,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// +stateify savable type nullDevice struct { ramfs.Entry } @@ -55,7 +54,6 @@ func (n *nullDevice) Truncate(context.Context, *fs.Inode, int64) error { return nil } -// +stateify savable type zeroDevice struct { nullDevice } @@ -82,7 +80,6 @@ func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F }), nil } -// +stateify savable type zeroFileOperations struct { fs.FileOperations } diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 47b76218f..33a045a05 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -24,7 +24,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// +stateify savable type randomDevice struct { ramfs.Entry } diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4658d044f..f9bf2fba6 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -81,8 +81,6 @@ var renameMu sync.RWMutex // // Dirents currently do not attempt to free entries that lack application references under // memory pressure. -// -// +stateify savable type Dirent struct { // AtomicRefCount is our reference count. refs.AtomicRefCount diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index c680e4828..e786e4f65 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -25,8 +25,6 @@ import ( // // A nil DirentCache corresponds to a cache with size 0. All methods can be // called, but nothing is actually cached. -// -// +stateify savable type DirentCache struct { // Maximum size of the cache. This must be saved manually, to handle the case // when cache is nil. 
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index ffe4204bc..4fcb06f1f 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -1,27 +1,54 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "pipe_state", + srcs = [ + "pipe.go", + "pipe_state.go", + ], + out = "pipe_autogen_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/fs"], + package = "fdpipe", +) go_library( name = "fdpipe", srcs = [ "pipe.go", + "pipe_autogen_state.go", "pipe_opener.go", "pipe_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe", - imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/fs"], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", + "//pkg/amutex", "//pkg/fd", "//pkg/log", + "//pkg/metric", + "//pkg/p9", + "//pkg/refs", "//pkg/secio", "//pkg/sentry/context", + "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", "//pkg/sentry/safemem", + "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/unet", "//pkg/waiter", "//pkg/waiter/fdnotifier", ], diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 2e34604e6..7b318e35f 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -34,8 +34,6 @@ import ( ) // pipeOperations are the fs.FileOperations of a host pipe. 
-// -// +stateify savable type pipeOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 8e535a618..6d93ef760 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -47,8 +47,6 @@ const FileMaxOffset = math.MaxInt64 // and write(2). // // FIXME: Split synchronization from cancellation. -// -// +stateify savable type File struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 113962368..36b2cf75e 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -60,8 +60,6 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err } // overlayFileOperations implements FileOperations for a file in an overlay. -// -// +stateify savable type overlayFileOperations struct { // upperMu protects upper below. In contrast lower is stable. upperMu sync.Mutex `state:"nosave"` @@ -377,8 +375,6 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { // overlayMappingIdentity wraps a MappingIdentity, and also holds a reference // on a file during its lifetime. -// -// +stateify savable type overlayMappingIdentity struct { refs.AtomicRefCount id memmap.MappingIdentity diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index 5a1e7a270..200e792f4 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -125,8 +125,6 @@ func GetFilesystems() []Filesystem { } // MountSourceFlags represents all mount option flags as a struct. -// -// +stateify savable type MountSourceFlags struct { // ReadOnly corresponds to mount(2)'s "MS_RDONLY" and indicates that // the filesystem should be mounted read-only. 
diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index d137fee4c..f481c57fb 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,20 +1,34 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "filetest_state", + srcs = [ + "filetest.go", + ], + out = "filetest_state.go", + package = "filetest", +) go_library( name = "filetest", testonly = 1, - srcs = ["filetest.go"], + srcs = [ + "filetest.go", + "filetest_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index 1aa271560..da0ff58af 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -19,8 +19,6 @@ import ( ) // FileFlags encodes file flags. -// -// +stateify savable type FileFlags struct { // Direct indicates that I/O should be done directly. 
Direct bool diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 3512bae6f..6eea64298 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,7 +1,24 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "fsutil_state", + srcs = [ + "dirty_set_impl.go", + "file.go", + "file_range_set_impl.go", + "frame_ref_set_impl.go", + "handle.go", + "host_file_mapper.go", + "host_file_mapper_state.go", + "inode.go", + "inode_cached.go", + ], + out = "fsutil_state.go", + package = "fsutil", +) go_template_instance( name = "dirty_set_impl", @@ -67,6 +84,7 @@ go_library( "frame_ref_set.go", "frame_ref_set_impl.go", "fsutil.go", + "fsutil_state.go", "handle.go", "host_file_mapper.go", "host_file_mapper_state.go", diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 8e31e48fd..9c6c98542 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -32,8 +32,6 @@ import ( // DirtyInfo is the value type of DirtySet, and represents information about a // Mappable offset that is dirty (the cached data for that offset is newer than // its source). -// -// +stateify savable type DirtyInfo struct { // Keep is true if the represented offset is concurrently writable, such // that writing the data for that offset back to the source does not diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go index e7efd3c0f..149c0f84a 100644 --- a/pkg/sentry/fs/fsutil/handle.go +++ b/pkg/sentry/fs/fsutil/handle.go @@ -27,8 +27,6 @@ import ( // // FIXME: Remove Handle entirely in favor of individual fs.File // implementations using simple generic utilities. 
-// -// +stateify savable type Handle struct { NoopRelease `state:"nosave"` NoIoctl `state:"nosave"` diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 9c1e2f76f..d0a27fc1c 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -29,8 +29,6 @@ import ( // HostFileMapper caches mappings of an arbitrary host file descriptor. It is // used by implementations of memmap.Mappable that represent a host file // descriptor. -// -// +stateify savable type HostFileMapper struct { // HostFile conceptually breaks the file into pieces called chunks, of // size and alignment chunkSize, and caches mappings of the file on a chunk diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 177396fdc..e1ad07df2 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -31,8 +31,6 @@ func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations { } // simpleInodeOperations is a simple implementation of Inode. -// -// +stateify savable type simpleInodeOperations struct { DeprecatedFileOperations `state:"nosave"` InodeNotDirectory `state:"nosave"` @@ -50,8 +48,6 @@ type simpleInodeOperations struct { // InodeSimpleAttributes implements a subset of the Inode interface. It provides // read-only access to attributes. -// -// +stateify savable type InodeSimpleAttributes struct { // FSType is the filesystem type reported by StatFS. FSType uint64 @@ -114,8 +110,6 @@ func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error // // Users need not initialize Xattrs to non-nil (it will be initialized // when the first extended attribute is set. 
-// -// +stateify savable type InMemoryAttributes struct { Unstable fs.UnstableAttr Xattrs map[string][]byte diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 0a320e2d8..cba642a8f 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -55,8 +55,6 @@ import ( // // Implementations of InodeOperations.WriteOut must call Sync to write out // in-memory modifications of data and metadata to the CachedFileObject. -// -// +stateify savable type CachingInodeOperations struct { // backingFile is a handle to a cached file object. backingFile CachedFileObject diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index cb17339c9..1277379e7 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,6 +1,21 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "gofer_state", + srcs = [ + "file.go", + "file_state.go", + "fs.go", + "inode.go", + "inode_state.go", + "session.go", + "session_state.go", + ], + out = "gofer_state.go", + package = "gofer", +) go_library( name = "gofer", @@ -12,6 +27,7 @@ go_library( "file.go", "file_state.go", "fs.go", + "gofer_state.go", "handles.go", "inode.go", "inode_state.go", @@ -25,6 +41,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/amutex", "//pkg/fd", "//pkg/log", "//pkg/metric", @@ -37,11 +54,15 @@ go_library( "//pkg/sentry/fs/fdpipe", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/host", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", + "//pkg/sentry/platform", "//pkg/sentry/safemem", + "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/fs/gofer/file.go 
b/pkg/sentry/fs/gofer/file.go index 46a6bbd5d..039618808 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -33,8 +33,6 @@ import ( var openedWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") // fileOperations implements fs.FileOperations for a remote file system. -// -// +stateify savable type fileOperations struct { fsutil.NoIoctl `state:"nosave"` waiter.AlwaysReady `state:"nosave"` diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 3ae93f059..dd5d43c47 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -83,8 +83,6 @@ var ( ) // filesystem is a 9p client. -// -// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 7fc8f77b0..df584c382 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -35,8 +35,6 @@ import ( ) // inodeOperations implements fs.InodeOperations. -// -// +stateify savable type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` @@ -70,8 +68,6 @@ type inodeOperations struct { // circular load dependency between it and inodeOperations). Even with // lazy loading, this approach defines the dependencies between objects // and the expected load behavior more concretely. -// -// +stateify savable type inodeFileState struct { // s is common file system state for Gofers. s *session `state:"wait"` diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 648a11435..b6841526a 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -27,7 +27,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) -// +stateify savable type endpointMap struct { mu sync.RWMutex `state:"nosave"` // TODO: Make map with private unix sockets savable. 
@@ -64,8 +63,6 @@ func (e *endpointMap) get(key device.MultiDeviceKey) unix.BoundEndpoint { } // session holds state for each 9p session established during sys_mount. -// -// +stateify savable type session struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 29c79284a..23ec66f50 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,6 +1,23 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "host_state", + srcs = [ + "control.go", + "descriptor.go", + "descriptor_state.go", + "file.go", + "fs.go", + "inode.go", + "inode_state.go", + "socket.go", + "socket_state.go", + ], + out = "host_state.go", + package = "host", +) go_library( name = "host", @@ -11,6 +28,7 @@ go_library( "device.go", "file.go", "fs.go", + "host_state.go", "inode.go", "inode_state.go", "ioctl_unsafe.go", @@ -24,6 +42,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/amutex", "//pkg/fd", "//pkg/log", "//pkg/refs", @@ -33,14 +52,20 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", + "//pkg/sentry/platform", "//pkg/sentry/safemem", + "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/link/rawfile", diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 3aee4d11c..613bd06e8 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -25,8 +25,6 @@ import ( ) // descriptor wraps a host fd. 
-// -// +stateify savable type descriptor struct { // donated is true if the host fd was donated by another process. donated bool diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index f9bef6d93..bdf844337 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -37,8 +37,6 @@ import ( ) // fileOperations implements fs.FileOperations for a host file descriptor. -// -// +stateify savable type fileOperations struct { fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index e46ae433c..974700636 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -51,8 +51,6 @@ const maxTraversals = 10 // to lock down the configurations. This filesystem should only be mounted at root. // // Think twice before exposing this to applications. -// -// +stateify savable type Filesystem struct { // whitelist is a set of host paths to whitelist. paths []string @@ -268,10 +266,8 @@ func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, file } // superOperations implements fs.MountSourceOperations. -// -// +stateify savable type superOperations struct { - fs.SimpleMountSourceOperations + fs.SimpleMountSourceOperations `state:"nosave"` // root is the path of the mount point. All inode mappings // are relative to this root. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 761ccde33..226bc5164 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -34,8 +34,6 @@ import ( // inodeOperations implements fs.InodeOperations for an fs.Inodes backed // by a host file descriptor. -// -// +stateify savable type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` @@ -67,8 +65,6 @@ type inodeOperations struct { // circular load dependency between it and inodeOperations). 
Even with // lazy loading, this approach defines the dependencies between objects // and the expected load behavior more concretely. -// -// +stateify savable type inodeFileState struct { // Common file system state. mops *superOperations `state:"wait"` diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index d0dbce5dd..6c8e6f188 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -28,8 +28,6 @@ import ( // Inode is a file system object that can be simultaneously referenced by different // components of the VFS (Dirent, fs.File, etc). -// -// +stateify savable type Inode struct { // AtomicRefCount is our reference count. refs.AtomicRefCount @@ -60,8 +58,6 @@ type Inode struct { // Note that in Linux fcntl(2) and flock(2) locks are _not_ cooperative, because race and // deadlock conditions make merging them prohibitive. We do the same and keep them oblivious // to each other but provide a "context" as a convenient container. -// -// +stateify savable type LockCtx struct { // Posix is a set of POSIX-style regional advisory locks, see fcntl(2). Posix lock.Locks diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index 683140afe..358bbecdf 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -20,8 +20,6 @@ import ( ) // Watches is the collection of inotify watches on an inode. -// -// +stateify savable type Watches struct { // mu protects the fields below. mu sync.RWMutex `state:"nosave"` diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 2aabdded8..6f5e8ce5e 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -34,8 +34,6 @@ import ( // // Lock ordering: // Inotify.mu -> Inode.Watches.mu -> Watch.mu -> Inotify.evMu -// -// +stateify savable type Inotify struct { // Unique identifier for this inotify instance. We don't just reuse the // inotify fd because fds can be duped. 
These should not be exposed to the diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index e9b5e0f56..217915ba4 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -28,8 +28,6 @@ import ( const inotifyEventBaseSize = 16 // Event represents a struct inotify_event from linux. -// -// +stateify savable type Event struct { ilist.Entry diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 3e1959e83..8904ef544 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -27,8 +27,6 @@ import ( // holding an extra ref on each dirent known (by inotify) to point to the // inode. These are known as pins. For a full discussion, see // fs/g3doc/inotify.md. -// -// +stateify savable type Watch struct { // Inotify instance which owns this watch. owner *Inotify diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 3159ff1da..2607d7ed3 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,7 +1,18 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "lock_state", + srcs = [ + "lock.go", + "lock_range.go", + "lock_set.go", + ], + out = "lock_state.go", + package = "lock", +) go_template_instance( name = "lock_range", @@ -38,11 +49,13 @@ go_library( "lock_range.go", "lock_set.go", "lock_set_functions.go", + "lock_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", + "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index e9b376eb6..24d54c989 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -88,8 +88,6 @@ const LockEOF = math.MaxUint64 // // A Lock may 
be downgraded from a write lock to a read lock only if // the write lock's uid is the same as the read lock. -// -// +stateify savable type Lock struct { // Readers are the set of read lock holders identified by UniqueID. // If len(Readers) > 0 then HasWriter must be false. @@ -105,8 +103,6 @@ type Lock struct { } // Locks is a thread-safe wrapper around a LockSet. -// -// +stateify savable type Locks struct { // mu protects locks below. mu sync.Mutex `state:"nosave"` @@ -115,7 +111,7 @@ type Locks struct { locks LockSet // blockedQueue is the queue of waiters that are waiting on a lock. - blockedQueue waiter.Queue `state:"zerovalue"` + blockedQueue waiter.Queue } // Blocker is the interface used for blocking locks. Passing a nil Blocker diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 4ede767f9..eb1897174 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -101,8 +101,6 @@ func (i InodeMappings) String() string { // (e.g. cannot be mounted at different locations). // // TODO: Move mount-specific information out of MountSource. -// -// +stateify savable type MountSource struct { refs.AtomicRefCount @@ -262,8 +260,6 @@ func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *Mo } // SimpleMountSourceOperations implements MountSourceOperations. -// -// +stateify savable type SimpleMountSourceOperations struct { keep bool } diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index d135e8a37..1be81e3a1 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -18,8 +18,6 @@ import "gvisor.googlesource.com/gvisor/pkg/sentry/context" // overlayMountSourceOperations implements MountSourceOperations for an overlay // mount point. -// -// +stateify savable type overlayMountSourceOperations struct { upper *MountSource lower *MountSource @@ -74,8 +72,6 @@ func (o *overlayMountSourceOperations) Destroy() { } // type overlayFilesystem is the filesystem for overlay mounts. 
-// -// +stateify savable type overlayFilesystem struct{} // Name implements Filesystem.Name. diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 144d3427d..87da4ee0e 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -32,8 +32,6 @@ import ( const DefaultTraversalLimit = 10 // MountNamespace defines a collection of mounts. -// -// +stateify savable type MountNamespace struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index af13dc8c7..7357d6401 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -145,8 +145,6 @@ func newOverlayInode(ctx context.Context, o *overlayEntry, msrc *MountSource) *I } // overlayEntry is the overlay metadata of an Inode. It implements Mappable. -// -// +stateify savable type overlayEntry struct { // lowerExists is true if an Inode exists for this file in the lower // filesystem. If lowerExists is true, then the overlay must create diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 2d9f07f2f..870df47b2 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,6 +1,32 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "proc_state", + srcs = [ + "cpuinfo.go", + "exec_args.go", + "fds.go", + "file.go", + "filesystems.go", + "fs.go", + "loadavg.go", + "meminfo.go", + "mounts.go", + "net.go", + "proc.go", + "stat.go", + "sys.go", + "sys_net.go", + "task.go", + "uid_gid_map.go", + "uptime.go", + "version.go", + ], + out = "proc_state.go", + package = "proc", +) go_library( name = "proc", @@ -16,6 +42,7 @@ go_library( "mounts.go", "net.go", "proc.go", + "proc_state.go", "rpcinet_proc.go", "stat.go", "sys.go", @@ -29,6 +56,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/log", + 
"//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/device", @@ -43,6 +73,8 @@ go_library( "//pkg/sentry/socket/rpcinet", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index 4dfec03a4..f80aaa5b1 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -27,8 +27,6 @@ import ( // cpuinfo is a file describing the CPU capabilities. // // Presently cpuinfo never changes, so it doesn't need to be a SeqFile. -// -// +stateify savable type cpuinfo struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index a69cbaa0e..0e1523bf1 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -37,8 +37,6 @@ const ( // execArgFile is a file containing the exec args (either cmdline or environ) // for a given task. -// -// +stateify savable type execArgFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index cca8f874c..194a9c12a 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -138,8 +138,6 @@ func (f *fd) Close() error { } // fdDir implements /proc/TID/fd. -// -// +stateify savable type fdDir struct { ramfs.Dir @@ -199,8 +197,6 @@ func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset } // fdInfo is a single file in /proc/TID/fdinfo/. -// -// +stateify savable type fdInfo struct { ramfs.File @@ -233,8 +229,6 @@ func (*fdInfo) Truncate(ctx context.Context, inode *fs.Inode, size int64) error // fdInfoDir implements /proc/TID/fdinfo. It embeds an fdDir, but overrides // Lookup and Readdir. 
-// -// +stateify savable type fdInfoDir struct { ramfs.Dir diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go index 4b2d08e75..9a433cdf8 100644 --- a/pkg/sentry/fs/proc/file.go +++ b/pkg/sentry/fs/proc/file.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// +stateify savable type file struct { fs.InodeOperations diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index 49b92fd8a..37db9cf9c 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -24,8 +24,6 @@ import ( ) // filesystemsData backs /proc/filesystems. -// -// +stateify savable type filesystemsData struct{} // NeedsUpdate returns true on the first generation. The set of registered file diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 061824b8c..3aadd6ac4 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -22,8 +22,6 @@ import ( ) // filesystem is a procfs. -// -// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 6fac251d2..7583b6ccd 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -23,8 +23,6 @@ import ( ) // loadavgData backs /proc/loadavg. -// -// +stateify savable type loadavgData struct{} // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 53dfd59ef..49cb0faed 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -26,8 +26,6 @@ import ( ) // meminfoData backs /proc/meminfo. -// -// +stateify savable type meminfoData struct { // k is the owning Kernel. 
k *kernel.Kernel diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 2b8167c28..108432f4e 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -71,8 +71,6 @@ func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { } // mountInfoFile is used to implement /proc/[pid]/mountinfo. -// -// +stateify savable type mountInfoFile struct { t *kernel.Task } @@ -154,8 +152,6 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se } // mountsFile is used to implement /proc/[pid]/mountinfo. -// -// +stateify savable type mountsFile struct { t *kernel.Task } diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 07029a7bb..b2a8d639c 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -33,8 +33,6 @@ import ( ) // proc is a root proc node. -// -// +stateify savable type proc struct { ramfs.Dir @@ -49,8 +47,6 @@ type proc struct { // stubProcFSFile is a file type that can be used to return file contents // which are constant. This file is not writable and will always have mode // 0444. 
-// -// +stateify savable type stubProcFSFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 53c475652..c84f7e20d 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,10 +1,22 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "seqfile_state", + srcs = [ + "seqfile.go", + ], + out = "seqfile_state.go", + package = "seqfile", +) go_library( name = "seqfile", - srcs = ["seqfile.go"], + srcs = [ + "seqfile.go", + "seqfile_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile", visibility = ["//pkg/sentry:internal"], deps = [ @@ -14,16 +26,26 @@ go_library( "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/state", ], ) +go_stateify( + name = "seqfile_test_state", + srcs = ["seqfile_test.go"], + out = "seqfile_test_state.go", + package = "seqfile", +) + go_test( name = "seqfile_test", size = "small", - srcs = ["seqfile_test.go"], + srcs = [ + "seqfile_test.go", + "seqfile_test_state.go", + ], embed = [":seqfile"], deps = [ - "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs/test", diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 51cae5e37..c08565f8a 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -30,8 +30,6 @@ import ( type SeqHandle interface{} // SeqData holds the data for one unit in the file. -// -// +stateify savable type SeqData struct { // The data to be returned to the user. Buf []byte @@ -84,8 +82,6 @@ func (s *SeqGenerationCounter) IsCurrent(generation int64) bool { } // SeqFile is used to provide dynamic files that can be ordered by record. 
-// -// +stateify savable type SeqFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index bf7650211..284f3e52b 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -25,8 +25,6 @@ import ( ) // statData backs /proc/stat. -// -// +stateify savable type statData struct { // k is the owning Kernel. k *kernel.Kernel diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index a2d36ca23..aab891c53 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -28,8 +28,6 @@ import ( ) // hostname is a file containing the system hostname. -// -// +stateify savable type hostname struct { ramfs.Entry } @@ -54,8 +52,6 @@ func (p *proc) newHostname(ctx context.Context, msrc *fs.MountSource) *fs.Inode } // mmapMinAddrData backs /proc/sys/vm/mmap_min_addr. -// -// +stateify savable type mmapMinAddrData struct { k *kernel.Kernel } @@ -78,7 +74,6 @@ func (d *mmapMinAddrData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHand }, 0 } -// +stateify savable type overcommitMemory struct{} func (*overcommitMemory) NeedsUpdate(generation int64) bool { diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index beb25be20..f3a5043f8 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -33,7 +33,6 @@ const ( tcpWMem ) -// +stateify savable type tcpMem struct { ramfs.Entry s inet.Stack @@ -101,7 +100,6 @@ func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, return n, cperr } -// +stateify savable type tcpSack struct { ramfs.Entry s inet.Stack diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 748ca4320..efc635946 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -52,8 +52,6 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { } // taskDir represents a task-level directory. 
-// -// +stateify savable type taskDir struct { ramfs.Dir @@ -94,8 +92,6 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace } // subtasks represents a /proc/TID/task directory. -// -// +stateify savable type subtasks struct { ramfs.Dir @@ -171,8 +167,6 @@ func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, off } // exe is an fs.InodeOperations symlink for the /proc/PID/exe file. -// -// +stateify savable type exe struct { ramfs.Symlink @@ -232,8 +226,6 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { // namespaceFile represents a file in the namespacefs, such as the files in // /proc//ns. -// -// +stateify savable type namespaceFile struct { ramfs.Symlink @@ -282,8 +274,6 @@ func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { } // mapsData implements seqfile.SeqSource for /proc/[pid]/maps. -// -// +stateify savable type mapsData struct { t *kernel.Task } @@ -321,7 +311,6 @@ func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([ return []seqfile.SeqData{}, 0 } -// +stateify savable type taskStatData struct { t *kernel.Task @@ -402,8 +391,6 @@ func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) } // statmData implements seqfile.SeqSource for /proc/[pid]/statm. -// -// +stateify savable type statmData struct { t *kernel.Task } @@ -438,8 +425,6 @@ func (s *statmData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([ } // statusData implements seqfile.SeqSource for /proc/[pid]/status. -// -// +stateify savable type statusData struct { t *kernel.Task pidns *kernel.PIDNamespace @@ -505,7 +490,6 @@ type ioUsage interface { IOUsage() *usage.IO } -// +stateify savable type ioData struct { ioUsage } @@ -546,8 +530,6 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se // On Linux, /proc/[pid]/comm is writable, and writing to the comm file changes // the thread name. 
We don't implement this yet as there are no known users of // this feature. -// -// +stateify savable type comm struct { ramfs.Entry @@ -577,8 +559,6 @@ func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, off } // auxvec is a file containing the auxiliary vector for a task. -// -// +stateify savable type auxvec struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 9811d9c9d..85acb5163 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -29,8 +29,6 @@ import ( // An idMapSeqSource is a seqfile.SeqSource that returns UID or GID mappings // from a task's user namespace. -// -// +stateify savable type idMapSeqSource struct { t *kernel.Task gids bool @@ -72,7 +70,6 @@ type idMapSeqHandle struct { value int } -// +stateify savable type idMapSeqFile struct { seqfile.SeqFile } diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index f3a9b81df..4679d5821 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -27,8 +27,6 @@ import ( ) // uptime is a file containing the system uptime. -// -// +stateify savable type uptime struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index 00f6a2afd..c0f2e87e3 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -23,8 +23,6 @@ import ( ) // versionData backs /proc/version. -// -// +stateify savable type versionData struct { // k is the owning Kernel. 
k *kernel.Kernel diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 5230157fe..d84f2c624 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,6 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "ramfs_state", + srcs = [ + "dir.go", + "file.go", + "ramfs.go", + "socket.go", + "symlink.go", + ], + out = "ramfs_state.go", + package = "ramfs", +) go_library( name = "ramfs", @@ -8,6 +21,7 @@ go_library( "dir.go", "file.go", "ramfs.go", + "ramfs_state.go", "socket.go", "symlink.go", "tree.go", @@ -15,8 +29,12 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/amutex", + "//pkg/log", + "//pkg/refs", "//pkg/secio", "//pkg/sentry/context", + "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", @@ -24,6 +42,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/safemem", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 04432f28c..19d5612ed 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -44,8 +44,6 @@ type CreateOps struct { } // Dir represents a single directory in the filesystem. -// -// +stateify savable type Dir struct { Entry diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index 13e72e775..d6cfaf753 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -60,8 +60,6 @@ var ( // Entry represents common internal state for file and directory nodes. // This may be used by other packages to easily create ramfs files. 
-// -// +stateify savable type Entry struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoMappable `state:"nosave"` diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 93427a1ff..b0c79325f 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -21,8 +21,6 @@ import ( ) // Socket represents a socket. -// -// +stateify savable type Socket struct { Entry diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 1c54d9991..9bbf78619 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -22,8 +22,6 @@ import ( ) // Symlink represents a symlink. -// -// +stateify savable type Symlink struct { Entry diff --git a/pkg/sentry/fs/ramfs/test/BUILD b/pkg/sentry/fs/ramfs/test/BUILD index 187eac49d..57fee45e2 100644 --- a/pkg/sentry/fs/ramfs/test/BUILD +++ b/pkg/sentry/fs/ramfs/test/BUILD @@ -1,16 +1,30 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "test_state", + srcs = [ + "test.go", + ], + out = "test_state.go", + package = "test", +) go_library( name = "test", testonly = 1, - srcs = ["test.go"], + srcs = [ + "test.go", + "test_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", + "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", + "//pkg/state", ], ) diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index bc24e980e..095ff1f25 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,6 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "sys_state", + srcs = [ + "fs.go", + "sys.go", + ], + out = "sys_state.go", + package = "sys", 
+) go_library( name = "sys", @@ -8,6 +18,7 @@ go_library( "device.go", "fs.go", "sys.go", + "sys_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys", visibility = ["//pkg/sentry:internal"], @@ -17,5 +28,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", "//pkg/sentry/usermem", + "//pkg/state", ], ) diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 625525540..c6d5f7fd8 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -20,8 +20,6 @@ import ( ) // filesystem is a sysfs. -// -// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index b9b2fb4a1..ccf56f644 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -22,13 +22,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// +stateify savable -type dir struct { +type Dir struct { ramfs.Dir } func newDir(ctx context.Context, msrc *fs.MountSource, contents map[string]*fs.Inode) *fs.Inode { - d := &dir{} + d := &Dir{} d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(d, msrc, fs.StableAttr{ DeviceID: sysfsDevice.DeviceID(), diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index ffdd7e0dc..8b1b7872e 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,19 +1,33 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "timerfd_state", + srcs = [ + "timerfd.go", + ], + out = "timerfd_state.go", + package = "timerfd", +) go_library( name = "timerfd", - srcs = ["timerfd.go"], + srcs = [ + "timerfd.go", + "timerfd_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", 
"//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 767db95a0..ae58f6fd7 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -30,8 +30,6 @@ import ( ) // TimerOperations implements fs.FileOperations for timerfds. -// -// +stateify savable type TimerOperations struct { fsutil.ZeroSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` @@ -40,7 +38,7 @@ type TimerOperations struct { fsutil.NoMMap `state:"nosave"` fsutil.NoIoctl `state:"nosave"` - events waiter.Queue `state:"zerovalue"` + events waiter.Queue `state:"nosave"` timer *ktime.Timer // val is the number of timer expirations since the last successful call to diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index cfe11ab02..473ab4296 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,6 +1,18 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "tmpfs_state", + srcs = [ + "file_regular.go", + "fs.go", + "inode_file.go", + "tmpfs.go", + ], + out = "tmpfs_state.go", + package = "tmpfs", +) go_library( name = "tmpfs", @@ -10,11 +22,13 @@ go_library( "fs.go", "inode_file.go", "tmpfs.go", + "tmpfs_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", @@ -27,6 +41,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/tcpip/transport/unix", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 342688f81..9811d90bc 100644 --- 
a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -25,8 +25,6 @@ import ( // regularFileOperations implements fs.FileOperations for a regular // tmpfs file. -// -// +stateify savable type regularFileOperations struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index ca620e65e..5bd9ade52 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -47,8 +47,6 @@ const ( var modeRegexp = regexp.MustCompile("0[0-7][0-7][0-7]") // Filesystem is a tmpfs. -// -// +stateify savable type Filesystem struct{} func init() { diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 1e4fe47d2..4e803c9ff 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -43,8 +43,6 @@ import ( // include an InvalidatorRegion associated with that reference. When the // referenced portion of the file is removed (with Truncate), the associated // InvalidatorRegion is invalidated. -// -// +stateify savable type fileInodeOperations struct { fsutil.DeprecatedFileOperations `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 10cb5451d..1cc7ae491 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -49,8 +49,6 @@ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent } // Dir is a directory. -// -// +stateify savable type Dir struct { ramfs.Dir @@ -124,8 +122,6 @@ func (*Dir) StatFS(context.Context) (fs.Info, error) { } // Symlink is a symlink. -// -// +stateify savable type Symlink struct { ramfs.Symlink } @@ -153,8 +149,6 @@ func (s *Symlink) StatFS(context.Context) (fs.Info, error) { } // Socket is a socket. 
-// -// +stateify savable type Socket struct { ramfs.Socket } @@ -182,8 +176,6 @@ func (s *Socket) StatFS(context.Context) (fs.Info, error) { } // Fifo is a tmpfs named pipe. -// -// +stateify savable type Fifo struct { ramfs.Entry } diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 3c446eef4..363897b2c 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,6 +1,22 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "tty_state", + srcs = [ + "dir.go", + "fs.go", + "inode.go", + "line_discipline.go", + "master.go", + "queue.go", + "slave.go", + "terminal.go", + ], + out = "tty_state.go", + package = "tty", +) go_library( name = "tty", @@ -13,6 +29,7 @@ go_library( "queue.go", "slave.go", "terminal.go", + "tty_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty", visibility = ["//pkg/sentry:internal"], @@ -27,6 +44,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index c91091db4..2c5b2aed6 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -49,16 +49,14 @@ import ( // corresponding Dirents hold on their parent (this directory). // // dirInodeOperations implements fs.InodeOperations. 
-// -// +stateify savable type dirInodeOperations struct { - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` + fsutil.DeprecatedFileOperations + fsutil.InodeNotSocket + fsutil.InodeNotRenameable + fsutil.InodeNotSymlink + fsutil.InodeNoExtendedAttributes + fsutil.NoMappable + fsutil.NoopWriteOut // msrc is the super block this directory is on. // @@ -350,8 +348,6 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { // // This is nearly identical to fsutil.DirFileOperations, except that it takes // df.di.mu in IterateDir. -// -// +stateify savable type dirFileOperations struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index e28635607..dbaffe95e 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -28,8 +28,6 @@ var ptsDevice = device.NewAnonDevice() // // This devpts is always in the new "multi-instance" mode. i.e., it contains a // ptmx device tied to this mount. -// -// +stateify savable type filesystem struct{} func init() { @@ -71,8 +69,6 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou } // superOperations implements fs.MountSourceOperations, preventing caching. -// -// +stateify savable type superOperations struct{} // Revalidate implements fs.DirentOperations.Revalidate. 
diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go index c0fa2b407..04b9a7727 100644 --- a/pkg/sentry/fs/tty/inode.go +++ b/pkg/sentry/fs/tty/inode.go @@ -31,8 +31,6 @@ import ( // // * fs.InodeOperations.Release // * fs.InodeOperations.GetFile -// -// +stateify savable type inodeOperations struct { fsutil.DeprecatedFileOperations `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index d243ee40e..f094635f5 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -72,8 +72,6 @@ const ( // termiosMu // inQueue.mu // outQueue.mu -// -// +stateify savable type lineDiscipline struct { // inQueue is the input queue of the terminal. inQueue queue @@ -185,8 +183,6 @@ type transformer interface { // outputQueueTransformer implements transformer. It performs line discipline // transformations on the output queue. -// -// +stateify savable type outputQueueTransformer struct{} // transform does output processing for one end of the pty. See @@ -258,8 +254,6 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte // inputQueueTransformer implements transformer. It performs line discipline // transformations on the input queue. -// -// +stateify savable type inputQueueTransformer struct{} // transform does input processing for one end of the pty. Characters read are diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index c7198e218..74cdbe874 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -27,8 +27,6 @@ import ( // masterInodeOperations are the fs.InodeOperations for the master end of the // Terminal (ptmx file). 
-// -// +stateify savable type masterInodeOperations struct { inodeOperations @@ -98,8 +96,6 @@ func (mi *masterInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flag } // masterFileOperations are the fs.FileOperations for the master end of a terminal. -// -// +stateify savable type masterFileOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 42c105abc..026d5e077 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -32,13 +32,11 @@ import ( // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and // readable is true. -// -// +stateify savable type queue struct { // mu protects everything in queue. mu sync.Mutex `state:"nosave"` - waiter.Queue `state:"zerovalue"` + waiter.Queue `state:"nosave"` // readBuf is buffer of data ready to be read when readable is true. // This data has been processed. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 1c562b172..f5eec726e 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -27,8 +27,6 @@ import ( // slaveInodeOperations are the fs.InodeOperations for the slave end of the // Terminal (pts file). -// -// +stateify savable type slaveInodeOperations struct { inodeOperations @@ -88,8 +86,6 @@ func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags } // slaveFileOperations are the fs.FileOperations for the slave end of a terminal. -// -// +stateify savable type slaveFileOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 3cb135124..fa5b00409 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -21,8 +21,6 @@ import ( ) // Terminal is a pseudoterminal. 
-// -// +stateify savable type Terminal struct { refs.AtomicRefCount diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 159c50efb..eaf8f15b2 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -3,15 +3,26 @@ package( licenses = ["notice"], # Apache 2.0 ) -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "inet_state", + srcs = ["inet.go"], + out = "inet_state.go", + package = "inet", +) go_library( name = "inet", srcs = [ "context.go", "inet.go", + "inet_state.go", "test_stack.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/inet", - deps = ["//pkg/sentry/context"], + deps = [ + "//pkg/sentry/context", + "//pkg/state", + ], ) diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index e54a61196..e4b326993 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -87,8 +87,6 @@ type InterfaceAddr struct { } // TCPBufferSize contains settings controlling TCP buffer sizing. -// -// +stateify savable type TCPBufferSize struct { // Min is the minimum size. 
Min int diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 0ebacefa6..c4a7dacb2 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,7 +1,58 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "kernel_state", + srcs = [ + "abstract_socket_namespace.go", + "fd_map.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "kernel_state.go", + "pending_signals.go", + "pending_signals_state.go", + "process_group_list.go", + "ptrace.go", + "rseq.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_list.go", + "task_resources.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_syscall.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "timer.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + out = "kernel_autogen_state.go", + imports = [ + "gvisor.googlesource.com/gvisor/pkg/sentry/arch", + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", + "gvisor.googlesource.com/gvisor/pkg/tcpip", + ], + package = "kernel", +) go_template_instance( name = "pending_signals_list", @@ -67,6 +118,7 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", + "kernel_autogen_state.go", "kernel_state.go", "pending_signals.go", "pending_signals_list.go", @@ -112,11 +164,6 @@ go_library( "version.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", - imports = [ - "gvisor.googlesource.com/gvisor/pkg/sentry/arch", - # "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", - "gvisor.googlesource.com/gvisor/pkg/tcpip", - ], visibility = 
["//:sandbox"], deps = [ "//pkg/abi", diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index d6d1d341d..014c4a3bf 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) -// +stateify savable type abstractEndpoint struct { ep unix.BoundEndpoint wr *refs.WeakRef @@ -40,8 +39,6 @@ func (e *abstractEndpoint) WeakRefGone() { } // AbstractSocketNamespace is used to implement the Linux abstract socket functionality. -// -// +stateify savable type AbstractSocketNamespace struct { mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index a81085372..5b7b30557 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,7 +1,20 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "auth_state", + srcs = [ + "credentials.go", + "id.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + out = "auth_state.go", + package = "auth", +) go_template_instance( name = "id_map_range", @@ -35,6 +48,7 @@ go_library( name = "auth", srcs = [ "auth.go", + "auth_state.go", "capability_set.go", "context.go", "credentials.go", @@ -52,6 +66,7 @@ go_library( "//pkg/bits", "//pkg/log", "//pkg/sentry/context", + "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index f18f7dac9..f6fb05285 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -21,8 +21,6 @@ import ( // Credentials contains information required to authorize privileged operations // in a user namespace. 
-// -// +stateify savable type Credentials struct { // Real/effective/saved user/group IDs in the root user namespace. None of // these should ever be NoID. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index bd0090e0f..6adb33530 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -77,8 +77,6 @@ func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { // An IDMapEntry represents a mapping from a range of contiguous IDs in a user // namespace to an equally-sized range of contiguous IDs in the namespace's // parent. -// -// +stateify savable type IDMapEntry struct { // FirstID is the first ID in the range in the namespace. FirstID uint32 diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index d359f3f31..0980aeadf 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -23,8 +23,6 @@ import ( // A UserNamespace represents a user namespace. See user_namespaces(7) for // details. -// -// +stateify savable type UserNamespace struct { // parent is this namespace's parent. If this is the root namespace, parent // is nil. The parent pointer is immutable. 
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 5e8b36ed6..7d491efbc 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,11 +1,22 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "epoll_autogen_state", + srcs = [ + "epoll.go", + "epoll_state.go", + ], + out = "epoll_autogen_state.go", + package = "epoll", +) go_library( name = "epoll", srcs = [ "epoll.go", + "epoll_autogen_state.go", "epoll_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll", @@ -18,7 +29,9 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index d87e64a1c..b572fcd7e 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -58,8 +58,6 @@ const ( // potentially be reassigned. We also cannot use just the file pointer because // it is possible to have multiple entries for the same file object as long as // they are created with different FDs (i.e., the FDs point to the same file). -// -// +stateify savable type FileIdentifier struct { File *fs.File Fd kdefs.FD @@ -67,8 +65,6 @@ type FileIdentifier struct { // pollEntry holds all the state associated with an event poll entry, that is, // a file being observed by an event poll object. -// -// +stateify savable type pollEntry struct { ilist.Entry file *refs.WeakRef `state:"manual"` @@ -96,8 +92,6 @@ func (p *pollEntry) WeakRefGone() { // EventPoll holds all the state associated with an event poll object, that is, // collection of files to observe and their current state. 
-// -// +stateify savable type EventPoll struct { fsutil.PipeSeek `state:"zerovalue"` fsutil.NotDirReaddir `state:"zerovalue"` @@ -108,7 +102,7 @@ type EventPoll struct { // Wait queue is used to notify interested parties when the event poll // object itself becomes readable or writable. - waiter.Queue `state:"zerovalue"` + waiter.Queue // files is the map of all the files currently being observed, it is // protected by mu. diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index cc1120b4f..7ec179bd8 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,19 +1,33 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "eventfd_state", + srcs = [ + "eventfd.go", + ], + out = "eventfd_state.go", + package = "eventfd", +) go_library( name = "eventfd", - srcs = ["eventfd.go"], + srcs = [ + "eventfd.go", + "eventfd_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/waiter", "//pkg/waiter/fdnotifier", diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index a4ada0e78..bd50bd9fe 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -35,8 +35,6 @@ import ( // EventOperations represents an event with the semantics of Linux's file-based event // notification (eventfd). Eventfds are usually internal to the Sentry but in certain // situations they may be converted into a host-backed eventfd. 
-// -// +stateify savable type EventOperations struct { fsutil.NoopRelease `state:"nosave"` fsutil.PipeSeek `state:"nosave"` @@ -51,7 +49,7 @@ type EventOperations struct { // Queue is used to notify interested parties when the event object // becomes readable or writable. - wq waiter.Queue `state:"zerovalue"` + wq waiter.Queue `state:"nosave"` // val is the current value of the event counter. val uint64 diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index d5d4aaacb..299506330 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -46,8 +46,6 @@ func (f FDs) Less(i, j int) bool { } // FDFlags define flags for an individual descriptor. -// -// +stateify savable type FDFlags struct { // CloseOnExec indicates the descriptor should be closed on exec. CloseOnExec bool @@ -71,16 +69,12 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) { // descriptor holds the details about a file descriptor, namely a pointer the // file itself and the descriptor flags. -// -// +stateify savable type descriptor struct { file *fs.File flags FDFlags } // FDMap is used to manage File references and flags. -// -// +stateify savable type FDMap struct { refs.AtomicRefCount k *Kernel diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index f3f05e8f5..dbc097696 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -25,8 +25,6 @@ import ( // FSContext contains filesystem context. // // This includes umask and working directory. 
-// -// +stateify savable type FSContext struct { refs.AtomicRefCount diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index b44a26974..a97a43549 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_template_instance( name = "waiter_list", @@ -14,15 +14,29 @@ go_template_instance( }, ) +go_stateify( + name = "futex_state", + srcs = [ + "futex.go", + "waiter_list.go", + ], + out = "futex_state.go", + package = "futex", +) + go_library( name = "futex", srcs = [ "futex.go", + "futex_state.go", "waiter_list.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex", visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/syserror"], + deps = [ + "//pkg/state", + "//pkg/syserror", + ], ) go_test( diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 4a1f2a0ef..15e3e5e2c 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -196,8 +196,6 @@ func bucketIndexForAddr(addr uintptr) uintptr { } // Manager holds futex state for a single virtual address space. -// -// +stateify savable type Manager struct { buckets [bucketCount]bucket `state:"zerovalue"` } diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 5eef49f59..a86bda77b 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -21,8 +21,6 @@ import ( ) // IPCNamespace represents an IPC namespace. -// -// +stateify savable type IPCNamespace struct { // User namespace which owns this IPC namespace. Immutable. 
userNS *auth.UserNamespace diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 419a1d473..64439cd9d 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -62,8 +62,6 @@ import ( // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). -// -// +stateify savable type Kernel struct { // extMu serializes external changes to the Kernel with calls to // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel @@ -160,7 +158,7 @@ type Kernel struct { // exitErr is the error causing the sandbox to exit, if any. It is // protected by extMu. - exitErr error `state:"nosave"` + exitErr error // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index 06be5a7e1..5dc0f266c 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -38,8 +38,6 @@ const ( // pendingSignals holds a collection of pending signals. The zero value of // pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; // users must provide synchronization. -// -// +stateify savable type pendingSignals struct { // signals contains all pending signals. // @@ -54,14 +52,11 @@ type pendingSignals struct { } // pendingSignalQueue holds a pendingSignalList for a single signal number. -// -// +stateify savable type pendingSignalQueue struct { pendingSignalList length int } -// +stateify savable type pendingSignal struct { // pendingSignalEntry links into a pendingSignalList. 
pendingSignalEntry diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 19b23c6d2..4600d19bd 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,6 +1,20 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "pipe_state", + srcs = [ + "buffers.go", + "node.go", + "pipe.go", + "reader.go", + "reader_writer.go", + "writer.go", + ], + out = "pipe_state.go", + package = "pipe", +) go_library( name = "pipe", @@ -9,6 +23,7 @@ go_library( "device.go", "node.go", "pipe.go", + "pipe_state.go", "reader.go", "reader_writer.go", "writer.go", @@ -19,12 +34,15 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/ilist", + "//pkg/log", + "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go index a82e45c3f..f300537c5 100644 --- a/pkg/sentry/kernel/pipe/buffers.go +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -20,8 +20,6 @@ import ( // Buffer encapsulates a queueable byte buffer that can // easily be truncated. It is designed only for use with pipes. -// -// +stateify savable type Buffer struct { ilist.Entry data []byte diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 23d692da1..e418cf174 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -24,8 +24,6 @@ import ( ) // inodeOperations wraps fs.InodeOperations operations with common pipe opening semantics. 
-// -// +stateify savable type inodeOperations struct { fs.InodeOperations diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index ced2559a7..9a21df5b4 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -41,8 +41,6 @@ const DefaultPipeSize = 65536 // Pipe is an encapsulation of a platform-independent pipe. // It manages a buffered byte queue shared between a reader/writer // pair. -// -// +stateify savable type Pipe struct { waiter.Queue `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 1fa5e9a32..40d5e4943 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -20,8 +20,6 @@ import ( // Reader satisfies the fs.FileOperations interface for read-only pipes. // Reader should be used with !fs.FileFlags.Write to reject writes. -// -// +stateify savable type Reader struct { ReaderWriter } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 82607367b..dc642a3a6 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -31,8 +31,6 @@ import ( // read and write requests. This should only be used directly for named pipes. // pipe(2) and pipe2(2) only support unidirectional pipes and should use // either pipe.Reader or pipe.Writer. -// -// +stateify savable type ReaderWriter struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index d93324b53..fd13008ac 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -20,8 +20,6 @@ import ( // Writer satisfies the fs.FileOperations interface for write-only pipes. // Writer should be used with !fs.FileFlags.Read to reject reads. 
-// -// +stateify savable type Writer struct { ReaderWriter } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index e9e69004d..f1c2c4bf0 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -25,8 +25,6 @@ import ( // ptraceOptions are the subset of options controlling a task's ptrace behavior // that are set by ptrace(PTRACE_SETOPTIONS). -// -// +stateify savable type ptraceOptions struct { // ExitKill is true if the tracee should be sent SIGKILL when the tracer // exits. @@ -187,8 +185,6 @@ func (t *Task) hasTracer() bool { } // ptraceStop is a TaskStop placed on tasks in a ptrace-stop. -// -// +stateify savable type ptraceStop struct { // If frozen is true, the stopped task's tracer is currently operating on // it, so Task.Kill should not remove the stop. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 1f3de58e3..635372993 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -23,8 +23,6 @@ import ( // Restartable sequences, as described in https://lwn.net/Articles/650333/. // RSEQCriticalRegion describes a restartable sequence critical region. 
-// -// +stateify savable type RSEQCriticalRegion struct { // When a task in this thread group has its CPU preempted (as defined by // platform.ErrContextCPUPreempted) or has a signal delivered to an diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index e7fa44e2c..969145fe1 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") go_template_instance( name = "waiter_list", @@ -14,10 +14,21 @@ go_template_instance( }, ) +go_stateify( + name = "semaphore_state", + srcs = [ + "semaphore.go", + "waiter_list.go", + ], + out = "semaphore_autogen_state.go", + package = "semaphore", +) + go_library( name = "semaphore", srcs = [ "semaphore.go", + "semaphore_autogen_state.go", "waiter_list.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore", @@ -29,6 +40,8 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", + "//pkg/state", + "//pkg/state/statefile", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index aa07946cf..a1ee83ce5 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -42,8 +42,6 @@ const ( ) // Registry maintains a set of semaphores that can be found by key or ID. -// -// +stateify savable type Registry struct { // userNS owning the ipc name this registry belongs to. Immutable. userNS *auth.UserNamespace @@ -54,8 +52,6 @@ type Registry struct { } // Set represents a set of semaphores that can be operated atomically. -// -// +stateify savable type Set struct { // registry owning this sem set. Immutable. 
registry *Registry @@ -83,8 +79,6 @@ type Set struct { } // sem represents a single semanphore from a set. -// -// +stateify savable type sem struct { value int16 waiters waiterList `state:"zerovalue"` @@ -92,8 +86,6 @@ type sem struct { // waiter represents a caller that is waiting for the semaphore value to // become positive or zero. -// -// +stateify savable type waiter struct { waiterEntry diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index cf4e18805..fa4c7b8f6 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -27,8 +27,6 @@ type SessionID ThreadID type ProcessGroupID ThreadID // Session contains a leader threadgroup and a list of ProcessGroups. -// -// +stateify savable type Session struct { refs refs.AtomicRefCount @@ -78,8 +76,6 @@ func (s *Session) decRef() { } // ProcessGroup contains an originator threadgroup and a parent Session. -// -// +stateify savable type ProcessGroup struct { refs refs.AtomicRefCount // not exported. 
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 40e641355..0f88eb0ac 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,12 +1,22 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "shm_state", + srcs = [ + "shm.go", + ], + out = "shm_autogen_state.go", + package = "shm", +) go_library( name = "shm", srcs = [ "device.go", "shm.go", + "shm_autogen_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm", visibility = ["//pkg/sentry:internal"], @@ -23,6 +33,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 1ac444094..7217e8103 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -72,8 +72,6 @@ const ( // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. -// -// +stateify savable type Registry struct { // userNS owns the IPC namespace this registry belong to. Immutable. userNS *auth.UserNamespace @@ -290,8 +288,6 @@ func (r *Registry) remove(s *Shm) { // shmctl(SHM_RMID). // // Shm implements memmap.Mappable and memmap.MappingIdentity. -// -// +stateify savable type Shm struct { // AtomicRefCount tracks the number of references to this segment from // maps. A segment always holds a reference to itself, until it's marked for diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 3649f5e4d..21ba4ee70 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -22,8 +22,6 @@ import ( ) // SignalHandlers holds information about signal actions. 
-// -// +stateify savable type SignalHandlers struct { // mu protects actions, as well as the signal state of all tasks and thread // groups using this SignalHandlers object. (See comment on diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 4c7811b6c..e20fa3eb6 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -176,8 +176,6 @@ type Stracer interface { // SyscallTable is a lookup table of system calls. Critically, a SyscallTable // is *immutable*. In order to make supporting suspend and resume sane, they // must be uniquely registered and may not change during operation. -// -// +stateify savable type SyscallTable struct { // OS is the operating system that this syscall table implements. OS abi.OS `state:"wait"` diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 125312b6a..31541749e 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -23,8 +23,6 @@ import ( // syslog represents a sentry-global kernel log. // // Currently, it contains only fun messages for a dmesg easter egg. -// -// +stateify savable type syslog struct { // mu protects the below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index ae9b3d175..7f6735320 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -52,8 +52,6 @@ import ( // All fields that are "exclusive to the task goroutine" can only be accessed // by the task goroutine while it is running. The task goroutine does not // require synchronization to read or write these fields. 
-// -// +stateify savable type Task struct { taskNode diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 38f7826e2..a61283267 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -349,7 +349,6 @@ func (t *Task) unstopVforkParent() { } } -// +stateify savable type runSyscallAfterPtraceEventClone struct { vforkChild *Task @@ -367,7 +366,6 @@ func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { return (*runSyscallExit)(nil) } -// +stateify savable type runSyscallAfterVforkStop struct { // childTID has the same meaning as // runSyscallAfterPtraceEventClone.vforkChildTID. @@ -473,8 +471,6 @@ func (t *Task) Unshare(opts *SharingOptions) error { // current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so // that the child and parent share mappings until the child execve()s into a // new process image or exits.) -// -// +stateify savable type vforkStop struct{} // StopIgnoresKill implements TaskStop.Killable. diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 9a59cbd33..5c563ba08 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -35,8 +35,6 @@ var ErrNoSyscalls = errors.New("no syscall table found") type Auxmap map[string]interface{} // TaskContext is the subset of a task's data that is provided by the loader. -// -// +stateify savable type TaskContext struct { // Name is the thread name set by the prctl(PR_SET_NAME) system call. Name string diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 385299b24..2285847a2 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -73,8 +73,6 @@ import ( // execStop is a TaskStop that a task sets on itself when it wants to execve // and is waiting for the other tasks in its thread group to exit first. -// -// +stateify savable type execStop struct{} // Killable implements TaskStop.Killable. 
@@ -121,8 +119,6 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { // The runSyscallAfterExecStop state continues execve(2) after all siblings of // a thread in the execve syscall have exited. -// -// +stateify savable type runSyscallAfterExecStop struct { tc *TaskContext } diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b16844e91..d6604f37b 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -38,8 +38,6 @@ import ( // An ExitStatus is a value communicated from an exiting task or thread group // to the party that reaps it. -// -// +stateify savable type ExitStatus struct { // Code is the numeric value passed to the call to exit or exit_group that // caused the exit. If the exit was not caused by such a call, Code is 0. @@ -224,8 +222,6 @@ func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { } // runExit is the entry point into the task exit path. -// -// +stateify savable type runExit struct{} func (*runExit) execute(t *Task) taskRunState { @@ -233,7 +229,6 @@ func (*runExit) execute(t *Task) taskRunState { return (*runExitMain)(nil) } -// +stateify savable type runExitMain struct{} func (*runExitMain) execute(t *Task) taskRunState { @@ -536,7 +531,6 @@ func (t *Task) reparentLocked(parent *Task) { // tracer (if one exists) and reaps the leader immediately. In Linux, this is // in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). -// +stateify savable type runExitNotify struct{} func (*runExitNotify) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go index 0832bf989..4ca25664a 100644 --- a/pkg/sentry/kernel/task_resources.go +++ b/pkg/sentry/kernel/task_resources.go @@ -21,8 +21,6 @@ import ( // TaskResources is the subset of a task's data provided by its creator that is // not provided by the loader. 
-// -// +stateify savable type TaskResources struct { // SignalMask is the set of signals whose delivery is currently blocked. // diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 8dd0ef6ea..a03fa6ac0 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -131,8 +131,6 @@ func (t *Task) doStop() { // The runApp state checks for interrupts before executing untrusted // application code. -// -// +stateify savable type runApp struct{} func (*runApp) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 49141ab74..b50139077 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -65,8 +65,6 @@ const ( // TaskGoroutineSchedInfo contains task goroutine scheduling state which must // be read and updated atomically. -// -// +stateify savable type TaskGoroutineSchedInfo struct { // Timestamp was the value of Kernel.cpuClock when this // TaskGoroutineSchedInfo was last updated. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 62ec530be..91f6c0874 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -748,8 +748,6 @@ func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { // groupStop is a TaskStop placed on tasks that have received a stop signal // (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from // the ptrace man page.) -// -// +stateify savable type groupStop struct{} // Killable implements TaskStop.Killable. @@ -883,8 +881,6 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { } // The runInterrupt state handles conditions indicated by interrupts. 
-// -// +stateify savable type runInterrupt struct{} func (*runInterrupt) execute(t *Task) taskRunState { @@ -1024,7 +1020,6 @@ func (*runInterrupt) execute(t *Task) taskRunState { return (*runApp)(nil) } -// +stateify savable type runInterruptAfterSignalDeliveryStop struct{} func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 3b9652504..79f4ff60c 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -241,7 +241,6 @@ func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRun return t.doSyscallInvoke(sysno, args) } -// +stateify savable type runSyscallAfterSyscallEnterStop struct{} func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { @@ -261,7 +260,6 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { return t.doSyscallInvoke(sysno, args) } -// +stateify savable type runSyscallAfterSysemuStop struct{} func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { @@ -296,7 +294,6 @@ func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRu return (*runSyscallExit)(nil).execute(t) } -// +stateify savable type runSyscallReinvoke struct{} func (*runSyscallReinvoke) execute(t *Task) taskRunState { @@ -313,7 +310,6 @@ func (*runSyscallReinvoke) execute(t *Task) taskRunState { return t.doSyscallInvoke(sysno, args) } -// +stateify savable type runSyscallExit struct{} func (*runSyscallExit) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 441b8a822..8fffd3446 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -28,8 +28,6 @@ import ( // groups" are usually called "processes" in userspace documentation.) // // ThreadGroup is a superset of Linux's struct signal_struct. 
-// -// +stateify savable type ThreadGroup struct { threadGroupNode diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 844213c35..440da9dad 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -50,8 +50,6 @@ func (tid ThreadID) String() string { const InitTID ThreadID = 1 // A TaskSet comprises all tasks in a system. -// -// +stateify savable type TaskSet struct { // mu protects all relationships betweens tasks and thread groups in the // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) @@ -112,8 +110,6 @@ func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { // // N.B. A task is said to be visible in a PID namespace if the PID namespace // contains a thread ID that maps to that task. -// -// +stateify savable type PIDNamespace struct { // owner is the TaskSet that this PID namespace belongs to. The owner // pointer is immutable. @@ -267,8 +263,6 @@ func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { // (threadGroupNode is an anonymous field in ThreadGroup; this is to expose // threadGroupEntry's methods on ThreadGroup to make it implement // threadGroupLinker.) -// -// +stateify savable type threadGroupNode struct { // pidns is the PID namespace containing the thread group and all of its // member tasks. The pidns pointer is immutable. @@ -388,8 +382,6 @@ func (tg *ThreadGroup) ID() ThreadID { // A taskNode defines the relationship between a task and the rest of the // system. The comments on threadGroupNode also apply to taskNode. -// -// +stateify savable type taskNode struct { // tg is the thread group that this task belongs to. The tg pointer is // immutable. 
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 5d8db2273..b3ed42aa4 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,18 +1,30 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "time_state", + srcs = [ + "time.go", + ], + out = "time_state.go", + package = "time", +) go_library( name = "time", srcs = [ "context.go", "time.go", + "time_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/log", "//pkg/sentry/context", + "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 6eadd2878..c223c2f19 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -42,8 +42,6 @@ const ( // // Time may represent time with respect to any clock and may not have any // meaning in the real world. -// -// +stateify savable type Time struct { ns int64 } @@ -288,8 +286,6 @@ type TimerListener interface { } // Setting contains user-controlled mutable Timer properties. -// -// +stateify savable type Setting struct { // Enabled is true if the timer is running. Enabled bool @@ -375,8 +371,6 @@ func (s Setting) advancedTo(now Time) (Setting, uint64) { // // Timers should be created using NewTimer and must be cleaned up by calling // Timer.Destroy when no longer used. -// -// +stateify savable type Timer struct { // clock is the time source. clock is immutable. clock Clock diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index df5dbe128..4de8ac13b 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -25,8 +25,6 @@ import ( ) // Timekeeper manages all of the kernel clocks. 
-// -// +stateify savable type Timekeeper struct { // clocks are the clock sources. // diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go index 534d03d0f..03a3310be 100644 --- a/pkg/sentry/kernel/timer.go +++ b/pkg/sentry/kernel/timer.go @@ -26,8 +26,6 @@ import ( // timekeeperClock is a ktime.Clock that reads time from a // kernel.Timekeeper-managed clock. -// -// +stateify savable type timekeeperClock struct { tk *Timekeeper c sentrytime.ClockID @@ -51,8 +49,6 @@ func (tc *timekeeperClock) Now() ktime.Time { // tgClock is a ktime.Clock that measures the time a thread group has spent // executing. -// -// +stateify savable type tgClock struct { tg *ThreadGroup @@ -159,8 +155,6 @@ func (tc *taskClock) Now() ktime.Time { } // signalNotifier is a ktime.Listener that sends signals to a ThreadGroup. -// -// +stateify savable type signalNotifier struct { tg *ThreadGroup signal linux.Signal @@ -185,8 +179,6 @@ func (s *signalNotifier) Notify(exp uint64) { func (s *signalNotifier) Destroy() {} // TimerManager is a collection of supported process cpu timers. -// -// +stateify savable type TimerManager struct { // Clocks used to drive thread group execution time timers. virtClock *tgClock diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index 7e0fe0d21..58e9b4d1b 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -22,8 +22,6 @@ import ( // UTSNamespace represents a UTS namespace, a holder of two system identifiers: // the hostname and domain name. -// -// +stateify savable type UTSNamespace struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 971e8bc59..0bacbea49 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -52,8 +52,6 @@ type vdsoParams struct { // Everything in the struct is 8 bytes for easy alignment. 
// // It must be kept in sync with params in vdso/vdso_time.cc. -// -// +stateify savable type VDSOParamPage struct { // The parameter page is fr, allocated from platform.Memory(). platform platform.Platform diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 90f4395d4..3ce41cacc 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,12 +1,22 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "limits_state", + srcs = [ + "limits.go", + ], + out = "limits_state.go", + package = "limits", +) go_library( name = "limits", srcs = [ "context.go", "limits.go", + "limits_state.go", "linux.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/limits", @@ -14,6 +24,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/sentry/context", + "//pkg/state", ], ) diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index 02c8b60e3..4230ba958 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -47,8 +47,6 @@ const ( const Infinity = ^uint64(0) // Limit specifies a system limit. -// -// +stateify savable type Limit struct { // Cur specifies the current limit. Cur uint64 @@ -57,8 +55,6 @@ type Limit struct { } // LimitSet represents the Limits that correspond to each LimitType. 
-// -// +stateify savable type LimitSet struct { mu sync.Mutex `state:"nosave"` data map[LimitType]Limit diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 0beb4561b..e63052c6d 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") go_embed_data( name = "vdso_bin", @@ -10,12 +10,23 @@ go_embed_data( var = "vdsoBin", ) +go_stateify( + name = "loader_state", + srcs = [ + "vdso.go", + "vdso_state.go", + ], + out = "loader_state.go", + package = "loader", +) + go_library( name = "loader", srcs = [ "elf.go", "interpreter.go", "loader.go", + "loader_state.go", "vdso.go", "vdso_state.go", ":vdso_bin", @@ -29,6 +40,7 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/rand", + "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -43,6 +55,7 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index a06e27ac9..2e8693f8e 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -193,8 +193,6 @@ func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) // // NOTE: to support multiple architectures or operating systems, this // would need to contain a VDSO for each. -// -// +stateify savable type VDSO struct { // ParamPage is the VDSO parameter page. This page should be updated to // inform the VDSO for timekeeping data. 
diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go index dc71e1c2d..92004ad9e 100644 --- a/pkg/sentry/loader/vdso_state.go +++ b/pkg/sentry/loader/vdso_state.go @@ -18,7 +18,6 @@ import ( "debug/elf" ) -// +stateify savable type elfProgHeader struct { Type elf.ProgType Flags elf.ProgFlag diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index c9e0b95a0..2e367e189 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,7 +1,18 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "memmap_state", + srcs = [ + "mappable_range.go", + "mapping_set.go", + "mapping_set_impl.go", + ], + out = "memmap_state.go", + package = "memmap", +) go_template_instance( name = "mappable_range", @@ -35,6 +46,7 @@ go_library( "mapping_set.go", "mapping_set_impl.go", "memmap.go", + "memmap_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", visibility = ["//pkg/sentry:internal"], @@ -44,6 +56,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/platform", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index c9483905d..0cd42ffbf 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -35,8 +35,6 @@ import ( type MappingsOfRange map[MappingOfRange]struct{} // MappingOfRange represents a mapping of a MappableRange. 
-// -// +stateify savable type MappingOfRange struct { MappingSpace MappingSpace AddrRange usermem.AddrRange diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index bbdfae247..3f396986a 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,7 +1,24 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "mm_state", + srcs = [ + "aio_context.go", + "aio_context_state.go", + "file_refcount_set.go", + "io_list.go", + "mm.go", + "pma_set.go", + "save_restore.go", + "special_mappable.go", + "vma_set.go", + ], + out = "mm_state.go", + package = "mm", +) go_template_instance( name = "file_refcount_set", @@ -84,6 +101,7 @@ go_library( "lifecycle.go", "metadata.go", "mm.go", + "mm_state.go", "pma.go", "pma_set.go", "proc_pid_maps.go", @@ -113,6 +131,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/buffer", diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index b42156d45..992bde5a5 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -28,8 +28,6 @@ import ( ) // aioManager creates and manages asynchronous I/O contexts. -// -// +stateify savable type aioManager struct { // mu protects below. mu sync.Mutex `state:"nosave"` @@ -91,16 +89,12 @@ func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { } // ioResult is a completed I/O operation. -// -// +stateify savable type ioResult struct { data interface{} ioEntry } // AIOContext is a single asynchronous I/O context. -// -// +stateify savable type AIOContext struct { // done is the notification channel used for all requests. 
done chan struct{} `state:"nosave"` @@ -196,8 +190,6 @@ func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO // ring buffers. -// -// +stateify savable type aioMappable struct { refs.AtomicRefCount diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 3299ae164..ce8097b7f 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -46,8 +46,6 @@ import ( ) // MemoryManager implements a virtual address space. -// -// +stateify savable type MemoryManager struct { // p is the platform. // @@ -209,8 +207,6 @@ type MemoryManager struct { } // vma represents a virtual memory area. -// -// +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is // nil, the vma represents a private anonymous mapping. @@ -350,8 +346,6 @@ func (v *vma) loadRealPerms(b int) { } // pma represents a platform mapping area. -// -// +stateify savable type pma struct { // file is the file mapped by this pma. Only pmas for which file == // platform.Platform.Memory() may be saved. pmas hold a reference to the @@ -386,7 +380,6 @@ type pma struct { internalMappings safemem.BlockSeq `state:"nosave"` } -// +stateify savable type privateRefs struct { mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index aa2f87107..9d3614034 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -28,8 +28,6 @@ import ( // semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except // that SpecialMappable takes ownership of the memory that it represents // (_install_special_mapping() does not.) 
-// -// +stateify savable type SpecialMappable struct { refs.AtomicRefCount diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index af9ba5394..15a7fbbc3 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,7 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "platform_state", + srcs = [ + "file_range.go", + ], + out = "platform_state.go", + package = "platform", +) go_template_instance( name = "file_range", @@ -21,6 +30,7 @@ go_library( "file_range.go", "mmap_min_addr.go", "platform.go", + "platform_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform", visibility = ["//pkg/sentry:internal"], @@ -34,6 +44,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD index 2a5982763..dadba1d38 100644 --- a/pkg/sentry/platform/filemem/BUILD +++ b/pkg/sentry/platform/filemem/BUILD @@ -1,7 +1,18 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "filemem_autogen_state", + srcs = [ + "filemem.go", + "filemem_state.go", + "usage_set.go", + ], + out = "filemem_autogen_state.go", + package = "filemem", +) go_template_instance( name = "usage_set", @@ -27,6 +38,7 @@ go_library( name = "filemem", srcs = [ "filemem.go", + "filemem_autogen_state.go", "filemem_state.go", "filemem_unsafe.go", "usage_set.go", diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index feb020ef8..870274ae1 100644 --- 
a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -155,8 +155,6 @@ type FileMem struct { } // usage tracks usage information. -// -// +stateify savable type usageInfo struct { // kind is the usage kind. kind usage.MemoryKind diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index a320fca0b..929787aa0 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,10 +1,22 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "socket_state", + srcs = [ + "socket.go", + ], + out = "socket_state_autogen.go", + package = "socket", +) go_library( name = "socket", - srcs = ["socket.go"], + srcs = [ + "socket.go", + "socket_state_autogen.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket", visibility = ["//pkg/sentry:internal"], deps = [ @@ -17,6 +29,7 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index c4874fdfb..faf2b4c27 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -1,14 +1,26 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") -go_library( - name = "control", - srcs = ["control.go"], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control", +go_stateify( + name = "control_state", + srcs = [ + "control.go", + ], + out = "control_state.go", imports = [ "gvisor.googlesource.com/gvisor/pkg/sentry/fs", ], + package = "control", +) + +go_library( + name = "control", + srcs = [ + "control.go", + "control_state.go", + ], + importpath = 
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/control", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -19,6 +31,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/kdefs", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", ], diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index c31182e69..17ecdd11c 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -51,8 +51,6 @@ type SCMRights interface { // RightsFiles represents a SCM_RIGHTS socket control message. A reference is // maintained for each fs.File and is release either when an FD is created or // when the Release method is called. -// -// +stateify savable type RightsFiles []*fs.File // NewSCMRights creates a new SCM_RIGHTS socket control message representation @@ -130,8 +128,6 @@ func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte) []by } // scmCredentials represents an SCM_CREDENTIALS socket control message. 
-// -// +stateify savable type scmCredentials struct { t *kernel.Task kuid auth.KUID diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 49af8db85..7ad5e88c5 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -1,12 +1,24 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "epsocket_state", + srcs = [ + "epsocket.go", + "save_restore.go", + "stack.go", + ], + out = "epsocket_state.go", + package = "epsocket", +) go_library( name = "epsocket", srcs = [ "device.go", "epsocket.go", + "epsocket_state.go", "provider.go", "save_restore.go", "stack.go", @@ -19,6 +31,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/log", + "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", @@ -31,6 +44,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index f969a1d7c..a2927e1b9 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -95,8 +95,6 @@ type commonEndpoint interface { // SocketOperations encapsulates all the state needed to represent a network stack // endpoint in the kernel context. -// -// +stateify savable type SocketOperations struct { socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 12b4b4767..ec1d96ccb 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -26,8 +26,6 @@ import ( ) // Stack implements inet.Stack for netstack/tcpip/stack.Stack. 
-// -// +stateify savable type Stack struct { Stack *stack.Stack `state:"manual"` } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index d623718b3..227ca3926 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,12 +1,24 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "hostinet_state", + srcs = [ + "save_restore.go", + "socket.go", + "stack.go", + ], + out = "hostinet_autogen_state.go", + package = "hostinet", +) go_library( name = "hostinet", srcs = [ "device.go", "hostinet.go", + "hostinet_autogen_state.go", "save_restore.go", "socket.go", "socket_unsafe.go", @@ -30,6 +42,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index b852165f7..b23a243f7 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,11 +1,21 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "netlink_state", + srcs = [ + "socket.go", + ], + out = "netlink_state.go", + package = "netlink", +) go_library( name = "netlink", srcs = [ "message.go", + "netlink_state.go", "provider.go", "socket.go", ], @@ -26,6 +36,7 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 3a7dbc5ed..ba6f686e4 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,12 +1,23 @@ package(licenses = 
["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "port_state", + srcs = ["port.go"], + out = "port_state.go", + package = "port", +) go_library( name = "port", - srcs = ["port.go"], + srcs = [ + "port.go", + "port_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port", visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/state"], ) go_test( diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index 1c5d4c3a5..4ccf0b84c 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -32,8 +32,6 @@ import ( const maxPorts = 10000 // Manager allocates netlink port IDs. -// -// +stateify savable type Manager struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index e1bcfe252..726469fc9 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,19 +1,32 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "route_state", + srcs = ["protocol.go"], + out = "route_state.go", + package = "route", +) go_library( name = "route", - srcs = ["protocol.go"], + srcs = [ + "protocol.go", + "route_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/sentry/context", + "//pkg/sentry/fs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/socket/netlink", + "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserr", ], ) diff --git a/pkg/sentry/socket/netlink/route/protocol.go 
b/pkg/sentry/socket/netlink/route/protocol.go index 55a76e916..e8030c518 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -43,8 +43,6 @@ func typeKind(typ uint16) commandKind { } // Protocol implements netlink.Protocol. -// -// +stateify savable type Protocol struct{} var _ netlink.Protocol = (*Protocol)(nil) diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index e15d1546c..0b8f528d0 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -51,8 +51,6 @@ var netlinkSocketDevice = device.NewAnonDevice() // to/from the kernel. // // Socket implements socket.Socket. -// -// +stateify savable type Socket struct { socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 54fe64595..bd4858a34 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -195,8 +195,6 @@ func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { // // Care must be taken when copying ReceiveTimeout as it contains atomic // variables. -// -// +stateify savable type ReceiveTimeout struct { // ns is length of the timeout in nanoseconds. 
// diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 9fe681e9a..7d04d6b6b 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,6 +1,15 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "unix_state", + srcs = [ + "unix.go", + ], + out = "unix_state.go", + package = "unix", +) go_library( name = "unix", @@ -8,6 +17,7 @@ go_library( "device.go", "io.go", "unix.go", + "unix_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix", visibility = ["//pkg/sentry:internal"], @@ -27,6 +37,7 @@ go_library( "//pkg/sentry/socket/control", "//pkg/sentry/socket/epsocket", "//pkg/sentry/usermem", + "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 5b6411f97..27bacbbc3 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -42,8 +42,6 @@ import ( // SocketOperations is a Unix socket. It is similar to an epsocket, except it is backed // by a unix.Endpoint instead of a tcpip.Endpoint. 
-// -// +stateify savable type SocketOperations struct { refs.AtomicRefCount socket.ReceiveTimeout diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index e4450a093..574621ad2 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,6 +1,18 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "linux_state", + srcs = [ + "sys_aio.go", + "sys_futex.go", + "sys_poll.go", + "sys_time.go", + ], + out = "linux_state.go", + package = "linux", +) go_library( name = "linux", @@ -8,6 +20,7 @@ go_library( "error.go", "flags.go", "linux64.go", + "linux_state.go", "sigset.go", "sys_aio.go", "sys_capability.go", @@ -53,6 +66,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/bpf", + "//pkg/eventchannel", "//pkg/log", "//pkg/metric", "//pkg/rand", @@ -60,6 +74,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/kernel", @@ -70,6 +85,7 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/semaphore", "//pkg/sentry/kernel/shm", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", @@ -81,6 +97,8 @@ go_library( "//pkg/sentry/syscalls", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 54e4afa9e..fc3397081 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -69,8 +69,6 @@ type ioCallback struct { } // ioEvent describes an I/O result. 
-// -// +stateify savable type ioEvent struct { Data uint64 Obj uint64 diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 1a0e1f5fb..57762d058 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -132,8 +132,6 @@ func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) { // futexWaitRestartBlock encapsulates the state required to restart futex(2) // via restart_syscall(2). -// -// +stateify savable type futexWaitRestartBlock struct { duration time.Duration diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index b9bdefadb..d4dbfd285 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -274,8 +274,6 @@ func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Du // pollRestartBlock encapsulates the state required to restart poll(2) via // restart_syscall(2). -// -// +stateify savable type pollRestartBlock struct { pfdAddr usermem.Addr nfds uint diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 8e6683444..dcee694b2 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -168,8 +168,6 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // clockNanosleepRestartBlock encapsulates the state required to restart // clock_nanosleep(2) via restart_syscall(2). 
-// -// +stateify savable type clockNanosleepRestartBlock struct { c ktime.Clock duration time.Duration diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index 868dfd400..edee44d96 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -1,6 +1,17 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "usage_state", + srcs = [ + "cpu.go", + "io.go", + "memory.go", + ], + out = "usage_state.go", + package = "usage", +) go_library( name = "usage", @@ -10,6 +21,7 @@ go_library( "memory.go", "memory_unsafe.go", "usage.go", + "usage_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usage", visibility = [ @@ -17,6 +29,9 @@ go_library( ], deps = [ "//pkg/bits", + "//pkg/log", "//pkg/sentry/memutil", + "//pkg/state", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/usage/cpu.go b/pkg/sentry/usage/cpu.go index ed7b04b9e..1c2cc90e1 100644 --- a/pkg/sentry/usage/cpu.go +++ b/pkg/sentry/usage/cpu.go @@ -20,8 +20,6 @@ import ( // CPUStats contains the subset of struct rusage fields that relate to CPU // scheduling. -// -// +stateify savable type CPUStats struct { // UserTime is the amount of time spent executing application code. UserTime time.Duration diff --git a/pkg/sentry/usage/io.go b/pkg/sentry/usage/io.go index 49faa507d..a05053c32 100644 --- a/pkg/sentry/usage/io.go +++ b/pkg/sentry/usage/io.go @@ -19,8 +19,6 @@ import ( ) // IO contains I/O-related statistics. -// -// +stateify savable type IO struct { // CharsRead is the number of bytes read by read syscalls. 
CharsRead uint64 diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index 69ba919e0..9dd1cd2b5 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,7 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "usermem_state", + srcs = [ + "access_type.go", + "addr.go", + "addr_range.go", + "addr_range_seq_unsafe.go", + ], + out = "usermem_state.go", + package = "usermem", +) go_template_instance( name = "addr_range", @@ -24,6 +36,7 @@ go_library( "bytes_io.go", "bytes_io_unsafe.go", "usermem.go", + "usermem_state.go", "usermem_x86.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", @@ -34,6 +47,7 @@ go_library( "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/safemem", + "//pkg/state", "//pkg/syserror", "//pkg/tcpip/buffer", ], diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go index 75346d854..7eabecf30 100644 --- a/pkg/sentry/usermem/access_type.go +++ b/pkg/sentry/usermem/access_type.go @@ -20,8 +20,6 @@ import ( // AccessType specifies memory access types. This is used for // setting mapping permissions, as well as communicating faults. -// -// +stateify savable type AccessType struct { // Read is read access. Read bool diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go index fc94bee80..d175fdc74 100644 --- a/pkg/sentry/usermem/addr.go +++ b/pkg/sentry/usermem/addr.go @@ -19,8 +19,6 @@ import ( ) // Addr represents a generic virtual address. -// -// +stateify savable type Addr uintptr // AddLength adds the given length to start and returns the result. 
ok is true diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index 5153bd3b4..391d801d0 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -1,13 +1,26 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "tcpip_state", + srcs = [ + "tcpip.go", + ], + out = "tcpip_state.go", + package = "tcpip", +) go_library( name = "tcpip", - srcs = ["tcpip.go"], + srcs = [ + "tcpip.go", + "tcpip_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip", visibility = ["//visibility:public"], deps = [ + "//pkg/state", "//pkg/tcpip/buffer", "//pkg/waiter", ], diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index 11a725423..efeb6a448 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -1,15 +1,26 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "buffer_state", + srcs = [ + "view.go", + ], + out = "buffer_state.go", + package = "buffer", +) go_library( name = "buffer", srcs = [ + "buffer_state.go", "prependable.go", "view.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer", visibility = ["//visibility:public"], + deps = ["//pkg/state"], ) go_test( diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index bbb4e1d24..a5774a327 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -54,8 +54,6 @@ func (v *View) ToVectorisedView(views [1]View) VectorisedView { // VectorisedView is a vectorised version of View using non contigous memory. // It supports all the convenience methods supported by View. 
-// -// +stateify savable type VectorisedView struct { views []View size int diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index 8f22ba3a5..3aa2cfb24 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -1,6 +1,15 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "tcp_header_state", + srcs = [ + "tcp.go", + ], + out = "tcp_header_state.go", + package = "header", +) go_library( name = "header", @@ -16,11 +25,13 @@ go_library( "ipv6.go", "ipv6_fragment.go", "tcp.go", + "tcp_header_state.go", "udp.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/header", visibility = ["//visibility:public"], deps = [ + "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/seqnum", ], diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 6689a6dc5..a95d282b0 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -120,8 +120,6 @@ type TCPSynOptions struct { } // SACKBlock represents a single contiguous SACK block. -// -// +stateify savable type SACKBlock struct { // Start indicates the lowest sequence number in the block. Start seqnum.Value @@ -133,8 +131,6 @@ type SACKBlock struct { // TCPOptions are used to parse and cache the TCP segment options for a non // syn/syn-ack segment. -// -// +stateify savable type TCPOptions struct { // TS is true if the TimeStamp option is enabled. 
TS bool diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index 83b4d253f..ac97ebe43 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -1,7 +1,14 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "fragmentation_state", + srcs = ["reassembler_list.go"], + out = "fragmentation_state.go", + package = "fragmentation", +) go_template_instance( name = "reassembler_list", @@ -19,6 +26,7 @@ go_library( srcs = [ "frag_heap.go", "fragmentation.go", + "fragmentation_state.go", "reassembler.go", "reassembler_list.go", ], @@ -26,6 +34,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", + "//pkg/state", "//pkg/tcpip/buffer", ], ) diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD index c5c889239..a75869dac 100644 --- a/pkg/tcpip/seqnum/BUILD +++ b/pkg/tcpip/seqnum/BUILD @@ -1,12 +1,25 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "seqnum_state", + srcs = [ + "seqnum.go", + ], + out = "seqnum_state.go", + package = "seqnum", +) go_library( name = "seqnum", - srcs = ["seqnum.go"], + srcs = [ + "seqnum.go", + "seqnum_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum", visibility = [ "//visibility:public", ], + deps = ["//pkg/state"], ) diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index af0aec85c..eb1e4645d 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -213,8 +213,6 @@ const ( // FullAddress represents a full transport node address, as required by the // Connect() and Bind() methods. 
-// -// +stateify savable type FullAddress struct { // NIC is the ID of the NIC this address refers to. // @@ -258,8 +256,6 @@ func (s SlicePayload) Size() int { } // A ControlMessages contains socket control messages for IP sockets. -// -// +stateify savable type ControlMessages struct { // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD index 117532fea..28e3e1700 100644 --- a/pkg/tcpip/transport/ping/BUILD +++ b/pkg/tcpip/transport/ping/BUILD @@ -1,7 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "ping_state", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "ping_packet_list.go", + ], + out = "ping_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], + package = "ping", +) go_template_instance( name = "ping_packet_list", @@ -20,13 +32,14 @@ go_library( "endpoint.go", "endpoint_state.go", "ping_packet_list.go", + "ping_state.go", "protocol.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/sleep", + "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index a22684de9..f15e44b61 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -26,7 +26,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// +stateify savable type pingPacket struct { pingPacketEntry senderAddress tcpip.FullAddress diff --git a/pkg/tcpip/transport/queue/BUILD b/pkg/tcpip/transport/queue/BUILD index 6dcec312e..fb878ad36 100644 --- 
a/pkg/tcpip/transport/queue/BUILD +++ b/pkg/tcpip/transport/queue/BUILD @@ -1,14 +1,27 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "queue_state", + srcs = [ + "queue.go", + ], + out = "queue_state.go", + package = "queue", +) go_library( name = "queue", - srcs = ["queue.go"], + srcs = [ + "queue.go", + "queue_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/queue", visibility = ["//:sandbox"], deps = [ "//pkg/ilist", + "//pkg/state", "//pkg/tcpip", "//pkg/waiter", ], diff --git a/pkg/tcpip/transport/queue/queue.go b/pkg/tcpip/transport/queue/queue.go index eb9ee8a3f..6a17441ae 100644 --- a/pkg/tcpip/transport/queue/queue.go +++ b/pkg/tcpip/transport/queue/queue.go @@ -33,8 +33,6 @@ type Entry interface { } // Queue is a buffer queue. -// -// +stateify savable type Queue struct { ReaderQueue *waiter.Queue WriterQueue *waiter.Queue diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 9ebae6cc7..6a7153e4d 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,7 +1,27 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "tcp_state", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "rcv.go", + "reno.go", + "segment.go", + "segment_heap.go", + "segment_queue.go", + "segment_state.go", + "snd.go", + "snd_state.go", + "tcp_segment_list.go", + ], + out = "tcp_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], + package = "tcp", +) go_template_instance( name = "tcp_segment_list", @@ -33,14 +53,15 @@ go_library( "snd.go", "snd_state.go", "tcp_segment_list.go", + "tcp_state.go", "timer.go", ], 
importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/rand", "//pkg/sleep", + "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index de1883d84..5b8a1e20f 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -54,8 +54,6 @@ const ( ) // SACKInfo holds TCP SACK related information for a given endpoint. -// -// +stateify savable type SACKInfo struct { // Blocks is the maximum number of SACK blocks we track // per endpoint. @@ -71,8 +69,6 @@ type SACKInfo struct { // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. The protocol implementation, however, runs in a single // goroutine. -// -// +stateify savable type endpoint struct { // workMu is used to arbitrate which goroutine may perform protocol // work. Only the main protocol goroutine is expected to call Lock() on diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 92ef9c6f7..b22a00ce1 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -22,8 +22,6 @@ import ( // receiver holds the state necessary to receive TCP segments and turn them // into a stream of bytes. -// -// +stateify savable type receiver struct { ep *endpoint diff --git a/pkg/tcpip/transport/tcp/reno.go b/pkg/tcpip/transport/tcp/reno.go index 03ae8d747..60f170a27 100644 --- a/pkg/tcpip/transport/tcp/reno.go +++ b/pkg/tcpip/transport/tcp/reno.go @@ -16,8 +16,6 @@ package tcp // renoState stores the variables related to TCP New Reno congestion // control algorithm. 
-// -// +stateify savable type renoState struct { s *sender } diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 8dccea2ba..40928ba2c 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -36,8 +36,6 @@ const ( // segment represents a TCP segment. It holds the payload and parsed TCP segment // information, and can be added to intrusive lists. // segment is mostly immutable, the only field allowed to change is viewToDeliver. -// -// +stateify savable type segment struct { segmentEntry refCnt int32 diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index 6a2d7bc0b..2ddcf5f10 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -21,8 +21,6 @@ import ( ) // segmentQueue is a bounded, thread-safe queue of TCP segments. -// -// +stateify savable type segmentQueue struct { mu sync.Mutex `state:"nosave"` list segmentList `state:"wait"` diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 376e81846..e38686e1b 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -54,8 +54,6 @@ type congestionControl interface { } // sender holds the state necessary to send TCP segments. -// -// +stateify savable type sender struct { ep *endpoint @@ -135,8 +133,6 @@ type sender struct { } // fastRecovery holds information related to fast recovery from a packet loss. -// -// +stateify savable type fastRecovery struct { // active whether the endpoint is in fast recovery. The following fields // are only meaningful when active is true. 
diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go index d536839af..33c8867f4 100644 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -18,7 +18,6 @@ import ( "time" ) -// +stateify savable type unixTime struct { second int64 nano int64 diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 1a3a62d3d..790dd55a3 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -1,7 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "udp_state", + srcs = [ + "endpoint.go", + "endpoint_state.go", + "udp_packet_list.go", + ], + out = "udp_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], + package = "udp", +) go_template_instance( name = "udp_packet_list", @@ -21,12 +33,13 @@ go_library( "endpoint_state.go", "protocol.go", "udp_packet_list.go", + "udp_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/sleep", + "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 03fb76f92..2a32c3a87 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -25,7 +25,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// +stateify savable type udpPacket struct { udpPacketEntry senderAddress tcpip.FullAddress @@ -50,8 +49,6 @@ const ( // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. 
-// -// +stateify savable type endpoint struct { // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. diff --git a/pkg/tcpip/transport/unix/BUILD b/pkg/tcpip/transport/unix/BUILD index dae0bd079..676f2cf92 100644 --- a/pkg/tcpip/transport/unix/BUILD +++ b/pkg/tcpip/transport/unix/BUILD @@ -1,6 +1,17 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") + +go_stateify( + name = "unix_state", + srcs = [ + "connectioned.go", + "connectionless.go", + "unix.go", + ], + out = "unix_state.go", + package = "unix", +) go_library( name = "unix", @@ -9,11 +20,14 @@ go_library( "connectioned_state.go", "connectionless.go", "unix.go", + "unix_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix", visibility = ["//:sandbox"], deps = [ "//pkg/ilist", + "//pkg/log", + "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/transport/queue", diff --git a/pkg/tcpip/transport/unix/connectioned.go b/pkg/tcpip/transport/unix/connectioned.go index dd7c03cf1..0e63186b2 100644 --- a/pkg/tcpip/transport/unix/connectioned.go +++ b/pkg/tcpip/transport/unix/connectioned.go @@ -85,8 +85,6 @@ type ConnectingEndpoint interface { // path != "" && acceptedChan != nil => bound and listening. // // Only one of these will be true at any moment. -// -// +stateify savable type connectionedEndpoint struct { baseEndpoint diff --git a/pkg/tcpip/transport/unix/connectionless.go b/pkg/tcpip/transport/unix/connectionless.go index 2a6ec8b4b..3276ddcd0 100644 --- a/pkg/tcpip/transport/unix/connectionless.go +++ b/pkg/tcpip/transport/unix/connectionless.go @@ -25,8 +25,6 @@ import ( // // Specifically, this means datagram unix sockets not created with // socketpair(2). 
-// -// +stateify savable type connectionlessEndpoint struct { baseEndpoint } diff --git a/pkg/tcpip/transport/unix/unix.go b/pkg/tcpip/transport/unix/unix.go index 8e4af3139..190a1ccdb 100644 --- a/pkg/tcpip/transport/unix/unix.go +++ b/pkg/tcpip/transport/unix/unix.go @@ -60,8 +60,6 @@ type CredentialsControlMessage interface { } // A ControlMessages represents a collection of socket control messages. -// -// +stateify savable type ControlMessages struct { // Rights is a control message containing FDs. Rights RightsControlMessage @@ -237,8 +235,6 @@ type BoundEndpoint interface { } // message represents a message passed over a Unix domain socket. -// -// +stateify savable type message struct { ilist.Entry @@ -310,8 +306,6 @@ type Receiver interface { } // queueReceiver implements Receiver for datagram sockets. -// -// +stateify savable type queueReceiver struct { readQueue *queue.Queue } @@ -375,8 +369,6 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 { func (*queueReceiver) Release() {} // streamQueueReceiver implements Receiver for stream sockets. -// -// +stateify savable type streamQueueReceiver struct { queueReceiver @@ -587,7 +579,6 @@ type ConnectedEndpoint interface { Release() } -// +stateify savable type connectedEndpoint struct { // endpoint represents the subset of the Endpoint functionality needed by // the connectedEndpoint. It is implemented by both connectionedEndpoint @@ -680,8 +671,6 @@ func (*connectedEndpoint) Release() {} // unix domain socket Endpoint implementations. // // Not to be used on its own. 
-// -// +stateify savable type baseEndpoint struct { *waiter.Queue diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 5e611c54f..8256acdb4 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,13 +1,28 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") + +go_stateify( + name = "waiter_state", + srcs = [ + "waiter.go", + ], + out = "waiter_state.go", + package = "waiter", +) go_library( name = "waiter", - srcs = ["waiter.go"], + srcs = [ + "waiter.go", + "waiter_state.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/waiter", visibility = ["//visibility:public"], - deps = ["//pkg/ilist"], + deps = [ + "//pkg/ilist", + "//pkg/state", + ], ) go_test( diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 9825880ca..9b189bb9e 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -157,8 +157,6 @@ func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) { // notifiers can notify them when events happen. // // The zero value for waiter.Queue is an empty queue ready for use. -// -// +stateify savable type Queue struct { list ilist.List `state:"zerovalue"` mu sync.RWMutex `state:"nosave"` -- cgit v1.2.3 From cf44aff6e08b0e19935d5cd98455b4af98fd8794 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 2 Aug 2018 08:09:03 -0700 Subject: Add seccomp(2) support. Add support for the seccomp syscall and the flag SECCOMP_FILTER_FLAG_TSYNC. 
PiperOrigin-RevId: 207101507 Change-Id: I5eb8ba9d5ef71b0e683930a6429182726dc23175 --- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/seccomp.go | 62 ++++++++++++++++++++---- pkg/sentry/kernel/task.go | 19 ++++++-- pkg/sentry/kernel/task_clone.go | 5 +- pkg/sentry/kernel/task_syscall.go | 4 +- pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/linux64.go | 1 + pkg/sentry/syscalls/linux/sys_prctl.go | 32 +------------ pkg/sentry/syscalls/linux/sys_seccomp.go | 82 ++++++++++++++++++++++++++++++++ 9 files changed, 160 insertions(+), 47 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/sys_seccomp.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c4a7dacb2..1c1633068 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -47,6 +47,7 @@ go_stateify( ], out = "kernel_autogen_state.go", imports = [ + "gvisor.googlesource.com/gvisor/pkg/bpf", "gvisor.googlesource.com/gvisor/pkg/sentry/arch", "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", "gvisor.googlesource.com/gvisor/pkg/tcpip", diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index b7c4a507f..d77c05e2f 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -144,10 +144,15 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i input := data.asBPFInput() ret := uint32(linux.SECCOMP_RET_ALLOW) + f := t.syscallFilters.Load() + if f == nil { + return ret + } + // "Every filter successfully installed will be evaluated (in reverse // order) for each system call the task makes." 
- kernel/seccomp.c - for i := len(t.syscallFilters) - 1; i >= 0; i-- { - thisRet, err := bpf.Exec(t.syscallFilters[i], input) + for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) if err != nil { t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) thisRet = linux.SECCOMP_RET_KILL @@ -180,15 +185,53 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error { // maxSyscallFilterInstructions. (This restriction is inherited from // Linux.) totalLength := p.Length() - for _, f := range t.syscallFilters { - totalLength += f.Length() + 4 + var newFilters []bpf.Program + + // While syscallFilters are an atomic.Value we must take the mutex to + // prevent our read-copy-update from happening while another task + // is syncing syscall filters to us, this keeps the filters in a + // consistent state. + t.mu.Lock() + defer t.mu.Unlock() + if sf := t.syscallFilters.Load(); sf != nil { + oldFilters := sf.([]bpf.Program) + for _, f := range oldFilters { + totalLength += f.Length() + 4 + } + newFilters = append(newFilters, oldFilters...) } + if totalLength > maxSyscallFilterInstructions { return syserror.ENOMEM } - t.mu.Lock() - defer t.mu.Unlock() - t.syscallFilters = append(t.syscallFilters, p) + + newFilters = append(newFilters, p) + t.syscallFilters.Store(newFilters) + return nil +} + +// SyncSyscallFiltersToThreadGroup will copy this task's filters to all other +// threads in our thread group. +func (t *Task) SyncSyscallFiltersToThreadGroup() error { + f := t.syscallFilters.Load() + + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + + // Note: No new privs is always assumed to be set. + for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { + if ot.ThreadID() != t.ThreadID() { + // We must take the other task's mutex to prevent it from + // appending to its own syscall filters while we're syncing. 
+ ot.mu.Lock() + var copiedFilters []bpf.Program + if f != nil { + copiedFilters = append(copiedFilters, f.([]bpf.Program)...) + } + ot.syscallFilters.Store(copiedFilters) + ot.mu.Unlock() + } + } return nil } @@ -196,9 +239,8 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error { // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) // and /proc/[pid]/status. func (t *Task) SeccompMode() int { - t.mu.Lock() - defer t.mu.Unlock() - if len(t.syscallFilters) > 0 { + f := t.syscallFilters.Load() + if f != nil && len(f.([]bpf.Program)) > 0 { return linux.SECCOMP_MODE_FILTER } return linux.SECCOMP_MODE_NONE diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 7f6735320..e705260da 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -355,11 +355,11 @@ type Task struct { parentDeathSignal linux.Signal // syscallFilters is all seccomp-bpf syscall filters applicable to the - // task, in the order in which they were installed. + // task, in the order in which they were installed. The type of the atomic + // is []bpf.Program. Writing needs to be protected by mu. // - // syscallFilters is protected by mu. syscallFilters is owned by the task - // goroutine. - syscallFilters []bpf.Program + // syscallFilters is owned by the task goroutine. + syscallFilters atomic.Value `state:".([]bpf.Program)"` // If cleartid is non-zero, treat it as a pointer to a ThreadID in the // task's virtual address space; when the task exits, set the pointed-to @@ -469,6 +469,17 @@ func (t *Task) loadLogPrefix(prefix string) { t.logPrefix.Store(prefix) } +func (t *Task) saveSyscallFilters() []bpf.Program { + if f := t.syscallFilters.Load(); f != nil { + return f.([]bpf.Program) + } + return nil +} + +func (t *Task) loadSyscallFilters(filters []bpf.Program) { + t.syscallFilters.Store(filters) +} + // afterLoad is invoked by stateify. 
func (t *Task) afterLoad() { t.interruptChan = make(chan struct{}, 1) diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index a61283267..3b77a4965 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -280,7 +280,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - // Documentation/prctl/seccomp_filter.txt - nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...) + if f := t.syscallFilters.Load(); f != nil { + copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) + nt.syscallFilters.Store(copiedFilters) + } if opts.Vfork { nt.vforkParent = t } diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 79f4ff60c..92ca0acd9 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -194,7 +194,7 @@ func (t *Task) doSyscall() taskRunState { // Check seccomp filters. The nil check is for performance (as seccomp use // is rare), not needed for correctness. - if t.syscallFilters != nil { + if t.syscallFilters.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { case seccompResultDeny: t.Debugf("Syscall %d: denied by seccomp", sysno) @@ -334,7 +334,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { // to syscall ABI because they both use RDI, RSI, and RDX for the first three // arguments and none of the vsyscalls uses more than two arguments. 
args := t.Arch().SyscallArgs() - if t.syscallFilters != nil { + if t.syscallFilters.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { case seccompResultDeny: t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 574621ad2..32fca3811 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -42,6 +42,7 @@ go_library( "sys_rlimit.go", "sys_rusage.go", "sys_sched.go", + "sys_seccomp.go", "sys_sem.go", "sys_shm.go", "sys_signal.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index edfcdca3f..c102af101 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -359,6 +359,7 @@ var AMD64 = &kernel.SyscallTable{ 312: syscalls.CapError(linux.CAP_SYS_PTRACE), // Kcmp, requires cap_sys_ptrace 313: syscalls.CapError(linux.CAP_SYS_MODULE), // FinitModule, requires cap_sys_module // "Backports." + 317: Seccomp, 318: GetRandom, }, diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 2ca7471cf..911fef658 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -18,29 +18,13 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/bpf" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// userSockFprog is equivalent to Linux's struct sock_fprog on amd64. -type userSockFprog struct { - // Len is the length of the filter in BPF instructions. 
- Len uint16 - - _ [6]byte // padding for alignment - - // Filter is a user pointer to the struct sock_filter array that makes up - // the filter program. Filter is a uint64 rather than a usermem.Addr - // because usermem.Addr is actually uintptr, which is not a fixed-size - // type, and encoding/binary.Read objects to this. - Filter uint64 -} - // Prctl implements linux syscall prctl(2). // It has a list of subfunctions which operate on the process. The arguments are // all based on each subfunction. @@ -143,20 +127,8 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Unsupported mode. return 0, nil, syscall.EINVAL } - var fprog userSockFprog - if _, err := t.CopyIn(args[2].Pointer(), &fprog); err != nil { - return 0, nil, err - } - filter := make([]linux.BPFInstruction, int(fprog.Len)) - if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { - return 0, nil, err - } - compiledFilter, err := bpf.Compile(filter) - if err != nil { - t.Debugf("Invalid seccomp-bpf filter: %v", err) - return 0, nil, syscall.EINVAL - } - return 0, nil, t.AppendSyscallFilter(compiledFilter) + + return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) case linux.PR_GET_SECCOMP: return uintptr(t.SeccompMode()), nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go new file mode 100644 index 000000000..4323a4df4 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -0,0 +1,82 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// userSockFprog is equivalent to Linux's struct sock_fprog on amd64. +type userSockFprog struct { + // Len is the length of the filter in BPF instructions. + Len uint16 + + _ [6]byte // padding for alignment + + // Filter is a user pointer to the struct sock_filter array that makes up + // the filter program. Filter is a uint64 rather than a usermem.Addr + // because usermem.Addr is actually uintptr, which is not a fixed-size + // type, and encoding/binary.Read objects to this. + Filter uint64 +} + +// seccomp applies a seccomp policy to the current task. +func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { + // We only support SECCOMP_SET_MODE_FILTER at the moment. + if mode != linux.SECCOMP_SET_MODE_FILTER { + // Unsupported mode. + return syscall.EINVAL + } + + tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 + + // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. + if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { + // Unsupported flag. 
+ return syscall.EINVAL + } + + var fprog userSockFprog + if _, err := t.CopyIn(addr, &fprog); err != nil { + return err + } + filter := make([]linux.BPFInstruction, int(fprog.Len)) + if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { + return err + } + compiledFilter, err := bpf.Compile(filter) + if err != nil { + t.Debugf("Invalid seccomp-bpf filter: %v", err) + return syscall.EINVAL + } + + err = t.AppendSyscallFilter(compiledFilter) + if err == nil && tsync { + // Now we must copy this seccomp program to all other threads. + err = t.SyncSyscallFiltersToThreadGroup() + } + return err +} + +// Seccomp implements linux syscall seccomp(2). +func Seccomp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, seccomp(t, args[0].Uint64(), args[1].Uint64(), args[2].Pointer()) +} -- cgit v1.2.3 From 57d0fcbdbf7e9d2d573ce8d4ca2f72b82f778d63 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Thu, 2 Aug 2018 10:41:44 -0700 Subject: Automated rollback of changelist 207037226 PiperOrigin-RevId: 207125440 Change-Id: I6c572afb4d693ee72a0c458a988b0e96d191cd49 --- pkg/abi/BUILD | 13 +----- pkg/abi/linux/BUILD | 16 +------ pkg/abi/linux/bpf.go | 2 + pkg/abi/linux/tty.go | 2 + pkg/bpf/BUILD | 17 +------- pkg/bpf/interpreter.go | 2 + pkg/cpuid/BUILD | 15 +------ pkg/cpuid/cpuid.go | 2 + pkg/ilist/BUILD | 15 +------ pkg/ilist/list.go | 4 ++ pkg/segment/range.go | 2 + pkg/segment/set.go | 5 +++ pkg/sentry/arch/BUILD | 18 +------- pkg/sentry/arch/arch.go | 2 + pkg/sentry/arch/arch_amd64.go | 2 + pkg/sentry/arch/arch_state_x86.go | 1 + pkg/sentry/arch/arch_x86.go | 2 + pkg/sentry/arch/auxv.go | 2 + pkg/sentry/arch/signal_amd64.go | 6 +++ pkg/sentry/context/contexttest/BUILD | 17 +------- pkg/sentry/fs/BUILD | 36 +--------------- pkg/sentry/fs/ashmem/BUILD | 17 +------- pkg/sentry/fs/ashmem/area.go | 8 ++-- pkg/sentry/fs/ashmem/device.go | 22 +++++----- pkg/sentry/fs/ashmem/pin_board.go | 2 + pkg/sentry/fs/attr.go | 
12 ++++++ pkg/sentry/fs/binder/BUILD | 13 +----- pkg/sentry/fs/binder/binder.go | 26 ++++++----- pkg/sentry/fs/dentry.go | 4 ++ pkg/sentry/fs/dev/BUILD | 20 +-------- pkg/sentry/fs/dev/dev.go | 2 + pkg/sentry/fs/dev/fs.go | 2 + pkg/sentry/fs/dev/full.go | 2 + pkg/sentry/fs/dev/null.go | 3 ++ pkg/sentry/fs/dev/random.go | 1 + pkg/sentry/fs/dirent.go | 2 + pkg/sentry/fs/dirent_cache.go | 2 + pkg/sentry/fs/fdpipe/BUILD | 31 +------------ pkg/sentry/fs/fdpipe/pipe.go | 2 + pkg/sentry/fs/file.go | 2 + pkg/sentry/fs/file_overlay.go | 4 ++ pkg/sentry/fs/filesystems.go | 2 + pkg/sentry/fs/filetest/BUILD | 18 +------- pkg/sentry/fs/flags.go | 2 + pkg/sentry/fs/fsutil/BUILD | 20 +-------- pkg/sentry/fs/fsutil/dirty_set.go | 2 + pkg/sentry/fs/fsutil/handle.go | 2 + pkg/sentry/fs/fsutil/host_file_mapper.go | 2 + pkg/sentry/fs/fsutil/inode.go | 6 +++ pkg/sentry/fs/fsutil/inode_cached.go | 2 + pkg/sentry/fs/gofer/BUILD | 23 +--------- pkg/sentry/fs/gofer/file.go | 2 + pkg/sentry/fs/gofer/fs.go | 2 + pkg/sentry/fs/gofer/inode.go | 4 ++ pkg/sentry/fs/gofer/session.go | 3 ++ pkg/sentry/fs/host/BUILD | 27 +----------- pkg/sentry/fs/host/descriptor.go | 2 + pkg/sentry/fs/host/file.go | 2 + pkg/sentry/fs/host/fs.go | 6 ++- pkg/sentry/fs/host/inode.go | 4 ++ pkg/sentry/fs/host/socket.go | 2 + pkg/sentry/fs/inode.go | 4 ++ pkg/sentry/fs/inode_inotify.go | 2 + pkg/sentry/fs/inotify.go | 2 + pkg/sentry/fs/inotify_event.go | 2 + pkg/sentry/fs/inotify_watch.go | 2 + pkg/sentry/fs/lock/BUILD | 15 +------ pkg/sentry/fs/lock/lock.go | 6 ++- pkg/sentry/fs/mount.go | 4 ++ pkg/sentry/fs/mount_overlay.go | 4 ++ pkg/sentry/fs/mounts.go | 2 + pkg/sentry/fs/overlay.go | 2 + pkg/sentry/fs/proc/BUILD | 34 +-------------- pkg/sentry/fs/proc/cpuinfo.go | 2 + pkg/sentry/fs/proc/exec_args.go | 2 + pkg/sentry/fs/proc/fds.go | 6 +++ pkg/sentry/fs/proc/file.go | 1 + pkg/sentry/fs/proc/filesystems.go | 2 + pkg/sentry/fs/proc/fs.go | 2 + pkg/sentry/fs/proc/loadavg.go | 2 + pkg/sentry/fs/proc/meminfo.go | 2 + 
pkg/sentry/fs/proc/mounts.go | 4 ++ pkg/sentry/fs/proc/proc.go | 4 ++ pkg/sentry/fs/proc/seqfile/BUILD | 30 ++----------- pkg/sentry/fs/proc/seqfile/seqfile.go | 4 ++ pkg/sentry/fs/proc/stat.go | 2 + pkg/sentry/fs/proc/sys.go | 5 +++ pkg/sentry/fs/proc/sys_net.go | 2 + pkg/sentry/fs/proc/task.go | 20 +++++++++ pkg/sentry/fs/proc/uid_gid_map.go | 3 ++ pkg/sentry/fs/proc/uptime.go | 2 + pkg/sentry/fs/proc/version.go | 2 + pkg/sentry/fs/ramfs/BUILD | 21 +-------- pkg/sentry/fs/ramfs/dir.go | 2 + pkg/sentry/fs/ramfs/ramfs.go | 2 + pkg/sentry/fs/ramfs/socket.go | 2 + pkg/sentry/fs/ramfs/symlink.go | 2 + pkg/sentry/fs/ramfs/test/BUILD | 18 +------- pkg/sentry/fs/sys/BUILD | 14 +----- pkg/sentry/fs/sys/fs.go | 2 + pkg/sentry/fs/sys/sys.go | 5 ++- pkg/sentry/fs/timerfd/BUILD | 18 +------- pkg/sentry/fs/timerfd/timerfd.go | 4 +- pkg/sentry/fs/tmpfs/BUILD | 17 +------- pkg/sentry/fs/tmpfs/file_regular.go | 2 + pkg/sentry/fs/tmpfs/fs.go | 2 + pkg/sentry/fs/tmpfs/inode_file.go | 2 + pkg/sentry/fs/tmpfs/tmpfs.go | 8 ++++ pkg/sentry/fs/tty/BUILD | 20 +-------- pkg/sentry/fs/tty/dir.go | 18 +++++--- pkg/sentry/fs/tty/fs.go | 4 ++ pkg/sentry/fs/tty/inode.go | 2 + pkg/sentry/fs/tty/line_discipline.go | 6 +++ pkg/sentry/fs/tty/master.go | 4 ++ pkg/sentry/fs/tty/queue.go | 4 +- pkg/sentry/fs/tty/slave.go | 4 ++ pkg/sentry/fs/tty/terminal.go | 2 + pkg/sentry/inet/BUILD | 15 +------ pkg/sentry/inet/inet.go | 2 + pkg/sentry/kernel/BUILD | 60 +++----------------------- pkg/sentry/kernel/abstract_socket_namespace.go | 3 ++ pkg/sentry/kernel/auth/BUILD | 17 +------- pkg/sentry/kernel/auth/credentials.go | 2 + pkg/sentry/kernel/auth/id_map.go | 2 + pkg/sentry/kernel/auth/user_namespace.go | 2 + pkg/sentry/kernel/epoll/BUILD | 15 +------ pkg/sentry/kernel/epoll/epoll.go | 8 +++- pkg/sentry/kernel/eventfd/BUILD | 18 +------- pkg/sentry/kernel/eventfd/eventfd.go | 4 +- pkg/sentry/kernel/fd_map.go | 6 +++ pkg/sentry/kernel/fs_context.go | 2 + pkg/sentry/kernel/futex/BUILD | 18 +------- 
pkg/sentry/kernel/futex/futex.go | 2 + pkg/sentry/kernel/ipc_namespace.go | 2 + pkg/sentry/kernel/kernel.go | 4 +- pkg/sentry/kernel/pending_signals.go | 5 +++ pkg/sentry/kernel/pipe/BUILD | 20 +-------- pkg/sentry/kernel/pipe/buffers.go | 2 + pkg/sentry/kernel/pipe/node.go | 2 + pkg/sentry/kernel/pipe/pipe.go | 2 + pkg/sentry/kernel/pipe/reader.go | 2 + pkg/sentry/kernel/pipe/reader_writer.go | 2 + pkg/sentry/kernel/pipe/writer.go | 2 + pkg/sentry/kernel/ptrace.go | 4 ++ pkg/sentry/kernel/rseq.go | 2 + pkg/sentry/kernel/semaphore/BUILD | 15 +------ pkg/sentry/kernel/semaphore/semaphore.go | 8 ++++ pkg/sentry/kernel/sessions.go | 4 ++ pkg/sentry/kernel/shm/BUILD | 13 +----- pkg/sentry/kernel/shm/shm.go | 4 ++ pkg/sentry/kernel/signal_handlers.go | 2 + pkg/sentry/kernel/syscalls.go | 2 + pkg/sentry/kernel/syslog.go | 2 + pkg/sentry/kernel/task.go | 2 + pkg/sentry/kernel/task_clone.go | 4 ++ pkg/sentry/kernel/task_context.go | 2 + pkg/sentry/kernel/task_exec.go | 4 ++ pkg/sentry/kernel/task_exit.go | 6 +++ pkg/sentry/kernel/task_resources.go | 2 + pkg/sentry/kernel/task_run.go | 2 + pkg/sentry/kernel/task_sched.go | 2 + pkg/sentry/kernel/task_signals.go | 5 +++ pkg/sentry/kernel/task_syscall.go | 4 ++ pkg/sentry/kernel/thread_group.go | 2 + pkg/sentry/kernel/threads.go | 8 ++++ pkg/sentry/kernel/time/BUILD | 14 +----- pkg/sentry/kernel/time/time.go | 6 +++ pkg/sentry/kernel/timekeeper.go | 2 + pkg/sentry/kernel/timer.go | 8 ++++ pkg/sentry/kernel/uts_namespace.go | 2 + pkg/sentry/kernel/vdso.go | 2 + pkg/sentry/limits/BUILD | 13 +----- pkg/sentry/limits/limits.go | 4 ++ pkg/sentry/loader/BUILD | 15 +------ pkg/sentry/loader/vdso.go | 2 + pkg/sentry/loader/vdso_state.go | 1 + pkg/sentry/memmap/BUILD | 15 +------ pkg/sentry/memmap/mapping_set.go | 2 + pkg/sentry/mm/BUILD | 21 +-------- pkg/sentry/mm/aio_context.go | 8 ++++ pkg/sentry/mm/mm.go | 7 +++ pkg/sentry/mm/special_mappable.go | 2 + pkg/sentry/platform/BUILD | 13 +----- pkg/sentry/platform/filemem/BUILD | 14 
+----- pkg/sentry/platform/filemem/filemem.go | 2 + pkg/sentry/socket/BUILD | 17 +------- pkg/sentry/socket/control/BUILD | 23 +++------- pkg/sentry/socket/control/control.go | 4 ++ pkg/sentry/socket/epsocket/BUILD | 16 +------ pkg/sentry/socket/epsocket/epsocket.go | 2 + pkg/sentry/socket/epsocket/stack.go | 2 + pkg/sentry/socket/hostinet/BUILD | 15 +------ pkg/sentry/socket/netlink/BUILD | 13 +----- pkg/sentry/socket/netlink/port/BUILD | 15 +------ pkg/sentry/socket/netlink/port/port.go | 2 + pkg/sentry/socket/netlink/route/BUILD | 17 +------- pkg/sentry/socket/netlink/route/protocol.go | 2 + pkg/sentry/socket/netlink/socket.go | 2 + pkg/sentry/socket/socket.go | 2 + pkg/sentry/socket/unix/BUILD | 13 +----- pkg/sentry/socket/unix/unix.go | 2 + pkg/sentry/syscalls/linux/BUILD | 20 +-------- pkg/sentry/syscalls/linux/sys_aio.go | 2 + pkg/sentry/syscalls/linux/sys_futex.go | 2 + pkg/sentry/syscalls/linux/sys_poll.go | 2 + pkg/sentry/syscalls/linux/sys_time.go | 2 + pkg/sentry/usage/BUILD | 17 +------- pkg/sentry/usage/cpu.go | 2 + pkg/sentry/usage/io.go | 2 + pkg/sentry/usermem/BUILD | 16 +------ pkg/sentry/usermem/access_type.go | 2 + pkg/sentry/usermem/addr.go | 2 + pkg/tcpip/BUILD | 17 +------- pkg/tcpip/buffer/BUILD | 13 +----- pkg/tcpip/buffer/view.go | 2 + pkg/tcpip/header/BUILD | 13 +----- pkg/tcpip/header/tcp.go | 4 ++ pkg/tcpip/network/fragmentation/BUILD | 11 +---- pkg/tcpip/seqnum/BUILD | 17 +------- pkg/tcpip/tcpip.go | 4 ++ pkg/tcpip/transport/ping/BUILD | 17 +------- pkg/tcpip/transport/ping/endpoint.go | 1 + pkg/tcpip/transport/queue/BUILD | 17 +------- pkg/tcpip/transport/queue/queue.go | 2 + pkg/tcpip/transport/tcp/BUILD | 25 +---------- pkg/tcpip/transport/tcp/endpoint.go | 4 ++ pkg/tcpip/transport/tcp/rcv.go | 2 + pkg/tcpip/transport/tcp/reno.go | 2 + pkg/tcpip/transport/tcp/segment.go | 2 + pkg/tcpip/transport/tcp/segment_queue.go | 2 + pkg/tcpip/transport/tcp/snd.go | 4 ++ pkg/tcpip/transport/tcp/snd_state.go | 1 + pkg/tcpip/transport/udp/BUILD 
| 17 +------- pkg/tcpip/transport/udp/endpoint.go | 3 ++ pkg/tcpip/transport/unix/BUILD | 16 +------ pkg/tcpip/transport/unix/connectioned.go | 2 + pkg/tcpip/transport/unix/connectionless.go | 2 + pkg/tcpip/transport/unix/unix.go | 11 +++++ pkg/waiter/BUILD | 21 ++------- pkg/waiter/waiter.go | 2 + 240 files changed, 664 insertions(+), 1109 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index f1e6bac67..c014d2c4b 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -1,24 +1,13 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "abi_state", - srcs = [ - "abi.go", - ], - out = "abi_state.go", - package = "abi", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "abi", srcs = [ "abi.go", - "abi_state.go", "flag.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/abi", visibility = ["//:sandbox"], - deps = ["//pkg/state"], ) diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 38b4829c9..ac4ceefbc 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -4,19 +4,7 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "linux_state", - srcs = [ - "binder.go", - "bpf.go", - "time.go", - "tty.go", - ], - out = "linux_state.go", - package = "linux", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "linux", @@ -41,7 +29,6 @@ go_library( "ipc.go", "limits.go", "linux.go", - "linux_state.go", "mm.go", "netdevice.go", "netlink.go", @@ -67,6 +54,5 @@ go_library( "//pkg/abi", "//pkg/binary", "//pkg/bits", - "//pkg/state", ], ) diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go index f597ef4f5..80e5b1af1 100644 --- a/pkg/abi/linux/bpf.go +++ b/pkg/abi/linux/bpf.go @@ -15,6 +15,8 @@ package linux // BPFInstruction is a raw BPF virtual machine instruction. 
+// +// +stateify savable type BPFInstruction struct { // OpCode is the operation to execute. OpCode uint16 diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index 84b6ccc87..b640f7627 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -38,6 +38,8 @@ type Termios struct { // KernelTermios is struct ktermios/struct termios2, defined in // uapi/asm-generic/termbits.h. +// +// +stateify savable type KernelTermios struct { InputFlags uint32 OutputFlags uint32 diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD index 403270049..564df3af5 100644 --- a/pkg/bpf/BUILD +++ b/pkg/bpf/BUILD @@ -1,21 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "bpf_state", - srcs = [ - "interpreter.go", - ], - out = "bpf_state.go", - package = "bpf", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "bpf", srcs = [ "bpf.go", - "bpf_state.go", "decoder.go", "input_bytes.go", "interpreter.go", @@ -23,10 +13,7 @@ go_library( ], importpath = "gvisor.googlesource.com/gvisor/pkg/bpf", visibility = ["//visibility:public"], - deps = [ - "//pkg/abi/linux", - "//pkg/state", - ], + deps = ["//pkg/abi/linux"], ) go_test( diff --git a/pkg/bpf/interpreter.go b/pkg/bpf/interpreter.go index b7dee86a8..111ada9d1 100644 --- a/pkg/bpf/interpreter.go +++ b/pkg/bpf/interpreter.go @@ -88,6 +88,8 @@ func (e Error) Error() string { } // Program is a BPF program that has been validated for consistency. 
+// +// +stateify savable type Program struct { instructions []linux.BPFInstruction } diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index 9a0ca1b33..46fc4703b 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -1,27 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "cpuid_state", - srcs = ["cpuid.go"], - out = "cpuid_state.go", - package = "cpuid", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "cpuid", srcs = [ "cpu_amd64.s", "cpuid.go", - "cpuid_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/cpuid", visibility = ["//:sandbox"], - deps = [ - "//pkg/log", - "//pkg/state", - ], + deps = ["//pkg/log"], ) go_test( diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index b486ab037..e91e34dc7 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -409,6 +409,8 @@ func (f Feature) flagString(cpuinfoOnly bool) string { } // FeatureSet is a set of Features for a cpu. +// +// +stateify savable type FeatureSet struct { // Set is the set of features that are enabled in this FeatureSet. 
Set map[Feature]bool diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index e32f26ffa..b26a39132 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,28 +1,15 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "list_state", - srcs = [ - "interface_list.go", - ], - out = "interface_list_state.go", - package = "ilist", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "ilist", srcs = [ "interface_list.go", - "interface_list_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/ilist", visibility = ["//visibility:public"], - deps = [ - "//pkg/state", - ], ) go_template_instance( diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index 5efb6c072..a88b82196 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -36,6 +36,8 @@ type Linker interface { // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } +// +// +stateify savable type List struct { head Linker tail Linker @@ -155,6 +157,8 @@ func (l *List) Remove(e Linker) { // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. +// +// +stateify savable type Entry struct { next Linker prev Linker diff --git a/pkg/segment/range.go b/pkg/segment/range.go index 5ff30d489..34c067265 100644 --- a/pkg/segment/range.go +++ b/pkg/segment/range.go @@ -18,6 +18,8 @@ package segment type T uint64 // A Range represents a contiguous range of T. +// +// +stateify savable type Range struct { // Start is the inclusive start of the range. 
Start T diff --git a/pkg/segment/set.go b/pkg/segment/set.go index 6eed1d930..cffec2a2c 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -88,6 +88,8 @@ const ( // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. +// +// +stateify savable type Set struct { root node `state:".(*SegmentDataSlices)"` } @@ -596,6 +598,7 @@ func (s *Set) ApplyContiguous(r Range, fn func(seg Iterator)) GapIterator { } } +// +stateify savable type node struct { // An internal binary tree node looks like: // @@ -1317,6 +1320,8 @@ func (n *node) writeDebugString(buf *bytes.Buffer, prefix string) { // SegmentDataSlices represents segments from a set as slices of start, end, and // values. SegmentDataSlices is primarily used as an intermediate representation // for save/restore and the layout here is optimized for that. +// +// +stateify savable type SegmentDataSlices struct { Start []Key End []Key diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 0a2a35400..314b3e962 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,21 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "arch_state", - srcs = [ - "arch.go", - "arch_amd64.go", - "arch_state_x86.go", - "arch_x86.go", - "auxv.go", - "signal_amd64.go", - ], - out = "arch_state.go", - package = "arch", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "arch", @@ -24,7 +10,6 @@ go_library( "arch.go", "arch_amd64.go", "arch_amd64.s", - "arch_state.go", "arch_state_x86.go", "arch_x86.go", "auxv.go", @@ -46,7 +31,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/limits", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/arch/arch.go 
b/pkg/sentry/arch/arch.go index 0189e958d..21cb84502 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -254,6 +254,8 @@ const ( // MemoryManager. // // Note that "highest address" below is always exclusive. +// +// +stateify savable type MmapLayout struct { // MinAddr is the lowest mappable address. MinAddr usermem.Addr diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 23526fe8e..f1e408af9 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -95,6 +95,8 @@ const ( ) // context64 represents an AMD64 context. +// +// +stateify savable type context64 struct { State sigFPState []x86FPState // fpstate to be restored on sigreturn. diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index cb38d098a..e9c23a06b 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -56,6 +56,7 @@ func (s *State) afterLoad() { copy(s.x86FPState, old) } +// +stateify savable type syscallPtraceRegs struct { R15 uint64 R14 uint64 diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 5cc4f8377..b35eec53c 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -153,6 +153,8 @@ func NewFloatingPointData() *FloatingPointData { // State contains the common architecture bits for X86 (the build tag of this // file ensures it's only built on x86). +// +// +stateify savable type State struct { // The system registers. Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 70e0e35b7..81cfb4a01 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -19,6 +19,8 @@ import ( ) // An AuxEntry represents an entry in an ELF auxiliary vector. 
+// +// +stateify savable type AuxEntry struct { Key uint64 Value usermem.Addr diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index c1d743f38..e81717e8b 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -28,6 +28,8 @@ import ( // SignalAct represents the action that should be taken when a signal is // delivered, and is equivalent to struct sigaction on 64-bit x86. +// +// +stateify savable type SignalAct struct { Handler uint64 Flags uint64 @@ -47,6 +49,8 @@ func (s *SignalAct) DeserializeTo(other *SignalAct) { // SignalStack represents information about a user stack, and is equivalent to // stack_t on 64-bit x86. +// +// +stateify savable type SignalStack struct { Addr uint64 Flags uint32 @@ -66,6 +70,8 @@ func (s *SignalStack) DeserializeTo(other *SignalStack) { // SignalInfo represents information about a signal being delivered, and is // equivalent to struct siginfo on 64-bit x86. +// +// +stateify savable type SignalInfo struct { Signo int32 // Signal number Errno int32 // Errno value diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 591b11a4d..01bb40b04 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,23 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "contexttest_state", - srcs = [ - "contexttest.go", - ], - out = "contexttest_state.go", - package = "contexttest", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "contexttest", testonly = 1, - srcs = [ - "contexttest.go", - "contexttest_state.go", - ], + srcs = ["contexttest.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ @@ -28,6 +16,5 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/platform/ptrace", "//pkg/sentry/uniqueid", - 
"//pkg/state", ], ) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index e3c9a9b70..18cd5ae8e 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,40 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "fs_state", - srcs = [ - "attr.go", - "dentry.go", - "dirent.go", - "dirent_cache.go", - "dirent_list.go", - "dirent_state.go", - "file.go", - "file_overlay.go", - "file_state.go", - "filesystems.go", - "flags.go", - "inode.go", - "inode_inotify.go", - "inode_operations.go", - "inode_overlay.go", - "inotify.go", - "inotify_event.go", - "inotify_watch.go", - "mock.go", - "mount.go", - "mount_overlay.go", - "mount_state.go", - "mounts.go", - "overlay.go", - "path.go", - ], - out = "fs_state.go", - package = "fs", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "fs", @@ -54,7 +21,6 @@ go_library( "filesystems.go", "flags.go", "fs.go", - "fs_state.go", "inode.go", "inode_inotify.go", "inode_operations.go", diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index 9f166799a..dc893d22f 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -1,26 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -go_stateify( - name = "ashmem_state", - srcs = [ - "area.go", - "device.go", - "pin_board.go", - "uint64_range.go", - "uint64_set.go", - ], - out = "ashmem_state.go", - package = "ashmem", -) - go_library( name = "ashmem", srcs = [ "area.go", - "ashmem_state.go", "device.go", "pin_board.go", "uint64_range.go", @@ -41,7 +27,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", 
"//pkg/syserror", "//pkg/tcpip/transport/unix", ], diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index e4f76f0d0..bfd7f2762 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -39,10 +39,12 @@ const ( ) // Area implements fs.FileOperations. +// +// +stateify savable type Area struct { - fsutil.NoFsync - fsutil.DeprecatedFileOperations - fsutil.NotDirReaddir + fsutil.NoFsync `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` ad *Device diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index c5b51d4a7..d0986fa11 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -27,17 +27,19 @@ import ( ) // Device implements fs.InodeOperations. +// +// +stateify savable type Device struct { - fsutil.DeprecatedFileOperations - fsutil.InodeNoExtendedAttributes - fsutil.InodeNotDirectory - fsutil.InodeNotRenameable - fsutil.InodeNotSocket - fsutil.InodeNotSymlink - fsutil.NoFsync - fsutil.NoMappable - fsutil.NoopWriteOut - fsutil.NotDirReaddir + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` mu sync.Mutex `state:"nosave"` unstable fs.UnstableAttr diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go index c7fb3822c..ecba395a0 100644 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ b/pkg/sentry/fs/ashmem/pin_board.go @@ -56,6 +56,8 @@ func (setFunctions) Split(Range, noValue, uint64) (noValue, noValue) { // segment.Set is used for implementation where segments represent // ranges of pinned bytes, while gaps represent ranges 
of unpinned // bytes. All ranges are page-aligned. +// +// +stateify savable type PinBoard struct { Set } diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 56a2ad6f7..4178f18b2 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -91,6 +91,8 @@ func (n InodeType) String() string { // StableAttr contains Inode attributes that will be stable throughout the // lifetime of the Inode. +// +// +stateify savable type StableAttr struct { // Type is the InodeType of a InodeOperations. Type InodeType @@ -150,6 +152,8 @@ func IsCharDevice(s StableAttr) bool { // UnstableAttr contains Inode attributes that may change over the lifetime // of the Inode. +// +// +stateify savable type UnstableAttr struct { // Size is the file size in bytes. Size int64 @@ -186,6 +190,8 @@ func WithCurrentTime(ctx context.Context, u UnstableAttr) UnstableAttr { } // AttrMask contains fields to mask StableAttr and UnstableAttr. +// +// +stateify savable type AttrMask struct { Type bool DeviceID bool @@ -227,6 +233,8 @@ func (a AttrMask) Union(b AttrMask) AttrMask { } // PermMask are file access permissions. +// +// +stateify savable type PermMask struct { // Read indicates reading is permitted. Read bool @@ -280,6 +288,8 @@ func (p PermMask) SupersetOf(other PermMask) bool { // FilePermissions represents the permissions of a file, with // Read/Write/Execute bits for user, group, and other. +// +// +stateify savable type FilePermissions struct { User PermMask Group PermMask @@ -370,6 +380,8 @@ func (f FilePermissions) AnyRead() bool { } // FileOwner represents ownership of a file. 
+// +// +stateify savable type FileOwner struct { UID auth.KUID GID auth.KGID diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index ec3928baf..a077b91d2 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -1,25 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "binder_state", - srcs = ["binder.go"], - out = "binder_state.go", - package = "binder", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "binder", srcs = [ "binder.go", - "binder_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/binder", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -30,8 +21,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", - "//pkg/tcpip/transport/unix", ], ) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 3f87b6b08..502a262dd 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -40,15 +40,17 @@ const ( ) // Device implements fs.InodeOperations. +// +// +stateify savable type Device struct { - fsutil.InodeNoExtendedAttributes - fsutil.InodeNotDirectory - fsutil.InodeNotRenameable - fsutil.InodeNotSocket - fsutil.InodeNotSymlink - fsutil.NoMappable - fsutil.NoopWriteOut - fsutil.DeprecatedFileOperations + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` // mu protects unstable. 
mu sync.Mutex `state:"nosave"` @@ -186,10 +188,12 @@ func (bd *Device) StatFS(context.Context) (fs.Info, error) { } // Proc implements fs.FileOperations and fs.IoctlGetter. +// +// +stateify savable type Proc struct { - fsutil.NoFsync - fsutil.DeprecatedFileOperations - fsutil.NotDirReaddir + fsutil.NoFsync `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` bd *Device task *kernel.Task diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index d42e8da81..b347468ff 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -21,6 +21,8 @@ import ( ) // DentAttr is the metadata of a directory entry. It is a subset of StableAttr. +// +// +stateify savable type DentAttr struct { // Type is the InodeType of an Inode. Type InodeType @@ -154,6 +156,8 @@ func GenericReaddir(ctx *DirCtx, s *SortedDentryMap) (int, error) { } // SortedDentryMap is a sorted map of names and fs.DentAttr entries. +// +// +stateify savable type SortedDentryMap struct { // names is always kept in sorted-order. 
names []string diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index ea41615fd..fc069bb5f 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,25 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "dev_state", - srcs = [ - "dev.go", - "fs.go", - "full.go", - "null.go", - "random.go", - ], - out = "dev_state.go", - package = "dev", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "dev", srcs = [ "dev.go", - "dev_state.go", "device.go", "fs.go", "full.go", @@ -30,8 +16,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/amutex", - "//pkg/log", "//pkg/rand", "//pkg/sentry/context", "//pkg/sentry/device", @@ -45,9 +29,7 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/platform", "//pkg/sentry/safemem", - "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 36c61bfc2..3f4f2a40a 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -27,6 +27,8 @@ import ( ) // Dev is the root node. +// +// +stateify savable type Dev struct { ramfs.Dir } diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 3c79f3782..2ae49be4e 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -29,6 +29,8 @@ const binderEnabledKey = "binder_enabled" const ashmemEnabledKey = "ashmem_enabled" // filesystem is a devtmpfs. +// +// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index e13eb6c03..492b8eb3a 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -26,6 +26,8 @@ import ( ) // fullDevice is used to implement /dev/full. 
+// +// +stateify savable type fullDevice struct { ramfs.Entry } diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 66b8ba967..2977c8670 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// +stateify savable type nullDevice struct { ramfs.Entry } @@ -54,6 +55,7 @@ func (n *nullDevice) Truncate(context.Context, *fs.Inode, int64) error { return nil } +// +stateify savable type zeroDevice struct { nullDevice } @@ -80,6 +82,7 @@ func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F }), nil } +// +stateify savable type zeroFileOperations struct { fs.FileOperations } diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 33a045a05..47b76218f 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -24,6 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// +stateify savable type randomDevice struct { ramfs.Entry } diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index f9bf2fba6..4658d044f 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -81,6 +81,8 @@ var renameMu sync.RWMutex // // Dirents currently do not attempt to free entries that lack application references under // memory pressure. +// +// +stateify savable type Dirent struct { // AtomicRefCount is our reference count. refs.AtomicRefCount diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index e786e4f65..c680e4828 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -25,6 +25,8 @@ import ( // // A nil DirentCache corresponds to a cache with size 0. All methods can be // called, but nothing is actually cached. +// +// +stateify savable type DirentCache struct { // Maximum size of the cache. This must be saved manually, to handle the case // when cache is nil. 
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 4fcb06f1f..ffe4204bc 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -1,54 +1,27 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "pipe_state", - srcs = [ - "pipe.go", - "pipe_state.go", - ], - out = "pipe_autogen_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/fs"], - package = "fdpipe", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "fdpipe", srcs = [ "pipe.go", - "pipe_autogen_state.go", "pipe_opener.go", "pipe_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe", + imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/fs"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/abi/linux", - "//pkg/amutex", "//pkg/fd", "//pkg/log", - "//pkg/metric", - "//pkg/p9", - "//pkg/refs", "//pkg/secio", "//pkg/sentry/context", - "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", - "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/safemem", - "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", - "//pkg/tcpip", - "//pkg/tcpip/transport/unix", - "//pkg/unet", "//pkg/waiter", "//pkg/waiter/fdnotifier", ], diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 7b318e35f..2e34604e6 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -34,6 +34,8 @@ import ( ) // pipeOperations are the fs.FileOperations of a host pipe. 
+// +// +stateify savable type pipeOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 6d93ef760..8e535a618 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -47,6 +47,8 @@ const FileMaxOffset = math.MaxInt64 // and write(2). // // FIXME: Split synchronization from cancellation. +// +// +stateify savable type File struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 36b2cf75e..113962368 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -60,6 +60,8 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err } // overlayFileOperations implements FileOperations for a file in an overlay. +// +// +stateify savable type overlayFileOperations struct { // upperMu protects upper below. In contrast lower is stable. upperMu sync.Mutex `state:"nosave"` @@ -375,6 +377,8 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { // overlayMappingIdentity wraps a MappingIdentity, and also holds a reference // on a file during its lifetime. +// +// +stateify savable type overlayMappingIdentity struct { refs.AtomicRefCount id memmap.MappingIdentity diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index 200e792f4..5a1e7a270 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -125,6 +125,8 @@ func GetFilesystems() []Filesystem { } // MountSourceFlags represents all mount option flags as a struct. +// +// +stateify savable type MountSourceFlags struct { // ReadOnly corresponds to mount(2)'s "MS_RDONLY" and indicates that // the filesystem should be mounted read-only. 
diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index f481c57fb..d137fee4c 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,34 +1,20 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "filetest_state", - srcs = [ - "filetest.go", - ], - out = "filetest_state.go", - package = "filetest", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "filetest", testonly = 1, - srcs = [ - "filetest.go", - "filetest_state.go", - ], + srcs = ["filetest.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index da0ff58af..1aa271560 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -19,6 +19,8 @@ import ( ) // FileFlags encodes file flags. +// +// +stateify savable type FileFlags struct { // Direct indicates that I/O should be done directly. 
Direct bool diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 6eea64298..3512bae6f 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,24 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "fsutil_state", - srcs = [ - "dirty_set_impl.go", - "file.go", - "file_range_set_impl.go", - "frame_ref_set_impl.go", - "handle.go", - "host_file_mapper.go", - "host_file_mapper_state.go", - "inode.go", - "inode_cached.go", - ], - out = "fsutil_state.go", - package = "fsutil", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "dirty_set_impl", @@ -84,7 +67,6 @@ go_library( "frame_ref_set.go", "frame_ref_set_impl.go", "fsutil.go", - "fsutil_state.go", "handle.go", "host_file_mapper.go", "host_file_mapper_state.go", diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 9c6c98542..8e31e48fd 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -32,6 +32,8 @@ import ( // DirtyInfo is the value type of DirtySet, and represents information about a // Mappable offset that is dirty (the cached data for that offset is newer than // its source). +// +// +stateify savable type DirtyInfo struct { // Keep is true if the represented offset is concurrently writable, such // that writing the data for that offset back to the source does not diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go index 149c0f84a..e7efd3c0f 100644 --- a/pkg/sentry/fs/fsutil/handle.go +++ b/pkg/sentry/fs/fsutil/handle.go @@ -27,6 +27,8 @@ import ( // // FIXME: Remove Handle entirely in favor of individual fs.File // implementations using simple generic utilities. 
+// +// +stateify savable type Handle struct { NoopRelease `state:"nosave"` NoIoctl `state:"nosave"` diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index d0a27fc1c..9c1e2f76f 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -29,6 +29,8 @@ import ( // HostFileMapper caches mappings of an arbitrary host file descriptor. It is // used by implementations of memmap.Mappable that represent a host file // descriptor. +// +// +stateify savable type HostFileMapper struct { // HostFile conceptually breaks the file into pieces called chunks, of // size and alignment chunkSize, and caches mappings of the file on a chunk diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index e1ad07df2..177396fdc 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -31,6 +31,8 @@ func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations { } // simpleInodeOperations is a simple implementation of Inode. +// +// +stateify savable type simpleInodeOperations struct { DeprecatedFileOperations `state:"nosave"` InodeNotDirectory `state:"nosave"` @@ -48,6 +50,8 @@ type simpleInodeOperations struct { // InodeSimpleAttributes implements a subset of the Inode interface. It provides // read-only access to attributes. +// +// +stateify savable type InodeSimpleAttributes struct { // FSType is the filesystem type reported by StatFS. FSType uint64 @@ -110,6 +114,8 @@ func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error // // Users need not initialize Xattrs to non-nil (it will be initialized // when the first extended attribute is set. 
+// +// +stateify savable type InMemoryAttributes struct { Unstable fs.UnstableAttr Xattrs map[string][]byte diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index cba642a8f..0a320e2d8 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -55,6 +55,8 @@ import ( // // Implementations of InodeOperations.WriteOut must call Sync to write out // in-memory modifications of data and metadata to the CachedFileObject. +// +// +stateify savable type CachingInodeOperations struct { // backingFile is a handle to a cached file object. backingFile CachedFileObject diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 1277379e7..cb17339c9 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,21 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "gofer_state", - srcs = [ - "file.go", - "file_state.go", - "fs.go", - "inode.go", - "inode_state.go", - "session.go", - "session_state.go", - ], - out = "gofer_state.go", - package = "gofer", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "gofer", @@ -27,7 +12,6 @@ go_library( "file.go", "file_state.go", "fs.go", - "gofer_state.go", "handles.go", "inode.go", "inode_state.go", @@ -41,7 +25,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/amutex", "//pkg/fd", "//pkg/log", "//pkg/metric", @@ -54,15 +37,11 @@ go_library( "//pkg/sentry/fs/fdpipe", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/host", - "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/safemem", - "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/fs/gofer/file.go 
b/pkg/sentry/fs/gofer/file.go index 039618808..46a6bbd5d 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -33,6 +33,8 @@ import ( var openedWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") // fileOperations implements fs.FileOperations for a remote file system. +// +// +stateify savable type fileOperations struct { fsutil.NoIoctl `state:"nosave"` waiter.AlwaysReady `state:"nosave"` diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index dd5d43c47..3ae93f059 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -83,6 +83,8 @@ var ( ) // filesystem is a 9p client. +// +// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index df584c382..7fc8f77b0 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -35,6 +35,8 @@ import ( ) // inodeOperations implements fs.InodeOperations. +// +// +stateify savable type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` @@ -68,6 +70,8 @@ type inodeOperations struct { // circular load dependency between it and inodeOperations). Even with // lazy loading, this approach defines the dependencies between objects // and the expected load behavior more concretely. +// +// +stateify savable type inodeFileState struct { // s is common file system state for Gofers. s *session `state:"wait"` diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index b6841526a..648a11435 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -27,6 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) +// +stateify savable type endpointMap struct { mu sync.RWMutex `state:"nosave"` // TODO: Make map with private unix sockets savable. 
@@ -63,6 +64,8 @@ func (e *endpointMap) get(key device.MultiDeviceKey) unix.BoundEndpoint { } // session holds state for each 9p session established during sys_mount. +// +// +stateify savable type session struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 23ec66f50..29c79284a 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,23 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "host_state", - srcs = [ - "control.go", - "descriptor.go", - "descriptor_state.go", - "file.go", - "fs.go", - "inode.go", - "inode_state.go", - "socket.go", - "socket_state.go", - ], - out = "host_state.go", - package = "host", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "host", @@ -28,7 +11,6 @@ go_library( "device.go", "file.go", "fs.go", - "host_state.go", "inode.go", "inode_state.go", "ioctl_unsafe.go", @@ -42,7 +24,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/amutex", "//pkg/fd", "//pkg/log", "//pkg/refs", @@ -52,20 +33,14 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fs/lock", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/safemem", - "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", - "//pkg/sentry/uniqueid", - "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/link/rawfile", diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 613bd06e8..3aee4d11c 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -25,6 +25,8 @@ import ( ) // descriptor wraps a host fd. 
+// +// +stateify savable type descriptor struct { // donated is true if the host fd was donated by another process. donated bool diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index bdf844337..f9bef6d93 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -37,6 +37,8 @@ import ( ) // fileOperations implements fs.FileOperations for a host file descriptor. +// +// +stateify savable type fileOperations struct { fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index 974700636..e46ae433c 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -51,6 +51,8 @@ const maxTraversals = 10 // to lock down the configurations. This filesystem should only be mounted at root. // // Think twice before exposing this to applications. +// +// +stateify savable type Filesystem struct { // whitelist is a set of host paths to whitelist. paths []string @@ -266,8 +268,10 @@ func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, file } // superOperations implements fs.MountSourceOperations. +// +// +stateify savable type superOperations struct { - fs.SimpleMountSourceOperations `state:"nosave"` + fs.SimpleMountSourceOperations // root is the path of the mount point. All inode mappings // are relative to this root. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 226bc5164..761ccde33 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -34,6 +34,8 @@ import ( // inodeOperations implements fs.InodeOperations for an fs.Inodes backed // by a host file descriptor. +// +// +stateify savable type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` @@ -65,6 +67,8 @@ type inodeOperations struct { // circular load dependency between it and inodeOperations). 
Even with // lazy loading, this approach defines the dependencies between objects // and the expected load behavior more concretely. +// +// +stateify savable type inodeFileState struct { // Common file system state. mops *superOperations `state:"wait"` diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index f4689f51f..1d93eb1e3 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -34,6 +34,8 @@ import ( ) // endpoint encapsulates the state needed to represent a host Unix socket. +// +// +stateify savable type endpoint struct { queue waiter.Queue `state:"nosave"` diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 6c8e6f188..d0dbce5dd 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -28,6 +28,8 @@ import ( // Inode is a file system object that can be simultaneously referenced by different // components of the VFS (Dirent, fs.File, etc). +// +// +stateify savable type Inode struct { // AtomicRefCount is our reference count. refs.AtomicRefCount @@ -58,6 +60,8 @@ type Inode struct { // Note that in Linux fcntl(2) and flock(2) locks are _not_ cooperative, because race and // deadlock conditions make merging them prohibitive. We do the same and keep them oblivious // to each other but provide a "context" as a convenient container. +// +// +stateify savable type LockCtx struct { // Posix is a set of POSIX-style regional advisory locks, see fcntl(2). Posix lock.Locks diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index 358bbecdf..683140afe 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -20,6 +20,8 @@ import ( ) // Watches is the collection of inotify watches on an inode. +// +// +stateify savable type Watches struct { // mu protects the fields below. 
mu sync.RWMutex `state:"nosave"` diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 6f5e8ce5e..2aabdded8 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -34,6 +34,8 @@ import ( // // Lock ordering: // Inotify.mu -> Inode.Watches.mu -> Watch.mu -> Inotify.evMu +// +// +stateify savable type Inotify struct { // Unique identifier for this inotify instance. We don't just reuse the // inotify fd because fds can be duped. These should not be exposed to the diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index 217915ba4..e9b5e0f56 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -28,6 +28,8 @@ import ( const inotifyEventBaseSize = 16 // Event represents a struct inotify_event from linux. +// +// +stateify savable type Event struct { ilist.Entry diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 8904ef544..3e1959e83 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -27,6 +27,8 @@ import ( // holding an extra ref on each dirent known (by inotify) to point to the // inode. These are known as pins. For a full discussion, see // fs/g3doc/inotify.md. +// +// +stateify savable type Watch struct { // Inotify instance which owns this watch. 
owner *Inotify diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 2607d7ed3..3159ff1da 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,18 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "lock_state", - srcs = [ - "lock.go", - "lock_range.go", - "lock_set.go", - ], - out = "lock_state.go", - package = "lock", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "lock_range", @@ -49,13 +38,11 @@ go_library( "lock_range.go", "lock_set.go", "lock_set_functions.go", - "lock_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", - "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 24d54c989..e9b376eb6 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -88,6 +88,8 @@ const LockEOF = math.MaxUint64 // // A Lock may be downgraded from a write lock to a read lock only if // the write lock's uid is the same as the read lock. +// +// +stateify savable type Lock struct { // Readers are the set of read lock holders identified by UniqueID. // If len(Readers) > 0 then HasWriter must be false. @@ -103,6 +105,8 @@ type Lock struct { } // Locks is a thread-safe wrapper around a LockSet. +// +// +stateify savable type Locks struct { // mu protects locks below. mu sync.Mutex `state:"nosave"` @@ -111,7 +115,7 @@ type Locks struct { locks LockSet // blockedQueue is the queue of waiters that are waiting on a lock. - blockedQueue waiter.Queue + blockedQueue waiter.Queue `state:"zerovalue"` } // Blocker is the interface used for blocking locks. 
Passing a nil Blocker diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index eb1897174..4ede767f9 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -101,6 +101,8 @@ func (i InodeMappings) String() string { // (e.g. cannot be mounted at different locations). // // TODO: Move mount-specific information out of MountSource. +// +// +stateify savable type MountSource struct { refs.AtomicRefCount @@ -260,6 +262,8 @@ func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *Mo } // SimpleMountSourceOperations implements MountSourceOperations. +// +// +stateify savable type SimpleMountSourceOperations struct { keep bool } diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 1be81e3a1..d135e8a37 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -18,6 +18,8 @@ import "gvisor.googlesource.com/gvisor/pkg/sentry/context" // overlayMountSourceOperations implements MountSourceOperations for an overlay // mount point. +// +// +stateify savable type overlayMountSourceOperations struct { upper *MountSource lower *MountSource @@ -72,6 +74,8 @@ func (o *overlayMountSourceOperations) Destroy() { } // type overlayFilesystem is the filesystem for overlay mounts. +// +// +stateify savable type overlayFilesystem struct{} // Name implements Filesystem.Name. diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 87da4ee0e..144d3427d 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -32,6 +32,8 @@ import ( const DefaultTraversalLimit = 10 // MountNamespace defines a collection of mounts. 
+// +// +stateify savable type MountNamespace struct { refs.AtomicRefCount diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 7357d6401..af13dc8c7 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -145,6 +145,8 @@ func newOverlayInode(ctx context.Context, o *overlayEntry, msrc *MountSource) *I } // overlayEntry is the overlay metadata of an Inode. It implements Mappable. +// +// +stateify savable type overlayEntry struct { // lowerExists is true if an Inode exists for this file in the lower // filesystem. If lowerExists is true, then the overlay must create diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 870df47b2..2d9f07f2f 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,32 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "proc_state", - srcs = [ - "cpuinfo.go", - "exec_args.go", - "fds.go", - "file.go", - "filesystems.go", - "fs.go", - "loadavg.go", - "meminfo.go", - "mounts.go", - "net.go", - "proc.go", - "stat.go", - "sys.go", - "sys_net.go", - "task.go", - "uid_gid_map.go", - "uptime.go", - "version.go", - ], - out = "proc_state.go", - package = "proc", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "proc", @@ -42,7 +16,6 @@ go_library( "mounts.go", "net.go", "proc.go", - "proc_state.go", "rpcinet_proc.go", "stat.go", "sys.go", @@ -56,9 +29,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/amutex", - "//pkg/log", - "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/device", @@ -73,8 +43,6 @@ go_library( "//pkg/sentry/socket/rpcinet", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", - "//pkg/syserr", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index f80aaa5b1..4dfec03a4 100644 --- 
a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -27,6 +27,8 @@ import ( // cpuinfo is a file describing the CPU capabilities. // // Presently cpuinfo never changes, so it doesn't need to be a SeqFile. +// +// +stateify savable type cpuinfo struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 0e1523bf1..a69cbaa0e 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -37,6 +37,8 @@ const ( // execArgFile is a file containing the exec args (either cmdline or environ) // for a given task. +// +// +stateify savable type execArgFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 194a9c12a..cca8f874c 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -138,6 +138,8 @@ func (f *fd) Close() error { } // fdDir implements /proc/TID/fd. +// +// +stateify savable type fdDir struct { ramfs.Dir @@ -197,6 +199,8 @@ func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset } // fdInfo is a single file in /proc/TID/fdinfo/. +// +// +stateify savable type fdInfo struct { ramfs.File @@ -229,6 +233,8 @@ func (*fdInfo) Truncate(ctx context.Context, inode *fs.Inode, size int64) error // fdInfoDir implements /proc/TID/fdinfo. It embeds an fdDir, but overrides // Lookup and Readdir. 
+// +// +stateify savable type fdInfoDir struct { ramfs.Dir diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go index 9a433cdf8..4b2d08e75 100644 --- a/pkg/sentry/fs/proc/file.go +++ b/pkg/sentry/fs/proc/file.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// +stateify savable type file struct { fs.InodeOperations diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index 37db9cf9c..49b92fd8a 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -24,6 +24,8 @@ import ( ) // filesystemsData backs /proc/filesystems. +// +// +stateify savable type filesystemsData struct{} // NeedsUpdate returns true on the first generation. The set of registered file diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 3aadd6ac4..061824b8c 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -22,6 +22,8 @@ import ( ) // filesystem is a procfs. +// +// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 7583b6ccd..6fac251d2 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -23,6 +23,8 @@ import ( ) // loadavgData backs /proc/loadavg. +// +// +stateify savable type loadavgData struct{} // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 49cb0faed..53dfd59ef 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -26,6 +26,8 @@ import ( ) // meminfoData backs /proc/meminfo. +// +// +stateify savable type meminfoData struct { // k is the owning Kernel. 
k *kernel.Kernel diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 108432f4e..2b8167c28 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -71,6 +71,8 @@ func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { } // mountInfoFile is used to implement /proc/[pid]/mountinfo. +// +// +stateify savable type mountInfoFile struct { t *kernel.Task } @@ -152,6 +154,8 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se } // mountsFile is used to implement /proc/[pid]/mountinfo. +// +// +stateify savable type mountsFile struct { t *kernel.Task } diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index b2a8d639c..07029a7bb 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -33,6 +33,8 @@ import ( ) // proc is a root proc node. +// +// +stateify savable type proc struct { ramfs.Dir @@ -47,6 +49,8 @@ type proc struct { // stubProcFSFile is a file type that can be used to return file contents // which are constant. This file is not writable and will always have mode // 0444. 
+// +// +stateify savable type stubProcFSFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index c84f7e20d..53c475652 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,22 +1,10 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "seqfile_state", - srcs = [ - "seqfile.go", - ], - out = "seqfile_state.go", - package = "seqfile", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "seqfile", - srcs = [ - "seqfile.go", - "seqfile_state.go", - ], + srcs = ["seqfile.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile", visibility = ["//pkg/sentry:internal"], deps = [ @@ -26,26 +14,16 @@ go_library( "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", ], ) -go_stateify( - name = "seqfile_test_state", - srcs = ["seqfile_test.go"], - out = "seqfile_test_state.go", - package = "seqfile", -) - go_test( name = "seqfile_test", size = "small", - srcs = [ - "seqfile_test.go", - "seqfile_test_state.go", - ], + srcs = ["seqfile_test.go"], embed = [":seqfile"], deps = [ + "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs/test", diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index c08565f8a..51cae5e37 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -30,6 +30,8 @@ import ( type SeqHandle interface{} // SeqData holds the data for one unit in the file. +// +// +stateify savable type SeqData struct { // The data to be returned to the user. Buf []byte @@ -82,6 +84,8 @@ func (s *SeqGenerationCounter) IsCurrent(generation int64) bool { } // SeqFile is used to provide dynamic files that can be ordered by record. 
+// +// +stateify savable type SeqFile struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index 284f3e52b..bf7650211 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -25,6 +25,8 @@ import ( ) // statData backs /proc/stat. +// +// +stateify savable type statData struct { // k is the owning Kernel. k *kernel.Kernel diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index aab891c53..a2d36ca23 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -28,6 +28,8 @@ import ( ) // hostname is a file containing the system hostname. +// +// +stateify savable type hostname struct { ramfs.Entry } @@ -52,6 +54,8 @@ func (p *proc) newHostname(ctx context.Context, msrc *fs.MountSource) *fs.Inode } // mmapMinAddrData backs /proc/sys/vm/mmap_min_addr. +// +// +stateify savable type mmapMinAddrData struct { k *kernel.Kernel } @@ -74,6 +78,7 @@ func (d *mmapMinAddrData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHand }, 0 } +// +stateify savable type overcommitMemory struct{} func (*overcommitMemory) NeedsUpdate(generation int64) bool { diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index f3a5043f8..beb25be20 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -33,6 +33,7 @@ const ( tcpWMem ) +// +stateify savable type tcpMem struct { ramfs.Entry s inet.Stack @@ -100,6 +101,7 @@ func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, return n, cperr } +// +stateify savable type tcpSack struct { ramfs.Entry s inet.Stack diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index efc635946..748ca4320 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -52,6 +52,8 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { } // taskDir represents a task-level directory. 
+// +// +stateify savable type taskDir struct { ramfs.Dir @@ -92,6 +94,8 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace } // subtasks represents a /proc/TID/task directory. +// +// +stateify savable type subtasks struct { ramfs.Dir @@ -167,6 +171,8 @@ func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, off } // exe is an fs.InodeOperations symlink for the /proc/PID/exe file. +// +// +stateify savable type exe struct { ramfs.Symlink @@ -226,6 +232,8 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { // namespaceFile represents a file in the namespacefs, such as the files in // /proc//ns. +// +// +stateify savable type namespaceFile struct { ramfs.Symlink @@ -274,6 +282,8 @@ func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { } // mapsData implements seqfile.SeqSource for /proc/[pid]/maps. +// +// +stateify savable type mapsData struct { t *kernel.Task } @@ -311,6 +321,7 @@ func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([ return []seqfile.SeqData{}, 0 } +// +stateify savable type taskStatData struct { t *kernel.Task @@ -391,6 +402,8 @@ func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) } // statmData implements seqfile.SeqSource for /proc/[pid]/statm. +// +// +stateify savable type statmData struct { t *kernel.Task } @@ -425,6 +438,8 @@ func (s *statmData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([ } // statusData implements seqfile.SeqSource for /proc/[pid]/status. +// +// +stateify savable type statusData struct { t *kernel.Task pidns *kernel.PIDNamespace @@ -490,6 +505,7 @@ type ioUsage interface { IOUsage() *usage.IO } +// +stateify savable type ioData struct { ioUsage } @@ -530,6 +546,8 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se // On Linux, /proc/[pid]/comm is writable, and writing to the comm file changes // the thread name. 
We don't implement this yet as there are no known users of // this feature. +// +// +stateify savable type comm struct { ramfs.Entry @@ -559,6 +577,8 @@ func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, off } // auxvec is a file containing the auxiliary vector for a task. +// +// +stateify savable type auxvec struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 85acb5163..9811d9c9d 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -29,6 +29,8 @@ import ( // An idMapSeqSource is a seqfile.SeqSource that returns UID or GID mappings // from a task's user namespace. +// +// +stateify savable type idMapSeqSource struct { t *kernel.Task gids bool @@ -70,6 +72,7 @@ type idMapSeqHandle struct { value int } +// +stateify savable type idMapSeqFile struct { seqfile.SeqFile } diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index 4679d5821..f3a9b81df 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -27,6 +27,8 @@ import ( ) // uptime is a file containing the system uptime. +// +// +stateify savable type uptime struct { ramfs.Entry diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index c0f2e87e3..00f6a2afd 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -23,6 +23,8 @@ import ( ) // versionData backs /proc/version. +// +// +stateify savable type versionData struct { // k is the owning Kernel. 
k *kernel.Kernel diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index d84f2c624..5230157fe 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,19 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "ramfs_state", - srcs = [ - "dir.go", - "file.go", - "ramfs.go", - "socket.go", - "symlink.go", - ], - out = "ramfs_state.go", - package = "ramfs", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "ramfs", @@ -21,7 +8,6 @@ go_library( "dir.go", "file.go", "ramfs.go", - "ramfs_state.go", "socket.go", "symlink.go", "tree.go", @@ -29,12 +15,8 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/amutex", - "//pkg/log", - "//pkg/refs", "//pkg/secio", "//pkg/sentry/context", - "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", @@ -42,7 +24,6 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/safemem", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 19d5612ed..04432f28c 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -44,6 +44,8 @@ type CreateOps struct { } // Dir represents a single directory in the filesystem. +// +// +stateify savable type Dir struct { Entry diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index d6cfaf753..13e72e775 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -60,6 +60,8 @@ var ( // Entry represents common internal state for file and directory nodes. // This may be used by other packages to easily create ramfs files. 
+// +// +stateify savable type Entry struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoMappable `state:"nosave"` diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index b0c79325f..93427a1ff 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -21,6 +21,8 @@ import ( ) // Socket represents a socket. +// +// +stateify savable type Socket struct { Entry diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 9bbf78619..1c54d9991 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -22,6 +22,8 @@ import ( ) // Symlink represents a symlink. +// +// +stateify savable type Symlink struct { Entry diff --git a/pkg/sentry/fs/ramfs/test/BUILD b/pkg/sentry/fs/ramfs/test/BUILD index 57fee45e2..187eac49d 100644 --- a/pkg/sentry/fs/ramfs/test/BUILD +++ b/pkg/sentry/fs/ramfs/test/BUILD @@ -1,30 +1,16 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "test_state", - srcs = [ - "test.go", - ], - out = "test_state.go", - package = "test", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "test", testonly = 1, - srcs = [ - "test.go", - "test_state.go", - ], + srcs = ["test.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", - "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", - "//pkg/state", ], ) diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 095ff1f25..bc24e980e 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,16 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "sys_state", - srcs = [ - "fs.go", - "sys.go", - ], - out = "sys_state.go", - package = "sys", -) +load("//tools/go_stateify:defs.bzl", 
"go_library") go_library( name = "sys", @@ -18,7 +8,6 @@ go_library( "device.go", "fs.go", "sys.go", - "sys_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys", visibility = ["//pkg/sentry:internal"], @@ -28,6 +17,5 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", "//pkg/sentry/usermem", - "//pkg/state", ], ) diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index c6d5f7fd8..625525540 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -20,6 +20,8 @@ import ( ) // filesystem is a sysfs. +// +// +stateify savable type filesystem struct{} func init() { diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index ccf56f644..b9b2fb4a1 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -22,12 +22,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -type Dir struct { +// +stateify savable +type dir struct { ramfs.Dir } func newDir(ctx context.Context, msrc *fs.MountSource, contents map[string]*fs.Inode) *fs.Inode { - d := &Dir{} + d := &dir{} d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(d, msrc, fs.StableAttr{ DeviceID: sysfsDevice.DeviceID(), diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index 8b1b7872e..ffdd7e0dc 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,33 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "timerfd_state", - srcs = [ - "timerfd.go", - ], - out = "timerfd_state.go", - package = "timerfd", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "timerfd", - srcs = [ - "timerfd.go", - "timerfd_state.go", - ], + srcs = ["timerfd.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/fs", 
"//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index ae58f6fd7..767db95a0 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -30,6 +30,8 @@ import ( ) // TimerOperations implements fs.FileOperations for timerfds. +// +// +stateify savable type TimerOperations struct { fsutil.ZeroSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` @@ -38,7 +40,7 @@ type TimerOperations struct { fsutil.NoMMap `state:"nosave"` fsutil.NoIoctl `state:"nosave"` - events waiter.Queue `state:"nosave"` + events waiter.Queue `state:"zerovalue"` timer *ktime.Timer // val is the number of timer expirations since the last successful call to diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 473ab4296..cfe11ab02 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,18 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tmpfs_state", - srcs = [ - "file_regular.go", - "fs.go", - "inode_file.go", - "tmpfs.go", - ], - out = "tmpfs_state.go", - package = "tmpfs", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tmpfs", @@ -22,13 +10,11 @@ go_library( "fs.go", "inode_file.go", "tmpfs.go", - "tmpfs_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", @@ -41,7 +27,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/tcpip/transport/unix", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 
9811d90bc..342688f81 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -25,6 +25,8 @@ import ( // regularFileOperations implements fs.FileOperations for a regular // tmpfs file. +// +// +stateify savable type regularFileOperations struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 5bd9ade52..ca620e65e 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -47,6 +47,8 @@ const ( var modeRegexp = regexp.MustCompile("0[0-7][0-7][0-7]") // Filesystem is a tmpfs. +// +// +stateify savable type Filesystem struct{} func init() { diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 4e803c9ff..1e4fe47d2 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -43,6 +43,8 @@ import ( // include an InvalidatorRegion associated with that reference. When the // referenced portion of the file is removed (with Truncate), the associated // InvalidatorRegion is invalidated. +// +// +stateify savable type fileInodeOperations struct { fsutil.DeprecatedFileOperations `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 1cc7ae491..10cb5451d 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -49,6 +49,8 @@ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent } // Dir is a directory. +// +// +stateify savable type Dir struct { ramfs.Dir @@ -122,6 +124,8 @@ func (*Dir) StatFS(context.Context) (fs.Info, error) { } // Symlink is a symlink. +// +// +stateify savable type Symlink struct { ramfs.Symlink } @@ -149,6 +153,8 @@ func (s *Symlink) StatFS(context.Context) (fs.Info, error) { } // Socket is a socket. 
+// +// +stateify savable type Socket struct { ramfs.Socket } @@ -176,6 +182,8 @@ func (s *Socket) StatFS(context.Context) (fs.Info, error) { } // Fifo is a tmpfs named pipe. +// +// +stateify savable type Fifo struct { ramfs.Entry } diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 363897b2c..3c446eef4 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,22 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tty_state", - srcs = [ - "dir.go", - "fs.go", - "inode.go", - "line_discipline.go", - "master.go", - "queue.go", - "slave.go", - "terminal.go", - ], - out = "tty_state.go", - package = "tty", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tty", @@ -29,7 +13,6 @@ go_library( "queue.go", "slave.go", "terminal.go", - "tty_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty", visibility = ["//pkg/sentry:internal"], @@ -44,7 +27,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 2c5b2aed6..c91091db4 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -49,14 +49,16 @@ import ( // corresponding Dirents hold on their parent (this directory). // // dirInodeOperations implements fs.InodeOperations. 
+// +// +stateify savable type dirInodeOperations struct { - fsutil.DeprecatedFileOperations - fsutil.InodeNotSocket - fsutil.InodeNotRenameable - fsutil.InodeNotSymlink - fsutil.InodeNoExtendedAttributes - fsutil.NoMappable - fsutil.NoopWriteOut + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` // msrc is the super block this directory is on. // @@ -348,6 +350,8 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { // // This is nearly identical to fsutil.DirFileOperations, except that it takes // df.di.mu in IterateDir. +// +// +stateify savable type dirFileOperations struct { waiter.AlwaysReady `state:"nosave"` fsutil.NoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index dbaffe95e..e28635607 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -28,6 +28,8 @@ var ptsDevice = device.NewAnonDevice() // // This devpts is always in the new "multi-instance" mode. i.e., it contains a // ptmx device tied to this mount. +// +// +stateify savable type filesystem struct{} func init() { @@ -69,6 +71,8 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou } // superOperations implements fs.MountSourceOperations, preventing caching. +// +// +stateify savable type superOperations struct{} // Revalidate implements fs.DirentOperations.Revalidate. 
diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go index 04b9a7727..c0fa2b407 100644 --- a/pkg/sentry/fs/tty/inode.go +++ b/pkg/sentry/fs/tty/inode.go @@ -31,6 +31,8 @@ import ( // // * fs.InodeOperations.Release // * fs.InodeOperations.GetFile +// +// +stateify savable type inodeOperations struct { fsutil.DeprecatedFileOperations `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index f094635f5..d243ee40e 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -72,6 +72,8 @@ const ( // termiosMu // inQueue.mu // outQueue.mu +// +// +stateify savable type lineDiscipline struct { // inQueue is the input queue of the terminal. inQueue queue @@ -183,6 +185,8 @@ type transformer interface { // outputQueueTransformer implements transformer. It performs line discipline // transformations on the output queue. +// +// +stateify savable type outputQueueTransformer struct{} // transform does output processing for one end of the pty. See @@ -254,6 +258,8 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte // inputQueueTransformer implements transformer. It performs line discipline // transformations on the input queue. +// +// +stateify savable type inputQueueTransformer struct{} // transform does input processing for one end of the pty. Characters read are diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 74cdbe874..c7198e218 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -27,6 +27,8 @@ import ( // masterInodeOperations are the fs.InodeOperations for the master end of the // Terminal (ptmx file). 
+// +// +stateify savable type masterInodeOperations struct { inodeOperations @@ -96,6 +98,8 @@ func (mi *masterInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flag } // masterFileOperations are the fs.FileOperations for the master end of a terminal. +// +// +stateify savable type masterFileOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 026d5e077..42c105abc 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -32,11 +32,13 @@ import ( // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and // readable is true. +// +// +stateify savable type queue struct { // mu protects everything in queue. mu sync.Mutex `state:"nosave"` - waiter.Queue `state:"nosave"` + waiter.Queue `state:"zerovalue"` // readBuf is buffer of data ready to be read when readable is true. // This data has been processed. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index f5eec726e..1c562b172 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -27,6 +27,8 @@ import ( // slaveInodeOperations are the fs.InodeOperations for the slave end of the // Terminal (pts file). +// +// +stateify savable type slaveInodeOperations struct { inodeOperations @@ -86,6 +88,8 @@ func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags } // slaveFileOperations are the fs.FileOperations for the slave end of a terminal. +// +// +stateify savable type slaveFileOperations struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index fa5b00409..3cb135124 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -21,6 +21,8 @@ import ( ) // Terminal is a pseudoterminal. 
+// +// +stateify savable type Terminal struct { refs.AtomicRefCount diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index eaf8f15b2..159c50efb 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -3,26 +3,15 @@ package( licenses = ["notice"], # Apache 2.0 ) -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "inet_state", - srcs = ["inet.go"], - out = "inet_state.go", - package = "inet", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "inet", srcs = [ "context.go", "inet.go", - "inet_state.go", "test_stack.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/inet", - deps = [ - "//pkg/sentry/context", - "//pkg/state", - ], + deps = ["//pkg/sentry/context"], ) diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index e4b326993..e54a61196 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -87,6 +87,8 @@ type InterfaceAddr struct { } // TCPBufferSize contains settings controlling TCP buffer sizing. +// +// +stateify savable type TCPBufferSize struct { // Min is the minimum size. 
Min int diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 1c1633068..69a3fbc45 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,59 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "kernel_state", - srcs = [ - "abstract_socket_namespace.go", - "fd_map.go", - "fs_context.go", - "ipc_namespace.go", - "kernel.go", - "kernel_state.go", - "pending_signals.go", - "pending_signals_state.go", - "process_group_list.go", - "ptrace.go", - "rseq.go", - "session_list.go", - "sessions.go", - "signal.go", - "signal_handlers.go", - "syscalls.go", - "syscalls_state.go", - "syslog.go", - "task.go", - "task_clone.go", - "task_context.go", - "task_exec.go", - "task_exit.go", - "task_list.go", - "task_resources.go", - "task_run.go", - "task_sched.go", - "task_signals.go", - "task_start.go", - "task_syscall.go", - "thread_group.go", - "threads.go", - "timekeeper.go", - "timekeeper_state.go", - "timer.go", - "uts_namespace.go", - "vdso.go", - "version.go", - ], - out = "kernel_autogen_state.go", - imports = [ - "gvisor.googlesource.com/gvisor/pkg/bpf", - "gvisor.googlesource.com/gvisor/pkg/sentry/arch", - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", - "gvisor.googlesource.com/gvisor/pkg/tcpip", - ], - package = "kernel", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "pending_signals_list", @@ -119,7 +67,6 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", - "kernel_autogen_state.go", "kernel_state.go", "pending_signals.go", "pending_signals_list.go", @@ -165,6 +112,11 @@ go_library( "version.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", + imports = [ + "gvisor.googlesource.com/gvisor/pkg/bpf", + "gvisor.googlesource.com/gvisor/pkg/sentry/arch", + 
"gvisor.googlesource.com/gvisor/pkg/tcpip", + ], visibility = ["//:sandbox"], deps = [ "//pkg/abi", diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 014c4a3bf..d6d1d341d 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) +// +stateify savable type abstractEndpoint struct { ep unix.BoundEndpoint wr *refs.WeakRef @@ -39,6 +40,8 @@ func (e *abstractEndpoint) WeakRefGone() { } // AbstractSocketNamespace is used to implement the Linux abstract socket functionality. +// +// +stateify savable type AbstractSocketNamespace struct { mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 5b7b30557..a81085372 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,20 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "auth_state", - srcs = [ - "credentials.go", - "id.go", - "id_map_range.go", - "id_map_set.go", - "user_namespace.go", - ], - out = "auth_state.go", - package = "auth", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( name = "id_map_range", @@ -48,7 +35,6 @@ go_library( name = "auth", srcs = [ "auth.go", - "auth_state.go", "capability_set.go", "context.go", "credentials.go", @@ -66,7 +52,6 @@ go_library( "//pkg/bits", "//pkg/log", "//pkg/sentry/context", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index f6fb05285..f18f7dac9 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -21,6 +21,8 @@ import ( // Credentials contains information required to authorize 
privileged operations // in a user namespace. +// +// +stateify savable type Credentials struct { // Real/effective/saved user/group IDs in the root user namespace. None of // these should ever be NoID. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index 6adb33530..bd0090e0f 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -77,6 +77,8 @@ func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { // An IDMapEntry represents a mapping from a range of contiguous IDs in a user // namespace to an equally-sized range of contiguous IDs in the namespace's // parent. +// +// +stateify savable type IDMapEntry struct { // FirstID is the first ID in the range in the namespace. FirstID uint32 diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 0980aeadf..d359f3f31 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -23,6 +23,8 @@ import ( // A UserNamespace represents a user namespace. See user_namespaces(7) for // details. +// +// +stateify savable type UserNamespace struct { // parent is this namespace's parent. If this is the root namespace, parent // is nil. The parent pointer is immutable. 
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 7d491efbc..5e8b36ed6 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,22 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "epoll_autogen_state", - srcs = [ - "epoll.go", - "epoll_state.go", - ], - out = "epoll_autogen_state.go", - package = "epoll", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "epoll", srcs = [ "epoll.go", - "epoll_autogen_state.go", "epoll_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll", @@ -29,9 +18,7 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/kdefs", - "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index b572fcd7e..d87e64a1c 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -58,6 +58,8 @@ const ( // potentially be reassigned. We also cannot use just the file pointer because // it is possible to have multiple entries for the same file object as long as // they are created with different FDs (i.e., the FDs point to the same file). +// +// +stateify savable type FileIdentifier struct { File *fs.File Fd kdefs.FD @@ -65,6 +67,8 @@ type FileIdentifier struct { // pollEntry holds all the state associated with an event poll entry, that is, // a file being observed by an event poll object. +// +// +stateify savable type pollEntry struct { ilist.Entry file *refs.WeakRef `state:"manual"` @@ -92,6 +96,8 @@ func (p *pollEntry) WeakRefGone() { // EventPoll holds all the state associated with an event poll object, that is, // collection of files to observe and their current state. 
+// +// +stateify savable type EventPoll struct { fsutil.PipeSeek `state:"zerovalue"` fsutil.NotDirReaddir `state:"zerovalue"` @@ -102,7 +108,7 @@ type EventPoll struct { // Wait queue is used to notify interested parties when the event poll // object itself becomes readable or writable. - waiter.Queue + waiter.Queue `state:"zerovalue"` // files is the map of all the files currently being observed, it is // protected by mu. diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index 7ec179bd8..cc1120b4f 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,33 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "eventfd_state", - srcs = [ - "eventfd.go", - ], - out = "eventfd_state.go", - package = "eventfd", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "eventfd", - srcs = [ - "eventfd.go", - "eventfd_state.go", - ], + srcs = ["eventfd.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", "//pkg/waiter/fdnotifier", diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index bd50bd9fe..a4ada0e78 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -35,6 +35,8 @@ import ( // EventOperations represents an event with the semantics of Linux's file-based event // notification (eventfd). Eventfds are usually internal to the Sentry but in certain // situations they may be converted into a host-backed eventfd. 
+// +// +stateify savable type EventOperations struct { fsutil.NoopRelease `state:"nosave"` fsutil.PipeSeek `state:"nosave"` @@ -49,7 +51,7 @@ type EventOperations struct { // Queue is used to notify interested parties when the event object // becomes readable or writable. - wq waiter.Queue `state:"nosave"` + wq waiter.Queue `state:"zerovalue"` // val is the current value of the event counter. val uint64 diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index 299506330..d5d4aaacb 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -46,6 +46,8 @@ func (f FDs) Less(i, j int) bool { } // FDFlags define flags for an individual descriptor. +// +// +stateify savable type FDFlags struct { // CloseOnExec indicates the descriptor should be closed on exec. CloseOnExec bool @@ -69,12 +71,16 @@ func (f FDFlags) ToLinuxFDFlags() (mask uint) { // descriptor holds the details about a file descriptor, namely a pointer the // file itself and the descriptor flags. +// +// +stateify savable type descriptor struct { file *fs.File flags FDFlags } // FDMap is used to manage File references and flags. +// +// +stateify savable type FDMap struct { refs.AtomicRefCount k *Kernel diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index dbc097696..f3f05e8f5 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -25,6 +25,8 @@ import ( // FSContext contains filesystem context. // // This includes umask and working directory. 
+// +// +stateify savable type FSContext struct { refs.AtomicRefCount diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index a97a43549..b44a26974 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "waiter_list", @@ -14,29 +14,15 @@ go_template_instance( }, ) -go_stateify( - name = "futex_state", - srcs = [ - "futex.go", - "waiter_list.go", - ], - out = "futex_state.go", - package = "futex", -) - go_library( name = "futex", srcs = [ "futex.go", - "futex_state.go", "waiter_list.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex", visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/state", - "//pkg/syserror", - ], + deps = ["//pkg/syserror"], ) go_test( diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 15e3e5e2c..4a1f2a0ef 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -196,6 +196,8 @@ func bucketIndexForAddr(addr uintptr) uintptr { } // Manager holds futex state for a single virtual address space. +// +// +stateify savable type Manager struct { buckets [bucketCount]bucket `state:"zerovalue"` } diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index a86bda77b..5eef49f59 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -21,6 +21,8 @@ import ( ) // IPCNamespace represents an IPC namespace. +// +// +stateify savable type IPCNamespace struct { // User namespace which owns this IPC namespace. Immutable. 
userNS *auth.UserNamespace diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 64439cd9d..419a1d473 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -62,6 +62,8 @@ import ( // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). +// +// +stateify savable type Kernel struct { // extMu serializes external changes to the Kernel with calls to // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel @@ -158,7 +160,7 @@ type Kernel struct { // exitErr is the error causing the sandbox to exit, if any. It is // protected by extMu. - exitErr error + exitErr error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index 5dc0f266c..06be5a7e1 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -38,6 +38,8 @@ const ( // pendingSignals holds a collection of pending signals. The zero value of // pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; // users must provide synchronization. +// +// +stateify savable type pendingSignals struct { // signals contains all pending signals. // @@ -52,11 +54,14 @@ type pendingSignals struct { } // pendingSignalQueue holds a pendingSignalList for a single signal number. +// +// +stateify savable type pendingSignalQueue struct { pendingSignalList length int } +// +stateify savable type pendingSignal struct { // pendingSignalEntry links into a pendingSignalList. 
pendingSignalEntry diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 4600d19bd..19b23c6d2 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,20 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "pipe_state", - srcs = [ - "buffers.go", - "node.go", - "pipe.go", - "reader.go", - "reader_writer.go", - "writer.go", - ], - out = "pipe_state.go", - package = "pipe", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "pipe", @@ -23,7 +9,6 @@ go_library( "device.go", "node.go", "pipe.go", - "pipe_state.go", "reader.go", "reader_writer.go", "writer.go", @@ -34,15 +19,12 @@ go_library( "//pkg/abi/linux", "//pkg/amutex", "//pkg/ilist", - "//pkg/log", - "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go index f300537c5..a82e45c3f 100644 --- a/pkg/sentry/kernel/pipe/buffers.go +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -20,6 +20,8 @@ import ( // Buffer encapsulates a queueable byte buffer that can // easily be truncated. It is designed only for use with pipes. +// +// +stateify savable type Buffer struct { ilist.Entry data []byte diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index e418cf174..23d692da1 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -24,6 +24,8 @@ import ( ) // inodeOperations wraps fs.InodeOperations operations with common pipe opening semantics. 
+// +// +stateify savable type inodeOperations struct { fs.InodeOperations diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 9a21df5b4..ced2559a7 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -41,6 +41,8 @@ const DefaultPipeSize = 65536 // Pipe is an encapsulation of a platform-independent pipe. // It manages a buffered byte queue shared between a reader/writer // pair. +// +// +stateify savable type Pipe struct { waiter.Queue `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 40d5e4943..1fa5e9a32 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -20,6 +20,8 @@ import ( // Reader satisfies the fs.FileOperations interface for read-only pipes. // Reader should be used with !fs.FileFlags.Write to reject writes. +// +// +stateify savable type Reader struct { ReaderWriter } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index dc642a3a6..82607367b 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -31,6 +31,8 @@ import ( // read and write requests. This should only be used directly for named pipes. // pipe(2) and pipe2(2) only support unidirectional pipes and should use // either pipe.Reader or pipe.Writer. +// +// +stateify savable type ReaderWriter struct { fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index fd13008ac..d93324b53 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -20,6 +20,8 @@ import ( // Writer satisfies the fs.FileOperations interface for write-only pipes. // Writer should be used with !fs.FileFlags.Read to reject reads. 
+// +// +stateify savable type Writer struct { ReaderWriter } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index f1c2c4bf0..e9e69004d 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -25,6 +25,8 @@ import ( // ptraceOptions are the subset of options controlling a task's ptrace behavior // that are set by ptrace(PTRACE_SETOPTIONS). +// +// +stateify savable type ptraceOptions struct { // ExitKill is true if the tracee should be sent SIGKILL when the tracer // exits. @@ -185,6 +187,8 @@ func (t *Task) hasTracer() bool { } // ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +// +// +stateify savable type ptraceStop struct { // If frozen is true, the stopped task's tracer is currently operating on // it, so Task.Kill should not remove the stop. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 635372993..1f3de58e3 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -23,6 +23,8 @@ import ( // Restartable sequences, as described in https://lwn.net/Articles/650333/. // RSEQCriticalRegion describes a restartable sequence critical region. 
+// +// +stateify savable type RSEQCriticalRegion struct { // When a task in this thread group has its CPU preempted (as defined by // platform.ErrContextCPUPreempted) or has a signal delivered to an diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 969145fe1..e7fa44e2c 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "waiter_list", @@ -14,21 +14,10 @@ go_template_instance( }, ) -go_stateify( - name = "semaphore_state", - srcs = [ - "semaphore.go", - "waiter_list.go", - ], - out = "semaphore_autogen_state.go", - package = "semaphore", -) - go_library( name = "semaphore", srcs = [ "semaphore.go", - "semaphore_autogen_state.go", "waiter_list.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore", @@ -40,8 +29,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", - "//pkg/state", - "//pkg/state/statefile", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index a1ee83ce5..aa07946cf 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -42,6 +42,8 @@ const ( ) // Registry maintains a set of semaphores that can be found by key or ID. +// +// +stateify savable type Registry struct { // userNS owning the ipc name this registry belongs to. Immutable. userNS *auth.UserNamespace @@ -52,6 +54,8 @@ type Registry struct { } // Set represents a set of semaphores that can be operated atomically. +// +// +stateify savable type Set struct { // registry owning this sem set. Immutable. 
registry *Registry @@ -79,6 +83,8 @@ type Set struct { } // sem represents a single semanphore from a set. +// +// +stateify savable type sem struct { value int16 waiters waiterList `state:"zerovalue"` @@ -86,6 +92,8 @@ type sem struct { // waiter represents a caller that is waiting for the semaphore value to // become positive or zero. +// +// +stateify savable type waiter struct { waiterEntry diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index fa4c7b8f6..cf4e18805 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -27,6 +27,8 @@ type SessionID ThreadID type ProcessGroupID ThreadID // Session contains a leader threadgroup and a list of ProcessGroups. +// +// +stateify savable type Session struct { refs refs.AtomicRefCount @@ -76,6 +78,8 @@ func (s *Session) decRef() { } // ProcessGroup contains an originator threadgroup and a parent Session. +// +// +stateify savable type ProcessGroup struct { refs refs.AtomicRefCount // not exported. 
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 0f88eb0ac..40e641355 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,22 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "shm_state", - srcs = [ - "shm.go", - ], - out = "shm_autogen_state.go", - package = "shm", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "shm", srcs = [ "device.go", "shm.go", - "shm_autogen_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm", visibility = ["//pkg/sentry:internal"], @@ -33,7 +23,6 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 7217e8103..1ac444094 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -72,6 +72,8 @@ const ( // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. +// +// +stateify savable type Registry struct { // userNS owns the IPC namespace this registry belong to. Immutable. userNS *auth.UserNamespace @@ -288,6 +290,8 @@ func (r *Registry) remove(s *Shm) { // shmctl(SHM_RMID). // // Shm implements memmap.Mappable and memmap.MappingIdentity. +// +// +stateify savable type Shm struct { // AtomicRefCount tracks the number of references to this segment from // maps. A segment always holds a reference to itself, until it's marked for diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 21ba4ee70..3649f5e4d 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -22,6 +22,8 @@ import ( ) // SignalHandlers holds information about signal actions. 
+// +// +stateify savable type SignalHandlers struct { // mu protects actions, as well as the signal state of all tasks and thread // groups using this SignalHandlers object. (See comment on diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index e20fa3eb6..4c7811b6c 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -176,6 +176,8 @@ type Stracer interface { // SyscallTable is a lookup table of system calls. Critically, a SyscallTable // is *immutable*. In order to make supporting suspend and resume sane, they // must be uniquely registered and may not change during operation. +// +// +stateify savable type SyscallTable struct { // OS is the operating system that this syscall table implements. OS abi.OS `state:"wait"` diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 31541749e..125312b6a 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -23,6 +23,8 @@ import ( // syslog represents a sentry-global kernel log. // // Currently, it contains only fun messages for a dmesg easter egg. +// +// +stateify savable type syslog struct { // mu protects the below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e705260da..19029adb1 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -52,6 +52,8 @@ import ( // All fields that are "exclusive to the task goroutine" can only be accessed // by the task goroutine while it is running. The task goroutine does not // require synchronization to read or write these fields. 
+// +// +stateify savable type Task struct { taskNode diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 3b77a4965..526165af0 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -352,6 +352,7 @@ func (t *Task) unstopVforkParent() { } } +// +stateify savable type runSyscallAfterPtraceEventClone struct { vforkChild *Task @@ -369,6 +370,7 @@ func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { return (*runSyscallExit)(nil) } +// +stateify savable type runSyscallAfterVforkStop struct { // childTID has the same meaning as // runSyscallAfterPtraceEventClone.vforkChildTID. @@ -474,6 +476,8 @@ func (t *Task) Unshare(opts *SharingOptions) error { // current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so // that the child and parent share mappings until the child execve()s into a // new process image or exits.) +// +// +stateify savable type vforkStop struct{} // StopIgnoresKill implements TaskStop.Killable. diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 5c563ba08..9a59cbd33 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -35,6 +35,8 @@ var ErrNoSyscalls = errors.New("no syscall table found") type Auxmap map[string]interface{} // TaskContext is the subset of a task's data that is provided by the loader. +// +// +stateify savable type TaskContext struct { // Name is the thread name set by the prctl(PR_SET_NAME) system call. Name string diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 2285847a2..385299b24 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -73,6 +73,8 @@ import ( // execStop is a TaskStop that a task sets on itself when it wants to execve // and is waiting for the other tasks in its thread group to exit first. +// +// +stateify savable type execStop struct{} // Killable implements TaskStop.Killable. 
@@ -119,6 +121,8 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { // The runSyscallAfterExecStop state continues execve(2) after all siblings of // a thread in the execve syscall have exited. +// +// +stateify savable type runSyscallAfterExecStop struct { tc *TaskContext } diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index d6604f37b..b16844e91 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -38,6 +38,8 @@ import ( // An ExitStatus is a value communicated from an exiting task or thread group // to the party that reaps it. +// +// +stateify savable type ExitStatus struct { // Code is the numeric value passed to the call to exit or exit_group that // caused the exit. If the exit was not caused by such a call, Code is 0. @@ -222,6 +224,8 @@ func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { } // runExit is the entry point into the task exit path. +// +// +stateify savable type runExit struct{} func (*runExit) execute(t *Task) taskRunState { @@ -229,6 +233,7 @@ func (*runExit) execute(t *Task) taskRunState { return (*runExitMain)(nil) } +// +stateify savable type runExitMain struct{} func (*runExitMain) execute(t *Task) taskRunState { @@ -531,6 +536,7 @@ func (t *Task) reparentLocked(parent *Task) { // tracer (if one exists) and reaps the leader immediately. In Linux, this is // in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). +// +stateify savable type runExitNotify struct{} func (*runExitNotify) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go index 4ca25664a..0832bf989 100644 --- a/pkg/sentry/kernel/task_resources.go +++ b/pkg/sentry/kernel/task_resources.go @@ -21,6 +21,8 @@ import ( // TaskResources is the subset of a task's data provided by its creator that is // not provided by the loader. 
+// +// +stateify savable type TaskResources struct { // SignalMask is the set of signals whose delivery is currently blocked. // diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index a03fa6ac0..8dd0ef6ea 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -131,6 +131,8 @@ func (t *Task) doStop() { // The runApp state checks for interrupts before executing untrusted // application code. +// +// +stateify savable type runApp struct{} func (*runApp) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index b50139077..49141ab74 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -65,6 +65,8 @@ const ( // TaskGoroutineSchedInfo contains task goroutine scheduling state which must // be read and updated atomically. +// +// +stateify savable type TaskGoroutineSchedInfo struct { // Timestamp was the value of Kernel.cpuClock when this // TaskGoroutineSchedInfo was last updated. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 91f6c0874..62ec530be 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -748,6 +748,8 @@ func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { // groupStop is a TaskStop placed on tasks that have received a stop signal // (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from // the ptrace man page.) +// +// +stateify savable type groupStop struct{} // Killable implements TaskStop.Killable. @@ -881,6 +883,8 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { } // The runInterrupt state handles conditions indicated by interrupts. 
+// +// +stateify savable type runInterrupt struct{} func (*runInterrupt) execute(t *Task) taskRunState { @@ -1020,6 +1024,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { return (*runApp)(nil) } +// +stateify savable type runInterruptAfterSignalDeliveryStop struct{} func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 92ca0acd9..f0373c375 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -241,6 +241,7 @@ func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRun return t.doSyscallInvoke(sysno, args) } +// +stateify savable type runSyscallAfterSyscallEnterStop struct{} func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { @@ -260,6 +261,7 @@ func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { return t.doSyscallInvoke(sysno, args) } +// +stateify savable type runSyscallAfterSysemuStop struct{} func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { @@ -294,6 +296,7 @@ func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRu return (*runSyscallExit)(nil).execute(t) } +// +stateify savable type runSyscallReinvoke struct{} func (*runSyscallReinvoke) execute(t *Task) taskRunState { @@ -310,6 +313,7 @@ func (*runSyscallReinvoke) execute(t *Task) taskRunState { return t.doSyscallInvoke(sysno, args) } +// +stateify savable type runSyscallExit struct{} func (*runSyscallExit) execute(t *Task) taskRunState { diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 8fffd3446..441b8a822 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -28,6 +28,8 @@ import ( // groups" are usually called "processes" in userspace documentation.) // // ThreadGroup is a superset of Linux's struct signal_struct. 
+// +// +stateify savable type ThreadGroup struct { threadGroupNode diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 440da9dad..844213c35 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -50,6 +50,8 @@ func (tid ThreadID) String() string { const InitTID ThreadID = 1 // A TaskSet comprises all tasks in a system. +// +// +stateify savable type TaskSet struct { // mu protects all relationships betweens tasks and thread groups in the // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) @@ -110,6 +112,8 @@ func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { // // N.B. A task is said to be visible in a PID namespace if the PID namespace // contains a thread ID that maps to that task. +// +// +stateify savable type PIDNamespace struct { // owner is the TaskSet that this PID namespace belongs to. The owner // pointer is immutable. @@ -263,6 +267,8 @@ func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { // (threadGroupNode is an anonymous field in ThreadGroup; this is to expose // threadGroupEntry's methods on ThreadGroup to make it implement // threadGroupLinker.) +// +// +stateify savable type threadGroupNode struct { // pidns is the PID namespace containing the thread group and all of its // member tasks. The pidns pointer is immutable. @@ -382,6 +388,8 @@ func (tg *ThreadGroup) ID() ThreadID { // A taskNode defines the relationship between a task and the rest of the // system. The comments on threadGroupNode also apply to taskNode. +// +// +stateify savable type taskNode struct { // tg is the thread group that this task belongs to. The tg pointer is // immutable. 
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index b3ed42aa4..5d8db2273 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,30 +1,18 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "time_state", - srcs = [ - "time.go", - ], - out = "time_state.go", - package = "time", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "time", srcs = [ "context.go", "time.go", - "time_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/log", "//pkg/sentry/context", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index c223c2f19..6eadd2878 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -42,6 +42,8 @@ const ( // // Time may represent time with respect to any clock and may not have any // meaning in the real world. +// +// +stateify savable type Time struct { ns int64 } @@ -286,6 +288,8 @@ type TimerListener interface { } // Setting contains user-controlled mutable Timer properties. +// +// +stateify savable type Setting struct { // Enabled is true if the timer is running. Enabled bool @@ -371,6 +375,8 @@ func (s Setting) advancedTo(now Time) (Setting, uint64) { // // Timers should be created using NewTimer and must be cleaned up by calling // Timer.Destroy when no longer used. +// +// +stateify savable type Timer struct { // clock is the time source. clock is immutable. clock Clock diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 4de8ac13b..df5dbe128 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -25,6 +25,8 @@ import ( ) // Timekeeper manages all of the kernel clocks. 
+// +// +stateify savable type Timekeeper struct { // clocks are the clock sources. // diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go index 03a3310be..534d03d0f 100644 --- a/pkg/sentry/kernel/timer.go +++ b/pkg/sentry/kernel/timer.go @@ -26,6 +26,8 @@ import ( // timekeeperClock is a ktime.Clock that reads time from a // kernel.Timekeeper-managed clock. +// +// +stateify savable type timekeeperClock struct { tk *Timekeeper c sentrytime.ClockID @@ -49,6 +51,8 @@ func (tc *timekeeperClock) Now() ktime.Time { // tgClock is a ktime.Clock that measures the time a thread group has spent // executing. +// +// +stateify savable type tgClock struct { tg *ThreadGroup @@ -155,6 +159,8 @@ func (tc *taskClock) Now() ktime.Time { } // signalNotifier is a ktime.Listener that sends signals to a ThreadGroup. +// +// +stateify savable type signalNotifier struct { tg *ThreadGroup signal linux.Signal @@ -179,6 +185,8 @@ func (s *signalNotifier) Notify(exp uint64) { func (s *signalNotifier) Destroy() {} // TimerManager is a collection of supported process cpu timers. +// +// +stateify savable type TimerManager struct { // Clocks used to drive thread group execution time timers. virtClock *tgClock diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index 58e9b4d1b..7e0fe0d21 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -22,6 +22,8 @@ import ( // UTSNamespace represents a UTS namespace, a holder of two system identifiers: // the hostname and domain name. +// +// +stateify savable type UTSNamespace struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 0bacbea49..971e8bc59 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -52,6 +52,8 @@ type vdsoParams struct { // Everything in the struct is 8 bytes for easy alignment. 
// // It must be kept in sync with params in vdso/vdso_time.cc. +// +// +stateify savable type VDSOParamPage struct { // The parameter page is fr, allocated from platform.Memory(). platform platform.Platform diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 3ce41cacc..90f4395d4 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,22 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "limits_state", - srcs = [ - "limits.go", - ], - out = "limits_state.go", - package = "limits", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "limits", srcs = [ "context.go", "limits.go", - "limits_state.go", "linux.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/limits", @@ -24,7 +14,6 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/sentry/context", - "//pkg/state", ], ) diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index 4230ba958..02c8b60e3 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -47,6 +47,8 @@ const ( const Infinity = ^uint64(0) // Limit specifies a system limit. +// +// +stateify savable type Limit struct { // Cur specifies the current limit. Cur uint64 @@ -55,6 +57,8 @@ type Limit struct { } // LimitSet represents the Limits that correspond to each LimitType. 
+// +// +stateify savable type LimitSet struct { mu sync.Mutex `state:"nosave"` data map[LimitType]Limit diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index e63052c6d..0beb4561b 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") +load("//tools/go_stateify:defs.bzl", "go_library") go_embed_data( name = "vdso_bin", @@ -10,23 +10,12 @@ go_embed_data( var = "vdsoBin", ) -go_stateify( - name = "loader_state", - srcs = [ - "vdso.go", - "vdso_state.go", - ], - out = "loader_state.go", - package = "loader", -) - go_library( name = "loader", srcs = [ "elf.go", "interpreter.go", "loader.go", - "loader_state.go", "vdso.go", "vdso_state.go", ":vdso_bin", @@ -40,7 +29,6 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/rand", - "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -55,7 +43,6 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 2e8693f8e..a06e27ac9 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -193,6 +193,8 @@ func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) // // NOTE: to support multiple architectures or operating systems, this // would need to contain a VDSO for each. +// +// +stateify savable type VDSO struct { // ParamPage is the VDSO parameter page. This page should be updated to // inform the VDSO for timekeeping data. 
diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go index 92004ad9e..dc71e1c2d 100644 --- a/pkg/sentry/loader/vdso_state.go +++ b/pkg/sentry/loader/vdso_state.go @@ -18,6 +18,7 @@ import ( "debug/elf" ) +// +stateify savable type elfProgHeader struct { Type elf.ProgType Flags elf.ProgFlag diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index 2e367e189..c9e0b95a0 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,18 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "memmap_state", - srcs = [ - "mappable_range.go", - "mapping_set.go", - "mapping_set_impl.go", - ], - out = "memmap_state.go", - package = "memmap", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "mappable_range", @@ -46,7 +35,6 @@ go_library( "mapping_set.go", "mapping_set_impl.go", "memmap.go", - "memmap_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", visibility = ["//pkg/sentry:internal"], @@ -56,7 +44,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/platform", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index 0cd42ffbf..c9483905d 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -35,6 +35,8 @@ import ( type MappingsOfRange map[MappingOfRange]struct{} // MappingOfRange represents a mapping of a MappableRange. 
+// +// +stateify savable type MappingOfRange struct { MappingSpace MappingSpace AddrRange usermem.AddrRange diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 3f396986a..bbdfae247 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,24 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "mm_state", - srcs = [ - "aio_context.go", - "aio_context_state.go", - "file_refcount_set.go", - "io_list.go", - "mm.go", - "pma_set.go", - "save_restore.go", - "special_mappable.go", - "vma_set.go", - ], - out = "mm_state.go", - package = "mm", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "file_refcount_set", @@ -101,7 +84,6 @@ go_library( "lifecycle.go", "metadata.go", "mm.go", - "mm_state.go", "pma.go", "pma_set.go", "proc_pid_maps.go", @@ -131,7 +113,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/buffer", diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 992bde5a5..b42156d45 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -28,6 +28,8 @@ import ( ) // aioManager creates and manages asynchronous I/O contexts. +// +// +stateify savable type aioManager struct { // mu protects below. mu sync.Mutex `state:"nosave"` @@ -89,12 +91,16 @@ func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { } // ioResult is a completed I/O operation. +// +// +stateify savable type ioResult struct { data interface{} ioEntry } // AIOContext is a single asynchronous I/O context. +// +// +stateify savable type AIOContext struct { // done is the notification channel used for all requests. 
done chan struct{} `state:"nosave"` @@ -190,6 +196,8 @@ func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO // ring buffers. +// +// +stateify savable type aioMappable struct { refs.AtomicRefCount diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index ce8097b7f..3299ae164 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -46,6 +46,8 @@ import ( ) // MemoryManager implements a virtual address space. +// +// +stateify savable type MemoryManager struct { // p is the platform. // @@ -207,6 +209,8 @@ type MemoryManager struct { } // vma represents a virtual memory area. +// +// +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is // nil, the vma represents a private anonymous mapping. @@ -346,6 +350,8 @@ func (v *vma) loadRealPerms(b int) { } // pma represents a platform mapping area. +// +// +stateify savable type pma struct { // file is the file mapped by this pma. Only pmas for which file == // platform.Platform.Memory() may be saved. pmas hold a reference to the @@ -380,6 +386,7 @@ type pma struct { internalMappings safemem.BlockSeq `state:"nosave"` } +// +stateify savable type privateRefs struct { mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 9d3614034..aa2f87107 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -28,6 +28,8 @@ import ( // semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except // that SpecialMappable takes ownership of the memory that it represents // (_install_special_mapping() does not.) 
+// +// +stateify savable type SpecialMappable struct { refs.AtomicRefCount diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 15a7fbbc3..af9ba5394 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,16 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "platform_state", - srcs = [ - "file_range.go", - ], - out = "platform_state.go", - package = "platform", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( name = "file_range", @@ -30,7 +21,6 @@ go_library( "file_range.go", "mmap_min_addr.go", "platform.go", - "platform_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform", visibility = ["//pkg/sentry:internal"], @@ -44,7 +34,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", ], ) diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD index dadba1d38..2a5982763 100644 --- a/pkg/sentry/platform/filemem/BUILD +++ b/pkg/sentry/platform/filemem/BUILD @@ -1,18 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "filemem_autogen_state", - srcs = [ - "filemem.go", - "filemem_state.go", - "usage_set.go", - ], - out = "filemem_autogen_state.go", - package = "filemem", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "usage_set", @@ -38,7 +27,6 @@ go_library( name = "filemem", srcs = [ "filemem.go", - "filemem_autogen_state.go", "filemem_state.go", "filemem_unsafe.go", "usage_set.go", diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index 870274ae1..feb020ef8 100644 --- 
a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -155,6 +155,8 @@ type FileMem struct { } // usage tracks usage information. +// +// +stateify savable type usageInfo struct { // kind is the usage kind. kind usage.MemoryKind diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 929787aa0..a320fca0b 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,22 +1,10 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "socket_state", - srcs = [ - "socket.go", - ], - out = "socket_state_autogen.go", - package = "socket", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "socket", - srcs = [ - "socket.go", - "socket_state_autogen.go", - ], + srcs = ["socket.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket", visibility = ["//pkg/sentry:internal"], deps = [ @@ -29,7 +17,6 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index faf2b4c27..c4874fdfb 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -1,26 +1,14 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "control_state", - srcs = [ - "control.go", - ], - out = "control_state.go", - imports = [ - "gvisor.googlesource.com/gvisor/pkg/sentry/fs", - ], - package = "control", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "control", - srcs = [ - "control.go", - "control_state.go", - ], + srcs = ["control.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control", + imports = [ + "gvisor.googlesource.com/gvisor/pkg/sentry/fs", + ], visibility = 
["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -31,7 +19,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/kdefs", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip/transport/unix", ], diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 17ecdd11c..c31182e69 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -51,6 +51,8 @@ type SCMRights interface { // RightsFiles represents a SCM_RIGHTS socket control message. A reference is // maintained for each fs.File and is release either when an FD is created or // when the Release method is called. +// +// +stateify savable type RightsFiles []*fs.File // NewSCMRights creates a new SCM_RIGHTS socket control message representation @@ -128,6 +130,8 @@ func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte) []by } // scmCredentials represents an SCM_CREDENTIALS socket control message. +// +// +stateify savable type scmCredentials struct { t *kernel.Task kuid auth.KUID diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 7ad5e88c5..49af8db85 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -1,24 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "epsocket_state", - srcs = [ - "epsocket.go", - "save_restore.go", - "stack.go", - ], - out = "epsocket_state.go", - package = "epsocket", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "epsocket", srcs = [ "device.go", "epsocket.go", - "epsocket_state.go", "provider.go", "save_restore.go", "stack.go", @@ -31,7 +19,6 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/log", - "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", @@ -44,7 +31,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", 
"//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index a2927e1b9..f969a1d7c 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -95,6 +95,8 @@ type commonEndpoint interface { // SocketOperations encapsulates all the state needed to represent a network stack // endpoint in the kernel context. +// +// +stateify savable type SocketOperations struct { socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index ec1d96ccb..12b4b4767 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -26,6 +26,8 @@ import ( ) // Stack implements inet.Stack for netstack/tcpip/stack.Stack. +// +// +stateify savable type Stack struct { Stack *stack.Stack `state:"manual"` } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 227ca3926..d623718b3 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,24 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "hostinet_state", - srcs = [ - "save_restore.go", - "socket.go", - "stack.go", - ], - out = "hostinet_autogen_state.go", - package = "hostinet", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "hostinet", srcs = [ "device.go", "hostinet.go", - "hostinet_autogen_state.go", "save_restore.go", "socket.go", "socket_unsafe.go", @@ -42,7 +30,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/transport/unix", diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index b23a243f7..b852165f7 100644 --- a/pkg/sentry/socket/netlink/BUILD 
+++ b/pkg/sentry/socket/netlink/BUILD @@ -1,21 +1,11 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "netlink_state", - srcs = [ - "socket.go", - ], - out = "netlink_state.go", - package = "netlink", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "netlink", srcs = [ "message.go", - "netlink_state.go", "provider.go", "socket.go", ], @@ -36,7 +26,6 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index ba6f686e4..3a7dbc5ed 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,23 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "port_state", - srcs = ["port.go"], - out = "port_state.go", - package = "port", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "port", - srcs = [ - "port.go", - "port_state.go", - ], + srcs = ["port.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port", visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/state"], ) go_test( diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index 4ccf0b84c..1c5d4c3a5 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -32,6 +32,8 @@ import ( const maxPorts = 10000 // Manager allocates netlink port IDs. +// +// +stateify savable type Manager struct { // mu protects the fields below. 
mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 726469fc9..e1bcfe252 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,32 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "route_state", - srcs = ["protocol.go"], - out = "route_state.go", - package = "route", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "route", - srcs = [ - "protocol.go", - "route_state.go", - ], + srcs = ["protocol.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/sentry/context", - "//pkg/sentry/fs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/socket/netlink", - "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", ], ) diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index e8030c518..55a76e916 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -43,6 +43,8 @@ func typeKind(typ uint16) commandKind { } // Protocol implements netlink.Protocol. +// +// +stateify savable type Protocol struct{} var _ netlink.Protocol = (*Protocol)(nil) diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 0b8f528d0..e15d1546c 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -51,6 +51,8 @@ var netlinkSocketDevice = device.NewAnonDevice() // to/from the kernel. // // Socket implements socket.Socket. 
+// +// +stateify savable type Socket struct { socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index bd4858a34..54fe64595 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -195,6 +195,8 @@ func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { // // Care must be taken when copying ReceiveTimeout as it contains atomic // variables. +// +// +stateify savable type ReceiveTimeout struct { // ns is length of the timeout in nanoseconds. // diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 7d04d6b6b..9fe681e9a 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,15 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "unix_state", - srcs = [ - "unix.go", - ], - out = "unix_state.go", - package = "unix", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "unix", @@ -17,7 +8,6 @@ go_library( "device.go", "io.go", "unix.go", - "unix_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix", visibility = ["//pkg/sentry:internal"], @@ -37,7 +27,6 @@ go_library( "//pkg/sentry/socket/control", "//pkg/sentry/socket/epsocket", "//pkg/sentry/usermem", - "//pkg/state", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 27bacbbc3..5b6411f97 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -42,6 +42,8 @@ import ( // SocketOperations is a Unix socket. It is similar to an epsocket, except it is backed // by a unix.Endpoint instead of a tcpip.Endpoint. 
+// +// +stateify savable type SocketOperations struct { refs.AtomicRefCount socket.ReceiveTimeout diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 32fca3811..bbdfad9da 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,18 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "linux_state", - srcs = [ - "sys_aio.go", - "sys_futex.go", - "sys_poll.go", - "sys_time.go", - ], - out = "linux_state.go", - package = "linux", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "linux", @@ -20,7 +8,6 @@ go_library( "error.go", "flags.go", "linux64.go", - "linux_state.go", "sigset.go", "sys_aio.go", "sys_capability.go", @@ -67,7 +54,6 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/bpf", - "//pkg/eventchannel", "//pkg/log", "//pkg/metric", "//pkg/rand", @@ -75,7 +61,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", - "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/kernel", @@ -86,7 +71,6 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/sched", - "//pkg/sentry/kernel/semaphore", "//pkg/sentry/kernel/shm", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", @@ -98,8 +82,6 @@ go_library( "//pkg/sentry/syscalls", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/state", - "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/transport/unix", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index fc3397081..54e4afa9e 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -69,6 +69,8 @@ type ioCallback struct { } // ioEvent describes an I/O result. 
+// +// +stateify savable type ioEvent struct { Data uint64 Obj uint64 diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 57762d058..1a0e1f5fb 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -132,6 +132,8 @@ func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) { // futexWaitRestartBlock encapsulates the state required to restart futex(2) // via restart_syscall(2). +// +// +stateify savable type futexWaitRestartBlock struct { duration time.Duration diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index d4dbfd285..b9bdefadb 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -274,6 +274,8 @@ func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Du // pollRestartBlock encapsulates the state required to restart poll(2) via // restart_syscall(2). +// +// +stateify savable type pollRestartBlock struct { pfdAddr usermem.Addr nfds uint diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index dcee694b2..8e6683444 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -168,6 +168,8 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // clockNanosleepRestartBlock encapsulates the state required to restart // clock_nanosleep(2) via restart_syscall(2). 
+// +// +stateify savable type clockNanosleepRestartBlock struct { c ktime.Clock duration time.Duration diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index edee44d96..868dfd400 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -1,17 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "usage_state", - srcs = [ - "cpu.go", - "io.go", - "memory.go", - ], - out = "usage_state.go", - package = "usage", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "usage", @@ -21,7 +10,6 @@ go_library( "memory.go", "memory_unsafe.go", "usage.go", - "usage_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usage", visibility = [ @@ -29,9 +17,6 @@ go_library( ], deps = [ "//pkg/bits", - "//pkg/log", "//pkg/sentry/memutil", - "//pkg/state", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/usage/cpu.go b/pkg/sentry/usage/cpu.go index 1c2cc90e1..ed7b04b9e 100644 --- a/pkg/sentry/usage/cpu.go +++ b/pkg/sentry/usage/cpu.go @@ -20,6 +20,8 @@ import ( // CPUStats contains the subset of struct rusage fields that relate to CPU // scheduling. +// +// +stateify savable type CPUStats struct { // UserTime is the amount of time spent executing application code. UserTime time.Duration diff --git a/pkg/sentry/usage/io.go b/pkg/sentry/usage/io.go index a05053c32..49faa507d 100644 --- a/pkg/sentry/usage/io.go +++ b/pkg/sentry/usage/io.go @@ -19,6 +19,8 @@ import ( ) // IO contains I/O-related statistics. +// +// +stateify savable type IO struct { // CharsRead is the number of bytes read by read syscalls. 
CharsRead uint64 diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index 9dd1cd2b5..69ba919e0 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,19 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "usermem_state", - srcs = [ - "access_type.go", - "addr.go", - "addr_range.go", - "addr_range_seq_unsafe.go", - ], - out = "usermem_state.go", - package = "usermem", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "addr_range", @@ -36,7 +24,6 @@ go_library( "bytes_io.go", "bytes_io_unsafe.go", "usermem.go", - "usermem_state.go", "usermem_x86.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", @@ -47,7 +34,6 @@ go_library( "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/safemem", - "//pkg/state", "//pkg/syserror", "//pkg/tcpip/buffer", ], diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go index 7eabecf30..75346d854 100644 --- a/pkg/sentry/usermem/access_type.go +++ b/pkg/sentry/usermem/access_type.go @@ -20,6 +20,8 @@ import ( // AccessType specifies memory access types. This is used for // setting mapping permissions, as well as communicating faults. +// +// +stateify savable type AccessType struct { // Read is read access. Read bool diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go index d175fdc74..fc94bee80 100644 --- a/pkg/sentry/usermem/addr.go +++ b/pkg/sentry/usermem/addr.go @@ -19,6 +19,8 @@ import ( ) // Addr represents a generic virtual address. +// +// +stateify savable type Addr uintptr // AddLength adds the given length to start and returns the result. 
ok is true diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index 391d801d0..5153bd3b4 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -1,26 +1,13 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tcpip_state", - srcs = [ - "tcpip.go", - ], - out = "tcpip_state.go", - package = "tcpip", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tcpip", - srcs = [ - "tcpip.go", - "tcpip_state.go", - ], + srcs = ["tcpip.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip", visibility = ["//visibility:public"], deps = [ - "//pkg/state", "//pkg/tcpip/buffer", "//pkg/waiter", ], diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index efeb6a448..11a725423 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -1,26 +1,15 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "buffer_state", - srcs = [ - "view.go", - ], - out = "buffer_state.go", - package = "buffer", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "buffer", srcs = [ - "buffer_state.go", "prependable.go", "view.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer", visibility = ["//visibility:public"], - deps = ["//pkg/state"], ) go_test( diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index a5774a327..bbb4e1d24 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -54,6 +54,8 @@ func (v *View) ToVectorisedView(views [1]View) VectorisedView { // VectorisedView is a vectorised version of View using non contigous memory. // It supports all the convenience methods supported by View. 
+// +// +stateify savable type VectorisedView struct { views []View size int diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index 3aa2cfb24..8f22ba3a5 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -1,15 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tcp_header_state", - srcs = [ - "tcp.go", - ], - out = "tcp_header_state.go", - package = "header", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "header", @@ -25,13 +16,11 @@ go_library( "ipv6.go", "ipv6_fragment.go", "tcp.go", - "tcp_header_state.go", "udp.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/header", visibility = ["//visibility:public"], deps = [ - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/seqnum", ], diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index a95d282b0..6689a6dc5 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -120,6 +120,8 @@ type TCPSynOptions struct { } // SACKBlock represents a single contiguous SACK block. +// +// +stateify savable type SACKBlock struct { // Start indicates the lowest sequence number in the block. Start seqnum.Value @@ -131,6 +133,8 @@ type SACKBlock struct { // TCPOptions are used to parse and cache the TCP segment options for a non // syn/syn-ack segment. +// +// +stateify savable type TCPOptions struct { // TS is true if the TimeStamp option is enabled. 
TS bool diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index ac97ebe43..83b4d253f 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -1,14 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "fragmentation_state", - srcs = ["reassembler_list.go"], - out = "fragmentation_state.go", - package = "fragmentation", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "reassembler_list", @@ -26,7 +19,6 @@ go_library( srcs = [ "frag_heap.go", "fragmentation.go", - "fragmentation_state.go", "reassembler.go", "reassembler_list.go", ], @@ -34,7 +26,6 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", - "//pkg/state", "//pkg/tcpip/buffer", ], ) diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD index a75869dac..c5c889239 100644 --- a/pkg/tcpip/seqnum/BUILD +++ b/pkg/tcpip/seqnum/BUILD @@ -1,25 +1,12 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "seqnum_state", - srcs = [ - "seqnum.go", - ], - out = "seqnum_state.go", - package = "seqnum", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "seqnum", - srcs = [ - "seqnum.go", - "seqnum_state.go", - ], + srcs = ["seqnum.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum", visibility = [ "//visibility:public", ], - deps = ["//pkg/state"], ) diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index eb1e4645d..af0aec85c 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -213,6 +213,8 @@ const ( // FullAddress represents a full transport node address, as required by the // Connect() and Bind() methods. 
+// +// +stateify savable type FullAddress struct { // NIC is the ID of the NIC this address refers to. // @@ -256,6 +258,8 @@ func (s SlicePayload) Size() int { } // A ControlMessages contains socket control messages for IP sockets. +// +// +stateify savable type ControlMessages struct { // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD index 28e3e1700..117532fea 100644 --- a/pkg/tcpip/transport/ping/BUILD +++ b/pkg/tcpip/transport/ping/BUILD @@ -1,19 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "ping_state", - srcs = [ - "endpoint.go", - "endpoint_state.go", - "ping_packet_list.go", - ], - out = "ping_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], - package = "ping", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_template_instance( name = "ping_packet_list", @@ -32,14 +20,13 @@ go_library( "endpoint.go", "endpoint_state.go", "ping_packet_list.go", - "ping_state.go", "protocol.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/sleep", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index f15e44b61..a22684de9 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -26,6 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) +// +stateify savable type pingPacket struct { pingPacketEntry senderAddress tcpip.FullAddress diff --git a/pkg/tcpip/transport/queue/BUILD b/pkg/tcpip/transport/queue/BUILD index fb878ad36..6dcec312e 100644 --- 
a/pkg/tcpip/transport/queue/BUILD +++ b/pkg/tcpip/transport/queue/BUILD @@ -1,27 +1,14 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "queue_state", - srcs = [ - "queue.go", - ], - out = "queue_state.go", - package = "queue", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "queue", - srcs = [ - "queue.go", - "queue_state.go", - ], + srcs = ["queue.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/queue", visibility = ["//:sandbox"], deps = [ "//pkg/ilist", - "//pkg/state", "//pkg/tcpip", "//pkg/waiter", ], diff --git a/pkg/tcpip/transport/queue/queue.go b/pkg/tcpip/transport/queue/queue.go index 6a17441ae..eb9ee8a3f 100644 --- a/pkg/tcpip/transport/queue/queue.go +++ b/pkg/tcpip/transport/queue/queue.go @@ -33,6 +33,8 @@ type Entry interface { } // Queue is a buffer queue. +// +// +stateify savable type Queue struct { ReaderQueue *waiter.Queue WriterQueue *waiter.Queue diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 6a7153e4d..9ebae6cc7 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,27 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "tcp_state", - srcs = [ - "endpoint.go", - "endpoint_state.go", - "rcv.go", - "reno.go", - "segment.go", - "segment_heap.go", - "segment_queue.go", - "segment_state.go", - "snd.go", - "snd_state.go", - "tcp_segment_list.go", - ], - out = "tcp_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], - package = "tcp", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "tcp_segment_list", @@ -53,15 +33,14 @@ go_library( "snd.go", "snd_state.go", "tcp_segment_list.go", - "tcp_state.go", "timer.go", ], 
importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/rand", "//pkg/sleep", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 5b8a1e20f..de1883d84 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -54,6 +54,8 @@ const ( ) // SACKInfo holds TCP SACK related information for a given endpoint. +// +// +stateify savable type SACKInfo struct { // Blocks is the maximum number of SACK blocks we track // per endpoint. @@ -69,6 +71,8 @@ type SACKInfo struct { // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. The protocol implementation, however, runs in a single // goroutine. +// +// +stateify savable type endpoint struct { // workMu is used to arbitrate which goroutine may perform protocol // work. Only the main protocol goroutine is expected to call Lock() on diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index b22a00ce1..92ef9c6f7 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -22,6 +22,8 @@ import ( // receiver holds the state necessary to receive TCP segments and turn them // into a stream of bytes. +// +// +stateify savable type receiver struct { ep *endpoint diff --git a/pkg/tcpip/transport/tcp/reno.go b/pkg/tcpip/transport/tcp/reno.go index 60f170a27..03ae8d747 100644 --- a/pkg/tcpip/transport/tcp/reno.go +++ b/pkg/tcpip/transport/tcp/reno.go @@ -16,6 +16,8 @@ package tcp // renoState stores the variables related to TCP New Reno congestion // control algorithm. 
+// +// +stateify savable type renoState struct { s *sender } diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 40928ba2c..8dccea2ba 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -36,6 +36,8 @@ const ( // segment represents a TCP segment. It holds the payload and parsed TCP segment // information, and can be added to intrusive lists. // segment is mostly immutable, the only field allowed to change is viewToDeliver. +// +// +stateify savable type segment struct { segmentEntry refCnt int32 diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index 2ddcf5f10..6a2d7bc0b 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -21,6 +21,8 @@ import ( ) // segmentQueue is a bounded, thread-safe queue of TCP segments. +// +// +stateify savable type segmentQueue struct { mu sync.Mutex `state:"nosave"` list segmentList `state:"wait"` diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index e38686e1b..376e81846 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -54,6 +54,8 @@ type congestionControl interface { } // sender holds the state necessary to send TCP segments. +// +// +stateify savable type sender struct { ep *endpoint @@ -133,6 +135,8 @@ type sender struct { } // fastRecovery holds information related to fast recovery from a packet loss. +// +// +stateify savable type fastRecovery struct { // active whether the endpoint is in fast recovery. The following fields // are only meaningful when active is true. 
diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go index 33c8867f4..d536839af 100644 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -18,6 +18,7 @@ import ( "time" ) +// +stateify savable type unixTime struct { second int64 nano int64 diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 790dd55a3..1a3a62d3d 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -1,19 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "udp_state", - srcs = [ - "endpoint.go", - "endpoint_state.go", - "udp_packet_list.go", - ], - out = "udp_state.go", - imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], - package = "udp", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( name = "udp_packet_list", @@ -33,13 +21,12 @@ go_library( "endpoint_state.go", "protocol.go", "udp_packet_list.go", - "udp_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp", + imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ "//pkg/sleep", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 2a32c3a87..03fb76f92 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) +// +stateify savable type udpPacket struct { udpPacketEntry senderAddress tcpip.FullAddress @@ -49,6 +50,8 @@ const ( // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. 
+// +// +stateify savable type endpoint struct { // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. diff --git a/pkg/tcpip/transport/unix/BUILD b/pkg/tcpip/transport/unix/BUILD index 676f2cf92..dae0bd079 100644 --- a/pkg/tcpip/transport/unix/BUILD +++ b/pkg/tcpip/transport/unix/BUILD @@ -1,17 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify") - -go_stateify( - name = "unix_state", - srcs = [ - "connectioned.go", - "connectionless.go", - "unix.go", - ], - out = "unix_state.go", - package = "unix", -) +load("//tools/go_stateify:defs.bzl", "go_library") go_library( name = "unix", @@ -20,14 +9,11 @@ go_library( "connectioned_state.go", "connectionless.go", "unix.go", - "unix_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix", visibility = ["//:sandbox"], deps = [ "//pkg/ilist", - "//pkg/log", - "//pkg/state", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/transport/queue", diff --git a/pkg/tcpip/transport/unix/connectioned.go b/pkg/tcpip/transport/unix/connectioned.go index 0e63186b2..dd7c03cf1 100644 --- a/pkg/tcpip/transport/unix/connectioned.go +++ b/pkg/tcpip/transport/unix/connectioned.go @@ -85,6 +85,8 @@ type ConnectingEndpoint interface { // path != "" && acceptedChan != nil => bound and listening. // // Only one of these will be true at any moment. +// +// +stateify savable type connectionedEndpoint struct { baseEndpoint diff --git a/pkg/tcpip/transport/unix/connectionless.go b/pkg/tcpip/transport/unix/connectionless.go index 3276ddcd0..2a6ec8b4b 100644 --- a/pkg/tcpip/transport/unix/connectionless.go +++ b/pkg/tcpip/transport/unix/connectionless.go @@ -25,6 +25,8 @@ import ( // // Specifically, this means datagram unix sockets not created with // socketpair(2). 
+// +// +stateify savable type connectionlessEndpoint struct { baseEndpoint } diff --git a/pkg/tcpip/transport/unix/unix.go b/pkg/tcpip/transport/unix/unix.go index 190a1ccdb..8e4af3139 100644 --- a/pkg/tcpip/transport/unix/unix.go +++ b/pkg/tcpip/transport/unix/unix.go @@ -60,6 +60,8 @@ type CredentialsControlMessage interface { } // A ControlMessages represents a collection of socket control messages. +// +// +stateify savable type ControlMessages struct { // Rights is a control message containing FDs. Rights RightsControlMessage @@ -235,6 +237,8 @@ type BoundEndpoint interface { } // message represents a message passed over a Unix domain socket. +// +// +stateify savable type message struct { ilist.Entry @@ -306,6 +310,8 @@ type Receiver interface { } // queueReceiver implements Receiver for datagram sockets. +// +// +stateify savable type queueReceiver struct { readQueue *queue.Queue } @@ -369,6 +375,8 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 { func (*queueReceiver) Release() {} // streamQueueReceiver implements Receiver for stream sockets. +// +// +stateify savable type streamQueueReceiver struct { queueReceiver @@ -579,6 +587,7 @@ type ConnectedEndpoint interface { Release() } +// +stateify savable type connectedEndpoint struct { // endpoint represents the subset of the Endpoint functionality needed by // the connectedEndpoint. It is implemented by both connectionedEndpoint @@ -671,6 +680,8 @@ func (*connectedEndpoint) Release() {} // unix domain socket Endpoint implementations. // // Not to be used on its own. 
+// +// +stateify savable type baseEndpoint struct { *waiter.Queue diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 8256acdb4..5e611c54f 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,28 +1,13 @@ package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_stateify", "go_test") - -go_stateify( - name = "waiter_state", - srcs = [ - "waiter.go", - ], - out = "waiter_state.go", - package = "waiter", -) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "waiter", - srcs = [ - "waiter.go", - "waiter_state.go", - ], + srcs = ["waiter.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/waiter", visibility = ["//visibility:public"], - deps = [ - "//pkg/ilist", - "//pkg/state", - ], + deps = ["//pkg/ilist"], ) go_test( diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 9b189bb9e..9825880ca 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -157,6 +157,8 @@ func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) { // notifiers can notify them when events happen. // // The zero value for waiter.Queue is an empty queue ready for use. 
+// +// +stateify savable type Queue struct { list ilist.List `state:"zerovalue"` mu sync.RWMutex `state:"nosave"` -- cgit v1.2.3 From b6a37ab9d96b382e26e3836a42ea485c48a521a8 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 2 Aug 2018 15:55:19 -0700 Subject: Update comment reference PiperOrigin-RevId: 207180809 Change-Id: I08c264812919e81b2c56fdd4a9ef06924de8b52f --- pkg/abi/linux/tty.go | 24 ++++++++++++------------ pkg/sentry/fs/mount.go | 8 ++++---- pkg/sentry/syscalls/linux/sys_prctl.go | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index b640f7627..8c611d22a 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -286,18 +286,18 @@ var DefaultControlCharacters = [NumControlCharacters]uint8{ '\x7f', // VERASE = DEL ControlCharacter('U'), // VKILL = ^U ControlCharacter('D'), // VEOF = ^D - 0, // VTIME - 1, // VMIN - 0, // VSWTC - ControlCharacter('Q'), // VSTART = ^Q - ControlCharacter('S'), // VSTOP = ^S - ControlCharacter('Z'), // VSUSP = ^Z - 0, // VEOL - ControlCharacter('R'), // VREPRINT = ^R - ControlCharacter('O'), // VDISCARD = ^O - ControlCharacter('W'), // VWERASE = ^W - ControlCharacter('V'), // VLNEXT = ^V - 0, // VEOL2 + 0, // VTIME + 1, // VMIN + 0, // VSWTC + ControlCharacter('Q'), // VSTART = ^Q + ControlCharacter('S'), // VSTOP = ^S + ControlCharacter('Z'), // VSUSP = ^Z + 0, // VEOL + ControlCharacter('R'), // VREPRINT = ^R + ControlCharacter('O'), // VDISCARD = ^O + ControlCharacter('W'), // VWERASE = ^W + ControlCharacter('V'), // VLNEXT = ^V + 0, // VEOL2 } // MasterTermios is the terminal configuration of the master end of a Unix98 diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 4ede767f9..c72372929 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -159,10 +159,10 @@ const defaultDirentCacheSize uint64 = 1000 func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags 
MountSourceFlags) *MountSource { return &MountSource{ MountSourceOperations: mops, - Flags: flags, - Filesystem: filesystem, - fscache: NewDirentCache(defaultDirentCacheSize), - children: make(map[*MountSource]struct{}), + Flags: flags, + Filesystem: filesystem, + fscache: NewDirentCache(defaultDirentCacheSize), + children: make(map[*MountSource]struct{}), } } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 911fef658..a1242acd3 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -113,7 +113,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syscall.EINVAL } // no_new_privs is assumed to always be set. See - // auth.Credentials.UpdateForExec. + // kernel.Task.updateCredsForExec. return 0, nil, nil case linux.PR_GET_NO_NEW_PRIVS: -- cgit v1.2.3 From a3927157c56cc022cefebc30c8a9b6014f5d0412 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 2 Aug 2018 16:00:29 -0700 Subject: Copy creds in access PiperOrigin-RevId: 207181631 Change-Id: Ic6205278715a9260fb970efb414fc758ea72c4c6 --- pkg/sentry/syscalls/linux/sys_file.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 692f11ed7..94b7ac7a5 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -453,7 +453,7 @@ func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, m // uid/gid. We do this by temporarily clearing all FS-related // capabilities and switching the fsuid/fsgid around to the // real ones." 
-fs/open.c:faccessat - creds := t.Credentials() + creds := t.Credentials().Fork() creds.EffectiveKUID = creds.RealKUID creds.EffectiveKGID = creds.RealKGID if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { -- cgit v1.2.3 From 25178ebdf5e881eae8e81eaf2f69d96de42d2250 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Fri, 3 Aug 2018 12:07:57 -0700 Subject: stateify: make explicit mode no longer optional. PiperOrigin-RevId: 207303405 Change-Id: I17b6433963d78e3631a862b7ac80f566c8e7d106 --- pkg/sentry/fs/host/socket.go | 2 +- tools/go_stateify/defs.bzl | 22 +++++++++++++++------- tools/go_stateify/main.go | 32 ++++++++++++++------------------ 3 files changed, 30 insertions(+), 26 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 1d93eb1e3..4ace71c3e 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -37,7 +37,7 @@ import ( // // +stateify savable type endpoint struct { - queue waiter.Queue `state:"nosave"` + queue waiter.Queue `state:"zerovalue"` // stype is the type of Unix socket. (Ex: unix.SockStream, // unix.SockSeqpacket, unix.SockDgram) diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl index 2b2582b7a..70ce73d7b 100644 --- a/tools/go_stateify/defs.bzl +++ b/tools/go_stateify/defs.bzl @@ -1,7 +1,20 @@ """Stateify is a tool for generating state wrappers for Go types. -The go_stateify rule is used to generate a file that will appear in a Go -target; the output file should appear explicitly in a srcs list. For example: +The recommended way is to use the go_library rule defined below with mostly +identical configuration as the native go_library rule. + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "foo", + srcs = ["foo.go"], +) + +Under the hood, the go_stateify rule is used to generate a file that will +appear in a Go target; the output file should appear explicitly in a srcs list. 
+For example (the above is still the preferred way): + +load("//tools/go_stateify:defs.bzl", "go_stateify") go_stateify( name = "foo_state", @@ -35,8 +48,6 @@ def _go_stateify_impl(ctx): args += ["-statepkg=%s" % ctx.attr._statepkg] if ctx.attr.imports: args += ["-imports=%s" % ",".join(ctx.attr.imports)] - if ctx.attr.explicit: - args += ["-explicit=true"] args += ["--"] for src in ctx.attr.srcs: args += [f.path for f in src.files] @@ -57,7 +68,6 @@ def _go_stateify_impl(ctx): # imports: an optional list of extra non-aliased, Go-style absolute import paths. # out: the name of the generated file output. This must not conflict with any other files and must be added to the srcs of the relevant go_library. # package: the package name for the input sources. -# explicit: only generate for types explicitly annotated as savable. go_stateify = rule( implementation = _go_stateify_impl, attrs = { @@ -65,7 +75,6 @@ go_stateify = rule( "imports": attr.string_list(mandatory = False), "package": attr.string(mandatory = True), "out": attr.output(mandatory = True), - "explicit": attr.bool(default = False), "_tool": attr.label(executable = True, cfg = "host", default = Label("//tools/go_stateify:stateify")), "_statepkg": attr.string(default = "gvisor.googlesource.com/gvisor/pkg/state"), }, @@ -81,7 +90,6 @@ def go_library(name, srcs, deps = [], imports = [], **kwargs): imports = imports, package = name, out = name + "_state_autogen.go", - explicit = True, ) all_srcs = srcs + [name + "_state_autogen.go"] if "//pkg/state" not in deps: diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 231c6d80b..5646b879a 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -33,7 +33,6 @@ var ( imports = flag.String("imports", "", "extra imports for the output file") output = flag.String("output", "", "output file") statePkg = flag.String("statepkg", "", "state import package; defaults to empty") - explicit = flag.Bool("explicit", false, "only generate for 
types explicitly tagged '// +stateify savable'") ) // resolveTypeName returns a qualified type name. @@ -318,25 +317,22 @@ func main() { continue } - if *explicit { - // In explicit mode, only generate code for - // types explicitly marked - // "// +stateify savable" in one of the - // proceeding comment lines. - if d.Doc == nil { - continue - } - savable := false - for _, l := range d.Doc.List { - if l.Text == "// +stateify savable" { - savable = true - break - } - } - if !savable { - continue + // Only generate code for types marked + // "// +stateify savable" in one of the proceeding + // comment lines. + if d.Doc == nil { + continue + } + savable := false + for _, l := range d.Doc.List { + if l.Text == "// +stateify savable" { + savable = true + break } } + if !savable { + continue + } for _, gs := range d.Specs { ts := gs.(*ast.TypeSpec) -- cgit v1.2.3 From 3ec074897f9d0aba21bc9f41be18f52bfbeb599e Mon Sep 17 00:00:00 2001 From: ShiruRen Date: Mon, 6 Aug 2018 10:10:25 -0700 Subject: Fix a bug in PCIDs.Assign Store the new assigned pcid in p.cache[pt]. Signed-off-by: ShiruRen Change-Id: I4aee4e06559e429fb5e90cb9fe28b36139e3b4b6 PiperOrigin-RevId: 207563833 --- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 1 + 1 file changed, 1 insertion(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index 4296371e8..227cf7aad 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -64,6 +64,7 @@ func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) { if len(p.avail) > 0 { pcid := p.avail[len(p.avail)-1] p.avail = p.avail[:len(p.avail)-1] + p.cache[pt] = pcid // We need to flush because while this is in the available // pool, it may have been used previously. 
-- cgit v1.2.3 From 42086fe8e1244d93e87f0cada9d0b1a8b764c6a8 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 6 Aug 2018 10:14:40 -0700 Subject: Make ramfs.File savable In other news, apparently proc.fdInfo is the last user of ramfs.File. PiperOrigin-RevId: 207564572 Change-Id: I5a92515698cc89652b80bea9a32d309e14059869 --- pkg/sentry/fs/ramfs/file.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/ramfs/file.go b/pkg/sentry/fs/ramfs/file.go index e8363c3e2..0b94d92a1 100644 --- a/pkg/sentry/fs/ramfs/file.go +++ b/pkg/sentry/fs/ramfs/file.go @@ -30,6 +30,8 @@ import ( // thus should only be used for small files. // // A File is not mappable. +// +// +stateify savable type File struct { Entry -- cgit v1.2.3 From c348d0786388ded1a4bad3c98000b4653724c764 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Tue, 7 Aug 2018 10:26:17 -0700 Subject: sentry: make epoll.pollEntry wait for the file operation in restore. PiperOrigin-RevId: 207737935 Change-Id: I3a301ece1f1d30909715f36562474e3248b6a0d5 --- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/kernel/epoll/epoll.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 8e535a618..904827a3e 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -91,7 +91,7 @@ type File struct { mu amutex.AbortableMutex `state:"nosave"` // FileOperations implements file system specific behavior for this File. - FileOperations FileOperations + FileOperations FileOperations `state:"wait"` // offset is the File's offset. Updating offset is protected by mu but // can be read atomically via File.Offset() outside of mu. 
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 761ccde33..66c17debb 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -77,7 +77,7 @@ type inodeFileState struct { descriptor *descriptor `state:"wait"` // Event queue for blocking operations. - queue waiter.Queue `state:"nosave"` + queue waiter.Queue `state:"zerovalue"` // sattr is used to restore the inodeOperations. sattr fs.StableAttr `state:"wait"` diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index d87e64a1c..a8eb114c0 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -61,7 +61,7 @@ const ( // // +stateify savable type FileIdentifier struct { - File *fs.File + File *fs.File `state:"wait"` Fd kdefs.FD } -- cgit v1.2.3 From a38f41b4643520a3b2a078e73ec012ffd3f71f54 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 7 Aug 2018 11:42:29 -0700 Subject: fs: Add new cache policy "remote_revalidate". This CL adds a new cache-policy for gofer filesystems that uses the host page cache, but causes dirents to be reloaded on each Walk, and does not cache readdir results. This policy is useful when the remote filesystem may change out from underneath us, as any remote changes will be reflected on the next Walk. Importantly, this cache policy is only consistent if we do not use gVisor's internal page cache, since that page cache is tied to the Inode and may be thrown away upon Revalidation. This cache policy should only be used when the gofer supports donating host FDs, since then gVisor will make use of the host kernel page cache, which will be consistent for all open files in the gofer. In fact, a panic will be raised if a file is opened without a donated FD. 
PiperOrigin-RevId: 207752937 Change-Id: I233cb78b4695bbe00a4605ae64080a47629329b8 --- pkg/sentry/fs/gofer/cache_policy.go | 46 ++- pkg/sentry/fs/gofer/file.go | 13 +- pkg/sentry/fs/gofer/gofer_test.go | 597 +++++++++++++++++++++--------------- 3 files changed, 408 insertions(+), 248 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index eec8c07cb..52d97b54f 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -34,8 +34,34 @@ const ( // Use virtual file system cache for everything, but send writes to the // fs agent immediately. cacheAllWritethrough + + // Use virtual file system cache for everything, but reload dirents + // from the remote filesystem on each lookup. Thus, if the remote + // filesystem has changed, the returned dirent will have the updated + // state. + // + // This policy should *only* be used with remote filesystems that + // donate their host FDs to the sandbox and thus use the host page + // cache, otherwise the dirent state will be inconsistent. + cacheRemoteRevalidating ) +// String returns the string name of the cache policy. 
+func (cp cachePolicy) String() string { + switch cp { + case cacheNone: + return "cacheNone" + case cacheAll: + return "cacheAll" + case cacheAllWritethrough: + return "cacheAllWritethrough" + case cacheRemoteRevalidating: + return "cacheRemoteRevalidating" + default: + return "unknown" + } +} + func parseCachePolicy(policy string) (cachePolicy, error) { switch policy { case "fscache": @@ -44,6 +70,8 @@ func parseCachePolicy(policy string) (cachePolicy, error) { return cacheNone, nil case "fscache_writethrough": return cacheAllWritethrough, nil + case "remote_revalidating": + return cacheRemoteRevalidating, nil } return cacheNone, fmt.Errorf("unsupported cache mode: %s", policy) } @@ -63,14 +91,16 @@ func (cp cachePolicy) cacheReaddir() bool { } // usePageCache determines whether the page cache should be used for the given -// inode. +// inode. If the remote filesystem donates host FDs to the sentry, then the +// host kernel's page cache will be used, otherwise we will use a +// sentry-internal page cache. func (cp cachePolicy) usePageCache(inode *fs.Inode) bool { // Do cached IO for regular files only. Some "character devices" expect // no caching. if !fs.IsFile(inode.StableAttr) { return false } - return cp == cacheAll || cp == cacheAllWritethrough + return cp == cacheAll || cp == cacheAllWritethrough || cp == cacheRemoteRevalidating } // writeThough indicates whether writes to the file should be synced to the @@ -79,10 +109,16 @@ func (cp cachePolicy) writeThrough(inode *fs.Inode) bool { return cp == cacheNone || cp == cacheAllWritethrough } -// revalidateDirent indicates that dirents should be revalidated after they are -// looked up. +// revalidateDirent indicates that a dirent should be revalidated after a +// lookup, because the looked up version may be stale. 
func (cp cachePolicy) revalidateDirent() bool { - return cp == cacheNone + if cp == cacheAll || cp == cacheAllWritethrough { + return false + } + + // TODO: The cacheRemoteRevalidating policy should only + // return true if the remote file's attributes have changed. + return true } // keepDirent indicates that dirents should be kept pinned in the dirent tree diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 46a6bbd5d..c4a210656 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -15,6 +15,7 @@ package gofer import ( + "fmt" "syscall" "gvisor.googlesource.com/gvisor/pkg/log" @@ -72,6 +73,17 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF flags.Pread = true flags.Pwrite = true + if fs.IsFile(dirent.Inode.StableAttr) { + // If cache policy is "remote revalidating", then we must + // ensure that we have a host FD. Otherwise, the + // sentry-internal page cache will be used, and we can end up + // in an inconsistent state if the remote file changes. 
+ cp := dirent.Inode.InodeOperations.(*inodeOperations).session().cachePolicy + if cp == cacheRemoteRevalidating && handles.Host == nil { + panic(fmt.Sprintf("remote-revalidating cache policy requires gofer to donate host FD, but file %q did not have host FD", name)) + } + } + f := &fileOperations{ inodeOperations: i, handles: handles, @@ -202,7 +214,6 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I err = f.inodeOperations.cachingInodeOps.WriteOut(ctx, file.Dirent.Inode) } return n, err - } return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) } diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 3df72dd37..764b530cb 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -15,11 +15,11 @@ package gofer import ( - "errors" "fmt" "io" "syscall" "testing" + "time" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" @@ -32,15 +32,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) -// A errMock is an error that comes from bad usage of the mock. -var errMock = errors.New("mock error") - // goodMockFile returns a file that can be Walk'ed to and created. func goodMockFile(mode p9.FileMode, size uint64) *p9test.FileMock { return &p9test.FileMock{ GetAttrMock: p9test.GetAttrMock{ - Valid: p9.AttrMask{Mode: true, Size: true, RDev: true}, Attr: p9.Attr{Mode: mode, Size: size, RDev: 0}, + Valid: p9.AttrMaskAll(), }, } } @@ -62,7 +59,7 @@ func newClosedSocket() (*unet.Socket, error) { // root returns a p9 file mock and an fs.InodeOperations created from that file. Any // functions performed on fs.InodeOperations will use the p9 file mock. 
-func root(ctx context.Context, mode p9.FileMode, size uint64) (*p9test.FileMock, *fs.Inode, error) { +func root(ctx context.Context, cp cachePolicy, mode p9.FileMode, size uint64) (*p9test.FileMock, *fs.Inode, error) { sock, err := newClosedSocket() if err != nil { return nil, nil, err @@ -72,7 +69,8 @@ func root(ctx context.Context, mode p9.FileMode, size uint64) (*p9test.FileMock, s := &session{ conn: sock, mounter: fs.RootOwner, - cachePolicy: cacheNone, + cachePolicy: cp, + client: &p9.Client{}, } rootFile := goodMockFile(mode, size) @@ -109,43 +107,149 @@ func TestLookup(t *testing.T) { ctx := contexttest.Context(t) for _, test := range tests { - // Set up mock. - rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) - if err != nil { - t.Errorf("TestWalk %s failed: root error got %v, want nil", test.name, err) - } - - rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} - rootFile.WalkGetAttrMock.Err = test.want - rootFile.WalkGetAttrMock.File = goodMockFile(p9.PermissionsMask, 0) - - // Call function. - dirent, err := rootInode.Lookup(ctx, test.fileName) - - // Unwrap the InodeOperations. - var newInodeOperations fs.InodeOperations - if dirent != nil { - if dirent.IsNegative() { - err = syscall.ENOENT - } else { - newInodeOperations = dirent.Inode.InodeOperations + t.Run(test.name, func(t *testing.T) { + // Set up mock. + rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) + if err != nil { + t.Fatalf("error creating root: %v", err) + } + + rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} + rootFile.WalkGetAttrMock.Err = test.want + rootFile.WalkGetAttrMock.File = goodMockFile(p9.PermissionsMask, 0) + + // Call function. + dirent, err := rootInode.Lookup(ctx, test.fileName) + + // Unwrap the InodeOperations. + var newInodeOperations fs.InodeOperations + if dirent != nil { + if dirent.IsNegative() { + err = syscall.ENOENT + } else { + newInodeOperations = dirent.Inode.InodeOperations + } + } + + // Check return values. 
+ if err != test.want { + t.Errorf("Lookup got err %v, want %v", err, test.want) + } + if err == nil && newInodeOperations == nil { + t.Errorf("Lookup got non-nil err and non-nil node, wanted at least one non-nil") + } + + // Check mock parameters. + if !rootFile.WalkGetAttrMock.Called { + t.Errorf("GetAttr not called; error: %v", err) + } else if rootFile.WalkGetAttrMock.Names[0] != test.fileName { + t.Errorf("file name not set") + } + }) + } +} + +func TestRevalidation(t *testing.T) { + tests := []struct { + cachePolicy cachePolicy + preModificationWantReval bool + postModificationWantReval bool + }{ + { + // Policy cacheNone causes Revalidate to always return + // true. + cachePolicy: cacheNone, + preModificationWantReval: true, + postModificationWantReval: true, + }, + { + // Policy cacheAll causes Revalidate to always return + // false. + cachePolicy: cacheAll, + preModificationWantReval: false, + postModificationWantReval: false, + }, + { + // Policy cacheAllWritethrough causes Revalidate to + // always return false. + cachePolicy: cacheAllWritethrough, + preModificationWantReval: false, + postModificationWantReval: false, + }, + { + // Policy cacheRemoteRevalidating causes Revalidate to + // always return true. + // + // TODO: The cacheRemoteRevalidating + // policy should only return true if the remote file's + // attributes have changed. + cachePolicy: cacheRemoteRevalidating, + preModificationWantReval: true, + postModificationWantReval: true, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + name := fmt.Sprintf("cachepolicy=%s", test.cachePolicy) + t.Run(name, func(t *testing.T) { + // Set up mock. + rootFile, rootInode, err := root(ctx, test.cachePolicy, p9.ModeDirectory|p9.PermissionsMask, 0) + if err != nil { + t.Fatalf("error creating root: %v", err) + } + + rootDir := fs.NewDirent(rootInode, "root") + + // Create a mock file that we will walk to from the root. 
+ const ( + name = "foo" + mode = p9.PermissionsMask + ) + file := goodMockFile(mode, 0) + file.GetAttrMock.Valid = p9.AttrMaskAll() + + // Tell the root mock how to walk to this file. + rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} + rootFile.WalkGetAttrMock.File = file + rootFile.WalkGetAttrMock.Attr = file.GetAttrMock.Attr + rootFile.WalkGetAttrMock.Valid = file.GetAttrMock.Valid + + // Do the walk. + dirent, err := rootDir.Walk(ctx, rootDir, name) + if err != nil { + t.Fatalf("Lookup(%q) failed: %v", name, err) + } + + // Walk again. Depending on the cache policy, we may get a new + // dirent. + newDirent, err := rootDir.Walk(ctx, rootDir, name) + if err != nil { + t.Fatalf("Lookup(%q) failed: %v", name, err) + } + if test.preModificationWantReval && dirent == newDirent { + t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) + } + if !test.preModificationWantReval && dirent != newDirent { + t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) } - } - // Check return values. - if err != test.want { - t.Errorf("TestWalk %s failed: got %v, want %v", test.name, err, test.want) - } - if err == nil && newInodeOperations == nil { - t.Errorf("TestWalk %s failed: expected either non-nil err or non-nil node, but both are nil", test.name) - } + // Modify the underlying mocked file's modification time. + file.GetAttrMock.Attr.MTimeSeconds = uint64(time.Now().Unix()) - // Check mock parameters. - if !rootFile.WalkGetAttrMock.Called { - t.Errorf("TestWalk %s failed: GetAttr not called; error: %v", test.name, err) - } else if rootFile.WalkGetAttrMock.Names[0] != test.fileName { - t.Errorf("TestWalk %s failed: file name not set", test.name) - } + // Walk again. Depending on the cache policy, we may get a new + // dirent. 
+ newDirent, err = rootDir.Walk(ctx, rootDir, name) + if err != nil { + t.Fatalf("Lookup(%q) failed: %v", name, err) + } + if test.postModificationWantReval && dirent == newDirent { + t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) + } + if !test.postModificationWantReval && dirent != newDirent { + t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) + } + }) } } @@ -197,56 +301,57 @@ func TestSetTimestamps(t *testing.T) { } for _, test := range tests { - // Set up mock. - rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) - if err != nil { - t.Errorf("TestSetTimestamps %s failed: root error got %v, want nil", test.name, err) - } - - // Call function. - err = rootInode.SetTimestamps(ctx, nil /* Dirent */, test.ts) - - // Check return values. - if err != nil { - t.Errorf("TestSetTimestamps %s failed: got %v, want nil", test.name, err) - } + t.Run(test.name, func(t *testing.T) { + // Set up mock. + rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) + if err != nil { + t.Fatalf("error creating root: %v", err) + } - // Check mock parameters. - if !(test.ts.ATimeOmit && test.ts.MTimeOmit) && !rootFile.SetAttrMock.Called { - t.Errorf("TestSetTimestamps %s failed: SetAttr not called", test.name) - continue - } + // Call function. + err = rootInode.SetTimestamps(ctx, nil /* Dirent */, test.ts) - // Check what was passed to the mock function. 
- attr := rootFile.SetAttrMock.Attr - atimeGiven := ktime.FromUnix(int64(attr.ATimeSeconds), int64(attr.ATimeNanoSeconds)) - if test.ts.ATimeOmit { - if rootFile.SetAttrMock.Valid.ATime { - t.Errorf("TestSetTimestamps %s failed: ATime got set true in mask, wanted false", test.name) - } - } else { - if got, want := rootFile.SetAttrMock.Valid.ATimeNotSystemTime, !test.ts.ATimeSetSystemTime; got != want { - t.Errorf("TestSetTimestamps %s failed: got ATimeNotSystemTime %v, want %v", test.name, got, want) + // Check return values. + if err != nil { + t.Errorf("SetTimestamps failed: got error %v, want nil", err) } - if !test.ts.ATimeSetSystemTime && !test.ts.ATime.Equal(atimeGiven) { - t.Errorf("TestSetTimestamps %s failed: ATime got %v, want %v", test.name, atimeGiven, test.ts.ATime) - } - } - mtimeGiven := ktime.FromUnix(int64(attr.MTimeSeconds), int64(attr.MTimeNanoSeconds)) - if test.ts.MTimeOmit { - if rootFile.SetAttrMock.Valid.MTime { - t.Errorf("TestSetTimestamps %s failed: MTime got set true in mask, wanted false", test.name) - } - } else { - if got, want := rootFile.SetAttrMock.Valid.MTimeNotSystemTime, !test.ts.MTimeSetSystemTime; got != want { - t.Errorf("TestSetTimestamps %s failed: got MTimeNotSystemTime %v, want %v", test.name, got, want) + // Check mock parameters. + if !(test.ts.ATimeOmit && test.ts.MTimeOmit) && !rootFile.SetAttrMock.Called { + t.Errorf("TestSetTimestamps failed: SetAttr not called") + return } - if !test.ts.MTimeSetSystemTime && !test.ts.MTime.Equal(mtimeGiven) { - t.Errorf("TestSetTimestamps %s failed: MTime got %v, want %v", test.name, mtimeGiven, test.ts.MTime) + + // Check what was passed to the mock function. 
+ attr := rootFile.SetAttrMock.Attr + atimeGiven := ktime.FromUnix(int64(attr.ATimeSeconds), int64(attr.ATimeNanoSeconds)) + if test.ts.ATimeOmit { + if rootFile.SetAttrMock.Valid.ATime { + t.Errorf("ATime got set true in mask, wanted false") + } + } else { + if got, want := rootFile.SetAttrMock.Valid.ATimeNotSystemTime, !test.ts.ATimeSetSystemTime; got != want { + t.Errorf("got ATimeNotSystemTime %v, want %v", got, want) + } + if !test.ts.ATimeSetSystemTime && !test.ts.ATime.Equal(atimeGiven) { + t.Errorf("ATime got %v, want %v", atimeGiven, test.ts.ATime) + } } - } + mtimeGiven := ktime.FromUnix(int64(attr.MTimeSeconds), int64(attr.MTimeNanoSeconds)) + if test.ts.MTimeOmit { + if rootFile.SetAttrMock.Valid.MTime { + t.Errorf("MTime got set true in mask, wanted false") + } + } else { + if got, want := rootFile.SetAttrMock.Valid.MTimeNotSystemTime, !test.ts.MTimeSetSystemTime; got != want { + t.Errorf("got MTimeNotSystemTime %v, want %v", got, want) + } + if !test.ts.MTimeSetSystemTime && !test.ts.MTime.Equal(mtimeGiven) { + t.Errorf("MTime got %v, want %v", mtimeGiven, test.ts.MTime) + } + } + }) } } @@ -283,43 +388,43 @@ func TestSetPermissions(t *testing.T) { ctx := contexttest.Context(t) for _, test := range tests { - // Set up mock. - rootFile, rootInode, err := root(ctx, 0, 0) - if err != nil { - t.Errorf("TestSetPermissions %s failed: root error got %v, want nil", test.name, err) - } - rootFile.SetAttrMock.Err = test.setAttrErr - - ok := rootInode.SetPermissions(ctx, nil /* Dirent */, test.perms) - - // Check return value. - if ok != test.want { - t.Errorf("TestSetPermissions %s failed: got %v, want %v", test.name, ok, test.want) - } - - // Check mock parameters. 
- pattr := rootFile.SetAttrMock.Attr - if !rootFile.SetAttrMock.Called { - t.Errorf("TestSetPermissions %s failed: SetAttr not called", test.name) - continue - } - if !rootFile.SetAttrMock.Valid.Permissions { - t.Errorf("TestSetPermissions %s failed: SetAttr did not get right request (got false, expected SetAttrMask.Permissions true)", - test.name) - } - if got := fs.FilePermsFromP9(pattr.Permissions); got != test.perms { - t.Errorf("TestSetPermissions %s failed: SetAttr did not get right permissions -- got %v, want %v", - test.name, got, test.perms) - } + t.Run(test.name, func(t *testing.T) { + // Set up mock. + rootFile, rootInode, err := root(ctx, cacheNone, 0, 0) + if err != nil { + t.Fatalf("error creating root: %v", err) + } + rootFile.SetAttrMock.Err = test.setAttrErr + + ok := rootInode.SetPermissions(ctx, nil /* Dirent */, test.perms) + + // Check return value. + if ok != test.want { + t.Errorf("SetPermissions got %v, want %v", ok, test.want) + } + + // Check mock parameters. + pattr := rootFile.SetAttrMock.Attr + if !rootFile.SetAttrMock.Called { + t.Errorf("SetAttr not called") + return + } + if !rootFile.SetAttrMock.Valid.Permissions { + t.Errorf("SetAttr did not get right request (got false, expected SetAttrMask.Permissions true)") + } + if got := fs.FilePermsFromP9(pattr.Permissions); got != test.perms { + t.Errorf("SetAttr did not get right permissions -- got %v, want %v", got, test.perms) + } + }) } } func TestClose(t *testing.T) { ctx := contexttest.Context(t) // Set up mock. - rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) if err != nil { - t.Errorf("TestClose failed: root error got %v, want nil", err) + t.Fatalf("error creating root: %v", err) } // Call function. 
@@ -350,9 +455,9 @@ func TestRename(t *testing.T) { want error } ctx := contexttest.Context(t) - rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) if err != nil { - t.Errorf("TestRename failed: root error got %v, want nil", err) + t.Fatalf("error creating root: %v", err) } tests := []renameTest{ @@ -383,35 +488,37 @@ func TestRename(t *testing.T) { } for _, test := range tests { - mockFile := goodMockFile(p9.PermissionsMask, 0) - rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} - rootFile.WalkGetAttrMock.File = mockFile - - dirent, err := rootInode.Lookup(ctx, "foo") - if err != nil { - t.Fatalf("root.Walk failed: %v", err) - } - mockFile.RenameMock.Err = test.renameErr - mockFile.RenameMock.Called = false - - // Use a dummy oldParent to acquire write access to that directory. - oldParent := &inodeOperations{ - readdirCache: fs.NewSortedDentryMap(nil), - } - oldInode := fs.NewInode(oldParent, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}) - - // Call function. - err = dirent.Inode.InodeOperations.Rename(ctx, oldInode, "", test.newParent, test.newName) - - // Check return value. - if err != test.want { - t.Errorf("TestRename %s failed: got %v, want %v", test.name, err, test.want) - } - - // Check mock parameters. - if got, want := mockFile.RenameMock.Called, test.renameCalled; got != want { - t.Errorf("TestRename %s failed: renameCalled got %v want %v", test.name, got, want) - } + t.Run(test.name, func(t *testing.T) { + mockFile := goodMockFile(p9.PermissionsMask, 0) + rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} + rootFile.WalkGetAttrMock.File = mockFile + + dirent, err := rootInode.Lookup(ctx, "foo") + if err != nil { + t.Fatalf("root.Walk failed: %v", err) + } + mockFile.RenameMock.Err = test.renameErr + mockFile.RenameMock.Called = false + + // Use a dummy oldParent to acquire write access to that directory. 
+ oldParent := &inodeOperations{ + readdirCache: fs.NewSortedDentryMap(nil), + } + oldInode := fs.NewInode(oldParent, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}) + + // Call function. + err = dirent.Inode.InodeOperations.Rename(ctx, oldInode, "", test.newParent, test.newName) + + // Check return value. + if err != test.want { + t.Errorf("Rename got %v, want %v", err, test.want) + } + + // Check mock parameters. + if got, want := mockFile.RenameMock.Called, test.renameCalled; got != want { + t.Errorf("renameCalled got %v want %v", got, want) + } + }) } } @@ -523,46 +630,48 @@ func TestPreadv(t *testing.T) { ctx := contexttest.Context(t) for _, test := range tests { - // Set up mock. - rootFile, rootInode, err := root(ctx, test.mode, 1024) - if err != nil { - t.Errorf("TestPreadv %s failed: root error got %v, want nil", test.name, err) - } - - // Set up the read buffer. - dst := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) - - // This file will be read from. - openFile := &readAtFileFake{ - Err: test.readAtErr, - FileLength: test.sliceSize, - ChunkSize: test.chunkSize, - } - rootFile.WalkGetAttrMock.File = openFile - rootFile.WalkGetAttrMock.Attr.Mode = test.mode - rootFile.WalkGetAttrMock.Valid.Mode = true - - f := NewFile( - ctx, - fs.NewDirent(rootInode, ""), - "", - fs.FileFlags{Read: true}, - rootInode.InodeOperations.(*inodeOperations), - &handles{File: contextFile{file: openFile}}, - ) - - // Call function. - _, err = f.Preadv(ctx, dst, 0) - - // Check return value. - if err != test.want { - t.Errorf("TestPreadv %s failed: got %v, want %v", test.name, err, test.want) - } - - // Check mock parameters. - if test.readAtCalled != openFile.Called { - t.Errorf("TestPreadv %s failed: ReadAt called: %v, but expected opposite", test.name, openFile.Called) - } + t.Run(test.name, func(t *testing.T) { + // Set up mock. 
+ rootFile, rootInode, err := root(ctx, cacheNone, test.mode, 1024) + if err != nil { + t.Fatalf("error creating root: %v", err) + } + + // Set up the read buffer. + dst := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) + + // This file will be read from. + openFile := &readAtFileFake{ + Err: test.readAtErr, + FileLength: test.sliceSize, + ChunkSize: test.chunkSize, + } + rootFile.WalkGetAttrMock.File = openFile + rootFile.WalkGetAttrMock.Attr.Mode = test.mode + rootFile.WalkGetAttrMock.Valid.Mode = true + + f := NewFile( + ctx, + fs.NewDirent(rootInode, ""), + "", + fs.FileFlags{Read: true}, + rootInode.InodeOperations.(*inodeOperations), + &handles{File: contextFile{file: openFile}}, + ) + + // Call function. + _, err = f.Preadv(ctx, dst, 0) + + // Check return value. + if err != test.want { + t.Errorf("Preadv got %v, want %v", err, test.want) + } + + // Check mock parameters. + if test.readAtCalled != openFile.Called { + t.Errorf("ReadAt called: %v, but expected opposite", openFile.Called) + } + }) } } @@ -610,28 +719,30 @@ func TestReadlink(t *testing.T) { ctx := contexttest.Context(t) for _, test := range tests { - // Set up mock. - rootFile, rootInode, err := root(ctx, test.mode, 0) - if err != nil { - t.Errorf("TestReadlink %s failed: root error got %v, want nil", test.name, err) - } + t.Run(test.name, func(t *testing.T) { + // Set up mock. + rootFile, rootInode, err := root(ctx, cacheNone, test.mode, 0) + if err != nil { + t.Fatalf("error creating root: %v", err) + } - openFile := goodMockFile(test.mode, 0) - rootFile.WalkMock.File = openFile - rootFile.ReadlinkMock.Err = test.readlinkErr + openFile := goodMockFile(test.mode, 0) + rootFile.WalkMock.File = openFile + rootFile.ReadlinkMock.Err = test.readlinkErr - // Call function. - _, err = rootInode.Readlink(ctx) + // Call function. + _, err = rootInode.Readlink(ctx) - // Check return value. 
- if err != test.want { - t.Errorf("TestReadlink %s failed: got %v, want %v", test.name, err, test.want) - } + // Check return value. + if err != test.want { + t.Errorf("Readlink got %v, want %v", err, test.want) + } - // Check mock parameters. - if test.readlinkCalled && !rootFile.ReadlinkMock.Called { - t.Errorf("TestReadlink %s failed: Readlink not called", test.name) - } + // Check mock parameters. + if test.readlinkCalled && !rootFile.ReadlinkMock.Called { + t.Errorf("Readlink not called") + } + }) } } @@ -735,44 +846,46 @@ func TestPwritev(t *testing.T) { ctx := contexttest.Context(t) for _, test := range tests { - // Set up mock. - _, rootInode, err := root(ctx, test.mode, 0) - if err != nil { - t.Errorf("TestPwritev %s failed: root error got %v, want nil", test.name, err) - } - - src := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) - - // This is the file that will be used for writing. - openFile := &writeAtFileFake{ - Err: test.writeAtErr, - ChunkSize: test.chunkSize, - } - - f := NewFile( - ctx, - fs.NewDirent(rootInode, ""), - "", - fs.FileFlags{Write: true}, - rootInode.InodeOperations.(*inodeOperations), - &handles{File: contextFile{file: openFile}}, - ) - - // Call function. - _, err = f.Pwritev(ctx, src, 0) - - // Check return value. - if err != test.want { - t.Errorf("TestPwritev %s failed: got %v, want %v", test.name, err, test.want) - } - - // Check mock parameters. - if test.writeAtCalled != openFile.Called { - t.Errorf("TestPwritev %s failed: WriteAt called: %v, but expected opposite", test.name, openFile.Called) - continue - } - if openFile.Called && test.writeAtErr != nil && openFile.LengthWritten != test.sliceSize { - t.Errorf("TestPwritev %s failed: wrote %d bytes, expected %d bytes written", test.name, openFile.LengthWritten, test.sliceSize) - } + t.Run(test.name, func(t *testing.T) { + // Set up mock. 
+ _, rootInode, err := root(ctx, cacheNone, test.mode, 0) + if err != nil { + t.Fatalf("error creating root: %v", err) + } + + src := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) + + // This is the file that will be used for writing. + openFile := &writeAtFileFake{ + Err: test.writeAtErr, + ChunkSize: test.chunkSize, + } + + f := NewFile( + ctx, + fs.NewDirent(rootInode, ""), + "", + fs.FileFlags{Write: true}, + rootInode.InodeOperations.(*inodeOperations), + &handles{File: contextFile{file: openFile}}, + ) + + // Call function. + _, err = f.Pwritev(ctx, src, 0) + + // Check return value. + if err != test.want { + t.Errorf("Pwritev got %v, want %v", err, test.want) + } + + // Check mock parameters. + if test.writeAtCalled != openFile.Called { + t.Errorf("WriteAt called: %v, but expected opposite", openFile.Called) + return + } + if openFile.Called && test.writeAtErr != nil && openFile.LengthWritten != test.sliceSize { + t.Errorf("wrote %d bytes, expected %d bytes written", openFile.LengthWritten, test.sliceSize) + } + }) } } -- cgit v1.2.3 From c036da5dffdf6cad912abe2723e69c04b59430b7 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 7 Aug 2018 13:08:37 -0700 Subject: Hold TaskSet.mu in Task.Parent. PiperOrigin-RevId: 207766238 Change-Id: Id3b66d8fe1f44c3570f67fa5ae7ba16021e35be1 --- pkg/sentry/kernel/threads.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 844213c35..3d5713106 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -441,6 +441,8 @@ func (t *Task) Timekeeper() *Timekeeper { // Parent returns t's parent. 
func (t *Task) Parent() *Task { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() return t.parent } -- cgit v1.2.3 From 0d350aac7f70487bc28bae0d0f457155a4e19081 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Aug 2018 10:24:53 -0700 Subject: Enable SACK in runsc SACK is disabled by default and needs to be manually enabled. It not only improves performance, but also fixes hangs downloading files from certain websites. PiperOrigin-RevId: 207906742 Change-Id: I4fb7277b67bfdf83ac8195f1b9c38265a0d51e8b --- pkg/sentry/socket/hostinet/stack.go | 6 ++++-- runsc/boot/controller.go | 5 ++++- runsc/boot/loader.go | 15 +++++++++++---- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 44c3b9a3f..f64809d39 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -84,11 +84,13 @@ func (s *Stack) Configure() error { log.Warningf("Failed to read TCP send buffer size, using default values") } - s.tcpSACKEnabled = false + // SACK is important for performance and even compatibility, assume it's + // enabled if we can't find the actual value. + s.tcpSACKEnabled = true if sack, err := ioutil.ReadFile("/proc/sys/net/ipv4/tcp_sack"); err == nil { s.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" } else { - log.Warningf("Failed to read if TCP SACK if enabled, setting to false") + log.Warningf("Failed to read if TCP SACK if enabled, setting to true") } return nil diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index c6e934e66..fc6ea326a 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -285,7 +285,10 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { fs.SetRestoreEnvironment(*renv) // Prepare to load from the state file. 
- networkStack := newEmptyNetworkStack(cm.l.conf, k) + networkStack, err := newEmptyNetworkStack(cm.l.conf, k) + if err != nil { + return fmt.Errorf("failed to create network: %v", err) + } info, err := o.FilePayload.Files[0].Stat() if err != nil { return err diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 2f976cd52..f6c7bf223 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -174,7 +174,10 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). - networkStack := newEmptyNetworkStack(conf, k) + networkStack, err := newEmptyNetworkStack(conf, k) + if err != nil { + return nil, fmt.Errorf("failed to create network: %v", err) + } // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. @@ -525,16 +528,20 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) inet.Stack { +func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { switch conf.Network { case NetworkHost: - return hostinet.NewStack() + return hostinet.NewStack(), nil case NetworkNone, NetworkSandbox: // NetworkNone sets up loopback using netstack. 
netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4} - return &epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{Clock: clock})} + s := &epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{Clock: clock})} + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { + return nil, fmt.Errorf("failed to enable SACK: %v", err) + } + return s, nil default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) -- cgit v1.2.3 From dbbe9ec91541dba387f8044cbf73fd29f604f902 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 8 Aug 2018 21:27:58 -0700 Subject: Protect PCIDs with a mutex. Because the Drop method may be called across vCPUs, it is necessary to protect the PCID database with a mutex to prevent concurrent modification. The PCID is assigned prior to entersyscall, so it's safe to block. PiperOrigin-RevId: 207992864 Change-Id: I8b36d55106981f51e30dcf03e12886330bb79d67 --- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index 227cf7aad..fa068e35e 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -16,6 +16,10 @@ package pagetables +import ( + "sync" +) + // limitPCID is the number of valid PCIDs. const limitPCID = 4096 @@ -24,6 +28,9 @@ const limitPCID = 4096 // This is not protected by locks and is thus suitable for use only with a // single CPU at a time. type PCIDs struct { + // mu protects below. + mu sync.Mutex + // cache are the assigned page tables. cache map[*PageTables]uint16 @@ -56,7 +63,9 @@ func NewPCIDs(start, size uint16) *PCIDs { // This may overwrite any previous assignment provided. 
If this in the case, // true is returned to indicate that the PCID should be flushed. func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) { + p.mu.Lock() if pcid, ok := p.cache[pt]; ok { + p.mu.Unlock() return pcid, false // No flush. } @@ -68,6 +77,7 @@ func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) { // We need to flush because while this is in the available // pool, it may have been used previously. + p.mu.Unlock() return pcid, true } @@ -79,17 +89,21 @@ func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) { // A flush is definitely required in this case, these page // tables may still be active. (They will just be assigned some // other PCID if and when they hit the given CPU again.) + p.mu.Unlock() return pcid, true } // No PCID. + p.mu.Unlock() return 0, false } // Drop drops references to a set of page tables. func (p *PCIDs) Drop(pt *PageTables) { + p.mu.Lock() if pcid, ok := p.cache[pt]; ok { delete(p.cache, pt) p.avail = append(p.avail, pcid) } + p.mu.Unlock() } -- cgit v1.2.3 From 4e171f7590284c1f4cedf90c92204873961b2e97 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Aug 2018 22:38:41 -0700 Subject: Basic support for ip link/addr and ifconfig Closes #94 PiperOrigin-RevId: 207997580 Change-Id: I19b426f1586b5ec12f8b0cd5884d5b401d334924 --- pkg/abi/linux/netlink_route.go | 5 +++ pkg/sentry/inet/inet.go | 3 ++ pkg/sentry/socket/epsocket/epsocket.go | 52 ++++++++++------------- pkg/sentry/socket/epsocket/stack.go | 12 ++++-- pkg/sentry/socket/netlink/route/protocol.go | 15 ++++++- pkg/sentry/socket/netlink/socket.go | 64 ++++++++++++++++++++++++++--- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/stack/nic.go | 7 ++++ pkg/tcpip/stack/registration.go | 1 + pkg/tcpip/stack/stack.go | 36 ++++++++-------- runsc/boot/network.go | 5 ++- 11 files changed, 138 insertions(+), 64 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index 0d88bc5c5..a5d778748 100644 --- 
a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -184,3 +184,8 @@ const ( IFA_MULTICAST = 7 IFA_FLAGS = 8 ) + +// Device types, from uapi/linux/if_arp.h. +const ( + ARPHRD_LOOPBACK = 772 +) diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index e54a61196..30ca4e0c0 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -67,6 +67,9 @@ type Interface struct { // Addr is the hardware device address. Addr []byte + + // MTU is the maximum transmission unit. + MTU uint32 } // InterfaceAddr contains information about a network interface address. diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index f969a1d7c..b32eda96f 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -48,7 +48,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - nstack "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -452,7 +452,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( // sockets backed by a commonEndpoint. func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType unix.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { - case syscall.SOL_SOCKET: + case linux.SOL_SOCKET: switch name { case linux.SO_TYPE: if outLen < sizeOfInt32 { @@ -634,7 +634,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa // sockets backed by a commonEndpoint. 
func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { - case syscall.SOL_SOCKET: + case linux.SOL_SOCKET: switch name { case linux.SO_SNDBUF: if len(optVal) < sizeOfInt32 { @@ -1191,7 +1191,9 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe if err != nil { return err } - usermem.ByteOrder.PutUint16(ifr.Data[:2], f) + // Drop the flags that don't fit in the size that we need to return. This + // matches Linux behavior. + usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) case syscall.SIOCGIFADDR: // Copy the IPv4 address out. @@ -1304,7 +1306,7 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { // interfaceStatusFlags returns status flags for an interface in the stack. // Flag values and meanings are described in greater detail in netdevice(7) in // the SIOCGIFFLAGS section. -func interfaceStatusFlags(stack inet.Stack, name string) (uint16, *syserr.Error) { +func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { // epsocket should only ever be passed an epsocket.Stack. epstack, ok := stack.(*Stack) if !ok { @@ -1312,37 +1314,27 @@ func interfaceStatusFlags(stack inet.Stack, name string) (uint16, *syserr.Error) } // Find the NIC corresponding to this interface. - var ( - nicid tcpip.NICID - info nstack.NICInfo - found bool - ) - ns := epstack.Stack - for nicid, info = range ns.NICInfo() { + for _, info := range epstack.Stack.NICInfo() { if info.Name == name { - found = true - break + return nicStateFlagsToLinux(info.Flags), nil } } - if !found { - return 0, syserr.ErrNoDevice - } + return 0, syserr.ErrNoDevice +} - // Set flags based on NIC state. 
- nicFlags, err := ns.NICFlags(nicid) - if err != nil { - return 0, syserr.TranslateNetstackError(err) +func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { + var rv uint32 + if f.Up { + rv |= linux.IFF_UP | linux.IFF_LOWER_UP } - - var retFlags uint16 - if nicFlags.Up { - retFlags |= linux.IFF_UP + if f.Running { + rv |= linux.IFF_RUNNING } - if nicFlags.Running { - retFlags |= linux.IFF_RUNNING + if f.Promiscuous { + rv |= linux.IFF_PROMISC } - if nicFlags.Promiscuous { - retFlags |= linux.IFF_PROMISC + if f.Loopback { + rv |= linux.IFF_LOOPBACK } - return retFlags, nil + return rv } diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 12b4b4767..e4ed52fc8 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -41,10 +41,16 @@ func (s *Stack) SupportsIPv6() bool { func (s *Stack) Interfaces() map[int32]inet.Interface { is := make(map[int32]inet.Interface) for id, ni := range s.Stack.NICInfo() { + var devType uint16 + if ni.Flags.Loopback { + devType = linux.ARPHRD_LOOPBACK + } is[int32(id)] = inet.Interface{ - Name: ni.Name, - Addr: []byte(ni.LinkAddress), - // TODO: Other fields. 
+ Name: ni.Name, + Addr: []byte(ni.LinkAddress), + Flags: uint32(nicStateFlagsToLinux(ni.Flags)), + DeviceType: devType, + MTU: ni.MTU, } } return is diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 55a76e916..70322b9ed 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -16,6 +16,8 @@ package route import ( + "bytes" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" @@ -97,9 +99,18 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader }) m.PutAttrString(linux.IFLA_IFNAME, i.Name) + m.PutAttr(linux.IFLA_MTU, i.MTU) + + mac := make([]byte, 6) + brd := mac + if len(i.Addr) > 0 { + mac = i.Addr + brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) + } + m.PutAttr(linux.IFLA_ADDRESS, mac) + m.PutAttr(linux.IFLA_BROADCAST, brd) - // TODO: There are many more attributes, such as - // MAC address. + // TODO: There are many more attributes. } return nil diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index e15d1546c..f3b2c7256 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -16,6 +16,7 @@ package netlink import ( + "math" "sync" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -39,8 +40,18 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// defaultSendBufferSize is the default size for the send buffer. -const defaultSendBufferSize = 16 * 1024 +const sizeOfInt32 int = 4 + +const ( + // minBufferSize is the smallest size of a send buffer. + minSendBufferSize = 4 << 10 // 4096 bytes. + + // defaultSendBufferSize is the default size for the send buffer. + defaultSendBufferSize = 16 * 1024 + + // maxBufferSize is the largest size a send buffer can grow to. 
+ maxSendBufferSize = 4 << 20 // 4MB +) // netlinkSocketDevice is the netlink socket virtual device. var netlinkSocketDevice = device.NewAnonDevice() @@ -86,7 +97,7 @@ type Socket struct { // sendBufferSize is the send buffer "size". We don't actually have a // fixed buffer but only consume this many bytes. - sendBufferSize uint64 + sendBufferSize uint32 } var _ socket.Socket = (*Socket)(nil) @@ -273,13 +284,54 @@ func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements socket.Socket.GetSockOpt. func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) { - // TODO: no sockopts supported. + switch level { + case linux.SOL_SOCKET: + switch name { + case linux.SO_SNDBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + return int32(s.sendBufferSize), nil + + case linux.SO_RCVBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + // We don't have limit on receiving size. + return math.MaxInt32, nil + } + } + // TODO: other sockopts are not supported. return nil, syserr.ErrProtocolNotAvailable } // SetSockOpt implements socket.Socket.SetSockOpt. func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { - // TODO: no sockopts supported. + switch level { + case linux.SOL_SOCKET: + switch name { + case linux.SO_SNDBUF: + if len(opt) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + size := usermem.ByteOrder.Uint32(opt) + if size < minSendBufferSize { + size = minSendBufferSize + } else if size > maxSendBufferSize { + size = maxSendBufferSize + } + s.sendBufferSize = size + return nil + case linux.SO_RCVBUF: + if len(opt) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + // We don't have limit on receiving size. So just accept anything as + // valid for compatibility. + return nil + } + } + // TODO: other sockopts are not supported. 
return syserr.ErrProtocolNotAvailable } @@ -489,7 +541,7 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, // For simplicity, and consistency with Linux, we copy in the entire // message up front. - if uint64(src.NumBytes()) > s.sendBufferSize { + if src.NumBytes() > int64(s.sendBufferSize) { return 0, syserr.ErrMessageTooLong } diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index b4dc4833c..015275721 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -56,7 +56,7 @@ func (*endpoint) MTU() uint32 { // Capabilities implements stack.LinkEndpoint.Capabilities. Loopback advertises // itself as supporting checksum offload, but in reality it's just omitted. func (*endpoint) Capabilities() stack.LinkEndpointCapabilities { - return stack.CapabilityChecksumOffload | stack.CapabilitySaveRestore + return stack.CapabilityChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. Given that the diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index c1480f97b..592006a32 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -67,6 +67,13 @@ func (n *NIC) setPromiscuousMode(enable bool) { n.mu.Unlock() } +func (n *NIC) isPromiscuousMode() bool { + n.mu.RLock() + rv := n.promiscuous + n.mu.RUnlock() + return rv +} + // setSpoofing enables or disables address spoofing. 
func (n *NIC) setSpoofing(enable bool) { n.mu.Lock() diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 01a29689d..bbe887144 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -205,6 +205,7 @@ const ( CapabilityResolutionRequired CapabilitySaveRestore CapabilityDisconnectOk + CapabilityLoopback ) // LinkEndpoint is the interface implemented by data link layer protocols (e.g., diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 6c4aa7cc5..e2b9dc2c0 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -563,6 +563,12 @@ type NICInfo struct { Name string LinkAddress tcpip.LinkAddress ProtocolAddresses []tcpip.ProtocolAddress + + // Flags indicate the state of the NIC. + Flags NICStateFlags + + // MTU is the maximum transmission unit. + MTU uint32 } // NICInfo returns a map of NICIDs to their associated information. @@ -572,10 +578,18 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { nics := make(map[tcpip.NICID]NICInfo) for id, nic := range s.nics { + flags := NICStateFlags{ + Up: true, // Netstack interfaces are always up. + Running: nic.linkEP.IsAttached(), + Promiscuous: nic.isPromiscuousMode(), + Loopback: nic.linkEP.Capabilities()&CapabilityLoopback != 0, + } nics[id] = NICInfo{ Name: nic.name, LinkAddress: nic.linkEP.LinkAddress(), ProtocolAddresses: nic.Addresses(), + Flags: flags, + MTU: nic.linkEP.MTU(), } } return nics @@ -591,27 +605,9 @@ type NICStateFlags struct { // Promiscuous indicates whether the interface is in promiscuous mode. Promiscuous bool -} - -// NICFlags returns flags about the state of the NIC. It returns an error if -// the NIC corresponding to id cannot be found. -func (s *Stack) NICFlags(id tcpip.NICID) (NICStateFlags, *tcpip.Error) { - s.mu.RLock() - defer s.mu.RUnlock() - nic := s.nics[id] - if nic == nil { - return NICStateFlags{}, tcpip.ErrUnknownNICID - } - - ret := NICStateFlags{ - // Netstack interfaces are always up. 
- Up: true, - - Running: nic.linkEP.IsAttached(), - Promiscuous: nic.promiscuous, - } - return ret, nil + // Loopback indicates whether the interface is a loopback. + Loopback bool } // AddAddress adds a new network-layer address to the specified NIC. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index d702ae74e..0e43c91be 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -133,15 +133,16 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) } + mac := tcpip.LinkAddress(generateRndMac()) linkEP := fdbased.New(&fdbased.Options{ FD: newFD, MTU: uint32(link.MTU), EthernetHeader: true, HandleLocal: true, - Address: tcpip.LinkAddress(generateRndMac()), + Address: mac, }) - log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err } -- cgit v1.2.3 From 2e06b23aa61216fcdbefcd6b11a24bca7a456b16 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 9 Aug 2018 16:49:23 -0700 Subject: Fix missing O_LARGEFILE from O_CREAT files Cleanup some more syscall.O_* references while we're here. 
PiperOrigin-RevId: 208133460 Change-Id: I48db71a38f817e4f4673977eafcc0e3874eb9a25 --- pkg/abi/linux/file.go | 4 ++++ pkg/sentry/kernel/fd_map.go | 6 ++++-- pkg/sentry/syscalls/linux/flags.go | 14 ++------------ pkg/sentry/syscalls/linux/sys_file.go | 32 ++++++++++++++++++-------------- pkg/sentry/syscalls/linux/sys_pipe.go | 7 ++++--- 5 files changed, 32 insertions(+), 31 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index f2b7e26ca..509f6b5b3 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -27,6 +27,10 @@ const ( O_RDONLY = 00000000 O_WRONLY = 00000001 O_RDWR = 00000002 + O_CREAT = 00000100 + O_EXCL = 00000200 + O_NOCTTY = 00000400 + O_TRUNC = 00001000 O_APPEND = 00002000 O_NONBLOCK = 00004000 O_ASYNC = 00020000 diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index d5d4aaacb..cad0b0a20 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -53,7 +53,8 @@ type FDFlags struct { CloseOnExec bool } -// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags representation. +// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags +// representation. func (f FDFlags) ToLinuxFileFlags() (mask uint) { if f.CloseOnExec { mask |= linux.O_CLOEXEC @@ -61,7 +62,8 @@ func (f FDFlags) ToLinuxFileFlags() (mask uint) { return } -// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags representation. +// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags +// representation. 
func (f FDFlags) ToLinuxFDFlags() (mask uint) { if f.CloseOnExec { mask |= linux.FD_CLOEXEC diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index e8db3e0de..f01483cd3 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -34,8 +34,8 @@ func flagsToPermissions(mask uint) (p fs.PermMask) { return } -// linuxToFlags converts linux file flags to a FileFlags object. -func linuxToFlags(mask uint) (flags fs.FileFlags) { +// linuxToFlags converts Linux file flags to a FileFlags object. +func linuxToFlags(mask uint) fs.FileFlags { return fs.FileFlags{ Direct: mask&linux.O_DIRECT != 0, Sync: mask&linux.O_SYNC != 0, @@ -48,13 +48,3 @@ func linuxToFlags(mask uint) (flags fs.FileFlags) { LargeFile: mask&linux.O_LARGEFILE != 0, } } - -// linuxToSettableFlags converts linux file flags to a SettableFileFlags object. -func linuxToSettableFlags(mask uint) fs.SettableFileFlags { - return fs.SettableFileFlags{ - Direct: mask&linux.O_DIRECT != 0, - NonBlocking: mask&linux.O_NONBLOCK != 0, - Append: mask&linux.O_APPEND != 0, - Async: mask&linux.O_ASYNC != 0, - } -} diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 94b7ac7a5..2cf429f5c 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -164,7 +164,7 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u if dirPath { return syserror.ENOTDIR } - if fileFlags.Write && flags&syscall.O_TRUNC != 0 { + if fileFlags.Write && flags&linux.O_TRUNC != 0 { if err := d.Inode.Truncate(t, d, 0); err != nil { return err } @@ -178,7 +178,7 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u defer file.DecRef() // Success. 
- fdFlags := kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0} + fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) if err != nil { return err @@ -302,6 +302,10 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod return syserror.ENOTDIR } + fileFlags := linuxToFlags(flags) + // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. + fileFlags.LargeFile = true + // Does this file exist already? targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) var newFile *fs.File @@ -311,7 +315,7 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod defer targetDirent.DecRef() // Check if we wanted to create. - if flags&syscall.O_EXCL != 0 { + if flags&linux.O_EXCL != 0 { return syserror.EEXIST } @@ -323,14 +327,14 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod } // Should we truncate the file? - if flags&syscall.O_TRUNC != 0 { + if flags&linux.O_TRUNC != 0 { if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil { return err } } // Create a new fs.File. - newFile, err = targetDirent.Inode.GetFile(t, targetDirent, linuxToFlags(flags)) + newFile, err = targetDirent.Inode.GetFile(t, targetDirent, fileFlags) if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } @@ -346,7 +350,7 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod // Attempt a creation. perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) - newFile, err = d.Create(t, root, name, linuxToFlags(flags), perms) + newFile, err = d.Create(t, root, name, fileFlags, perms) if err != nil { // No luck, bail. return err @@ -356,7 +360,7 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod } // Success. 
- fdFlags := kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0} + fdFlags := kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0} newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits()) if err != nil { return err @@ -380,7 +384,7 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := uint(args[1].Uint()) - if flags&syscall.O_CREAT != 0 { + if flags&linux.O_CREAT != 0 { mode := linux.FileMode(args[2].ModeT()) n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode) return n, nil, err @@ -394,7 +398,7 @@ func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal dirFD := kdefs.FD(args[0].Int()) addr := args[1].Pointer() flags := uint(args[2].Uint()) - if flags&syscall.O_CREAT != 0 { + if flags&linux.O_CREAT != 0 { mode := linux.FileMode(args[3].ModeT()) n, err := createAt(t, dirFD, addr, flags, mode) return n, nil, err @@ -407,7 +411,7 @@ func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := linux.FileMode(args[1].ModeT()) - n, err := createAt(t, linux.AT_FDCWD, addr, syscall.O_WRONLY|syscall.O_TRUNC, mode) + n, err := createAt(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_TRUNC, mode) return n, nil, err } @@ -747,7 +751,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer oldFile.DecRef() - err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) + err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) if err != nil { return 0, nil, err } @@ -802,7 +806,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) 
(uintptr, *kernel.Syscall switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: from := kdefs.FD(args[2].Int()) - fdFlags := kernel.FDFlags{CloseOnExec: cmd == syscall.F_DUPFD_CLOEXEC} + fdFlags := kernel.FDFlags{CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC} fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits()) if err != nil { return 0, nil, err @@ -813,13 +817,13 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETFD: flags := args[2].Uint() t.FDMap().SetFlags(fd, kernel.FDFlags{ - CloseOnExec: flags&syscall.FD_CLOEXEC != 0, + CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) case linux.F_GETFL: return uintptr(file.Flags().ToLinux()), nil, nil case linux.F_SETFL: flags := uint(args[2].Uint()) - file.SetFlags(linuxToSettableFlags(flags)) + file.SetFlags(linuxToFlags(flags).Settable()) case linux.F_SETLK, linux.F_SETLKW: // In Linux the file system can choose to provide lock operations for an inode. // Normally pipe and socket types lack lock operations. We diverge and use a heavy diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 3efc06a27..2b544f145 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -17,6 +17,7 @@ package linux import ( "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" @@ -26,7 +27,7 @@ import ( // pipe2 implements the actual system call with flags. 
func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { - if flags&^(syscall.O_NONBLOCK|syscall.O_CLOEXEC) != 0 { + if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { return 0, syscall.EINVAL } r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) @@ -38,14 +39,14 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { defer w.DecRef() rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{ - CloseOnExec: flags&syscall.O_CLOEXEC != 0}, + CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) if err != nil { return 0, err } wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{ - CloseOnExec: flags&syscall.O_CLOEXEC != 0}, + CloseOnExec: flags&linux.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) if err != nil { t.FDMap().Remove(rfd) -- cgit v1.2.3 From d5b702b64f05a200ed94f0cd977d3f84dae01162 Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Fri, 10 Aug 2018 10:25:37 -0700 Subject: Validate FS.base before establishing it in the task's register set. PiperOrigin-RevId: 208229341 Change-Id: I5d84bc52bbafa073446ef497e56958d0d7955aa8 --- pkg/sentry/syscalls/linux/sys_tls.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index 1047364b3..b95d62320 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) // ArchPrctl implements linux syscall arch_prctl(2). 
@@ -36,9 +37,13 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } case linux.ARCH_SET_FS: + fsbase := args[1].Uint64() + if _, ok := t.MemoryManager().CheckIORange(usermem.Addr(fsbase), 0); !ok { + return 0, nil, syscall.EPERM + } regs := &t.Arch().StateData().Regs regs.Fs = 0 - regs.Fs_base = args[1].Uint64() + regs.Fs_base = fsbase default: return 0, nil, syscall.EINVAL -- cgit v1.2.3 From 4ececd8e8d1124cdd0884480bda5fabd2b48aa8d Mon Sep 17 00:00:00 2001 From: Brielle Broder Date: Fri, 10 Aug 2018 14:31:56 -0700 Subject: Enable checkpoint/restore in cases of UDS use. Previously, processes which used file-system Unix Domain Sockets could not be checkpoint-ed in runsc because the sockets were saved with their inode numbers which do not necessarily remain the same upon restore. Now, the sockets are also saved with their paths so that the new inodes can be determined for the sockets based on these paths after restoring. Tests for cases with UDS use are included. Test cleanup to come. 
PiperOrigin-RevId: 208268781 Change-Id: Ieaa5d5d9a64914ca105cae199fd8492710b1d7ec --- pkg/sentry/fs/dirent.go | 53 +++++--- pkg/sentry/fs/fsutil/inode.go | 4 +- pkg/sentry/fs/gofer/gofer_test.go | 2 +- pkg/sentry/fs/gofer/path.go | 37 ++++-- pkg/sentry/fs/gofer/session.go | 149 +++++++++++++++++++---- pkg/sentry/fs/gofer/session_state.go | 26 +++- pkg/sentry/fs/gofer/socket.go | 2 + pkg/sentry/fs/host/inode.go | 4 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 4 +- pkg/sentry/fs/ramfs/dir.go | 15 ++- pkg/sentry/fs/ramfs/ramfs.go | 4 +- pkg/sentry/fs/tty/dir.go | 4 +- pkg/sentry/socket/unix/unix.go | 4 +- runsc/container/BUILD | 10 +- runsc/container/container_test.go | 228 ++++++++++++++++++++++++++++++++--- runsc/container/uds_test_app.go | 83 +++++++++++++ 18 files changed, 541 insertions(+), 92 deletions(-) create mode 100644 runsc/container/uds_test_app.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4658d044f..821cc5789 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -385,6 +385,19 @@ func (d *Dirent) fullName(root *Dirent) (string, bool) { return s, reachable } +// MountRoot finds and returns the mount-root for a given dirent. +func (d *Dirent) MountRoot() *Dirent { + renameMu.RLock() + defer renameMu.RUnlock() + + mountRoot := d + for !mountRoot.mounted && mountRoot.parent != nil { + mountRoot = mountRoot.parent + } + mountRoot.IncRef() + return mountRoot +} + func (d *Dirent) freeze() { if d.frozen { // Already frozen. @@ -665,6 +678,16 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi } child := file.Dirent + d.finishCreate(child, name) + + // Return the reference and the new file. When the last reference to + // the file is dropped, file.Dirent may no longer be cached. 
+ return file, nil +} + +// finishCreate validates the created file, adds it as a child of this dirent, +// and notifies any watchers. +func (d *Dirent) finishCreate(child *Dirent, name string) { // Sanity check c, its name must be consistent. if child.name != name { panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) @@ -697,10 +720,6 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi // Allow the file system to take extra references on c. child.maybeExtendReference() - - // Return the reference and the new file. When the last reference to - // the file is dropped, file.Dirent may no longer be cached. - return file, nil } // genericCreate executes create if name does not exist. Removes a negative Dirent at name if @@ -718,11 +737,6 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c return syscall.ENOENT } - // Execute the create operation. - if err := create(); err != nil { - return err - } - // Remove any negative Dirent. We've already asserted above with d.exists // that the only thing remaining here can be a negative Dirent. if w, ok := d.children[name]; ok { @@ -745,7 +759,8 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c w.Drop() } - return nil + // Execute the create operation. + return create() } // CreateLink creates a new link in this directory. @@ -797,23 +812,29 @@ func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, } // Bind satisfies the InodeOperations interface; otherwise same as GetFile. 
-func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, socket unix.BoundEndpoint, perms FilePermissions) error { +func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data unix.BoundEndpoint, perms FilePermissions) (*Dirent, error) { d.dirMu.Lock() defer d.dirMu.Unlock() d.mu.Lock() defer d.mu.Unlock() + var childDir *Dirent err := d.genericCreate(ctx, root, name, func() error { - if err := d.Inode.Bind(ctx, name, socket, perms); err != nil { - return err + var e error + childDir, e = d.Inode.Bind(ctx, name, data, perms) + if e != nil { + return e } - d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + d.finishCreate(childDir, name) return nil }) if err == syscall.EEXIST { - return syscall.EADDRINUSE + return nil, syscall.EADDRINUSE + } + if err != nil { + return nil, err } - return err + return childDir, err } // CreateFifo creates a new named pipe under this dirent. diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 177396fdc..3479f2fad 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -254,8 +254,8 @@ func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs. } // Bind implements fs.InodeOperations.Bind. -func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { - return syserror.ENOTDIR +func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { + return nil, syserror.ENOTDIR } // CreateFifo implements fs.InodeOperations.CreateFifo. 
diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 764b530cb..45fdaacfd 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -74,7 +74,7 @@ func root(ctx context.Context, cp cachePolicy, mode p9.FileMode, size uint64) (* } rootFile := goodMockFile(mode, size) - sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{file: rootFile}, p9.QID{}, rootFile.GetAttrMock.Valid, rootFile.GetAttrMock.Attr) + sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{file: rootFile}, p9.QID{}, rootFile.GetAttrMock.Valid, rootFile.GetAttrMock.Attr, false /* socket */) m := fs.NewMountSource(s, &filesystem{}, fs.MountSourceFlags{}) return rootFile, fs.NewInode(rootInodeOperations, m, sattr), nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index bfeab3833..15e9863fb 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -57,7 +57,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string } // Construct the Inode operations. - sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr) + sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr, false) // Construct a positive Dirent. return fs.NewDirent(fs.NewInode(node, dir.MountSource, sattr), name), nil @@ -113,7 +113,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string } // Construct the InodeOperations. - sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr) + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr, false) // Construct the positive Dirent. d := fs.NewDirent(fs.NewInode(iops, dir.MountSource, sattr), name) @@ -175,10 +175,10 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s return nil } -// Bind implements InodeOperations. 
-func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perm fs.FilePermissions) error { +// Bind implements InodeOperations.Bind. +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { if i.session().endpoints == nil { - return syscall.EOPNOTSUPP + return nil, syscall.EOPNOTSUPP } // Create replaces the directory fid with the newly created/opened @@ -186,7 +186,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // this node. _, newFile, err := i.fileState.file.walk(ctx, nil) if err != nil { - return err + return nil, err } // Stabilize the endpoint map while creation is in progress. @@ -198,7 +198,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, owner := fs.FileOwnerFromContext(ctx) hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) if err != nil { - return err + return nil, err } // We're not going to use this file. hostFile.Close() @@ -206,10 +206,10 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, i.touchModificationTime(ctx, dir) // Get the attributes of the file to create inode key. - qid, _, attr, err := getattr(ctx, newFile) + qid, mask, attr, err := getattr(ctx, newFile) if err != nil { newFile.close(ctx) - return err + return nil, err } key := device.MultiDeviceKey{ @@ -217,9 +217,24 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, SecondaryDevice: i.session().connID, Inode: qid.Path, } - i.session().endpoints.add(key, ep) - return nil + // Create child dirent. + + // Get an unopened p9.File for the file we created so that it can be + // cloned and re-opened multiple times after creation. 
+ _, unopened, err := i.fileState.file.walk(ctx, []string{name}) + if err != nil { + newFile.close(ctx) + return nil, err + } + + // Construct the InodeOperations. + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr, true) + + // Construct the positive Dirent. + childDir := fs.NewDirent(fs.NewInode(iops, dir.MountSource, sattr), name) + i.session().endpoints.add(key, childDir, ep) + return childDir, nil } // CreateFifo implements fs.InodeOperations.CreateFifo. Gofer nodes do not support the diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 648a11435..bfb1154dc 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -15,6 +15,7 @@ package gofer import ( + "fmt" "sync" "gvisor.googlesource.com/gvisor/pkg/p9" @@ -28,39 +29,60 @@ import ( ) // +stateify savable -type endpointMap struct { +type endpointMaps struct { + // mu protexts the direntMap, the keyMap, and the pathMap below. mu sync.RWMutex `state:"nosave"` - // TODO: Make map with private unix sockets savable. - m map[device.MultiDeviceKey]unix.BoundEndpoint + + // direntMap links sockets to their dirents. + // It is filled concurrently with the keyMap and is stored upon save. + // Before saving, this map is used to populate the pathMap. + direntMap map[unix.BoundEndpoint]*fs.Dirent + + // keyMap links MultiDeviceKeys (containing inode IDs) to their sockets. + // It is not stored during save because the inode ID may change upon restore. + keyMap map[device.MultiDeviceKey]unix.BoundEndpoint `state:"nosave"` + + // pathMap links the sockets to their paths. + // It is filled before saving from the direntMap and is stored upon save. + // Upon restore, this map is used to re-populate the keyMap. + pathMap map[unix.BoundEndpoint]string } -// add adds the endpoint to the map. +// add adds the endpoint to the maps. +// A reference is taken on the dirent argument. // -// Precondition: map must have been locked with 'lock'. 
-func (e *endpointMap) add(key device.MultiDeviceKey, ep unix.BoundEndpoint) { - e.m[key] = ep +// Precondition: maps must have been locked with 'lock'. +func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep unix.BoundEndpoint) { + e.keyMap[key] = ep + d.IncRef() + e.direntMap[ep] = d } -// remove deletes the key from the map. +// remove deletes the key from the maps. // -// Precondition: map must have been locked with 'lock'. -func (e *endpointMap) remove(key device.MultiDeviceKey) { - delete(e.m, key) +// Precondition: maps must have been locked with 'lock'. +func (e *endpointMaps) remove(key device.MultiDeviceKey) { + endpoint := e.get(key) + delete(e.keyMap, key) + + d := e.direntMap[endpoint] + d.DecRef() + delete(e.direntMap, endpoint) } // lock blocks other addition and removal operations from happening while // the backing file is being created or deleted. Returns a function that unlocks // the endpoint map. -func (e *endpointMap) lock() func() { +func (e *endpointMaps) lock() func() { e.mu.Lock() return func() { e.mu.Unlock() } } -func (e *endpointMap) get(key device.MultiDeviceKey) unix.BoundEndpoint { - e.mu.RLock() - ep := e.m[key] - e.mu.RUnlock() - return ep +// get returns the endpoint mapped to the given key. +// +// Precondition: maps must have been locked for reading. +func (e *endpointMaps) get(key device.MultiDeviceKey) unix.BoundEndpoint { + return e.keyMap[key] } // session holds state for each 9p session established during sys_mount. @@ -115,7 +137,7 @@ type session struct { // TODO: there are few possible races with someone stat'ing the // file and another deleting it concurrently, where the file will not be // reported as socket file. - endpoints *endpointMap `state:"wait"` + endpoints *endpointMaps `state:"wait"` } // Destroy tears down the session. 
@@ -149,7 +171,9 @@ func (s *session) SaveInodeMapping(inode *fs.Inode, path string) { // newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes // (p9.QID, p9.AttrMask, p9.Attr). -func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) { +// +// Endpoints lock must not be held if socket == false. +func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr, socket bool) (fs.StableAttr, *inodeOperations) { deviceKey := device.MultiDeviceKey{ Device: attr.RDev, SecondaryDevice: s.connID, @@ -164,10 +188,16 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p } if s.endpoints != nil { - // If unix sockets are allowed on this filesystem, check if this file is - // supposed to be a socket file. - if s.endpoints.get(deviceKey) != nil { + if socket { sattr.Type = fs.Socket + } else { + // If unix sockets are allowed on this filesystem, check if this file is + // supposed to be a socket file. + unlock := s.endpoints.lock() + if s.endpoints.get(deviceKey) != nil { + sattr.Type = fs.Socket + } + unlock() } } @@ -215,7 +245,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF } if o.privateunixsocket { - s.endpoints = &endpointMap{m: make(map[device.MultiDeviceKey]unix.BoundEndpoint)} + s.endpoints = newEndpointMaps() } // Construct the MountSource with the session and superBlockFlags. @@ -248,6 +278,77 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF return nil, err } - sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr) + sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr, false) return fs.NewInode(iops, m, sattr), nil } + +// newEndpointMaps creates a new endpointMaps. 
+func newEndpointMaps() *endpointMaps { + return &endpointMaps{ + direntMap: make(map[unix.BoundEndpoint]*fs.Dirent), + keyMap: make(map[device.MultiDeviceKey]unix.BoundEndpoint), + pathMap: make(map[unix.BoundEndpoint]string), + } +} + +// fillKeyMap populates key and dirent maps upon restore from saved +// pathmap. +func (s *session) fillKeyMap(ctx context.Context) error { + unlock := s.endpoints.lock() + defer unlock() + + for ep, dirPath := range s.endpoints.pathMap { + _, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath)) + if err != nil { + return fmt.Errorf("error filling endpointmaps, failed to walk to %q: %v", dirPath, err) + } + + qid, _, attr, err := file.getAttr(ctx, p9.AttrMaskAll()) + if err != nil { + return fmt.Errorf("failed to get file attributes of %s: %v", dirPath, err) + } + + key := device.MultiDeviceKey{ + Device: attr.RDev, + SecondaryDevice: s.connID, + Inode: qid.Path, + } + + s.endpoints.keyMap[key] = ep + } + return nil +} + +// fillPathMap populates paths for endpoints from dirents in direntMap +// before save. +func (s *session) fillPathMap() error { + unlock := s.endpoints.lock() + defer unlock() + + for ep, dir := range s.endpoints.direntMap { + mountRoot := dir.MountRoot() + defer mountRoot.DecRef() + dirPath, _ := dir.FullName(mountRoot) + if dirPath == "" { + return fmt.Errorf("error getting path from dirent") + } + s.endpoints.pathMap[ep] = dirPath + } + return nil +} + +// restoreEndpointMaps recreates and fills the key and dirent maps. +func (s *session) restoreEndpointMaps(ctx context.Context) error { + // When restoring, only need to create the keyMap because the dirent and path + // maps got stored through the save. 
+ s.endpoints.keyMap = make(map[device.MultiDeviceKey]unix.BoundEndpoint) + if err := s.fillKeyMap(ctx); err != nil { + return fmt.Errorf("failed to insert sockets into endpoint map: %v", err) + } + + // Re-create pathMap because it can no longer be trusted as socket paths can + // change while process continues to run. Empty pathMap will be re-filled upon + // next save. + s.endpoints.pathMap = make(map[unix.BoundEndpoint]string) + return nil +} diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 0154810c8..8e6424492 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -18,16 +18,17 @@ import ( "fmt" "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/unet" ) // beforeSave is invoked by stateify. -// -// TODO: Make map with private unix sockets savable. -func (e *endpointMap) beforeSave() { - if len(e.m) != 0 { - panic("EndpointMap with existing private unix sockets cannot be saved") +func (s *session) beforeSave() { + if s.endpoints != nil { + if err := s.fillPathMap(); err != nil { + panic("failed to save paths to endpoint map before saving" + err.Error()) + } } } @@ -72,6 +73,9 @@ func (s *session) afterLoad() { if opts.aname != s.aname { panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname)) } + + // Check if endpointMaps exist when uds sockets are enabled + // (only pathmap will actualy have been saved). if opts.privateunixsocket != (s.endpoints != nil) { panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil)) } @@ -96,4 +100,16 @@ func (s *session) afterLoad() { if err != nil { panic(fmt.Sprintf("failed to attach to aname: %v", err)) } + + // If private unix sockets are enabled, create and fill the session's endpoint + // maps. 
+ if opts.privateunixsocket { + // TODO: Context is not plumbed to save/restore. + ctx := &dummyClockContext{context.Background()} + + if err = s.restoreEndpointMaps(ctx); err != nil { + panic("failed to restore endpoint maps: " + err.Error()) + } + } + } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 406756f5f..8628b9c69 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -30,6 +30,8 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.Bound } if i.session().endpoints != nil { + unlock := i.session().endpoints.lock() + defer unlock() ep := i.session().endpoints.get(i.fileState.key) if ep != nil { return ep diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 66c17debb..e7254fa7d 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -310,8 +310,8 @@ func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldNa } // Bind implements fs.InodeOperations.Bind. -func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { - return syserror.EOPNOTSUPP +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { + return nil, syserror.EOPNOTSUPP } // BoundEndpoint implements fs.InodeOperations.BoundEndpoint. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index d0dbce5dd..db7240dca 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -223,7 +223,7 @@ func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, } // Bind calls i.InodeOperations.Bind with i as the directory. 
-func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, perm FilePermissions) error { +func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) { if i.overlay != nil { return overlayBind(ctx, i.overlay, name, data, perm) } diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index b33980178..952f9704d 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -146,7 +146,7 @@ type InodeOperations interface { // Implementations must ensure that name does not already exist. // // The caller must ensure that this operation is permitted. - Bind(ctx context.Context, dir *Inode, name string, data unix.BoundEndpoint, perm FilePermissions) error + Bind(ctx context.Context, dir *Inode, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) // BoundEndpoint returns the socket endpoint at path stored in // or generated by an Inode. diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 53fbd1481..543db9ac7 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -334,13 +334,13 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return nil } -func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.BoundEndpoint, perm FilePermissions) error { +func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() // We do not support doing anything exciting with sockets unless there // is already a directory in the upper filesystem. 
if o.upper == nil { - return syserror.EOPNOTSUPP + return nil, syserror.EOPNOTSUPP } return o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm) } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 04432f28c..d8333194b 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -314,17 +314,22 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p } // Bind implements fs.InodeOperations.Bind. -func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perms fs.FilePermissions) error { +func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { - return ErrDenied + return nil, ErrDenied } - _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewBoundEndpoint(ctx, dir, ep, perms) }) if err == syscall.EEXIST { - return syscall.EADDRINUSE + return nil, syscall.EADDRINUSE } - return err + if err != nil { + return nil, err + } + // Take another ref on inode which will be donated to the new dirent. + inode.IncRef() + return fs.NewDirent(inode, name), nil } // CreateFifo implements fs.InodeOperations.CreateFifo. diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index 13e72e775..1028b5f1d 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -279,8 +279,8 @@ func (*Entry) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermiss } // Bind is not supported by default. 
-func (*Entry) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { - return ErrInvalidOp +func (*Entry) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { + return nil, ErrInvalidOp } // CreateFifo implements fs.InodeOperations.CreateFifo. CreateFifo is not supported by diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index c91091db4..c6f39fce3 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -215,8 +215,8 @@ func (d *dirInodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, } // Bind implements fs.InodeOperations.Bind. -func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { - return syserror.EPERM +func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { + return nil, syserror.EPERM } // GetFile implements fs.InodeOperations.GetFile. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 5b6411f97..1c22e78b3 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -276,9 +276,11 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Create the socket. 
- if err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}); err != nil { + childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) + if err != nil { return tcpip.ErrPortInUse } + childDir.DecRef() } return nil diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 7ec68f573..d4c650892 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,6 +1,13 @@ package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") + +go_binary( + name = "uds_test_app", + srcs = [ + "uds_test_app.go", + ], +) go_library( name = "container", @@ -29,6 +36,7 @@ go_test( size = "medium", srcs = ["container_test.go"], data = [ + ":uds_test_app", "//runsc", ], tags = [ diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 9e38f5f77..11edcd615 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -15,6 +15,7 @@ package container_test import ( + "bytes" "fmt" "io" "io/ioutil" @@ -108,7 +109,8 @@ func procListToString(pl []*control.Process) string { return fmt.Sprintf("[%s]", strings.Join(strs, ",")) } -// createWriteableOutputFile creates an output file that can be read and written to in the sandbox. +// createWriteableOutputFile creates an output file that can be read and +// written to in the sandbox. func createWriteableOutputFile(path string) (*os.File, error) { outputFile, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) if err != nil { @@ -136,13 +138,19 @@ func waitForFile(f *os.File) error { return testutil.Poll(op, 5*time.Second) } -func readOutputNum(f *os.File, first bool) (int, error) { - // Wait until file has contents. +// readOutputNum reads a file at given filepath and returns the int at the +// requested position. 
+func readOutputNum(file string, position int) (int, error) { + f, err := os.Open(file) + if err != nil { + return 0, fmt.Errorf("error opening file: %q, %v", file, err) + } + + // Ensure that there is content in output file. if err := waitForFile(f); err != nil { - return 0, err + return 0, fmt.Errorf("error waiting for output file: %v", err) } - // Read the first number in the new file b, err := ioutil.ReadAll(f) if err != nil { return 0, fmt.Errorf("error reading file: %v", err) @@ -151,14 +159,18 @@ func readOutputNum(f *os.File, first bool) (int, error) { return 0, fmt.Errorf("error no content was read") } + // Strip leading null bytes caused by file offset not being 0 upon restore. + b = bytes.Trim(b, "\x00") nums := strings.Split(string(b), "\n") - var num int - if first { - num, err = strconv.Atoi(nums[0]) - } else { - num, err = strconv.Atoi(nums[len(nums)-2]) + if position >= len(nums) { + return 0, fmt.Errorf("position %v is not within the length of content %v", position, nums) + } + if position == -1 { + // Expectation of newline at the end of last position. + position = len(nums) - 2 } + num, err := strconv.Atoi(nums[position]) if err != nil { return 0, fmt.Errorf("error getting number from file: %v", err) } @@ -194,6 +206,27 @@ func run(spec *specs.Spec, conf *boot.Config) error { return nil } +// findUDSApp finds the uds_test_app binary to be used in the UnixDomainSocket test. +func findUDSApp() (string, error) { + // TODO: Use bazel FindBinary function. + + // uds_test_app is in a directory like: + // './linux_amd64_pure_stripped/uds_test_app.go'. + // + // Since I don't want to construct 'linux_amd64_pure_stripped' based on the + // build type, do a quick search for: './*/uds_test_app' + // Note: This glob will only succeed when file is one directory deep. 
+ matches, err := filepath.Glob("./*/uds_test_app") + if err != nil { + return "", fmt.Errorf("error globbing: %v", err) + } + if i := len(matches); i != 1 { + return "", fmt.Errorf("error identifying uds_test_app from matches: got %d matches", i) + } + + return matches[0], nil +} + type configOptions int const ( @@ -204,7 +237,8 @@ const all = overlay | kvm // configs generates different configurations to run tests. func configs(opts configOptions) []*boot.Config { - cs := []*boot.Config{testutil.TestConfig()} + cs := []*boot.Config{testutil.TestConfig(), testutil.TestConfig()} + return cs if opts&overlay != 0 { c := testutil.TestConfig() @@ -544,6 +578,7 @@ func TestCheckpointRestore(t *testing.T) { if err := os.Chmod(dir, 0777); err != nil { t.Fatalf("error chmoding file: %q, %v", dir, err) } + defer os.RemoveAll(dir) outputPath := filepath.Join(dir, "output") outputFile, err := createWriteableOutputFile(outputPath) @@ -598,7 +633,7 @@ func TestCheckpointRestore(t *testing.T) { } defer os.RemoveAll(imagePath) - lastNum, err := readOutputNum(outputFile, false) + lastNum, err := readOutputNum(outputPath, -1) if err != nil { t.Fatalf("error with outputFile: %v", err) } @@ -624,15 +659,22 @@ func TestCheckpointRestore(t *testing.T) { t.Fatalf("error restoring container: %v", err) } - firstNum, err := readOutputNum(outputFile2, true) + // Wait until application has ran. + if err := waitForFile(outputFile2); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + firstNum, err := readOutputNum(outputPath, 0) if err != nil { t.Fatalf("error with outputFile: %v", err) } - // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + // Check that lastNum is one less than firstNum and that the container picks + // up from where it left off. if lastNum+1 != firstNum { t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) } + cont2.Destroy() // Restore into another container! 
// Delete and recreate file before restoring. @@ -656,15 +698,169 @@ func TestCheckpointRestore(t *testing.T) { t.Fatalf("error restoring container: %v", err) } - firstNum2, err := readOutputNum(outputFile3, true) + // Wait until application has ran. + if err := waitForFile(outputFile3); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + firstNum2, err := readOutputNum(outputPath, 0) if err != nil { t.Fatalf("error with outputFile: %v", err) } - // Check that lastNum is one less than firstNum and that the container picks up from where it left off. + // Check that lastNum is one less than firstNum and that the container picks + // up from where it left off. if lastNum+1 != firstNum2 { t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) } + cont3.Destroy() + } +} + +// TestUnixDomainSockets checks that Checkpoint/Restore works in cases +// with filesystem Unix Domain Socket use. +func TestUnixDomainSockets(t *testing.T) { + const ( + output = "uds_output" + goferRoot = "/tmp2" + socket = "uds_socket" + ) + + // Skip overlay because test requires writing to host file. + for _, conf := range configs(kvm) { + t.Logf("Running test with conf: %+v", conf) + + dir, err := ioutil.TempDir("", "uds-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + if err := os.Chmod(dir, 0777); err != nil { + t.Fatalf("error chmoding file: %q, %v", dir, err) + } + defer os.RemoveAll(dir) + + outputPath := filepath.Join(dir, output) + + outputFile, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() + + // Get file path for corresponding output file in sandbox. + outputFileSandbox := filepath.Join(goferRoot, output) + + // Need to get working directory, even though not intuitive. 
+ wd, _ := os.Getwd() + localPath, err := findUDSApp() + if err != nil { + t.Fatalf("error finding localPath: %v", err) + } + app := filepath.Join(wd, localPath) + + if _, err = os.Stat(app); err != nil { + t.Fatalf("error finding the uds_test_app: %v", err) + } + + socketPath := filepath.Join(dir, socket) + socketPathSandbox := filepath.Join(goferRoot, socket) + defer os.Remove(socketPath) + + spec := testutil.NewSpecWithArgs(app, "--file", outputFileSandbox, + "--socket", socketPathSandbox) + + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: goferRoot, + Source: dir, + }) + + spec.Process.User = specs.User{ + UID: uint32(os.Getuid()), + GID: uint32(os.Getgid()), + } + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + cont, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Set the image path, the location where the checkpoint image will be saved. + imagePath := filepath.Join(dir, "test-image-file") + + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() + defer os.RemoveAll(imagePath) + + // Wait until application has ran. + if err := waitForFile(outputFile); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + // Checkpoint running container; save state into new file. 
+ if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + + // Read last number outputted before checkpoint. + lastNum, err := readOutputNum(outputPath, -1) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() + + // Restore into a new container. + contRestore, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer contRestore.Destroy() + + if err := contRestore.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } + + // Wait until application has ran. + if err := waitForFile(outputFile2); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + // Read first number outputted after restore. + firstNum, err := readOutputNum(outputPath, 0) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Check that lastNum is one less than firstNum. + if lastNum+1 != firstNum { + t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum) + } + contRestore.Destroy() } } diff --git a/runsc/container/uds_test_app.go b/runsc/container/uds_test_app.go new file mode 100644 index 000000000..bef98ac66 --- /dev/null +++ b/runsc/container/uds_test_app.go @@ -0,0 +1,83 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary uds-test-app opens a socket and reads a series of numbers +// which are then written to an output file. +package main + +import ( + "flag" + "fmt" + "log" + "net" + "os" + "strconv" + "time" +) + +var ( + fileName = flag.String("file", "", "name of output file") + socketPath = flag.String("socket", "", "path to socket") +) + +func server(listener net.Listener, f *os.File) { + buf := make([]byte, 16) + + for { + c, err := listener.Accept() + if err != nil { + log.Fatal("error accepting connection:", err) + } + nr, err := c.Read(buf) + if err != nil { + log.Fatal("error reading from buf:", err) + } + data := buf[0:nr] + fmt.Fprintf(f, string(data)+"\n") + } +} + +func main() { + flag.Parse() + if *fileName == "" || *socketPath == "" { + log.Fatalf("Flags cannot be empty, given: fileName=%s, socketPath=%s", *fileName, *socketPath) + } + outputFile, err := os.OpenFile(*fileName, os.O_WRONLY|os.O_CREATE, 0666) + if err != nil { + log.Fatal("error opening output file:", err) + } + + socket := *socketPath + defer os.Remove(socket) + + listener, err := net.Listen("unix", socket) + if err != nil { + log.Fatal("error listening on socket:", err) + } + + go server(listener, outputFile) + for i := 0; ; i++ { + + conn, err := net.Dial("unix", socket) + if err != nil { + log.Fatal("error dialing:", err) + } + if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil { + log.Fatal("error writing:", err) + } + conn.Close() + time.Sleep(100 * time.Millisecond) + } + +} -- cgit v1.2.3 From 567c5eed11cfcea78b80169487664106a41fa1fe Mon Sep 17 00:00:00 
2001 From: Nicolas Lacasse Date: Fri, 10 Aug 2018 15:41:44 -0700 Subject: cache policy: Check policy before returning a negative dirent. The cache policy determines whether Lookup should return a negative dirent, or just ENOENT. This CL fixes one spot where we returned a negative dirent without first consulting the policy. PiperOrigin-RevId: 208280230 Change-Id: I8f963bbdb45a95a74ad0ecc1eef47eff2092d3a4 --- pkg/sentry/fs/gofer/path.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 15e9863fb..bec9680f8 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -29,15 +29,19 @@ import ( // Lookup loads an Inode at name into a Dirent based on the session's cache // policy. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - if i.session().cachePolicy.cacheReaddir() { + cp := i.session().cachePolicy + if cp.cacheReaddir() { // Check to see if we have readdirCache that indicates the // child does not exist. Avoid holding readdirMu longer than // we need to. i.readdirMu.Lock() if i.readdirCache != nil && !i.readdirCache.Contains(name) { - // No such child. Return a negative dirent. + // No such child. i.readdirMu.Unlock() - return fs.NewNegativeDirent(name), nil + if cp.cacheNegativeDirents() { + return fs.NewNegativeDirent(name), nil + } + return nil, syserror.ENOENT } i.readdirMu.Unlock() } @@ -46,7 +50,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { if err == syscall.ENOENT { - if i.session().cachePolicy.cacheNegativeDirents() { + if cp.cacheNegativeDirents() { // Return a negative Dirent. It will stay cached until something // is created over it. 
return fs.NewNegativeDirent(name), nil -- cgit v1.2.3 From ae6f092fe117a738df34e072ef5ba01a41c89222 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Fri, 10 Aug 2018 16:09:52 -0700 Subject: Implemented the splice(2) syscall. Currently the implementation matches the behavior of moving data between two file descriptors. However, it does not implement this through zero-copy movement. Thus, this code is a starting point to build the more complex implementation. PiperOrigin-RevId: 208284483 Change-Id: Ibde79520a3d50bc26aead7ad4f128d2be31db14e --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/splice.go | 20 +++ pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_splice.go | 260 ++++++++++++++++++++++++++++++++ 5 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 pkg/abi/linux/splice.go create mode 100644 pkg/sentry/syscalls/linux/sys_splice.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index ac4ceefbc..9a44c2042 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -43,6 +43,7 @@ go_library( "shm.go", "signal.go", "socket.go", + "splice.go", "time.go", "tty.go", "uio.go", diff --git a/pkg/abi/linux/splice.go b/pkg/abi/linux/splice.go new file mode 100644 index 000000000..9331ec84b --- /dev/null +++ b/pkg/abi/linux/splice.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package linux + +// Flags for splice(2). +const ( + SPLICE_F_NONBLOCK = 2 +) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index bbdfad9da..62423c0fa 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -34,6 +34,7 @@ go_library( "sys_shm.go", "sys_signal.go", "sys_socket.go", + "sys_splice.go", "sys_stat.go", "sys_sync.go", "sys_sysinfo.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index c102af101..485c96202 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -319,7 +319,7 @@ var AMD64 = &kernel.SyscallTable{ 272: Unshare, 273: syscalls.Error(syscall.ENOSYS), // SetRobustList, obsolete 274: syscalls.Error(syscall.ENOSYS), // GetRobustList, obsolete - // 275: Splice, TODO + 275: Splice, // 276: Tee, TODO // 277: SyncFileRange, TODO // 278: Vmsplice, TODO diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go new file mode 100644 index 000000000..8151e3599 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -0,0 +1,260 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package linux + +import ( + "io" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Splice implements linux syscall splice(2). +func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fdIn := kdefs.FD(args[0].Int()) + offIn := args[1].Pointer() + fdOut := kdefs.FD(args[2].Int()) + offOut := args[3].Pointer() + size := int64(args[4].SizeT()) + flags := uint(args[5].Uint()) + + fileIn := t.FDMap().GetFile(fdIn) + if fileIn == nil { + return 0, nil, syserror.EBADF + } + defer fileIn.DecRef() + fileOut := t.FDMap().GetFile(fdOut) + if fileOut == nil { + return 0, nil, syserror.EBADF + } + defer fileOut.DecRef() + + // Check for whether we have pipes. + ipipe := fs.IsPipe(fileIn.Dirent.Inode.StableAttr) + opipe := fs.IsPipe(fileOut.Dirent.Inode.StableAttr) + if (ipipe && offIn != 0) || (opipe && offOut != 0) { + return 0, nil, syserror.ESPIPE + } + + // Check if both file descriptors are pipes. + if ipipe && opipe { + var readPipe *pipe.Pipe + switch p := fileIn.FileOperations.(type) { + case *pipe.Reader: + readPipe = p.ReaderWriter.Pipe + case *pipe.ReaderWriter: + readPipe = p.Pipe + default: + return 0, nil, syserror.EBADF + } + var writePipe *pipe.Pipe + switch p := fileOut.FileOperations.(type) { + case *pipe.Writer: + writePipe = p.ReaderWriter.Pipe + case *pipe.ReaderWriter: + writePipe = p.Pipe + default: + return 0, nil, syserror.EBADF + } + + // Splicing with two ends of the same pipe is not allowed. 
+ if readPipe == writePipe { + return 0, nil, syserror.EINVAL + } + spliced, err := splicePipeToPipe(t, fileIn, fileOut, size, flags) + if err != nil { + return 0, nil, err + } + return uintptr(spliced), nil, nil + } + + // Check if the file descriptor that contains the data to move is a pipe. + if ipipe { + flagsOut := fileOut.Flags() + offset := uint64(fileOut.Offset()) + + // If there is an offset for the file, ensure the file has the Pwrite flag. + if offOut != 0 { + if !flagsOut.Pwrite { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(offOut, &offset); err != nil { + return 0, nil, err + } + } + + if !flagsOut.Write { + return 0, nil, syserror.EBADF + } + + if flagsOut.Append { + return 0, nil, syserror.EINVAL + } + + switch fileIn.FileOperations.(type) { + case *pipe.Reader, *pipe.ReaderWriter: + // If the pipe in is a Reader or ReaderWriter, we can continue. + default: + return 0, nil, syserror.EBADF + } + spliced, err := spliceWrite(t, fileIn, fileOut, size, offset, flags) + if err != nil { + return 0, nil, err + } + + // Make sure value that offset points to is updated. + if offOut == 0 { + fileOut.Seek(t, fs.SeekSet, spliced+int64(offset)) + } else if _, err := t.CopyOut(offOut, spliced+int64(offset)); err != nil { + return 0, nil, err + } + return uintptr(spliced), nil, nil + } + + // Check if the file descriptor that the data will be moved to is a pipe. + if opipe { + flagsIn := fileIn.Flags() + offset := uint64(fileIn.Offset()) + + // If there is an offset for the file, ensure the file has the Pread flag. + if offIn != 0 { + if !flagsIn.Pread { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(offIn, &offset); err != nil { + return 0, nil, err + } + } + + if !flagsIn.Read { + return 0, nil, syserror.EBADF + } + + switch fileOut.FileOperations.(type) { + case *pipe.Writer, *pipe.ReaderWriter: + // If the pipe out is a Writer or ReaderWriter, we can continue. 
+ default: + return 0, nil, syserror.EBADF + } + spliced, err := spliceRead(t, fileIn, fileOut, size, offset, flags) + if err != nil { + return 0, nil, err + } + + // Make sure value that offset points to is updated. + if offIn == 0 { + fileOut.Seek(t, fs.SeekSet, spliced+int64(offset)) + } else if _, err := t.CopyOut(offIn, spliced+int64(offset)); err != nil { + return 0, nil, err + } + return uintptr(spliced), nil, nil + } + + // Splice requires one of the file descriptors to be a pipe. + return 0, nil, syserror.EINVAL +} + +// splicePipeToPipe moves data from one pipe to another pipe. +// TODO: Implement with zero copy movement/without copying between +// user and kernel address spaces. +func splicePipeToPipe(t *kernel.Task, inPipe *fs.File, outPipe *fs.File, size int64, flags uint) (int64, error) { + w := &fs.FileWriter{t, outPipe} + if flags == linux.SPLICE_F_NONBLOCK { + r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} + return io.Copy(w, r) + } + var n int64 + for read := int64(0); read < size; { + var err error + r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} + n, err = io.Copy(w, r) + if err != nil && err != syserror.ErrWouldBlock { + return 0, err + } + read += n + } + return n, nil +} + +// spliceRead moves data from a file to a pipe. +// TODO: Implement with zero copy movement/without copying between +// user and kernel address spaces. 
+func spliceRead(t *kernel.Task, inFile *fs.File, outPipe *fs.File, size int64, offset uint64, flags uint) (int64, error) { + w := &fs.FileWriter{t, outPipe} + if flags == linux.SPLICE_F_NONBLOCK { + r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), size) + return io.Copy(w, r) + } + var n int64 + for read := int64(0); read < size; { + r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), size) + var err error + n, err = io.Copy(w, r) + if err != nil && err != syserror.ErrWouldBlock { + return 0, err + } + read += n + } + return n, nil +} + +// offsetWriter implements io.Writer on a section of an underlying +// WriterAt starting from the offset and ending at the limit. +type offsetWriter struct { + w io.WriterAt + off int64 + limit int64 +} + +// Write implements io.Writer.Write and writes the content of the offsetWriter +// starting at the offset and ending at the limit into the given buffer. +func (o *offsetWriter) Write(p []byte) (n int, err error) { + if o.off >= o.limit { + return 0, io.EOF + } + if max := o.limit - o.off; int64(len(p)) > max { + p = p[0:max] + } + n, err = o.w.WriteAt(p, o.off) + o.off += int64(n) + return n, err +} + +// spliceWrite moves data from a pipe to a file. +// TODO: Implement with zero copy movement/without copying between +// user and kernel address spaces. 
+func spliceWrite(t *kernel.Task, inPipe *fs.File, outFile *fs.File, size int64, offset uint64, flags uint) (int64, error) { + w := &offsetWriter{&fs.FileWriter{t, outFile}, int64(offset), int64(offset) + size} + if flags == linux.SPLICE_F_NONBLOCK { + r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} + return io.Copy(w, r) + } + var n int64 + for read := int64(0); read < size; { + var err error + r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} + n, err = io.Copy(w, r) + if err != nil && err != syserror.ErrWouldBlock { + return 0, err + } + read += n + } + return n, nil +} -- cgit v1.2.3 From a2ec391dfbc5a03077b73078777a9347c372dece Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 10 Aug 2018 17:15:27 -0700 Subject: fs: Allow overlays to revalidate files from the upper fs. Previously, an overlay would panic if either the upper or lower fs required revalidation for a given Dirent. Now, we allow revalidation from the upper file, but not the lower. If a cached overlay inode does need revalidation (because the upper needs revalidation), then the entire overlay Inode will be discarded and a new overlay Inode will be built with a fresh copy of the upper file. As a side effect of this change, Revalidate must take an Inode instead of a Dirent, since an overlay needs to revalidate individual Inodes. 
PiperOrigin-RevId: 208293638 Change-Id: Ic8f8d1ffdc09114721745661a09522b54420c5f1 --- pkg/sentry/fs/README.md | 12 ++++ pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 70 +++++++++++++++++--- pkg/sentry/fs/gofer/session.go | 6 +- pkg/sentry/fs/inode_overlay.go | 43 +++++++++++-- pkg/sentry/fs/inode_overlay_test.go | 123 +++++++++++++++++++++++++++++++----- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 28 +++++--- pkg/sentry/fs/mount_overlay.go | 39 ++++++++---- pkg/sentry/fs/overlay.go | 8 ++- pkg/sentry/fs/tty/fs.go | 2 +- 11 files changed, 279 insertions(+), 56 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index 76638cdae..7680187f4 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -193,6 +193,18 @@ interface. It multiplexes between upper and lower directory memory mappings and stores a copy of memory references so they can be transferred to the upper directory `fs.Mappable` when the file is copied up. +The lower filesystem in an overlay may contain another (nested) overlay, but the +upper filesystem may not contain another overlay. In other words, nested +overlays form a tree structure that only allows branching in the lower +filesystem. + +Caching decisions in the overlay are delegated to the upper filesystem, meaning +that the Keep and Revalidate methods on the overlay return the same values as +the upper filesystem. A small wrinkle is that the lower filesystem is not +allowed to return `true` from Revalidate, as the overlay can not reload inodes +from the lower filesystem. A lower filesystem that does return `true` from +Revalidate will trigger a panic. + The `fs.Inode` also holds a reference to a `fs.MountedFilesystem` that normalizes across the mounted filesystem state of the upper and lower directories. 
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 821cc5789..f81ad5792 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -503,7 +503,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // // We never allow the file system to revalidate mounts, that could cause them // to unexpectedly drop out before umount. - if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, cd) { + if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, cd.Inode) { // Good to go. This is the fast-path. return cd, nil } diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 407ba8562..38762d8a1 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" ) func TestReaddir(t *testing.T) { @@ -48,7 +49,7 @@ func TestReaddir(t *testing.T) { {name: "a"}, {name: "b"}, }, nil), /* lower */ - ), + false /* revalidate */), names: []string{".", "..", "a", "b"}, }, { @@ -59,7 +60,7 @@ func TestReaddir(t *testing.T) { {name: "b"}, }, nil), /* upper */ nil, /* lower */ - ), + false /* revalidate */), names: []string{".", "..", "a", "b"}, }, { @@ -67,11 +68,11 @@ func TestReaddir(t *testing.T) { dir: fs.NewTestOverlayDir(ctx, newTestRamfsDir(ctx, []dirContent{ {name: "a"}, - }, nil), /* lower */ + }, nil), /* upper */ newTestRamfsDir(ctx, []dirContent{ {name: "b"}, }, nil), /* lower */ - ), + false /* revalidate */), names: []string{".", "..", "a", "b"}, }, { @@ -79,11 +80,11 @@ func TestReaddir(t *testing.T) { dir: fs.NewTestOverlayDir(ctx, newTestRamfsDir(ctx, []dirContent{ {name: "a"}, - }, []string{"b"}), /* lower */ + }, []string{"b"}), /* upper */ newTestRamfsDir(ctx, []dirContent{ {name: "c"}, }, nil), /* 
lower */ - ), + false /* revalidate */), names: []string{".", "..", "a", "c"}, }, { @@ -91,12 +92,12 @@ func TestReaddir(t *testing.T) { dir: fs.NewTestOverlayDir(ctx, newTestRamfsDir(ctx, []dirContent{ {name: "a"}, - }, []string{"b"}), /* lower */ + }, []string{"b"}), /* upper */ newTestRamfsDir(ctx, []dirContent{ {name: "b"}, /* will be masked */ {name: "c"}, }, nil), /* lower */ - ), + false /* revalidate */), names: []string{".", "..", "a", "c"}, }, } { @@ -120,6 +121,59 @@ func TestReaddir(t *testing.T) { } } +func TestReaddirRevalidation(t *testing.T) { + ctx := contexttest.Context(t) + ctx = &rootContext{ + Context: ctx, + root: fs.NewDirent(newTestRamfsDir(ctx, nil, nil), "root"), + } + + // Create an overlay with two directories, each with one file. + upper := newTestRamfsDir(ctx, []dirContent{{name: "a"}}, nil) + lower := newTestRamfsDir(ctx, []dirContent{{name: "b"}}, nil) + overlay := fs.NewTestOverlayDir(ctx, upper, lower, true /* revalidate */) + + // Get a handle to the dirent in the upper filesystem so that we can + // modify it without going through the dirent. + upperDir := upper.InodeOperations.(*dir).InodeOperations.(*ramfstest.Dir) + + // Check that overlay returns the files from both upper and lower. + openDir, err := overlay.GetFile(ctx, fs.NewDirent(overlay, "stub"), fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("GetFile got error %v, want nil", err) + } + ser := &fs.CollectEntriesSerializer{} + if err := openDir.Readdir(ctx, ser); err != nil { + t.Fatalf("Readdir got error %v, want nil", err) + } + got, want := ser.Order, []string{".", "..", "a", "b"} + if !reflect.DeepEqual(got, want) { + t.Errorf("Readdir got names %v, want %v", got, want) + } + + // Remove "a" from the upper and add "c". 
+ if err := upperDir.Remove(ctx, upper, "a"); err != nil { + t.Fatalf("error removing child: %v", err) + } + upperDir.AddChild(ctx, "c", fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), + upper.MountSource, fs.StableAttr{Type: fs.RegularFile})) + + // Seek to beginning of the directory and do the readdir again. + if _, err := openDir.Seek(ctx, fs.SeekSet, 0); err != nil { + t.Fatalf("error seeking to beginning of dir: %v", err) + } + ser = &fs.CollectEntriesSerializer{} + if err := openDir.Readdir(ctx, ser); err != nil { + t.Fatalf("Readdir got error %v, want nil", err) + } + + // Readdir should return the updated children. + got, want = ser.Order, []string{".", "..", "b", "c"} + if !reflect.DeepEqual(got, want) { + t.Errorf("Readdir got names %v, want %v", got, want) + } +} + type rootContext struct { context.Context root *fs.Dirent diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index bfb1154dc..eeb9087e9 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -145,12 +145,12 @@ func (s *session) Destroy() { s.conn.Close() } -// Revalidate returns true if the cache policy is does not allow for VFS caching. -func (s *session) Revalidate(ctx context.Context, d *fs.Dirent) bool { +// Revalidate implements MountSource.Revalidate. +func (s *session) Revalidate(ctx context.Context, i *fs.Inode) bool { return s.cachePolicy.revalidateDirent() } -// TakeRefs takes an extra reference on dirent if possible. +// Keep implements MountSource.Keep. 
func (s *session) Keep(d *fs.Dirent) bool { return s.cachePolicy.keepDirent(d.Inode) } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 543db9ac7..34e62a4a2 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -57,6 +57,10 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name var upperInode *Inode var lowerInode *Inode + // We must remember whether the upper fs returned a negative dirent, + // because it is only safe to return one if the upper did. + var negativeUpperChild bool + // Does the parent directory exist in the upper file system? if parent.upper != nil { // First check if a file object exists in the upper file system. @@ -70,7 +74,9 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name return nil, err } if child != nil { - if !child.IsNegative() { + if child.IsNegative() { + negativeUpperChild = true + } else { upperInode = child.Inode upperInode.IncRef() } @@ -81,7 +87,18 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if overlayHasWhiteout(parent.upper, name) { if upperInode == nil { parent.copyMu.RUnlock() - return NewNegativeDirent(name), nil + if negativeUpperChild { + // If the upper fs returnd a negative + // Dirent, then the upper is OK with + // that negative Dirent being cached in + // the Dirent tree, so we can return + // one from the overlay. + return NewNegativeDirent(name), nil + } + // Upper fs is not OK with a negative Dirent + // being cached in the Dirent tree, so don't + // return one. + return nil, syserror.ENOENT } entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { @@ -127,9 +144,14 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // Was all of this for naught? if upperInode == nil && lowerInode == nil { - // Return a negative Dirent indicating that nothing was found. 
parent.copyMu.RUnlock() - return NewNegativeDirent(name), nil + // We can only return a negative dirent if the upper returned + // one as well. See comments above regarding negativeUpperChild + // for more info. + if negativeUpperChild { + return NewNegativeDirent(name), nil + } + return nil, syserror.ENOENT } // Did we find a lower Inode? Remember this because we may decide we don't @@ -568,10 +590,19 @@ func overlayHandleOps(o *overlayEntry) HandleOperations { } // NewTestOverlayDir returns an overlay Inode for tests. -func NewTestOverlayDir(ctx context.Context, upper *Inode, lower *Inode) *Inode { +// +// If `revalidate` is true, then the upper filesystem will require +// revalidation. +func NewTestOverlayDir(ctx context.Context, upper, lower *Inode, revalidate bool) *Inode { fs := &overlayFilesystem{} + var upperMsrc *MountSource + if revalidate { + upperMsrc = NewRevalidatingMountSource(fs, MountSourceFlags{}) + } else { + upperMsrc = NewNonCachingMountSource(fs, MountSourceFlags{}) + } msrc := NewMountSource(&overlayMountSourceOperations{ - upper: NewNonCachingMountSource(fs, MountSourceFlags{}), + upper: upperMsrc, lower: NewNonCachingMountSource(fs, MountSourceFlags{}), }, fs, MountSourceFlags{}) overlay := &overlayEntry{ diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 684d54bd2..a7be9d040 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -35,7 +35,6 @@ func TestLookup(t *testing.T) { name string // Want from lookup. 
- err error found bool hasUpper bool hasLower bool @@ -50,7 +49,7 @@ func TestLookup(t *testing.T) { dir: false, }, }, nil), /* lower */ - ), + false /* revalidate */), name: "a", found: true, hasUpper: false, @@ -66,7 +65,7 @@ func TestLookup(t *testing.T) { }, }, nil), /* upper */ nil, /* lower */ - ), + false /* revalidate */), name: "a", found: true, hasUpper: true, @@ -87,7 +86,7 @@ func TestLookup(t *testing.T) { dir: false, }, }, nil), /* lower */ - ), + false /* revalidate */), name: "a", found: true, hasUpper: false, @@ -108,7 +107,7 @@ func TestLookup(t *testing.T) { dir: false, }, }, nil), /* lower */ - ), + false /* revalidate */), name: "a", found: true, hasUpper: true, @@ -129,7 +128,7 @@ func TestLookup(t *testing.T) { dir: false, }, }, nil), /* lower */ - ), + false /* revalidate */), name: "a", found: true, hasUpper: true, @@ -150,7 +149,7 @@ func TestLookup(t *testing.T) { dir: true, }, }, nil), /* lower */ - ), + false /* revalidate */), name: "a", found: true, hasUpper: true, @@ -166,7 +165,7 @@ func TestLookup(t *testing.T) { dir: false, }, }, nil), /* lower */ - ), + false /* revalidate */), name: "a", found: false, hasUpper: false, @@ -182,7 +181,7 @@ func TestLookup(t *testing.T) { dir: false, }, }, nil), /* lower */ - ), + false /* revalidate */), name: "a", found: true, hasUpper: false, @@ -191,13 +190,14 @@ func TestLookup(t *testing.T) { } { t.Run(test.desc, func(t *testing.T) { dirent, err := test.dir.Lookup(ctx, test.name) - if err != test.err { - t.Fatalf("lookup got error %v, want %v", err, test.err) - } - if test.found && dirent.IsNegative() { - t.Fatalf("lookup expected to find %q, got negative dirent", test.name) + if test.found && (err == syserror.ENOENT || dirent.IsNegative()) { + t.Fatalf("lookup %q expected to find positive dirent, got dirent %v err %v", test.name, dirent, err) } if !test.found { + if err != syserror.ENOENT && !dirent.IsNegative() { + t.Errorf("lookup %q expected to return ENOENT or negative dirent, got dirent 
%v err %v", test.name, dirent, err) + } + // Nothing more to check. return } if hasUpper := dirent.Inode.TestHasUpperFS(); hasUpper != test.hasUpper { @@ -210,6 +210,95 @@ func TestLookup(t *testing.T) { } } +func TestLookupRevalidation(t *testing.T) { + // File name used in the tests. + fileName := "foofile" + ctx := contexttest.Context(t) + for _, tc := range []struct { + // Test description. + desc string + + // Upper and lower fs for the overlay. + upper *fs.Inode + lower *fs.Inode + + // Whether the upper requires revalidation. + revalidate bool + + // Whether we should get the same dirent on second lookup. + wantSame bool + }{ + { + desc: "file from upper with no revalidation", + upper: newTestRamfsDir(ctx, []dirContent{{name: fileName}}, nil), + lower: newTestRamfsDir(ctx, nil, nil), + revalidate: false, + wantSame: true, + }, + { + desc: "file from upper with revalidation", + upper: newTestRamfsDir(ctx, []dirContent{{name: fileName}}, nil), + lower: newTestRamfsDir(ctx, nil, nil), + revalidate: true, + wantSame: false, + }, + { + desc: "file from lower with no revalidation", + upper: newTestRamfsDir(ctx, nil, nil), + lower: newTestRamfsDir(ctx, []dirContent{{name: fileName}}, nil), + revalidate: false, + wantSame: true, + }, + { + desc: "file from lower with revalidation", + upper: newTestRamfsDir(ctx, nil, nil), + lower: newTestRamfsDir(ctx, []dirContent{{name: fileName}}, nil), + revalidate: true, + // The file does not exist in the upper, so we do not + // need to revalidate it. 
+ wantSame: true, + }, + { + desc: "file from upper and lower with no revalidation", + upper: newTestRamfsDir(ctx, []dirContent{{name: fileName}}, nil), + lower: newTestRamfsDir(ctx, []dirContent{{name: fileName}}, nil), + revalidate: false, + wantSame: true, + }, + { + desc: "file from upper and lower with revalidation", + upper: newTestRamfsDir(ctx, []dirContent{{name: fileName}}, nil), + lower: newTestRamfsDir(ctx, []dirContent{{name: fileName}}, nil), + revalidate: true, + wantSame: false, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + root := fs.NewDirent(newTestRamfsDir(ctx, nil, nil), "root") + ctx = &rootContext{ + Context: ctx, + root: root, + } + overlay := fs.NewDirent(fs.NewTestOverlayDir(ctx, tc.upper, tc.lower, tc.revalidate), "overlay") + // Lookup the file twice through the overlay. + first, err := overlay.Walk(ctx, root, fileName) + if err != nil { + t.Fatalf("overlay.Walk(%q) failed: %v", fileName, err) + } + second, err := overlay.Walk(ctx, root, fileName) + if err != nil { + t.Fatalf("overlay.Walk(%q) failed: %v", fileName, err) + } + + if tc.wantSame && first != second { + t.Errorf("dirent lookup got different dirents, wanted same\nfirst=%+v\nsecond=%+v", first, second) + } else if !tc.wantSame && first == second { + t.Errorf("dirent lookup got the same dirent, wanted different: %+v", first) + } + }) + } +} + type dir struct { fs.InodeOperations @@ -231,6 +320,10 @@ type dirContent struct { dir bool } +func newTestRamfsInode(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + return fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), msrc, fs.StableAttr{Type: fs.RegularFile}) +} + func newTestRamfsDir(ctx context.Context, contains []dirContent, negative []string) *fs.Inode { msrc := fs.NewCachingMountSource(nil, fs.MountSourceFlags{}) contents := make(map[string]*fs.Inode) @@ -238,7 +331,7 @@ func newTestRamfsDir(ctx context.Context, contains []dirContent, negative []stri if c.dir { contents[c.name] = newTestRamfsDir(ctx, nil, 
nil) } else { - contents[c.name] = fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), msrc, fs.StableAttr{Type: fs.RegularFile}) + contents[c.name] = newTestRamfsInode(ctx, msrc) } } dops := ramfstest.NewDir(ctx, contents, fs.FilePermissions{ diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index dc82a2002..89a0103ba 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -68,7 +68,7 @@ func NewMockMountSource(cache *DirentCache) *MountSource { } // Revalidate implements fs.MountSourceOperations.Revalidate. -func (n *MockMountSourceOps) Revalidate(context.Context, *Dirent) bool { +func (n *MockMountSourceOps) Revalidate(context.Context, *Inode) bool { return n.revalidate } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index c72372929..455f5b35c 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -27,10 +27,10 @@ import ( // DirentOperations provide file systems greater control over how long a Dirent stays pinned // in core. Implementations must not take Dirent.mu. type DirentOperations interface { - // Revalidate returns true if the Dirent is stale and its + // Revalidate returns true if the Inode is stale and its // InodeOperations needs to be reloaded. Revalidate will never be - // called on a Dirent that is mounted. - Revalidate(ctx context.Context, dirent *Dirent) bool + // called on a Inode that is mounted. + Revalidate(ctx context.Context, inode *Inode) bool // Keep returns true if the Dirent should be kept in memory for as long // as possible beyond any active references. @@ -249,7 +249,8 @@ func (msrc *MountSource) FlushDirentRefs() { // aggressively. Filesystem may be nil if there is no backing filesystem. 
func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(&SimpleMountSourceOperations{ - keep: true, + keep: true, + revalidate: false, }, filesystem, flags) } @@ -257,7 +258,17 @@ func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *Mount // Filesystem may be nil if there is no backing filesystem. func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(&SimpleMountSourceOperations{ - keep: false, + keep: false, + revalidate: false, + }, filesystem, flags) +} + +// NewRevalidatingMountSource returns a generic mount that will cache dirents, +// but will revalidate them on each lookup. +func NewRevalidatingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { + return NewMountSource(&SimpleMountSourceOperations{ + keep: true, + revalidate: true, }, filesystem, flags) } @@ -265,12 +276,13 @@ func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *Mo // // +stateify savable type SimpleMountSourceOperations struct { - keep bool + keep bool + revalidate bool } // Revalidate implements MountSourceOperations.Revalidate. -func (*SimpleMountSourceOperations) Revalidate(context.Context, *Dirent) bool { - return false +func (smo *SimpleMountSourceOperations) Revalidate(context.Context, *Inode) bool { + return smo.revalidate } // Keep implements MountSourceOperations.Keep. diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index d135e8a37..9fa87c10f 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -14,10 +14,13 @@ package fs -import "gvisor.googlesource.com/gvisor/pkg/sentry/context" +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) // overlayMountSourceOperations implements MountSourceOperations for an overlay -// mount point. +// mount point. The upper filesystem determines the caching behavior of the +// overlay. 
// // +stateify savable type overlayMountSourceOperations struct { @@ -34,19 +37,33 @@ func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *M }, &overlayFilesystem{}, flags) } -// Revalidate panics if the upper or lower MountSource require that dirent be -// revalidated. Otherwise always returns false. -func (o *overlayMountSourceOperations) Revalidate(ctx context.Context, dirent *Dirent) bool { - if o.upper.Revalidate(ctx, dirent) || o.lower.Revalidate(ctx, dirent) { - panic("an overlay cannot revalidate file objects") +// Revalidate implements MountSourceOperations.Revalidate for an overlay by +// delegating to the upper filesystem's Revalidate method. We cannot reload +// files from the lower filesystem, so we panic if the lower filesystem's +// Revalidate method returns true. +func (o *overlayMountSourceOperations) Revalidate(ctx context.Context, inode *Inode) bool { + if inode.overlay == nil { + panic("overlay cannot revalidate inode that is not an overlay") } - return false + + // Should we bother checking this, or just ignore? + if inode.overlay.lower != nil && o.lower.Revalidate(ctx, inode.overlay.lower) { + panic("an overlay cannot revalidate file objects from the lower fs") + } + + if inode.overlay.upper == nil { + // Nothing to revalidate. + return false + } + + // Does the upper require revalidation? + return o.upper.Revalidate(ctx, inode.overlay.upper) } -// Keep returns true if either upper or lower MountSource require that the -// dirent be kept in memory. +// Keep implements MountSourceOperations by delegating to the upper +// filesystem's Keep method. func (o *overlayMountSourceOperations) Keep(dirent *Dirent) bool { - return o.upper.Keep(dirent) || o.lower.Keep(dirent) + return o.upper.Keep(dirent) } // ResetInodeMappings propagates the call to both upper and lower MountSource. 
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index af13dc8c7..5a30af419 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -88,10 +88,11 @@ func isXattrOverlay(name string) bool { // Preconditions: // // - upper and lower must be non-nil. +// - upper must not be an overlay. // - lower should not expose character devices, pipes, or sockets, because // copying up these types of files is not supported. -// - upper and lower must not require that file objects be revalidated. -// - upper and lower must not have dynamic file/directory content. +// - lower must not require that file objects be revalidated. +// - lower must not have dynamic file/directory content. func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) { if !IsDir(upper.StableAttr) { return nil, fmt.Errorf("upper Inode is not a directory") @@ -99,6 +100,9 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount if !IsDir(lower.StableAttr) { return nil, fmt.Errorf("lower Inode is not a directory") } + if upper.overlay != nil { + return nil, fmt.Errorf("cannot nest overlay in upper file of another overlay") + } msrc := newOverlayMountSource(upper.MountSource, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, upper, lower, true) diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index e28635607..fe7da05b5 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -82,7 +82,7 @@ type superOperations struct{} // Slave entries are dropped from dir when their master is closed, so an // existing slave Dirent in the tree is not sufficient to guarantee that it // still exists on the filesystem. 
-func (superOperations) Revalidate(context.Context, *fs.Dirent) bool { +func (superOperations) Revalidate(context.Context, *fs.Inode) bool { return true } -- cgit v1.2.3 From dde836a91858ceee25dbe023263752b39ae21274 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 13 Aug 2018 13:29:54 -0700 Subject: Prevent renames across walk fast path. PiperOrigin-RevId: 208533436 Change-Id: Ifc1a4e2d6438a424650bee831c301b1ac0d670a3 --- pkg/sentry/fs/dirent.go | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index f81ad5792..4d3aeaf41 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -533,14 +533,18 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl return nil, syscall.ENOENT } - // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be expensive, - // if possible release the lock and re-acquire it. + // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be + // expensive, if possible release the lock and re-acquire it. if walkMayUnlock { + // While this dirent is unlocked, the lookup below is not allowed to proceed in tandem with a + // rename operation. The rename should be fully complete before we call Lookup on anything. d.mu.Unlock() + renameMu.RLock() } c, err := d.Inode.Lookup(ctx, name) if walkMayUnlock { d.mu.Lock() + renameMu.RUnlock() } // No dice. if err != nil { @@ -1047,34 +1051,12 @@ func (d *Dirent) flush() { } } -// Busy indicates whether this Dirent is a mount point or root dirent, or has -// active positive children. -// -// This is expensive, since it flushes the children cache. -// -// TODO: Fix this busy-ness check. +// Busy indicates whether this Dirent is a mount point or root dirent. 
func (d *Dirent) Busy() bool { d.mu.Lock() defer d.mu.Unlock() - if d.mounted || d.parent == nil { - return true - } - - // Flush any cached references to children that are doomed. - d.flush() - - // Count positive children. - var nonNegative int - for _, w := range d.children { - if child := w.Get(); child != nil { - if !child.(*Dirent).IsNegative() { - nonNegative++ - } - child.DecRef() - } - } - return nonNegative > 0 + return d.mounted || d.parent == nil } // mount mounts a new dirent with the given inode over d. -- cgit v1.2.3 From 66b0f3e15a60df21f67d37dc6e420d1825acacfe Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 14 Aug 2018 10:33:53 -0700 Subject: Fix bind() on overlays. InodeOperations.Bind now returns a Dirent which will be cached in the Dirent tree. When an overlay is in-use, Bind cannot return the Dirent created by the upper filesystem because the Dirent does not know about the overlay. Instead, overlayBind must create a new overlay-aware Inode and Dirent and return that. This is analagous to how Lookup and overlayLookup work. PiperOrigin-RevId: 208670710 Change-Id: I6390affbcf94c38656b4b458e248739b4853da29 --- pkg/sentry/fs/inode_overlay.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 34e62a4a2..e18e095a0 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -364,7 +364,23 @@ func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.Bo if o.upper == nil { return nil, syserror.EOPNOTSUPP } - return o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm) + d, err := o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm) + if err != nil { + return nil, err + } + + // Grab the inode and drop the dirent, we don't need it. + inode := d.Inode + inode.IncRef() + d.DecRef() + + // Create a new overlay entry and dirent for the socket. 
+ entry, err := newOverlayEntry(ctx, inode, nil, false) + if err != nil { + inode.DecRef() + return nil, err + } + return NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil } func overlayBoundEndpoint(o *overlayEntry, path string) unix.BoundEndpoint { -- cgit v1.2.3 From 6cf22781673d75cca459fd668cf291b387d52e0d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 14 Aug 2018 11:49:42 -0700 Subject: Automated rollback of changelist 208284483 PiperOrigin-RevId: 208685417 Change-Id: Ie2849c4811e3a2d14a002f521cef018ded0c6c4a --- pkg/abi/linux/BUILD | 1 - pkg/abi/linux/splice.go | 20 --- pkg/sentry/syscalls/linux/BUILD | 1 - pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_splice.go | 260 -------------------------------- 5 files changed, 1 insertion(+), 283 deletions(-) delete mode 100644 pkg/abi/linux/splice.go delete mode 100644 pkg/sentry/syscalls/linux/sys_splice.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 9a44c2042..ac4ceefbc 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -43,7 +43,6 @@ go_library( "shm.go", "signal.go", "socket.go", - "splice.go", "time.go", "tty.go", "uio.go", diff --git a/pkg/abi/linux/splice.go b/pkg/abi/linux/splice.go deleted file mode 100644 index 9331ec84b..000000000 --- a/pkg/abi/linux/splice.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package linux - -// Flags for splice(2). -const ( - SPLICE_F_NONBLOCK = 2 -) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 62423c0fa..bbdfad9da 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -34,7 +34,6 @@ go_library( "sys_shm.go", "sys_signal.go", "sys_socket.go", - "sys_splice.go", "sys_stat.go", "sys_sync.go", "sys_sysinfo.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 485c96202..c102af101 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -319,7 +319,7 @@ var AMD64 = &kernel.SyscallTable{ 272: Unshare, 273: syscalls.Error(syscall.ENOSYS), // SetRobustList, obsolete 274: syscalls.Error(syscall.ENOSYS), // GetRobustList, obsolete - 275: Splice, + // 275: Splice, TODO // 276: Tee, TODO // 277: SyncFileRange, TODO // 278: Vmsplice, TODO diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go deleted file mode 100644 index 8151e3599..000000000 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package linux - -import ( - "io" - - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" - "gvisor.googlesource.com/gvisor/pkg/syserror" -) - -// Splice implements linux syscall splice(2). -func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fdIn := kdefs.FD(args[0].Int()) - offIn := args[1].Pointer() - fdOut := kdefs.FD(args[2].Int()) - offOut := args[3].Pointer() - size := int64(args[4].SizeT()) - flags := uint(args[5].Uint()) - - fileIn := t.FDMap().GetFile(fdIn) - if fileIn == nil { - return 0, nil, syserror.EBADF - } - defer fileIn.DecRef() - fileOut := t.FDMap().GetFile(fdOut) - if fileOut == nil { - return 0, nil, syserror.EBADF - } - defer fileOut.DecRef() - - // Check for whether we have pipes. - ipipe := fs.IsPipe(fileIn.Dirent.Inode.StableAttr) - opipe := fs.IsPipe(fileOut.Dirent.Inode.StableAttr) - if (ipipe && offIn != 0) || (opipe && offOut != 0) { - return 0, nil, syserror.ESPIPE - } - - // Check if both file descriptors are pipes. - if ipipe && opipe { - var readPipe *pipe.Pipe - switch p := fileIn.FileOperations.(type) { - case *pipe.Reader: - readPipe = p.ReaderWriter.Pipe - case *pipe.ReaderWriter: - readPipe = p.Pipe - default: - return 0, nil, syserror.EBADF - } - var writePipe *pipe.Pipe - switch p := fileOut.FileOperations.(type) { - case *pipe.Writer: - writePipe = p.ReaderWriter.Pipe - case *pipe.ReaderWriter: - writePipe = p.Pipe - default: - return 0, nil, syserror.EBADF - } - - // Splicing with two ends of the same pipe is not allowed. 
- if readPipe == writePipe { - return 0, nil, syserror.EINVAL - } - spliced, err := splicePipeToPipe(t, fileIn, fileOut, size, flags) - if err != nil { - return 0, nil, err - } - return uintptr(spliced), nil, nil - } - - // Check if the file descriptor that contains the data to move is a pipe. - if ipipe { - flagsOut := fileOut.Flags() - offset := uint64(fileOut.Offset()) - - // If there is an offset for the file, ensure the file has the Pwrite flag. - if offOut != 0 { - if !flagsOut.Pwrite { - return 0, nil, syserror.EINVAL - } - if _, err := t.CopyIn(offOut, &offset); err != nil { - return 0, nil, err - } - } - - if !flagsOut.Write { - return 0, nil, syserror.EBADF - } - - if flagsOut.Append { - return 0, nil, syserror.EINVAL - } - - switch fileIn.FileOperations.(type) { - case *pipe.Reader, *pipe.ReaderWriter: - // If the pipe in is a Reader or ReaderWriter, we can continue. - default: - return 0, nil, syserror.EBADF - } - spliced, err := spliceWrite(t, fileIn, fileOut, size, offset, flags) - if err != nil { - return 0, nil, err - } - - // Make sure value that offset points to is updated. - if offOut == 0 { - fileOut.Seek(t, fs.SeekSet, spliced+int64(offset)) - } else if _, err := t.CopyOut(offOut, spliced+int64(offset)); err != nil { - return 0, nil, err - } - return uintptr(spliced), nil, nil - } - - // Check if the file descriptor that the data will be moved to is a pipe. - if opipe { - flagsIn := fileIn.Flags() - offset := uint64(fileIn.Offset()) - - // If there is an offset for the file, ensure the file has the Pread flag. - if offIn != 0 { - if !flagsIn.Pread { - return 0, nil, syserror.EINVAL - } - if _, err := t.CopyIn(offIn, &offset); err != nil { - return 0, nil, err - } - } - - if !flagsIn.Read { - return 0, nil, syserror.EBADF - } - - switch fileOut.FileOperations.(type) { - case *pipe.Writer, *pipe.ReaderWriter: - // If the pipe out is a Writer or ReaderWriter, we can continue. 
- default: - return 0, nil, syserror.EBADF - } - spliced, err := spliceRead(t, fileIn, fileOut, size, offset, flags) - if err != nil { - return 0, nil, err - } - - // Make sure value that offset points to is updated. - if offIn == 0 { - fileOut.Seek(t, fs.SeekSet, spliced+int64(offset)) - } else if _, err := t.CopyOut(offIn, spliced+int64(offset)); err != nil { - return 0, nil, err - } - return uintptr(spliced), nil, nil - } - - // Splice requires one of the file descriptors to be a pipe. - return 0, nil, syserror.EINVAL -} - -// splicePipeToPipe moves data from one pipe to another pipe. -// TODO: Implement with zero copy movement/without copying between -// user and kernel address spaces. -func splicePipeToPipe(t *kernel.Task, inPipe *fs.File, outPipe *fs.File, size int64, flags uint) (int64, error) { - w := &fs.FileWriter{t, outPipe} - if flags == linux.SPLICE_F_NONBLOCK { - r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} - return io.Copy(w, r) - } - var n int64 - for read := int64(0); read < size; { - var err error - r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} - n, err = io.Copy(w, r) - if err != nil && err != syserror.ErrWouldBlock { - return 0, err - } - read += n - } - return n, nil -} - -// spliceRead moves data from a file to a pipe. -// TODO: Implement with zero copy movement/without copying between -// user and kernel address spaces. 
-func spliceRead(t *kernel.Task, inFile *fs.File, outPipe *fs.File, size int64, offset uint64, flags uint) (int64, error) { - w := &fs.FileWriter{t, outPipe} - if flags == linux.SPLICE_F_NONBLOCK { - r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), size) - return io.Copy(w, r) - } - var n int64 - for read := int64(0); read < size; { - r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), size) - var err error - n, err = io.Copy(w, r) - if err != nil && err != syserror.ErrWouldBlock { - return 0, err - } - read += n - } - return n, nil -} - -// offsetWriter implements io.Writer on a section of an underlying -// WriterAt starting from the offset and ending at the limit. -type offsetWriter struct { - w io.WriterAt - off int64 - limit int64 -} - -// Write implements io.Writer.Write and writes the content of the offsetWriter -// starting at the offset and ending at the limit into the given buffer. -func (o *offsetWriter) Write(p []byte) (n int, err error) { - if o.off >= o.limit { - return 0, io.EOF - } - if max := o.limit - o.off; int64(len(p)) > max { - p = p[0:max] - } - n, err = o.w.WriteAt(p, o.off) - o.off += int64(n) - return n, err -} - -// spliceWrite moves data from a pipe to a file. -// TODO: Implement with zero copy movement/without copying between -// user and kernel address spaces. 
-func spliceWrite(t *kernel.Task, inPipe *fs.File, outFile *fs.File, size int64, offset uint64, flags uint) (int64, error) { - w := &offsetWriter{&fs.FileWriter{t, outFile}, int64(offset), int64(offset) + size} - if flags == linux.SPLICE_F_NONBLOCK { - r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} - return io.Copy(w, r) - } - var n int64 - for read := int64(0); read < size; { - var err error - r := &io.LimitedReader{R: &fs.FileReader{t, inPipe}, N: size} - n, err = io.Copy(w, r) - if err != nil && err != syserror.ErrWouldBlock { - return 0, err - } - read += n - } - return n, nil -} -- cgit v1.2.3 From e97717e29a1bb3e373b130086c4182c598a8121c Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 14 Aug 2018 15:05:44 -0700 Subject: Enforce Unix socket address length limit PiperOrigin-RevId: 208720936 Change-Id: Ic943a88b6efeff49574306d4d4e1f113116ae32e --- pkg/sentry/socket/epsocket/epsocket.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index b32eda96f..0000875e7 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -150,6 +150,9 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { switch family { case linux.AF_UNIX: path := addr[2:] + if len(path) > linux.UnixPathMax { + return tcpip.FullAddress{}, syserr.ErrInvalidArgument + } // Drop the terminating NUL (if one exists) and everything after it. // Skip the first byte, which is NUL for abstract paths. if len(path) > 1 { -- cgit v1.2.3 From 12a4912aedc834fc8f404dc1ffeaa37088dd2d6b Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 14 Aug 2018 15:48:52 -0700 Subject: Fix `ls -laR | wc -l` hanging. stat()-ing /proc/PID/fd/FD incremented but didn't decrement the refcount for FD. 
This behavior wasn't usually noticeable, but in the above case: - ls would never decrement the refcount of the write end of the pipe to 0. - This caused the write end of the pipe never to close. - wc would then hang read()-ing from the pipe. PiperOrigin-RevId: 208728817 Change-Id: I4fca1ba5ca24e4108915a1d30b41dc63da40604d --- pkg/sentry/fs/proc/fds.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index cca8f874c..dada8f982 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -94,7 +94,7 @@ type fd struct { *fs.File } -// newFD returns a new fd based on an existing file. +// newFd returns a new fd based on an existing file. // // This inherits one reference to the file. func newFd(t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { @@ -131,6 +131,11 @@ func (f *fd) Truncate(context.Context, *fs.Inode, int64) error { return nil } +func (f *fd) Release(ctx context.Context) { + f.Symlink.Release(ctx) + f.File.DecRef() +} + // Close releases the reference on the file. func (f *fd) Close() error { f.DecRef() @@ -204,13 +209,14 @@ func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset type fdInfo struct { ramfs.File + file *fs.File flags fs.FileFlags fdFlags kernel.FDFlags } // newFdInfo returns a new fdInfo based on an existing file. func newFdInfo(t *kernel.Task, file *fs.File, fdFlags kernel.FDFlags, msrc *fs.MountSource) *fs.Inode { - fdi := &fdInfo{flags: file.Flags(), fdFlags: fdFlags} + fdi := &fdInfo{file: file, flags: file.Flags(), fdFlags: fdFlags} fdi.InitFile(t, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true}}) // TODO: Get pos, locks, and other data. For now we only // have flags. 
@@ -231,6 +237,11 @@ func (*fdInfo) Truncate(ctx context.Context, inode *fs.Inode, size int64) error return ramfs.ErrInvalidOp } +func (f *fdInfo) Release(ctx context.Context) { + f.File.Release(ctx) + f.file.DecRef() +} + // fdInfoDir implements /proc/TID/fdinfo. It embeds an fdDir, but overrides // Lookup and Readdir. // -- cgit v1.2.3 From d4939f6dc22e5607cf2ff8d2a9eb1178e47b0a22 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 14 Aug 2018 16:21:38 -0700 Subject: TTY: Fix data race where calls into tty.queue's waiter were not synchronized. Now, there's a waiter for each end (master and slave) of the TTY, and each waiter.Entry is only enqueued in one of the waiters. PiperOrigin-RevId: 208734483 Change-Id: I06996148f123075f8dd48cde5a553e2be74c6dce --- pkg/sentry/fs/tty/line_discipline.go | 61 ++++++++++++++++++++++++++++++++---- pkg/sentry/fs/tty/master.go | 6 ++-- pkg/sentry/fs/tty/queue.go | 55 +++++++++++++------------------- pkg/sentry/fs/tty/slave.go | 6 ++-- 4 files changed, 81 insertions(+), 47 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index d243ee40e..c7f6c5645 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -23,6 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -90,6 +91,12 @@ type lineDiscipline struct { // column is the location in a row of the cursor. This is important for // handling certain special characters like backspace. column int + + // masterWaiter is used to wait on the master end of the TTY. + masterWaiter waiter.Queue `state:"zerovalue"` + + // slaveWaiter is used to wait on the slave end of the TTY. 
+ slaveWaiter waiter.Queue `state:"zerovalue"` } func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { @@ -127,7 +134,9 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc // buffer to its read buffer. Anything already in the read buffer is // now readable. if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { - l.inQueue.pushWaitBuf(l) + if n := l.inQueue.pushWaitBuf(l); n > 0 { + l.slaveWaiter.Notify(waiter.EventIn) + } } return 0, err @@ -152,13 +161,32 @@ func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - return l.inQueue.read(ctx, dst, l) + n, pushed, err := l.inQueue.read(ctx, dst, l) + if err != nil { + return 0, err + } + if n > 0 { + l.masterWaiter.Notify(waiter.EventOut) + if pushed { + l.slaveWaiter.Notify(waiter.EventIn) + } + return n, nil + } + return 0, syserror.ErrWouldBlock } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - return l.inQueue.write(ctx, src, l) + n, err := l.inQueue.write(ctx, src, l) + if err != nil { + return 0, err + } + if n > 0 { + l.slaveWaiter.Notify(waiter.EventIn) + return n, nil + } + return 0, syserror.ErrWouldBlock } func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { @@ -168,13 +196,32 @@ func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - return l.outQueue.read(ctx, dst, l) + n, pushed, err := l.outQueue.read(ctx, dst, l) + if err != nil { + return 0, err + } + if n > 0 { + l.slaveWaiter.Notify(waiter.EventOut) + if pushed { + 
l.masterWaiter.Notify(waiter.EventIn) + } + return n, nil + } + return 0, syserror.ErrWouldBlock } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() - return l.outQueue.write(ctx, src, l) + n, err := l.outQueue.write(ctx, src, l) + if err != nil { + return 0, err + } + if n > 0 { + l.masterWaiter.Notify(waiter.EventIn) + return n, nil + } + return 0, syserror.ErrWouldBlock } // transformer is a helper interface to make it easier to stateify queue. @@ -326,7 +373,9 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) q.readBuf.WriteRune(c) // Anything written to the readBuf will have to be echoed. if l.termios.LEnabled(linux.ECHO) { - l.outQueue.writeBytes(cBytes, l) + if l.outQueue.writeBytes(cBytes, l) > 0 { + l.masterWaiter.Notify(waiter.EventIn) + } } // If we finish a line, make it available for reading. diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index c7198e218..c8dc08c1a 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -124,14 +124,12 @@ func (mf *masterFileOperations) Release() { // EventRegister implements waiter.Waitable.EventRegister. func (mf *masterFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - mf.t.ld.inQueue.EventRegister(e, mask) - mf.t.ld.outQueue.EventRegister(e, mask) + mf.t.ld.masterWaiter.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (mf *masterFileOperations) EventUnregister(e *waiter.Entry) { - mf.t.ld.inQueue.EventUnregister(e) - mf.t.ld.outQueue.EventUnregister(e) + mf.t.ld.masterWaiter.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. 
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 42c105abc..01dc8d1ac 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -38,8 +38,6 @@ type queue struct { // mu protects everything in queue. mu sync.Mutex `state:"nosave"` - waiter.Queue `state:"zerovalue"` - // readBuf is buffer of data ready to be read when readable is true. // This data has been processed. readBuf bytes.Buffer `state:".([]byte)"` @@ -112,15 +110,17 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca } -// read reads from q to userspace. +// read reads from q to userspace. It returns the number of bytes read as well +// as whether the read caused more readable data to become available (whether +// data was pushed from the wait buffer to the read buffer). // // Preconditions: // * l.termiosMu must be held for reading. -func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, error) { +func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() if !q.readable { - return 0, syserror.ErrWouldBlock + return 0, false, syserror.ErrWouldBlock } // Read out from the read buffer. @@ -133,7 +133,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl } n, err := dst.Writer(ctx).Write(q.readBuf.Bytes()[:n]) if err != nil { - return 0, err + return 0, false, err } // Discard bytes read out. q.readBuf.Next(n) @@ -144,16 +144,9 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl } // Move data from the queue's wait buffer to its read buffer. - q.pushWaitBufLocked(l) - - // If state changed, notify any waiters. If nothing was available to - // read, let the caller know we could block. 
- if n > 0 { - q.Notify(waiter.EventOut) - } else { - return 0, syserror.ErrWouldBlock - } - return int64(n), nil + nPushed := q.pushWaitBufLocked(l) + + return int64(n), nPushed > 0, nil } // write writes to q from userspace. @@ -169,14 +162,20 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip return 0, err } b = b[:n] - return q.writeBytes(b, l) + + // If state changed, notify any waiters. If we were unable to write + // anything, let the caller know we could block. + if c := q.writeBytes(b, l); c > 0 { + return c, nil + } + return 0, syserror.ErrWouldBlock } // writeBytes writes to q from b. // // Preconditions: // * l.termiosMu must be held for reading. -func (q *queue) writeBytes(b []byte, l *lineDiscipline) (int64, error) { +func (q *queue) writeBytes(b []byte, l *lineDiscipline) int64 { q.mu.Lock() defer q.mu.Unlock() // Write as much as possible to the read buffer. @@ -185,36 +184,26 @@ func (q *queue) writeBytes(b []byte, l *lineDiscipline) (int64, error) { // Write remaining data to the wait buffer. nWaiting, _ := q.waitBuf.Write(b[n:]) - // If state changed, notify any waiters. If we were unable to write - // anything, let the caller know we could block. - if n > 0 { - q.Notify(waiter.EventIn) - } else if nWaiting == 0 { - return 0, syserror.ErrWouldBlock - } - return int64(n + nWaiting), nil + return int64(n + nWaiting) } // pushWaitBuf fills the queue's read buffer with data from the wait buffer. // // Preconditions: // * l.termiosMu must be held for reading. -func (q *queue) pushWaitBuf(l *lineDiscipline) { +func (q *queue) pushWaitBuf(l *lineDiscipline) int { q.mu.Lock() defer q.mu.Unlock() - q.pushWaitBufLocked(l) + return q.pushWaitBufLocked(l) } // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be locked. -func (q *queue) pushWaitBufLocked(l *lineDiscipline) { +func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { // Remove bytes from the wait buffer and move them to the read buffer. 
n := q.transform(l, q, q.waitBuf.Bytes()) q.waitBuf.Next(n) - // If state changed, notify any waiters. - if n > 0 { - q.Notify(waiter.EventIn) - } + return n } diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 1c562b172..ab92ced7e 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -109,14 +109,12 @@ func (sf *slaveFileOperations) Release() { // EventRegister implements waiter.Waitable.EventRegister. func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - sf.si.t.ld.outQueue.EventRegister(e, mask) - sf.si.t.ld.inQueue.EventRegister(e, mask) + sf.si.t.ld.slaveWaiter.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) { - sf.si.t.ld.outQueue.EventUnregister(e) - sf.si.t.ld.inQueue.EventUnregister(e) + sf.si.t.ld.slaveWaiter.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. -- cgit v1.2.3 From e8a4f2e133c3a7fb4a2dceb6675ebc57ea4f7350 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 14 Aug 2018 16:24:46 -0700 Subject: runsc: Change cache policy for root fs and volume mounts. Previously, gofer filesystems were configured with the default "fscache" policy, which caches filesystem metadata and contents aggressively. While this setting is best for performance, it means that changes from inside the sandbox may not be immediately propagated outside the sandbox, and vice-versa. This CL changes volumes and the root fs configuration to use a new "remote-revalidate" cache policy which tries to retain as much caching as possible while still making fs changes visible across the sandbox boundary. This cache policy is enabled by default for the root filesystem. The default value for the "--file-access" flag is still "proxy", but the behavior is changed to use the new cache policy. 
A new value for the "--file-access" flag is added, called "proxy-exclusive", which turns on the previous aggressive caching behavior. As the name implies, this flag should be used when the sandbox has "exclusive" access to the filesystem. All volume mounts are configured to use the new cache policy, since it is safest and most likely to be correct. There is not currently a way to change this behavior, but it's possible to add such a mechanism in the future. The configurability is a smaller issue for volumes, since most of the expensive application fs operations (walking + stating files) will likely served by the root fs. PiperOrigin-RevId: 208735037 Change-Id: Ife048fab1948205f6665df8563434dbc6ca8cfc9 --- pkg/sentry/fs/gofer/cache_policy.go | 7 +- runsc/boot/config.go | 9 ++ runsc/boot/fs.go | 62 +++++++---- runsc/boot/loader_test.go | 36 +++---- runsc/container/container_test.go | 207 +++++++++++++++++++++++++++++++----- runsc/main.go | 6 +- runsc/sandbox/sandbox.go | 8 +- runsc/test/testutil/testutil.go | 1 + 8 files changed, 261 insertions(+), 75 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 52d97b54f..fa8abf51c 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -35,10 +35,9 @@ const ( // fs agent immediately. cacheAllWritethrough - // Use virtual file system cache for everything, but reload dirents - // from the remote filesystem on each lookup. Thus, if the remote - // filesystem has changed, the returned dirent will have the updated - // state. + // Use the (host) page cache for reads/writes, but don't cache anything + // else. This allows the sandbox filesystem to stay in sync with any + // changes to the remote filesystem. 
// // This policy should *only* be used with remote filesystems that // donate their host FDs to the sandbox and thus use the host page diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 074cd6a63..6c69a7c38 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -64,6 +64,11 @@ const ( // requests and forwards them to the host. FileAccessProxy FileAccessType = iota + // FileAccessProxyExclusive is the same as FileAccessProxy, but enables + // extra caching for improved performance. It should only be used if + // the sandbox has exclusive access to the filesystem. + FileAccessProxyExclusive + // FileAccessDirect connects the sandbox directly to the host filesystem. FileAccessDirect ) @@ -73,6 +78,8 @@ func MakeFileAccessType(s string) (FileAccessType, error) { switch s { case "proxy": return FileAccessProxy, nil + case "proxy-exclusive": + return FileAccessProxyExclusive, nil case "direct": return FileAccessDirect, nil default: @@ -84,6 +91,8 @@ func (f FileAccessType) String() string { switch f { case FileAccessProxy: return "proxy" + case FileAccessProxyExclusive: + return "proxy-exclusive" case FileAccessDirect: return "direct" default: diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e596c739f..eea2ec1f5 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -17,6 +17,7 @@ package boot import ( "fmt" "path/filepath" + "strconv" "strings" // Include filesystem types that OCI spec might mount. @@ -54,6 +55,9 @@ type fdDispenser struct { } func (f *fdDispenser) remove() int { + if f.empty() { + panic("fdDispenser out of fds") + } rv := f.fds[0] f.fds = f.fds[1:] return rv @@ -160,8 +164,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // setMounts iterates over mounts and mounts them in the specified // mount namespace. func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { - - // Mount all submounts from mounts. 
for _, m := range mounts { if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { return err @@ -181,11 +183,12 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f ) switch conf.FileAccess { - case FileAccessProxy: + case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) hostFS := mustFindFilesystem("9p") - rootInode, err = hostFS.Mount(ctx, rootDevice, mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) + opts := p9MountOptions(conf, fd) + rootInode, err = hostFS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } @@ -242,13 +245,16 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) } -// getMountNameAndOptions retrieves the fsName, data, and useOverlay values +// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) { - var fsName string - var data []string - var useOverlay bool - var err error + var ( + fsName string + opts []string + useOverlay bool + err error + ) + switch m.Type { case "devpts", "devtmpfs", "proc", "sysfs": fsName = m.Type @@ -258,17 +264,17 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri fsName = m.Type // tmpfs has some extra supported options that we must pass through. 
- data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") + opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") case "bind": switch conf.FileAccess { - case FileAccessProxy: + case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() fsName = "9p" - data = []string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"} + opts = p9MountOptions(conf, fd) case FileAccessDirect: fsName = "whitelistfs" - data = []string{"root=" + m.Source, "dont_translate_ownership=true"} + opts = []string{"root=" + m.Source, "dont_translate_ownership=true"} default: err = fmt.Errorf("invalid file access type: %v", conf.FileAccess) } @@ -282,13 +288,13 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri // we do not support. log.Warningf("ignoring unknown filesystem type %q", m.Type) } - return fsName, data, useOverlay, err + return fsName, opts, useOverlay, err } func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. - fsName, data, useOverlay, err := getMountNameAndOptions(conf, m, fds) + fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) // Return the error or nil that corresponds to the default case in getMountNameAndOptions. 
if err != nil { @@ -307,7 +313,7 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fd mf.ReadOnly = true } - inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(data, ",")) + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ",")) if err != nil { return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) } @@ -387,6 +393,20 @@ func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { return nil } +// p9MountOptions creates a slice of options for a p9 mount. +func p9MountOptions(conf *Config, fd int) []string { + opts := []string{ + "trans=fd", + "rfdno=" + strconv.Itoa(fd), + "wfdno=" + strconv.Itoa(fd), + "privateunixsocket=true", + } + if conf.FileAccess == FileAccessProxy { + opts = append(opts, "cache=remote_revalidating") + } + return opts +} + // parseAndFilterOptions parses a MountOptions slice and filters by the allowed // keys. func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { @@ -436,8 +456,7 @@ func mountDevice(m specs.Mount) string { // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { - fsName, data, _, err := getMountNameAndOptions(conf, m, fds) - dataString := strings.Join(data, ",") + fsName, opts, _, err := getMountNameAndOptions(conf, m, fds) // Return the error or nil that corresponds to the default case in getMountNameAndOptions. 
if err != nil { @@ -452,7 +471,7 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f newMount := fs.MountArgs{ Dev: mountDevice(m), Flags: mountFlags(m.Options), - Data: dataString, + Data: strings.Join(opts, ","), } renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) log.Infof("Added mount at %q: %+v", fsName, newMount) @@ -473,7 +492,8 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) // Add root mount. fd := fds.remove() - dataString := strings.Join([]string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"}, ",") + opts := p9MountOptions(conf, fd) + mf := fs.MountSourceFlags{} if spec.Root.Readonly { mf.ReadOnly = true @@ -482,7 +502,7 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) rootMount := fs.MountArgs{ Dev: rootDevice, Flags: mf, - Data: dataString, + Data: strings.Join(opts, ","), } renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 7ea2e1ee5..f2f690b5d 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -398,7 +398,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -458,11 +458,11 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, { Dev: "9pfs-/dev/fd-foo", - Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true", + Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -522,7 +522,7 @@ func 
TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", + Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -606,21 +606,21 @@ func TestRestoreEnvironment(t *testing.T) { errorExpected: true, }, } - for _, tc := range testCases { - fds := &fdDispenser{fds: tc.ioFDs} - - actualRenv, err := createRestoreEnvironment(tc.spec, tc.conf, fds) - if !tc.errorExpected && err != nil { - t.Fatalf("could not create restore environment for test:%s", tc.name) - } else if tc.errorExpected { - if err == nil { - t.Fatalf("expected an error, but no error occurred.") + t.Run(tc.name, func(t *testing.T) { + fds := &fdDispenser{fds: tc.ioFDs} + actualRenv, err := createRestoreEnvironment(tc.spec, tc.conf, fds) + if !tc.errorExpected && err != nil { + t.Fatalf("could not create restore environment for test:%s", tc.name) + } else if tc.errorExpected { + if err == nil { + t.Errorf("expected an error, but no error occurred.") + } + } else { + if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) { + t.Errorf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv) + } } - } else { - if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) { - t.Fatalf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv) - } - } + }) } } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 11edcd615..33c53e189 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -227,33 +227,43 @@ func findUDSApp() (string, error) { return matches[0], nil } -type configOptions int +type configOption int const ( - overlay configOptions = 1 << iota + overlay configOption = iota kvm + nonExclusiveFS ) -const all = overlay | kvm + +var all = []configOption{overlay, kvm, nonExclusiveFS} // configs 
generates different configurations to run tests. -func configs(opts configOptions) []*boot.Config { - cs := []*boot.Config{testutil.TestConfig(), testutil.TestConfig()} - return cs +func configs(opts ...configOption) []*boot.Config { + // Always load the default config. + cs := []*boot.Config{testutil.TestConfig()} - if opts&overlay != 0 { + for _, o := range opts { c := testutil.TestConfig() - c.Overlay = true + switch o { + case overlay: + c.Overlay = true + case kvm: + // TODO: KVM tests are flaky. Disable until fixed. + continue + + // TODO: KVM doesn't work with --race. + if testutil.RaceEnabled { + continue + } + c.Platform = boot.PlatformKVM + case nonExclusiveFS: + c.FileAccess = boot.FileAccessProxy + default: + panic(fmt.Sprintf("unknown config option %v", o)) + + } cs = append(cs, c) } - - // TODO: KVM tests are flaky. Disable until fixed. - // // TODO: KVM doesn't work with --race. - // if !testutil.RaceEnabled && opts&kvm != 0 { - // c := testutil.TestConfig() - // c.Platform = boot.PlatformKVM - // cs = append(cs, c) - // } - return cs } @@ -261,7 +271,7 @@ func configs(opts configOptions) []*boot.Config { // It verifies after each step that the container can be loaded from disk, and // has the correct status. func TestLifecycle(t *testing.T) { - for _, conf := range configs(all) { + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) // The container will just sleep for a long time. We will kill it before // it finishes sleeping. @@ -1049,10 +1059,11 @@ func TestPauseResumeStatus(t *testing.T) { // - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips // this check. func TestCapabilities(t *testing.T) { - const uid = 343 - const gid = 2401 + // Pick uid/gid different than ours. + uid := auth.KUID(os.Getuid() + 1) + gid := auth.KGID(os.Getgid() + 1) - for _, conf := range configs(all) { + for _, conf := range configs(all...) 
{ t.Logf("Running test with conf: %+v", conf) spec := testutil.NewSpecWithArgs("sleep", "100") @@ -1142,7 +1153,7 @@ func TestCapabilities(t *testing.T) { // Test that an tty FD is sent over the console socket if one is provided. func TestConsoleSocket(t *testing.T) { - for _, conf := range configs(all) { + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) spec := testutil.NewSpecWithArgs("true") rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) @@ -1303,8 +1314,6 @@ func TestReadonlyRoot(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - conf.Overlay = true - // Create, start and wait for the container. s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { @@ -1348,8 +1357,6 @@ func TestReadonlyMount(t *testing.T) { defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) - conf.Overlay = true - // Create, start and wait for the container. s, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { @@ -1430,7 +1437,7 @@ func TestAbbreviatedIDs(t *testing.T) { // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. func TestMultiContainerSanity(t *testing.T) { - for _, conf := range configs(all) { + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) containerIDs := []string{ @@ -1619,3 +1626,149 @@ func TestMultiContainerWait(t *testing.T) { wg.Wait() } + +// Check that modifications to a volume mount are propigated into and out of +// the sandbox. +func TestContainerVolumeContentsShared(t *testing.T) { + // Only run this test with shared proxy, since that is the only + // behavior it is testing. + conf := testutil.TestConfig() + conf.FileAccess = boot.FileAccessProxy + t.Logf("Running test with conf: %+v", conf) + + // Main process just sleeps. We will use "exec" to probe the state of + // the filesystem. 
+ spec := testutil.NewSpecWithArgs("sleep", "1000") + + // Mount host temp dir inside the sandbox at '/tmp2'. + hostTmpDir, err := ioutil.TempDir("", "root-fs-test") + sandboxTmpDir := "/tmp2" + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: sandboxTmpDir, + Source: hostTmpDir, + }) + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := container.Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // File that will be used to check consistency inside/outside sandbox. + hostFilename := filepath.Join(hostTmpDir, "file") + sandboxFilename := filepath.Join(sandboxTmpDir, "file") + + // File does not exist yet. Reading from the sandbox should fail. + execArgsTestFile := control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", sandboxFilename}, + } + if ws, err := c.Execute(&execArgsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) + } + + // Create the file from outside of the sandbox. + if err := ioutil.WriteFile(hostFilename, []byte("foobar"), 0777); err != nil { + t.Fatalf("error writing to file %q: %v", hostFilename, err) + } + + // Now we should be able to test the file from within the sandbox. 
+ if ws, err := c.Execute(&execArgsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + } + + // Rename the file from outside of the sandbox. + newHostFilename := filepath.Join(hostTmpDir, "newfile") + newSandboxFilename := filepath.Join(sandboxTmpDir, "newfile") + if err := os.Rename(hostFilename, newHostFilename); err != nil { + t.Fatalf("os.Rename(%q, %q) failed: %v", hostFilename, newHostFilename, err) + } + + // File should no longer exist at the old path within the sandbox. + if ws, err := c.Execute(&execArgsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", sandboxFilename, ws.ExitStatus()) + } + + // We should be able to test the new filename from within the sandbox. + execArgsTestNewFile := control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", newSandboxFilename}, + } + if ws, err := c.Execute(&execArgsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newSandboxFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", newSandboxFilename, ws.ExitStatus()) + } + + // Delete the renamed file from outside of the sandbox. + if err := os.Remove(newHostFilename); err != nil { + t.Fatalf("error removing file %q: %v", hostFilename, err) + } + + // Renamed file should no longer exist at the old path within the sandbox. + if ws, err := c.Execute(&execArgsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newSandboxFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", newSandboxFilename, ws.ExitStatus()) + } + + // Now create the file from WITHIN the sandbox. 
+ execArgsTouch := control.ExecArgs{ + Filename: "/usr/bin/touch", + Argv: []string{"touch", sandboxFilename}, + KUID: auth.KUID(os.Getuid()), + KGID: auth.KGID(os.Getgid()), + } + if ws, err := c.Execute(&execArgsTouch); err != nil { + t.Fatalf("unexpected error touching file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("touch %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(hostFilename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", hostFilename, err) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(hostFilename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", hostFilename, err) + } + + // Delete the file from within the sandbox. + execArgsRemove := control.ExecArgs{ + Filename: "/bin/rm", + Argv: []string{"rm", sandboxFilename}, + } + if ws, err := c.Execute(&execArgsRemove); err != nil { + t.Fatalf("unexpected error removing file %q: %v", sandboxFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("remove %q exited with code %v, wanted zero", sandboxFilename, ws.ExitStatus()) + } + + // File should not exist outside the sandbox. + if _, err := os.Stat(hostFilename); !os.IsNotExist(err) { + t.Errorf("stat %q got error %v, wanted ErrNotExist", hostFilename, err) + } +} diff --git a/runsc/main.go b/runsc/main.go index 10ae44b5e..b36100cca 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -57,7 +57,7 @@ var ( // Flags that control sandbox runtime behavior. platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. 
Using a proxy is more secure because it disallows the sandbox from opennig files directly in the host.") + fileAccess = flag.String("file-access", "proxy-exclusive", "specifies which filesystem to use: proxy-exclusive (default), proxy-shared, or direct. Using a proxy is more secure because it disallows the sandbox from opening files directly in the host. Setting 'proxy-shared' will disable caches and should be used if external modifications to the filesystem are expected.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") @@ -119,6 +119,10 @@ func main() { cmd.Fatalf("%v", err) } + if *fileAccess == "proxy" && *overlay { + cmd.Fatalf("overlay flag is incompatible with file-access=proxy") + } + // Create a new Config from the flags. conf := &boot.Config{ RootDir: *rootDir, diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 196949f11..2b043d412 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -233,7 +233,7 @@ func (s *Sandbox) connError(err error) error { } func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) ([]*os.File, error) { - if conf.FileAccess != boot.FileAccessProxy { + if conf.FileAccess == boot.FileAccessDirect { // Don't start a gofer. The sandbox will access host FS directly. 
return nil, nil } @@ -369,11 +369,11 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) } - if conf.FileAccess == boot.FileAccessProxy { + if conf.FileAccess == boot.FileAccessDirect { + log.Infof("Sandbox will be started in the current mount namespace") + } else { log.Infof("Sandbox will be started in new mount namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) - } else { - log.Infof("Sandbox will be started in the current mount namespace") } // Joins the network namespace if network is enabled. the sandbox talks diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4e7ab3760..d2b39b58c 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -81,6 +81,7 @@ func TestConfig() *boot.Config { Network: boot.NetworkNone, Strace: true, MultiContainer: true, + FileAccess: boot.FileAccessProxyExclusive, } } -- cgit v1.2.3 From a620bea045b018b717fbba3193975e6d97c09bf9 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 14 Aug 2018 19:02:36 -0700 Subject: Reduce map lookups in syserr PiperOrigin-RevId: 208755352 Change-Id: Ia24630f452a4a42940ab73a8113a2fd5ea2cfca2 --- pkg/sentry/socket/epsocket/epsocket.go | 2 +- pkg/syserr/BUILD | 1 - pkg/syserr/host_linux.go | 150 ++------------- pkg/syserr/linuxabi.go | 30 --- pkg/syserr/syserr.go | 342 ++++++++++++++++++--------------- 5 files changed, 211 insertions(+), 314 deletions(-) delete mode 100644 pkg/syserr/linuxabi.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 0000875e7..f8b24aaf1 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -473,7 +473,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, if err == nil { return int32(0), nil } - return 
int32(syserr.ToLinux(syserr.TranslateNetstackError(err)).Number()), nil + return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil case linux.SO_PEERCRED: if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index c0850f3d9..5dd2e90bb 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -6,7 +6,6 @@ go_library( name = "syserr", srcs = [ "host_linux.go", - "linuxabi.go", "netstack.go", "syserr.go", ], diff --git a/pkg/syserr/host_linux.go b/pkg/syserr/host_linux.go index ffd78e8f8..22009a799 100644 --- a/pkg/syserr/host_linux.go +++ b/pkg/syserr/host_linux.go @@ -17,144 +17,30 @@ package syserr import ( + "fmt" "syscall" ) -var linuxHostTranslations = map[syscall.Errno]*Error{ - syscall.EPERM: ErrNotPermitted, - syscall.ENOENT: ErrNoFileOrDir, - syscall.ESRCH: ErrNoProcess, - syscall.EINTR: ErrInterrupted, - syscall.EIO: ErrIO, - syscall.ENXIO: ErrDeviceOrAddress, - syscall.E2BIG: ErrTooManyArgs, - syscall.ENOEXEC: ErrEcec, - syscall.EBADF: ErrBadFD, - syscall.ECHILD: ErrNoChild, - syscall.EAGAIN: ErrTryAgain, - syscall.ENOMEM: ErrNoMemory, - syscall.EACCES: ErrPermissionDenied, - syscall.EFAULT: ErrBadAddress, - syscall.ENOTBLK: ErrNotBlockDevice, - syscall.EBUSY: ErrBusy, - syscall.EEXIST: ErrExists, - syscall.EXDEV: ErrCrossDeviceLink, - syscall.ENODEV: ErrNoDevice, - syscall.ENOTDIR: ErrNotDir, - syscall.EISDIR: ErrIsDir, - syscall.EINVAL: ErrInvalidArgument, - syscall.ENFILE: ErrFileTableOverflow, - syscall.EMFILE: ErrTooManyOpenFiles, - syscall.ENOTTY: ErrNotTTY, - syscall.ETXTBSY: ErrTestFileBusy, - syscall.EFBIG: ErrFileTooBig, - syscall.ENOSPC: ErrNoSpace, - syscall.ESPIPE: ErrIllegalSeek, - syscall.EROFS: ErrReadOnlyFS, - syscall.EMLINK: ErrTooManyLinks, - syscall.EPIPE: ErrBrokenPipe, - syscall.EDOM: ErrDomain, - syscall.ERANGE: ErrRange, - syscall.EDEADLOCK: ErrDeadlock, - syscall.ENAMETOOLONG: ErrNameTooLong, - syscall.ENOLCK: ErrNoLocksAvailable, - syscall.ENOSYS: 
ErrInvalidSyscall, - syscall.ENOTEMPTY: ErrDirNotEmpty, - syscall.ELOOP: ErrLinkLoop, - syscall.ENOMSG: ErrNoMessage, - syscall.EIDRM: ErrIdentifierRemoved, - syscall.ECHRNG: ErrChannelOutOfRange, - syscall.EL2NSYNC: ErrLevelTwoNotSynced, - syscall.EL3HLT: ErrLevelThreeHalted, - syscall.EL3RST: ErrLevelThreeReset, - syscall.ELNRNG: ErrLinkNumberOutOfRange, - syscall.EUNATCH: ErrProtocolDriverNotAttached, - syscall.ENOCSI: ErrNoCSIAvailable, - syscall.EL2HLT: ErrLevelTwoHalted, - syscall.EBADE: ErrInvalidExchange, - syscall.EBADR: ErrInvalidRequestDescriptor, - syscall.EXFULL: ErrExchangeFull, - syscall.ENOANO: ErrNoAnode, - syscall.EBADRQC: ErrInvalidRequestCode, - syscall.EBADSLT: ErrInvalidSlot, - syscall.EBFONT: ErrBadFontFile, - syscall.ENOSTR: ErrNotStream, - syscall.ENODATA: ErrNoDataAvailable, - syscall.ETIME: ErrTimerExpired, - syscall.ENOSR: ErrStreamsResourceDepleted, - syscall.ENONET: ErrMachineNotOnNetwork, - syscall.ENOPKG: ErrPackageNotInstalled, - syscall.EREMOTE: ErrIsRemote, - syscall.ENOLINK: ErrNoLink, - syscall.EADV: ErrAdvertise, - syscall.ESRMNT: ErrSRMount, - syscall.ECOMM: ErrSendCommunication, - syscall.EPROTO: ErrProtocol, - syscall.EMULTIHOP: ErrMultihopAttempted, - syscall.EDOTDOT: ErrRFS, - syscall.EBADMSG: ErrInvalidDataMessage, - syscall.EOVERFLOW: ErrOverflow, - syscall.ENOTUNIQ: ErrNetworkNameNotUnique, - syscall.EBADFD: ErrFDInBadState, - syscall.EREMCHG: ErrRemoteAddressChanged, - syscall.ELIBACC: ErrSharedLibraryInaccessible, - syscall.ELIBBAD: ErrCorruptedSharedLibrary, - syscall.ELIBSCN: ErrLibSectionCorrupted, - syscall.ELIBMAX: ErrTooManySharedLibraries, - syscall.ELIBEXEC: ErrSharedLibraryExeced, - syscall.EILSEQ: ErrIllegalByteSequence, - syscall.ERESTART: ErrShouldRestart, - syscall.ESTRPIPE: ErrStreamPipe, - syscall.EUSERS: ErrTooManyUsers, - syscall.ENOTSOCK: ErrNotASocket, - syscall.EDESTADDRREQ: ErrDestinationAddressRequired, - syscall.EMSGSIZE: ErrMessageTooLong, - syscall.EPROTOTYPE: ErrWrongProtocolForSocket, - 
syscall.ENOPROTOOPT: ErrProtocolNotAvailable, - syscall.EPROTONOSUPPORT: ErrProtocolNotSupported, - syscall.ESOCKTNOSUPPORT: ErrSocketNotSupported, - syscall.EOPNOTSUPP: ErrEndpointOperation, - syscall.EPFNOSUPPORT: ErrProtocolFamilyNotSupported, - syscall.EAFNOSUPPORT: ErrAddressFamilyNotSupported, - syscall.EADDRINUSE: ErrAddressInUse, - syscall.EADDRNOTAVAIL: ErrAddressNotAvailable, - syscall.ENETDOWN: ErrNetworkDown, - syscall.ENETUNREACH: ErrNetworkUnreachable, - syscall.ENETRESET: ErrNetworkReset, - syscall.ECONNABORTED: ErrConnectionAborted, - syscall.ECONNRESET: ErrConnectionReset, - syscall.ENOBUFS: ErrNoBufferSpace, - syscall.EISCONN: ErrAlreadyConnected, - syscall.ENOTCONN: ErrNotConnected, - syscall.ESHUTDOWN: ErrShutdown, - syscall.ETOOMANYREFS: ErrTooManyRefs, - syscall.ETIMEDOUT: ErrTimedOut, - syscall.ECONNREFUSED: ErrConnectionRefused, - syscall.EHOSTDOWN: ErrHostDown, - syscall.EHOSTUNREACH: ErrNoRoute, - syscall.EALREADY: ErrAlreadyInProgress, - syscall.EINPROGRESS: ErrInProgress, - syscall.ESTALE: ErrStaleFileHandle, - syscall.EUCLEAN: ErrStructureNeedsCleaning, - syscall.ENOTNAM: ErrIsNamedFile, - syscall.EREMOTEIO: ErrRemoteIO, - syscall.EDQUOT: ErrQuotaExceeded, - syscall.ENOMEDIUM: ErrNoMedium, - syscall.EMEDIUMTYPE: ErrWrongMediumType, - syscall.ECANCELED: ErrCanceled, - syscall.ENOKEY: ErrNoKey, - syscall.EKEYEXPIRED: ErrKeyExpired, - syscall.EKEYREVOKED: ErrKeyRevoked, - syscall.EKEYREJECTED: ErrKeyRejected, - syscall.EOWNERDEAD: ErrOwnerDied, - syscall.ENOTRECOVERABLE: ErrNotRecoverable, +const maxErrno = 134 + +type linuxHostTranslation struct { + err *Error + ok bool } +var linuxHostTranslations [maxErrno]linuxHostTranslation + // FromHost translates a syscall.Errno to a corresponding Error value. 
func FromHost(err syscall.Errno) *Error { - e, ok := linuxHostTranslations[err] - if !ok { - panic("Unknown host errno " + err.Error()) + if err < 0 || int(err) >= len(linuxHostTranslations) || !linuxHostTranslations[err].ok { + panic(fmt.Sprintf("unknown host errno %q (%d)", err.Error(), err)) + } + return linuxHostTranslations[err].err +} + +func addLinuxHostTranslation(host syscall.Errno, trans *Error) { + if linuxHostTranslations[host].ok { + panic(fmt.Sprintf("duplicate translation for host errno %q (%d)", host.Error(), host)) } - return e + linuxHostTranslations[host] = linuxHostTranslation{err: trans, ok: true} } diff --git a/pkg/syserr/linuxabi.go b/pkg/syserr/linuxabi.go deleted file mode 100644 index 71e3d1a81..000000000 --- a/pkg/syserr/linuxabi.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package syserr - -import ( - "gvisor.googlesource.com/gvisor/pkg/abi/linux" -) - -var linuxABITranslations = map[*Error]*linux.Errno{} - -// ToLinux translates an Error to a corresponding *linux.Errno value. -func ToLinux(err *Error) *linux.Errno { - le, ok := linuxABITranslations[err] - if !ok { - panic("No Linux ABI translation available for " + err.String()) - } - return le -} diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index 5d9fa24de..dba6cb7de 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -27,30 +27,43 @@ import ( // Error represents an internal error. 
type Error struct { - string + // message is the human readable form of this Error. + message string + + // noTranslation indicates that this Error cannot be translated to a + // linux.Errno. + noTranslation bool + + // errno is the linux.Errno this Error should be translated to. nil means + // that this Error should be translated to a nil linux.Errno. + errno *linux.Errno } // New creates a new Error and adds a translation for it. // // New must only be called at init. func New(message string, linuxTranslation *linux.Errno) *Error { - err := &Error{message} - linuxABITranslations[err] = linuxTranslation + err := &Error{message: message, errno: linuxTranslation} - // TODO: Remove this. if linuxTranslation == nil { - linuxBackwardsTranslations[err] = nil - } else { - e := error(syscall.Errno(linuxTranslation.Number())) - // syserror.ErrWouldBlock gets translated to syserror.EWOULDBLOCK and - // enables proper blocking semantics. This should temporary address the - // class of blocking bugs that keep popping up with the current state of - // the error space. - if e == syserror.EWOULDBLOCK { - e = syserror.ErrWouldBlock - } - linuxBackwardsTranslations[err] = e + return err + } + + // TODO: Remove this. + errno := linuxTranslation.Number() + if errno <= 0 || errno >= len(linuxBackwardsTranslations) { + panic(fmt.Sprint("invalid errno: ", errno)) + } + + e := error(syscall.Errno(errno)) + // syserror.ErrWouldBlock gets translated to syserror.EWOULDBLOCK and + // enables proper blocking semantics. This should temporary address the + // class of blocking bugs that keep popping up with the current state of + // the error space. + if e == syserror.EWOULDBLOCK { + e = syserror.ErrWouldBlock } + linuxBackwardsTranslations[errno] = linuxBackwardsTranslation{err: e, ok: true} return err } @@ -58,7 +71,13 @@ func New(message string, linuxTranslation *linux.Errno) *Error { // NewWithoutTranslation creates a new Error. 
If translation is attempted on // the error, translation will fail. func NewWithoutTranslation(message string) *Error { - return &Error{message} + return &Error{message: message, noTranslation: true} +} + +func newWithHost(message string, linuxTranslation *linux.Errno, hostErrno syscall.Errno) *Error { + e := New(message, linuxTranslation) + addLinuxHostTranslation(hostErrno, e) + return e } // String implements fmt.Stringer.String. @@ -66,11 +85,16 @@ func (e *Error) String() string { if e == nil { return "" } - return e.string + return e.message +} + +type linuxBackwardsTranslation struct { + err error + ok bool } // TODO: Remove this. -var linuxBackwardsTranslations = map[*Error]error{} +var linuxBackwardsTranslations [maxErrno]linuxBackwardsTranslation // ToError translates an Error to a corresponding error value. // @@ -79,11 +103,26 @@ func (e *Error) ToError() error { if e == nil { return nil } - err, ok := linuxBackwardsTranslations[e] - if !ok { - panic(fmt.Sprintf("unknown error: %q", e.string)) + if e.noTranslation { + panic(fmt.Sprintf("error %q does not support translation", e.message)) } - return err + if e.errno == nil { + return nil + } + errno := e.errno.Number() + if errno <= 0 || errno >= len(linuxBackwardsTranslations) || !linuxBackwardsTranslations[errno].ok { + panic(fmt.Sprintf("unknown error %q (%d)", e.message, errno)) + } + return linuxBackwardsTranslations[errno].err +} + +// ToLinux converts the Error to a Linux ABI error that can be returned to the +// application. +func (e *Error) ToLinux() *linux.Errno { + if e.noTranslation { + panic(fmt.Sprintf("No Linux ABI translation available for %q", e.message)) + } + return e.errno } // TODO: Remove or replace most of these errors. @@ -91,134 +130,137 @@ func (e *Error) ToError() error { // Some of the errors should be replaced with package specific errors and // others should be removed entirely. 
var ( - ErrNotPermitted = New("operation not permitted", linux.EPERM) - ErrNoFileOrDir = New("no such file or directory", linux.ENOENT) - ErrNoProcess = New("no such process", linux.ESRCH) - ErrInterrupted = New("interrupted system call", linux.EINTR) - ErrIO = New("I/O error", linux.EIO) - ErrDeviceOrAddress = New("no such device or address", linux.ENXIO) - ErrTooManyArgs = New("argument list too long", linux.E2BIG) - ErrEcec = New("exec format error", linux.ENOEXEC) - ErrBadFD = New("bad file number", linux.EBADF) - ErrNoChild = New("no child processes", linux.ECHILD) - ErrTryAgain = New("try again", linux.EAGAIN) - ErrNoMemory = New("out of memory", linux.ENOMEM) - ErrPermissionDenied = New("permission denied", linux.EACCES) - ErrBadAddress = New("bad address", linux.EFAULT) - ErrNotBlockDevice = New("block device required", linux.ENOTBLK) - ErrBusy = New("device or resource busy", linux.EBUSY) - ErrExists = New("file exists", linux.EEXIST) - ErrCrossDeviceLink = New("cross-device link", linux.EXDEV) - ErrNoDevice = New("no such device", linux.ENODEV) - ErrNotDir = New("not a directory", linux.ENOTDIR) - ErrIsDir = New("is a directory", linux.EISDIR) - ErrInvalidArgument = New("invalid argument", linux.EINVAL) - ErrFileTableOverflow = New("file table overflow", linux.ENFILE) - ErrTooManyOpenFiles = New("too many open files", linux.EMFILE) - ErrNotTTY = New("not a typewriter", linux.ENOTTY) - ErrTestFileBusy = New("text file busy", linux.ETXTBSY) - ErrFileTooBig = New("file too large", linux.EFBIG) - ErrNoSpace = New("no space left on device", linux.ENOSPC) - ErrIllegalSeek = New("illegal seek", linux.ESPIPE) - ErrReadOnlyFS = New("read-only file system", linux.EROFS) - ErrTooManyLinks = New("too many links", linux.EMLINK) - ErrBrokenPipe = New("broken pipe", linux.EPIPE) - ErrDomain = New("math argument out of domain of func", linux.EDOM) - ErrRange = New("math result not representable", linux.ERANGE) - ErrDeadlock = New("resource deadlock would occur", 
linux.EDEADLOCK) - ErrNameTooLong = New("file name too long", linux.ENAMETOOLONG) - ErrNoLocksAvailable = New("no record locks available", linux.ENOLCK) - ErrInvalidSyscall = New("invalid system call number", linux.ENOSYS) - ErrDirNotEmpty = New("directory not empty", linux.ENOTEMPTY) - ErrLinkLoop = New("too many symbolic links encountered", linux.ELOOP) - ErrWouldBlock = New("operation would block", linux.EWOULDBLOCK) - ErrNoMessage = New("no message of desired type", linux.ENOMSG) - ErrIdentifierRemoved = New("identifier removed", linux.EIDRM) - ErrChannelOutOfRange = New("channel number out of range", linux.ECHRNG) - ErrLevelTwoNotSynced = New("level 2 not synchronized", linux.EL2NSYNC) - ErrLevelThreeHalted = New("level 3 halted", linux.EL3HLT) - ErrLevelThreeReset = New("level 3 reset", linux.EL3RST) - ErrLinkNumberOutOfRange = New("link number out of range", linux.ELNRNG) - ErrProtocolDriverNotAttached = New("protocol driver not attached", linux.EUNATCH) - ErrNoCSIAvailable = New("no CSI structure available", linux.ENOCSI) - ErrLevelTwoHalted = New("level 2 halted", linux.EL2HLT) - ErrInvalidExchange = New("invalid exchange", linux.EBADE) - ErrInvalidRequestDescriptor = New("invalid request descriptor", linux.EBADR) - ErrExchangeFull = New("exchange full", linux.EXFULL) - ErrNoAnode = New("no anode", linux.ENOANO) - ErrInvalidRequestCode = New("invalid request code", linux.EBADRQC) - ErrInvalidSlot = New("invalid slot", linux.EBADSLT) - ErrBadFontFile = New("bad font file format", linux.EBFONT) - ErrNotStream = New("device not a stream", linux.ENOSTR) - ErrNoDataAvailable = New("no data available", linux.ENODATA) - ErrTimerExpired = New("timer expired", linux.ETIME) - ErrStreamsResourceDepleted = New("out of streams resources", linux.ENOSR) - ErrMachineNotOnNetwork = New("machine is not on the network", linux.ENONET) - ErrPackageNotInstalled = New("package not installed", linux.ENOPKG) - ErrIsRemote = New("object is remote", linux.EREMOTE) - ErrNoLink = 
New("link has been severed", linux.ENOLINK) - ErrAdvertise = New("advertise error", linux.EADV) - ErrSRMount = New("srmount error", linux.ESRMNT) - ErrSendCommunication = New("communication error on send", linux.ECOMM) - ErrProtocol = New("protocol error", linux.EPROTO) - ErrMultihopAttempted = New("multihop attempted", linux.EMULTIHOP) - ErrRFS = New("RFS specific error", linux.EDOTDOT) - ErrInvalidDataMessage = New("not a data message", linux.EBADMSG) - ErrOverflow = New("value too large for defined data type", linux.EOVERFLOW) - ErrNetworkNameNotUnique = New("name not unique on network", linux.ENOTUNIQ) - ErrFDInBadState = New("file descriptor in bad state", linux.EBADFD) - ErrRemoteAddressChanged = New("remote address changed", linux.EREMCHG) - ErrSharedLibraryInaccessible = New("can not access a needed shared library", linux.ELIBACC) - ErrCorruptedSharedLibrary = New("accessing a corrupted shared library", linux.ELIBBAD) - ErrLibSectionCorrupted = New(".lib section in a.out corrupted", linux.ELIBSCN) - ErrTooManySharedLibraries = New("attempting to link in too many shared libraries", linux.ELIBMAX) - ErrSharedLibraryExeced = New("cannot exec a shared library directly", linux.ELIBEXEC) - ErrIllegalByteSequence = New("illegal byte sequence", linux.EILSEQ) - ErrShouldRestart = New("interrupted system call should be restarted", linux.ERESTART) - ErrStreamPipe = New("streams pipe error", linux.ESTRPIPE) - ErrTooManyUsers = New("too many users", linux.EUSERS) - ErrNotASocket = New("socket operation on non-socket", linux.ENOTSOCK) - ErrDestinationAddressRequired = New("destination address required", linux.EDESTADDRREQ) - ErrMessageTooLong = New("message too long", linux.EMSGSIZE) - ErrWrongProtocolForSocket = New("protocol wrong type for socket", linux.EPROTOTYPE) - ErrProtocolNotAvailable = New("protocol not available", linux.ENOPROTOOPT) - ErrProtocolNotSupported = New("protocol not supported", linux.EPROTONOSUPPORT) - ErrSocketNotSupported = New("socket type not 
supported", linux.ESOCKTNOSUPPORT) - ErrEndpointOperation = New("operation not supported on transport endpoint", linux.EOPNOTSUPP) - ErrProtocolFamilyNotSupported = New("protocol family not supported", linux.EPFNOSUPPORT) - ErrAddressFamilyNotSupported = New("address family not supported by protocol", linux.EAFNOSUPPORT) - ErrAddressInUse = New("address already in use", linux.EADDRINUSE) - ErrAddressNotAvailable = New("cannot assign requested address", linux.EADDRNOTAVAIL) - ErrNetworkDown = New("network is down", linux.ENETDOWN) - ErrNetworkUnreachable = New("network is unreachable", linux.ENETUNREACH) - ErrNetworkReset = New("network dropped connection because of reset", linux.ENETRESET) - ErrConnectionAborted = New("software caused connection abort", linux.ECONNABORTED) - ErrConnectionReset = New("connection reset by peer", linux.ECONNRESET) - ErrNoBufferSpace = New("no buffer space available", linux.ENOBUFS) - ErrAlreadyConnected = New("transport endpoint is already connected", linux.EISCONN) - ErrNotConnected = New("transport endpoint is not connected", linux.ENOTCONN) - ErrShutdown = New("cannot send after transport endpoint shutdown", linux.ESHUTDOWN) - ErrTooManyRefs = New("too many references: cannot splice", linux.ETOOMANYREFS) - ErrTimedOut = New("connection timed out", linux.ETIMEDOUT) - ErrConnectionRefused = New("connection refused", linux.ECONNREFUSED) - ErrHostDown = New("host is down", linux.EHOSTDOWN) - ErrNoRoute = New("no route to host", linux.EHOSTUNREACH) - ErrAlreadyInProgress = New("operation already in progress", linux.EALREADY) - ErrInProgress = New("operation now in progress", linux.EINPROGRESS) - ErrStaleFileHandle = New("stale file handle", linux.ESTALE) - ErrStructureNeedsCleaning = New("structure needs cleaning", linux.EUCLEAN) - ErrIsNamedFile = New("is a named type file", linux.ENOTNAM) - ErrRemoteIO = New("remote I/O error", linux.EREMOTEIO) - ErrQuotaExceeded = New("quota exceeded", linux.EDQUOT) - ErrNoMedium = New("no medium 
found", linux.ENOMEDIUM) - ErrWrongMediumType = New("wrong medium type", linux.EMEDIUMTYPE) - ErrCanceled = New("operation Canceled", linux.ECANCELED) - ErrNoKey = New("required key not available", linux.ENOKEY) - ErrKeyExpired = New("key has expired", linux.EKEYEXPIRED) - ErrKeyRevoked = New("key has been revoked", linux.EKEYREVOKED) - ErrKeyRejected = New("key was rejected by service", linux.EKEYREJECTED) - ErrOwnerDied = New("owner died", linux.EOWNERDEAD) - ErrNotRecoverable = New("state not recoverable", linux.ENOTRECOVERABLE) + ErrNotPermitted = newWithHost("operation not permitted", linux.EPERM, syscall.EPERM) + ErrNoFileOrDir = newWithHost("no such file or directory", linux.ENOENT, syscall.ENOENT) + ErrNoProcess = newWithHost("no such process", linux.ESRCH, syscall.ESRCH) + ErrInterrupted = newWithHost("interrupted system call", linux.EINTR, syscall.EINTR) + ErrIO = newWithHost("I/O error", linux.EIO, syscall.EIO) + ErrDeviceOrAddress = newWithHost("no such device or address", linux.ENXIO, syscall.ENXIO) + ErrTooManyArgs = newWithHost("argument list too long", linux.E2BIG, syscall.E2BIG) + ErrEcec = newWithHost("exec format error", linux.ENOEXEC, syscall.ENOEXEC) + ErrBadFD = newWithHost("bad file number", linux.EBADF, syscall.EBADF) + ErrNoChild = newWithHost("no child processes", linux.ECHILD, syscall.ECHILD) + ErrTryAgain = newWithHost("try again", linux.EAGAIN, syscall.EAGAIN) + ErrNoMemory = newWithHost("out of memory", linux.ENOMEM, syscall.ENOMEM) + ErrPermissionDenied = newWithHost("permission denied", linux.EACCES, syscall.EACCES) + ErrBadAddress = newWithHost("bad address", linux.EFAULT, syscall.EFAULT) + ErrNotBlockDevice = newWithHost("block device required", linux.ENOTBLK, syscall.ENOTBLK) + ErrBusy = newWithHost("device or resource busy", linux.EBUSY, syscall.EBUSY) + ErrExists = newWithHost("file exists", linux.EEXIST, syscall.EEXIST) + ErrCrossDeviceLink = newWithHost("cross-device link", linux.EXDEV, syscall.EXDEV) + ErrNoDevice = 
newWithHost("no such device", linux.ENODEV, syscall.ENODEV) + ErrNotDir = newWithHost("not a directory", linux.ENOTDIR, syscall.ENOTDIR) + ErrIsDir = newWithHost("is a directory", linux.EISDIR, syscall.EISDIR) + ErrInvalidArgument = newWithHost("invalid argument", linux.EINVAL, syscall.EINVAL) + ErrFileTableOverflow = newWithHost("file table overflow", linux.ENFILE, syscall.ENFILE) + ErrTooManyOpenFiles = newWithHost("too many open files", linux.EMFILE, syscall.EMFILE) + ErrNotTTY = newWithHost("not a typewriter", linux.ENOTTY, syscall.ENOTTY) + ErrTestFileBusy = newWithHost("text file busy", linux.ETXTBSY, syscall.ETXTBSY) + ErrFileTooBig = newWithHost("file too large", linux.EFBIG, syscall.EFBIG) + ErrNoSpace = newWithHost("no space left on device", linux.ENOSPC, syscall.ENOSPC) + ErrIllegalSeek = newWithHost("illegal seek", linux.ESPIPE, syscall.ESPIPE) + ErrReadOnlyFS = newWithHost("read-only file system", linux.EROFS, syscall.EROFS) + ErrTooManyLinks = newWithHost("too many links", linux.EMLINK, syscall.EMLINK) + ErrBrokenPipe = newWithHost("broken pipe", linux.EPIPE, syscall.EPIPE) + ErrDomain = newWithHost("math argument out of domain of func", linux.EDOM, syscall.EDOM) + ErrRange = newWithHost("math result not representable", linux.ERANGE, syscall.ERANGE) + ErrDeadlock = newWithHost("resource deadlock would occur", linux.EDEADLOCK, syscall.EDEADLOCK) + ErrNameTooLong = newWithHost("file name too long", linux.ENAMETOOLONG, syscall.ENAMETOOLONG) + ErrNoLocksAvailable = newWithHost("no record locks available", linux.ENOLCK, syscall.ENOLCK) + ErrInvalidSyscall = newWithHost("invalid system call number", linux.ENOSYS, syscall.ENOSYS) + ErrDirNotEmpty = newWithHost("directory not empty", linux.ENOTEMPTY, syscall.ENOTEMPTY) + ErrLinkLoop = newWithHost("too many symbolic links encountered", linux.ELOOP, syscall.ELOOP) + ErrNoMessage = newWithHost("no message of desired type", linux.ENOMSG, syscall.ENOMSG) + ErrIdentifierRemoved = newWithHost("identifier removed", 
linux.EIDRM, syscall.EIDRM) + ErrChannelOutOfRange = newWithHost("channel number out of range", linux.ECHRNG, syscall.ECHRNG) + ErrLevelTwoNotSynced = newWithHost("level 2 not synchronized", linux.EL2NSYNC, syscall.EL2NSYNC) + ErrLevelThreeHalted = newWithHost("level 3 halted", linux.EL3HLT, syscall.EL3HLT) + ErrLevelThreeReset = newWithHost("level 3 reset", linux.EL3RST, syscall.EL3RST) + ErrLinkNumberOutOfRange = newWithHost("link number out of range", linux.ELNRNG, syscall.ELNRNG) + ErrProtocolDriverNotAttached = newWithHost("protocol driver not attached", linux.EUNATCH, syscall.EUNATCH) + ErrNoCSIAvailable = newWithHost("no CSI structure available", linux.ENOCSI, syscall.ENOCSI) + ErrLevelTwoHalted = newWithHost("level 2 halted", linux.EL2HLT, syscall.EL2HLT) + ErrInvalidExchange = newWithHost("invalid exchange", linux.EBADE, syscall.EBADE) + ErrInvalidRequestDescriptor = newWithHost("invalid request descriptor", linux.EBADR, syscall.EBADR) + ErrExchangeFull = newWithHost("exchange full", linux.EXFULL, syscall.EXFULL) + ErrNoAnode = newWithHost("no anode", linux.ENOANO, syscall.ENOANO) + ErrInvalidRequestCode = newWithHost("invalid request code", linux.EBADRQC, syscall.EBADRQC) + ErrInvalidSlot = newWithHost("invalid slot", linux.EBADSLT, syscall.EBADSLT) + ErrBadFontFile = newWithHost("bad font file format", linux.EBFONT, syscall.EBFONT) + ErrNotStream = newWithHost("device not a stream", linux.ENOSTR, syscall.ENOSTR) + ErrNoDataAvailable = newWithHost("no data available", linux.ENODATA, syscall.ENODATA) + ErrTimerExpired = newWithHost("timer expired", linux.ETIME, syscall.ETIME) + ErrStreamsResourceDepleted = newWithHost("out of streams resources", linux.ENOSR, syscall.ENOSR) + ErrMachineNotOnNetwork = newWithHost("machine is not on the network", linux.ENONET, syscall.ENONET) + ErrPackageNotInstalled = newWithHost("package not installed", linux.ENOPKG, syscall.ENOPKG) + ErrIsRemote = newWithHost("object is remote", linux.EREMOTE, syscall.EREMOTE) + ErrNoLink 
= newWithHost("link has been severed", linux.ENOLINK, syscall.ENOLINK) + ErrAdvertise = newWithHost("advertise error", linux.EADV, syscall.EADV) + ErrSRMount = newWithHost("srmount error", linux.ESRMNT, syscall.ESRMNT) + ErrSendCommunication = newWithHost("communication error on send", linux.ECOMM, syscall.ECOMM) + ErrProtocol = newWithHost("protocol error", linux.EPROTO, syscall.EPROTO) + ErrMultihopAttempted = newWithHost("multihop attempted", linux.EMULTIHOP, syscall.EMULTIHOP) + ErrRFS = newWithHost("RFS specific error", linux.EDOTDOT, syscall.EDOTDOT) + ErrInvalidDataMessage = newWithHost("not a data message", linux.EBADMSG, syscall.EBADMSG) + ErrOverflow = newWithHost("value too large for defined data type", linux.EOVERFLOW, syscall.EOVERFLOW) + ErrNetworkNameNotUnique = newWithHost("name not unique on network", linux.ENOTUNIQ, syscall.ENOTUNIQ) + ErrFDInBadState = newWithHost("file descriptor in bad state", linux.EBADFD, syscall.EBADFD) + ErrRemoteAddressChanged = newWithHost("remote address changed", linux.EREMCHG, syscall.EREMCHG) + ErrSharedLibraryInaccessible = newWithHost("can not access a needed shared library", linux.ELIBACC, syscall.ELIBACC) + ErrCorruptedSharedLibrary = newWithHost("accessing a corrupted shared library", linux.ELIBBAD, syscall.ELIBBAD) + ErrLibSectionCorrupted = newWithHost(".lib section in a.out corrupted", linux.ELIBSCN, syscall.ELIBSCN) + ErrTooManySharedLibraries = newWithHost("attempting to link in too many shared libraries", linux.ELIBMAX, syscall.ELIBMAX) + ErrSharedLibraryExeced = newWithHost("cannot exec a shared library directly", linux.ELIBEXEC, syscall.ELIBEXEC) + ErrIllegalByteSequence = newWithHost("illegal byte sequence", linux.EILSEQ, syscall.EILSEQ) + ErrShouldRestart = newWithHost("interrupted system call should be restarted", linux.ERESTART, syscall.ERESTART) + ErrStreamPipe = newWithHost("streams pipe error", linux.ESTRPIPE, syscall.ESTRPIPE) + ErrTooManyUsers = newWithHost("too many users", linux.EUSERS, 
syscall.EUSERS) + ErrNotASocket = newWithHost("socket operation on non-socket", linux.ENOTSOCK, syscall.ENOTSOCK) + ErrDestinationAddressRequired = newWithHost("destination address required", linux.EDESTADDRREQ, syscall.EDESTADDRREQ) + ErrMessageTooLong = newWithHost("message too long", linux.EMSGSIZE, syscall.EMSGSIZE) + ErrWrongProtocolForSocket = newWithHost("protocol wrong type for socket", linux.EPROTOTYPE, syscall.EPROTOTYPE) + ErrProtocolNotAvailable = newWithHost("protocol not available", linux.ENOPROTOOPT, syscall.ENOPROTOOPT) + ErrProtocolNotSupported = newWithHost("protocol not supported", linux.EPROTONOSUPPORT, syscall.EPROTONOSUPPORT) + ErrSocketNotSupported = newWithHost("socket type not supported", linux.ESOCKTNOSUPPORT, syscall.ESOCKTNOSUPPORT) + ErrEndpointOperation = newWithHost("operation not supported on transport endpoint", linux.EOPNOTSUPP, syscall.EOPNOTSUPP) + ErrProtocolFamilyNotSupported = newWithHost("protocol family not supported", linux.EPFNOSUPPORT, syscall.EPFNOSUPPORT) + ErrAddressFamilyNotSupported = newWithHost("address family not supported by protocol", linux.EAFNOSUPPORT, syscall.EAFNOSUPPORT) + ErrAddressInUse = newWithHost("address already in use", linux.EADDRINUSE, syscall.EADDRINUSE) + ErrAddressNotAvailable = newWithHost("cannot assign requested address", linux.EADDRNOTAVAIL, syscall.EADDRNOTAVAIL) + ErrNetworkDown = newWithHost("network is down", linux.ENETDOWN, syscall.ENETDOWN) + ErrNetworkUnreachable = newWithHost("network is unreachable", linux.ENETUNREACH, syscall.ENETUNREACH) + ErrNetworkReset = newWithHost("network dropped connection because of reset", linux.ENETRESET, syscall.ENETRESET) + ErrConnectionAborted = newWithHost("software caused connection abort", linux.ECONNABORTED, syscall.ECONNABORTED) + ErrConnectionReset = newWithHost("connection reset by peer", linux.ECONNRESET, syscall.ECONNRESET) + ErrNoBufferSpace = newWithHost("no buffer space available", linux.ENOBUFS, syscall.ENOBUFS) + ErrAlreadyConnected = 
newWithHost("transport endpoint is already connected", linux.EISCONN, syscall.EISCONN) + ErrNotConnected = newWithHost("transport endpoint is not connected", linux.ENOTCONN, syscall.ENOTCONN) + ErrShutdown = newWithHost("cannot send after transport endpoint shutdown", linux.ESHUTDOWN, syscall.ESHUTDOWN) + ErrTooManyRefs = newWithHost("too many references: cannot splice", linux.ETOOMANYREFS, syscall.ETOOMANYREFS) + ErrTimedOut = newWithHost("connection timed out", linux.ETIMEDOUT, syscall.ETIMEDOUT) + ErrConnectionRefused = newWithHost("connection refused", linux.ECONNREFUSED, syscall.ECONNREFUSED) + ErrHostDown = newWithHost("host is down", linux.EHOSTDOWN, syscall.EHOSTDOWN) + ErrNoRoute = newWithHost("no route to host", linux.EHOSTUNREACH, syscall.EHOSTUNREACH) + ErrAlreadyInProgress = newWithHost("operation already in progress", linux.EALREADY, syscall.EALREADY) + ErrInProgress = newWithHost("operation now in progress", linux.EINPROGRESS, syscall.EINPROGRESS) + ErrStaleFileHandle = newWithHost("stale file handle", linux.ESTALE, syscall.ESTALE) + ErrStructureNeedsCleaning = newWithHost("structure needs cleaning", linux.EUCLEAN, syscall.EUCLEAN) + ErrIsNamedFile = newWithHost("is a named type file", linux.ENOTNAM, syscall.ENOTNAM) + ErrRemoteIO = newWithHost("remote I/O error", linux.EREMOTEIO, syscall.EREMOTEIO) + ErrQuotaExceeded = newWithHost("quota exceeded", linux.EDQUOT, syscall.EDQUOT) + ErrNoMedium = newWithHost("no medium found", linux.ENOMEDIUM, syscall.ENOMEDIUM) + ErrWrongMediumType = newWithHost("wrong medium type", linux.EMEDIUMTYPE, syscall.EMEDIUMTYPE) + ErrCanceled = newWithHost("operation canceled", linux.ECANCELED, syscall.ECANCELED) + ErrNoKey = newWithHost("required key not available", linux.ENOKEY, syscall.ENOKEY) + ErrKeyExpired = newWithHost("key has expired", linux.EKEYEXPIRED, syscall.EKEYEXPIRED) + ErrKeyRevoked = newWithHost("key has been revoked", linux.EKEYREVOKED, syscall.EKEYREVOKED) + ErrKeyRejected = newWithHost("key was rejected 
by service", linux.EKEYREJECTED, syscall.EKEYREJECTED) + ErrOwnerDied = newWithHost("owner died", linux.EOWNERDEAD, syscall.EOWNERDEAD) + ErrNotRecoverable = newWithHost("state not recoverable", linux.ENOTRECOVERABLE, syscall.ENOTRECOVERABLE) + + // ErrWouldBlock translates to EWOULDBLOCK which is the same as EAGAIN + // on Linux. + ErrWouldBlock = New("operation would block", linux.EWOULDBLOCK) ) // FromError converts a generic error to an *Error. -- cgit v1.2.3 From 635b0c45933cd841298b0c21a513a9169e849594 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 15 Aug 2018 16:24:07 -0700 Subject: runsc fsgofer: Support dynamic serving of filesystems. When multiple containers run inside a sentry, each container has its own root filesystem and set of mounts. Containers are also added after sentry boot rather than all configured and known at boot time. The fsgofer needs to be able to serve the root filesystem of each container. Thus, it must be possible to add filesystems after the fsgofer has already started. This change: * Creates a URPC endpoint within the gofer process that listens for requests to serve new content. * Enables the sentry, when starting a new container, to add the new container's filesystem. * Mounts those new filesystems at separate roots within the sentry. 
PiperOrigin-RevId: 208903248 Change-Id: Ifa91ec9c8caf5f2f0a9eead83c4a57090ce92068 --- pkg/sentry/kernel/kernel.go | 14 ++- pkg/urpc/urpc.go | 10 +++ runsc/boot/controller.go | 19 +++- runsc/boot/fs.go | 159 ++++++++++++++++++++++++--------- runsc/boot/loader.go | 36 ++++---- runsc/cmd/BUILD | 1 - runsc/cmd/gofer.go | 36 +++----- runsc/container/container.go | 7 ++ runsc/fsgofer/BUILD | 4 + runsc/fsgofer/control.go | 203 +++++++++++++++++++++++++++++++++++++++++++ runsc/fsgofer/fsgofer.go | 5 ++ runsc/sandbox/BUILD | 1 + runsc/sandbox/sandbox.go | 139 ++++++++++++++++++++++------- 13 files changed, 515 insertions(+), 119 deletions(-) create mode 100644 runsc/fsgofer/control.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 419a1d473..cb43fdcdc 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -504,6 +504,14 @@ type CreateProcessArgs struct { // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace + + // Root optionally contains the dirent that serves as the root for the + // process. If nil, the mount namespace's root is used as the process' + // root. + // + // Anyone setting Root must donate a reference (i.e. increment it) to + // keep it alive until it is decremented by CreateProcess. + Root *fs.Dirent } // NewContext returns a context.Context that represents the task that will be @@ -581,8 +589,12 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { ctx := args.NewContext(k) // Grab the root directory. - root := fs.RootFromContext(ctx) + root := args.Root + if root == nil { + root = fs.RootFromContext(ctx) + } defer root.DecRef() + args.Root = nil // Grab the working directory. wd := root // Default. diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index af620b704..1ec06dd4c 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -63,6 +63,10 @@ func (r RemoteError) Error() string { // file as a result of an RPC. 
These are not actually serialized, rather they // are sent via an accompanying SCM_RIGHTS message (plumbed through the unet // package). +// +// When embedding a FilePayload in an argument struct, the argument type _must_ +// be a pointer to the struct rather than the struct type itself. This is +// because the urpc package defines pointer methods on FilePayload. type FilePayload struct { Files []*os.File `json:"-"` } @@ -552,6 +556,12 @@ func (c *Client) Call(method string, arg interface{}, result interface{}) error c.mu.Lock() defer c.mu.Unlock() + // If arg is a FilePayload, not a *FilePayload, files won't actually be + // sent, so error out. + if _, ok := arg.(FilePayload); ok { + return fmt.Errorf("argument is a FilePayload, but should be a *FilePayload") + } + // Are there files to send? var fs []*os.File if fp, ok := arg.(filePayloader); ok { diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fc6ea326a..69e88d8e0 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -17,6 +17,7 @@ package boot import ( "errors" "fmt" + "path" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" @@ -181,11 +182,15 @@ type StartArgs struct { // CID is the ID of the container to start. CID string + + // FilePayload contains the file descriptor over which the sandbox will + // request files from its root filesystem. + urpc.FilePayload } // Start runs a created container within a sandbox. func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { - log.Debugf("containerManager.Start") + log.Debugf("containerManager.Start: %+v", args) // Validate arguments. if args == nil { @@ -200,8 +205,18 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/ directory. 
+ // TODO: Once we have multiple independant roots, this + // check won't be necessary. + if path.Clean(args.CID) != args.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) + } + if len(args.FilePayload.Files) != 1 { + return fmt.Errorf("start arguments must contain one file for the container root") + } - tgid, err := cm.l.startContainer(args, cm.l.k) + tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files[0]) if err != nil { return err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index eea2ec1f5..8996b1398 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -48,6 +48,19 @@ const ( // Device name for root mount. rootDevice = "9pfs-/" + + // childContainersDir is the directory where child container root + // filesystems are mounted. + childContainersDir = "/__runsc_containers__" + + // Filesystems that runsc supports. + bind = "bind" + devpts = "devpts" + devtmpfs = "devtmpfs" + proc = "proc" + sysfs = "sysfs" + tmpfs = "tmpfs" + nonefs = "none" ) type fdDispenser struct { @@ -70,8 +83,15 @@ func (f *fdDispenser) empty() bool { // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { + mounts := compileMounts(spec) + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. 
+ mounts = append(mounts, specs.Mount{ + Type: tmpfs, + Destination: childContainersDir, + }) fds := &fdDispenser{fds: ioFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds) + rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) } @@ -79,7 +99,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec if err != nil { return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } - mounts := compileMounts(spec) + if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil { return nil, fmt.Errorf("failed to configure mounts: %v", err) } @@ -98,12 +118,12 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // Always mount /dev. mounts = append(mounts, specs.Mount{ - Type: "devtmpfs", + Type: devtmpfs, Destination: "/dev", }) mounts = append(mounts, specs.Mount{ - Type: "devpts", + Type: devpts, Destination: "/dev/pts", }) @@ -129,13 +149,13 @@ func compileMounts(spec *specs.Spec) []specs.Mount { var mandatoryMounts []specs.Mount if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "proc", + Type: proc, Destination: "/proc", }) } if !sysMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "sysfs", + Type: sysfs, Destination: "/sys", }) } @@ -149,7 +169,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // that. Until then, the /tmp mount will always appear empty at // container creation. mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "tmpfs", + Type: tmpfs, Destination: "/tmp", }) } @@ -165,7 +185,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // mount namespace. 
func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { for _, m := range mounts { - if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { + if err := mountSubmount(ctx, conf, mns, fds, m, mounts, m.Destination); err != nil { return err } } @@ -173,7 +193,7 @@ func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *f } // createRootMount creates the root filesystem. -func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { +func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} @@ -207,7 +227,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f // We need to overlay the root on top of a ramfs with stub directories // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. - submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") + submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp") rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) if err != nil { return nil, fmt.Errorf("error adding submount overlay: %v", err) @@ -256,17 +276,17 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri ) switch m.Type { - case "devpts", "devtmpfs", "proc", "sysfs": + case devpts, devtmpfs, proc, sysfs: fsName = m.Type - case "none": - fsName = "sysfs" - case "tmpfs": + case nonefs: + fsName = sysfs + case tmpfs: fsName = m.Type // tmpfs has some extra supported options that we must pass through. 
opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") - case "bind": + case bind: switch conf.FileAccess { case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() @@ -291,7 +311,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, opts, useOverlay, err } -func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { +func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount, dest string) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) @@ -342,51 +362,52 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fd // in the right location, e.g. // mount: /var/run/secrets, may be created in '/run/secrets' if // '/var/run' => '/var'. 
- if err := mkdirAll(ctx, mns, m.Destination); err != nil { + if err := mkdirAll(ctx, mns, dest); err != nil { return err } root := mns.Root() defer root.DecRef() - dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) + dirent, err := mns.FindInode(ctx, root, nil, dest, linux.MaxSymlinkTraversals) if err != nil { - return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to find mount destination %q: %v", dest, err) } defer dirent.DecRef() if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to mount at destination %q: %v", dest, err) } - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + log.Infof("Mounted %q to %q type %s", m.Source, dest, m.Type) return nil } func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { + log.Infof("mkdirAll called with path %s", path) root := mns.Root() defer root.DecRef() // Starting at the root, walk the path. parent := root ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) - for i := 0; i < len(ps); i++ { - if ps[i] == "" { + for _, pathElem := range ps { + if pathElem == "" { // This will be case for the first and last element, if the path // begins or ends with '/'. Note that we always treat the path as // absolute, regardless of what the first character contains. continue } - d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) + d, err := mns.FindInode(ctx, root, parent, pathElem, fs.DefaultTraversalLimit) if err == syserror.ENOENT { // If we encounter a path that does not exist, then // create it. 
- if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("failed to create directory %q: %v", ps[i], err) + if err := parent.CreateDirectory(ctx, root, pathElem, fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("failed to create directory %q: %v", pathElem, err) } - if d, err = parent.Walk(ctx, root, ps[i]); err != nil { - return fmt.Errorf("walk to %q failed: %v", ps[i], err) + if d, err = parent.Walk(ctx, root, pathElem); err != nil { + return fmt.Errorf("walk to %q failed: %v", pathElem, err) } } else if err != nil { - return fmt.Errorf("failed to find inode %q: %v", ps[i], err) + return fmt.Errorf("failed to find inode %q: %v", pathElem, err) } parent = d } @@ -444,7 +465,7 @@ func destinations(mounts []specs.Mount, extra ...string) []string { // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m specs.Mount) string { - if m.Type == "bind" { + if m.Type == bind { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. return "9pfs-" + m.Destination @@ -589,7 +610,7 @@ func subtargets(root string, mnts []specs.Mount) []string { // setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel) error { +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. 
If @@ -604,27 +625,79 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // won't need ours either way. procArgs.FDMap = fdm + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: 0022, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + // If this is the root container, we also need to setup the root mount // namespace. - if k.RootMountNamespace() == nil { - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: 0022, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - + mns := k.RootMountNamespace() + if mns == nil { // Create the virtual filesystem. mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } - k.SetRootMountNamespace(mns) + return nil + } + + // Setup a child container. + + // Create the container's root filesystem mount. + log.Infof("Creating new process in child container.") + fds := &fdDispenser{fds: append([]int{}, ioFDs...)} + rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) + if err != nil { + return fmt.Errorf("error creating filesystem for container: %v", err) + } + + // Make directories for submounts within the container. + rootDir := mns.Root() + defer rootDir.DecRef() + containerRoot := filepath.Join(childContainersDir, cid) + mkdirAll(ctx, mns, containerRoot) + + // Mount the container's root filesystem to the newly created + // mount point. 
+ containerRootDirent, err := mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination: %q: %v", containerRoot, err) + } + if err := mns.Mount(ctx, containerRootDirent, rootInode); err != nil { + return fmt.Errorf("failed to mount at destination %q: %v", containerRoot, err) + } + containerRootDirent.DecRef() + + // We have to re-walk to the dirent to find the mounted + // directory. The old dirent is invalid at this point. + containerRootDirent, err = mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination2: %q: %v", containerRoot, err) + } + log.Infof("Mounted child's root fs to %q", containerRoot) + + // Mount all submounts. + mounts := compileMounts(spec) + for _, m := range mounts { + // TODO: Enable bind mounts in child containers. + if m.Type == bind { + log.Infof("Bind mounts in child containers are not yet supported: %+v", m) + continue + } + dest := filepath.Join(containerRoot, m.Destination) + if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil { + return fmt.Errorf("error mounting filesystem for container: %v", err) + } } + // Set the procArgs root directory. + procArgs.Root = containerRootDirent return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f6c7bf223..7debf0ac2 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" "math/rand" + "os" "runtime" "sync" "sync/atomic" @@ -229,7 +230,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // Ensure that signals received are forwarded to the emulated kernel. 
stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + procArgs, err := newProcess(spec, creds, utsns, ipcns, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -250,7 +251,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -277,7 +278,6 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds UTSNamespace: utsns, IPCNamespace: ipcns, } - return procArgs, nil } @@ -356,7 +356,8 @@ func (l *Loader) run() error { l.console, l.rootProcArgs.Credentials, l.rootProcArgs.Limits, - l.k) + l.k, + "" /* CID, which isn't needed for the root container */) if err != nil { return err } @@ -376,8 +377,7 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. -func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.ThreadID, error) { - spec := args.Spec +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -406,26 +406,24 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa // when indicated by the spec. 
procArgs, err := newProcess( - args.Spec, - args.Conf, - nil, // ioFDs - false, // console + spec, creds, - k.RootUTSNamespace(), - k.RootIPCNamespace(), - k) + l.k.RootUTSNamespace(), + l.k.RootIPCNamespace(), + l.k) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } err = setFileSystemForProcess( &procArgs, - args.Spec, - args.Conf, - nil, + spec, + conf, + []int{int(file.Fd())}, // ioFDs false, creds, procArgs.Limits, - k) + k, + cid) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } @@ -435,7 +433,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa return 0, fmt.Errorf("failed to create process in sentry: %v", err) } - ts := k.TaskSet() + ts := l.k.TaskSet() tgid := ts.Root.IDOfThreadGroup(tg) if tgid == 0 { return 0, errors.New("failed to get thread group ID of new process") @@ -446,7 +444,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa l.mu.Lock() defer l.mu.Unlock() - l.containerRootTGIDs[args.CID] = tgid + l.containerRootTGIDs[cid] = tgid return tgid, nil } diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 18e95284b..c45784749 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,7 +36,6 @@ go_library( "//pkg/p9", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", - "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/container", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 55315c0e8..ed4b1d29c 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,7 +16,6 @@ package cmd import ( "os" - "sync" "syscall" "context" @@ -25,7 +24,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -36,6 +34,10 @@ type Gofer struct { bundleDir string ioFDs intFlags applyCaps bool 
+ + // controllerFD is the file descriptor of a stream socket for the + // control server that is donated to this process. + controllerFD int } // Name implements subcommands.Command. @@ -58,11 +60,12 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") + f.IntVar(&g.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") } // Execute implements subcommands.Command. func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if g.bundleDir == "" || len(g.ioFDs) < 1 { + if g.bundleDir == "" || len(g.ioFDs) < 1 || g.controllerFD == -1 { f.Usage() return subcommands.ExitUsageError } @@ -134,29 +137,14 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - runServers(ats, g.ioFDs) - return subcommands.ExitSuccess -} + ctrl, err := fsgofer.NewController(g.controllerFD, g.bundleDir) -func runServers(ats []p9.Attacher, ioFDs []int) { - // Run the loops and wait for all to exit. - var wg sync.WaitGroup - for i, ioFD := range ioFDs { - wg.Add(1) - go func(ioFD int, at p9.Attacher) { - socket, err := unet.NewSocket(ioFD) - if err != nil { - Fatalf("err creating server on FD %d: %v", ioFD, err) - } - s := p9.NewServer(at) - if err := s.Handle(socket); err != nil { - Fatalf("P9 server returned error. Gofer is shutting down. 
FD: %d, err: %v", ioFD, err) - } - wg.Done() - }(ioFD, ats[i]) + if err := ctrl.Serve(ats, g.ioFDs); err != nil { + Fatalf("Failed to serve via P9: %v", err) } - wg.Wait() - log.Infof("All 9P servers exited.") + ctrl.Wait() + + return subcommands.ExitSuccess } func isReadonlyMount(opts []string) bool { diff --git a/runsc/container/container.go b/runsc/container/container.go index 574075b00..da2ce0d25 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -249,6 +249,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } c.Sandbox = sb.Sandbox + + // Prepare the gofer to serve the container's filesystem. + err = sb.Sandbox.CreateChild(c.ID, bundleDir) + if err != nil { + c.Destroy() + return nil, err + } } c.Status = Created diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 24e172f48..0bc682b5f 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "fsgofer", srcs = [ + "control.go", "fsgofer.go", "fsgofer_unsafe.go", ], @@ -14,9 +15,12 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/control/server", "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/unet", + "//pkg/urpc", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/control.go b/runsc/fsgofer/control.go new file mode 100644 index 000000000..8ce8ee8a0 --- /dev/null +++ b/runsc/fsgofer/control.go @@ -0,0 +1,203 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "fmt" + "path/filepath" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +// Controller manages the fsgofer's control server. +type Controller struct { + // api holds the control server's URPC endpoints. + api api + + // srv is the control server. + srv *server.Server +} + +// NewController creates a new Controller and starts it listenting +func NewController(fd int, rootBundleDir string) (*Controller, error) { + if !filepath.IsAbs(rootBundleDir) { + return nil, fmt.Errorf("NewController should receive an absolute bundle dir path, but got %q", rootBundleDir) + } + + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + cr := &Controller{srv: srv} + cr.api.rootBundleDir = rootBundleDir + cr.api.bundleDirs = make(map[string]string) + srv.Register(&cr.api) + + if err := srv.StartServing(); err != nil { + return nil, err + } + + return cr, nil +} + +// Wait waits for all the p9 servers to finish, then shuts down the control +// server. +func (cr *Controller) Wait() { + cr.api.p9wg.Wait() + cr.srv.Stop() + log.Infof("All 9P servers exited.") +} + +// Serve starts serving each Attacher in ats via its corresponding file +// descriptor in ioFDs. +func (cr *Controller) Serve(ats []p9.Attacher, ioFDs []int) error { + if len(ats) != len(ioFDs) { + return fmt.Errorf("number of attach points does not match the number of IO FDs (%d and %d)", len(ats), len(ioFDs)) + } + for i, _ := range ats { + cr.api.serve(ats[i], ioFDs[i]) + } + return nil +} + +// api URPC methods. +const ( + // AddBundleDirs readies the gofer to serve from a new bundle + // directory. It should be called during runsc create. 
+ AddBundleDirs = "api.AddBundleDirs" + + // ServeDirectory serves a new directory via the fsgofer. It should be + // called during runsc start. + ServeDirectory = "api.ServeDirectory" +) + +// API defines and implements the URPC endpoints for the gofer. +type api struct { + // p9wg waits for all the goroutines serving the sentry via p9. When its + // counter is 0, the gofer is out of work and exits. + p9wg sync.WaitGroup + + // bundleDirs maps from container ID to bundle directory for each + // container. + bundleDirs map[string]string + + // rootBundleDir is the bundle directory of the root container. + rootBundleDir string +} + +// AddBundleDirsRequest is the URPC argument to AddBundleDirs. +type AddBundleDirsRequest struct { + // BundleDirs is a map of container IDs to bundle directories to add to + // the gofer. + BundleDirs map[string]string +} + +// AddBundleDirsRequest adds bundle directories that for the gofer to serve. +func (api *api) AddBundleDirs(req *AddBundleDirsRequest, _ *struct{}) error { + log.Debugf("fsgofer.AddBundleDirs") + for cid, bd := range req.BundleDirs { + if _, ok := api.bundleDirs[cid]; ok { + return fmt.Errorf("fsgofer already has a bundleDir for container %q", cid) + } + api.bundleDirs[cid] = bd + } + return nil +} + +// ServeDirectoryRequest is the URPC argument to ServeDirectory. +type ServeDirectoryRequest struct { + // Dir is the absolute path to a directory to be served to the sentry. + Dir string + + // IsReadOnly specifies whether the directory should be served in + // read-only mode. + IsReadOnly bool + + // CID is the container ID of the container that needs to serve a + // directory. + CID string + + // FilePayload contains the socket over which the sentry will request + // files from Dir. + urpc.FilePayload +} + +// ServeDirectory begins serving a directory via a file descriptor for the +// sentry. Directories must be added via AddBundleDirsRequest before +// ServeDirectory is called. 
+func (api *api) ServeDirectory(req *ServeDirectoryRequest, _ *struct{}) error { + log.Debugf("fsgofer.ServeDirectory: %+v", req) + + if req.Dir == "" { + return fmt.Errorf("ServeDirectory should receive a directory argument, but was empty") + } + if req.CID == "" { + return fmt.Errorf("ServeDirectory should receive a CID argument, but was empty") + } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/ directory. + // TODO: Once we have multiple independant roots, this + // check won't be necessary. + if filepath.Clean(req.CID) != req.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", req.CID) + } + if nFiles := len(req.FilePayload.Files); nFiles != 1 { + return fmt.Errorf("ServeDirectory should receive 1 file descriptor, but got %d", nFiles) + } + + bd, ok := api.bundleDirs[req.CID] + if !ok { + // If there's no entry in bundleDirs for the container ID, this + // is the root container. + bd = api.rootBundleDir + } + + // Relative paths are served relative to the bundle directory. + absDir := req.Dir + if !filepath.IsAbs(absDir) { + absDir = filepath.Join(bd, req.Dir) + } + + // Create the attach point and start serving. + at := NewAttachPoint(absDir, Config{ + ROMount: req.IsReadOnly, + LazyOpenForWrite: true, + }) + api.serve(at, int(req.FilePayload.Files[0].Fd())) + + return nil +} + +// serve begins serving a directory via a file descriptor. +func (api *api) serve(at p9.Attacher, ioFD int) { + api.p9wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + panic(fmt.Sprintf("err creating server on FD %d: %v", ioFD, err)) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. 
FD: %d, err: %v", ioFD, err)) + } + api.p9wg.Done() + }(ioFD, at) +} diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 52cdc91a2..38263896a 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -66,6 +66,11 @@ func (f fileType) String() string { return "unknown" } +// ControlSocketAddr generates an abstract unix socket name for the given id. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-gofer.%s", id) +} + // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index a961c3cc7..cdacc5e22 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/control", "//pkg/urpc", "//runsc/boot", + "//runsc/fsgofer", "//runsc/specutils", "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2b043d412..83cc94dc4 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -84,7 +85,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // StartRoot starts running the root container process inside the sandbox. func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { log.Debugf("Start root sandbox %q, pid: %d", s.ID, s.Pid) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -104,21 +105,67 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { return nil } +// CreateChild creates a non-root container inside the sandbox. 
+func (s *Sandbox) CreateChild(cid, bundleDir string) error { + log.Debugf("Create non-root container sandbox %q, pid: %d for container %q with bundle directory %q", s.ID, s.Pid, cid, bundleDir) + + // Connect to the gofer and prepare it to serve from bundleDir for this + // container. + goferConn, err := s.goferConnect() + if err != nil { + return fmt.Errorf("couldn't connect to gofer: %v", err) + } + defer goferConn.Close() + goferReq := fsgofer.AddBundleDirsRequest{BundleDirs: map[string]string{cid: bundleDir}} + if err := goferConn.Call(fsgofer.AddBundleDirs, &goferReq, nil); err != nil { + return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) + } + + return nil +} + // Start starts running a non-root container inside the sandbox. func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) - conn, err := s.connect() + + sandboxConn, err := s.sandboxConnect() + if err != nil { + return fmt.Errorf("couldn't connect to sandbox: %v", err) + } + defer sandboxConn.Close() + goferConn, err := s.goferConnect() + if err != nil { + return fmt.Errorf("couldn't connect to gofer: %v", err) + } + defer goferConn.Close() + + // Create socket that connects the sandbox and gofer. + sandEnd, goferEnd, err := createSocketPair() if err != nil { return err } - defer conn.Close() + defer sandEnd.Close() + defer goferEnd.Close() + + // Tell the Gofer about the new filesystem it needs to serve. + goferReq := fsgofer.ServeDirectoryRequest{ + Dir: spec.Root.Path, + IsReadOnly: spec.Root.Readonly, + CID: cid, + FilePayload: urpc.FilePayload{Files: []*os.File{goferEnd}}, + } + if err := goferConn.Call(fsgofer.ServeDirectory, &goferReq, nil); err != nil { + return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) + } + // Start running the container. 
args := boot.StartArgs{ - Spec: spec, - Conf: conf, - CID: cid, + Spec: spec, + Conf: conf, + CID: cid, + FilePayload: urpc.FilePayload{Files: []*os.File{sandEnd}}, } - if err := conn.Call(boot.ContainerStart, args, nil); err != nil { + if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) } @@ -142,7 +189,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str SandboxID: s.ID, } - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -165,7 +212,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str // given container in this sandbox. func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return nil, err } @@ -183,7 +230,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { // Execute runs the specified command in the container. func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, error) { log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return 0, s.connError(err) } @@ -203,7 +250,7 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, // Event retrieves stats about the sandbox such as memory and CPU utilization. 
func (s *Sandbox) Event(cid string) (*boot.Event, error) { log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return nil, err } @@ -219,7 +266,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { return &e, nil } -func (s *Sandbox) connect() (*urpc.Client, error) { +func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { log.Debugf("Connecting to sandbox %q", s.ID) conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) if err != nil { @@ -228,6 +275,15 @@ func (s *Sandbox) connect() (*urpc.Client, error) { return conn, nil } +func (s *Sandbox) goferConnect() (*urpc.Client, error) { + log.Debugf("Connecting to gofer for sandbox %q", s.ID) + conn, err := client.ConnectTo(fsgofer.ControlSocketAddr(s.ID)) + if err != nil { + return nil, s.connError(err) + } + return conn, nil +} + func (s *Sandbox) connError(err error) error { return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } @@ -244,31 +300,45 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle // Add root mount and then add any other additional mounts. mountCount := 1 + + // Add additional mounts. for _, m := range spec.Mounts { if specutils.Is9PMount(m) { mountCount++ } } - sandEnds := make([]*os.File, 0, mountCount) goferEnds := make([]*os.File, 0, mountCount) - for i := 0; i < mountCount; i++ { - // Create socket that connects the sandbox and gofer. - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. 
+ var nextFD int + for nextFD = 3; nextFD-3 < mountCount; nextFD++ { + sandEnd, goferEnd, err := createSocketPair() if err != nil { return nil, err } - sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) - - goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") defer goferEnd.Close() + sandEnds = append(sandEnds, sandEnd) goferEnds = append(goferEnds, goferEnd) + args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + } - args = append(args, fmt.Sprintf("--io-fds=%d", 3+i)) + // Create and donate a file descriptor for the control server. + addr := fsgofer.ControlSocketAddr(s.ID) + serverFD, err := server.CreateSocket(addr) + if err != nil { + return nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } + // Add the control server fd. + args = append(args, "--controller-fd="+strconv.Itoa(nextFD)) + nextFD++ + controllerFile := os.NewFile(uintptr(serverFD), "gofer_control_socket_server") + defer controllerFile.Close() + cmd := exec.Command(binPath, args...) cmd.ExtraFiles = goferEnds + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) // Setup any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the @@ -286,6 +356,15 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle return sandEnds, nil } +// createSocketPair creates a pair of files wrapping a socket pair. +func createSocketPair() (*os.File, *os.File, error) { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[0]), "sandbox io fd"), os.NewFile(uintptr(fds[1]), "gofer io fd"), nil +} + // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. 
func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { @@ -296,7 +375,9 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Create control server socket here and donate FD to child process because // it may be in a different network namespace and won't be reachable from // outside. - fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID)) + addr := boot.ControlSocketAddr(s.ID) + fd, err := server.CreateSocket(addr) + log.Infof("creating sandbox process with addr: %s", addr) if err != nil { return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } @@ -438,7 +519,7 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { if err := specutils.WaitForReady(s.Pid, timeout, ready); err != nil { return fmt.Errorf("unexpected error waiting for sandbox %q, err: %v", s.ID, err) } - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -454,7 +535,7 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) var ws syscall.WaitStatus - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return ws, err } @@ -471,7 +552,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return ws, err } @@ -536,7 +617,7 @@ func (s *Sandbox) Destroy() error { // Signal sends the signal to a container in the sandbox. 
func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { log.Debugf("Signal sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -556,7 +637,7 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { // The statefile will be written to f. func (s *Sandbox) Checkpoint(cid string, f *os.File) error { log.Debugf("Checkpoint sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -577,7 +658,7 @@ func (s *Sandbox) Checkpoint(cid string, f *os.File) error { // Pause sends the pause call for a container in the sandbox. func (s *Sandbox) Pause(cid string) error { log.Debugf("Pause sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -592,7 +673,7 @@ func (s *Sandbox) Pause(cid string) error { // Resume sends the resume call for a container in the sandbox. func (s *Sandbox) Resume(cid string) error { log.Debugf("Resume sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -630,7 +711,7 @@ func (s *Sandbox) IsRunning() bool { // Stacks collects and returns all stacks for the sandbox. func (s *Sandbox) Stacks() (string, error) { log.Debugf("Stacks sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return "", err } -- cgit v1.2.3 From aeec7a4c007ac53401e05bf72894a3b998eead95 Mon Sep 17 00:00:00 2001 From: Chenggang Qin Date: Thu, 16 Aug 2018 16:27:14 -0700 Subject: fs: Support possible and online knobs for cpu Some linux commands depend on /sys/devices/system/cpu/possible, such as 'lscpu'. Add 2 knobs for cpu: /sys/devices/system/cpu/possible /sys/devices/system/cpu/online Both the values are '0 - Kernel.ApplicationCores()-1'. 
Change-Id: Iabd8a4e559cbb630ed249686b92c22b4e7120663 PiperOrigin-RevId: 209070163 --- pkg/sentry/fs/sys/BUILD | 3 ++ pkg/sentry/fs/sys/devices.go | 76 ++++++++++++++++++++++++++++++++++++++++++++ pkg/sentry/fs/sys/sys.go | 18 +++++++++-- 3 files changed, 94 insertions(+), 3 deletions(-) create mode 100644 pkg/sentry/fs/sys/devices.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index bc24e980e..5ba23d5da 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -6,6 +6,7 @@ go_library( name = "sys", srcs = [ "device.go", + "devices.go", "fs.go", "sys.go", ], @@ -16,6 +17,8 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", + "//pkg/sentry/kernel", "//pkg/sentry/usermem", + "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go new file mode 100644 index 000000000..03eddeb93 --- /dev/null +++ b/pkg/sentry/fs/sys/devices.go @@ -0,0 +1,76 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package sys + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// +stateify savable +type cpunum struct { + ramfs.Entry +} + +func (c *cpunum) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + k := kernel.KernelFromContext(ctx) + if k == nil { + return 0, io.EOF + } + + str := []byte(fmt.Sprintf("0-%d\n", k.ApplicationCores()-1)) + if offset >= int64(len(str)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, str[offset:]) + return int64(n), err +} + +func newPossible(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + c := &cpunum{} + c.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(c, msrc) +} + +func newCPU(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + return newDir(ctx, msrc, map[string]*fs.Inode{ + "possible": newPossible(ctx, msrc), + "online": newPossible(ctx, msrc), + }) +} + +func newSystemDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + return newDir(ctx, msrc, map[string]*fs.Inode{ + "cpu": newCPU(ctx, msrc), + }) +} + +func newDevicesDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + return newDir(ctx, msrc, map[string]*fs.Inode{ + "system": newSystemDir(ctx, msrc), + }) +} diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index b9b2fb4a1..b1c3d48eb 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -22,13 +22,25 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// sys is a root sys node. 
+// // +stateify savable -type dir struct { +type sys struct { ramfs.Dir } +func newFile(node fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { + sattr := fs.StableAttr{ + DeviceID: sysfsDevice.DeviceID(), + InodeID: sysfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(node, msrc, sattr) +} + func newDir(ctx context.Context, msrc *fs.MountSource, contents map[string]*fs.Inode) *fs.Inode { - d := &dir{} + d := &sys{} d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(d, msrc, fs.StableAttr{ DeviceID: sysfsDevice.DeviceID(), @@ -48,7 +60,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { "bus": newDir(ctx, msrc, nil), "class": newDir(ctx, msrc, nil), "dev": newDir(ctx, msrc, nil), - "devices": newDir(ctx, msrc, nil), + "devices": newDevicesDir(ctx, msrc), "firmware": newDir(ctx, msrc, nil), "fs": newDir(ctx, msrc, nil), "kernel": newDir(ctx, msrc, nil), -- cgit v1.2.3 From 0050e3e71cd07e6ed7cdf08784c042f7c067a5ff Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 20 Aug 2018 11:18:06 -0700 Subject: sysfs: Add (empty) cpu directories for each cpu in /sys/devices/system/cpu. Numpy needs these. Also added the "present" directory, since the contents are the same as possible and online. 
PiperOrigin-RevId: 209451777 Change-Id: I2048de3f57bf1c57e9b5421d607ca89c2a173684 --- pkg/sentry/fs/sys/devices.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index 03eddeb93..2cf3a6f98 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -57,10 +57,20 @@ func newPossible(ctx context.Context, msrc *fs.MountSource) *fs.Inode { } func newCPU(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - return newDir(ctx, msrc, map[string]*fs.Inode{ - "possible": newPossible(ctx, msrc), + m := map[string]*fs.Inode{ "online": newPossible(ctx, msrc), - }) + "possible": newPossible(ctx, msrc), + "present": newPossible(ctx, msrc), + } + + // Add directories for each of the cpus. + if k := kernel.KernelFromContext(ctx); k != nil { + for i := 0; uint(i) < k.ApplicationCores(); i++ { + m[fmt.Sprintf("cpu%d", i)] = newDir(ctx, msrc, nil) + } + } + + return newDir(ctx, msrc, m) } func newSystemDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { -- cgit v1.2.3 From 1501400d9c6f4c5e82e7cb134d9a2bddac548611 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 20 Aug 2018 11:58:46 -0700 Subject: getdents should return type=DT_DIR for SpecialDirectories. It was returning DT_UNKNOWN, and this was breaking numpy. 
PiperOrigin-RevId: 209459351 Change-Id: Ic6f548e23aa9c551b2032b92636cb5f0df9ccbd4 --- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 178714b07..29c0d7a39 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -150,7 +150,7 @@ func toType(nodeType fs.InodeType) uint8 { return syscall.DT_REG case fs.Symlink: return syscall.DT_LNK - case fs.Directory: + case fs.Directory, fs.SpecialDirectory: return syscall.DT_DIR case fs.Pipe: return syscall.DT_FIFO -- cgit v1.2.3 From 47d5a12ce565a2a63fca3fd70cc073f9883bacd0 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 20 Aug 2018 16:08:32 -0700 Subject: Fix handling of abstract Unix socket addresses * Don't truncate abstract addresses at second null. * Properly handle abstract addresses with length < 108 bytes. PiperOrigin-RevId: 209502703 Change-Id: I49053f2d18b5a78208c3f640c27dbbdaece4f1a9 --- pkg/sentry/socket/epsocket/epsocket.go | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index f8b24aaf1..4d32f7a31 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -153,9 +153,9 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { if len(path) > linux.UnixPathMax { return tcpip.FullAddress{}, syserr.ErrInvalidArgument } - // Drop the terminating NUL (if one exists) and everything after it. - // Skip the first byte, which is NUL for abstract paths. - if len(path) > 1 { + // Drop the terminating NUL (if one exists) and everything after + // it for filesystem (non-abstract) addresses. 
+ if len(path) > 0 && path[0] != 0 { if n := bytes.IndexByte(path[1:], 0); n >= 0 { path = path[:n+1] } @@ -743,22 +743,20 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) { case linux.AF_UNIX: var out linux.SockAddrUnix out.Family = linux.AF_UNIX - for i := 0; i < len([]byte(addr.Addr)); i++ { + l := len([]byte(addr.Addr)) + for i := 0; i < l; i++ { out.Path[i] = int8(addr.Addr[i]) } - // Linux just returns the header for empty addresses. - if len(addr.Addr) == 0 { - return out, 2 - } + // Linux returns the used length of the address struct (including the // null terminator) for filesystem paths. The Family field is 2 bytes. // It is sometimes allowed to exclude the null terminator if the - // address length is the max. Abstract paths always return the full - // length. - if out.Path[0] == 0 || len([]byte(addr.Addr)) == len(out.Path) { - return out, uint32(binary.Size(out)) + // address length is the max. Abstract and empty paths always return + // the full exact length. 
+ if l == 0 || out.Path[0] == 0 || l == len(out.Path) { + return out, uint32(2 + l) } - return out, uint32(3 + len(addr.Addr)) + return out, uint32(3 + l) case linux.AF_INET: var out linux.SockAddrInet copy(out.Addr[:], addr.Addr) -- cgit v1.2.3 From 9c407382b031f16160f83383ef8b0d419457829a Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 21 Aug 2018 11:15:15 -0700 Subject: Fix races in kernel.(*Task).Value() PiperOrigin-RevId: 209627180 Change-Id: Idc84afd38003427e411df6e75abfabd9174174e1 --- pkg/sentry/kernel/task.go | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 19029adb1..0f83c0a39 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -568,12 +568,18 @@ func (t *Task) Value(key interface{}) interface{} { case CtxPIDNamespace: return t.tg.pidns case CtxUTSNamespace: + t.mu.Lock() + defer t.mu.Unlock() return t.utsns case CtxIPCNamespace: + t.mu.Lock() + defer t.mu.Unlock() return t.ipcns case CtxTask: return t case auth.CtxCredentials: + t.mu.Lock() + defer t.mu.Unlock() return t.creds case context.CtxThreadGroupID: return int32(t.ThreadGroup().ID()) -- cgit v1.2.3 From 8bb50dab790d575a83a935cf3361099cdb1a6aac Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Tue, 21 Aug 2018 16:51:08 -0700 Subject: sentry: do not release gofer inode file state loading lock upon error. When an inode file state failed to load asynchronuously, we want to report the error instead of potentially panicing in another async loading goroutine incorrectly unblocked. 
PiperOrigin-RevId: 209683977 Change-Id: I591cde97710bbe3cdc53717ee58f1d28bbda9261 --- pkg/sentry/fs/gofer/inode_state.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 33ec33364..4f2b01c72 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -108,9 +108,13 @@ func (i *inodeFileState) loadLoading(_ struct{}) { // afterLoad is invoked by stateify. func (i *inodeFileState) afterLoad() { - load := func() error { + load := func() (err error) { // See comment on i.loading(). - defer i.loading.Unlock() + defer func() { + if err == nil { + i.loading.Unlock() + } + }() // Manually restore the p9.File. name, ok := i.s.inodeMappings[i.sattr.InodeID] @@ -121,7 +125,6 @@ func (i *inodeFileState) afterLoad() { } // TODO: Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} - var err error _, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name)) if err != nil { -- cgit v1.2.3 From 8d318aac553240d4b5044f0ca70bff3e58cf60f3 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 22 Aug 2018 10:05:56 -0700 Subject: fs: Hold Dirent.mu when calling Dirent.flush(). As required by the contract in Dirent.flush(). Also inline Dirent.freeze() into Dirent.Freeze(), since it is only called from there. PiperOrigin-RevId: 209783626 Change-Id: Ie6de4533d93dd299ffa01dabfa257c9cc259b1f4 --- pkg/sentry/fs/dirent.go | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4d3aeaf41..71ef3336e 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -398,7 +398,18 @@ func (d *Dirent) MountRoot() *Dirent { return mountRoot } -func (d *Dirent) freeze() { +// Freeze prevents this dirent from walking to more nodes. 
Freeze is applied +// recursively to all children. +// +// If this particular Dirent represents a Virtual node, then Walks and Creates +// may proceed as before. +// +// Freeze can only be called before the application starts running, otherwise +// the root it might be out of sync with the application root if modified by +// sys_chroot. +func (d *Dirent) Freeze() { + d.mu.Lock() + defer d.mu.Unlock() if d.frozen { // Already frozen. return @@ -419,21 +430,6 @@ func (d *Dirent) freeze() { d.flush() } -// Freeze prevents this dirent from walking to more nodes. Freeze is applied -// recursively to all children. -// -// If this particular Dirent represents a Virtual node, then Walks and Creates -// may proceed as before. -// -// Freeze can only be called before the application starts running, otherwise -// the root it might be out of sync with the application root if modified by -// sys_chroot. -func (d *Dirent) Freeze() { - d.mu.Lock() - defer d.mu.Unlock() - d.freeze() -} - // descendantOf returns true if the receiver dirent is equal to, or a // descendant of, the argument dirent. // @@ -1586,7 +1582,9 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // reasons, so we flush all references on the replaced node and // its children. replaced.Inode.Watches.Unpin(replaced) + replaced.mu.Lock() replaced.flush() + replaced.mu.Unlock() } if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName); err != nil { @@ -1637,7 +1635,9 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string renamed.dropExtendedReference() // Same as replaced.flush above. + renamed.mu.Lock() renamed.flush() + renamed.mu.Unlock() return nil } -- cgit v1.2.3 From 545ea7ab3fa3e976120b74da3271dc7724c05f5e Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Wed, 22 Aug 2018 12:35:40 -0700 Subject: Always add AT_BASE even if there is no interpreter. 
Linux will ALWAYS add AT_BASE even for a static binary, expect it will be set to 0 [1]. 1. https://github.com/torvalds/linux/blob/master/fs/binfmt_elf.c#L253 PiperOrigin-RevId: 209811129 Change-Id: I92cc66532f23d40f24414a921c030bd3481e12a0 --- pkg/sentry/loader/elf.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 0462a1788..f4deaa905 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -655,6 +655,9 @@ func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace // Start in the interpreter. // N.B. AT_ENTRY above contains the *original* entry point. bin.entry = interp.entry + } else { + // Always add AT_BASE even if there is no interpreter. + bin.auxv = append(bin.auxv, arch.AuxEntry{linux.AT_BASE, 0}) } return bin, ac, nil -- cgit v1.2.3 From 6b9133ba96863e3653fa6f3949710203bb077c50 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Wed, 22 Aug 2018 13:18:21 -0700 Subject: sentry: mark S/R stating errors as save rejections / fs corruptions. 
PiperOrigin-RevId: 209817767 Change-Id: Iddf2b8441bc44f31f9a8cf6f2bd8e7a5b824b487 --- pkg/sentry/fs/gofer/inode_state.go | 6 +++--- pkg/sentry/fs/host/inode_state.go | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 4f2b01c72..ad11034f9 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -67,7 +67,7 @@ func (i *inodeFileState) beforeSave() { if i.sattr.Type == fs.RegularFile { uattr, err := i.unstableAttr(&dummyClockContext{context.Background()}) if err != nil { - panic(fmt.Sprintf("failed to get unstable atttribute of %s: %v", i.s.inodeMappings[i.sattr.InodeID], err)) + panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.s.inodeMappings[i.sattr.InodeID], err)}) } i.savedUAttr = &uattr } @@ -128,7 +128,7 @@ func (i *inodeFileState) afterLoad() { _, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name)) if err != nil { - return fmt.Errorf("failed to walk to %q: %v", name, err) + return fs.ErrCorruption{fmt.Errorf("failed to walk to %q: %v", name, err)} } // Remap the saved inode number into the gofer device using the @@ -136,7 +136,7 @@ func (i *inodeFileState) afterLoad() { // environment. 
qid, mask, attrs, err := i.file.getAttr(ctx, p9.AttrMaskAll()) if err != nil { - return fmt.Errorf("failed to get file attributes of %s: %v", name, err) + return fs.ErrCorruption{fmt.Errorf("failed to get file attributes of %s: %v", name, err)} } if !mask.RDev { return fs.ErrCorruption{fmt.Errorf("file %s lacks device", name)} diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index 135c75fd5..8bc99d94b 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -31,7 +31,7 @@ func (i *inodeFileState) beforeSave() { if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { uattr, err := i.unstableAttr(context.Background()) if err != nil { - panic(fmt.Sprintf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)) + panic(fs.ErrSaveRejection{fmt.Errorf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)}) } i.savedUAttr = &uattr } @@ -47,7 +47,7 @@ func (i *inodeFileState) afterLoad() { // Remap the inode number. var s syscall.Stat_t if err := syscall.Fstat(i.FD(), &s); err != nil { - panic(fmt.Sprintf("failed to get metadata for fd %d: %v", i.FD(), err)) + panic(fs.ErrCorruption{fmt.Errorf("failed to get metadata for fd %d: %v", i.FD(), err)}) } key := device.MultiDeviceKey{ Device: s.Dev, -- cgit v1.2.3 From a7a8d07d7d6bd551d96621ee841b1b0e0f217ca3 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 22 Aug 2018 14:14:32 -0700 Subject: Add separate Recycle method for allocator. This improves debugging for pagetable-related issues. 
PiperOrigin-RevId: 209827795 Change-Id: I4cfa11664b0b52f26f6bc90a14c5bb106f01e038 --- pkg/sentry/platform/kvm/address_space.go | 3 +++ pkg/sentry/platform/kvm/allocator.go | 7 +++++++ pkg/sentry/platform/ring0/pagetables/allocator.go | 19 ++++++++++++++++--- 3 files changed, 26 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index fbd11ed71..463617170 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -273,6 +273,9 @@ func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { Start: addr, End: addr + usermem.Addr(length), }) + + // Recycle any freed intermediate pages. + as.pageTables.Allocator.Recycle() } } diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go index 80066bfc5..f5cebd5b3 100644 --- a/pkg/sentry/platform/kvm/allocator.go +++ b/pkg/sentry/platform/kvm/allocator.go @@ -67,3 +67,10 @@ func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs { func (a allocator) FreePTEs(ptes *pagetables.PTEs) { a.base.FreePTEs(ptes) } + +// Recycle implements pagetables.Allocator.Recycle. +// +//go:nosplit +func (a allocator) Recycle() { + a.base.Recycle() +} diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go index 1499623fb..049fd0247 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -27,8 +27,12 @@ type Allocator interface { // LookupPTEs looks up PTEs by physical address. LookupPTEs(physical uintptr) *PTEs - // FreePTEs frees a set of PTEs. + // FreePTEs marks a set of PTEs a freed, although they may not be available + // for use again until Recycle is called, below. FreePTEs(ptes *PTEs) + + // Recycle makes freed PTEs available for use again. + Recycle() } // RuntimeAllocator is a trivial allocator. 
@@ -42,6 +46,9 @@ type RuntimeAllocator struct { // pool is the set of free-to-use PTEs. pool []*PTEs + + // freed is the set of recently-freed PTEs. + freed []*PTEs } // NewRuntimeAllocator returns an allocator that uses runtime allocation. @@ -51,8 +58,15 @@ func NewRuntimeAllocator() *RuntimeAllocator { } } +// Recycle returns freed pages to the pool. +func (r *RuntimeAllocator) Recycle() { + r.pool = append(r.pool, r.freed...) + r.freed = r.freed[:0] +} + // Drain empties the pool. func (r *RuntimeAllocator) Drain() { + r.Recycle() for i, ptes := range r.pool { // Zap the entry in the underlying array to ensure that it can // be properly garbage collected. @@ -104,6 +118,5 @@ func (r *RuntimeAllocator) LookupPTEs(physical uintptr) *PTEs { // //go:nosplit func (r *RuntimeAllocator) FreePTEs(ptes *PTEs) { - // Add to the pool. - r.pool = append(r.pool, ptes) + r.freed = append(r.freed, ptes) } -- cgit v1.2.3 From ba8f6ba8c899d2e900fa7e9ee5aede31cba1de9c Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Thu, 23 Aug 2018 13:57:30 -0700 Subject: sentry: mark idMapSeqHandle as savable. PiperOrigin-RevId: 209994384 Change-Id: I16186cf79cb4760a134f3968db30c168a5f4340e --- pkg/sentry/fs/proc/uid_gid_map.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 9811d9c9d..a7e4cf0a6 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -68,6 +68,8 @@ func (imss *idMapSeqSource) ReadSeqFileData(ctx context.Context, handle seqfile. } // TODO: Fix issue requiring idMapSeqHandle wrapping an int. +// +// +stateify savable type idMapSeqHandle struct { value int } -- cgit v1.2.3 From 64403265a04aa0c8be3ebb652a09f6e2d7a84ca7 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 23 Aug 2018 16:31:25 -0700 Subject: Implement POSIX per-process interval timers. 
PiperOrigin-RevId: 210021612 Change-Id: If7c161e6fd08cf17942bfb6bc5a8d2c4e271c61e --- pkg/abi/linux/signal.go | 20 ++ pkg/abi/linux/time.go | 4 + pkg/sentry/arch/signal_amd64.go | 30 +++ pkg/sentry/kernel/BUILD | 2 +- pkg/sentry/kernel/kernel.go | 19 +- pkg/sentry/kernel/pending_signals.go | 17 +- pkg/sentry/kernel/pending_signals_state.go | 21 +- pkg/sentry/kernel/posixtimer.go | 306 +++++++++++++++++++++++++++++ pkg/sentry/kernel/ptrace.go | 2 +- pkg/sentry/kernel/task_exec.go | 16 ++ pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/kernel/task_signals.go | 12 +- pkg/sentry/kernel/thread_group.go | 27 ++- pkg/sentry/kernel/time/time.go | 23 ++- pkg/sentry/syscalls/linux/linux64.go | 10 +- pkg/sentry/syscalls/linux/sys_timer.go | 85 ++++++++ pkg/sentry/syscalls/linux/sys_timerfd.go | 33 +--- 17 files changed, 579 insertions(+), 50 deletions(-) create mode 100644 pkg/sentry/kernel/posixtimer.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index fed2a159f..b2c7230c4 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -209,3 +209,23 @@ const ( // POLL_HUP indicates that a device disconnected. POLL_HUP = SI_POLL | 6 ) + +// Sigevent represents struct sigevent. +type Sigevent struct { + Value uint64 // union sigval {int, void*} + Signo int32 + Notify int32 + + // struct sigevent here contains 48-byte union _sigev_un. However, only + // member _tid is significant to the kernel. + Tid int32 + UnRemainder [44]byte +} + +// Possible values for Sigevent.Notify, aka struct sigevent::sigev_notify. +const ( + SIGEV_SIGNAL = 0 + SIGEV_NONE = 1 + SIGEV_THREAD = 2 + SIGEV_THREAD_ID = 4 +) diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index 9109a2848..4569f4208 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -222,3 +222,7 @@ type Tms struct { CUTime ClockT CSTime ClockT } + +// TimerID represents type timer_t, which identifies a POSIX per-process +// interval timer. 
+type TimerID int32 diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index e81717e8b..9ca4c8ed1 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -176,6 +176,36 @@ func (s *SignalInfo) SetUid(val int32) { usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) } +// Sigval returns the sigval field, which is aliased to both si_int and si_ptr. +func (s *SignalInfo) Sigval() uint64 { + return usermem.ByteOrder.Uint64(s.Fields[8:16]) +} + +// SetSigval mutates the sigval field. +func (s *SignalInfo) SetSigval(val uint64) { + usermem.ByteOrder.PutUint64(s.Fields[8:16], val) +} + +// TimerID returns the si_timerid field. +func (s *SignalInfo) TimerID() linux.TimerID { + return linux.TimerID(usermem.ByteOrder.Uint32(s.Fields[0:4])) +} + +// SetTimerID sets the si_timerid field. +func (s *SignalInfo) SetTimerID(val linux.TimerID) { + usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) +} + +// Overrun returns the si_overrun field. +func (s *SignalInfo) Overrun() int32 { + return int32(usermem.ByteOrder.Uint32(s.Fields[4:8])) +} + +// SetOverrun sets the si_overrun field. +func (s *SignalInfo) SetOverrun(val int32) { + usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) +} + // Addr returns the si_addr field. 
func (s *SignalInfo) Addr() uint64 { return usermem.ByteOrder.Uint64(s.Fields[0:8]) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 69a3fbc45..a7b847e94 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -71,6 +71,7 @@ go_library( "pending_signals.go", "pending_signals_list.go", "pending_signals_state.go", + "posixtimer.go", "process_group_list.go", "ptrace.go", "rseq.go", @@ -114,7 +115,6 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", imports = [ "gvisor.googlesource.com/gvisor/pkg/bpf", - "gvisor.googlesource.com/gvisor/pkg/sentry/arch", "gvisor.googlesource.com/gvisor/pkg/tcpip", ], visibility = ["//:sandbox"], diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index cb43fdcdc..33cd727c6 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -19,9 +19,11 @@ // Lock order (outermost locks must be taken first): // // Kernel.extMu -// TaskSet.mu -// SignalHandlers.mu -// Task.mu +// ThreadGroup.timerMu +// ktime.Timer.mu (for IntervalTimer) +// TaskSet.mu +// SignalHandlers.mu +// Task.mu // // Locking SignalHandlers.mu in multiple SignalHandlers requires locking // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same @@ -706,8 +708,12 @@ func (k *Kernel) pauseTimeLocked() { if t == t.tg.leader { t.tg.tm.pause() } - // This means we'll iterate FDMaps shared by multiple tasks repeatedly, - // but ktime.Timer.Pause is idempotent so this is harmless. + // This means we'll iterate ThreadGroups and FDMaps shared by multiple + // tasks repeatedly, but ktime.Timer.Pause is idempotent so this is + // harmless. 
+ for _, it := range t.tg.timers { + it.PauseTimer() + } if fdm := t.tr.FDMap; fdm != nil { for _, desc := range fdm.files { if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { @@ -735,6 +741,9 @@ func (k *Kernel) resumeTimeLocked() { if t == t.tg.leader { t.tg.tm.resume() } + for _, it := range t.tg.timers { + it.ResumeTimer() + } if fdm := t.tr.FDMap; fdm != nil { for _, desc := range fdm.files { if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index 06be5a7e1..bb5db0309 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -46,7 +46,7 @@ type pendingSignals struct { // Note that signals is zero-indexed, but signal 1 is the first valid // signal, so signals[0] contains signals with signo 1 etc. This offset is // usually handled by using Signal.index(). - signals [linux.SignalMaximum]pendingSignalQueue `state:".([]*arch.SignalInfo)"` + signals [linux.SignalMaximum]pendingSignalQueue `state:".([]savedPendingSignal)"` // Bit i of pendingSet is set iff there is at least one signal with signo // i+1 pending. @@ -66,13 +66,16 @@ type pendingSignal struct { // pendingSignalEntry links into a pendingSignalList. pendingSignalEntry *arch.SignalInfo + + // If timer is not nil, it is the IntervalTimer which sent this signal. + timer *IntervalTimer } // enqueue enqueues the given signal. enqueue returns true on success and false // on failure (if the given signal's queue is full). // // Preconditions: info represents a valid signal. 
-func (p *pendingSignals) enqueue(info *arch.SignalInfo) bool { +func (p *pendingSignals) enqueue(info *arch.SignalInfo, timer *IntervalTimer) bool { sig := linux.Signal(info.Signo) q := &p.signals[sig.Index()] if sig.IsStandard() { @@ -82,7 +85,7 @@ func (p *pendingSignals) enqueue(info *arch.SignalInfo) bool { } else if q.length >= rtSignalCap { return false } - q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info}) + q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info, timer: timer}) q.length++ p.pendingSet |= linux.SignalSetOf(sig) return true @@ -119,12 +122,20 @@ func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo { if q.length == 0 { p.pendingSet &^= linux.SignalSetOf(sig) } + if ps.timer != nil { + ps.timer.updateDequeuedSignalLocked(ps.SignalInfo) + } return ps.SignalInfo } // discardSpecific causes all pending signals with number sig to be discarded. func (p *pendingSignals) discardSpecific(sig linux.Signal) { q := &p.signals[sig.Index()] + for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { + if ps.timer != nil { + ps.timer.signalRejectedLocked() + } + } q.pendingSignalList.Reset() q.length = 0 p.pendingSet &^= linux.SignalSetOf(sig) diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go index af61f6e8e..6d90ed033 100644 --- a/pkg/sentry/kernel/pending_signals_state.go +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -18,20 +18,29 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" ) +// +stateify savable +type savedPendingSignal struct { + si *arch.SignalInfo + timer *IntervalTimer +} + // saveSignals is invoked by stateify. 
-func (p *pendingSignals) saveSignals() []*arch.SignalInfo { - var pending []*arch.SignalInfo +func (p *pendingSignals) saveSignals() []savedPendingSignal { + var pending []savedPendingSignal for _, q := range p.signals { for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { - pending = append(pending, ps.SignalInfo) + pending = append(pending, savedPendingSignal{ + si: ps.SignalInfo, + timer: ps.timer, + }) } } return pending } // loadSignals is invoked by stateify. -func (p *pendingSignals) loadSignals(pending []*arch.SignalInfo) { - for _, si := range pending { - p.enqueue(si) +func (p *pendingSignals) loadSignals(pending []savedPendingSignal) { + for _, sps := range pending { + p.enqueue(sps.si, sps.timer) } } diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go new file mode 100644 index 000000000..0ab958529 --- /dev/null +++ b/pkg/sentry/kernel/posixtimer.go @@ -0,0 +1,306 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// IntervalTimer represents a POSIX interval timer as described by +// timer_create(2). 
+// +// +stateify savable +type IntervalTimer struct { + timer *ktime.Timer + + // If target is not nil, it receives signo from timer expirations. If group + // is true, these signals are thread-group-directed. These fields are + // immutable. + target *Task + signo linux.Signal + id linux.TimerID + sigval uint64 + group bool + + // If sigpending is true, a signal to target is already queued, and timer + // expirations should increment overrunCur instead of sending another + // signal. sigpending is protected by target's signal mutex. (If target is + // nil, the timer will never send signals, so sigpending will be unused.) + sigpending bool + + // If sigorphan is true, timer's setting has been changed since sigpending + // last became true, such that overruns should no longer be counted in the + // pending signals si_overrun. sigorphan is protected by target's signal + // mutex. + sigorphan bool + + // overrunCur is the number of overruns that have occurred since the last + // time a signal was sent. overrunCur is protected by target's signal + // mutex. + overrunCur uint64 + + // Consider the last signal sent by this timer that has been dequeued. + // overrunLast is the number of overruns that occurred between when this + // signal was sent and when it was dequeued. Equivalently, overrunLast was + // the value of overrunCur when this signal was dequeued. overrunLast is + // protected by target's signal mutex. + overrunLast uint64 +} + +// DestroyTimer releases it's resources. +func (it *IntervalTimer) DestroyTimer() { + it.timer.Destroy() + it.timerSettingChanged() + // A destroyed IntervalTimer is still potentially reachable via a + // pendingSignal; nil out timer so that it won't be saved. 
+ it.timer = nil +} + +func (it *IntervalTimer) timerSettingChanged() { + if it.target == nil { + return + } + it.target.tg.pidns.owner.mu.RLock() + defer it.target.tg.pidns.owner.mu.RUnlock() + it.target.tg.signalHandlers.mu.Lock() + defer it.target.tg.signalHandlers.mu.Unlock() + it.sigorphan = true + it.overrunCur = 0 + it.overrunLast = 0 +} + +// PauseTimer pauses the associated Timer. +func (it *IntervalTimer) PauseTimer() { + it.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (it *IntervalTimer) ResumeTimer() { + it.timer.Resume() +} + +// Preconditions: it.target's signal mutex must be locked. +func (it *IntervalTimer) updateDequeuedSignalLocked(si *arch.SignalInfo) { + it.sigpending = false + if it.sigorphan { + return + } + it.overrunLast = it.overrunCur + it.overrunCur = 0 + si.SetOverrun(saturateI32FromU64(it.overrunLast)) +} + +// Preconditions: it.target's signal mutex must be locked. +func (it *IntervalTimer) signalRejectedLocked() { + it.sigpending = false + if it.sigorphan { + return + } + it.overrunCur++ +} + +// Notify implements ktime.TimerListener.Notify. +func (it *IntervalTimer) Notify(exp uint64) { + if it.target == nil { + return + } + + it.target.tg.pidns.owner.mu.RLock() + defer it.target.tg.pidns.owner.mu.RUnlock() + it.target.tg.signalHandlers.mu.Lock() + defer it.target.tg.signalHandlers.mu.Unlock() + + if it.sigpending { + it.overrunCur += exp + return + } + + // sigpending must be set before sendSignalTimerLocked() so that it can be + // unset if the signal is discarded (in which case sendSignalTimerLocked() + // will return nil). + it.sigpending = true + it.sigorphan = false + it.overrunCur += exp - 1 + si := &arch.SignalInfo{ + Signo: int32(it.signo), + Code: arch.SignalInfoTimer, + } + si.SetTimerID(it.id) + si.SetSigval(it.sigval) + // si_overrun is set when the signal is dequeued. 
+ if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil { + it.signalRejectedLocked() + } +} + +// Destroy implements ktime.TimerListener.Destroy. Users of Timer should call +// DestroyTimer instead. +func (it *IntervalTimer) Destroy() { +} + +// IntervalTimerCreate implements timer_create(2). +func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.TimerID, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + + // Allocate a timer ID. + var id linux.TimerID + end := t.tg.nextTimerID + for { + id = t.tg.nextTimerID + _, ok := t.tg.timers[id] + t.tg.nextTimerID++ + if t.tg.nextTimerID < 0 { + t.tg.nextTimerID = 0 + } + if !ok { + break + } + if t.tg.nextTimerID == end { + return 0, syserror.EAGAIN + } + } + + // "The implementation of the default case where evp [sic] is NULL is + // handled inside glibc, which invokes the underlying system call with a + // suitably populated sigevent structure." - timer_create(2). This is + // misleading; the timer_create syscall also handles a NULL sevp as + // described by the man page + // (kernel/time/posix-timers.c:sys_timer_create(), do_timer_create()). This + // must be handled here instead of the syscall wrapper since sigval is the + // timer ID, which isn't available until we allocate it in this function. + if sigev == nil { + sigev = &linux.Sigevent{ + Signo: int32(linux.SIGALRM), + Notify: linux.SIGEV_SIGNAL, + Value: uint64(id), + } + } + + // Construct the timer. + it := &IntervalTimer{ + id: id, + sigval: sigev.Value, + } + switch sigev.Notify { + case linux.SIGEV_NONE: + // leave it.target = nil + case linux.SIGEV_SIGNAL, linux.SIGEV_THREAD: + // POSIX SIGEV_THREAD semantics are implemented in userspace by libc; + // to the kernel, SIGEV_THREAD and SIGEV_SIGNAL are equivalent. (See + // Linux's kernel/time/posix-timers.c:good_sigevent().) 
+ it.target = t.tg.leader + it.group = true + case linux.SIGEV_THREAD_ID: + t.tg.pidns.owner.mu.RLock() + target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)] + t.tg.pidns.owner.mu.RUnlock() + if !ok || target.tg != t.tg { + return 0, syserror.EINVAL + } + it.target = target + default: + return 0, syserror.EINVAL + } + if sigev.Notify != linux.SIGEV_NONE { + it.signo = linux.Signal(sigev.Signo) + if !it.signo.IsValid() { + return 0, syserror.EINVAL + } + } + it.timer = ktime.NewTimer(c, it) + + t.tg.timers[id] = it + return id, nil +} + +// IntervalTimerDelete implements timer_delete(2). +func (t *Task) IntervalTimerDelete(id linux.TimerID) error { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return syserror.EINVAL + } + delete(t.tg.timers, id) + it.DestroyTimer() + return nil +} + +// IntervalTimerSettime implements timer_settime(2). +func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs bool) (linux.Itimerspec, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return linux.Itimerspec{}, syserror.EINVAL + } + + newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock()) + if err != nil { + return linux.Itimerspec{}, err + } + tm, oldS := it.timer.SwapAnd(newS, it.timerSettingChanged) + its = ktime.ItimerspecFromSetting(tm, oldS) + return its, nil +} + +// IntervalTimerGettime implements timer_gettime(2). +func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return linux.Itimerspec{}, syserror.EINVAL + } + + tm, s := it.timer.Get() + its := ktime.ItimerspecFromSetting(tm, s) + return its, nil +} + +// IntervalTimerGetoverrun implements timer_getoverrun(2). +// +// Preconditions: The caller must be running on the task goroutine. 
+func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return 0, syserror.EINVAL + } + // By timer_create(2) invariant, either it.target == nil (in which case + // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact + // that t is executing timer_getoverrun(2) means that t.tg can't be + // completing execve, so t.tg.signalHandlers can't be changing, allowing us + // to lock t.tg.signalHandlers.mu without holding the TaskSet mutex. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is consistent with Linux after 78c9c4dfbf8c ("posix-timers: + // Sanitize overrun handling"). + return saturateI32FromU64(it.overrunLast), nil +} + +func saturateI32FromU64(x uint64) int32 { + if x > math.MaxInt32 { + return math.MaxInt32 + } + return int32(x) +} diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index e9e69004d..1a0d1876d 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -627,7 +627,7 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions // running, so we don't have to. child.pendingSignals.enqueue(&arch.SignalInfo{ Signo: int32(linux.SIGSTOP), - }) + }, nil) child.tg.signalHandlers.mu.Unlock() } } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 385299b24..bb3d0bd02 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -143,6 +143,22 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { oldTID = tracer.tg.pidns.tids[t] } t.promoteLocked() + // "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle + // this first since POSIX timers are protected by the signal mutex, which + // we're about to change. Note that we have to stop and destroy timers + // without holding any mutexes to avoid circular lock ordering. 
+ var its []*IntervalTimer + t.tg.signalHandlers.mu.Lock() + for _, it := range t.tg.timers { + its = append(its, it) + } + t.tg.timers = make(map[linux.TimerID]*IntervalTimer) + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.Unlock() + for _, it := range its { + it.DestroyTimer() + } + t.tg.pidns.owner.mu.Lock() // "During an execve(2), the dispositions of handled signals are reset to // the default; the dispositions of ignored signals are left unchanged. ... // [The] signal mask is preserved across execve(2). ... [The] pending diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b16844e91..b37fcf4c1 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -129,7 +129,7 @@ func (t *Task) killLocked() { // enqueueing an actual siginfo, such that // kernel/signal.c:collect_signal() initializes si_code to SI_USER. Code: arch.SignalInfoUser, - }) + }, nil) t.interrupt() } diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 62ec530be..4a66bce6b 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -396,6 +396,10 @@ func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) e } func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { + return t.sendSignalTimerLocked(info, group, nil) +} + +func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *IntervalTimer) error { if t.exitState == TaskExitDead { return syserror.ESRCH } @@ -429,6 +433,9 @@ func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore if linux.SignalSetOf(sig)&t.tr.SignalMask == 0 && ignored && !t.hasTracer() { t.Debugf("Discarding ignored signal %d", sig) + if timer != nil { + timer.signalRejectedLocked() + } return nil } @@ -436,11 +443,14 @@ func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) 
error { if group { q = &t.tg.pendingSignals } - if !q.enqueue(info) { + if !q.enqueue(info, timer) { if sig.IsRealtime() { return syserror.EAGAIN } t.Debugf("Discarding duplicate signal %d", sig) + if timer != nil { + timer.signalRejectedLocked() + } return nil } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 441b8a822..13dce08ce 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -155,6 +155,14 @@ type ThreadGroup struct { // tm contains process timers. TimerManager fields are immutable. tm TimerManager + // timers is the thread group's POSIX interval timers. nextTimerID is the + // TimerID at which allocation should begin searching for an unused ID. + // + // timers and nextTimerID are protected by timerMu. + timerMu sync.Mutex `state:"nosave"` + timers map[linux.TimerID]*IntervalTimer + nextTimerID linux.TimerID + // exitedCPUStats is the CPU usage for all exited tasks in the thread // group. exitedCPUStats is protected by the TaskSet mutex. exitedCPUStats usage.CPUStats @@ -218,6 +226,7 @@ func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linu limits: limits, } tg.tm = newTimerManager(tg, monotonicClock) + tg.timers = make(map[linux.TimerID]*IntervalTimer) tg.rscr.Store(&RSEQCriticalRegion{}) return tg } @@ -252,9 +261,23 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { // release releases the thread group's resources. func (tg *ThreadGroup) release() { - // This must be done without holding the TaskSet mutex since thread group - // timers call SendSignal with Timer.mu locked. + // These must be done without holding the TaskSet or signal mutexes since + // timers send signals with Timer.mu locked. 
+ tg.tm.destroy() + + var its []*IntervalTimer + tg.pidns.owner.mu.Lock() + tg.signalHandlers.mu.Lock() + for _, it := range tg.timers { + its = append(its, it) + } + tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved + tg.signalHandlers.mu.Unlock() + tg.pidns.owner.mu.Unlock() + for _, it := range its { + it.DestroyTimer() + } } // forEachChildThreadGroupLocked indicates over all child ThreadGroups. diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 6eadd2878..1f6fed007 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -320,8 +320,8 @@ func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Sett }, nil } -// SettingFromAbsSpec converts a (value, interval) pair to a Setting based on a -// reading from c. value is interpreted as an absolute time. +// SettingFromAbsSpec converts a (value, interval) pair to a Setting. value is +// interpreted as an absolute time. func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { if value.Before(ZeroTime) { return Setting{}, syserror.EINVAL @@ -336,6 +336,16 @@ func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { }, nil } +// SettingFromItimerspec converts a linux.Itimerspec to a Setting. If abs is +// true, its.Value is interpreted as an absolute time. Otherwise, it is +// interpreted as a time relative to c.Now(). +func SettingFromItimerspec(its linux.Itimerspec, abs bool, c Clock) (Setting, error) { + if abs { + return SettingFromAbsSpec(FromTimespec(its.Value), its.Interval.ToDuration()) + } + return SettingFromSpec(its.Value.ToDuration(), its.Interval.ToDuration(), c) +} + // SpecFromSetting converts a timestamp and a Setting to a (relative value, // interval) pair, as used by most Linux syscalls that return a struct // itimerval or struct itimerspec. 
@@ -346,6 +356,15 @@ func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { return s.Next.Sub(now), s.Period } +// ItimerspecFromSetting converts a Setting to a linux.Itimerspec. +func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec { + val, iv := SpecFromSetting(now, s) + return linux.Itimerspec{ + Interval: linux.DurationToTimespec(iv), + Value: linux.DurationToTimespec(val), + } +} + // advancedTo returns an updated Setting and a number of expirations after // the associated Clock indicates a time of now. // diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index c102af101..4465549ad 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -266,11 +266,11 @@ var AMD64 = &kernel.SyscallTable{ 219: RestartSyscall, // 220: Semtimedop, TODO 221: Fadvise64, - // 222: TimerCreate, TODO - // 223: TimerSettime, TODO - // 224: TimerGettime, TODO - // 225: TimerGetoverrun, TODO - // 226: TimerDelete, TODO + 222: TimerCreate, + 223: TimerSettime, + 224: TimerGettime, + 225: TimerGetoverrun, + 226: TimerDelete, 227: ClockSettime, 228: ClockGettime, 229: ClockGetres, diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index 4ed077626..aaed75c81 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -166,3 +166,88 @@ func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(sec), nil, nil } + +// TimerCreate implements linux syscall timer_create(2). 
+func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := args[0].Int() + sevp := args[1].Pointer() + timerIDp := args[2].Pointer() + + c, err := getClock(t, clockID) + if err != nil { + return 0, nil, err + } + + var sev *linux.Sigevent + if sevp != 0 { + sev = &linux.Sigevent{} + if _, err = t.CopyIn(sevp, sev); err != nil { + return 0, nil, err + } + } + + id, err := t.IntervalTimerCreate(c, sev) + if err != nil { + return 0, nil, err + } + + if _, err := t.CopyOut(timerIDp, &id); err != nil { + t.IntervalTimerDelete(id) + return 0, nil, err + } + + return uintptr(id), nil, nil +} + +// TimerSettime implements linux syscall timer_settime(2). +func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := linux.TimerID(args[0].Value) + flags := args[1].Int() + newValAddr := args[2].Pointer() + oldValAddr := args[3].Pointer() + + var newVal linux.Itimerspec + if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + return 0, nil, err + } + oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0) + if err != nil { + return 0, nil, err + } + if oldValAddr != 0 { + if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// TimerGettime implements linux syscall timer_gettime(2). +func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := linux.TimerID(args[0].Value) + curValAddr := args[1].Pointer() + + curVal, err := t.IntervalTimerGettime(timerID) + if err != nil { + return 0, nil, err + } + _, err = t.CopyOut(curValAddr, &curVal) + return 0, nil, err +} + +// TimerGetoverrun implements linux syscall timer_getoverrun(2). 
+func TimerGetoverrun(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := linux.TimerID(args[0].Value) + + o, err := t.IntervalTimerGetoverrun(timerID) + if err != nil { + return 0, nil, err + } + return uintptr(o), nil, nil +} + +// TimerDelete implements linux syscall timer_delete(2). +func TimerDelete(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := linux.TimerID(args[0].Value) + return 0, nil, t.IntervalTimerDelete(timerID) +} diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index cb81d42b9..92c6a3d60 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -85,28 +85,18 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if _, err := t.CopyIn(newValAddr, &newVal); err != nil { return 0, nil, err } - var s ktime.Setting - var err error - if flags&linux.TFD_TIMER_ABSTIME != 0 { - s, err = ktime.SettingFromAbsSpec(ktime.FromTimespec(newVal.Value), - newVal.Interval.ToDuration()) - } else { - s, err = ktime.SettingFromSpec(newVal.Value.ToDuration(), - newVal.Interval.ToDuration(), tf.Clock()) - } + newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tf.Clock()) if err != nil { return 0, nil, err } - valueNS, intervalNS := ktime.SpecFromSetting(tf.SetTime(s)) - if oldValAddr == 0 { - return 0, nil, nil - } - oldVal := linux.Itimerspec{ - Interval: linux.DurationToTimespec(intervalNS), - Value: linux.DurationToTimespec(valueNS), + tm, oldS := tf.SetTime(newS) + if oldValAddr != 0 { + oldVal := ktime.ItimerspecFromSetting(tm, oldS) + if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + return 0, nil, err + } } - _, err = t.CopyOut(oldValAddr, &oldVal) - return 0, nil, err + return 0, nil, nil } // TimerfdGettime implements Linux syscall timerfd_gettime(2). 
@@ -125,11 +115,8 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne return 0, nil, syserror.EINVAL } - valueNS, intervalNS := ktime.SpecFromSetting(tf.GetTime()) - curVal := linux.Itimerspec{ - Interval: linux.DurationToTimespec(intervalNS), - Value: linux.DurationToTimespec(valueNS), - } + tm, s := tf.GetTime() + curVal := ktime.ItimerspecFromSetting(tm, s) _, err := t.CopyOut(curValAddr, &curVal) return 0, nil, err } -- cgit v1.2.3 From c48708a041fcc9749e0162a7708f32e5a3d7e526 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 24 Aug 2018 17:14:38 -0700 Subject: fs: Drop unused WaitGroup in Dirent.destroy. PiperOrigin-RevId: 210182476 Change-Id: I655a2a801e2069108d30323f7f5ae76deb3ea3ec --- pkg/sentry/fs/dirent.go | 2 -- 1 file changed, 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 71ef3336e..c1dfa0de7 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1267,8 +1267,6 @@ func (d *Dirent) destroy() { return } - var wg sync.WaitGroup - defer wg.Wait() d.mu.Lock() defer d.mu.Unlock() -- cgit v1.2.3 From 106de2182d34197d76fb68863cd4a102ebac2dbb Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 24 Aug 2018 17:42:30 -0700 Subject: runsc: Terminal support for "docker exec -ti". This CL adds terminal support for "docker exec". We previously only supported consoles for the container process, but not exec processes. The SYS_IOCTL syscall was added to the default seccomp filter list, but only for ioctls that get/set winsize and termios structs. We need to allow these ioctl for all containers because it's possible to run "exec -ti" on a container that was started without an attached console, after the filters have been installed. Note that control-character signals are still not properly supported. 
Tested with: $ docker run --runtime=runsc -it alpine In another terminial: $ docker exec -it /bin/sh PiperOrigin-RevId: 210185456 Change-Id: I6d2401e53a7697bb988c120a8961505c335f96d9 --- pkg/abi/linux/ioctl.go | 6 +++- pkg/abi/linux/tty.go | 8 +++++ pkg/sentry/control/proc.go | 17 +++++------ pkg/sentry/fs/host/BUILD | 1 - pkg/sentry/fs/host/file.go | 19 ++++++------ pkg/sentry/fs/host/ioctl_unsafe.go | 19 +++++++++++- runsc/boot/controller.go | 2 +- runsc/boot/filter/BUILD | 1 + runsc/boot/filter/config.go | 38 +++++++++++++++++++----- runsc/boot/filter/filter.go | 6 +--- runsc/boot/loader.go | 2 +- runsc/cmd/BUILD | 1 + runsc/cmd/exec.go | 39 ++++++++++++++++++++++-- runsc/console/BUILD | 16 ++++++++++ runsc/console/console.go | 61 ++++++++++++++++++++++++++++++++++++++ runsc/sandbox/BUILD | 3 +- runsc/sandbox/console.go | 60 ------------------------------------- runsc/sandbox/sandbox.go | 20 +++++++------ 18 files changed, 207 insertions(+), 112 deletions(-) create mode 100644 runsc/console/BUILD create mode 100644 runsc/console/console.go delete mode 100644 runsc/sandbox/console.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 3ef046562..4d7a2dfd7 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -21,8 +21,12 @@ const ( TCGETS = 0x00005401 TCSETS = 0x00005402 TCSETSW = 0x00005403 - TIOCINQ = 0x0000541b + TIOCGPGRP = 0x0000540f + TIOCSPGRP = 0x00005410 TIOCOUTQ = 0x00005411 + TIOCGWINSZ = 0x00005413 + TIOCSWINSZ = 0x00005414 + TIOCINQ = 0x0000541b FIONREAD = TIOCINQ FIONBIO = 0x00005421 TIOCGPTN = 0x80045430 diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index 8c611d22a..81156867c 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -26,6 +26,14 @@ const ( disabledChar = 0 ) +// Winsize is struct winsize, defined in uapi/asm-generic/termios.h. 
+type Winsize struct { + Row uint16 + Col uint16 + Xpixel uint16 + Ypixel uint16 +} + // Termios is struct termios, defined in uapi/asm-generic/termbits.h. type Termios struct { InputFlags uint32 diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index d94ae560f..2493c5175 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -19,7 +19,6 @@ import ( "encoding/json" "fmt" "sort" - "syscall" "text/tabwriter" "time" @@ -73,6 +72,10 @@ type ExecArgs struct { // Capabilities is the list of capabilities to give to the process. Capabilities *auth.TaskCapabilities + // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host + // pty fd. + StdioIsPty bool + // FilePayload determines the files to give to the new process. urpc.FilePayload } @@ -108,17 +111,11 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { mounter := fs.FileOwnerFromContext(ctx) for appFD, f := range args.FilePayload.Files { - // Copy the underlying FD. - newFD, err := syscall.Dup(int(f.Fd())) - if err != nil { - return err - } - f.Close() + enableIoctl := args.StdioIsPty && appFD <= 2 - // Install the given file as an FD. - file, err := host.NewFile(ctx, newFD, mounter) + // Import the given file FD. This dups the FD as well. 
+ file, err := host.ImportFile(ctx, int(f.Fd()), mounter, enableIoctl) if err != nil { - syscall.Close(newFD) return err } defer file.DecRef() diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 29c79284a..f1252b0f2 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -48,7 +48,6 @@ go_library( "//pkg/unet", "//pkg/waiter", "//pkg/waiter/fdnotifier", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index f9bef6d93..8d2463c78 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -18,7 +18,6 @@ import ( "fmt" "syscall" - "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/log" @@ -296,7 +295,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys fd := f.iops.fileState.FD() ioctl := args[1].Uint64() switch ioctl { - case unix.TCGETS: + case linux.TCGETS: termios, err := ioctlGetTermios(fd) if err != nil { return 0, err @@ -306,7 +305,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys }) return 0, err - case unix.TCSETS, unix.TCSETSW: + case linux.TCSETS, linux.TCSETSW: var termios linux.Termios if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ AddressSpaceActive: true, @@ -316,7 +315,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys err := ioctlSetTermios(fd, ioctl, &termios) return 0, err - case unix.TIOCGPGRP: + case linux.TIOCGPGRP: // Args: pid_t *argp // When successful, equivalent to *argp = tcgetpgrp(fd). // Get the process group ID of the foreground process group on @@ -332,7 +331,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys }) return 0, err - case unix.TIOCSPGRP: + case linux.TIOCSPGRP: // Args: const pid_t *argp // Equivalent to tcsetpgrp(fd, *argp). 
// Set the foreground process group ID of this terminal. @@ -343,10 +342,10 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys log.Warningf("Ignoring application ioctl(TIOCSPGRP) call") return 0, nil - case unix.TIOCGWINSZ: + case linux.TIOCGWINSZ: // Args: struct winsize *argp // Get window size. - winsize, err := unix.IoctlGetWinsize(fd, unix.TIOCGWINSZ) + winsize, err := ioctlGetWinsize(fd) if err != nil { return 0, err } @@ -355,16 +354,16 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys }) return 0, err - case unix.TIOCSWINSZ: + case linux.TIOCSWINSZ: // Args: const struct winsize *argp // Set window size. - var winsize unix.Winsize + var winsize linux.Winsize if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ AddressSpaceActive: true, }); err != nil { return 0, err } - err := unix.IoctlSetWinsize(fd, unix.TIOCSWINSZ, &winsize) + err := ioctlSetWinsize(fd, &winsize) return 0, err default: diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index 3c07c3850..bc965a1c2 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -23,7 +23,7 @@ import ( func ioctlGetTermios(fd int) (*linux.Termios, error) { var t linux.Termios - _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TCGETS, uintptr(unsafe.Pointer(&t))) + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) if errno != 0 { return nil, errno } @@ -37,3 +37,20 @@ func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { } return nil } + +func ioctlGetWinsize(fd int) (*linux.Winsize, error) { + var w linux.Winsize + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w))) + if errno != 0 { + return nil, errno + } + return &w, nil +} + +func ioctlSetWinsize(fd int, w *linux.Winsize) error { + _, _, errno := 
syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))) + if errno != 0 { + return errno + } + return nil +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 69e88d8e0..2d6b507b3 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -227,7 +227,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // Execute runs a command on a created or running sandbox. func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { - log.Debugf("containerManager.Execute") + log.Debugf("containerManager.Execute: %+v", *e) proc := control.Proc{Kernel: cm.l.k} if err := proc.Exec(e, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index c9837c236..96be051fe 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -18,6 +18,7 @@ go_library( "//runsc/boot:__subpackages__", ], deps = [ + "//pkg/abi/linux", "//pkg/log", "//pkg/seccomp", "//pkg/sentry/platform", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index e45e599c3..db2e3f9d8 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -18,6 +18,7 @@ import ( "syscall" "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/seccomp" ) @@ -78,15 +79,36 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_TGKILL: {}, syscall.SYS_WRITE: {}, syscall.SYS_WRITEV: {}, -} -// TODO: Ioctl is needed in order to support tty consoles. -// Once filters support argument-checking, we should only allow ioctl -// with tty-related arguments. -func consoleFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - syscall.SYS_IOCTL: {}, - } + // SYS_IOCTL is needed for terminal support, but we only allow + // setting/getting termios and winsize. 
+ syscall.SYS_IOCTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCGETS), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETS), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETSW), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TIOCSWINSZ), + seccomp.AllowAny{}, /* winsize struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TIOCGWINSZ), + seccomp.AllowAny{}, /* winsize struct */ + }, + }, } // whitelistFSFilters returns syscalls made by whitelistFS. Using WhitelistFS diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index 6ea9c464e..c57bbd2e5 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -28,7 +28,7 @@ import ( ) // Install installs seccomp filters for based on the given platform. -func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error { +func Install(p platform.Platform, whitelistFS, hostNetwork bool) error { s := allowedSyscalls // Set of additional filters used by -race and -msan. 
Returns empty @@ -39,10 +39,6 @@ func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error Report("direct file access allows unrestricted file access!") s.Merge(whitelistFSFilters()) } - if console { - Report("console is enabled: syscall filters less restrictive!") - s.Merge(consoleFilters()) - } if hostNetwork { Report("host networking enabled: syscall filters less restrictive!") s.Merge(hostInetFilters()) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 2f212c704..0e94cf215 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -340,7 +340,7 @@ func (l *Loader) run() error { } else { whitelistFS := l.conf.FileAccess == FileAccessDirect hostNet := l.conf.Network == NetworkHost - if err := filter.Install(l.k.Platform, whitelistFS, l.console, hostNet); err != nil { + if err := filter.Install(l.k.Platform, whitelistFS, hostNet); err != nil { return fmt.Errorf("Failed to install seccomp filters: %v", err) } } diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index c45784749..b9ef4022f 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -38,6 +38,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/urpc", "//runsc/boot", + "//runsc/console", "//runsc/container", "//runsc/fsgofer", "//runsc/specutils", diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 4ee370656..b84a80119 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -35,6 +35,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/console" "gvisor.googlesource.com/gvisor/runsc/container" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -50,6 +51,11 @@ type Exec struct { detach bool processPath string pidFile string + + // consoleSocket is the path to an AF_UNIX socket which will receive a + // file descriptor referencing the master end of the console's + // pseudoterminal. 
+ consoleSocket string } // Name implements subcommands.Command.Name. @@ -91,6 +97,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") f.StringVar(&ex.processPath, "process", "", "path to the process.json") f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") + f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") } // Execute implements subcommands.Command.Execute. It starts a process in an @@ -178,11 +185,35 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat args = append(args, a) } } - cmd := exec.Command(binPath, args...) + + // Exec stdio defaults to current process stdio. cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr + + // If the console control socket file is provided, then create a new + // pty master/slave pair and set the tty on the sandbox process. + if ex.consoleSocket != "" { + // Create a new tty pair and send the master on the provided + // socket. + tty, err := console.NewWithSocket(ex.consoleSocket) + if err != nil { + Fatalf("error setting up console with socket %q: %v", ex.consoleSocket, err) + } + defer tty.Close() + + // Set stdio to the new tty slave. 
+ cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setsid: true, + Setctty: true, + Ctty: int(tty.Fd()), + } + } + if err := cmd.Start(); err != nil { Fatalf("failure to start child exec process, err: %v", err) } @@ -252,11 +283,12 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { return &control.ExecArgs{ Argv: argv, WorkingDirectory: ex.cwd, - FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, KUID: ex.user.kuid, KGID: ex.user.kgid, ExtraKGIDs: extraKGIDs, Capabilities: caps, + StdioIsPty: ex.consoleSocket != "", + FilePayload: urpc.FilePayload{[]*os.File{os.Stdin, os.Stdout, os.Stderr}}, }, nil } @@ -292,11 +324,12 @@ func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) { Argv: p.Args, Envv: p.Env, WorkingDirectory: p.Cwd, - FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, KUID: auth.KUID(p.User.UID), KGID: auth.KGID(p.User.GID), ExtraKGIDs: extraKGIDs, Capabilities: caps, + StdioIsPty: p.Terminal, + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, }, nil } diff --git a/runsc/console/BUILD b/runsc/console/BUILD new file mode 100644 index 000000000..fa1a7d430 --- /dev/null +++ b/runsc/console/BUILD @@ -0,0 +1,16 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "console", + srcs = ["console.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/console", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "@com_github_kr_pty//:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/console/console.go b/runsc/console/console.go new file mode 100644 index 000000000..2f2745b2b --- /dev/null +++ b/runsc/console/console.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package console contains utilities for working with pty consols in runsc. +package console + +import ( + "fmt" + "net" + "os" + + "github.com/kr/pty" + "golang.org/x/sys/unix" +) + +// NewWithSocket creates pty master/slave pair, sends the master FD over the given +// socket, and returns the slave. +func NewWithSocket(socketPath string) (*os.File, error) { + // Create a new pty master and slave. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + return nil, fmt.Errorf("error opening pty: %v", err) + } + defer ptyMaster.Close() + + // Get a connection to the socket path. + conn, err := net.Dial("unix", socketPath) + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) + } + uc, ok := conn.(*net.UnixConn) + if !ok { + ptySlave.Close() + return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) + } + socket, err := uc.File() + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) + } + + // Send the master FD over the connection. 
+ msg := unix.UnixRights(int(ptyMaster.Fd())) + if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err) + } + return ptySlave, nil +} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index d26a4dac6..e9a39f797 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "sandbox", srcs = [ - "console.go", "namespace.go", "network.go", "sandbox.go", @@ -21,9 +20,9 @@ go_library( "//pkg/sentry/control", "//pkg/urpc", "//runsc/boot", + "//runsc/console", "//runsc/fsgofer", "//runsc/specutils", - "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/sandbox/console.go b/runsc/sandbox/console.go deleted file mode 100644 index 3f133e12a..000000000 --- a/runsc/sandbox/console.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sandbox - -import ( - "fmt" - "net" - "os" - - "github.com/kr/pty" - "golang.org/x/sys/unix" -) - -// setupConsole creates pty master/slave pair, sends the master FD over the -// given socket, and returns the slave. 
-func setupConsole(socketPath string) (*os.File, error) { - // Create a new pty master and slave. - ptyMaster, ptySlave, err := pty.Open() - if err != nil { - return nil, fmt.Errorf("error opening pty: %v", err) - } - defer ptyMaster.Close() - - // Get a connection to the socket path. - conn, err := net.Dial("unix", socketPath) - if err != nil { - ptySlave.Close() - return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) - } - uc, ok := conn.(*net.UnixConn) - if !ok { - ptySlave.Close() - return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) - } - socket, err := uc.File() - if err != nil { - ptySlave.Close() - return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) - } - - // Send the master FD over the connection. - msg := unix.UnixRights(int(ptyMaster.Fd())) - if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { - ptySlave.Close() - return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err) - } - return ptySlave, nil -} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 7789608f8..e54ba4ba3 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/console" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -392,7 +393,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund "boot", "--bundle", bundleDir, "--controller-fd="+strconv.Itoa(nextFD), - fmt.Sprintf("--console=%t", consoleEnabled)) + "--console="+strconv.FormatBool(consoleEnabled)) nextFD++ controllerFile := os.NewFile(uintptr(fd), "control_server_socket") @@ -407,14 +408,19 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ } + // Sandbox stdio 
defaults to current process stdio. + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + // If the console control socket file is provided, then create a new // pty master/slave pair and set the tty on the sandbox process. if consoleEnabled { - // setupConsole will send the master on the socket, and return - // the slave. - tty, err := setupConsole(consoleSocket) + // console.NewWithSocket will send the master on the socket, + // and return the slave. + tty, err := console.NewWithSocket(consoleSocket) if err != nil { - return fmt.Errorf("error setting up control socket %q: %v", consoleSocket, err) + return fmt.Errorf("error setting up console with socket %q: %v", consoleSocket, err) } defer tty.Close() @@ -423,10 +429,6 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Stderr = tty cmd.SysProcAttr.Setctty = true cmd.SysProcAttr.Ctty = int(tty.Fd()) - } else { - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr } // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT -- cgit v1.2.3 From 2524111fc63343fd7372f5ea0266130adea778a5 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 27 Aug 2018 10:48:02 -0700 Subject: runsc: Terminal resizing support. Implements the TIOCGWINSZ and TIOCSWINSZ ioctls, which allow processes to resize the terminal. This allows, for example, sshd to properly set the window size for ssh sessions. 
PiperOrigin-RevId: 210392504 Change-Id: I0d4789154d6d22f02509b31d71392e13ee4a50ba --- pkg/abi/linux/tty.go | 10 ++++++++++ pkg/sentry/fs/tty/line_discipline.go | 24 ++++++++++++++++++++++++ pkg/sentry/fs/tty/master.go | 4 ++++ pkg/sentry/fs/tty/slave.go | 4 ++++ 4 files changed, 42 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index 81156867c..f63dc52aa 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -328,3 +328,13 @@ var DefaultSlaveTermios = KernelTermios{ InputSpeed: 38400, OutputSpeed: 38400, } + +// WindowSize corresponds to struct winsize defined in +// include/uapi/asm-generic/termios.h. +// +// +stateify savable +type WindowSize struct { + Rows uint16 + Cols uint16 + _ [4]byte // Padding for 2 unused shorts. +} diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index c7f6c5645..31804571e 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -76,6 +76,12 @@ const ( // // +stateify savable type lineDiscipline struct { + // sizeMu protects size. + sizeMu sync.Mutex `state:"nosave"` + + // size is the terminal size (width and height). + size linux.WindowSize + // inQueue is the input queue of the terminal. 
inQueue queue @@ -142,6 +148,24 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc return 0, err } +func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.sizeMu.Lock() + defer l.sizeMu.Unlock() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err +} + +func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.sizeMu.Lock() + defer l.sizeMu.Unlock() + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err +} + func (l *lineDiscipline) masterReadiness() waiter.EventMask { // We don't have to lock a termios because the default master termios // is immutable. diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index c8dc08c1a..ae7540eff 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -172,6 +172,10 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args a case linux.TIOCSPTLCK: // TODO: Implement pty locking. For now just pretend we do. 
return 0, nil + case linux.TIOCGWINSZ: + return 0, mf.t.ld.windowSize(ctx, io, args) + case linux.TIOCSWINSZ: + return 0, mf.t.ld.setWindowSize(ctx, io, args) default: return 0, syserror.ENOTTY } diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index ab92ced7e..963331b9b 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -150,6 +150,10 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args ar AddressSpaceActive: true, }) return 0, err + case linux.TIOCGWINSZ: + return 0, sf.si.t.ld.windowSize(ctx, io, args) + case linux.TIOCSWINSZ: + return 0, sf.si.t.ld.setWindowSize(ctx, io, args) default: return 0, syserror.ENOTTY } -- cgit v1.2.3 From bd01816c872672b74998694bb6e759df2a336735 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Mon, 27 Aug 2018 11:54:15 -0700 Subject: sentry: mark fsutil.DirFileOperations as savable. PiperOrigin-RevId: 210405166 Change-Id: I252766015885c418e914007baf2fc058fec39b3e --- pkg/sentry/fs/fsutil/file.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index b17f11a5a..d5881613b 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -214,6 +214,8 @@ func (NoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume } // DirFileOperations implements FileOperations for directories. +// +// +stateify savable type DirFileOperations struct { waiter.AlwaysReady `state:"nosave"` NoopRelease `state:"nosave"` -- cgit v1.2.3 From 0b3bfe2ea30d491a6533f8ee74eb6e3cea707f06 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 27 Aug 2018 14:25:21 -0700 Subject: fs: Fix remote-revalidate cache policy. When revalidating a Dirent, if the inode id is the same, then we don't need to throw away the entire Dirent. We can just update the unstable attributes in place. 
If the inode id has changed, then the remote file has been deleted or moved, and we have no choice but to throw away the dirent we have a look up another. In this case, we may still end up losing a mounted dirent that is a child of the revalidated dirent. However, that seems appropriate here because the entire mount point has been pulled out from underneath us. Because gVisor's overlay is at the Inode level rather than the Dirent level, we must pass the parent Inode and name along with the Inode that is being revalidated. PiperOrigin-RevId: 210431270 Change-Id: I705caef9c68900234972d5aac4ae3a78c61c7d42 --- pkg/sentry/fs/attr.go | 19 -------- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/fsutil/inode_cached.go | 41 ++++++++++++++++ pkg/sentry/fs/gofer/cache_policy.go | 64 ++++++++++++++++++++---- pkg/sentry/fs/gofer/gofer_test.go | 95 ++++++++++++++++++++++++++---------- pkg/sentry/fs/gofer/session.go | 6 +-- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 13 +++-- pkg/sentry/fs/mount_overlay.go | 20 +++++--- pkg/sentry/fs/tty/fs.go | 2 +- runsc/container/container_test.go | 18 +++---- 11 files changed, 201 insertions(+), 81 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 4178f18b2..091f4ac63 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -213,25 +213,6 @@ func (a AttrMask) Empty() bool { return a == AttrMask{} } -// Union returns an AttrMask containing the inclusive disjunction of fields in a and b. 
-func (a AttrMask) Union(b AttrMask) AttrMask { - return AttrMask{ - Type: a.Type || b.Type, - DeviceID: a.DeviceID || b.DeviceID, - InodeID: a.InodeID || b.InodeID, - BlockSize: a.BlockSize || b.BlockSize, - Size: a.Size || b.Size, - Usage: a.Usage || b.Usage, - Perms: a.Perms || b.Perms, - UID: a.UID || b.UID, - GID: a.GID || b.GID, - AccessTime: a.AccessTime || b.AccessTime, - ModificationTime: a.ModificationTime || b.ModificationTime, - StatusChangeTime: a.StatusChangeTime || b.StatusChangeTime, - Links: a.Links || b.Links, - } -} - // PermMask are file access permissions. // // +stateify savable diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index c1dfa0de7..5587582b5 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -499,7 +499,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // // We never allow the file system to revalidate mounts, that could cause them // to unexpectedly drop out before umount. - if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, cd.Inode) { + if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, name, d.Inode, cd.Inode) { // Good to go. This is the fast-path. return cd, nil } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 0a320e2d8..6777c8bf7 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -427,6 +427,47 @@ func (c *CachingInodeOperations) touchStatusChangeTimeLocked(ctx context.Context c.dirtyAttr.StatusChangeTime = true } +// UpdateUnstable updates the cached unstable attributes. Only non-dirty +// attributes are updated. +func (c *CachingInodeOperations) UpdateUnstable(attr fs.UnstableAttr) { + // All attributes are protected by attrMu. 
+ c.attrMu.Lock() + + if !c.dirtyAttr.Usage { + c.attr.Usage = attr.Usage + } + if !c.dirtyAttr.Perms { + c.attr.Perms = attr.Perms + } + if !c.dirtyAttr.UID { + c.attr.Owner.UID = attr.Owner.UID + } + if !c.dirtyAttr.GID { + c.attr.Owner.GID = attr.Owner.GID + } + if !c.dirtyAttr.AccessTime { + c.attr.AccessTime = attr.AccessTime + } + if !c.dirtyAttr.ModificationTime { + c.attr.ModificationTime = attr.ModificationTime + } + if !c.dirtyAttr.StatusChangeTime { + c.attr.StatusChangeTime = attr.StatusChangeTime + } + if !c.dirtyAttr.Links { + c.attr.Links = attr.Links + } + + // Size requires holding attrMu and dataMu. + c.dataMu.Lock() + if !c.dirtyAttr.Size { + c.attr.Size = attr.Size + } + c.dataMu.Unlock() + + c.attrMu.Unlock() +} + // Read reads from frames and otherwise directly from the backing file // into dst starting at offset until dst is full, EOF is reached, or an // error is encountered. diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index fa8abf51c..98f43c578 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -17,6 +17,7 @@ package gofer import ( "fmt" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ) @@ -108,25 +109,68 @@ func (cp cachePolicy) writeThrough(inode *fs.Inode) bool { return cp == cacheNone || cp == cacheAllWritethrough } -// revalidateDirent indicates that a dirent should be revalidated after a -// lookup, because the looked up version may be stale. -func (cp cachePolicy) revalidateDirent() bool { +// revalidate revalidates the child Inode if the cache policy allows it. +// +// Depending on the cache policy, revalidate will walk from the parent to the +// child inode, and if any unstable attributes have changed, will update the +// cached attributes on the child inode. If the walk fails, or the returned +// inode id is different from the one being revalidated, then the entire Dirent +// must be reloaded. 
+func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child *fs.Inode) bool { if cp == cacheAll || cp == cacheAllWritethrough { return false } - // TODO: The cacheRemoteRevalidating policy should only - // return true if the remote file's attributes have changed. - return true + if cp == cacheNone { + return true + } + + childIops, ok := child.InodeOperations.(*inodeOperations) + if !ok { + panic(fmt.Sprintf("revalidating inode operations of unknown type %T", child.InodeOperations)) + } + parentIops, ok := parent.InodeOperations.(*inodeOperations) + if !ok { + panic(fmt.Sprintf("revalidating inode operations with parent of unknown type %T", parent.InodeOperations)) + } + + // Walk from parent to child again. + // + // TODO: If we have a directory FD in the parent + // inodeOperations, then we can use fstatat(2) to get the inode + // attributes instead of making this RPC. + qids, _, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) + if err != nil { + // Can't look up the name. Trigger reload. + return true + } + + // If the Path has changed, then we are not looking at the file file. + // We must reload. + if qids[0].Path != childIops.fileState.key.Inode { + return true + } + + // If we are not caching unstable attrs, then there is nothing to + // update on this inode. + if !cp.cacheUAttrs(child) { + return false + } + + // Update the inode's cached unstable attrs. + s := childIops.session() + childIops.cachingInodeOps.UpdateUnstable(unstable(ctx, mask, attr, s.mounter, s.client)) + + return false } -// keepDirent indicates that dirents should be kept pinned in the dirent tree -// even if there are no application references on the file. -func (cp cachePolicy) keepDirent(inode *fs.Inode) bool { +// keep indicates that dirents should be kept pinned in the dirent tree even if +// there are no application references on the file. 
+func (cp cachePolicy) keep(d *fs.Dirent) bool { if cp == cacheNone { return false } - sattr := inode.StableAttr + sattr := d.Inode.StableAttr // NOTE: Only cache files, directories, and symlinks. return fs.IsFile(sattr) || fs.IsDir(sattr) || fs.IsSymlink(sattr) } diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 45fdaacfd..c8d7bd773 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -151,41 +151,60 @@ func TestLookup(t *testing.T) { func TestRevalidation(t *testing.T) { tests := []struct { - cachePolicy cachePolicy - preModificationWantReval bool - postModificationWantReval bool + cachePolicy cachePolicy + + // Whether dirent should be reloaded before any modifications. + preModificationWantReload bool + + // Whether dirent should be reloaded after updating an unstable + // attribute on the remote fs. + postModificationWantReload bool + + // Whether dirent unstable attributes should be updated after + // updating an attribute on the remote fs. + postModificationWantUpdatedAttrs bool + + // Whether dirent should be reloaded after the remote has + // removed the file. + postRemovalWantReload bool }{ { // Policy cacheNone causes Revalidate to always return // true. - cachePolicy: cacheNone, - preModificationWantReval: true, - postModificationWantReval: true, + cachePolicy: cacheNone, + preModificationWantReload: true, + postModificationWantReload: true, + postModificationWantUpdatedAttrs: true, + postRemovalWantReload: true, }, { // Policy cacheAll causes Revalidate to always return // false. - cachePolicy: cacheAll, - preModificationWantReval: false, - postModificationWantReval: false, + cachePolicy: cacheAll, + preModificationWantReload: false, + postModificationWantReload: false, + postModificationWantUpdatedAttrs: false, + postRemovalWantReload: false, }, { // Policy cacheAllWritethrough causes Revalidate to // always return false. 
- cachePolicy: cacheAllWritethrough, - preModificationWantReval: false, - postModificationWantReval: false, + cachePolicy: cacheAllWritethrough, + preModificationWantReload: false, + postModificationWantReload: false, + postModificationWantUpdatedAttrs: false, + postRemovalWantReload: false, }, { // Policy cacheRemoteRevalidating causes Revalidate to - // always return true. - // - // TODO: The cacheRemoteRevalidating - // policy should only return true if the remote file's - // attributes have changed. - cachePolicy: cacheRemoteRevalidating, - preModificationWantReval: true, - postModificationWantReval: true, + // return update cached unstable attrs, and returns + // true only when the remote inode itself has been + // removed or replaced. + cachePolicy: cacheRemoteRevalidating, + preModificationWantReload: false, + postModificationWantReload: false, + postModificationWantUpdatedAttrs: true, + postRemovalWantReload: true, }, } @@ -227,15 +246,17 @@ func TestRevalidation(t *testing.T) { if err != nil { t.Fatalf("Lookup(%q) failed: %v", name, err) } - if test.preModificationWantReval && dirent == newDirent { + if test.preModificationWantReload && dirent == newDirent { t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) } - if !test.preModificationWantReval && dirent != newDirent { + if !test.preModificationWantReload && dirent != newDirent { t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) } // Modify the underlying mocked file's modification time. - file.GetAttrMock.Attr.MTimeSeconds = uint64(time.Now().Unix()) + nowSeconds := time.Now().Unix() + rootFile.WalkGetAttrMock.Attr.MTimeSeconds = uint64(nowSeconds) + file.GetAttrMock.Attr.MTimeSeconds = uint64(nowSeconds) // Walk again. Depending on the cache policy, we may get a new // dirent. 
@@ -243,12 +264,36 @@ func TestRevalidation(t *testing.T) { if err != nil { t.Fatalf("Lookup(%q) failed: %v", name, err) } - if test.postModificationWantReval && dirent == newDirent { + if test.postModificationWantReload && dirent == newDirent { t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) } - if !test.postModificationWantReval && dirent != newDirent { + if !test.postModificationWantReload && dirent != newDirent { t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) } + uattrs, err := newDirent.Inode.UnstableAttr(ctx) + if err != nil { + t.Fatalf("Error getting unstable attrs: %v", err) + } + gotModTimeSeconds := uattrs.ModificationTime.Seconds() + if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds { + t.Fatalf("Lookup(%q) with cachePolicy=%s got new modification time %v, wanted %v", name, test.cachePolicy, gotModTimeSeconds, nowSeconds) + } + + // Make WalkGetAttr return ENOENT. This simulates + // removing the file from the remote fs. + rootFile.WalkGetAttrMock = p9test.WalkGetAttrMock{ + Err: syscall.ENOENT, + } + + // Walk again. Depending on the cache policy, we may + // get ENOENT. 
+ newDirent, err = rootDir.Walk(ctx, rootDir, name) + if test.postRemovalWantReload && err == nil { + t.Errorf("Lookup(%q) with cachePolicy=%s got nil error, wanted ENOENT", name, test.cachePolicy) + } + if !test.postRemovalWantReload && (err != nil || dirent != newDirent) { + t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v and error %v, wanted old dirent %v and nil error", name, test.cachePolicy, newDirent, err, dirent) + } }) } } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index eeb9087e9..49d27ee88 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -146,13 +146,13 @@ func (s *session) Destroy() { } // Revalidate implements MountSource.Revalidate. -func (s *session) Revalidate(ctx context.Context, i *fs.Inode) bool { - return s.cachePolicy.revalidateDirent() +func (s *session) Revalidate(ctx context.Context, name string, parent, child *fs.Inode) bool { + return s.cachePolicy.revalidate(ctx, name, parent, child) } // Keep implements MountSource.Keep. func (s *session) Keep(d *fs.Dirent) bool { - return s.cachePolicy.keepDirent(d.Inode) + return s.cachePolicy.keep(d) } // ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 89a0103ba..846b6e8bb 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -68,7 +68,7 @@ func NewMockMountSource(cache *DirentCache) *MountSource { } // Revalidate implements fs.MountSourceOperations.Revalidate. 
-func (n *MockMountSourceOps) Revalidate(context.Context, *Inode) bool { +func (n *MockMountSourceOps) Revalidate(context.Context, string, *Inode, *Inode) bool { return n.revalidate } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 455f5b35c..8345876fc 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -27,10 +27,13 @@ import ( // DirentOperations provide file systems greater control over how long a Dirent stays pinned // in core. Implementations must not take Dirent.mu. type DirentOperations interface { - // Revalidate returns true if the Inode is stale and its - // InodeOperations needs to be reloaded. Revalidate will never be - // called on a Inode that is mounted. - Revalidate(ctx context.Context, inode *Inode) bool + // Revalidate is called during lookup each time we encounter a Dirent + // in the cache. Implementations may update stale properties of the + // child Inode. If Revalidate returns true, then the entire Inode will + // be reloaded. + // + // Revalidate will never be called on a Inode that is mounted. + Revalidate(ctx context.Context, name string, parent, child *Inode) bool // Keep returns true if the Dirent should be kept in memory for as long // as possible beyond any active references. @@ -281,7 +284,7 @@ type SimpleMountSourceOperations struct { } // Revalidate implements MountSourceOperations.Revalidate. -func (smo *SimpleMountSourceOperations) Revalidate(context.Context, *Inode) bool { +func (smo *SimpleMountSourceOperations) Revalidate(context.Context, string, *Inode, *Inode) bool { return smo.revalidate } diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 9fa87c10f..dbc608c7e 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -41,23 +41,29 @@ func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *M // delegating to the upper filesystem's Revalidate method. 
We cannot reload // files from the lower filesystem, so we panic if the lower filesystem's // Revalidate method returns true. -func (o *overlayMountSourceOperations) Revalidate(ctx context.Context, inode *Inode) bool { - if inode.overlay == nil { +func (o *overlayMountSourceOperations) Revalidate(ctx context.Context, name string, parent, child *Inode) bool { + if child.overlay == nil { panic("overlay cannot revalidate inode that is not an overlay") } - // Should we bother checking this, or just ignore? - if inode.overlay.lower != nil && o.lower.Revalidate(ctx, inode.overlay.lower) { + // Revalidate is never called on a mount point, so parent and child + // must be from the same mount, and thus must both be overlay inodes. + if parent.overlay == nil { + panic("trying to revalidate an overlay inode but the parent is not an overlay") + } + + // We can't revalidate from the lower filesystem. + if child.overlay.lower != nil && o.lower.Revalidate(ctx, name, parent.overlay.lower, child.overlay.lower) { panic("an overlay cannot revalidate file objects from the lower fs") } - if inode.overlay.upper == nil { - // Nothing to revalidate. + // Do we have anything to revalidate? + if child.overlay.upper == nil { return false } // Does the upper require revalidation? - return o.upper.Revalidate(ctx, inode.overlay.upper) + return o.upper.Revalidate(ctx, name, parent.overlay.upper, child.overlay.upper) } // Keep implements MountSourceOperations by delegating to the upper diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index fe7da05b5..d9f8f02f3 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -82,7 +82,7 @@ type superOperations struct{} // Slave entries are dropped from dir when their master is closed, so an // existing slave Dirent in the tree is not sufficient to guarantee that it // still exists on the filesystem. 
-func (superOperations) Revalidate(context.Context, *fs.Inode) bool { +func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool { return true } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 25aaf3f86..4ce3afc91 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -134,7 +134,13 @@ func waitForFile(f *os.File) error { } return nil } - return testutil.Poll(op, 5*time.Second) + + timeout := 5 * time.Second + if testutil.RaceEnabled { + // Race makes slow things even slow, so bump the timeout. + timeout = 3 * timeout + } + return testutil.Poll(op, timeout) } // readOutputNum reads a file at given filepath and returns the int at the @@ -213,10 +219,8 @@ const ( nonExclusiveFS ) -// TODO: nonExclusiveFS was removed because it causes timeout -// with --race. Put it back when bug is fixed. -var all = []configOption{overlay, kvm} -var noOverlay = []configOption{kvm} +var noOverlay = []configOption{kvm, nonExclusiveFS} +var all = append(noOverlay, overlay) // configs generates different configurations to run tests. func configs(opts ...configOption) []*boot.Config { @@ -1572,10 +1576,6 @@ func TestContainerVolumeContentsShared(t *testing.T) { // the filesystem. spec := testutil.NewSpecWithArgs("sleep", "1000") - // TODO: $TEST_TMPDIR mount is mistakenly marked as RO after - // revalidation. Remove when it's fixed. - spec.Root.Readonly = false - dir, err := ioutil.TempDir(testutil.TmpDir(), "root-fs-test") if err != nil { t.Fatalf("TempDir failed: %v", err) -- cgit v1.2.3 From f0492d45aa31e32f8a04b13b7bf53e0161e1afb6 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Mon, 27 Aug 2018 17:20:36 -0700 Subject: Add /proc/sys/kernel/shm[all,max,mni]. 
PiperOrigin-RevId: 210459956 Change-Id: I51859b90fa967631e0a54a390abc3b5541fbee66 --- pkg/abi/linux/shm.go | 11 +++++++++++ pkg/sentry/fs/proc/sys.go | 6 ++++++ pkg/sentry/kernel/shm/shm.go | 34 ++++++++-------------------------- 3 files changed, 25 insertions(+), 26 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go index 9149ed094..f50b3c2e2 100644 --- a/pkg/abi/linux/shm.go +++ b/pkg/abi/linux/shm.go @@ -14,6 +14,8 @@ package linux +import "math" + // shmat(2) flags. Source: include/uapi/linux/shm.h const ( SHM_RDONLY = 010000 // Read-only access. @@ -38,6 +40,15 @@ const ( SHM_INFO = 14 ) +// SHM defaults as specified by linux. Source: include/uapi/linux/shm.h +const ( + SHMMIN = 1 + SHMMNI = 4096 + SHMMAX = math.MaxUint64 - 1<<24 + SHMALL = math.MaxUint64 - 1<<24 + SHMSEG = 4096 +) + // ShmidDS is equivalent to struct shmid64_ds. Source: // include/uapi/asm-generic/shmbuf.h type ShmidDS struct { diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index a2d36ca23..384b4ffe1 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -17,7 +17,9 @@ package proc import ( "fmt" "io" + "strconv" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" @@ -102,6 +104,10 @@ func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode d := &ramfs.Dir{} d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) d.AddChild(ctx, "hostname", p.newHostname(ctx, msrc)) + + d.AddChild(ctx, "shmmax", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10)))) + d.AddChild(ctx, "shmall", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10)))) + d.AddChild(ctx, "shmmni", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10)))) return newFile(d, msrc, fs.SpecialDirectory, 
nil) } diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 1ac444094..77973951e 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -35,7 +35,6 @@ package shm import ( "fmt" - "math" "sync" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -52,23 +51,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" ) -// Various limits for shared memory segments. -const ( - // shmsTotalMaxPages is the system-wide limit on all shared memory segments, measured - // in number of pages. - shmsTotalMaxPages = math.MaxInt64 // SHMALL - - // shmMaxSize is the maximum size of a single segment, in bytes. - shmMaxSize = math.MaxInt64 // SHMMAX - - // shmMinSize is the minimum specifiable size of a segment, effectively - // yielding a size rounded up to the next page size. Measured in bytes. - shmMinSize = 1 // SHMMIN - - // shmsTotalMax is the maximum number of segments on the system. - shmsTotalMax = 4096 // SHMMNI -) - // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. @@ -119,7 +101,7 @@ func (r *Registry) findByKey(key int32) *Shm { // FindOrCreate looks up or creates a segment in the registry. It's functionally // analogous to open(2). func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { - if create && (size < shmMinSize || size > shmMaxSize) { + if create && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or // greater than SHMMAX." 
- man shmget(2) return nil, syserror.EINVAL @@ -128,7 +110,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64 r.mu.Lock() defer r.mu.Unlock() - if len(r.shms) >= shmsTotalMax { + if len(r.shms) >= linux.SHMMNI { // "All possible shared memory IDs have been taken (SHMMNI) ..." // - man shmget(2) return nil, syserror.ENOSPC @@ -179,7 +161,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64 return nil, syserror.EINVAL } - if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > shmsTotalMaxPages { + if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL { // "... allocating a segment of the requested size would cause the // system to exceed the system-wide limit on shared memory (SHMALL)." // - man shmget(2) @@ -245,11 +227,11 @@ func (r *Registry) newShm(ctx context.Context, pid, key int32, creator fs.FileOw // system. See shmctl(IPC_INFO). func (r *Registry) IPCInfo() *linux.ShmParams { return &linux.ShmParams{ - ShmMax: shmMaxSize, - ShmMin: shmMinSize, - ShmMni: shmsTotalMax, - ShmSeg: shmsTotalMax, // Linux also sets this to SHMMNI. - ShmAll: shmsTotalMaxPages, + ShmMax: linux.SHMMAX, + ShmMin: linux.SHMMIN, + ShmMni: linux.SHMMNI, + ShmSeg: linux.SHMSEG, + ShmAll: linux.SHMALL, } } -- cgit v1.2.3 From ae648bafda2d82a6641e4a28bed34dae40d426ec Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 27 Aug 2018 20:35:00 -0700 Subject: Add command-line parameter to trigger panic on signal This is to troubleshoot problems with a hung process that is not responding to 'runsc debug --stack' command. 
PiperOrigin-RevId: 210483513 Change-Id: I4377b210b4e51bc8a281ad34fd94f3df13d9187d --- pkg/sentry/sighandling/sighandling.go | 5 ++--- runsc/boot/config.go | 6 ++++++ runsc/boot/loader.go | 14 +++++++++++++- runsc/cmd/debug.go | 10 ++++++++++ runsc/main.go | 2 ++ 5 files changed, 33 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 25295440c..5bac3a4e1 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -103,7 +103,7 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha // PrepareForwarding ensures that synchronous signals are forwarded to k and // returns a callback that starts signal delivery, which itself returns a // callback that stops signal forwarding. -func PrepareForwarding(k *kernel.Kernel, enablePanicSignal bool) func() func() { +func PrepareForwarding(k *kernel.Kernel, skipSignal syscall.Signal) func() func() { start := make(chan struct{}) stop := make(chan struct{}) @@ -119,8 +119,7 @@ func PrepareForwarding(k *kernel.Kernel, enablePanicSignal bool) func() func() { sigchan := make(chan os.Signal, 1) sigchans = append(sigchans, sigchan) - // SignalPanic is handled by Run. - if enablePanicSignal && linux.Signal(sig) == kernel.SignalPanic { + if syscall.Signal(sig) == skipSignal { continue } diff --git a/runsc/boot/config.go b/runsc/boot/config.go index bc392deb3..efb8563ea 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -204,7 +204,12 @@ type Config struct { // TODO: Remove this when multiple container is fully supported. MultiContainer bool + // WatchdogAction sets what action the watchdog takes when triggered. WatchdogAction watchdog.Action + + // PanicSignal register signal handling that panics. Usually set to + // SIGUSR2(12) to troubleshoot hangs. -1 disables it. + PanicSignal int } // ToFlags returns a slice of flags that correspond to the given Config. 
@@ -225,5 +230,6 @@ func (c *Config) ToFlags() []string { "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), "--watchdog-action=" + c.WatchdogAction.String(), + "--panic-signal=" + strconv.Itoa(c.PanicSignal), } } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 3963ed55d..0ad830a6b 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,6 +20,7 @@ import ( "fmt" "math/rand" "os" + "os/signal" "runtime" "sync" "sync/atomic" @@ -229,7 +230,18 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) } // Ensure that signals received are forwarded to the emulated kernel. - stopSignalForwarding := sighandling.PrepareForwarding(k, false)() + ps := syscall.Signal(conf.PanicSignal) + stopSignalForwarding := sighandling.PrepareForwarding(k, ps)() + if conf.PanicSignal != -1 { + // Panics if the sentry receives 'conf.PanicSignal'. + panicChan := make(chan os.Signal, 1) + signal.Notify(panicChan, ps) + go func() { // S/R-SAFE: causes sentry panic. + <-panicChan + panic("Signal-induced panic") + }() + log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) + } procArgs, err := newProcess(spec, creds, utsns, ipcns, k) if err != nil { diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 7952489de..b20987b2c 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -15,6 +15,8 @@ package cmd import ( + "syscall" + "context" "flag" "github.com/google/subcommands" @@ -27,6 +29,7 @@ import ( type Debug struct { pid int stacks bool + signal int } // Name implements subcommands.Command. @@ -48,6 +51,7 @@ func (*Debug) Usage() string { func (d *Debug) SetFlags(f *flag.FlagSet) { f.IntVar(&d.pid, "pid", 0, "sandbox process ID. 
Container ID is not necessary if this is set") f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") + f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") } // Execute implements subcommands.Command.Execute. @@ -96,6 +100,12 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("sandbox %q is not running", c.Sandbox.ID) } + if d.signal > 0 { + log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid) + if err := syscall.Kill(c.Sandbox.Pid, syscall.Signal(d.signal)); err != nil { + Fatalf("failed to send signal %d to processs %d", d.signal, c.Sandbox.Pid) + } + } if d.stacks { log.Infof("Retrieving sandbox stacks") stacks, err := c.Sandbox.Stacks() diff --git a/runsc/main.go b/runsc/main.go index 0a2cbca6c..773ec6486 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -61,6 +61,7 @@ var ( overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") multiContainer = flag.Bool("multi-container", false, "enable *experimental* multi-container support.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") ) var gitRevision = "" @@ -139,6 +140,7 @@ func main() { StraceLogSize: *straceLogSize, MultiContainer: *multiContainer, WatchdogAction: wa, + PanicSignal: *panicSignal, } if len(*straceSyscalls) != 0 { conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") -- cgit v1.2.3 From 25a8e13a78ad6418a1798ec419a1b5ab2116a7f8 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 28 Aug 2018 09:20:17 -0700 Subject: Bump to Go 1.11 The procid offset is unchanged. 
PiperOrigin-RevId: 210551969 Change-Id: I33ba1ce56c2f5631b712417d870aa65ef24e6022 --- WORKSPACE | 6 +++--- pkg/sentry/platform/procid/procid_amd64.s | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/WORKSPACE b/WORKSPACE index 45b120be4..302acc973 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,8 +1,8 @@ # Load go bazel rules and gazelle. http_archive( name = "io_bazel_rules_go", - url = "https://github.com/bazelbuild/rules_go/releases/download/0.14.0/rules_go-0.14.0.tar.gz", - sha256 = "5756a4ad75b3703eb68249d50e23f5d64eaf1593e886b9aa931aa6e938c4e301", + url = "https://github.com/bazelbuild/rules_go/releases/download/0.15.1/rules_go-0.15.1.tar.gz", + sha256 = "5f3b0304cdf0c505ec9e5b3c4fc4a87b5ca21b13d8ecc780c97df3d1809b9ce6", ) http_archive( name = "bazel_gazelle", @@ -11,7 +11,7 @@ http_archive( ) load("@io_bazel_rules_go//go:def.bzl", "go_rules_dependencies", "go_register_toolchains") go_rules_dependencies() -go_register_toolchains(go_version="1.10.3") +go_register_toolchains(go_version="1.11") load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository") gazelle_dependencies() diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s index ead4e3d91..5b1ba1f24 100644 --- a/pkg/sentry/platform/procid/procid_amd64.s +++ b/pkg/sentry/platform/procid/procid_amd64.s @@ -14,7 +14,7 @@ // +build amd64 // +build go1.8 -// +build !go1.11 +// +build !go1.12 #include "textflag.h" -- cgit v1.2.3 From d724863a313f5e08a043c8f2ccb4969e8ea23de1 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Tue, 28 Aug 2018 13:20:54 -0700 Subject: sentry: optimize dirent weakref map save / restore. Weak references save / restore involves multiple interface indirection and cause material latency overhead when there are lots of dirents, each containing a weak reference map. The nil entries in the map should also be purged. 
PiperOrigin-RevId: 210593727 Change-Id: Ied6f4c3c0726fcc53a24b983d9b3a79121b6b758 --- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_state.go | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 5587582b5..9417e808f 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -201,7 +201,7 @@ type Dirent struct { mu sync.Mutex `state:"nosave"` // children are cached via weak references. - children map[string]*refs.WeakRef + children map[string]*refs.WeakRef `state:".(map[string]*Dirent)"` } // NewDirent returns a new root Dirent, taking the caller's reference on inode. The caller diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index c6a1b5e38..fb81e7d54 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -17,6 +17,8 @@ package fs import ( "fmt" "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/refs" ) // beforeSave is invoked by stateify. @@ -36,6 +38,27 @@ func (d *Dirent) beforeSave() { } } +// saveChildren is invoked by stateify. +func (d *Dirent) saveChildren() map[string]*Dirent { + c := make(map[string]*Dirent) + for name, w := range d.children { + if rc := w.Get(); rc != nil { + // Drop the reference count obtain in w.Get() + rc.DecRef() + c[name] = rc.(*Dirent) + } + } + return c +} + +// loadChildren is invoked by stateify. +func (d *Dirent) loadChildren(children map[string]*Dirent) { + d.children = make(map[string]*refs.WeakRef) + for name, c := range children { + d.children[name] = refs.NewWeakRef(c, nil) + } +} + // afterLoad is invoked by stateify. func (d *Dirent) afterLoad() { if d.userVisible { -- cgit v1.2.3 From 515d9bf43b358cf7645d34dccdcc299f43dd8d74 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 28 Aug 2018 15:08:15 -0700 Subject: fs: Add tests for dirent ref counting with an overlay. 
PiperOrigin-RevId: 210614669 Change-Id: I408365ff6d6c7765ed7b789446d30e7079cbfc67 --- pkg/sentry/fs/inode_overlay_test.go | 70 +++++++++++++++++++++++++++++++++++++ pkg/sentry/fs/mounts.go | 4 +-- 2 files changed, 72 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index a7be9d040..3ee4c9667 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -299,6 +299,76 @@ func TestLookupRevalidation(t *testing.T) { } } +func TestCacheFlush(t *testing.T) { + ctx := contexttest.Context(t) + + // Upper and lower each have a file. + upperFileName := "file-from-upper" + lowerFileName := "file-from-lower" + upper := newTestRamfsDir(ctx, []dirContent{{name: upperFileName}}, nil) + lower := newTestRamfsDir(ctx, []dirContent{{name: lowerFileName}}, nil) + + overlay := fs.NewTestOverlayDir(ctx, upper, lower, true /* revalidate */) + + mns, err := fs.NewMountNamespace(ctx, overlay) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) + } + root := mns.Root() + defer root.DecRef() + + ctx = &rootContext{ + Context: ctx, + root: root, + } + + for _, fileName := range []string{upperFileName, lowerFileName} { + // Walk to the file. + dirent, err := mns.FindInode(ctx, root, nil, fileName, 0) + if err != nil { + t.Fatalf("FindInode(%q) failed: %v", fileName, err) + } + + // Get a file from the dirent. + file, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("GetFile() failed: %v", err) + } + + // The dirent should have 3 refs, one from us, one from the + // file, and one from the dirent cache. + // dirent cache. + if got, want := dirent.ReadRefs(), 3; int(got) != want { + t.Errorf("dirent.ReadRefs() got %d want %d", got, want) + } + + // Drop the file reference. + file.DecRef() + + // Dirent should have 2 refs left. 
+ if got, want := dirent.ReadRefs(), 2; int(got) != want { + t.Errorf("dirent.ReadRefs() got %d want %d", got, want) + } + + // Flush the dirent cache. + mns.FlushMountSourceRefs() + + // Dirent should have 1 ref left from the dirent cache. + if got, want := dirent.ReadRefs(), 1; int(got) != want { + t.Errorf("dirent.ReadRefs() got %d want %d", got, want) + } + + // Drop our ref. + dirent.DecRef() + + // We should be back to zero refs. + if got, want := dirent.ReadRefs(), 0; int(got) != want { + t.Errorf("dirent.ReadRefs() got %d want %d", got, want) + } + } + +} + type dir struct { fs.InodeOperations diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 144d3427d..0318f135d 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -348,10 +348,10 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly // Precondition: the path must be non-empty. func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { if root == nil { - panic("MountNamespace.FindInode: root must not be nil") + panic("MountNamespace.FindLink: root must not be nil") } if len(path) == 0 { - panic("MountNamespace.FindInode: path is empty") + panic("MountNamespace.FindLink: path is empty") } // Split the path. -- cgit v1.2.3 From 3b11769c772ab667f6f7a1863f6a614a095445ad Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 28 Aug 2018 15:17:32 -0700 Subject: fs: Don't bother saving negative dirents. 
PiperOrigin-RevId: 210616454 Change-Id: I3f536e2b4d603e540cdd9a67c61b8ec3351f4ac3 --- pkg/sentry/fs/dirent_state.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index fb81e7d54..58dd01202 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -45,7 +45,13 @@ func (d *Dirent) saveChildren() map[string]*Dirent { if rc := w.Get(); rc != nil { // Drop the reference count obtain in w.Get() rc.DecRef() - c[name] = rc.(*Dirent) + + cd := rc.(*Dirent) + if cd.IsNegative() { + // Don't bother saving negative Dirents. + continue + } + c[name] = cd } } return c -- cgit v1.2.3 From 52e6714146b46487a1260fab838a6ba193029845 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 28 Aug 2018 17:25:13 -0700 Subject: fasync: don't keep mutex after return PiperOrigin-RevId: 210637533 Change-Id: I3536c3f9efb54732a0d8ada8bc299142b2c1682f --- pkg/sentry/kernel/fasync/fasync.go | 1 + 1 file changed, 1 insertion(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 15218fb5a..69c7970fa 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -47,6 +47,7 @@ type FileAsync struct { func (a *FileAsync) Callback(e *waiter.Entry) { a.mu.Lock() if a.e.Callback == nil { + a.mu.Unlock() return } t := a.recipientT -- cgit v1.2.3 From 18932476167ecf16b7d3e85ae6addaaba193ceed Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 29 Aug 2018 11:21:21 -0700 Subject: fs: Drop reference to over-written file before renaming over it. dirent.go:Rename() walks to the file being replaced and defers replaced.DecRef(). After the rename, the reference is dropped, triggering a writeout and SettAttr call to the gofer. Because of lazyOpenForWrite, the gofer opens the replaced file BY ITS OLD NAME and calls ftruncate on it. 
This CL changes Remove to drop the reference on replaced (and thus trigger writeout) before the actual rename call. PiperOrigin-RevId: 210756097 Change-Id: I01ea09a5ee6c2e2d464560362f09943641638e0f --- pkg/sentry/fs/dirent.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 9417e808f..30545de7e 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1549,16 +1549,19 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check constraints on the object being replaced, if any. replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */) if err == nil { - defer replaced.DecRef() + // NOTE: We don't want to keep replaced alive + // across the Rename, so must call DecRef manually (no defer). // Target should not be an ancestor of source. if replaced == oldParent { + replaced.DecRef() // Why is this not EINVAL? See fs/namei.c. return syscall.ENOTEMPTY } // Is the thing we're trying to replace busy? if replaced.Busy() { + replaced.DecRef() return syscall.EBUSY } @@ -1566,9 +1569,11 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string oldIsDir := IsDir(renamed.Inode.StableAttr) newIsDir := IsDir(replaced.Inode.StableAttr) if !newIsDir && oldIsDir { + replaced.DecRef() return syscall.ENOTDIR } if !oldIsDir && newIsDir { + replaced.DecRef() return syscall.EISDIR } @@ -1583,6 +1588,9 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string replaced.mu.Lock() replaced.flush() replaced.mu.Unlock() + + // Done with replaced. + replaced.DecRef() } if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName); err != nil { -- cgit v1.2.3 From 956fe64ad6d628c70fe8d0ae7fd4001e8b648a3b Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 29 Aug 2018 11:45:23 -0700 Subject: fs: Fix renameMu lock recursion. 
dirent.walk() takes renameMu, but is often called with renameMu already held, which can lead to a deadlock. Fix this by requiring renameMu to be held for reading when dirent.walk() is called. This causes walks and existence checks to block while a rename operation takes place, but that is what we were already trying to enforce by taking renameMu in walk() anyways. PiperOrigin-RevId: 210760780 Change-Id: Id61018e6e4adbeac53b9c1b3aa24ab77f75d8a54 --- pkg/sentry/fs/dirent.go | 57 ++++++++++++++----------------------------------- 1 file changed, 16 insertions(+), 41 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 30545de7e..f81f7d627 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -451,6 +451,7 @@ func (d *Dirent) descendantOf(p *Dirent) bool { // Inode.Lookup, otherwise walk will keep d.mu locked. // // Preconditions: +// - renameMu must be held for reading. // - d.mu must be held. // - name must must not contain "/"s. func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) { @@ -461,22 +462,18 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl d.IncRef() return d, nil } else if name == ".." { - renameMu.RLock() // Respect the chroot. Note that in Linux there is no check to enforce // that d is a descendant of root. if d == root { d.IncRef() - renameMu.RUnlock() return d, nil } // Are we already at the root? Then ".." is ".". if d.IsRoot() { d.IncRef() - renameMu.RUnlock() return d, nil } d.parent.IncRef() - renameMu.RUnlock() return d.parent, nil } @@ -532,15 +529,11 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be // expensive, if possible release the lock and re-acquire it. 
if walkMayUnlock { - // While this dirent is unlocked, the lookup below is not allowed to proceed in tandem with a - // rename operation. The rename should be fully complete before we call Lookup on anything. d.mu.Unlock() - renameMu.RLock() } c, err := d.Inode.Lookup(ctx, name) if walkMayUnlock { d.mu.Lock() - renameMu.RUnlock() } // No dice. if err != nil { @@ -608,18 +601,27 @@ func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, panic("Dirent.Walk: root must not be nil") } + // We could use lockDirectory here, but this is a hot path and we want + // to avoid defer. + renameMu.RLock() d.dirMu.RLock() d.mu.Lock() + child, err := d.walk(ctx, root, name, true /* may unlock */) + d.mu.Unlock() d.dirMu.RUnlock() + renameMu.RUnlock() return child, err } // exists returns true if name exists in relation to d. // -// Preconditions: d.mu must be held. +// Preconditions: +// - renameMu must be held for reading. +// - d.mu must be held. +// - name must must not contain "/"s. func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { child, err := d.walk(ctx, root, name, true /* may unlock */) if err != nil { @@ -634,24 +636,13 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { // lockDirectory should be called for any operation that changes this `d`s // children (creating or removing them). func (d *Dirent) lockDirectory() func() { - if d.Inode.overlay != nil { - // overlay copyUp may need to look at Dirent parents, and hence - // may need renameMu. - renameMu.RLock() - d.dirMu.Lock() - d.mu.Lock() - return func() { - d.mu.Unlock() - d.dirMu.Unlock() - renameMu.RUnlock() - } - } - + renameMu.RLock() d.dirMu.Lock() d.mu.Lock() return func() { d.mu.Unlock() d.dirMu.Unlock() + renameMu.RUnlock() } } @@ -724,9 +715,10 @@ func (d *Dirent) finishCreate(child *Dirent, name string) { // genericCreate executes create if name does not exist. Removes a negative Dirent at name if // create succeeds. 
-// -// Preconditions: d.mu must be held. func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error { + unlock := d.lockDirectory() + defer unlock() + // Does something already exist? if d.exists(ctx, root, name) { return syscall.EEXIST @@ -765,9 +757,6 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c // CreateLink creates a new link in this directory. func (d *Dirent) CreateLink(ctx context.Context, root *Dirent, oldname, newname string) error { - unlock := d.lockDirectory() - defer unlock() - return d.genericCreate(ctx, root, newname, func() error { if err := d.Inode.CreateLink(ctx, d, oldname, newname); err != nil { return err @@ -779,9 +768,6 @@ func (d *Dirent) CreateLink(ctx context.Context, root *Dirent, oldname, newname // CreateHardLink creates a new hard link in this directory. func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Dirent, name string) error { - unlock := d.lockDirectory() - defer unlock() - // Make sure that target does not span filesystems. if d.Inode.MountSource != target.Inode.MountSource { return syscall.EXDEV @@ -799,9 +785,6 @@ func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Diren // CreateDirectory creates a new directory under this dirent. func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, perms FilePermissions) error { - unlock := d.lockDirectory() - defer unlock() - return d.genericCreate(ctx, root, name, func() error { if err := d.Inode.CreateDirectory(ctx, d, name, perms); err != nil { return err @@ -813,11 +796,6 @@ func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, // Bind satisfies the InodeOperations interface; otherwise same as GetFile. 
func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data unix.BoundEndpoint, perms FilePermissions) (*Dirent, error) { - d.dirMu.Lock() - defer d.dirMu.Unlock() - d.mu.Lock() - defer d.mu.Unlock() - var childDir *Dirent err := d.genericCreate(ctx, root, name, func() error { var e error @@ -839,9 +817,6 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data unix. // CreateFifo creates a new named pipe under this dirent. func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perms FilePermissions) error { - unlock := d.lockDirectory() - defer unlock() - return d.genericCreate(ctx, root, name, func() error { if err := d.Inode.CreateFifo(ctx, d, name, perms); err != nil { return err -- cgit v1.2.3 From 8bfb5fa91977a4b10d7ad87fe4627c236f841137 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 30 Aug 2018 12:00:27 -0700 Subject: fs: Add empty dir at /sys/class/power_supply. PiperOrigin-RevId: 210953512 Change-Id: I07d2d7fb0d268aa8eca26d81ef28b5b5c42289ee --- pkg/sentry/fs/sys/sys.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index b1c3d48eb..7b9697668 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -56,9 +56,11 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { // Add a basic set of top-level directories. In Linux, these // are dynamically added depending on the KConfig. Here we just // add the most common ones. 
- "block": newDir(ctx, msrc, nil), - "bus": newDir(ctx, msrc, nil), - "class": newDir(ctx, msrc, nil), + "block": newDir(ctx, msrc, nil), + "bus": newDir(ctx, msrc, nil), + "class": newDir(ctx, msrc, map[string]*fs.Inode{ + "power_supply": newDir(ctx, msrc, nil), + }), "dev": newDir(ctx, msrc, nil), "devices": newDevicesDir(ctx, msrc), "firmware": newDir(ctx, msrc, nil), -- cgit v1.2.3 From b1c1afa3ccc499df3fd15814d2b6cf9005bc2ab1 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 31 Aug 2018 13:06:16 -0700 Subject: Delete the long-obsolete kernel.TaskMaybe interface. PiperOrigin-RevId: 211131855 Change-Id: Ia7799561ccd65d16269e0ae6f408ab53749bca37 --- pkg/sentry/kernel/task.go | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 0f83c0a39..21be3120e 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -499,13 +499,6 @@ func (t *Task) afterLoad() { // struct. const copyScratchBufferLen = 52 -// TaskMaybe is the interface for extracting Tasks out of things which may be -// or contain Task objects. -type TaskMaybe interface { - // ExtractTask returns the Task. - ExtractTask() *Task -} - // CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut // functions. It must only be used within those functions and can only be used // by the task goroutine; it exists to improve performance and thus @@ -525,11 +518,6 @@ func (t *Task) FutexWaiter() *futex.Waiter { return t.futexWaiter } -// ExtractTask implements TaskMaybe.ExtractTask. -func (t *Task) ExtractTask() *Task { - return t -} - // TaskContext returns t's TaskContext. // // Precondition: The caller must be running on the task goroutine, or t.mu must -- cgit v1.2.3 From 098046ba193b839d69c059f7a0e68c89409b4237 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 31 Aug 2018 13:57:02 -0700 Subject: Disintegrate kernel.TaskResources. 
This allows us to call kernel.FDMap.DecRef without holding mutexes cleanly. PiperOrigin-RevId: 211139657 Change-Id: Ie59d5210fb9282e1950e2e40323df7264a01bcec --- pkg/sentry/kernel/BUILD | 1 - pkg/sentry/kernel/kernel.go | 35 ++++++---- pkg/sentry/kernel/ptrace.go | 4 +- pkg/sentry/kernel/task.go | 131 +++++++++++++++++++++++++---------- pkg/sentry/kernel/task_clone.go | 64 ++++++++++++----- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 13 ++-- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_resources.go | 132 ------------------------------------ pkg/sentry/kernel/task_signals.go | 24 +++---- pkg/sentry/kernel/task_start.go | 75 ++++++++++++-------- 11 files changed, 230 insertions(+), 253 deletions(-) delete mode 100644 pkg/sentry/kernel/task_resources.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index a7b847e94..0bc735550 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -95,7 +95,6 @@ go_library( "task_list.go", "task_log.go", "task_net.go", - "task_resources.go", "task_run.go", "task_sched.go", "task_signals.go", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 33cd727c6..c2b5c7269 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -332,7 +332,8 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { - if fdmap := t.FDMap(); fdmap != nil { + // We can skip locking Task.mu here since the kernel is paused. + if fdmap := t.fds; fdmap != nil { for _, desc := range fdmap.files { if flags := desc.file.Flags(); !flags.Write { continue @@ -381,7 +382,8 @@ func (ts *TaskSet) unregisterEpollWaiters() { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { - if fdmap := t.FDMap(); fdmap != nil { + // We can skip locking Task.mu here since the kernel is paused. 
+ if fdmap := t.fds; fdmap != nil { for _, desc := range fdmap.files { if desc.file != nil { if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { @@ -625,20 +627,23 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { if err != nil { return nil, err } - tr := newTaskResources(args.FDMap, newFSContext(root, wd, args.Umask)) - // NewTask unconditionally takes ownership of tr, so we never have to call - // tr.release. + + // Take a reference on the FDMap, which will be transferred to + // TaskSet.NewTask(). + args.FDMap.IncRef() // Create the task. config := &TaskConfig{ - Kernel: k, - ThreadGroup: tg, - TaskContext: tc, - TaskResources: tr, - Credentials: args.Credentials, - UTSNamespace: args.UTSNamespace, - IPCNamespace: args.IPCNamespace, - AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + Kernel: k, + ThreadGroup: tg, + TaskContext: tc, + FSContext: newFSContext(root, wd, args.Umask), + FDMap: args.FDMap, + Credentials: args.Credentials, + AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + UTSNamespace: args.UTSNamespace, + IPCNamespace: args.IPCNamespace, + AbstractSocketNamespace: NewAbstractSocketNamespace(), // FIXME } t, err := k.tasks.NewTask(config) if err != nil { @@ -714,7 +719,7 @@ func (k *Kernel) pauseTimeLocked() { for _, it := range t.tg.timers { it.PauseTimer() } - if fdm := t.tr.FDMap; fdm != nil { + if fdm := t.fds; fdm != nil { for _, desc := range fdm.files { if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() @@ -744,7 +749,7 @@ func (k *Kernel) resumeTimeLocked() { for _, it := range t.tg.timers { it.ResumeTimer() } - if fdm := t.tr.FDMap; fdm != nil { + if fdm := t.fds; fdm != nil { for _, desc := range fdm.files { if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 1a0d1876d..e21a25ae6 100644 --- a/pkg/sentry/kernel/ptrace.go +++ 
b/pkg/sentry/kernel/ptrace.go @@ -992,9 +992,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if addr != linux.SignalSetSize { return syserror.EINVAL } - target.mu.Lock() - defer target.mu.Unlock() - _, err := t.CopyOut(data, target.tr.SignalMask) + _, err := t.CopyOut(data, target.SignalMask()) return err case linux.PTRACE_SETSIGMASK: diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 21be3120e..32db0bf48 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -99,6 +99,19 @@ type Task struct { // ThreadGroup.signalHandlers. pendingSignals pendingSignals + // signalMask is the set of signals whose delivery is currently blocked. + // + // signalMask is accessed using atomic memory operations, and is protected + // by the signal mutex (such that reading signalMask is safe if either the + // signal mutex is locked or if atomic memory operations are used, while + // writing signalMask requires both). signalMask is owned by the task + // goroutine. + signalMask linux.SignalSet + + // FIXME: An equivalent to task_struct::real_blocked is needed + // to prevent signals that are ignored, but transiently unblocked by + // sigtimedwait(2), from being dropped in Task.sendSignalTimerLocked. + // If haveSavedSignalMask is true, savedSignalMask is the signal mask that // should be applied after the task has either delivered one signal to a // user handler or is about to resume execution in the untrusted @@ -182,25 +195,30 @@ type Task struct { // syscallRestartBlock is exclusive to the task goroutine. syscallRestartBlock SyscallRestartBlock + // p provides the mechanism by which the task runs code in userspace. The p + // interface object is immutable. + p platform.Context `state:"nosave"` + + // k is the Kernel that this task belongs to. The k pointer is immutable. + k *Kernel + // mu protects some of the following fields. 
mu sync.Mutex `state:"nosave"` - // tc and tr form the majority of the task's data. + // tc holds task data provided by the ELF loader. // - // tc and tr are protected by mu. tc and tr are owned by the task - // goroutine. tr.signalMask is protected by the signal mutex and must be - // written using atomic memory operations (such that reading tr.signalMask - // is safe if the signal mutex is locked or if atomic memory operations are - // used), but is also owned by the task goroutine. + // tc is protected by mu, and is owned by the task goroutine. tc TaskContext - tr TaskResources - // p provides the mechanism by which the task runs code in userspace. The p - // interface object is immutable. - p platform.Context `state:"nosave"` + // fsc is the task's filesystem context. + // + // fsc is protected by mu, and is owned by the task goroutine. + fsc *FSContext - // k is the Kernel that this task belongs to. The k pointer is immutable. - k *Kernel + // fds is the task's file descriptor table. + // + // fds is protected by mu, and is owned by the task goroutine. + fds *FDMap // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when @@ -351,6 +369,11 @@ type Task struct { // ipcns is protected by mu. ipcns *IPCNamespace + // abstractSockets tracks abstract sockets that are in use. + // + // abstractSockets is protected by mu. + abstractSockets *AbstractSocketNamespace + // parentDeathSignal is sent to this task's thread group when its parent exits. // // parentDeathSignal is protected by mu. @@ -518,29 +541,6 @@ func (t *Task) FutexWaiter() *futex.Waiter { return t.futexWaiter } -// TaskContext returns t's TaskContext. -// -// Precondition: The caller must be running on the task goroutine, or t.mu must -// be locked. -func (t *Task) TaskContext() *TaskContext { - return &t.tc -} - -// TaskResources returns t's TaskResources. 
-// -// Precondition: The caller must be running on the task goroutine, or t.mu must -// be locked. -func (t *Task) TaskResources() *TaskResources { - return &t.tr -} - -// WithMuLocked executes f with t.mu locked. -func (t *Task) WithMuLocked(f func(*Task)) { - t.mu.Lock() - defer t.mu.Unlock() - f(t) -} - // Kernel returns the Kernel containing t. func (t *Task) Kernel() *Kernel { return t.k @@ -572,7 +572,7 @@ func (t *Task) Value(key interface{}) interface{} { case context.CtxThreadGroupID: return int32(t.ThreadGroup().ID()) case fs.CtxRoot: - return t.FSContext().RootDirectory() + return t.fsc.RootDirectory() case inet.CtxStack: return t.NetworkContext() case ktime.CtxRealtimeClock: @@ -619,3 +619,62 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { t.syscallRestartBlock = nil return r } + +// IsChrooted returns true if the root directory of t's FSContext is not the +// root directory of t's MountNamespace. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) IsChrooted() bool { + realRoot := t.k.mounts.Root() + defer realRoot.DecRef() + root := t.fsc.RootDirectory() + if root != nil { + defer root.DecRef() + } + return root != realRoot +} + +// TaskContext returns t's TaskContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskContext() *TaskContext { + return &t.tc +} + +// FSContext returns t's FSContext. FSContext does not take an additional +// reference on the returned FSContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) FSContext() *FSContext { + return t.fsc +} + +// FDMap returns t's FDMap. FDMap does not take an additional reference on the +// returned FDMap. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. 
+func (t *Task) FDMap() *FDMap { + return t.fds +} + +// WithMuLocked executes f with t.mu locked. +func (t *Task) WithMuLocked(f func(*Task)) { + t.mu.Lock() + defer t.mu.Unlock() + f(t) +} + +// MountNamespace returns t's MountNamespace. MountNamespace does not take an +// additional reference on the returned MountNamespace. +func (t *Task) MountNamespace() *fs.MountNamespace { + return t.k.mounts +} + +// AbstractSockets returns t's AbstractSocketNamespace. +func (t *Task) AbstractSockets() *AbstractSocketNamespace { + return t.abstractSockets +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 526165af0..46c688b20 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -213,6 +213,22 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { tc.Arch.StateData().Regs.Fs_base = uint64(opts.TLS) } + var fsc *FSContext + if opts.NewFSContext { + fsc = t.fsc.Fork() + } else { + fsc = t.fsc + fsc.IncRef() + } + + var fds *FDMap + if opts.NewFiles { + fds = t.fds.Fork() + } else { + fds = t.fds + fds.IncRef() + } + pidns := t.tg.pidns if t.childPIDNamespace != nil { pidns = t.childPIDNamespace @@ -227,17 +243,21 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) } + cfg := &TaskConfig{ - Kernel: t.k, - ThreadGroup: tg, - TaskContext: tc, - TaskResources: t.tr.Fork(!opts.NewFiles, !opts.NewFSContext), - Niceness: t.Niceness(), - Credentials: creds.Fork(), - NetworkNamespaced: t.netns, - AllowedCPUMask: t.CPUMask(), - UTSNamespace: utsns, - IPCNamespace: ipcns, + Kernel: t.k, + ThreadGroup: tg, + SignalMask: t.SignalMask(), + TaskContext: tc, + FSContext: fsc, + FDMap: fds, + Credentials: creds.Fork(), + Niceness: t.Niceness(), + NetworkNamespaced: t.netns, + AllowedCPUMask: t.CPUMask(), + UTSNamespace: utsns, + IPCNamespace: ipcns, + 
AbstractSocketNamespace: t.abstractSockets, } if opts.NewThreadGroup { cfg.Parent = t @@ -435,15 +455,17 @@ func (t *Task) Unshare(opts *SharingOptions) error { t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) } t.mu.Lock() - defer t.mu.Unlock() + // Can't defer unlock: DecRefs must occur without holding t.mu. if opts.NewNetworkNamespace { if !haveCapSysAdmin { + t.mu.Unlock() return syserror.EPERM } t.netns = true } if opts.NewUTSNamespace { if !haveCapSysAdmin { + t.mu.Unlock() return syserror.EPERM } // Note that this must happen after NewUserNamespace, so the @@ -452,21 +474,29 @@ func (t *Task) Unshare(opts *SharingOptions) error { } if opts.NewIPCNamespace { if !haveCapSysAdmin { + t.mu.Unlock() return syserror.EPERM } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" t.ipcns = NewIPCNamespace(t.creds.UserNamespace) } + var oldfds *FDMap if opts.NewFiles { - oldFDMap := t.tr.FDMap - t.tr.FDMap = oldFDMap.Fork() - oldFDMap.DecRef() + oldfds = t.fds + t.fds = oldfds.Fork() } + var oldfsc *FSContext if opts.NewFSContext { - oldFS := t.tr.FSContext - t.tr.FSContext = oldFS.Fork() - oldFS.DecRef() + oldfsc = t.fsc + t.fsc = oldfsc.Fork() + } + t.mu.Unlock() + if oldfds != nil { + oldfds.DecRef() + } + if oldfsc != nil { + oldfsc.DecRef() } return nil } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index bb3d0bd02..1b760aba4 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -194,7 +194,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() // Remove FDs with the CloseOnExec flag set. 
- t.FDMap().RemoveIf(func(file *fs.File, flags FDFlags) bool { + t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b37fcf4c1..a1b24e1c6 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -253,21 +253,22 @@ func (*runExitMain) execute(t *Task) taskRunState { } } - // Deactivate the address space before releasing the MM. + // Deactivate the address space and update max RSS before releasing the + // task's MM. t.Deactivate() - - // Update the max resident set size before releasing t.tc.mm. t.tg.pidns.owner.mu.Lock() t.updateRSSLocked() t.tg.pidns.owner.mu.Unlock() - - // Release all of the task's resources. t.mu.Lock() t.tc.release() - t.tr.release() t.mu.Unlock() + + // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() + t.fsc.DecRef() + t.fds.DecRef() + // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 18efacb19..1769da210 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -63,7 +63,7 @@ func (t *Task) DebugDumpState() { if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } - t.Debugf("FDMap:\n%s", t.FDMap()) + t.Debugf("FDMap:\n%s", t.fds) } // debugDumpRegisters logs register state at log level debug. diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go deleted file mode 100644 index 0832bf989..000000000 --- a/pkg/sentry/kernel/task_resources.go +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kernel - -import ( - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" -) - -// TaskResources is the subset of a task's data provided by its creator that is -// not provided by the loader. -// -// +stateify savable -type TaskResources struct { - // SignalMask is the set of signals whose delivery is currently blocked. - // - // FIXME: Determine if we also need RealSignalMask - SignalMask linux.SignalSet - - // FSContext is the filesystem context. - *FSContext - - // FDMap provides access to files to the task. - *FDMap - - // Tracks abstract sockets that are in use. - AbstractSockets *AbstractSocketNamespace -} - -// newTaskResources returns a new TaskResources, taking an additional reference -// on fdm. -func newTaskResources(fdm *FDMap, fc *FSContext) *TaskResources { - fdm.IncRef() - return &TaskResources{ - FDMap: fdm, - FSContext: fc, - AbstractSockets: NewAbstractSocketNamespace(), - } -} - -// release releases all resources held by the TaskResources. release is called -// by the task when it exits. -func (tr *TaskResources) release() { - tr.FDMap.DecRef() - tr.FDMap = nil - tr.FSContext.DecRef() - tr.FSContext = nil - tr.AbstractSockets = nil -} - -// Fork returns a duplicate of tr. -// -// FIXME: Preconditions: When tr is owned by a Task, that task's -// signal mutex must be locked, or Fork must be called by the task's goroutine. 
-func (tr *TaskResources) Fork(shareFiles bool, shareFSContext bool) *TaskResources { - var fdmap *FDMap - if shareFiles { - fdmap = tr.FDMap - fdmap.IncRef() - } else { - fdmap = tr.FDMap.Fork() - } - - var fsc *FSContext - if shareFSContext { - fsc = tr.FSContext - fsc.IncRef() - } else { - fsc = tr.FSContext.Fork() - } - - return &TaskResources{ - SignalMask: tr.SignalMask, - FDMap: fdmap, - FSContext: fsc, - AbstractSockets: tr.AbstractSockets, - } -} - -// FDMap returns t's FDMap. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) FDMap() *FDMap { - return t.tr.FDMap -} - -// FSContext returns t's FSContext. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) FSContext() *FSContext { - return t.tr.FSContext -} - -// MountNamespace returns t's MountNamespace. MountNamespace does not take an additional -// reference on the returned MountNamespace. -func (t *Task) MountNamespace() *fs.MountNamespace { - return t.k.mounts -} - -// AbstractSockets returns t's AbstractSocketNamespace. -func (t *Task) AbstractSockets() *AbstractSocketNamespace { - return t.tr.AbstractSockets -} - -// IsChrooted returns true if the root directory of t's FSContext is not the -// root directory of t's MountNamespace. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. 
-func (t *Task) IsChrooted() bool { - realRoot := t.k.mounts.Root() - defer realRoot.DecRef() - root := t.tr.FSContext.RootDirectory() - if root != nil { - defer root.DecRef() - } - return root != realRoot -} diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 4a66bce6b..58a1bc0bd 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -124,10 +124,10 @@ var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTI // // Preconditions: t.tg.signalHandlers.mu must be locked. func (t *Task) dequeueSignalLocked() *arch.SignalInfo { - if info := t.pendingSignals.dequeue(t.tr.SignalMask); info != nil { + if info := t.pendingSignals.dequeue(t.signalMask); info != nil { return info } - return t.tg.pendingSignals.dequeue(t.tr.SignalMask) + return t.tg.pendingSignals.dequeue(t.signalMask) } // TakeSignal returns a pending signal not blocked by mask. Signal handlers are @@ -252,7 +252,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // handler should run with the current mask, but sigreturn should restore // the saved one. st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} - mask := t.tr.SignalMask + mask := t.signalMask if t.haveSavedSignalMask { mask = t.savedSignalMask } @@ -262,7 +262,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) t.haveSavedSignalMask = false // Add our signal mask. - newMask := t.tr.SignalMask | act.Mask + newMask := t.signalMask | act.Mask if !act.IsNoDefer() { newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) } @@ -431,7 +431,7 @@ func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *I // Linux's kernel/signal.c:__send_signal() => prepare_signal() => // sig_ignored(). 
ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore - if linux.SignalSetOf(sig)&t.tr.SignalMask == 0 && ignored && !t.hasTracer() { + if linux.SignalSetOf(sig)&t.signalMask == 0 && ignored && !t.hasTracer() { t.Debugf("Discarding ignored signal %d", sig) if timer != nil { timer.signalRejectedLocked() @@ -515,7 +515,7 @@ func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { // Preconditions: The signal mutex must be locked. func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { // - Do not choose tasks that are blocking the signal. - if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 { + if linux.SignalSetOf(sig)&t.signalMask != 0 { return false } // - No need to check Task.exitState, as the exit path sets every bit in the @@ -564,21 +564,21 @@ func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { } func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { - blocked := linux.SignalSetOf(sig)&t.tr.SignalMask != 0 + blocked := linux.SignalSetOf(sig)&t.signalMask != 0 act := t.tg.signalHandlers.actions[sig] ignored := act.Handler == arch.SignalActIgnore if blocked || ignored || unconditional { act.Handler = arch.SignalActDefault t.tg.signalHandlers.actions[sig] = act if blocked { - t.setSignalMaskLocked(t.tr.SignalMask &^ linux.SignalSetOf(sig)) + t.setSignalMaskLocked(t.signalMask &^ linux.SignalSetOf(sig)) } } } // SignalMask returns a copy of t's signal mask. func (t *Task) SignalMask() linux.SignalSet { - return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.tr.SignalMask))) + return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.signalMask))) } // SetSignalMask sets t's signal mask. @@ -595,8 +595,8 @@ func (t *Task) SetSignalMask(mask linux.SignalSet) { // Preconditions: The signal mutex must be locked. 
func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { - oldMask := t.tr.SignalMask - atomic.StoreUint64((*uint64)(&t.tr.SignalMask), uint64(mask)) + oldMask := t.signalMask + atomic.StoreUint64((*uint64)(&t.signalMask), uint64(mask)) // If the new mask blocks any signals that were not blocked by the old // mask, and at least one such signal is pending in tg.pendingSignals, and @@ -1076,7 +1076,7 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { t.tg.signalHandlers.mu.Lock() t.tg.pidns.owner.mu.Unlock() // If the signal is masked, re-queue it. - if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 { + if linux.SignalSetOf(sig)&t.signalMask != 0 { t.sendSignalLocked(info, false /* group */) t.tg.signalHandlers.mu.Unlock() return (*runInterrupt)(nil) diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index c97dee8fc..6ce99d268 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" @@ -26,7 +27,7 @@ import ( // TaskConfig defines the configuration of a new Task (see below). type TaskConfig struct { // Kernel is the owning Kernel. - *Kernel + Kernel *Kernel // Parent is the new task's parent. Parent may be nil. Parent *Task @@ -36,13 +37,24 @@ type TaskConfig struct { InheritParent *Task // ThreadGroup is the ThreadGroup the new task belongs to. - *ThreadGroup + ThreadGroup *ThreadGroup - // TaskContext is the TaskContext of the new task. - *TaskContext + // SignalMask is the new task's initial signal mask. + SignalMask linux.SignalSet - // TaskResources is the TaskResources of the new task. - *TaskResources + // TaskContext is the TaskContext of the new task. 
Ownership of the + // TaskContext is transferred to TaskSet.NewTask, whether or not it + // succeeds. + TaskContext *TaskContext + + // FSContext is the FSContext of the new task. A reference must be held on + // FSContext, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FSContext *FSContext + + // FDMap is the FDMap of the new task. A reference must be held on FDMap, + // which is transferred to TaskSet.NewTask whether or not it succeeds. + FDMap *FDMap // Credentials is the Credentials of the new task. Credentials *auth.Credentials @@ -62,25 +74,27 @@ type TaskConfig struct { // IPCNamespace is the IPCNamespace of the new task. IPCNamespace *IPCNamespace + + // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. + AbstractSocketNamespace *AbstractSocketNamespace } -// NewTask creates a new task defined by TaskConfig. -// Whether or not NewTask is successful, it takes ownership of both TaskContext -// and TaskResources of the TaskConfig. +// NewTask creates a new task defined by cfg. // // NewTask does not start the returned task; the caller must call Task.Start. func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { cfg.TaskContext.release() - cfg.TaskResources.release() + cfg.FSContext.DecRef() + cfg.FDMap.DecRef() return nil, err } return t, nil } -// newTask is a helper for TaskSet.NewTask that only takes ownership of TaskContext -// and TaskResources of the TaskConfig if it succeeds. +// newTask is a helper for TaskSet.NewTask that only takes ownership of parts +// of cfg if it succeeds. 
func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { tg := cfg.ThreadGroup tc := cfg.TaskContext @@ -90,23 +104,26 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { parent: cfg.Parent, children: make(map[*Task]struct{}), }, - runState: (*runApp)(nil), - interruptChan: make(chan struct{}, 1), - signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, - tc: *tc, - tr: *cfg.TaskResources, - p: cfg.Kernel.Platform.NewContext(), - k: cfg.Kernel, - ptraceTracees: make(map[*Task]struct{}), - allowedCPUMask: cfg.AllowedCPUMask.Copy(), - ioUsage: &usage.IO{}, - creds: cfg.Credentials, - niceness: cfg.Niceness, - netns: cfg.NetworkNamespaced, - utsns: cfg.UTSNamespace, - ipcns: cfg.IPCNamespace, - rseqCPU: -1, - futexWaiter: futex.NewWaiter(), + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalMask: cfg.SignalMask, + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + fsc: cfg.FSContext, + fds: cfg.FDMap, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + creds: cfg.Credentials, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespaced, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + abstractSockets: cfg.AbstractSocketNamespace, + rseqCPU: -1, + futexWaiter: futex.NewWaiter(), } t.endStopCond.L = &t.tg.signalHandlers.mu t.ptraceTracer.Store((*Task)(nil)) -- cgit v1.2.3 From b935311e2371abdbceba89294d0001905f2658d5 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 31 Aug 2018 14:16:36 -0700 Subject: Do not use fs.FileOwnerFromContext in fs/proc.file.UnstableAttr(). From //pkg/sentry/context/context.go: // - It is *not safe* to retain a Context passed to a function beyond the scope // of that function call. Passing a stored kernel.Task as a context.Context to fs.FileOwnerFromContext violates this requirement. 
PiperOrigin-RevId: 211143021 Change-Id: I4c5b02bd941407be4c9cfdbcbdfe5a26acaec037 --- pkg/sentry/fs/proc/file.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go index 4b2d08e75..4b3448245 100644 --- a/pkg/sentry/fs/proc/file.go +++ b/pkg/sentry/fs/proc/file.go @@ -51,7 +51,8 @@ func (f *file) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAt return fs.UnstableAttr{}, err } if f.t != nil { - uattr.Owner = fs.FileOwnerFromContext(f.t) + creds := f.t.Credentials() + uattr.Owner = fs.FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} } return uattr, nil } -- cgit v1.2.3 From f8ccfbbed4875e65c78c849cd46afa882ba68ee3 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 31 Aug 2018 15:43:32 -0700 Subject: Document more task-goroutine-owned fields in kernel.Task. Task.creds can only be changed by the task's own set*id and execve syscalls, and Task namespaces can only be changed by the task's own unshare/setns syscalls. PiperOrigin-RevId: 211156279 Change-Id: I94d57105d34e8739d964400995a8a5d76306b2a0 --- pkg/sentry/kernel/task.go | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 32db0bf48..ae4fd7817 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -354,19 +354,19 @@ type Task struct { // creds is the task's credentials. // - // creds is protected by mu, however the value itself is immutable and - // can only be changed by a copy. After reading the pointer, access - // will proceed outside the scope of mu. + // creds is protected by mu, however the value itself is immutable and can + // only be changed by a copy. After reading the pointer, access will + // proceed outside the scope of mu. creds is owned by the task goroutine. creds *auth.Credentials // utsns is the task's UTS namespace. 
// - // utsns is protected by mu. + // utsns is protected by mu. utsns is owned by the task goroutine. utsns *UTSNamespace // ipcns is the task's IPC namespace. // - // ipcns is protected by mu. + // ipcns is protected by mu. ipcns is owned by the task goroutine. ipcns *IPCNamespace // abstractSockets tracks abstract sockets that are in use. @@ -547,6 +547,9 @@ func (t *Task) Kernel() *Kernel { } // Value implements context.Context.Value. +// +// Preconditions: The caller must be running on the task goroutine (as implied +// by the requirements of context.Context). func (t *Task) Value(key interface{}) interface{} { switch key { case CtxCanTrace: @@ -556,18 +559,12 @@ func (t *Task) Value(key interface{}) interface{} { case CtxPIDNamespace: return t.tg.pidns case CtxUTSNamespace: - t.mu.Lock() - defer t.mu.Unlock() return t.utsns case CtxIPCNamespace: - t.mu.Lock() - defer t.mu.Unlock() return t.ipcns case CtxTask: return t case auth.CtxCredentials: - t.mu.Lock() - defer t.mu.Unlock() return t.creds case context.CtxThreadGroupID: return int32(t.ThreadGroup().ID()) -- cgit v1.2.3 From c09f9acd7c7a2e85472b1ee47bf26f7c89ded43e Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 4 Sep 2018 09:18:00 -0700 Subject: Distinguish Element and Linker for ilist. Furthermore, allow for the specification of an ElementMapper. This allows a single "Element" type to exist on multiple inline lists, and work without having to embed the entry type. This is a requisite change for supporting a per-Inode list of Dirents. 
PiperOrigin-RevId: 211467497 Change-Id: If2768999b43e03fdaecf8ed15f435fe37518d163 --- pkg/ilist/BUILD | 7 ++- pkg/ilist/list.go | 108 ++++++++++++++++++++-------------- pkg/refs/BUILD | 15 ++++- pkg/refs/refcounter.go | 8 +-- pkg/sentry/fs/BUILD | 1 + pkg/sentry/kernel/BUILD | 4 ++ pkg/sentry/kernel/futex/BUILD | 1 + pkg/sentry/kernel/semaphore/BUILD | 1 + pkg/sentry/mm/BUILD | 1 + pkg/tcpip/network/fragmentation/BUILD | 1 + pkg/tcpip/transport/ping/BUILD | 1 + pkg/tcpip/transport/tcp/BUILD | 1 + pkg/tcpip/transport/udp/BUILD | 1 + 13 files changed, 100 insertions(+), 50 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index b26a39132..1bd71b800 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -28,6 +28,7 @@ go_template_instance( prefix = "direct", template = ":generic_list", types = { + "Element": "*direct", "Linker": "*direct", }, ) @@ -47,6 +48,10 @@ go_template( srcs = [ "list.go", ], - opt_types = ["Linker"], + opt_types = [ + "Element", + "ElementMapper", + "Linker", + ], visibility = ["//visibility:public"], ) diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index a88b82196..4ae02eee9 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -21,12 +21,34 @@ package ilist // N.B. When substituted in a template instantiation, Linker doesn't need to // be an interface, and in most cases won't be. type Linker interface { - Next() Linker - Prev() Linker - SetNext(Linker) - SetPrev(Linker) + Next() Element + Prev() Element + SetNext(Element) + SetPrev(Element) } +// Element the item that is used at the API level. +// +// N.B. Like Linker, this is unlikely to be an interface in most cases. +type Element interface { + Linker +} + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. 
An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type ElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (ElementMapper) linkerFor(elem Element) Linker { return elem } + // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // @@ -39,8 +61,8 @@ type Linker interface { // // +stateify savable type List struct { - head Linker - tail Linker + head Element + tail Element } // Reset resets list l to the empty state. @@ -55,22 +77,22 @@ func (l *List) Empty() bool { } // Front returns the first element of list l or nil. -func (l *List) Front() Linker { +func (l *List) Front() Element { return l.head } // Back returns the last element of list l or nil. -func (l *List) Back() Linker { +func (l *List) Back() Element { return l.tail } // PushFront inserts the element e at the front of list l. -func (l *List) PushFront(e Linker) { - e.SetNext(l.head) - e.SetPrev(nil) +func (l *List) PushFront(e Element) { + ElementMapper{}.linkerFor(e).SetNext(l.head) + ElementMapper{}.linkerFor(e).SetPrev(nil) if l.head != nil { - l.head.SetPrev(e) + ElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } @@ -79,12 +101,12 @@ func (l *List) PushFront(e Linker) { } // PushBack inserts the element e at the back of list l. 
-func (l *List) PushBack(e Linker) { - e.SetNext(nil) - e.SetPrev(l.tail) +func (l *List) PushBack(e Element) { + ElementMapper{}.linkerFor(e).SetNext(nil) + ElementMapper{}.linkerFor(e).SetPrev(l.tail) if l.tail != nil { - l.tail.SetNext(e) + ElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } @@ -98,8 +120,8 @@ func (l *List) PushBackList(m *List) { l.head = m.head l.tail = m.tail } else if m.head != nil { - l.tail.SetNext(m.head) - m.head.SetPrev(l.tail) + ElementMapper{}.linkerFor(l.tail).SetNext(m.head) + ElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } @@ -109,46 +131,46 @@ func (l *List) PushBackList(m *List) { } // InsertAfter inserts e after b. -func (l *List) InsertAfter(b, e Linker) { - a := b.Next() - e.SetNext(a) - e.SetPrev(b) - b.SetNext(e) +func (l *List) InsertAfter(b, e Element) { + a := ElementMapper{}.linkerFor(b).Next() + ElementMapper{}.linkerFor(e).SetNext(a) + ElementMapper{}.linkerFor(e).SetPrev(b) + ElementMapper{}.linkerFor(b).SetNext(e) if a != nil { - a.SetPrev(e) + ElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. -func (l *List) InsertBefore(a, e Linker) { - b := a.Prev() - e.SetNext(a) - e.SetPrev(b) - a.SetPrev(e) +func (l *List) InsertBefore(a, e Element) { + b := ElementMapper{}.linkerFor(a).Prev() + ElementMapper{}.linkerFor(e).SetNext(a) + ElementMapper{}.linkerFor(e).SetPrev(b) + ElementMapper{}.linkerFor(a).SetPrev(e) if b != nil { - b.SetNext(e) + ElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. 
-func (l *List) Remove(e Linker) { - prev := e.Prev() - next := e.Next() +func (l *List) Remove(e Element) { + prev := ElementMapper{}.linkerFor(e).Prev() + next := ElementMapper{}.linkerFor(e).Next() if prev != nil { - prev.SetNext(next) + ElementMapper{}.linkerFor(prev).SetNext(next) } else { l.head = next } if next != nil { - next.SetPrev(prev) + ElementMapper{}.linkerFor(next).SetPrev(prev) } else { l.tail = prev } @@ -160,26 +182,26 @@ func (l *List) Remove(e Linker) { // // +stateify savable type Entry struct { - next Linker - prev Linker + next Element + prev Element } // Next returns the entry that follows e in the list. -func (e *Entry) Next() Linker { +func (e *Entry) Next() Element { return e.next } // Prev returns the entry that precedes e in the list. -func (e *Entry) Prev() Linker { +func (e *Entry) Prev() Element { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. -func (e *Entry) SetNext(entry Linker) { - e.next = entry +func (e *Entry) SetNext(elem Element) { + e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. 
-func (e *Entry) SetPrev(entry Linker) { - e.prev = entry +func (e *Entry) SetPrev(elem Element) { + e.prev = elem } diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 3ea877ccf..98150ba8f 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -1,16 +1,29 @@ package(licenses = ["notice"]) # Apache 2.0 +load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +go_template_instance( + name = "weak_ref_list", + out = "weak_ref_list.go", + package = "refs", + prefix = "weakRef", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*WeakRef", + "Linker": "*WeakRef", + }, +) + go_library( name = "refs", srcs = [ "refcounter.go", "refcounter_state.go", + "weak_ref_list.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/refs", visibility = ["//:sandbox"], - deps = ["//pkg/ilist"], ) go_test( diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 0d44c2499..638a93bab 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -20,8 +20,6 @@ import ( "reflect" "sync" "sync/atomic" - - "gvisor.googlesource.com/gvisor/pkg/ilist" ) // RefCounter is the interface to be implemented by objects that are reference @@ -61,7 +59,7 @@ type WeakRefUser interface { // // +stateify savable type WeakRef struct { - ilist.Entry `state:"nosave"` + weakRefEntry `state:"nosave"` // obj is an atomic value that points to the refCounter. obj atomic.Value `state:".(savedReference)"` @@ -195,7 +193,7 @@ type AtomicRefCount struct { mu sync.Mutex `state:"nosave"` // weakRefs is our collection of weak references. - weakRefs ilist.List `state:"nosave"` + weakRefs weakRefList `state:"nosave"` } // ReadRefs returns the current number of references. The returned count is @@ -276,7 +274,7 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { // return false due to the reference count check. 
r.mu.Lock() for !r.weakRefs.Empty() { - w := r.weakRefs.Front().(*WeakRef) + w := r.weakRefs.Front() // Capture the callback because w cannot be touched // after it's zapped -- the owner is free it reuse it // after that. diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 18cd5ae8e..a949fffbf 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -78,6 +78,7 @@ go_template_instance( template = "//pkg/ilist:generic_list", types = { "Linker": "*Dirent", + "Element": "*Dirent", }, ) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 0bc735550..7eb2bffeb 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -10,6 +10,7 @@ go_template_instance( prefix = "pendingSignal", template = "//pkg/ilist:generic_list", types = { + "Element": "*pendingSignal", "Linker": "*pendingSignal", }, ) @@ -21,6 +22,7 @@ go_template_instance( prefix = "processGroup", template = "//pkg/ilist:generic_list", types = { + "Element": "*ProcessGroup", "Linker": "*ProcessGroup", }, ) @@ -43,6 +45,7 @@ go_template_instance( prefix = "session", template = "//pkg/ilist:generic_list", types = { + "Element": "*Session", "Linker": "*Session", }, ) @@ -54,6 +57,7 @@ go_template_instance( prefix = "task", template = "//pkg/ilist:generic_list", types = { + "Element": "*Task", "Linker": "*Task", }, ) diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index b44a26974..0ff5b0a95 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -10,6 +10,7 @@ go_template_instance( prefix = "waiter", template = "//pkg/ilist:generic_list", types = { + "Element": "*Waiter", "Linker": "*Waiter", }, ) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index e7fa44e2c..bdcf4ce5c 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -10,6 +10,7 @@ go_template_instance( prefix = "waiter", template = "//pkg/ilist:generic_list", types = { + "Element": "*waiter", 
"Linker": "*waiter", }, ) diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index bbdfae247..ad9231774 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -67,6 +67,7 @@ go_template_instance( prefix = "io", template = "//pkg/ilist:generic_list", types = { + "Element": "*ioResult", "Linker": "*ioResult", }, ) diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index 83b4d253f..aaabfcb9a 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -10,6 +10,7 @@ go_template_instance( prefix = "reassembler", template = "//pkg/ilist:generic_list", types = { + "Element": "*reassembler", "Linker": "*reassembler", }, ) diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD index 117532fea..982b6795c 100644 --- a/pkg/tcpip/transport/ping/BUILD +++ b/pkg/tcpip/transport/ping/BUILD @@ -10,6 +10,7 @@ go_template_instance( prefix = "pingPacket", template = "//pkg/ilist:generic_list", types = { + "Element": "*pingPacket", "Linker": "*pingPacket", }, ) diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 8b911c295..c7943f08e 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -10,6 +10,7 @@ go_template_instance( prefix = "segment", template = "//pkg/ilist:generic_list", types = { + "Element": "*segment", "Linker": "*segment", }, ) diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 1a3a62d3d..4225e28dc 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -10,6 +10,7 @@ go_template_instance( prefix = "udpPacket", template = "//pkg/ilist:generic_list", types = { + "Element": "*udpPacket", "Linker": "*udpPacket", }, ) -- cgit v1.2.3 From 3944cb41cbef64ac507e87f258441000a46424d5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 4 Sep 2018 13:28:37 -0700 Subject: /proc/PID/mounts is not tab-delimited PiperOrigin-RevId: 211513847 Change-Id: 
Ib484dd2d921c3e5d70d0e410cd973d3bff4f6b73 --- pkg/sentry/fs/proc/mounts.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 2b8167c28..81dcc153a 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -173,7 +173,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan var buf bytes.Buffer forEachMountSource(mf.t, func(mountPath string, m *fs.MountSource) { - // Format (tab-separated): + // Format: // // // We use the filesystem name as the first field, since there @@ -191,7 +191,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan if m.Filesystem != nil { name = m.Filesystem.Name() } - fmt.Fprintf(&buf, "%s\t%s\t%s\t%s\t%d\t%d\n", "none", mountPath, name, opts, 0, 0) + fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, name, opts, 0, 0) }) return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0 -- cgit v1.2.3 From 2b8dae0bc5594f7088dd028268efaedbb5a72507 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Wed, 5 Sep 2018 09:20:18 -0700 Subject: Open(2) isn't honoring O_NOFOLLOW PiperOrigin-RevId: 211644897 Change-Id: I882ed827a477d6c03576463ca5bf2d6351892b90 --- pkg/abi/linux/file.go | 1 + pkg/sentry/syscalls/linux/sys_file.go | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 509f6b5b3..9bf229a57 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -37,6 +37,7 @@ const ( O_DIRECT = 00040000 O_LARGEFILE = 00100000 O_DIRECTORY = 00200000 + O_NOFOLLOW = 00400000 O_CLOEXEC = 02000000 O_SYNC = 04010000 O_PATH = 010000000 diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 2cf429f5c..3e28d4b8a 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ 
-136,7 +136,8 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u return 0, err } - err = fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + resolve := flags&linux.O_NOFOLLOW == 0 + err = fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { // First check a few things about the filesystem before trying to get the file // reference. // @@ -147,6 +148,10 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u return err } + if fs.IsSymlink(d.Inode.StableAttr) && !resolve { + return syserror.ELOOP + } + fileFlags := linuxToFlags(flags) // Linux always adds the O_LARGEFILE flag when running in 64-bit mode. fileFlags.LargeFile = true -- cgit v1.2.3 From 41b56696c4923276c6269812bb3dfa7643dab65d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 5 Sep 2018 18:05:59 -0700 Subject: Imported FD in exec was leaking Imported file needs to be closed after it's been imported. PiperOrigin-RevId: 211732472 Change-Id: Ia9249210558b77be076bcce465b832a22eed301f --- pkg/sentry/control/proc.go | 4 ++ pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/descriptor.go | 6 +-- pkg/sentry/fs/host/descriptor_test.go | 78 +++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 pkg/sentry/fs/host/descriptor_test.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 2493c5175..4848a5d2b 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -119,6 +119,10 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { return err } defer file.DecRef() + + // We're done with this file. 
+ f.Close() + if err := fdm.NewFDAt(kdefs.FD(appFD), file, kernel.FDFlags{}, l); err != nil { return err } diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index f1252b0f2..d1a6eaf6e 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -55,6 +55,7 @@ go_test( name = "host_test", size = "small", srcs = [ + "descriptor_test.go", "fs_test.go", "inode_test.go", "socket_test.go", diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 3aee4d11c..148291ba6 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -31,9 +31,9 @@ type descriptor struct { // donated is true if the host fd was donated by another process. donated bool - // If origFD >= 0, it is the host fd that this file was - // originally created from, which must be available at time - // of restore. Only valid if donated is true. + // If origFD >= 0, it is the host fd that this file was originally created + // from, which must be available at time of restore. The FD can be closed + // after descriptor is created. Only set if donated is true. origFD int // wouldBlock is true if value (below) points to a file that can diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go new file mode 100644 index 000000000..f393a8b54 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor_test.go @@ -0,0 +1,78 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package host + +import ( + "io/ioutil" + "path/filepath" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +func TestDescriptorRelease(t *testing.T) { + for _, tc := range []struct { + name string + saveable bool + wouldBlock bool + }{ + {name: "all false"}, + {name: "saveable", saveable: true}, + {name: "wouldBlock", wouldBlock: true}, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir("", "descriptor_test") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + + fd, err := syscall.Open(filepath.Join(dir, "file"), syscall.O_RDWR|syscall.O_CREAT, 0666) + if err != nil { + t.Fatal("failed to open temp file:", err) + } + + // FD ownership is transferred to the descritor. + queue := &waiter.Queue{} + d, err := newDescriptor(fd, false /* donated*/, tc.saveable, tc.wouldBlock, queue) + if err != nil { + syscall.Close(fd) + t.Fatalf("newDescriptor(%d, %t, false, %t, queue) failed, err: %v", fd, tc.saveable, tc.wouldBlock, err) + } + if tc.saveable { + if d.origFD < 0 { + t.Errorf("saveable descriptor must preserve origFD, desc: %+v", d) + } + } + if tc.wouldBlock { + if !fdnotifier.HasFD(int32(d.value)) { + t.Errorf("FD not registered with notifier, desc: %+v", d) + } + } + + oldVal := d.value + d.Release() + if d.value != -1 { + t.Errorf("d.value want: -1, got: %d", d.value) + } + if tc.wouldBlock { + if fdnotifier.HasFD(int32(oldVal)) { + t.Errorf("FD not unregistered with notifier, desc: %+v", d) + } + } + }) + } +} -- cgit v1.2.3 From 6516b5648b471951e8c4da7869531c9509ba1495 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 6 Sep 2018 13:46:45 -0700 Subject: createProcessArgs.RootFromContext should return process Root if it exists. It was always returning the MountNamespace root, which may be different from the process Root if the process is in a chroot environment. 
PiperOrigin-RevId: 211862181 Change-Id: I63bfeb610e2b0affa9fdbdd8147eba3c39014480 --- pkg/sentry/kernel/kernel.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c2b5c7269..31a2f068d 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -553,10 +553,18 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: - if ctx.k.mounts == nil { - return nil + if ctx.args.Root != nil { + // Take a reference on the root dirent that will be + // given to the caller. + ctx.args.Root.IncRef() + return ctx.args.Root } - return ctx.k.mounts.Root() + if ctx.k.mounts != nil { + // MountNamespace.Root() will take a reference on the + // root dirent for us. + return ctx.k.mounts.Root() + } + return nil case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: -- cgit v1.2.3 From 169e2efc5a2116755beca91e65802780282ab4c1 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 7 Sep 2018 10:27:19 -0700 Subject: Continue handling signals after disabling forwarding Before destroying the Kernel, we disable signal forwarding, relinquishing control to the Go runtime. External signals that arrive after disabling forwarding but before the sandbox exits thus may use runtime.raise (i.e., tkill(2)) and violate the syscall filters. Adjust forwardSignals to handle signals received after disabling forwarding the same way they are handled before starting forwarding. i.e., by implementing the standard Go runtime behavior using tgkill(2) instead of tkill(2). This also makes the stop callback block until forwarding actually stops. This isn't required to avoid tkill(2) but is a saner interface. 
PiperOrigin-RevId: 211995946 Change-Id: I3585841644409260eec23435cf65681ad41f5f03 --- pkg/sentry/sighandling/sighandling.go | 41 +++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 5bac3a4e1..b08588c11 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -30,9 +30,11 @@ import ( // numSignals is the number of normal (non-realtime) signals on Linux. const numSignals = 32 -// forwardSignals listens for incoming signals and delivers them to k. It starts -// when the start channel is closed and stops when the stop channel is closed. -func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop chan struct{}) { +// forwardSignals listens for incoming signals and delivers them to k. +// +// It starts when the start channel is closed, stops when the stop channel +// is closed, and closes done once it will no longer deliver signals to k. +func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop, done chan struct{}) { // Build a select case. sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}} for _, sigchan := range sigchans { @@ -47,13 +49,19 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha // Was it the start / stop channel? if index == 0 { if !ok { - if started { - // stop channel - break - } else { - // start channel + if !started { + // start channel; start forwarding and + // swap this case for the stop channel + // to select stop requests. started = true sc[0] = reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)} + } else { + // stop channel; stop forwarding and + // clear this case so it is never + // selected again. 
+ started = false + close(done) + sc[0].Chan = reflect.Value{} } } continue @@ -69,7 +77,8 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha signal := linux.Signal(index) if !started { - // Kernel is not ready to receive signals. + // Kernel cannot receive signals, either because it is + // not ready yet or is shutting down. // // Kill ourselves if this signal would have killed the // process before PrepareForwarding was called. i.e., all @@ -92,20 +101,19 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop cha k.SendExternalSignal(&arch.SignalInfo{Signo: int32(signal)}, "sentry") } - - // Close all individual channels. - for _, sigchan := range sigchans { - signal.Stop(sigchan) - close(sigchan) - } } // PrepareForwarding ensures that synchronous signals are forwarded to k and // returns a callback that starts signal delivery, which itself returns a // callback that stops signal forwarding. +// +// Note that this function permanently takes over signal handling. After the +// stop callback, signals revert to the default Go runtime behavior, which +// cannot be overridden with external calls to signal.Notify. func PrepareForwarding(k *kernel.Kernel, skipSignal syscall.Signal) func() func() { start := make(chan struct{}) stop := make(chan struct{}) + done := make(chan struct{}) // Register individual channels. One channel per standard signal is // required as os.Notify() is non-blocking and may drop signals. To avoid @@ -126,12 +134,13 @@ func PrepareForwarding(k *kernel.Kernel, skipSignal syscall.Signal) func() func( signal.Notify(sigchan, syscall.Signal(sig)) } // Start up our listener. - go forwardSignals(k, sigchans, start, stop) // S/R-SAFE: synchronized by Kernel.extMu + go forwardSignals(k, sigchans, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. 
return func() func() { close(start) return func() { close(stop) + <-done } } } -- cgit v1.2.3 From f895cb4d8b4b37a563b7a5b9dc92eae552084b44 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 7 Sep 2018 10:44:50 -0700 Subject: Use root abstract socket namespace for exec PiperOrigin-RevId: 211999211 Change-Id: I5968dd1a8313d3e49bb6e6614e130107495de41d --- pkg/sentry/control/proc.go | 23 ++++++++------- pkg/sentry/kernel/kernel.go | 41 +++++++++++++++++--------- runsc/boot/loader.go | 72 +++++++++++++++++++-------------------------- 3 files changed, 70 insertions(+), 66 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 4848a5d2b..6949a3ae5 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -95,17 +95,18 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { proc.Kernel.RootUserNamespace()) initArgs := kernel.CreateProcessArgs{ - Filename: args.Filename, - Argv: args.Argv, - Envv: args.Envv, - WorkingDirectory: args.WorkingDirectory, - Credentials: creds, - FDMap: fdm, - Umask: 0022, - Limits: l, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - UTSNamespace: proc.Kernel.RootUTSNamespace(), - IPCNamespace: proc.Kernel.RootIPCNamespace(), + Filename: args.Filename, + Argv: args.Argv, + Envv: args.Envv, + WorkingDirectory: args.WorkingDirectory, + Credentials: creds, + FDMap: fdm, + Umask: 0022, + Limits: l, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: proc.Kernel.RootUTSNamespace(), + IPCNamespace: proc.Kernel.RootIPCNamespace(), + AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), } ctx := initArgs.NewContext(proc.Kernel) mounter := fs.FileOwnerFromContext(ctx) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 31a2f068d..bc41c3963 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -90,17 +90,18 @@ type Kernel struct { platform.Platform `state:"nosave"` // See 
InitKernelArgs for the meaning of these fields. - featureSet *cpuid.FeatureSet - timekeeper *Timekeeper - tasks *TaskSet - rootUserNamespace *auth.UserNamespace - networkStack inet.Stack `state:"nosave"` - applicationCores uint - useHostCores bool - extraAuxv []arch.AuxEntry - vdso *loader.VDSO - rootUTSNamespace *UTSNamespace - rootIPCNamespace *IPCNamespace + featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + networkStack inet.Stack `state:"nosave"` + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + rootAbstractSocketNamespace *AbstractSocketNamespace // mounts holds the state of the virtual filesystem. mounts is initially // nil, and must be set by calling Kernel.SetRootMountNamespace before @@ -201,11 +202,14 @@ type InitKernelArgs struct { // Vdso holds the VDSO and its parameter page. Vdso *loader.VDSO - // RootUTSNamespace is the root UTS namepsace. + // RootUTSNamespace is the root UTS namespace. RootUTSNamespace *UTSNamespace - // RootIPCNamespace is the root IPC namepsace. + // RootIPCNamespace is the root IPC namespace. RootIPCNamespace *IPCNamespace + + // RootAbstractSocketNamespace is the root Abstract Socket namespace. + RootAbstractSocketNamespace *AbstractSocketNamespace } // Init initialize the Kernel with no tasks. @@ -231,6 +235,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.rootUserNamespace = args.RootUserNamespace k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace + k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace k.networkStack = args.NetworkStack k.applicationCores = args.ApplicationCores if args.UseHostCores { @@ -509,6 +514,9 @@ type CreateProcessArgs struct { // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace + // AbstractSocketNamespace is the initial Abstract Socket namespace. 
+ AbstractSocketNamespace *AbstractSocketNamespace + // Root optionally contains the dirent that serves as the root for the // process. If nil, the mount namespace's root is used as the process' // root. @@ -651,7 +659,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, - AbstractSocketNamespace: NewAbstractSocketNamespace(), // FIXME + AbstractSocketNamespace: args.AbstractSocketNamespace, } t, err := k.tasks.NewTask(config) if err != nil { @@ -839,6 +847,11 @@ func (k *Kernel) RootIPCNamespace() *IPCNamespace { return k.rootIPCNamespace } +// RootAbstractSocketNamespace returns the root AbstractSocketNamespace. +func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { + return k.rootAbstractSocketNamespace +} + // RootMountNamespace returns the MountNamespace. func (k *Kernel) RootMountNamespace() *fs.MountNamespace { k.extMu.Lock() diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index ae2226e12..540cd6188 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -143,6 +143,19 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } tk.SetClocks(time.NewCalibratedClocks()) + if err := enableStrace(conf); err != nil { + return nil, fmt.Errorf("failed to enable strace: %v", err) + } + + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). + networkStack, err := newEmptyNetworkStack(conf, k) + if err != nil { + return nil, fmt.Errorf("failed to create network: %v", err) + } + // Create capabilities. 
caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -163,26 +176,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console caps, auth.NewRootUserNamespace()) - // Create user namespace. - // TODO: Not clear what domain name should be here. It is - // not configurable from runtime spec. - utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) - - ipcns := kernel.NewIPCNamespace(creds.UserNamespace) - - if err := enableStrace(conf); err != nil { - return nil, fmt.Errorf("failed to enable strace: %v", err) - } - - // Create an empty network stack because the network namespace may be empty at - // this point. Netns is configured before Run() is called. Netstack is - // configured using a control uRPC message. Host network is configured inside - // Run(). - networkStack, err := newEmptyNetworkStack(conf, k) - if err != nil { - return nil, fmt.Errorf("failed to create network: %v", err) - } - // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. if err = k.Init(kernel.InitKernelArgs{ @@ -191,10 +184,11 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console RootUserNamespace: creds.UserNamespace, NetworkStack: networkStack, // TODO: use number of logical processors from cgroups. 
- ApplicationCores: uint(runtime.NumCPU()), - Vdso: vdso, - RootUTSNamespace: utsns, - RootIPCNamespace: ipcns, + ApplicationCores: uint(runtime.NumCPU()), + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), }); err != nil { return nil, fmt.Errorf("error initializing kernel: %v", err) } @@ -244,7 +238,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) } - procArgs, err := newProcess(spec, creds, utsns, ipcns, k) + procArgs, err := newProcess(spec, creds, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -265,7 +259,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -274,15 +268,16 @@ func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSName // Create the process arguments. procArgs := kernel.CreateProcessArgs{ - Argv: spec.Process.Args, - Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. - Credentials: creds, - Umask: 0022, - Limits: ls, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - UTSNamespace: utsns, - IPCNamespace: ipcns, + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. 
+ Credentials: creds, + Umask: 0022, + Limits: ls, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: k.RootUTSNamespace(), + IPCNamespace: k.RootIPCNamespace(), + AbstractSocketNamespace: k.RootAbstractSocketNamespace(), } return procArgs, nil } @@ -421,12 +416,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // TODO New containers should be started in new PID namespaces // when indicated by the spec. - procArgs, err := newProcess( - spec, - creds, - l.k.RootUTSNamespace(), - l.k.RootIPCNamespace(), - l.k) + procArgs, err := newProcess(spec, creds, l.k) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } -- cgit v1.2.3 From 172860a059ce2cff68aa85a3f66319ee52bdec13 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 7 Sep 2018 16:58:19 -0700 Subject: Add 'Starting gVisor...' message to syslog This allows applications to verify they are running with gVisor. It also helps debugging when running with a mix of container runtimes. Closes #54 PiperOrigin-RevId: 212059457 Change-Id: I51d9595ee742b58c1f83f3902ab2e2ecbd5cedec --- pkg/sentry/kernel/syslog.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 125312b6a..6531bd5d2 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -86,14 +86,18 @@ func (s *syslog) Log() []byte { return m } - time := 0.0 + const format = "<6>[%11.6f] %s\n" + + s.msg = append(s.msg, []byte(fmt.Sprintf(format, 0.0, "Starting gVisor..."))...) + + time := 0.1 for i := 0; i < 10; i++ { time += rand.Float64() / 2 - s.msg = append(s.msg, []byte(fmt.Sprintf("<6>[%11.6f] %s\n", time, selectMessage()))...) + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) } time += rand.Float64() / 2 - s.msg = append(s.msg, []byte(fmt.Sprintf("<6>[%11.6f] Ready!\n", time))...) 
+ s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) // Return a copy. o := make([]byte, len(s.msg)) -- cgit v1.2.3 From 9751b800a6835f7febf99f1dee22a5aedd43f381 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 7 Sep 2018 17:38:34 -0700 Subject: runsc: Support multi-container exec. We must use a context.Context with a Root Dirent that corresponds to the container's chroot. Previously we were using the root context, which does not have a chroot. Getting the correct context required refactoring some of the path-lookup code. We can't lookup the path without a context.Context, which requires kernel.CreateProcArgs, which we only get inside control.Execute. So we have to do the path lookup much later than we previously were. PiperOrigin-RevId: 212064734 Change-Id: I84a5cfadacb21fd9c3ab9c393f7e308a40b9b537 --- pkg/sentry/control/proc.go | 18 +++++++++++- pkg/sentry/fs/mounts.go | 66 ++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/controller.go | 36 ++++++++++++++++++------ runsc/boot/fs.go | 58 ++------------------------------------ runsc/sandbox/sandbox.go | 7 ++++- runsc/specutils/specutils.go | 12 -------- 6 files changed, 118 insertions(+), 79 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 6949a3ae5..289b8ba0e 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -54,6 +54,11 @@ type ExecArgs struct { // Envv is a list of environment variables. Envv []string `json:"envv"` + // Root defines the root directory for the new process. A reference on + // Root must be held for the lifetime of the ExecArgs. If Root is nil, + // it will default to the VFS root. + Root *fs.Dirent + // WorkingDirectory defines the working directory for the new process. 
WorkingDirectory string `json:"wd"` @@ -99,6 +104,7 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { Argv: args.Argv, Envv: args.Envv, WorkingDirectory: args.WorkingDirectory, + Root: args.Root, Credentials: creds, FDMap: fdm, Umask: 0022, @@ -109,8 +115,18 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), } ctx := initArgs.NewContext(proc.Kernel) - mounter := fs.FileOwnerFromContext(ctx) + if initArgs.Filename == "" { + // Get the full path to the filename from the PATH env variable. + paths := fs.GetPath(initArgs.Envv) + f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + if err != nil { + return fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + } + initArgs.Filename = f + } + + mounter := fs.FileOwnerFromContext(ctx) for appFD, f := range args.FilePayload.Files { enableIoctl := args.StdioIsPty && appFD <= 2 diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 0318f135d..c0a803b2d 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -16,9 +16,13 @@ package fs import ( "fmt" + "path" + "strings" "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -509,3 +513,65 @@ func (mns *MountNamespace) SyncAll(ctx context.Context) { defer mns.mu.Unlock() mns.root.SyncAll(ctx) } + +// ResolveExecutablePath resolves the given executable name given a set of +// paths that might contain it. +func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name string, paths []string) (string, error) { + // Absolute paths can be used directly. 
+ if path.IsAbs(name) { + return name, nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if wd == "" { + wd = "/" + } + if !path.IsAbs(wd) { + return "", fmt.Errorf("working directory %q must be absolute", wd) + } + return path.Join(wd, name), nil + } + + // Otherwise, we must look up the name in the paths, starting from the + // calling context's root directory. + root := RootFromContext(ctx) + if root == nil { + // Caller has no root. Don't bother traversing anything. + return "", syserror.ENOENT + } + defer root.DecRef() + for _, p := range paths { + binPath := path.Join(p, name) + d, err := mns.FindInode(ctx, root, nil, binPath, linux.MaxSymlinkTraversals) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return "", err + } + defer d.DecRef() + + // Check whether we can read and execute the found file. + if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil { + log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) + continue + } + return path.Join("/", p, name), nil + } + return "", syserror.ENOENT +} + +// GetPath returns the PATH as a slice of strings given the environment +// variables. +func GetPath(env []string) []string { + const prefix = "PATH=" + for _, e := range env { + if strings.HasPrefix(e, prefix) { + return strings.Split(strings.TrimPrefix(e, prefix), ":") + } + } + return nil +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 45aa255c4..fd5b7cc9e 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -224,21 +224,39 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return nil } +// ExecArgs contains arguments to Execute. +type ExecArgs struct { + control.ExecArgs + + // CID is the ID of the container to exec in. 
+ CID string +} + // Execute runs a command on a created or running sandbox. -func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { +func (cm *containerManager) Execute(e *ExecArgs, waitStatus *uint32) error { log.Debugf("containerManager.Execute: %+v", *e) - if e.Filename == "" { - rootCtx := cm.l.rootProcArgs.NewContext(cm.l.k) - rootMns := cm.l.k.RootMountNamespace() - var err error - if e.Filename, err = getExecutablePath(rootCtx, rootMns, e.Argv[0], e.Envv); err != nil { - return fmt.Errorf("error getting executable path for %q: %v", e.Argv[0], err) - } + // Get the container Root Dirent from the Task, since we must run this + // process with the same Root. + cm.l.mu.Lock() + tgid, ok := cm.l.containerRootTGIDs[e.CID] + cm.l.mu.Unlock() + if !ok { + return fmt.Errorf("cannot exec in container %q: no such container", e.CID) + } + t := cm.l.k.TaskSet().Root.TaskWithID(kernel.ThreadID(tgid)) + if t == nil { + return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", e.CID, tgid) + } + t.WithMuLocked(func(t *kernel.Task) { + e.Root = t.FSContext().RootDirectory() + }) + if e.Root != nil { + defer e.Root.DecRef() } proc := control.Proc{Kernel: cm.l.k} - if err := proc.Exec(e, waitStatus); err != nil { + if err := proc.Exec(&e.ExecArgs, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) } return nil diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 3df276170..5ec9a7d03 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "path" "path/filepath" "strconv" "strings" @@ -683,64 +682,11 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe return nil } -// getExecutablePath traverses the *container's* filesystem to resolve exec's -// absolute path. For example, if the container is being served files by the -// fsgofer serving /foo/bar as the container root, it will search within -// /foo/bar, not the host root. 
-// TODO: Unit test this. -func getExecutablePath(ctx context.Context, mns *fs.MountNamespace, filename string, env []string) (string, error) { - exec := filepath.Clean(filename) - - // Don't search PATH if exec is a path to a file (absolute or relative). - if strings.IndexByte(exec, '/') >= 0 { - return exec, nil - } - - // Search the PATH for a file whose name matches the one we are looking - // for. - pathDirs := specutils.GetPath(env) - for _, p := range pathDirs { - // Try to find the binary inside path p. - binPath := path.Join(p, filename) - root := fs.RootFromContext(ctx) - defer root.DecRef() - d, err := mns.FindInode(ctx, root, nil, binPath, linux.MaxSymlinkTraversals) - if err == syserror.ENOENT || err == syserror.EACCES { - continue - } - if err != nil { - return "", fmt.Errorf("FindInode(%q) failed: %v", binPath, err) - } - defer d.DecRef() - - // Check whether we can read and execute the found file. - if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil { - log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) - continue - } - return path.Join("/", p, exec), nil - } - - return "", fmt.Errorf("could not find executable %q in path %v", exec, pathDirs) -} - // setExecutablePath sets the procArgs.Filename by searching the PATH for an // executable matching the procArgs.Argv[0]. func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { - if procArgs.Filename != "" { - // Sanity check. - if !path.IsAbs(procArgs.Filename) { - return fmt.Errorf("filename must be absolute: %q", procArgs.Filename) - } - // Nothing to set. 
- return nil - } - - if len(procArgs.Argv) == 0 { - return fmt.Errorf("Argv must not be empty") - } - - f, err := getExecutablePath(ctx, mns, procArgs.Argv[0], procArgs.Envv) + paths := fs.GetPath(procArgs.Envv) + f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, procArgs.Argv[0], paths) if err != nil { return err } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 697210669..f272496a1 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -187,11 +187,16 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, } defer conn.Close() + ea := &boot.ExecArgs{ + ExecArgs: *e, + CID: cid, + } + // Send a message to the sandbox control server to start the container. var waitStatus uint32 // TODO: Pass in the container id (cid) here. The sandbox // should execute in the context of that container. - if err := conn.Call(boot.ContainerExecute, e, &waitStatus); err != nil { + if err := conn.Call(boot.ContainerExecute, ea, &waitStatus); err != nil { return 0, fmt.Errorf("error executing in sandbox: %v", err) } diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index f3fa8d129..fdc9007e0 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -163,18 +163,6 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) return &spec, nil } -// GetPath returns the PATH as a slice of strings given the environemnt -// variables. -func GetPath(env []string) []string { - const prefix = "PATH=" - for _, e := range env { - if strings.HasPrefix(e, prefix) { - return strings.Split(strings.TrimPrefix(e, prefix), ":") - } - } - return nil -} - // Capabilities takes in spec and returns a TaskCapabilities corresponding to // the spec. 
func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { -- cgit v1.2.3 From 7045828a310d47a2940214f71ae75b8b7b682b78 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 7 Sep 2018 18:13:50 -0700 Subject: Update cleanup TODO PiperOrigin-RevId: 212068327 Change-Id: I3f360cdf7d6caa1c96fae68ae3a1caaf440f0cbe --- pkg/sentry/sighandling/sighandling.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index b08588c11..0946ab075 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -87,9 +87,8 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop, do // // Otherwise ignore the signal. // - // TODO: Convert Go's runtime.raise from - // tkill to tgkill so PrepareForwarding doesn't need to - // be called until after filter installation. + // TODO: Drop in Go 1.12, which uses tgkill + // in runtime.raise. switch signal { case linux.SIGHUP, linux.SIGINT, linux.SIGTERM: dieFromSignal(signal) -- cgit v1.2.3 From 7e9e6745ca1f17031bbea14cb08b3ee3c0f9f818 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 10 Sep 2018 13:23:49 -0700 Subject: Allow '/dev/zero' to be mapped with unaligned length PiperOrigin-RevId: 212321271 Change-Id: I79d71c2e6f4b8fcd3b9b923fe96c2256755f4c48 --- pkg/sentry/mm/special_mappable.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index aa2f87107..5d7bd33bd 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -138,10 +138,15 @@ func (m *SpecialMappable) Length() uint64 { // uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should // do the same to get non-zero device and inode IDs. 
func NewSharedAnonMappable(length uint64, p platform.Platform) (*SpecialMappable, error) { - if length == 0 || length != uint64(usermem.Addr(length).RoundDown()) { + if length == 0 { return nil, syserror.EINVAL } - fr, err := p.Memory().Allocate(length, usage.Anonymous) + alignedLen, ok := usermem.Addr(length).RoundUp() + if !ok { + return nil, syserror.EINVAL + } + + fr, err := p.Memory().Allocate(uint64(alignedLen), usage.Anonymous) if err != nil { return nil, err } -- cgit v1.2.3 From a29c39aa629b6118765e5075eb228752934d7081 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 10 Sep 2018 15:22:44 -0700 Subject: Map committed chunks concurrently in FileMem.LoadFrom. PiperOrigin-RevId: 212345401 Change-Id: Iac626ee87ba312df88ab1019ade6ecd62c04c75c --- pkg/sentry/platform/filemem/filemem_state.go | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/filemem/filemem_state.go b/pkg/sentry/platform/filemem/filemem_state.go index 5dace8fec..e28e021c9 100644 --- a/pkg/sentry/platform/filemem/filemem_state.go +++ b/pkg/sentry/platform/filemem/filemem_state.go @@ -19,6 +19,7 @@ import ( "fmt" "io" "runtime" + "sync/atomic" "syscall" "gvisor.googlesource.com/gvisor/pkg/log" @@ -127,6 +128,29 @@ func (f *FileMem) LoadFrom(r io.Reader) error { return err } + // Try to map committed chunks concurrently: For any given chunk, either + // this loop or the following one will mmap the chunk first and cache it in + // f.mappings for the other, but this loop is likely to run ahead of the + // other since it doesn't do any work between mmaps. The rest of this + // function doesn't mutate f.usage, so it's safe to iterate concurrently. 
+ mapperDone := make(chan struct{}) + mapperCanceled := int32(0) + go func() { // S/R-SAFE: see comment + defer func() { close(mapperDone) }() + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if atomic.LoadInt32(&mapperCanceled) != 0 { + return + } + if seg.Value().knownCommitted { + f.forEachMappingSlice(seg.Range(), func(s []byte) {}) + } + } + }() + defer func() { + atomic.StoreInt32(&mapperCanceled, 1) + <-mapperDone + }() + // Load committed pages. for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { if !seg.Value().knownCommitted { -- cgit v1.2.3 From 6cc9b311af3633d244f526abed50c0d3b0ce06a1 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 11 Sep 2018 13:08:36 -0700 Subject: platform: Pass device fd into platform constructor. We were previously openining the platform device (i.e. /dev/kvm) inside the platfrom constructor (i.e. kvm.New). This requires that we have RW access to the platform device when constructing the platform. However, now that the runsc sandbox process runs as user "nobody", it is not able to open the platform device. This CL changes the kvm constructor to take the platform device FD, rather than opening the device file itself. The device file is opened outside of the sandbox and passed to the sandbox process. 
PiperOrigin-RevId: 212505804 Change-Id: I427e1d9de5eb84c84f19d513356e1bb148a52910 --- pkg/sentry/platform/kvm/kvm.go | 25 ++++++++++++++--------- pkg/sentry/platform/kvm/kvm_test.go | 6 +++++- runsc/boot/controller.go | 24 +++++++++++++++++----- runsc/boot/loader.go | 11 ++++++---- runsc/boot/loader_test.go | 2 +- runsc/cmd/boot.go | 6 +++++- runsc/sandbox/BUILD | 1 + runsc/sandbox/chroot.go | 40 +++---------------------------------- runsc/sandbox/sandbox.go | 40 ++++++++++++++++++++++++++++++++++++- 9 files changed, 96 insertions(+), 59 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 2dc3239a5..19bc2d515 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -17,6 +17,7 @@ package kvm import ( "fmt" + "os" "sync" "syscall" @@ -44,25 +45,29 @@ var ( globalErr error ) +// OpenDevice opens the KVM device at /dev/kvm and returns the File. +func OpenDevice() (*os.File, error) { + f, err := os.OpenFile("/dev/kvm", syscall.O_RDWR, 0) + if err != nil { + return nil, fmt.Errorf("error opening /dev/kvm: %v", err) + } + return f, nil +} + // New returns a new KVM-based implementation of the platform interface. -func New() (*KVM, error) { +func New(deviceFile *os.File) (*KVM, error) { // Allocate physical memory for the vCPUs. fm, err := filemem.New("kvm-memory") if err != nil { return nil, err } - // Try opening KVM. - fd, err := syscall.Open("/dev/kvm", syscall.O_RDWR, 0) - if err != nil { - return nil, fmt.Errorf("opening /dev/kvm: %v", err) - } - defer syscall.Close(fd) + fd := deviceFile.Fd() // Ensure global initialization is done. globalOnce.Do(func() { physicalInit() - globalErr = updateSystemValues(fd) + globalErr = updateSystemValues(int(fd)) ring0.Init(cpuid.HostFeatureSet()) }) if globalErr != nil { @@ -70,10 +75,12 @@ func New() (*KVM, error) { } // Create a new VM fd. 
- vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_CREATE_VM, 0) + vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, fd, _KVM_CREATE_VM, 0) if errno != 0 { return nil, fmt.Errorf("creating VM: %v", errno) } + // We are done with the device file. + deviceFile.Close() // Create a VM context. machine, err := newMachine(int(vm)) diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 180bf7bb0..52448839f 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -39,7 +39,11 @@ type testHarness interface { func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) { // Create the machine. - k, err := New() + deviceFile, err := OpenDevice() + if err != nil { + t.Fatalf("error opening device file: %v", err) + } + k, err := New(deviceFile) if err != nil { t.Fatalf("error creating KVM instance: %v", err) } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fd5b7cc9e..257f275f9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -17,6 +17,7 @@ package boot import ( "errors" "fmt" + "os" "path" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -287,7 +288,8 @@ func (cm *containerManager) WaitForLoader(_, _ *struct{}) error { // RestoreOpts contains options related to restoring a container's file system. type RestoreOpts struct { - // FilePayload contains the state file to be restored. + // FilePayload contains the state file to be restored, followed by the + // platform device file if necessary. urpc.FilePayload // SandboxID contains the ID of the sandbox. @@ -300,16 +302,28 @@ type RestoreOpts struct { // signal to start. 
func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") - if len(o.FilePayload.Files) != 1 { - return fmt.Errorf("exactly one file must be provided") + + var specFile, deviceFile *os.File + switch numFiles := len(o.FilePayload.Files); numFiles { + case 2: + // The device file is donated to the platform, so don't Close + // it here. + deviceFile = o.FilePayload.Files[1] + fallthrough + case 1: + specFile = o.FilePayload.Files[0] + defer specFile.Close() + case 0: + return fmt.Errorf("at least one file must be passed to Restore") + default: + return fmt.Errorf("at most two files may be passed to Restore") } - defer o.FilePayload.Files[0].Close() // Destroy the old kernel and create a new kernel. cm.l.k.Pause() cm.l.k.Destroy() - p, err := createPlatform(cm.l.conf) + p, err := createPlatform(cm.l.conf, int(deviceFile.Fd())) if err != nil { return fmt.Errorf("error creating platform: %v", err) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 994b3d2e2..30d22b9c6 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -122,9 +122,9 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. -func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) { +func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, ioFDs []int, console bool) (*Loader, error) { // Create kernel and platform. 
- p, err := createPlatform(conf) + p, err := createPlatform(conf, deviceFD) if err != nil { return nil, fmt.Errorf("error creating platform: %v", err) } @@ -301,14 +301,17 @@ func (l *Loader) Destroy() { l.watchdog.Stop() } -func createPlatform(conf *Config) (platform.Platform, error) { +func createPlatform(conf *Config, deviceFD int) (platform.Platform, error) { switch conf.Platform { case PlatformPtrace: log.Infof("Platform: ptrace") return ptrace.New() case PlatformKVM: log.Infof("Platform: kvm") - return kvm.New() + if deviceFD < 0 { + return nil, fmt.Errorf("kvm device fd must be provided") + } + return kvm.New(os.NewFile(uintptr(deviceFD), "kvm device")) default: return nil, fmt.Errorf("invalid platform %v", conf.Platform) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index d6bfe9ff1..9398292ff 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -101,7 +101,7 @@ func createLoader() (*Loader, func(), error) { return nil, nil, err } - l, err := New(spec, conf, fd, []int{sandEnd}, false) + l, err := New(spec, conf, fd, -1 /* device fd */, []int{sandEnd}, false) if err != nil { cleanup() return nil, nil, err diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index d8c7b9cd3..035147cf1 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -42,6 +42,9 @@ type Boot struct { // control server that is donated to this process. controllerFD int + // deviceFD is the file descriptor for the platform device file. + deviceFD int + // ioFDs is the list of FDs used to connect to FS gofers. 
ioFDs intFlags @@ -74,6 +77,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") + f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") @@ -134,7 +138,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - l, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console) + l, err := boot.New(spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.console) if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 8ebd14c4e..5cf8f0cda 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -18,6 +18,7 @@ go_library( "//pkg/control/server", "//pkg/log", "//pkg/sentry/control", + "//pkg/sentry/platform/kvm", "//pkg/urpc", "//runsc/boot", "//runsc/console", diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go index f35d9c72d..749bf3782 100644 --- a/runsc/sandbox/chroot.go +++ b/runsc/sandbox/chroot.go @@ -22,7 +22,6 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -39,18 +38,12 @@ func mountInChroot(chroot, src, dst, typ string, flags uint32) error { if err := specutils.Mount(src, chrootDst, typ, flags); err != nil { return 
fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err) } - - // Make sure the mount is accessible to all users, since we will be - // running as nobody inside the chroot. - if err := os.Chmod(chrootDst, 0777); err != nil { - return fmt.Errorf("Chmod(%q) failed: %v", chroot, err) - } return nil } -// setUpChroot creates an empty directory with runsc mounted at /runsc, proc -// mounted at /proc, and any dev files needed for the platform. -func setUpChroot(platform boot.PlatformType) (string, error) { +// setUpChroot creates an empty directory with runsc mounted at /runsc and proc +// mounted at /proc. +func setUpChroot() (string, error) { // Create the chroot directory and make it accessible to all users. chroot, err := ioutil.TempDir("", "runsc-sandbox-chroot-") if err != nil { @@ -75,18 +68,6 @@ func setUpChroot(platform boot.PlatformType) (string, error) { return "", fmt.Errorf("error mounting runsc in chroot: %v", err) } - // Mount dev files needed for platform. - var devMount string - switch platform { - case boot.PlatformKVM: - devMount = "/dev/kvm" - } - if devMount != "" { - if err := mountInChroot(chroot, devMount, devMount, "bind", syscall.MS_BIND); err != nil { - return "", fmt.Errorf("error mounting platform device in chroot: %v", err) - } - } - return chroot, nil } @@ -105,21 +86,6 @@ func tearDownChroot(chroot string) error { return fmt.Errorf("error unmounting %q: %v", exe, err) } - // Unmount platform dev files. - devFiles := []string{"dev/kvm"} - for _, f := range devFiles { - devPath := filepath.Join(chroot, f) - if _, err := os.Stat(devPath); err != nil { - if os.IsNotExist(err) { - continue - } - return fmt.Errorf("Stat(%q) failed: %v", devPath, err) - } - if err := syscall.Unmount(devPath, 0); err != nil { - return fmt.Errorf("error unmounting %q: %v", devPath, err) - } - } - // Remove chroot directory. 
if err := os.RemoveAll(chroot); err != nil { return fmt.Errorf("error removing %q: %v", chroot, err) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index f272496a1..195deda1e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/console" @@ -140,6 +141,14 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str SandboxID: s.ID, } + // If the platform needs a device fd we must pass it in. + if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { + return err + } else if deviceFile != nil { + defer deviceFile.Close() + opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile) + } + conn, err := s.sandboxConnect() if err != nil { return err @@ -315,6 +324,16 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ } + // If the platform needs a device fd we must pass it in. + if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { + return err + } else if deviceFile != nil { + defer deviceFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile) + cmd.Args = append(cmd.Args, "--device-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + // Sandbox stdio defaults to current process stdio. cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout @@ -428,7 +447,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund log.Warningf("Running sandbox in test mode without chroot. 
This is only safe in tests!") } else if specutils.HasCapSysAdmin() { log.Infof("Sandbox will be started in minimal chroot") - chroot, err := setUpChroot(conf.Platform) + chroot, err := setUpChroot() if err != nil { return fmt.Errorf("error setting up chroot: %v", err) } @@ -660,3 +679,22 @@ func signalProcess(pid int, sig syscall.Signal) error { } return nil } + +// deviceFileForPlatform opens the device file for the given platform. If the +// platform does not need a device file, then nil is returned. +func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { + var ( + f *os.File + err error + ) + switch p { + case boot.PlatformKVM: + f, err = kvm.OpenDevice() + default: + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("error opening device file for platform %q: %v", p, err) + } + return f, err +} -- cgit v1.2.3 From 2eff1fdd061be9cfabc36532dda8cbefeb02e534 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 12 Sep 2018 15:22:24 -0700 Subject: runsc: Add exec flag that specifies where to save the sandbox-internal pid. This is different from the existing -pid-file flag, which saves a host pid. PiperOrigin-RevId: 212713968 Change-Id: I2c486de8dd5cfd9b923fb0970165ef7c5fc597f0 --- pkg/sentry/control/proc.go | 35 ++++++++++------ runsc/boot/controller.go | 33 +++++++++------ runsc/cmd/exec.go | 31 +++++++++++---- runsc/container/container.go | 9 +++-- runsc/container/container_test.go | 84 ++++++++++++++++++++++++--------------- runsc/sandbox/sandbox.go | 19 ++++----- 6 files changed, 129 insertions(+), 82 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 289b8ba0e..1623ed19a 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -87,6 +87,24 @@ type ExecArgs struct { // Exec runs a new task. 
func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { + newTG, err := proc.execAsync(args) + if err != nil { + return err + } + + // Wait for completion. + newTG.WaitExited() + *waitStatus = newTG.ExitStatus().Status() + return nil +} + +// ExecAsync runs a new task, but doesn't wait for it to finish. It is defined +// as a function rather than a method to avoid exposing execAsync as an RPC. +func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, error) { + return proc.execAsync(args) +} + +func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { // Import file descriptors. l := limits.NewLimitSet() fdm := proc.Kernel.NewFDMap() @@ -121,7 +139,7 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { paths := fs.GetPath(initArgs.Envv) f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.Filename = f } @@ -133,7 +151,7 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // Import the given file FD. This dups the FD as well. file, err := host.ImportFile(ctx, int(f.Fd()), mounter, enableIoctl) if err != nil { - return err + return nil, err } defer file.DecRef() @@ -141,20 +159,11 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { f.Close() if err := fdm.NewFDAt(kdefs.FD(appFD), file, kernel.FDFlags{}, l); err != nil { - return err + return nil, err } } - // Start the new task. - newTG, err := proc.Kernel.CreateProcess(initArgs) - if err != nil { - return err - } - - // Wait for completion. - newTG.WaitExited() - *waitStatus = newTG.ExitStatus().Status() - return nil + return proc.Kernel.CreateProcess(initArgs) } // PsArgs is the set of arguments to ps. 
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 257f275f9..aaac852e0 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -41,9 +41,9 @@ const ( // container used by "runsc events". ContainerEvent = "containerManager.Event" - // ContainerExecute is the URPC endpoint for executing a command in a + // ContainerExecuteAsync is the URPC endpoint for executing a command in a // container.. - ContainerExecute = "containerManager.Execute" + ContainerExecuteAsync = "containerManager.ExecuteAsync" // ContainerPause pauses the container. ContainerPause = "containerManager.Pause" @@ -233,33 +233,40 @@ type ExecArgs struct { CID string } -// Execute runs a command on a created or running sandbox. -func (cm *containerManager) Execute(e *ExecArgs, waitStatus *uint32) error { - log.Debugf("containerManager.Execute: %+v", *e) +// ExecuteAsync starts running a command on a created or running sandbox. It +// returns the pid of the new process. +func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error { + log.Debugf("containerManager.ExecuteAsync: %+v", args) // Get the container Root Dirent from the Task, since we must run this // process with the same Root. 
cm.l.mu.Lock() - tgid, ok := cm.l.containerRootTGIDs[e.CID] + tgid, ok := cm.l.containerRootTGIDs[args.CID] cm.l.mu.Unlock() if !ok { - return fmt.Errorf("cannot exec in container %q: no such container", e.CID) + return fmt.Errorf("cannot exec in container %q: no such container", args.CID) } t := cm.l.k.TaskSet().Root.TaskWithID(kernel.ThreadID(tgid)) if t == nil { - return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", e.CID, tgid) + return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", args.CID, tgid) } t.WithMuLocked(func(t *kernel.Task) { - e.Root = t.FSContext().RootDirectory() + args.Root = t.FSContext().RootDirectory() }) - if e.Root != nil { - defer e.Root.DecRef() + if args.Root != nil { + defer args.Root.DecRef() } + // Start the process. proc := control.Proc{Kernel: cm.l.k} - if err := proc.Exec(&e.ExecArgs, waitStatus); err != nil { - return fmt.Errorf("error executing: %+v: %v", e, err) + newTG, err := control.ExecAsync(&proc, &args.ExecArgs) + if err != nil { + return fmt.Errorf("error executing: %+v: %v", args, err) } + + // Return the pid of the newly-created process. + ts := cm.l.k.TaskSet() + *pid = int32(ts.Root.IDOfThreadGroup(newTG)) return nil } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index da1642c08..0d1fa6e20 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -45,12 +45,13 @@ type Exec struct { cwd string env stringSlice // user contains the UID and GID with which to run the new process. 
- user user - extraKGIDs stringSlice - caps stringSlice - detach bool - processPath string - pidFile string + user user + extraKGIDs stringSlice + caps stringSlice + detach bool + processPath string + pidFile string + internalPidFile string // consoleSocket is the path to an AF_UNIX socket which will receive a // file descriptor referencing the master end of the console's @@ -97,6 +98,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") f.StringVar(&ex.processPath, "process", "", "path to the process.json") f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") + f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") } @@ -146,10 +148,25 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } - ws, err := c.Execute(e) + // Start the new process and get it pid. + pid, err := c.Execute(e) if err != nil { Fatalf("error getting processes for container: %v", err) } + + // Write the sandbox-internal pid if required. + if ex.internalPidFile != "" { + pidStr := []byte(strconv.Itoa(int(pid))) + if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil { + Fatalf("error writing internal pid file %q: %v", ex.internalPidFile, err) + } + } + + // Wait for the process to exit. 
+ ws, err := c.WaitPID(pid) + if err != nil { + Fatalf("error waiting on pid %d: %v", pid, err) + } *waitStatus = ws return subcommands.ExitSuccess } diff --git a/runsc/container/container.go b/runsc/container/container.go index 9a05a1dc5..38848d02f 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -353,13 +353,14 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke return c.Wait() } -// Execute runs the specified command in the container. -func (c *Container) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { - log.Debugf("Execute in container %q, args: %+v", c.ID, e) +// Execute runs the specified command in the container. It returns the pid of +// the newly created process. +func (c *Container) Execute(args *control.ExecArgs) (int32, error) { + log.Debugf("Execute in container %q, args: %+v", c.ID, args) if c.Status != Created && c.Status != Running { return 0, fmt.Errorf("cannot exec in container in state %s", c.Status) } - return c.Sandbox.Execute(c.ID, e) + return c.Sandbox.Execute(c.ID, args) } // Event returns events for the container. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index c45eb79a3..790334249 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -49,11 +49,11 @@ func init() { } // waitForProcessList waits for the given process list to show up in the container. -func waitForProcessList(s *Container, expected []*control.Process) error { +func waitForProcessList(cont *Container, expected []*control.Process) error { var got []*control.Process for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { var err error - got, err = s.Processes() + got, err = cont.Processes() if err != nil { return fmt.Errorf("error getting process data from container: %v", err) } @@ -485,12 +485,12 @@ func TestExec(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. 
- s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } - defer s.Destroy() - if err := s.Start(conf); err != nil { + defer cont.Destroy() + if err := cont.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } @@ -513,11 +513,11 @@ func TestExec(t *testing.T) { } // Verify that "sleep 100" is running. - if err := waitForProcessList(s, expectedPL[:1]); err != nil { + if err := waitForProcessList(cont, expectedPL[:1]); err != nil { t.Error(err) } - execArgs := control.ExecArgs{ + args := &control.ExecArgs{ Filename: "/bin/sleep", Argv: []string{"sleep", "5"}, WorkingDirectory: "/", @@ -528,17 +528,19 @@ func TestExec(t *testing.T) { // First, start running exec (whick blocks). status := make(chan error, 1) go func() { - exitStatus, err := s.Execute(&execArgs) + exitStatus, err := cont.executeSync(args) if err != nil { + log.Debugf("error executing: %v", err) status <- err } else if exitStatus != 0 { + log.Debugf("bad status: %d", exitStatus) status <- fmt.Errorf("failed with exit status: %v", exitStatus) } else { status <- nil } }() - if err := waitForProcessList(s, expectedPL); err != nil { + if err := waitForProcessList(cont, expectedPL); err != nil { t.Fatal(err) } @@ -548,7 +550,7 @@ func TestExec(t *testing.T) { t.Fatalf("container timed out waiting for exec to finish.") case st := <-status: if st != nil { - t.Errorf("container failed to exec %v: %v", execArgs, err) + t.Errorf("container failed to exec %v: %v", args, err) } } } @@ -884,15 +886,18 @@ func TestPauseResume(t *testing.T) { } script := fmt.Sprintf("while [[ -f %q ]]; do sleep 0.1; done", lock.Name()) - execArgs := control.ExecArgs{ + args := &control.ExecArgs{ Filename: "/bin/bash", Argv: []string{"bash", "-c", script}, WorkingDirectory: "/", KUID: uid, } - // First, start running exec (which blocks). 
- go cont.Execute(&execArgs) + // First, start running exec. + _, err = cont.Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } // Verify that "sleep 5" is running. if err := waitForProcessList(cont, expectedPL); err != nil { @@ -1022,12 +1027,12 @@ func TestCapabilities(t *testing.T) { defer os.RemoveAll(bundleDir) // Create and start the container. - s, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") if err != nil { t.Fatalf("error creating container: %v", err) } - defer s.Destroy() - if err := s.Start(conf); err != nil { + defer cont.Destroy() + if err := cont.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } @@ -1048,7 +1053,7 @@ func TestCapabilities(t *testing.T) { Cmd: "exe", }, } - if err := waitForProcessList(s, expectedPL[:1]); err != nil { + if err := waitForProcessList(cont, expectedPL[:1]); err != nil { t.Fatalf("Failed to wait for sleep to start, err: %v", err) } @@ -1064,7 +1069,7 @@ func TestCapabilities(t *testing.T) { // Need to traverse the intermediate directory. os.Chmod(rootDir, 0755) - execArgs := control.ExecArgs{ + args := &control.ExecArgs{ Filename: exePath, Argv: []string{exePath}, WorkingDirectory: "/", @@ -1074,17 +1079,17 @@ func TestCapabilities(t *testing.T) { } // "exe" should fail because we don't have the necessary permissions. - if _, err := s.Execute(&execArgs); err == nil { + if _, err := cont.executeSync(args); err == nil { t.Fatalf("container executed without error, but an error was expected") } // Now we run with the capability enabled and should succeed. - execArgs.Capabilities = &auth.TaskCapabilities{ + args.Capabilities = &auth.TaskCapabilities{ EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), } // "exe" should not fail this time. 
- if _, err := s.Execute(&execArgs); err != nil { - t.Fatalf("container failed to exec %v: %v", execArgs, err) + if _, err := cont.executeSync(args); err != nil { + t.Fatalf("container failed to exec %v: %v", args, err) } } } @@ -1404,11 +1409,11 @@ func TestContainerVolumeContentsShared(t *testing.T) { filename := filepath.Join(dir, "file") // File does not exist yet. Reading from the sandbox should fail. - execArgsTestFile := control.ExecArgs{ + argsTestFile := &control.ExecArgs{ Filename: "/usr/bin/test", Argv: []string{"test", "-f", filename}, } - if ws, err := c.Execute(&execArgsTestFile); err != nil { + if ws, err := c.executeSync(argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) @@ -1420,7 +1425,7 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // Now we should be able to test the file from within the sandbox. - if ws, err := c.Execute(&execArgsTestFile); err != nil { + if ws, err := c.executeSync(argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -1433,18 +1438,18 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // File should no longer exist at the old path within the sandbox. - if ws, err := c.Execute(&execArgsTestFile); err != nil { + if ws, err := c.executeSync(argsTestFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", filename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) } // We should be able to test the new filename from within the sandbox. 
- execArgsTestNewFile := control.ExecArgs{ + argsTestNewFile := &control.ExecArgs{ Filename: "/usr/bin/test", Argv: []string{"test", "-f", newFilename}, } - if ws, err := c.Execute(&execArgsTestNewFile); err != nil { + if ws, err := c.executeSync(argsTestNewFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", newFilename, err) } else if ws.ExitStatus() != 0 { t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) @@ -1456,20 +1461,20 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // Renamed file should no longer exist at the old path within the sandbox. - if ws, err := c.Execute(&execArgsTestNewFile); err != nil { + if ws, err := c.executeSync(argsTestNewFile); err != nil { t.Fatalf("unexpected error testing file %q: %v", newFilename, err) } else if ws.ExitStatus() == 0 { t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) } // Now create the file from WITHIN the sandbox. - execArgsTouch := control.ExecArgs{ + argsTouch := &control.ExecArgs{ Filename: "/usr/bin/touch", Argv: []string{"touch", filename}, KUID: auth.KUID(os.Getuid()), KGID: auth.KGID(os.Getgid()), } - if ws, err := c.Execute(&execArgsTouch); err != nil { + if ws, err := c.executeSync(argsTouch); err != nil { t.Fatalf("unexpected error touching file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -1486,11 +1491,11 @@ func TestContainerVolumeContentsShared(t *testing.T) { } // Delete the file from within the sandbox. 
- execArgsRemove := control.ExecArgs{ + argsRemove := &control.ExecArgs{ Filename: "/bin/rm", Argv: []string{"rm", filename}, } - if ws, err := c.Execute(&execArgsRemove); err != nil { + if ws, err := c.executeSync(argsRemove); err != nil { t.Fatalf("unexpected error removing file %q: %v", filename, err) } else if ws.ExitStatus() != 0 { t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) @@ -1547,6 +1552,19 @@ func TestGoferExits(t *testing.T) { } } +// executeSync synchronously executes a new process. +func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { + pid, err := cont.Execute(args) + if err != nil { + return 0, fmt.Errorf("error executing: %v", err) + } + ws, err := cont.WaitPID(pid) + if err != nil { + return 0, fmt.Errorf("error waiting: %v", err) + } + return ws, nil +} + func TestMain(m *testing.M) { testutil.RunAsRoot(m) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 195deda1e..8e90dcc70 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -187,8 +187,9 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { return pl, nil } -// Execute runs the specified command in the container. -func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, error) { +// Execute runs the specified command in the container. It returns the pid of +// the newly created process. +func (s *Sandbox) Execute(cid string, args *control.ExecArgs) (int32, error) { log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) conn, err := s.sandboxConnect() if err != nil { @@ -196,20 +197,14 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, } defer conn.Close() - ea := &boot.ExecArgs{ - ExecArgs: *e, - CID: cid, - } + rpcArgs := &boot.ExecArgs{ExecArgs: *args, CID: cid} // Send a message to the sandbox control server to start the container. 
- var waitStatus uint32 - // TODO: Pass in the container id (cid) here. The sandbox - // should execute in the context of that container. - if err := conn.Call(boot.ContainerExecute, ea, &waitStatus); err != nil { + var pid int32 + if err := conn.Call(boot.ContainerExecuteAsync, rpcArgs, &pid); err != nil { return 0, fmt.Errorf("error executing in sandbox: %v", err) } - - return syscall.WaitStatus(waitStatus), nil + return pid, nil } // Event retrieves stats about the sandbox such as memory and CPU utilization. -- cgit v1.2.3 From e2d79480f57d96ea90bb73b241f248573c2a33fc Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 13 Sep 2018 09:49:17 -0700 Subject: initArgs must hold a reference on the Root if it is not nil. The contract in ExecArgs says that a reference on ExecArgs.Root must be held for the lifetime of the struct, but the caller is free to drop the ref after that. As a result, proc.Exec must take an additional ref on Root when it constructs the CreateProcessArgs, since that holds a pointer to Root as well. That ref is dropped in CreateProcess. PiperOrigin-RevId: 212828348 Change-Id: I7f44a612f337ff51a02b873b8a845d3119408707 --- pkg/sentry/control/proc.go | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 1623ed19a..19bc76f5c 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -132,6 +132,11 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { IPCNamespace: proc.Kernel.RootIPCNamespace(), AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), } + if initArgs.Root != nil { + // initArgs must hold a reference on Root. This ref is dropped + // in CreateProcess. 
+ initArgs.Root.IncRef() + } ctx := initArgs.NewContext(proc.Kernel) if initArgs.Filename == "" { -- cgit v1.2.3 From 9c6b38e2952650cba32e21d0719bcb0ffdc10860 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 13 Sep 2018 14:06:34 -0700 Subject: Format struct itimerspec PiperOrigin-RevId: 212874745 Change-Id: I0c3e8e6a9e8976631cee03bf0b8891b336ddb8c8 --- pkg/sentry/strace/linux64.go | 8 ++++---- pkg/sentry/strace/strace.go | 14 ++++++++++++++ pkg/sentry/strace/syscalls.go | 9 ++++++++- 3 files changed, 26 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 7a1eb581d..63851246c 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -240,8 +240,8 @@ var linuxAMD64 = SyscallMap{ 220: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex), 221: makeSyscallInfo("fadvise64", Hex, Hex, Hex, Hex), 222: makeSyscallInfo("timer_create", Hex, Hex, Hex), - 223: makeSyscallInfo("timer_settime", Hex, Hex, Hex, Hex), - 224: makeSyscallInfo("timer_gettime", Hex, Hex), + 223: makeSyscallInfo("timer_settime", Hex, Hex, ItimerSpec, PostItimerSpec), + 224: makeSyscallInfo("timer_gettime", Hex, PostItimerSpec), 225: makeSyscallInfo("timer_getoverrun", Hex), 226: makeSyscallInfo("timer_delete", Hex), 227: makeSyscallInfo("clock_settime", Hex, Timespec), @@ -303,8 +303,8 @@ var linuxAMD64 = SyscallMap{ 283: makeSyscallInfo("timerfd_create", Hex, Hex), 284: makeSyscallInfo("eventfd", Hex), 285: makeSyscallInfo("fallocate", Hex, Hex, Hex, Hex), - 286: makeSyscallInfo("timerfd_settime", Hex, Hex, Hex, Hex), - 287: makeSyscallInfo("timerfd_gettime", Hex, Hex), + 286: makeSyscallInfo("timerfd_settime", Hex, Hex, ItimerSpec, PostItimerSpec), + 287: makeSyscallInfo("timerfd_gettime", Hex, PostItimerSpec), 288: makeSyscallInfo("accept4", Hex, PostSockAddr, SockLen, SockFlags), 289: makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex), 290: makeSyscallInfo("eventfd2", Hex, Hex), diff --git 
a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 03b4a350a..539e665d2 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -224,6 +224,16 @@ func itimerval(t *kernel.Task, addr usermem.Addr) string { return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) } +func itimerspec(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + interval := timespec(t, addr) + value := timespec(t, addr+usermem.Addr(binary.Size(linux.Timespec{}))) + return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) +} + func stringVector(t *kernel.Task, addr usermem.Addr) string { vec, err := t.CopyInVector(addr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) if err != nil { @@ -296,6 +306,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, utimensTimespec(t, args[arg].Pointer())) case ItimerVal: output = append(output, itimerval(t, args[arg].Pointer())) + case ItimerSpec: + output = append(output, itimerspec(t, args[arg].Pointer())) case Timeval: output = append(output, timeval(t, args[arg].Pointer())) case Utimbuf: @@ -362,6 +374,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint output[arg] = timespec(t, args[arg].Pointer()) case PostItimerVal: output[arg] = itimerval(t, args[arg].Pointer()) + case PostItimerSpec: + output[arg] = itimerspec(t, args[arg].Pointer()) case Timeval: output[arg] = timeval(t, args[arg].Pointer()) case Rusage: diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 4513d1ba6..770a0d2b9 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -132,10 +132,17 @@ const ( // ItimerVal is a pointer to a struct itimerval. ItimerVal - // ItimerVal is a pointer to a struct itimerval, formatted after + // PostItimerVal is a pointer to a struct itimerval, formatted after // syscall execution. 
PostItimerVal + // ItimerSpec is a pointer to a struct itimerspec. + ItimerSpec + + // PostItimerSpec is a pointer to a struct itimerspec, formatted after + // syscall execution. + PostItimerSpec + // Timeval is a pointer to a struct timeval, formatted before and after // syscall execution. Timeval -- cgit v1.2.3 From adf8f339703922211886d3e5588160f65bc131b3 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 13 Sep 2018 15:15:33 -0700 Subject: Extend memory usage events to report mapped memory usage. PiperOrigin-RevId: 212887555 Change-Id: I3545383ce903cbe9f00d9b5288d9ef9a049b9f4f --- pkg/sentry/kernel/memevent/memory_events.go | 5 ++++- pkg/sentry/kernel/memevent/memory_events.proto | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index ecc9151de..f7a183a1d 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -94,5 +94,8 @@ func (m *MemoryEvents) emit() { snapshot, _ := usage.MemoryAccounting.Copy() total := totalPlatform + snapshot.Mapped - eventchannel.Emit(&pb.MemoryUsageEvent{Total: total}) + eventchannel.Emit(&pb.MemoryUsageEvent{ + Mapped: snapshot.Mapped, + Total: total, + }) } diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto index e6e0bd628..abc565054 100644 --- a/pkg/sentry/kernel/memevent/memory_events.proto +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -22,4 +22,8 @@ message MemoryUsageEvent { // The total memory usage of the sandboxed application in bytes, calculated // using the 'fast' method. uint64 total = 1; + + // Memory used to back memory-mapped regions for files in the application, in + // bytes. This corresponds to the usage.MemoryKind.Mapped memory type. 
+ uint64 mapped = 2; } -- cgit v1.2.3 From 29a7271f5da9fdb7b4a9a6c9ea61421ce6844a73 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 13 Sep 2018 19:11:12 -0700 Subject: Plumb monotonic time to netstack Netstack needs to be portable, so this seems to be preferable to using raw system calls. PiperOrigin-RevId: 212917409 Change-Id: I7b2073e7db4b4bf75300717ca23aea4c15be944c --- pkg/sentry/kernel/kernel.go | 11 +++++++++++ pkg/tcpip/BUILD | 5 ++++- pkg/tcpip/tcpip.go | 11 +++-------- pkg/tcpip/time_unsafe.go | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 9 deletions(-) create mode 100644 pkg/tcpip/time_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index bc41c3963..316612b37 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -943,6 +943,8 @@ func (k *Kernel) SetExitError(err error) { } } +var _ tcpip.Clock = (*Kernel)(nil) + // NowNanoseconds implements tcpip.Clock.NowNanoseconds. func (k *Kernel) NowNanoseconds() int64 { now, err := k.timekeeper.GetTime(sentrytime.Realtime) @@ -952,6 +954,15 @@ func (k *Kernel) NowNanoseconds() int64 { return now } +// NowMonotonic implements tcpip.Clock.NowMonotonic. +func (k *Kernel) NowMonotonic() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Kernel.NowMonotonic: " + err.Error()) + } + return now +} + // SupervisorContext returns a Context with maximum privileges in k. It should // only be used by goroutines outside the control of the emulated kernel // defined by e. 
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index 5153bd3b4..daff9a0a0 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -4,7 +4,10 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "tcpip", - srcs = ["tcpip.go"], + srcs = [ + "tcpip.go", + "time_unsafe.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip", visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 51360b11f..b1bd5117f 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -123,16 +123,11 @@ func (e ErrSaveRejection) Error() string { // time, but never for netstack internal timekeeping. type Clock interface { // NowNanoseconds returns the current real time as a number of - // nanoseconds since some epoch. + // nanoseconds since the Unix epoch. NowNanoseconds() int64 -} - -// StdClock implements Clock with the time package. -type StdClock struct{} -// NowNanoseconds implements Clock.NowNanoseconds. -func (*StdClock) NowNanoseconds() int64 { - return time.Now().UnixNano() + // NowMonotonic returns a monotonic time value. + NowMonotonic() int64 } // Address is a byte slice cast as a string that represents the address of a diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go new file mode 100644 index 000000000..2102e9633 --- /dev/null +++ b/pkg/tcpip/time_unsafe.go @@ -0,0 +1,43 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// +build go1.9 +// +build !go1.12 + +package tcpip + +import ( + _ "time" // Used with go:linkname. + _ "unsafe" // Required for go:linkname. +) + +// StdClock implements Clock with the time package. +type StdClock struct{} + +var _ Clock = (*StdClock)(nil) + +//go:linkname now time.now +func now() (sec int64, nsec int32, mono int64) + +// NowNanoseconds implements Clock.NowNanoseconds. +func (*StdClock) NowNanoseconds() int64 { + sec, nsec, _ := now() + return sec*1e9 + int64(nsec) +} + +// NowMonotonic implements Clock.NowMonotonic. +func (*StdClock) NowMonotonic() int64 { + _, _, mono := now() + return mono +} -- cgit v1.2.3 From faa34a0738456f5328cf99de13622a150042776d Mon Sep 17 00:00:00 2001 From: Chenggang Date: Thu, 13 Sep 2018 21:46:03 -0700 Subject: platform/kvm: Get max vcpu number dynamically by ioctl The old kernel version, such as 4.4, only support 255 vcpus. While gvisor is ran on these kernels, it could panic because the vcpu id and vcpu number beyond max_vcpus. Use ioctl(vmfd, _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) to get max vcpus number dynamically. Change-Id: I50dd859a11b1c2cea854a8e27d4bf11a411aa45c PiperOrigin-RevId: 212929704 --- pkg/sentry/platform/kvm/address_space.go | 29 ++++++++++++----------------- pkg/sentry/platform/kvm/kvm.go | 1 + pkg/sentry/platform/kvm/kvm_const.go | 8 +++++++- pkg/sentry/platform/kvm/machine.go | 30 +++++++++++++++++++++++++++--- 4 files changed, 47 insertions(+), 21 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 463617170..c4293c517 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -26,31 +26,26 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -type vCPUBitArray [(_KVM_NR_VCPUS + 63) / 64]uint64 - // dirtySet tracks vCPUs for invalidation. 
type dirtySet struct { - vCPUs vCPUBitArray + vCPUs []uint64 } // forEach iterates over all CPUs in the dirty set. func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { - var localSet vCPUBitArray - for index := 0; index < len(ds.vCPUs); index++ { - // Clear the dirty set, copy to the local one. - localSet[index] = atomic.SwapUint64(&ds.vCPUs[index], 0) - } - m.mu.RLock() defer m.mu.RUnlock() - for _, c := range m.vCPUs { - index := uint64(c.id) / 64 - bit := uint64(1) << uint(c.id%64) - - // Call the function if it was set. - if localSet[index]&bit != 0 { - fn(c) + for index := range ds.vCPUs { + mask := atomic.SwapUint64(&ds.vCPUs[index], 0) + if mask != 0 { + for bit := 0; bit < 64; bit++ { + if mask&(1< Date: Fri, 14 Sep 2018 11:09:41 -0700 Subject: Fix interaction between rt_sigtimedwait and ignored signals. PiperOrigin-RevId: 213011782 Change-Id: I716c6ea3c586b0c6c5a892b6390d2d11478bc5af --- pkg/sentry/kernel/task.go | 9 +++-- pkg/sentry/kernel/task_signals.go | 68 ++++++++++++++++++++++++--------- pkg/sentry/syscalls/linux/sys_signal.go | 55 +++----------------------- 3 files changed, 61 insertions(+), 71 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index ae4fd7817..2f6f825ac 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -108,9 +108,12 @@ type Task struct { // goroutine. signalMask linux.SignalSet - // FIXME: An equivalent to task_struct::real_blocked is needed - // to prevent signals that are ignored, but transiently unblocked by - // sigtimedwait(2), from being dropped in Task.sendSignalTimerLocked. + // If the task goroutine is currently executing Task.sigtimedwait, + // realSignalMask is the previous value of signalMask, which has temporarily + // been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0. + // + // realSignalMask is exclusive to the task goroutine. 
+ realSignalMask linux.SignalSet // If haveSavedSignalMask is true, savedSignalMask is the signal mask that // should be applied after the task has either delivered one signal to a diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 58a1bc0bd..afb010f60 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -19,6 +19,7 @@ package kernel import ( "fmt" "sync/atomic" + "time" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" @@ -119,25 +120,11 @@ var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP) // StopSignals is the set of signals whose default action is SignalActionStop. var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU) -// dequeueSignalLocked returns a pending unmasked signal. If there are no -// pending unmasked signals, dequeueSignalLocked returns nil. +// dequeueSignalLocked returns a pending signal that is *not* included in mask. +// If there are no pending unmasked signals, dequeueSignalLocked returns nil. // // Preconditions: t.tg.signalHandlers.mu must be locked. -func (t *Task) dequeueSignalLocked() *arch.SignalInfo { - if info := t.pendingSignals.dequeue(t.signalMask); info != nil { - return info - } - return t.tg.pendingSignals.dequeue(t.signalMask) -} - -// TakeSignal returns a pending signal not blocked by mask. Signal handlers are -// not affected. If there are no pending signals not blocked by mask, -// TakeSignal returns a nil SignalInfo. 
-func (t *Task) TakeSignal(mask linux.SignalSet) *arch.SignalInfo { - t.tg.pidns.owner.mu.RLock() - defer t.tg.pidns.owner.mu.RUnlock() - t.tg.signalHandlers.mu.Lock() - defer t.tg.signalHandlers.mu.Unlock() +func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *arch.SignalInfo { if info := t.pendingSignals.dequeue(mask); info != nil { return info } @@ -294,6 +281,49 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { return ctrlResume, nil } +// Sigtimedwait implements the semantics of sigtimedwait(2). +// +// Preconditions: The caller must be running on the task goroutine. t.exitState +// < TaskExitZombie. +func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { + // set is the set of signals we're interested in; invert it to get the set + // of signals to block. + mask := ^set &^ UnblockableSignals + + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if info := t.dequeueSignalLocked(mask); info != nil { + return info, nil + } + + if timeout == 0 { + return nil, syserror.EAGAIN + } + + // Unblock signals we're waiting for. Remember the original signal mask so + // that Task.sendSignalTimerLocked doesn't discard ignored signals that + // we're temporarily unblocking. + t.realSignalMask = t.signalMask + t.setSignalMaskLocked(t.signalMask & mask) + + // Wait for a timeout or new signal. + t.tg.signalHandlers.mu.Unlock() + _, err := t.BlockWithTimeout(nil, true, timeout) + t.tg.signalHandlers.mu.Lock() + + // Restore the original signal mask. + t.setSignalMaskLocked(t.realSignalMask) + t.realSignalMask = 0 + + if info := t.dequeueSignalLocked(mask); info != nil { + return info, nil + } + if err == syserror.ETIMEDOUT { + return nil, syserror.EAGAIN + } + return nil, err +} + // SendSignal sends the given signal to t. 
// // The following errors may be returned: @@ -431,7 +461,7 @@ func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *I // Linux's kernel/signal.c:__send_signal() => prepare_signal() => // sig_ignored(). ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore - if linux.SignalSetOf(sig)&t.signalMask == 0 && ignored && !t.hasTracer() { + if sigset := linux.SignalSetOf(sig); sigset&t.signalMask == 0 && sigset&t.realSignalMask == 0 && ignored && !t.hasTracer() { t.Debugf("Discarding ignored signal %d", sig) if timer != nil { timer.signalRejectedLocked() @@ -1010,7 +1040,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { } // Are there signals pending? - if info := t.dequeueSignalLocked(); info != nil { + if info := t.dequeueSignalLocked(t.signalMask); info != nil { if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 && t.tg.groupStopPhase == groupStopNone { // Indicate that we've dequeued a stop signal before // unlocking the signal mutex; initiateGroupStop will check diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 66ecb1299..ecdec5d3a 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -343,44 +343,6 @@ func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) } -func sigtimedwait(t *kernel.Task, mask linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { - // Is it already pending? - if info := t.TakeSignal(^mask); info != nil { - return info, nil - } - - // No signals available immediately and asked not to wait. - if timeout == 0 { - return nil, syserror.EAGAIN - } - - // No signals available yet. Temporarily unblock the ones we are interested - // in then wait for either a timeout or a new signal. 
- oldmask := t.SignalMask() - t.SetSignalMask(oldmask &^ mask) - _, err := t.BlockWithTimeout(nil, true, timeout) - t.SetSignalMask(oldmask) - - // How did the wait go? - switch err { - case syserror.ErrInterrupted: - if info := t.TakeSignal(^mask); info != nil { - // Got one of the signals we were waiting for. - return info, nil - } - // Got a signal we weren't waiting for. - return nil, syserror.EINTR - case syserror.ETIMEDOUT: - // Timed out and still no signals. - return nil, syserror.EAGAIN - default: - // Some other error? Shouldn't be possible. The event channel - // passed to BlockWithTimeout was nil, so the only two ways the - // block could've ended are a timeout or an interrupt. - panic("unreachable") - } -} - // RtSigpending implements linux syscall rt_sigpending(2). func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -415,23 +377,18 @@ func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne timeout = time.Duration(math.MaxInt64) } - si, err := sigtimedwait(t, mask, timeout) + si, err := t.Sigtimedwait(mask, timeout) if err != nil { return 0, nil, err } - if si != nil { - if siginfo != 0 { - si.FixSignalCodeForUser() - if _, err := t.CopyOut(siginfo, si); err != nil { - return 0, nil, err - } + if siginfo != 0 { + si.FixSignalCodeForUser() + if _, err := t.CopyOut(siginfo, si); err != nil { + return 0, nil, err } - return uintptr(si.Signo), nil, nil } - - // sigtimedwait's not supposed to return nil si and err... - return 0, nil, nil + return uintptr(si.Signo), nil, nil } // RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2). -- cgit v1.2.3 From b84bfa570d76e6979d5cfc40c235ffe74de9f9ca Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 14 Sep 2018 12:28:43 -0700 Subject: Make gVisor hard link check match Linux's. Linux permits hard-linking if the target is owned by the user OR the target has Read+Write permission. 
PiperOrigin-RevId: 213024613 Change-Id: If642066317b568b99084edd33ee4e8822ec9cbb3 --- pkg/sentry/fs/dirent.go | 5 +++++ pkg/sentry/syscalls/linux/sys_file.go | 25 +++++++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index f81f7d627..dd2b4e589 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -773,6 +773,11 @@ func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Diren return syscall.EXDEV } + // Directories are never linkable. See fs/namei.c:vfs_link. + if IsDir(target.Inode.StableAttr) { + return syscall.EPERM + } + return d.genericCreate(ctx, root, name, func() error { if err := d.Inode.CreateHardLink(ctx, d, target, name); err != nil { return err diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 3e28d4b8a..97881a1f5 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1122,15 +1122,32 @@ func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // // This corresponds to Linux's fs/namei.c:may_linkat. func mayLinkAt(t *kernel.Task, target *fs.Inode) error { + // Linux will impose the following restrictions on hard links only if + // sysctl_protected_hardlinks is enabled. The kernel disables this + // setting by default for backward compatibility (see commit + // 561ec64ae67e), but also recommends that distributions enable it (and + // Debian does: + // https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=889098). + // + // gVisor currently behaves as though sysctl_protected_hardlinks is + // always enabled, and thus imposes the following restrictions on hard + // links. + // Technically Linux is more restrictive in 3.11.10 (requires CAP_FOWNER in // root user namespace); this is from the later f2ca379642d7 "namei: permit // linking with CAP_FOWNER in userns". 
- if !target.CheckOwnership(t) { - return syserror.EPERM + if target.CheckOwnership(t) { + // fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER) + // can hardlink all they like." + return nil } - // Check that the target is not a directory and that permissions are okay. - if fs.IsDir(target.StableAttr) || target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil { + // If we are not the owner, then the file must be regular and have + // Read+Write permissions. + if !fs.IsRegular(target.StableAttr) { + return syserror.EPERM + } + if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil { return syserror.EPERM } -- cgit v1.2.3 From 3aa50f18a4102429aa40f5d0e518357ceaed2373 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 14 Sep 2018 15:58:56 -0700 Subject: Reuse readlink parameter, add sockaddr max. PiperOrigin-RevId: 213058623 Change-Id: I522598c655d633b9330990951ff1c54d1023ec29 --- pkg/abi/linux/socket.go | 4 ++++ pkg/sentry/fs/host/util_unsafe.go | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 9a78cc131..19b5fa212 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -140,6 +140,10 @@ const ( SO_TYPE = 3 ) +// SockAddrMax is the maximum size of a struct sockaddr, from +// uapi/linux/socket.h. +const SockAddrMax = 128 + // SockAddrInt is struct sockaddr_in, from uapi/linux/in.h. type SockAddrInet struct { Family uint16 diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go index c38d2392d..2ecb54319 100644 --- a/pkg/sentry/fs/host/util_unsafe.go +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -23,6 +23,9 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" ) +// NulByte is a single NUL byte. It is passed to readlinkat as an empty string. 
+var NulByte byte = '\x00' + func createLink(fd int, name string, linkName string) error { namePtr, err := syscall.BytePtrFromString(name) if err != nil { @@ -50,7 +53,7 @@ func readLink(fd int) (string, error) { n, _, errno := syscall.Syscall6( syscall.SYS_READLINKAT, uintptr(fd), - uintptr(unsafe.Pointer(syscall.StringBytePtr(""))), + uintptr(unsafe.Pointer(&NulByte)), // "" uintptr(unsafe.Pointer(&b[0])), uintptr(l), 0, 0) -- cgit v1.2.3 From de5a590ee203b4ee217da68dbec8e58a7753e520 Mon Sep 17 00:00:00 2001 From: newmanwang Date: Fri, 14 Sep 2018 17:38:16 -0700 Subject: Avoid reuse of pending SignalInfo objects runApp.execute -> Task.SendSignal -> sendSignalLocked -> sendSignalTimerLocked -> pendingSignals.enqueue assumes that it owns the arch.SignalInfo returned from platform.Context.Switch. On the other hand, ptrace.context.Switch assumes that it owns the returned SignalInfo and can safely reuse it on the next call to Switch. The KVM platform always returns a unique SignalInfo. This becomes a problem when the returned signal is not immediately delivered, allowing a future signal in Switch to change the previous pending SignalInfo. This is noticeable in #38 when external SIGINTs are delivered from the PTY slave FD. Note that the ptrace stubs are in the same process group as the sentry, so they are eligible to receive the PTY signals. This should probably change, but is not the only possible cause of this bug. Updates #38 Original change by newmanwang , updated by Michael Pratt . 
Change-Id: I5383840272309df70a29f67b25e8221f933622cd PiperOrigin-RevId: 213071072 --- pkg/sentry/platform/platform.go | 3 ++- pkg/sentry/platform/ptrace/ptrace.go | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 6eb2acbd7..8a1620d93 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -133,7 +133,8 @@ type Context interface { // - ErrContextSignal: The Context was interrupted by a signal. The // returned *arch.SignalInfo contains information about the signal. If // arch.SignalInfo.Signo == SIGSEGV, the returned usermem.AccessType - // contains the access type of the triggering fault. + // contains the access type of the triggering fault. The caller owns + // the returned SignalInfo. // // - ErrContextInterrupt: The Context was interrupted by a call to // Interrupt(). Switch() may return ErrContextInterrupt spuriously. In diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index a44f549a2..4f20716f7 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -142,9 +142,12 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) ( if isSyscall { return nil, usermem.NoAccess, nil } + + si := c.signalInfo + if faultSP == nil { // Non-fault signal. - return &c.signalInfo, usermem.NoAccess, platform.ErrContextSignal + return &si, usermem.NoAccess, platform.ErrContextSignal } // Got a page fault. Ideally, we'd get real fault type here, but ptrace @@ -168,7 +171,7 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) ( // here, in case this fault was generated by a CPUID exception. There // is no way to distinguish between CPUID-generated faults and regular // page faults. 
- return &c.signalInfo, at, platform.ErrContextSignalCPUID + return &si, at, platform.ErrContextSignalCPUID } // Interrupt interrupts the running guest application associated with this context. -- cgit v1.2.3 From d639c3d61bfdbd42eb809c21a15275cc75524b7e Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 17 Sep 2018 12:15:35 -0700 Subject: Allow NULL data in mount(2) PiperOrigin-RevId: 213315267 Change-Id: I7562bcd81fb22e90aa9c7dd9eeb94803fcb8c5af --- pkg/sentry/syscalls/linux/sys_mount.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index d70b79e4f..57cedccc1 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -46,13 +46,16 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } - // In Linux, a full page is always copied in regardless of null - // character placement, and the address is passed to each file system. - // Most file systems always treat this data as a string, though, and so - // do all of the ones we implement. - data, err := t.CopyInString(dataAddr, usermem.PageSize) - if err != nil { - return 0, nil, err + data := "" + if dataAddr != 0 { + // In Linux, a full page is always copied in regardless of null + // character placement, and the address is passed to each file system. + // Most file systems always treat this data as a string, though, and so + // do all of the ones we implement. + data, err = t.CopyInString(dataAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } } // Ignore magic value that was required before Linux 2.4. 
-- cgit v1.2.3 From ab6fa44588233fa48d1ae0bf7d9b0d9e984a6af0 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 17 Sep 2018 13:35:00 -0700 Subject: Allow kernel.(*Task).Block to accept an extract only channel PiperOrigin-RevId: 213328293 Change-Id: I4164133e6f709ecdb89ffbb5f7df3324c273860a --- pkg/sentry/fs/lock/lock.go | 2 +- pkg/sentry/kernel/task_block.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index e9b376eb6..439e645db 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -121,7 +121,7 @@ type Locks struct { // Blocker is the interface used for blocking locks. Passing a nil Blocker // will be treated as non-blocking. type Blocker interface { - Block(C chan struct{}) error + Block(C <-chan struct{}) error } const ( diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 9fd24f134..6dc7b938e 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -95,7 +95,7 @@ func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline kt // Most clients should use BlockWithDeadline or BlockWithTimeout instead. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) BlockWithTimer(C chan struct{}, tchan <-chan struct{}) error { +func (t *Task) BlockWithTimer(C <-chan struct{}, tchan <-chan struct{}) error { return t.block(C, tchan) } @@ -104,13 +104,13 @@ func (t *Task) BlockWithTimer(C chan struct{}, tchan <-chan struct{}) error { // is interrupted. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) Block(C chan struct{}) error { +func (t *Task) Block(C <-chan struct{}) error { return t.block(C, nil) } // block blocks a task on one of many events. // N.B. defer is too expensive to be used here. 
-func (t *Task) block(C chan struct{}, timerChan <-chan struct{}) error { +func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { // Fast path if the request is already done. select { case <-C: -- cgit v1.2.3 From bb88c187c5457df14fa78e5e6b6f48cbc90fb489 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Mon, 17 Sep 2018 16:24:05 -0700 Subject: runsc: Enable waiting on exited processes. This makes `runsc wait` behave more like waitpid()/wait4() in that: - Once a process has run to completion, you can wait on it and get its exit code. - Processes not waited on will consume memory (like a zombie process) PiperOrigin-RevId: 213358916 Change-Id: I5b5eca41ce71eea68e447380df8c38361a4d1558 --- pkg/sentry/control/proc.go | 14 ++-- pkg/sentry/kernel/kernel.go | 17 ++--- runsc/boot/controller.go | 33 +++------ runsc/boot/loader.go | 114 +++++++++++++++++++++++++++----- runsc/boot/loader_test.go | 25 +++---- runsc/cmd/exec.go | 14 +++- runsc/cmd/wait.go | 4 +- runsc/container/container.go | 8 +-- runsc/container/container_test.go | 4 +- runsc/container/multi_container_test.go | 94 ++++++++++++++++++++++++-- runsc/sandbox/sandbox.go | 7 +- 11 files changed, 248 insertions(+), 86 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 19bc76f5c..68d3b179b 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -87,7 +87,7 @@ type ExecArgs struct { // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { - newTG, err := proc.execAsync(args) + newTG, _, err := proc.execAsync(args) if err != nil { return err } @@ -100,11 +100,13 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined // as a function rather than a method to avoid exposing execAsync as an RPC. 
-func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, error) { +func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, error) { return proc.execAsync(args) } -func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { +// execAsync runs a new task, but doesn't wait for it to finish. It returns the +// newly created thread group and its PID. +func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, error) { // Import file descriptors. l := limits.NewLimitSet() fdm := proc.Kernel.NewFDMap() @@ -144,7 +146,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { paths := fs.GetPath(initArgs.Envv) f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.Filename = f } @@ -156,7 +158,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { // Import the given file FD. This dups the FD as well. 
file, err := host.ImportFile(ctx, int(f.Fd()), mounter, enableIoctl) if err != nil { - return nil, err + return nil, 0, err } defer file.DecRef() @@ -164,7 +166,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, error) { f.Close() if err := fdm.NewFDAt(kdefs.FD(appFD), file, kernel.FDFlags{}, l); err != nil { - return nil, err + return nil, 0, err } } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 316612b37..f71e32ac9 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -596,13 +596,13 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { // // CreateProcess has no analogue in Linux; it is used to create the initial // application task, as well as processes started by the control server. -func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { k.extMu.Lock() defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) if k.mounts == nil { - return nil, fmt.Errorf("no kernel MountNamespace") + return nil, 0, fmt.Errorf("no kernel MountNamespace") } tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) @@ -622,7 +622,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { var err error wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals) if err != nil { - return nil, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } defer wd.DecRef() } @@ -630,10 +630,10 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { if args.Filename == "" { // Was anything provided? 
if len(args.Argv) == 0 { - return nil, fmt.Errorf("no filename or command provided") + return nil, 0, fmt.Errorf("no filename or command provided") } if !filepath.IsAbs(args.Argv[0]) { - return nil, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) } args.Filename = args.Argv[0] } @@ -641,7 +641,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { // Create a fresh task context. tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) if err != nil { - return nil, err + return nil, 0, err } // Take a reference on the FDMap, which will be transferred to @@ -663,17 +663,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { } t, err := k.tasks.NewTask(config) if err != nil { - return nil, err + return nil, 0, err } // Success. + tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.started { tid := k.tasks.Root.IDOfTask(t) t.Start(tid) } else if k.globalInit == nil { k.globalInit = tg } - return tg, nil + return tg, tgid, nil } // Start starts execution of all tasks in k. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 4d41dcd6c..dc9359092 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -242,32 +242,11 @@ type ExecArgs struct { // returns the pid of the new process. func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error { log.Debugf("containerManager.ExecuteAsync: %+v", args) - - // Get the container Root Dirent from the Task, since we must run this - // process with the same Root. 
- cm.l.mu.Lock() - tg, ok := cm.l.containerRootTGs[args.CID] - cm.l.mu.Unlock() - if !ok { - return fmt.Errorf("cannot exec in container %q: no such container", args.CID) - } - tg.Leader().WithMuLocked(func(t *kernel.Task) { - args.Root = t.FSContext().RootDirectory() - }) - if args.Root != nil { - defer args.Root.DecRef() - } - - // Start the process. - proc := control.Proc{Kernel: cm.l.k} - newTG, err := control.ExecAsync(&proc, &args.ExecArgs) + tgid, err := cm.l.executeAsync(&args.ExecArgs, args.CID) if err != nil { - return fmt.Errorf("error executing: %+v: %v", args, err) + return err } - - // Return the pid of the newly-created process. - ts := cm.l.k.TaskSet() - *pid = int32(ts.Root.IDOfThreadGroup(newTG)) + *pid = int32(tgid) return nil } @@ -409,12 +388,16 @@ type WaitPIDArgs struct { // CID is the container ID. CID string + + // ClearStatus determines whether the exit status of the process should + // be cleared when WaitPID returns. + ClearStatus bool } // WaitPID waits for the process with PID 'pid' in the sandbox. func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { log.Debugf("containerManager.Wait") - return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) + return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus) } // SignalArgs are arguments to the Signal method. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 5e9ccb96f..665240ab6 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -103,7 +104,7 @@ type Loader struct { // sandboxID is the ID for the whole sandbox. 
sandboxID string - // mu guards containerRootTGs. + // mu guards containerRootTGs and execProcesses. mu sync.Mutex // containerRootTGs maps container IDs to their root processes. It @@ -111,7 +112,24 @@ type Loader struct { // call methods on particular containers. // // containerRootTGs is guarded by mu. + // + // TODO: When containers are removed via `runsc delete`, + // containerRootTGs should be cleaned up. containerRootTGs map[string]*kernel.ThreadGroup + + // execProcesses maps each invocation of exec to the process it spawns. + // + // execProcesses is guardded by mu. + // + // TODO: When containers are removed via `runsc delete`, + // execProcesses should be cleaned up. + execProcesses map[execID]*kernel.ThreadGroup +} + +// execID uniquely identifies a sentry process. +type execID struct { + cid string + pid kernel.ThreadID } func init() { @@ -385,7 +403,8 @@ func (l *Loader) run() error { } // Create the root container init task. - if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { + _, _, err := l.k.CreateProcess(l.rootProcArgs) + if err != nil { return fmt.Errorf("failed to create init process: %v", err) } @@ -393,6 +412,11 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } + if l.execProcesses != nil { + return fmt.Errorf("there shouldn't already be a cache of exec'd processes, but found: %v", l.execProcesses) + } + l.execProcesses = make(map[execID]*kernel.ThreadGroup) + // Start signal forwarding only after an init process is created. 
l.stopSignalForwarding = l.startSignalForwarding() @@ -467,7 +491,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) } - tg, err := l.k.CreateProcess(procArgs) + tg, _, err := l.k.CreateProcess(procArgs) if err != nil { return fmt.Errorf("failed to create process in sentry: %v", err) } @@ -482,6 +506,40 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return nil } +func (l *Loader) executeAsync(args *control.ExecArgs, cid string) (kernel.ThreadID, error) { + // Get the container Root Dirent from the Task, since we must run this + // process with the same Root. + l.mu.Lock() + tg, ok := l.containerRootTGs[cid] + l.mu.Unlock() + if !ok { + return 0, fmt.Errorf("cannot exec in container %q: no such container", cid) + } + tg.Leader().WithMuLocked(func(t *kernel.Task) { + args.Root = t.FSContext().RootDirectory() + }) + if args.Root != nil { + defer args.Root.DecRef() + } + + // Start the process. + proc := control.Proc{Kernel: l.k} + tg, tgid, err := control.ExecAsync(&proc, args) + if err != nil { + return 0, fmt.Errorf("error executing: %+v: %v", args, err) + } + + // Insert the process into execProcesses so that we can wait on it + // later. + l.mu.Lock() + defer l.mu.Unlock() + eid := execID{cid: cid, pid: tgid} + l.execProcesses[eid] = tg + log.Debugf("updated execProcesses: %v", l.execProcesses) + + return tgid, nil +} + // TODO: Per-container namespaces must be supported for -pid. // waitContainer waits for the root process of a container to exit. @@ -500,39 +558,59 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // consider the container exited. // TODO: Multiple calls to waitContainer() should return // the same exit status. - defer func() { - l.mu.Lock() - defer l.mu.Unlock() - // TODO: Containers don't map 1:1 with their root - // processes. 
Container exits should be managed explicitly - // rather than via PID. - delete(l.containerRootTGs, cid) - }() - l.wait(tg, waitStatus) + ws := l.wait(tg) + *waitStatus = ws + + l.mu.Lock() + defer l.mu.Unlock() + delete(l.containerRootTGs, cid) + return nil } -func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { +func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error { // TODO: Containers all currently share a PID namespace. // When per-container PID namespaces are supported, wait should use cid // to find the appropriate PID namespace. /*if cid != l.sandboxID { return errors.New("non-sandbox PID namespaces are not yet implemented") }*/ - // TODO: This won't work if the exec process already exited. - tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) + + // If the process was started via runsc exec, it will have an + // entry in l.execProcesses. + l.mu.Lock() + eid := execID{cid: cid, pid: tgid} + tg, ok := l.execProcesses[eid] + l.mu.Unlock() + if ok { + ws := l.wait(tg) + *waitStatus = ws + if clearStatus { + // Remove tg from the cache. + l.mu.Lock() + delete(l.execProcesses, eid) + log.Debugf("updated execProcesses (removal): %v", l.execProcesses) + l.mu.Unlock() + } + return nil + } + + // This process wasn't created by runsc exec or start, so just find it + // by pid and hope it hasn't exited yet. + tg = l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) if tg == nil { return fmt.Errorf("no thread group with ID %d", tgid) } - l.wait(tg, waitStatus) + ws := l.wait(tg) + *waitStatus = ws return nil } // wait waits for the process with TGID 'tgid' in a container's PID namespace // to exit. 
-func (l *Loader) wait(tg *kernel.ThreadGroup, waitStatus *uint32) { +func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { tg.WaitExited() - *waitStatus = tg.ExitStatus().Status() + return tg.ExitStatus().Status() } func (l *Loader) setRootContainerID(cid string) { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 9398292ff..a8a796445 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -111,11 +111,11 @@ func createLoader() (*Loader, func(), error) { // TestRun runs a simple application in a sandbox and checks that it succeeds. func TestRun(t *testing.T) { - s, cleanup, err := createLoader() + l, cleanup, err := createLoader() if err != nil { t.Fatalf("error creating loader: %v", err) } - defer s.Destroy() + defer l.Destroy() defer cleanup() // Start a goroutine to read the start chan result, otherwise Run will @@ -124,12 +124,13 @@ func TestRun(t *testing.T) { var wg sync.WaitGroup wg.Add(1) go func() { - resultChanErr = <-s.ctrl.manager.startResultChan + resultChanErr = <-l.ctrl.manager.startResultChan wg.Done() }() - // Run the container.. - if err := s.Run(); err != nil { + // Run the container. + l.setRootContainerID("foo") + if err := l.Run(); err != nil { t.Errorf("error running container: %v", err) } @@ -140,7 +141,7 @@ func TestRun(t *testing.T) { } // Wait for the application to exit. It should succeed. - if status := s.WaitExit(); status.Code != 0 || status.Signo != 0 { + if status := l.WaitExit(); status.Code != 0 || status.Signo != 0 { t.Errorf("application exited with status %+v, want 0", status) } } @@ -148,24 +149,24 @@ func TestRun(t *testing.T) { // TestStartSignal tests that the controller Start message will cause // WaitForStartSignal to return. 
func TestStartSignal(t *testing.T) { - s, cleanup, err := createLoader() + l, cleanup, err := createLoader() if err != nil { t.Fatalf("error creating loader: %v", err) } - defer s.Destroy() + defer l.Destroy() defer cleanup() // We aren't going to wait on this application, so the control server // needs to be shut down manually. - defer s.ctrl.srv.Stop() + defer l.ctrl.srv.Stop() // Start a goroutine that calls WaitForStartSignal and writes to a // channel when it returns. waitFinished := make(chan struct{}) go func() { - s.WaitForStartSignal() + l.WaitForStartSignal() // Pretend that Run() executed and returned no error. - s.ctrl.manager.startResultChan <- nil + l.ctrl.manager.startResultChan <- nil waitFinished <- struct{}{} }() @@ -181,7 +182,7 @@ func TestStartSignal(t *testing.T) { // Trigger the control server StartRoot method. cid := "foo" - if err := s.ctrl.manager.StartRoot(&cid, nil); err != nil { + if err := l.ctrl.manager.StartRoot(&cid, nil); err != nil { t.Errorf("error calling StartRoot: %v", err) } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 0d1fa6e20..957c4f0ff 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -49,6 +49,7 @@ type Exec struct { extraKGIDs stringSlice caps stringSlice detach bool + clearStatus bool processPath string pidFile string internalPidFile string @@ -100,6 +101,9 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") + + // clear-status is expected to only be set when we fork due to --detach being set. 
+ f.BoolVar(&ex.clearStatus, "clear-status", true, "clear the status of the exec'd process upon completion") } // Execute implements subcommands.Command.Execute. It starts a process in an @@ -163,7 +167,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Wait for the process to exit. - ws, err := c.WaitPID(pid) + ws, err := c.WaitPID(pid, ex.clearStatus) if err != nil { Fatalf("error waiting on pid %d: %v", pid, err) } @@ -194,10 +198,16 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // Add the rest of the args, excluding the "detach" flag. for _, a := range os.Args[1:] { - if !strings.Contains(a, "detach") { + if strings.Contains(a, "detach") { + // Replace with the "clear-status" flag, which tells + // the new process it's a detached child and shouldn't + // clear the exit status of the sentry process. + args = append(args, "--clear-status=false") + } else { args = append(args, a) } } + cmd := exec.Command(binPath, args...) // Exec stdio defaults to current process stdio. diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index b41edc725..956349140 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -88,14 +88,14 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) waitStatus = ws // Wait on a PID in the root PID namespace. case wt.rootPID != unsetPID: - ws, err := c.WaitRootPID(int32(wt.rootPID)) + ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */) if err != nil { Fatalf("error waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) } waitStatus = ws // Wait on a PID in the container's PID namespace. 
case wt.pid != unsetPID: - ws, err := c.WaitPID(int32(wt.pid)) + ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */) if err != nil { Fatalf("error waiting on PID %d in container %q: %v", wt.pid, c.ID, err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 792b7967b..a24c6cc31 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -398,22 +398,22 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and // returns its WaitStatus. -func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { +func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) if c.Sandbox == nil || !c.Sandbox.IsRunning() { return 0, fmt.Errorf("container sandbox is not running") } - return c.Sandbox.WaitPID(pid, c.Sandbox.ID) + return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus) } // WaitPID waits for process 'pid' in the container's PID namespace and returns // its WaitStatus. -func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { +func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in container %q", pid, c.ID) if c.Sandbox == nil || !c.Sandbox.IsRunning() { return 0, fmt.Errorf("container sandbox is not running") } - return c.Sandbox.WaitPID(pid, c.ID) + return c.Sandbox.WaitPID(c.ID, pid, clearStatus) } // Signal sends the signal to the container. 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index ab1823f1c..5fe80f20f 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -551,7 +551,7 @@ func TestExec(t *testing.T) { args := &control.ExecArgs{ Filename: "/bin/sleep", - Argv: []string{"sleep", "5"}, + Argv: []string{"/bin/sleep", "5"}, WorkingDirectory: "/", KUID: uid, } @@ -1598,7 +1598,7 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, if err != nil { return 0, fmt.Errorf("error executing: %v", err) } - ws, err := cont.WaitPID(pid) + ws, err := cont.WaitPID(pid, true /* clearStatus */) if err != nil { return 0, fmt.Errorf("error waiting: %v", err) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 84e0ec080..09888cb86 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -163,16 +163,15 @@ func TestMultiContainerWait(t *testing.T) { go func(c *Container) { defer wg.Done() const pid = 2 - if ws, err := c.WaitPID(pid); err != nil { + if ws, err := c.WaitPID(pid, true /* clearStatus */); err != nil { t.Errorf("failed to wait for PID %d: %v", pid, err) } else if es := ws.ExitStatus(); es != 0 { t.Errorf("PID %d exited with non-zero status %d", pid, es) } - if _, err := c.WaitPID(pid); err == nil { + if _, err := c.WaitPID(pid, true /* clearStatus */); err == nil { t.Errorf("wait for stopped PID %d should fail", pid) } - // TODO: use 'container[1]' when PID namespace is supported. - }(containers[0]) + }(containers[1]) } wg.Wait() @@ -184,6 +183,93 @@ func TestMultiContainerWait(t *testing.T) { } } +// TestExecWait ensures what we can wait containers and individual processes in the +// sandbox that have already exited. 
+func TestExecWait(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + // The first container should run the entire duration of the test. + cmd1 := []string{"sleep", "100"} + // We'll wait on the second container, which is much shorter lived. + cmd2 := []string{"sleep", "1"} + specs, ids := createSpecs(cmd1, cmd2) + + // Setup the containers. + var containers []*Container + for i, spec := range specs { + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Fatalf("failed to wait for sleep to start: %v", err) + } + + // Wait for the second container to finish. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Fatalf("failed to wait for second container to stop: %v", err) + } + + // Get the second container exit status. 
+ if ws, err := containers[1].Wait(); err != nil { + t.Fatalf("failed to wait for process %s: %v", containers[1].Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Fatalf("process %s exited with non-zero status %d", containers[1].Spec.Process.Args, es) + } + if _, err := containers[1].Wait(); err == nil { + t.Fatalf("wait for stopped process %s should fail", containers[1].Spec.Process.Args) + } + + // Execute another process in the first container. + args := &control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"/bin/sleep", "1"}, + WorkingDirectory: "/", + KUID: 0, + } + pid, err := containers[0].Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } + + // Wait for the exec'd process to exit. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Fatalf("failed to wait for second container to stop: %v", err) + } + + // Get the exit status from the exec'd process. + if ws, err := containers[0].WaitPID(pid, true /* clearStatus */); err != nil { + t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Fatalf("process %+v exited with non-zero status %d", args, es) + } + if _, err := containers[0].WaitPID(pid, true /* clearStatus */); err == nil { + t.Fatalf("wait for stopped process %+v should fail", args) + } +} + // TestMultiContainerMount tests that bind mounts can be used with multiple // containers. func TestMultiContainerMount(t *testing.T) { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 8c4d0d495..3b10fd20e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -522,7 +522,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { // WaitPID waits for process 'pid' in the container's sandbox and returns its // WaitStatus. 
-func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { +func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus conn, err := s.sandboxConnect() @@ -532,8 +532,9 @@ func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { defer conn.Close() args := &boot.WaitPIDArgs{ - PID: pid, - CID: cid, + PID: pid, + CID: cid, + ClearStatus: clearStatus, } if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { return ws, fmt.Errorf("error waiting on PID %d in sandbox %q: %v", pid, s.ID, err) -- cgit v1.2.3 From 5d9816be41a967fa1fa9bbbe0c638dd322c7c0b1 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 17 Sep 2018 21:33:51 -0700 Subject: Remove memory usage static init panic() during init() can be hard to debug. Updates #100 PiperOrigin-RevId: 213391932 Change-Id: Ic103f1981c5b48f1e12da3b42e696e84ffac02a9 --- pkg/sentry/usage/memory.go | 16 +++++++++------- runsc/boot/loader.go | 4 ++++ 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 4a1527b5f..f13a77779 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -117,15 +117,16 @@ type MemoryLocked struct { File *os.File } -func newMemoryLocked() MemoryLocked { - name := "memory-usage" +// Init initializes global 'MemoryAccounting'. 
+func Init() error { + const name = "memory-usage" fd, err := memutil.CreateMemFD(name, 0) if err != nil { - panic("error creating usage file: " + err.Error()) + return fmt.Errorf("error creating usage file: %v", err) } file := os.NewFile(uintptr(fd), name) if err := file.Truncate(int64(RTMemoryStatsSize)); err != nil { - panic("error truncating usage file: " + err.Error()) + return fmt.Errorf("error truncating usage file: %v", err) } // Note: We rely on the returned page being initially zeroed. This will // always be the case for a newly mapped page from /dev/shm. If we obtain @@ -133,13 +134,14 @@ func newMemoryLocked() MemoryLocked { // explicitly zero the page. mmap, err := syscall.Mmap(int(file.Fd()), 0, int(RTMemoryStatsSize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) if err != nil { - panic("error mapping usage file: " + err.Error()) + return fmt.Errorf("error mapping usage file: %v", err) } - return MemoryLocked{ + MemoryAccounting = &MemoryLocked{ File: file, RTMemoryStats: RTMemoryStatsPointer(mmap), } + return nil } // MemoryAccounting is the global memory stats. @@ -147,7 +149,7 @@ func newMemoryLocked() MemoryLocked { // There is no need to save or restore the global memory accounting object, // because individual frame kinds are saved and charged only when they become // resident. 
-var MemoryAccounting = newMemoryLocked() +var MemoryAccounting *MemoryLocked func (m *MemoryLocked) incLocked(val uint64, kind MemoryKind) { switch kind { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 665240ab6..faaf3e800 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -42,6 +42,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling" slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" @@ -143,6 +144,9 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. func New(spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, console bool) (*Loader, error) { + if err := usage.Init(); err != nil { + return nil, fmt.Errorf("Error setting up memory usage: %v", err) + } // Create kernel and platform. 
p, err := createPlatform(conf, deviceFD) if err != nil { -- cgit v1.2.3 From da20559137ccbf7f27e6008472f4d9159306df4a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 18 Sep 2018 02:08:11 -0700 Subject: Provide better message when memfd_create fails with ENOSYS Updates #100 PiperOrigin-RevId: 213414821 Change-Id: I90c2e6c18c54a6afcd7ad6f409f670aa31577d37 --- pkg/sentry/platform/filemem/filemem.go | 3 +++ pkg/sentry/usage/memory.go | 3 +++ 2 files changed, 6 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index feb020ef8..f278c8d63 100644 --- a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -233,6 +233,9 @@ func newFromFile(file *os.File) (*FileMem, error) { func New(name string) (*FileMem, error) { fd, err := memutil.CreateMemFD(name, 0) if err != nil { + if e, ok := err.(syscall.Errno); ok && e == syscall.ENOSYS { + return nil, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") + } return nil, err } return newFromFile(os.NewFile(uintptr(fd), name)) diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index f13a77779..92a478d85 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -122,6 +122,9 @@ func Init() error { const name = "memory-usage" fd, err := memutil.CreateMemFD(name, 0) if err != nil { + if e, ok := err.(syscall.Errno); ok && e == syscall.ENOSYS { + return fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") + } return fmt.Errorf("error creating usage file: %v", err) } file := os.NewFile(uintptr(fd), name) -- cgit v1.2.3 From ed08597d121a624592e5517a28ae40ddbcc59cb0 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 18 Sep 2018 11:13:27 -0700 Subject: Allow for MSG_CTRUNC in input flags for recv. 
PiperOrigin-RevId: 213481363 Change-Id: I8150ea20cebeb207afe031ed146244de9209e745 --- pkg/sentry/syscalls/linux/sys_socket.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index d6d5dba8a..867fec468 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -602,7 +602,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { return 0, nil, syscall.EINVAL } @@ -635,7 +635,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_TRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { return 0, nil, syscall.EINVAL } @@ -791,7 +791,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CONFIRM) != 0 { + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CONFIRM) != 0 { return 0, syscall.EINVAL } -- cgit v1.2.3 From fd222d62eda8b447fa0e11260f64fdb94e5e7084 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 18 Sep 2018 15:41:13 -0700 Subject: Short-circuit Readdir calls on overlay files when the dirent is frozen. 
If we have an overlay file whose corresponding Dirent is frozen, then we should not bother calling Readdir on the upper or lower files, since DirentReaddir will calculate children based on the frozen Dirent tree. A test was added that fails without this change. PiperOrigin-RevId: 213531215 Change-Id: I4d6c98f1416541a476a34418f664ba58f936a81d --- pkg/sentry/fs/file_overlay.go | 22 ++++++---- pkg/sentry/fs/file_overlay_test.go | 83 +++++++++++++++++++++++++++++++++++++ pkg/sentry/fs/inode_overlay_test.go | 12 +++++- 3 files changed, 109 insertions(+), 8 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 113962368..41e646ee8 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -163,6 +163,21 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See // Readdir implements FileOperations.Readdir. func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) { + root := RootFromContext(ctx) + defer root.DecRef() + dirCtx := &DirCtx{ + Serializer: serializer, + DirCursor: &f.dirCursor, + } + + // If the directory dirent is frozen, then DirentReaddir will calculate + // the children based off the frozen dirent tree. There is no need to + // call readdir on the upper/lower layers. + if file.Dirent.frozen { + return DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) + } + + // Otherwise proceed with usual overlay readdir. 
o := file.Dirent.Inode.overlay o.copyMu.RLock() @@ -174,13 +189,6 @@ func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, seriali return file.Offset(), err } - root := RootFromContext(ctx) - defer root.DecRef() - - dirCtx := &DirCtx{ - Serializer: serializer, - DirCursor: &f.dirCursor, - } return DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) } diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 38762d8a1..830458ff9 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -174,6 +174,89 @@ func TestReaddirRevalidation(t *testing.T) { } } +// TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with +// a frozen dirent tree does not make Readdir calls to the underlying files. +func TestReaddirOverlayFrozen(t *testing.T) { + ctx := contexttest.Context(t) + + // Create an overlay with two directories, each with two files. + upper := newTestRamfsDir(ctx, []dirContent{{name: "upper-file1"}, {name: "upper-file2"}}, nil) + lower := newTestRamfsDir(ctx, []dirContent{{name: "lower-file1"}, {name: "lower-file2"}}, nil) + overlayInode := fs.NewTestOverlayDir(ctx, upper, lower, false) + + // Set that overlay as the root. + root := fs.NewDirent(overlayInode, "root") + ctx = &rootContext{ + Context: ctx, + root: root, + } + + // Check that calling Readdir on the root now returns all 4 files (2 + // from each layer in the overlay). 
+ rootFile, err := root.Inode.GetFile(ctx, root, fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("root.Inode.GetFile failed: %v", err) + } + defer rootFile.DecRef() + ser := &fs.CollectEntriesSerializer{} + if err := rootFile.Readdir(ctx, ser); err != nil { + t.Fatalf("rootFile.Readdir failed: %v", err) + } + if got, want := ser.Order, []string{".", "..", "lower-file1", "lower-file2", "upper-file1", "upper-file2"}; !reflect.DeepEqual(got, want) { + t.Errorf("Readdir got names %v, want %v", got, want) + } + + // Readdir should have been called on upper and lower. + upperDir := upper.InodeOperations.(*dir) + lowerDir := lower.InodeOperations.(*dir) + if !upperDir.ReaddirCalled { + t.Errorf("upperDir.ReaddirCalled got %v, want true", upperDir.ReaddirCalled) + } + if !lowerDir.ReaddirCalled { + t.Errorf("lowerDir.ReaddirCalled got %v, want true", lowerDir.ReaddirCalled) + } + + // Reset. + upperDir.ReaddirCalled = false + lowerDir.ReaddirCalled = false + + // Take references on "upper-file1" and "lower-file1", pinning them in + // the dirent tree. + for _, name := range []string{"upper-file1", "lower-file1"} { + if _, err := root.Walk(ctx, root, name); err != nil { + t.Fatalf("root.Walk(%q) failed: %v", name, err) + } + // Don't drop a reference on the returned dirent so that it + // will stay in the tree. + } + + // Freeze the dirent tree. + root.Freeze() + + // Seek back to the beginning of the file. + if _, err := rootFile.Seek(ctx, fs.SeekSet, 0); err != nil { + t.Fatalf("error seeking to beginning of directory: %v", err) + } + + // Calling Readdir on the root now will return only the pinned + // children. 
+ ser = &fs.CollectEntriesSerializer{} + if err := rootFile.Readdir(ctx, ser); err != nil { + t.Fatalf("rootFile.Readdir failed: %v", err) + } + if got, want := ser.Order, []string{".", "..", "lower-file1", "upper-file1"}; !reflect.DeepEqual(got, want) { + t.Errorf("Readdir got names %v, want %v", got, want) + } + + // Readdir should NOT have been called on upper or lower. + if upperDir.ReaddirCalled { + t.Errorf("upperDir.ReaddirCalled got %v, want false", upperDir.ReaddirCalled) + } + if lowerDir.ReaddirCalled { + t.Errorf("lowerDir.ReaddirCalled got %v, want false", lowerDir.ReaddirCalled) + } +} + type rootContext struct { context.Context root *fs.Dirent diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 3ee4c9667..23e5635a4 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -372,10 +372,14 @@ func TestCacheFlush(t *testing.T) { type dir struct { fs.InodeOperations - // list of negative child names. + // List of negative child names. negative []string + + // Whether DeprecatedReaddir has been called on this dir. + ReaddirCalled bool } +// Getxattr implements InodeOperations.Getxattr. func (d *dir) Getxattr(inode *fs.Inode, name string) ([]byte, error) { for _, n := range d.negative { if name == fs.XattrOverlayWhiteout(n) { @@ -385,6 +389,12 @@ func (d *dir) Getxattr(inode *fs.Inode, name string) ([]byte, error) { return nil, syserror.ENOATTR } +// DeprecatedReaddir implements InodeOperations.DeprecatedReaddir. 
+func (d *dir) DeprecatedReaddir(ctx context.Context, dirctx *fs.DirCtx, offset int) (int, error) { + d.ReaddirCalled = true + return d.InodeOperations.DeprecatedReaddir(ctx, dirctx, offset) +} + type dirContent struct { name string dir bool -- cgit v1.2.3 From fca9a390db4c965b4606dd85838460841bd4ab14 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 26 Sep 2018 21:58:54 -0700 Subject: Return correct parent PID Old code was returning ID of the thread that created the child process. It should be returning the ID of the parent process instead. PiperOrigin-RevId: 214720910 Change-Id: I95715c535bcf468ecf1ae771cccd04a4cd345b36 --- pkg/sentry/control/proc.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 68d3b179b..b120471cb 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -269,11 +269,14 @@ func Processes(k *kernel.Kernel, out *[]*Process) error { continue } + ppid := kernel.ThreadID(0) + if tg.Leader().Parent() != nil { + ppid = ts.Root.IDOfThreadGroup(tg.Leader().Parent().ThreadGroup()) + } *out = append(*out, &Process{ - UID: tg.Leader().Credentials().EffectiveKUID, - PID: pid, - // If Parent is null (i.e. tg is the init process), PPID will be 0. - PPID: ts.Root.IDOfTask(tg.Leader().Parent()), + UID: tg.Leader().Credentials().EffectiveKUID, + PID: pid, + PPID: ppid, STime: formatStartTime(now, tg.Leader().StartTime()), C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), Time: tg.CPUStats().SysTime.String(), -- cgit v1.2.3 From 234f36b6f2cb0db74d119079e5244619d6ea38ad Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Thu, 27 Sep 2018 10:41:28 -0700 Subject: sentry: export cpuTime function. 
PiperOrigin-RevId: 214798278 Change-Id: Id59d1ceb35037cda0689d3a1c4844e96c6957615 --- pkg/sentry/state/state_metadata.go | 2 +- pkg/sentry/state/state_unsafe.go | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index b6d3dbcb4..afa21672a 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -28,7 +28,7 @@ const ( ) func addSaveMetadata(m map[string]string) { - t, err := cpuTime() + t, err := CPUTime() if err != nil { log.Warningf("Error getting cpu time: %v", err) } diff --git a/pkg/sentry/state/state_unsafe.go b/pkg/sentry/state/state_unsafe.go index 53814ef70..3ff7d24c8 100644 --- a/pkg/sentry/state/state_unsafe.go +++ b/pkg/sentry/state/state_unsafe.go @@ -23,7 +23,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" ) -func cpuTime() (time.Duration, error) { +// CPUTime returns the CPU time usage by Sentry and app. +func CPUTime() (time.Duration, error) { var ts syscall.Timespec _, _, errno := syscall.RawSyscall(syscall.SYS_CLOCK_GETTIME, uintptr(linux.CLOCK_PROCESS_CPUTIME_ID), uintptr(unsafe.Pointer(&ts)), 0) if errno != 0 { -- cgit v1.2.3 From 491faac03b2815ca1bc9b5425c1b3f6291468e20 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 27 Sep 2018 15:00:03 -0700 Subject: Implement 'runsc kill --all' In order to implement kill --all correctly, the Sentry needs to track all tasks that belong to a given container. This change introduces ContainerID to the task, that gets inherited by all children. 'kill --all' then iterates over all tasks comparing the ContainerID field to find all processes that need to be signalled. 
PiperOrigin-RevId: 214841768 Change-Id: I693b2374be8692d88cc441ef13a0ae34abf73ac6 --- pkg/sentry/control/proc.go | 14 ++- pkg/sentry/kernel/kernel.go | 25 +++++ pkg/sentry/kernel/task.go | 12 ++ pkg/sentry/kernel/task_clone.go | 1 + pkg/sentry/kernel/task_start.go | 4 + runsc/boot/controller.go | 29 ++--- runsc/boot/loader.go | 26 +++-- runsc/cmd/kill.go | 15 ++- runsc/container/BUILD | 6 +- runsc/container/container.go | 11 +- runsc/container/container_test.go | 38 +++++-- runsc/container/multi_container_test.go | 190 ++++++++++++++++++++++++++++---- runsc/container/test_app.go | 63 +++++++++++ runsc/sandbox/sandbox.go | 16 ++- 14 files changed, 371 insertions(+), 79 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index b120471cb..106055e86 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -83,6 +83,9 @@ type ExecArgs struct { // FilePayload determines the files to give to the new process. urpc.FilePayload + + // ContainerID is the container for the process being executed. + ContainerID string } // Exec runs a new task. @@ -133,6 +136,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI UTSNamespace: proc.Kernel.RootUTSNamespace(), IPCNamespace: proc.Kernel.RootIPCNamespace(), AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), + ContainerID: args.ContainerID, } if initArgs.Root != nil { // initArgs must hold a reference on Root. This ref is dropped @@ -182,7 +186,7 @@ type PsArgs struct { // Ps provides a process listing for the running kernel. func (proc *Proc) Ps(args *PsArgs, out *string) error { var p []*Process - if e := Processes(proc.Kernel, &p); e != nil { + if e := Processes(proc.Kernel, "", &p); e != nil { return e } if !args.JSON { @@ -258,8 +262,9 @@ func PrintPIDsJSON(pl []*Process) (string, error) { return string(b), nil } -// Processes retrieves information about processes running in the sandbox. 
-func Processes(k *kernel.Kernel, out *[]*Process) error { +// Processes retrieves information about processes running in the sandbox with +// the given container id. All processes are returned if 'containerID' is empty. +func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { ts := k.TaskSet() now := k.RealtimeClock().Now() for _, tg := range ts.Root.ThreadGroups() { @@ -268,6 +273,9 @@ func Processes(k *kernel.Kernel, out *[]*Process) error { if pid == 0 { continue } + if containerID != "" && containerID != tg.Leader().ContainerID() { + continue + } ppid := kernel.ThreadID(0) if tg.Leader().Parent() != nil { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index f71e32ac9..1ace0b501 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -524,6 +524,9 @@ type CreateProcessArgs struct { // Anyone setting Root must donate a reference (i.e. increment it) to // keep it alive until it is decremented by CreateProcess. Root *fs.Dirent + + // ContainerID is the container that the process belongs to. + ContainerID string } // NewContext returns a context.Context that represents the task that will be @@ -660,6 +663,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, + ContainerID: args.ContainerID, } t, err := k.tasks.NewTask(config) if err != nil { @@ -818,6 +822,27 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { k.sendExternalSignal(info, context) } +// SendContainerSignal sends the given signal to all processes inside the +// namespace that match the given container ID. 
+func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + for t := range k.tasks.Root.tids { + if t == t.tg.leader && t.ContainerID() == cid { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + infoCopy := *info + if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + return err + } + } + } + return nil +} + // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2f6f825ac..07ad1614c 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -205,6 +205,13 @@ type Task struct { // k is the Kernel that this task belongs to. The k pointer is immutable. k *Kernel + // containerID has no equivalent in Linux; it's used by runsc to track all + // tasks that belong to a given containers since cgroups aren't implemented. + // It's inherited by the children, is immutable, and may be empty. + // + // NOTE: cgroups can be used to track this when implemented. + containerID string + // mu protects some of the following fields. mu sync.Mutex `state:"nosave"` @@ -678,3 +685,8 @@ func (t *Task) MountNamespace() *fs.MountNamespace { func (t *Task) AbstractSockets() *AbstractSocketNamespace { return t.abstractSockets } + +// ContainerID returns t's container ID. 
+func (t *Task) ContainerID() string { + return t.containerID +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 46c688b20..130bd652b 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -258,6 +258,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { UTSNamespace: utsns, IPCNamespace: ipcns, AbstractSocketNamespace: t.abstractSockets, + ContainerID: t.ContainerID(), } if opts.NewThreadGroup { cfg.Parent = t diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 6ce99d268..6c8d7d316 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -77,6 +77,9 @@ type TaskConfig struct { // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. AbstractSocketNamespace *AbstractSocketNamespace + + // ContainerID is the container the new task belongs to. + ContainerID string } // NewTask creates a new task defined by cfg. @@ -124,6 +127,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { abstractSockets: cfg.AbstractSocketNamespace, rseqCPU: -1, futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, } t.endStopCond.L = &t.tg.signalHandlers.mu t.ptraceTracer.Store((*Task)(nil)) diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index bc33e028a..116a8369c 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -174,10 +174,17 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { return nil } +// ProcessesArgs container arguments to Processes method. +type ProcessesArgs struct { + // CID restricts the result to processes belonging to + // the given container. Empty means all. + CID string +} + // Processes retrieves information about processes running in the sandbox. 
-func (cm *containerManager) Processes(_, out *[]*control.Process) error { +func (cm *containerManager) Processes(args *ProcessesArgs, out *[]*control.Process) error { log.Debugf("containerManager.Processes") - return control.Processes(cm.l.k, out) + return control.Processes(cm.l.k, args.CID, out) } // StartArgs contains arguments to the Start method. @@ -326,19 +333,11 @@ func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { return nil } -// ExecArgs contains arguments to Execute. -type ExecArgs struct { - control.ExecArgs - - // CID is the ID of the container to exec in. - CID string -} - // ExecuteAsync starts running a command on a created or running sandbox. It // returns the pid of the new process. -func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error { +func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { log.Debugf("containerManager.ExecuteAsync: %+v", args) - tgid, err := cm.l.executeAsync(&args.ExecArgs, args.CID) + tgid, err := cm.l.executeAsync(args) if err != nil { return err } @@ -503,11 +502,15 @@ type SignalArgs struct { // Signo is the signal to send to the process. Signo int32 + + // All is set when signal should be sent to all processes in the container. + // When false, the signal is sent to the root container process only. + All bool } // Signal sends a signal to the init process of the container. // TODO: Send signal to exec process. 
func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { log.Debugf("containerManager.Signal") - return cm.l.signal(args.CID, args.Signo) + return cm.l.signal(args.CID, args.Signo, args.All) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9a5d649ab..bd6e146fc 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -270,7 +270,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) } - procArgs, err := newProcess(spec, creds, k) + procArgs, err := newProcess(id, spec, creds, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -295,7 +295,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -314,6 +314,7 @@ func newProcess(spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (ke UTSNamespace: k.RootUTSNamespace(), IPCNamespace: k.RootIPCNamespace(), AbstractSocketNamespace: k.RootAbstractSocketNamespace(), + ContainerID: id, } return procArgs, nil } @@ -465,7 +466,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // TODO New containers should be started in new PID namespaces // when indicated by the spec. 
- procArgs, err := newProcess(spec, creds, l.k) + procArgs, err := newProcess(cid, spec, creds, l.k) if err != nil { return fmt.Errorf("failed to create new process: %v", err) } @@ -525,14 +526,14 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return nil } -func (l *Loader) executeAsync(args *control.ExecArgs, cid string) (kernel.ThreadID, error) { +func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Get the container Root Dirent from the Task, since we must run this // process with the same Root. l.mu.Lock() - tg, ok := l.containerRootTGs[cid] + tg, ok := l.containerRootTGs[args.ContainerID] l.mu.Unlock() if !ok { - return 0, fmt.Errorf("cannot exec in container %q: no such container", cid) + return 0, fmt.Errorf("cannot exec in container %q: no such container", args.ContainerID) } tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() @@ -552,7 +553,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs, cid string) (kernel.Thread // later. l.mu.Lock() defer l.mu.Unlock() - eid := execID{cid: cid, pid: tgid} + eid := execID{cid: args.ContainerID, pid: tgid} l.execProcesses[eid] = tg log.Debugf("updated execProcesses: %v", l.execProcesses) @@ -671,8 +672,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { } } -// TODO: Support sending signal to all. -func (l *Loader) signal(cid string, signo int32) error { +func (l *Loader) signal(cid string, signo int32, all bool) error { l.mu.Lock() tg, ok := l.containerRootTGs[cid] l.mu.Unlock() @@ -681,5 +681,13 @@ func (l *Loader) signal(cid string, signo int32) error { } si := arch.SignalInfo{Signo: signo} + if all { + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. 
+ l.k.Pause() + defer l.k.Unpause() + return l.k.SendContainerSignal(cid, &si) + } return tg.Leader().SendSignal(&si) } diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 6fa5674f1..af709bc71 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -29,7 +29,9 @@ import ( ) // Kill implements subcommands.Command for the "kill" command. -type Kill struct{} +type Kill struct { + all bool +} // Name implements subcommands.Command.Name. func (*Kill) Name() string { @@ -47,15 +49,12 @@ func (*Kill) Usage() string { } // SetFlags implements subcommands.Command.SetFlags. -func (*Kill) SetFlags(f *flag.FlagSet) { - // TODO: Implement this flag. It is defined here just to - // prevent runsc from crashing if it is passed. - var all bool - f.BoolVar(&all, "all", false, "send the specified signal to all processes inside the container") +func (k *Kill) SetFlags(f *flag.FlagSet) { + f.BoolVar(&k.all, "all", false, "send the specified signal to all processes inside the container") } // Execute implements subcommands.Command.Execute. -func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { +func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { if f.NArg() == 0 || f.NArg() > 2 { f.Usage() return subcommands.ExitUsageError @@ -83,7 +82,7 @@ func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su } // TODO: Distinguish between already-exited containers and // genuine errors. 
- if err := c.Signal(sig); err != nil { + if err := c.Signal(sig, k.all); err != nil { Fatalf("%v", err) } return subcommands.ExitSuccess diff --git a/runsc/container/BUILD b/runsc/container/BUILD index d72d05c13..e68fb1e8e 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -53,6 +53,7 @@ go_test( "//runsc/boot", "//runsc/specutils", "//runsc/test/testutil", + "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], @@ -61,5 +62,8 @@ go_test( go_binary( name = "test_app", srcs = ["test_app.go"], - deps = ["@com_github_google_subcommands//:go_default_library"], + deps = [ + "//runsc/test/testutil", + "@com_github_google_subcommands//:go_default_library", + ], ) diff --git a/runsc/container/container.go b/runsc/container/container.go index a1b31d861..44b7dad8a 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -159,7 +159,7 @@ func Load(rootDir, id string) (*Container, error) { } else if c.Status == Running { // Container state should reflect the actual state of the application, so // we don't consider gofer process here. - if err := c.Signal(syscall.Signal(0)); err != nil { + if err := c.Signal(syscall.Signal(0), false); err != nil { c.changeStatus(Stopped) } } @@ -398,7 +398,8 @@ func (c *Container) Execute(args *control.ExecArgs) (int32, error) { if err := c.requireStatus("execute in", Created, Running); err != nil { return 0, err } - return c.Sandbox.Execute(c.ID, args) + args.ContainerID = c.ID + return c.Sandbox.Execute(args) } // Event returns events for the container. @@ -453,13 +454,13 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // Signal sends the signal to the container. // Signal returns an error if the container is already stopped. // TODO: Distinguish different error types. 
-func (c *Container) Signal(sig syscall.Signal) error { +func (c *Container) Signal(sig syscall.Signal, all bool) error { log.Debugf("Signal container %q: %v", c.ID, sig) if err := c.requireStatus("signal", Running); err != nil { return err } // TODO: Query the container for its state, then save it. - return c.Sandbox.Signal(c.ID, sig) + return c.Sandbox.Signal(c.ID, sig, all) } // Checkpoint sends the checkpoint call to the container. @@ -612,7 +613,7 @@ func (c *Container) waitForStopped() error { b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { if c.isSandboxRunning() { - if err := c.Signal(syscall.Signal(0)); err == nil { + if err := c.Signal(syscall.Signal(0), false); err == nil { return fmt.Errorf("container is still running") } } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index efa598202..de1e50a3f 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -30,6 +30,7 @@ import ( "testing" "time" + "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -49,21 +50,34 @@ func init() { } // waitForProcessList waits for the given process list to show up in the container. 
-func waitForProcessList(cont *Container, expected []*control.Process) error { - var got []*control.Process - for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { - var err error - got, err = cont.Processes() +func waitForProcessList(cont *Container, want []*control.Process) error { + cb := func() error { + got, err := cont.Processes() if err != nil { - return fmt.Errorf("error getting process data from container: %v", err) + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} } - if procListsEqual(got, expected) { - return nil + if !procListsEqual(got, want) { + return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) } - // Process might not have started, try again... - time.Sleep(10 * time.Millisecond) + return nil + } + return testutil.Poll(cb, 5*time.Second) +} + +func waitForProcessCount(cont *Container, want int) error { + cb := func() error { + pss, err := cont.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + if got := len(pss); got != want { + return fmt.Errorf("wrong process count, got: %d, want: %d", got, want) + } + return nil } - return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(expected)) + return testutil.Poll(cb, 5*time.Second) } // procListsEqual is used to check whether 2 Process lists are equal for all @@ -345,7 +359,7 @@ func TestLifecycle(t *testing.T) { <-ch time.Sleep(100 * time.Millisecond) // Send the container a SIGTERM which will cause it to stop. - if err := c.Signal(syscall.SIGTERM); err != nil { + if err := c.Signal(syscall.SIGTERM, false); err != nil { t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) } // Wait for it to die. 
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 2867aa3b9..dc938066b 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -16,6 +16,7 @@ package container import ( "io/ioutil" + "math" "os" "path" "path/filepath" @@ -91,11 +92,16 @@ func TestMultiContainerSanity(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ {PID: 1, Cmd: "sleep"}, - {PID: 2, Cmd: "sleep"}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } + expectedPL = []*control.Process{ + {PID: 2, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } } } @@ -134,10 +140,9 @@ func TestMultiContainerWait(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, {PID: 2, Cmd: "sleep"}, } - if err := waitForProcessList(containers[0], expectedPL); err != nil { + if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } @@ -179,7 +184,10 @@ func TestMultiContainerWait(t *testing.T) { // After Wait returns, ensure that the root container is running and // the child has finished. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + expectedPL = []*control.Process{ + {PID: 1, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) } } @@ -219,17 +227,16 @@ func TestExecWait(t *testing.T) { containers = append(containers, cont) } - // Check via ps that multiple processes are running. + // Check via ps that process is running. 
expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, {PID: 2, Cmd: "sleep"}, } - if err := waitForProcessList(containers[0], expectedPL); err != nil { + if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Fatalf("failed to wait for sleep to start: %v", err) } // Wait for the second container to finish. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + if err := waitForProcessCount(containers[1], 0); err != nil { t.Fatalf("failed to wait for second container to stop: %v", err) } @@ -256,7 +263,10 @@ func TestExecWait(t *testing.T) { } // Wait for the exec'd process to exit. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + expectedPL = []*control.Process{ + {PID: 1, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Fatalf("failed to wait for second container to stop: %v", err) } @@ -360,23 +370,25 @@ func TestMultiContainerSignal(t *testing.T) { containers = append(containers, cont) } - // Check via ps that multiple processes are running. + // Check via ps that container 1 process is running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, {PID: 2, Cmd: "sleep"}, } - if err := waitForProcessList(containers[0], expectedPL); err != nil { + if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } // Kill process 2. - if err := containers[1].Signal(syscall.SIGKILL); err != nil { + if err := containers[1].Signal(syscall.SIGKILL, false); err != nil { t.Errorf("failed to kill process 2: %v", err) } // Make sure process 1 is still running. 
- if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + expectedPL = []*control.Process{ + {PID: 1, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } @@ -395,18 +407,18 @@ func TestMultiContainerSignal(t *testing.T) { t.Errorf("error waiting for gofer to exit: %v", err) } // Make sure process 1 is still running. - if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } // Now that process 2 is gone, ensure we get an error trying to // signal it again. - if err := containers[1].Signal(syscall.SIGKILL); err == nil { + if err := containers[1].Signal(syscall.SIGKILL, false); err == nil { t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) } // Kill process 1. - if err := containers[0].Signal(syscall.SIGKILL); err != nil { + if err := containers[0].Signal(syscall.SIGKILL, false); err != nil { t.Errorf("failed to kill process 1: %v", err) } @@ -428,7 +440,7 @@ func TestMultiContainerSignal(t *testing.T) { } // The sentry should be gone, so signaling should yield an error. - if err := containers[0].Signal(syscall.SIGKILL); err == nil { + if err := containers[0].Signal(syscall.SIGKILL, false); err == nil { t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) } } @@ -453,7 +465,6 @@ func TestMultiContainerDestroy(t *testing.T) { // Setup the containers. 
var containers []*Container for i, spec := range specs { - conf := testutil.TestConfig() bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -501,3 +512,144 @@ func TestMultiContainerDestroy(t *testing.T) { } } } + +func TestMultiContainerProcesses(t *testing.T) { + // Note: use 'while true' to keep 'sh' process around. Otherwise, shell will + // just execve into 'sleep' and both containers will look the same. + specs, ids := createSpecs( + []string{"sleep", "100"}, + []string{"sh", "-c", "while true; do sleep 100; done"}) + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + var containers []*Container + for i, spec := range specs { + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + // Check root's container process list doesn't include other containers. + expectedPL0 := []*control.Process{ + {PID: 1, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[0], expectedPL0); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + + // Same for the other container. + expectedPL1 := []*control.Process{ + {PID: 2, Cmd: "sh"}, + {PID: 3, PPID: 2, Cmd: "sleep"}, + } + if err := waitForProcessList(containers[1], expectedPL1); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + + // Now exec into the second container and verify it shows up in the container. 
+ args := &control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"/bin/sleep", "100"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"}) + if err := waitForProcessList(containers[1], expectedPL1); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + // Root container should remain unchanged. + if err := waitForProcessList(containers[0], expectedPL0); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } +} + +// TestMultiContainerKillAll checks that all process that belong to a container +// are killed when SIGKILL is sent to *all* processes in that container. +func TestMultiContainerKillAll(t *testing.T) { + app, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + // First container will remain intact while the second container is killed. + specs, ids := createSpecs( + []string{app, "task-tree", "--depth=2", "--width=2"}, + []string{app, "task-tree", "--depth=4", "--width=2"}) + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + var containers []*Container + for i, spec := range specs { + conf := testutil.TestConfig() + bundleDir, err := testutil.SetupContainerInRoot(rootDir, spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(bundleDir) + cont, err := Create(ids[i], spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + containers = append(containers, cont) + } + + // Wait until all processes are created. 
+ rootProcCount := int(math.Pow(2, 3) - 1) + if err := waitForProcessCount(containers[0], rootProcCount); err != nil { + t.Fatal(err) + } + procCount := int(math.Pow(2, 5) - 1) + if err := waitForProcessCount(containers[1], procCount); err != nil { + t.Fatal(err) + } + + // Exec more processes to ensure signal works for exec'd processes too. + args := &control.ExecArgs{ + Filename: app, + Argv: []string{app, "task-tree", "--depth=2", "--width=2"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + procCount += 3 + if err := waitForProcessCount(containers[1], procCount); err != nil { + t.Fatal(err) + } + + // Kill'Em All + containers[1].Signal(syscall.SIGKILL, true) + + // Check that all processes are gone. + if err := waitForProcessCount(containers[1], 0); err != nil { + t.Fatal(err) + } + // Check that root container was not affected. + if err := waitForProcessCount(containers[0], rootProcCount); err != nil { + t.Fatal(err) + } +} diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index 768293cf9..a99eb97c4 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -22,17 +22,20 @@ import ( "log" "net" "os" + "os/exec" "strconv" "time" "flag" "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) func main() { subcommands.Register(subcommands.HelpCommand(), "") subcommands.Register(subcommands.FlagsCommand(), "") subcommands.Register(new(uds), "") + subcommands.Register(new(taskTree), "") flag.Parse() @@ -114,3 +117,63 @@ func server(listener net.Listener, out *os.File) { fmt.Fprint(out, string(data)+"\n") } } + +type taskTree struct { + depth int + width int +} + +// Name implements subcommands.Command. +func (*taskTree) Name() string { + return "task-tree" +} + +// Synopsis implements subcommands.Command. +func (*taskTree) Synopsis() string { + return "creates a tree of tasks" +} + +// Usage implements subcommands.Command. 
+func (*taskTree) Usage() string { + return "task-tree " +} + +// SetFlags implements subcommands.Command. +func (c *taskTree) SetFlags(f *flag.FlagSet) { + f.IntVar(&c.depth, "depth", 1, "number of levels to create") + f.IntVar(&c.width, "width", 1, "number of tasks at each level") +} + +// Execute implements subcommands.Command. +func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + stop := testutil.StartReaper() + defer stop() + + if c.depth == 0 { + log.Printf("Child sleeping, PID: %d\n", os.Getpid()) + for { + time.Sleep(24 * time.Hour) + } + } + log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid()) + + var cmds []*exec.Cmd + for i := 0; i < c.width; i++ { + cmd := exec.Command( + "/proc/self/exe", c.Name(), + "--depth", strconv.Itoa(c.depth-1), + "--width", strconv.Itoa(c.width)) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Start(); err != nil { + log.Fatal("failed to call self:", err) + } + cmds = append(cmds, cmd) + } + + for _, c := range cmds { + c.Wait() + } + return subcommands.ExitSuccess +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index c3d90d5f4..ef85f175f 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -183,10 +183,9 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { } defer conn.Close() + args := boot.ProcessesArgs{CID: cid} var pl []*control.Process - // TODO: Pass in the container id (cid) here. The sandbox - // should return process info for only that container. - if err := conn.Call(boot.ContainerProcesses, nil, &pl); err != nil { + if err := conn.Call(boot.ContainerProcesses, &args, &pl); err != nil { return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) } return pl, nil @@ -194,19 +193,17 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { // Execute runs the specified command in the container. 
It returns the pid of // the newly created process. -func (s *Sandbox) Execute(cid string, args *control.ExecArgs) (int32, error) { - log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) +func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { + log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) conn, err := s.sandboxConnect() if err != nil { return 0, s.connError(err) } defer conn.Close() - rpcArgs := &boot.ExecArgs{ExecArgs: *args, CID: cid} - // Send a message to the sandbox control server to start the container. var pid int32 - if err := conn.Call(boot.ContainerExecuteAsync, rpcArgs, &pid); err != nil { + if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil { return 0, fmt.Errorf("error executing in sandbox: %v", err) } return pid, nil @@ -575,7 +572,7 @@ func (s *Sandbox) destroy() error { } // Signal sends the signal to a container in the sandbox. -func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { +func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error { log.Debugf("Signal sandbox %q", s.ID) conn, err := s.sandboxConnect() if err != nil { @@ -586,6 +583,7 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { args := boot.SignalArgs{ CID: cid, Signo: int32(sig), + All: all, } if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { return fmt.Errorf("err signaling container %q: %v", cid, err) -- cgit v1.2.3 From b709d239870143102cf4e44b65cc26cea78a6ccb Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 27 Sep 2018 18:15:07 -0700 Subject: Forward ioctl(TCSETSF) calls on host ttys to the host kernel. We already forward TCSETS and TCSETSW. TCSETSF is roughly equivalent but discards pending input. The filters were relaxed to allow host ioctls with TCSETSF argument. This fixes programs like "passwd" that prevent user input from being displayed on the terminal. 
Before: root@b8a0240fc836:/# passwd Enter new UNIX password: 123 Retype new UNIX password: 123 passwd: password updated successfully After: root@ae6f5dabe402:/# passwd Enter new UNIX password: Retype new UNIX password: passwd: password updated successfully PiperOrigin-RevId: 214869788 Change-Id: I31b4d1373c1388f7b51d0f2f45ce40aa8e8b0b58 --- pkg/abi/linux/ioctl.go | 1 + pkg/sentry/fs/host/file.go | 2 +- runsc/boot/filter/config.go | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 4d7a2dfd7..1c9dc7b03 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -21,6 +21,7 @@ const ( TCGETS = 0x00005401 TCSETS = 0x00005402 TCSETSW = 0x00005403 + TCSETSF = 0x00005404 TIOCGPGRP = 0x0000540f TIOCSPGRP = 0x00005410 TIOCOUTQ = 0x00005411 diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 8d2463c78..6f469b5cc 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -305,7 +305,7 @@ func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.Sys }) return 0, err - case linux.TCSETS, linux.TCSETSW: + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: var termios linux.Termios if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ AddressSpaceActive: true, diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 0bcc640d5..352c64253 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -147,6 +147,11 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(linux.TCSETS), seccomp.AllowAny{}, /* termios struct */ }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETSF), + seccomp.AllowAny{}, /* termios struct */ + }, { seccomp.AllowAny{}, /* fd */ seccomp.AllowValue(linux.TCSETSW), -- cgit v1.2.3 From c17ea8c6e20f58510b063f064d45608792a014e4 Mon Sep 17 00:00:00 2001 From: Sepehr Raissian Date: Fri, 28 Sep 2018 
10:59:21 -0700 Subject: Block for link address resolution Previously, if address resolution for UDP or Ping sockets required sending packets using Write in Transport layer, Resolve would return ErrWouldBlock and Write would return ErrNoLinkAddress. Meanwhile startAddressResolution would run in background. Further calls to Write using same address would also return ErrNoLinkAddress until resolution has been completed successfully. Since Write is not allowed to block and System Calls need to be interruptible in System Call layer, the caller to Write is responsible for blocking upon return of ErrWouldBlock. Now, when startAddressResolution is called a notification channel for the completion of the address resolution is returned. The channel will traverse up to the calling function of Write as well as ErrNoLinkAddress. Once address resolution is complete (success or not) the channel is closed. The caller would call Write again to send packets and check if address resolution was compeleted successfully or not. 
Fixes google/gvisor#5 Change-Id: Idafaf31982bee1915ca084da39ae7bd468cebd93 PiperOrigin-RevId: 214962200 --- pkg/dhcp/client.go | 17 ++++++++-- pkg/dhcp/server.go | 15 ++++++++- pkg/sentry/socket/epsocket/epsocket.go | 23 +++++++++++-- pkg/tcpip/adapters/gonet/gonet.go | 35 +++++++++++++++---- pkg/tcpip/network/ipv6/icmp_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/stack/linkaddrcache.go | 40 ++++++++++++++++------ pkg/tcpip/stack/linkaddrcache_test.go | 22 ++++++------ pkg/tcpip/stack/registration.go | 6 +++- pkg/tcpip/stack/route.go | 16 +++++---- pkg/tcpip/stack/stack.go | 4 +-- pkg/tcpip/stack/transport_test.go | 12 +++---- pkg/tcpip/tcpip.go | 7 +++- pkg/tcpip/transport/ping/endpoint.go | 29 ++++++++-------- pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 16 ++++----- pkg/tcpip/transport/tcp/tcp_test.go | 48 +++++++++++++-------------- pkg/tcpip/transport/tcp/tcp_timestamp_test.go | 4 +-- pkg/tcpip/transport/udp/endpoint.go | 31 ++++++++--------- pkg/tcpip/transport/udp/udp_test.go | 18 +++++----- 20 files changed, 220 insertions(+), 129 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index cf8472c5f..92c634a14 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -195,10 +195,23 @@ func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) (cfg wopts := tcpip.WriteOptions{ To: serverAddr, } - if _, err := ep.Write(tcpip.SlicePayload(h), wopts); err != nil { + var resCh <-chan struct{} + if _, resCh, err = ep.Write(tcpip.SlicePayload(h), wopts); err != nil && resCh == nil { return Config{}, fmt.Errorf("dhcp discovery write: %v", err) } + if resCh != nil { + select { + case <-resCh: + case <-ctx.Done(): + return Config{}, fmt.Errorf("dhcp client address resolution: %v", tcpip.ErrAborted) + } + + if _, _, err := ep.Write(tcpip.SlicePayload(h), wopts); err != nil { + return Config{}, fmt.Errorf("dhcp discovery write: %v", err) + } + } + we, 
ch := waiter.NewChannelEntry(nil) wq.EventRegister(&we, waiter.EventIn) defer wq.EventUnregister(&we) @@ -289,7 +302,7 @@ func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) (cfg reqOpts = append(reqOpts, option{optClientID, clientID}) } h.setOptions(reqOpts) - if _, err := ep.Write(tcpip.SlicePayload(h), wopts); err != nil { + if _, _, err := ep.Write(tcpip.SlicePayload(h), wopts); err != nil { return Config{}, fmt.Errorf("dhcp discovery write: %v", err) } diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index 003e272b2..26700bdbc 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -95,9 +95,22 @@ func (c *epConn) Read() (buffer.View, tcpip.FullAddress, error) { } func (c *epConn) Write(b []byte, addr *tcpip.FullAddress) error { - if _, err := c.ep.Write(tcpip.SlicePayload(b), tcpip.WriteOptions{To: addr}); err != nil { + _, resCh, err := c.ep.Write(tcpip.SlicePayload(b), tcpip.WriteOptions{To: addr}) + if err != nil && resCh == nil { return fmt.Errorf("write: %v", err) } + + if resCh != nil { + select { + case <-resCh: + case <-c.ctx.Done(): + return fmt.Errorf("dhcp server address resolution: %v", tcpip.ErrAborted) + } + + if _, _, err := c.ep.Write(tcpip.SlicePayload(b), tcpip.WriteOptions{To: addr}); err != nil { + return fmt.Errorf("write: %v", err) + } + } return nil } diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 4d32f7a31..550569b4c 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -276,10 +276,21 @@ func (i *ioSequencePayload) Size() int { // Write implements fs.FileOperations.Write. 
func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { f := &ioSequencePayload{ctx: ctx, src: src} - n, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) if err == tcpip.ErrWouldBlock { return int64(n), syserror.ErrWouldBlock } + + if resCh != nil { + t := ctx.(*kernel.Task) + if err := t.Block(resCh); err != nil { + return int64(n), syserr.FromError(err).ToError() + } + + n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) + return int64(n), syserr.TranslateNetstackError(err).ToError() + } + return int64(n), syserr.TranslateNetstackError(err).ToError() } @@ -1016,7 +1027,13 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] EndOfRecord: flags&linux.MSG_EOR != 0, } - n, err := s.Endpoint.Write(tcpip.SlicePayload(v), opts) + n, resCh, err := s.Endpoint.Write(tcpip.SlicePayload(v), opts) + if resCh != nil { + if err := t.Block(resCh); err != nil { + return int(n), syserr.FromError(err) + } + n, _, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts) + } if err != tcpip.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return int(n), syserr.TranslateNetstackError(err) } @@ -1030,7 +1047,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] v.TrimFront(int(n)) total := n for { - n, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts) + n, _, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts) v.TrimFront(int(n)) total += n if err != tcpip.ErrWouldBlock { diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index 490b9c648..b64dce720 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -393,9 +393,22 @@ func (c *Conn) Write(b []byte) (int, error) { } var n uintptr - n, err = c.ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}) + var resCh <-chan struct{} + n, resCh, err = c.ep.Write(tcpip.SlicePayload(v), 
tcpip.WriteOptions{}) nbytes += int(n) v.TrimFront(int(n)) + + if resCh != nil { + select { + case <-deadline: + return nbytes, c.newOpError("write", &timeoutError{}) + case <-resCh: + } + + n, _, err = c.ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}) + nbytes += int(n) + v.TrimFront(int(n)) + } } if err == nil { @@ -571,7 +584,16 @@ func (c *PacketConn) WriteTo(b []byte, addr net.Addr) (int, error) { copy(v, b) wopts := tcpip.WriteOptions{To: &fullAddr} - n, err := c.ep.Write(tcpip.SlicePayload(v), wopts) + n, resCh, err := c.ep.Write(tcpip.SlicePayload(v), wopts) + if resCh != nil { + select { + case <-deadline: + return int(n), c.newRemoteOpError("write", addr, &timeoutError{}) + case <-resCh: + } + + n, _, err = c.ep.Write(tcpip.SlicePayload(v), wopts) + } if err == tcpip.ErrWouldBlock { // Create wait queue entry that notifies a channel. @@ -579,15 +601,16 @@ func (c *PacketConn) WriteTo(b []byte, addr net.Addr) (int, error) { c.wq.EventRegister(&waitEntry, waiter.EventOut) defer c.wq.EventUnregister(&waitEntry) for { - n, err = c.ep.Write(tcpip.SlicePayload(v), wopts) - if err != tcpip.ErrWouldBlock { - break - } select { case <-deadline: return int(n), c.newRemoteOpError("write", addr, &timeoutError{}) case <-notifyCh: } + + n, _, err = c.ep.Write(tcpip.SlicePayload(v), wopts) + if err != tcpip.ErrWouldBlock { + break + } } } diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index a8eef4cf2..b8e53c13e 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -190,7 +190,7 @@ func TestLinkResolution(t *testing.T) { if ctx.Err() != nil { break } - if _, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: 1, Addr: lladdr1}}); err == tcpip.ErrNoLinkAddress { + if _, _, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: 1, Addr: lladdr1}}); err == tcpip.ErrNoLinkAddress { // There's something asynchronous going on; yield to let it do its thing. 
runtime.Gosched() } else if err == nil { diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index d029193fb..c4707736e 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -80,7 +80,7 @@ func writer(ch chan struct{}, ep tcpip.Endpoint) { v.CapLength(n) for len(v) > 0 { - n, err := ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}) + n, _, err := ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}) if err != nil { fmt.Println("Write failed:", err) return diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 04b8f251a..3a147a75f 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -88,12 +88,14 @@ type linkAddrEntry struct { linkAddr tcpip.LinkAddress expiration time.Time s entryState + resDone bool // wakers is a set of waiters for address resolution result. Anytime // state transitions out of 'incomplete' these waiters are notified. wakers map[*sleep.Waker]struct{} cancel chan struct{} + resCh chan struct{} } func (e *linkAddrEntry) state() entryState { @@ -182,15 +184,20 @@ func (c *linkAddrCache) makeAndAddEntry(k tcpip.FullAddress, v tcpip.LinkAddress // someone waiting for address resolution on it. entry.changeState(expired) if entry.cancel != nil { - entry.cancel <- struct{}{} + if !entry.resDone { + close(entry.resCh) + } + close(entry.cancel) } *entry = linkAddrEntry{ addr: k, linkAddr: v, expiration: time.Now().Add(c.ageLimit), + resDone: false, wakers: make(map[*sleep.Waker]struct{}), cancel: make(chan struct{}, 1), + resCh: make(chan struct{}, 1), } c.cache[k] = entry @@ -202,10 +209,10 @@ func (c *linkAddrCache) makeAndAddEntry(k tcpip.FullAddress, v tcpip.LinkAddress } // get reports any known link address for k. 
-func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) (tcpip.LinkAddress, *tcpip.Error) { +func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { if linkRes != nil { if addr, ok := linkRes.ResolveStaticAddress(k.Addr); ok { - return addr, nil + return addr, nil, nil } } @@ -214,10 +221,11 @@ func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, lo if entry == nil || entry.state() == expired { c.mu.Unlock() if linkRes == nil { - return "", tcpip.ErrNoLinkAddress + return "", nil, tcpip.ErrNoLinkAddress } - c.startAddressResolution(k, linkRes, localAddr, linkEP, waker) - return "", tcpip.ErrWouldBlock + + ch := c.startAddressResolution(k, linkRes, localAddr, linkEP, waker) + return "", ch, tcpip.ErrWouldBlock } defer c.mu.Unlock() @@ -227,13 +235,13 @@ func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, lo // in that case it's safe to consider it ready. fallthrough case ready: - return entry.linkAddr, nil + return entry.linkAddr, nil, nil case failed: - return "", tcpip.ErrNoLinkAddress + return "", nil, tcpip.ErrNoLinkAddress case incomplete: // Address resolution is still in progress. 
entry.addWaker(waker) - return "", tcpip.ErrWouldBlock + return "", entry.resCh, tcpip.ErrWouldBlock default: panic(fmt.Sprintf("invalid cache entry state: %d", s)) } @@ -249,13 +257,13 @@ func (c *linkAddrCache) removeWaker(k tcpip.FullAddress, waker *sleep.Waker) { } } -func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) { +func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) <-chan struct{} { c.mu.Lock() defer c.mu.Unlock() // Look up again with lock held to ensure entry wasn't added by someone else. if e := c.cache[k]; e != nil && e.state() != expired { - return + return nil } // Add 'incomplete' entry in the cache to mark that resolution is in progress. @@ -274,6 +282,15 @@ func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes Link select { case <-time.After(c.resolutionTimeout): if stop := c.checkLinkRequest(k, i); stop { + // If entry is evicted then resCh is already closed. 
+ c.mu.Lock() + if e, ok := c.cache[k]; ok { + if !e.resDone { + e.resDone = true + close(e.resCh) + } + } + c.mu.Unlock() return } case <-cancel: @@ -281,6 +298,7 @@ func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes Link } } }() + return e.resCh } // checkLinkRequest checks whether previous attempt to resolve address has succeeded diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index f0988d6de..e46267f12 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -73,7 +73,7 @@ func getBlocking(c *linkAddrCache, addr tcpip.FullAddress, linkRes LinkAddressRe defer s.Done() for { - if got, err := c.get(addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock { + if got, _, err := c.get(addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock { return got, err } s.Fetch(true) @@ -95,7 +95,7 @@ func TestCacheOverflow(t *testing.T) { for i := len(testaddrs) - 1; i >= 0; i-- { e := testaddrs[i] c.add(e.addr, e.linkAddr) - got, err := c.get(e.addr, nil, "", nil, nil) + got, _, err := c.get(e.addr, nil, "", nil, nil) if err != nil { t.Errorf("insert %d, c.get(%q)=%q, got error: %v", i, string(e.addr.Addr), got, err) } @@ -106,7 +106,7 @@ func TestCacheOverflow(t *testing.T) { // Expect to find at least half of the most recent entries. for i := 0; i < linkAddrCacheSize/2; i++ { e := testaddrs[i] - got, err := c.get(e.addr, nil, "", nil, nil) + got, _, err := c.get(e.addr, nil, "", nil, nil) if err != nil { t.Errorf("check %d, c.get(%q)=%q, got error: %v", i, string(e.addr.Addr), got, err) } @@ -117,7 +117,7 @@ func TestCacheOverflow(t *testing.T) { // The earliest entries should no longer be in the cache. 
for i := len(testaddrs) - 1; i >= len(testaddrs)-linkAddrCacheSize; i-- { e := testaddrs[i] - if _, err := c.get(e.addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress { + if _, _, err := c.get(e.addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress { t.Errorf("check %d, c.get(%q), got error: %v, want: error ErrNoLinkAddress", i, string(e.addr.Addr), err) } } @@ -143,7 +143,7 @@ func TestCacheConcurrent(t *testing.T) { // can fit in the cache, so our eviction strategy requires that // the last entry be present and the first be missing. e := testaddrs[len(testaddrs)-1] - got, err := c.get(e.addr, nil, "", nil, nil) + got, _, err := c.get(e.addr, nil, "", nil, nil) if err != nil { t.Errorf("c.get(%q)=%q, got error: %v", string(e.addr.Addr), got, err) } @@ -152,7 +152,7 @@ func TestCacheConcurrent(t *testing.T) { } e = testaddrs[0] - if _, err := c.get(e.addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress { + if _, _, err := c.get(e.addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress { t.Errorf("c.get(%q), got error: %v, want: error ErrNoLinkAddress", string(e.addr.Addr), err) } } @@ -162,7 +162,7 @@ func TestCacheAgeLimit(t *testing.T) { e := testaddrs[0] c.add(e.addr, e.linkAddr) time.Sleep(50 * time.Millisecond) - if _, err := c.get(e.addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress { + if _, _, err := c.get(e.addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress { t.Errorf("c.get(%q), got error: %v, want: error ErrNoLinkAddress", string(e.addr.Addr), err) } } @@ -172,7 +172,7 @@ func TestCacheReplace(t *testing.T) { e := testaddrs[0] l2 := e.linkAddr + "2" c.add(e.addr, e.linkAddr) - got, err := c.get(e.addr, nil, "", nil, nil) + got, _, err := c.get(e.addr, nil, "", nil, nil) if err != nil { t.Errorf("c.get(%q)=%q, got error: %v", string(e.addr.Addr), got, err) } @@ -181,7 +181,7 @@ func TestCacheReplace(t *testing.T) { } c.add(e.addr, l2) - got, err = c.get(e.addr, nil, "", nil, nil) + got, _, err = c.get(e.addr, nil, "", nil, nil) if err 
!= nil { t.Errorf("c.get(%q)=%q, got error: %v", string(e.addr.Addr), got, err) } @@ -206,7 +206,7 @@ func TestCacheResolution(t *testing.T) { // Check that after resolved, address stays in the cache and never returns WouldBlock. for i := 0; i < 10; i++ { e := testaddrs[len(testaddrs)-1] - got, err := c.get(e.addr, linkRes, "", nil, nil) + got, _, err := c.get(e.addr, linkRes, "", nil, nil) if err != nil { t.Errorf("c.get(%q)=%q, got error: %v", string(e.addr.Addr), got, err) } @@ -256,7 +256,7 @@ func TestStaticResolution(t *testing.T) { addr := tcpip.Address("broadcast") want := tcpip.LinkAddress("mac_broadcast") - got, err := c.get(tcpip.FullAddress{Addr: addr}, linkRes, "", nil, nil) + got, _, err := c.get(tcpip.FullAddress{Addr: addr}, linkRes, "", nil, nil) if err != nil { t.Errorf("c.get(%q)=%q, got error: %v", string(addr), string(got), err) } diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 595c7e793..0acec2984 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -289,7 +289,11 @@ type LinkAddressCache interface { // registered with the network protocol, the cache attempts to resolve the address // and returns ErrWouldBlock. Waker is notified when address resolution is // complete (success or not). - GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, *tcpip.Error) + // + // If address resolution is required, ErrNoLinkAddress and a notification channel is + // returned for the top level caller to block. Channel is closed once address resolution + // is complete (success or not). + GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) // RemoveWaker removes a waker that has been added in GetLinkAddress(). 
RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index cc9b24e23..6c6400c33 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -89,11 +89,15 @@ func (r *Route) Capabilities() LinkEndpointCapabilities { // Resolve attempts to resolve the link address if necessary. Returns ErrWouldBlock in // case address resolution requires blocking, e.g. wait for ARP reply. Waker is // notified when address resolution is complete (success or not). -func (r *Route) Resolve(waker *sleep.Waker) *tcpip.Error { +// +// If address resolution is required, ErrNoLinkAddress and a notification channel is +// returned for the top level caller to block. Channel is closed once address resolution +// is complete (success or not). +func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) { if !r.IsResolutionRequired() { // Nothing to do if there is no cache (which does the resolution on cache miss) or // link address is already known. - return nil + return nil, nil } nextAddr := r.NextHop @@ -101,16 +105,16 @@ func (r *Route) Resolve(waker *sleep.Waker) *tcpip.Error { // Local link address is already known. if r.RemoteAddress == r.LocalAddress { r.RemoteLinkAddress = r.LocalLinkAddress - return nil + return nil, nil } nextAddr = r.RemoteAddress } - linkAddr, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker) + linkAddr, ch, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker) if err != nil { - return err + return ch, err } r.RemoteLinkAddress = linkAddr - return nil + return nil, nil } // RemoveWaker removes a waker that has been added in Resolve(). 
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 699519be1..d1ec6a660 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -831,12 +831,12 @@ func (s *Stack) AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr t } // GetLinkAddress implements LinkAddressCache.GetLinkAddress. -func (s *Stack) GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, *tcpip.Error) { +func (s *Stack) GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { s.mu.RLock() nic := s.nics[nicid] if nic == nil { s.mu.RUnlock() - return "", tcpip.ErrUnknownNICID + return "", nil, tcpip.ErrUnknownNICID } s.mu.RUnlock() diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 9ec37e7b6..98cc3b120 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -60,21 +60,21 @@ func (*fakeTransportEndpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.Contr return buffer.View{}, tcpip.ControlMessages{}, nil } -func (f *fakeTransportEndpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tcpip.Error) { +func (f *fakeTransportEndpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { if len(f.route.RemoteAddress) == 0 { - return 0, tcpip.ErrNoRoute + return 0, nil, tcpip.ErrNoRoute } hdr := buffer.NewPrependable(int(f.route.MaxHeaderLength())) v, err := p.Get(p.Size()) if err != nil { - return 0, err + return 0, nil, err } if err := f.route.WritePacket(hdr, buffer.View(v).ToVectorisedView(), fakeTransNumber, 123); err != nil { - return 0, err + return 0, nil, err } - return uintptr(len(v)), nil + return uintptr(len(v)), nil, nil } func (f *fakeTransportEndpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { @@ -362,7 
+362,7 @@ func TestTransportSend(t *testing.T) { // Create buffer that will hold the payload. view := buffer.NewView(30) - _, err = ep.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}) + _, _, err = ep.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}) if err != nil { t.Fatalf("write failed: %v", err) } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 61272cb05..5f210cdd0 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -306,7 +306,12 @@ type Endpoint interface { // // Note that unlike io.Writer.Write, it is not an error for Write to // perform a partial write. - Write(Payload, WriteOptions) (uintptr, *Error) + // + // For UDP and Ping sockets if address resolution is required, + // ErrNoLinkAddress and a notification channel is returned for the caller to + // block. Channel is closed once address resolution is complete (success or + // not). The channel is only non-nil in this case. + Write(Payload, WriteOptions) (uintptr, <-chan struct{}, *Error) // Peek reads data without consuming it from the endpoint. // diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index fcfb96624..055daa918 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -198,10 +198,10 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi // Write writes data to the endpoint's peer. This method does not block // if the data cannot be written. -func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tcpip.Error) { +func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) 
if opts.More { - return 0, tcpip.ErrInvalidOptionValue + return 0, nil, tcpip.ErrInvalidOptionValue } to := opts.To @@ -211,14 +211,14 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { - return 0, tcpip.ErrClosedForSend + return 0, nil, tcpip.ErrClosedForSend } // Prepare for write. for { retry, err := e.prepareForWrite(to) if err != nil { - return 0, err + return 0, nil, err } if !retry { @@ -241,7 +241,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc // Recheck state after lock was re-acquired. if e.state != stateConnected { - return 0, tcpip.ErrInvalidEndpointState + return 0, nil, tcpip.ErrInvalidEndpointState } } } else { @@ -250,7 +250,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc nicid := to.NIC if e.bindNICID != 0 { if nicid != 0 && nicid != e.bindNICID { - return 0, tcpip.ErrNoRoute + return 0, nil, tcpip.ErrNoRoute } nicid = e.bindNICID @@ -260,13 +260,13 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc to = &toCopy netProto, err := e.checkV4Mapped(to, true) if err != nil { - return 0, err + return 0, nil, err } // Find the enpoint. r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto) if err != nil { - return 0, err + return 0, nil, err } defer r.Release() @@ -275,23 +275,20 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc if route.IsResolutionRequired() { waker := &sleep.Waker{} - if err := route.Resolve(waker); err != nil { + if ch, err := route.Resolve(waker); err != nil { if err == tcpip.ErrWouldBlock { // Link address needs to be resolved. Resolution was triggered the // background. Better luck next time. - // - // TODO: queue up the request and send after link address - // is resolved. 
route.RemoveWaker(waker) - return 0, tcpip.ErrNoLinkAddress + return 0, ch, tcpip.ErrNoLinkAddress } - return 0, err + return 0, nil, err } } v, err := p.Get(p.Size()) if err != nil { - return 0, err + return 0, nil, err } switch e.netProto { @@ -302,7 +299,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc err = sendPing6(route, e.id.LocalPort, v) } - return uintptr(len(v)), err + return uintptr(len(v)), nil, err } // Peek only returns data from a single datagram, so do nothing here. diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 68c0d4472..27dbcace2 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -365,7 +365,7 @@ func (h *handshake) resolveRoute() *tcpip.Error { for { switch index { case wakerForResolution: - if err := h.ep.route.Resolve(resolutionWaker); err != tcpip.ErrWouldBlock { + if _, err := h.ep.route.Resolve(resolutionWaker); err != tcpip.ErrWouldBlock { // Either success (err == nil) or failure. return err } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index e82e25233..707d6be96 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -492,7 +492,7 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { } // Write writes data to the endpoint's peer. -func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tcpip.Error) { +func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { // Linux completely ignores any address passed to sendto(2) for TCP sockets // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More // and opts.EndOfRecord are also ignored. 
@@ -504,15 +504,15 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc if e.state != stateConnected { switch e.state { case stateError: - return 0, e.hardError + return 0, nil, e.hardError default: - return 0, tcpip.ErrClosedForSend + return 0, nil, tcpip.ErrClosedForSend } } // Nothing to do if the buffer is empty. if p.Size() == 0 { - return 0, nil + return 0, nil, nil } e.sndBufMu.Lock() @@ -520,20 +520,20 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc // Check if the connection has already been closed for sends. if e.sndClosed { e.sndBufMu.Unlock() - return 0, tcpip.ErrClosedForSend + return 0, nil, tcpip.ErrClosedForSend } // Check against the limit. avail := e.sndBufSize - e.sndBufUsed if avail <= 0 { e.sndBufMu.Unlock() - return 0, tcpip.ErrWouldBlock + return 0, nil, tcpip.ErrWouldBlock } v, perr := p.Get(avail) if perr != nil { e.sndBufMu.Unlock() - return 0, perr + return 0, nil, perr } var err *tcpip.Error @@ -558,7 +558,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc // Let the protocol goroutine do the work. e.sndWaker.Assert() } - return uintptr(l), err + return uintptr(l), nil, err } // Peek reads data without consuming it from the endpoint. 
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index ac21e565b..48852ea47 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -869,7 +869,7 @@ func TestSimpleSend(t *testing.T) { view := buffer.NewView(len(data)) copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -910,7 +910,7 @@ func TestZeroWindowSend(t *testing.T) { view := buffer.NewView(len(data)) copy(view, data) - _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}) + _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}) if err != nil { t.Fatalf("Write failed: %v", err) } @@ -971,7 +971,7 @@ func TestScaledWindowConnect(t *testing.T) { view := buffer.NewView(len(data)) copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -1004,7 +1004,7 @@ func TestNonScaledWindowConnect(t *testing.T) { view := buffer.NewView(len(data)) copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -1077,7 +1077,7 @@ func TestScaledWindowAccept(t *testing.T) { view := buffer.NewView(len(data)) copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -1150,7 +1150,7 @@ func TestNonScaledWindowAccept(t *testing.T) { view := buffer.NewView(len(data)) copy(view, data) - if _, err := 
c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -1265,7 +1265,7 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) { view := buffer.NewView(len(data)) copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -1653,7 +1653,7 @@ func TestSendOnResetConnection(t *testing.T) { // Try to write. view := buffer.NewView(10) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != tcpip.ErrConnectionReset { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != tcpip.ErrConnectionReset { t.Fatalf("got c.EP.Write(...) = %v, want = %v", err, tcpip.ErrConnectionReset) } } @@ -1763,7 +1763,7 @@ func TestFinWithNoPendingData(t *testing.T) { // Write something out, and have it acknowledged. view := buffer.NewView(10) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -1836,7 +1836,7 @@ func TestFinWithPendingDataCwndFull(t *testing.T) { // any of them. view := buffer.NewView(10) for i := tcp.InitialCwnd; i > 0; i-- { - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } } @@ -1922,7 +1922,7 @@ func TestFinWithPendingData(t *testing.T) { // Write something out, and acknowledge it to get cwnd to 2. 
view := buffer.NewView(10) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -1948,7 +1948,7 @@ func TestFinWithPendingData(t *testing.T) { }) // Write new data, but don't acknowledge it. - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -2009,7 +2009,7 @@ func TestFinWithPartialAck(t *testing.T) { // Write something out, and acknowledge it to get cwnd to 2. Also send // FIN from the test side. view := buffer.NewView(10) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -2046,7 +2046,7 @@ func TestFinWithPartialAck(t *testing.T) { ) // Write new data, but don't acknowledge it. - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -2116,7 +2116,7 @@ func TestExponentialIncreaseDuringSlowStart(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -2158,7 +2158,7 @@ func TestCongestionAvoidance(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. 
- if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -2263,7 +2263,7 @@ func TestCubicCongestionAvoidance(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -2371,7 +2371,7 @@ func TestFastRecovery(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -2503,11 +2503,11 @@ func TestRetransmit(t *testing.T) { // Write all the data in two shots. Packets will only be written at the // MTU size though. half := data[:len(data)/2] - if _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } half = data[len(data)/2:] - if _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -2605,7 +2605,7 @@ func scaledSendWindow(t *testing.T, scale uint8) { // Send some data. Check that it's capped by the window size. 
view := buffer.NewView(65535) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -3099,7 +3099,7 @@ func TestSelfConnect(t *testing.T) { data := []byte{1, 2, 3} view := buffer.NewView(len(data)) copy(view, data) - if _, err := ep.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := ep.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -3290,7 +3290,7 @@ func TestPathMTUDiscovery(t *testing.T) { data[i] = byte(i) } - if _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } @@ -3495,7 +3495,7 @@ func TestKeepalive(t *testing.T) { // Send some data and wait before ACKing it. Keepalives should be disabled // during this period. 
view := buffer.NewView(3) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Write failed: %v", err) } diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index 894ead507..ca16fc8fa 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -147,7 +147,7 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS view := buffer.NewView(len(data)) copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Unexpected error from Write: %v", err) } @@ -210,7 +210,7 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd view := buffer.NewView(len(data)) copy(view, data) - if _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { t.Fatalf("Unexpected error from Write: %v", err) } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index f2dd98f35..6ed805357 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -258,10 +258,10 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi // Write writes data to the endpoint's peer. This method does not block // if the data cannot be written. -func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tcpip.Error) { +func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) 
if opts.More { - return 0, tcpip.ErrInvalidOptionValue + return 0, nil, tcpip.ErrInvalidOptionValue } to := opts.To @@ -271,14 +271,14 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { - return 0, tcpip.ErrClosedForSend + return 0, nil, tcpip.ErrClosedForSend } // Prepare for write. for { retry, err := e.prepareForWrite(to) if err != nil { - return 0, err + return 0, nil, err } if !retry { @@ -303,7 +303,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc // Recheck state after lock was re-acquired. if e.state != stateConnected { - return 0, tcpip.ErrInvalidEndpointState + return 0, nil, tcpip.ErrInvalidEndpointState } } } else { @@ -312,7 +312,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc nicid := to.NIC if e.bindNICID != 0 { if nicid != 0 && nicid != e.bindNICID { - return 0, tcpip.ErrNoRoute + return 0, nil, tcpip.ErrNoRoute } nicid = e.bindNICID @@ -322,13 +322,13 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc to = &toCopy netProto, err := e.checkV4Mapped(to, false) if err != nil { - return 0, err + return 0, nil, err } // Find the enpoint. r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, to.Addr, netProto) if err != nil { - return 0, err + return 0, nil, err } defer r.Release() @@ -338,23 +338,20 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc if route.IsResolutionRequired() { waker := &sleep.Waker{} - if err := route.Resolve(waker); err != nil { + if ch, err := route.Resolve(waker); err != nil { if err == tcpip.ErrWouldBlock { // Link address needs to be resolved. Resolution was triggered the background. // Better luck next time. - // - // TODO: queue up the request and send after link address - // is resolved. 
route.RemoveWaker(waker) - return 0, tcpip.ErrNoLinkAddress + return 0, ch, tcpip.ErrNoLinkAddress } - return 0, err + return 0, nil, err } } v, err := p.Get(p.Size()) if err != nil { - return 0, err + return 0, nil, err } ttl := route.DefaultTTL() @@ -363,9 +360,9 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, *tc } if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.id.LocalPort, dstPort, ttl); err != nil { - return 0, err + return 0, nil, err } - return uintptr(len(v)), nil + return uintptr(len(v)), nil, nil } // Peek only returns data from a single datagram, so do nothing here. diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 46110c8ff..c3f592bd4 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -482,7 +482,7 @@ func TestV4ReadOnV4(t *testing.T) { func testV4Write(c *testContext) uint16 { // Write to V4 mapped address. payload := buffer.View(newPayload()) - n, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ + n, _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ To: &tcpip.FullAddress{Addr: testV4MappedAddr, Port: testPort}, }) if err != nil { @@ -512,7 +512,7 @@ func testV4Write(c *testContext) uint16 { func testV6Write(c *testContext) uint16 { // Write to v6 address. payload := buffer.View(newPayload()) - n, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ + n, _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ To: &tcpip.FullAddress{Addr: testV6Addr, Port: testPort}, }) if err != nil { @@ -590,7 +590,7 @@ func TestDualWriteConnectedToV6(t *testing.T) { // Write to V4 mapped address. 
payload := buffer.View(newPayload()) - _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ + _, _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ To: &tcpip.FullAddress{Addr: testV4MappedAddr, Port: testPort}, }) if err != tcpip.ErrNetworkUnreachable { @@ -613,7 +613,7 @@ func TestDualWriteConnectedToV4Mapped(t *testing.T) { // Write to v6 address. payload := buffer.View(newPayload()) - _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ + _, _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ To: &tcpip.FullAddress{Addr: testV6Addr, Port: testPort}, }) if err != tcpip.ErrInvalidEndpointState { @@ -629,7 +629,7 @@ func TestV4WriteOnV6Only(t *testing.T) { // Write to V4 mapped address. payload := buffer.View(newPayload()) - _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ + _, _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ To: &tcpip.FullAddress{Addr: testV4MappedAddr, Port: testPort}, }) if err != tcpip.ErrNoRoute { @@ -650,7 +650,7 @@ func TestV6WriteOnBoundToV4Mapped(t *testing.T) { // Write to v6 address. payload := buffer.View(newPayload()) - _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ + _, _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{ To: &tcpip.FullAddress{Addr: testV6Addr, Port: testPort}, }) if err != tcpip.ErrInvalidEndpointState { @@ -671,7 +671,7 @@ func TestV6WriteOnConnected(t *testing.T) { // Write without destination. payload := buffer.View(newPayload()) - n, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{}) + n, _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{}) if err != nil { c.t.Fatalf("Write failed: %v", err) } @@ -707,7 +707,7 @@ func TestV4WriteOnConnected(t *testing.T) { // Write without destination. 
payload := buffer.View(newPayload()) - n, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{}) + n, _, err := c.ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{}) if err != nil { c.t.Fatalf("Write failed: %v", err) } @@ -856,7 +856,7 @@ func TestTTL(t *testing.T) { c.t.Fatalf("SetSockOpt failed: %v", err) } - n, err := c.ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{Addr: addr, Port: port}}) + n, _, err := c.ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{Addr: addr, Port: port}}) if err != nil { c.t.Fatalf("Write failed: %v", err) } -- cgit v1.2.3 From 3ff24b4f2c6d5a7a872a744150bbfca795afdbfc Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 28 Sep 2018 11:02:11 -0700 Subject: Require AF_UNIX sockets from the gofer host.endpoint already has the check, but it is missing from host.ConnectedEndpoint. PiperOrigin-RevId: 214962762 Change-Id: I88bb13a5c5871775e4e7bf2608433df8a3d348e6 --- pkg/sentry/fs/gofer/socket.go | 3 +++ pkg/sentry/fs/host/socket.go | 22 +++++++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 8628b9c69..0190bc006 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -15,6 +15,7 @@ package gofer import ( + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" @@ -101,6 +102,7 @@ func (e *endpoint) BidirectionalConnect(ce unix.ConnectingEndpoint, returnConnec c, terr := host.NewConnectedEndpoint(hostFile, ce.WaiterQueue(), e.path) if terr != nil { ce.Unlock() + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, cf, terr) return terr } @@ -120,6 +122,7 @@ func (e *endpoint) UnidirectionalConnect() (unix.ConnectedEndpoint, *tcpip.Error c, terr := 
host.NewConnectedEndpoint(hostFile, &waiter.Queue{}, e.path) if terr != nil { + log.Warningf("Gofer returned invalid host socket for UnidirectionalConnect; file %+v: %v", e.file, terr) return nil, terr } c.Init() diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 4ace71c3e..e11772946 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -35,6 +35,8 @@ import ( // endpoint encapsulates the state needed to represent a host Unix socket. // +// TODO: Remove/merge with ConnectedEndpoint. +// // +stateify savable type endpoint struct { queue waiter.Queue `state:"zerovalue"` @@ -288,13 +290,23 @@ func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.Fu return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil } -// NewConnectedEndpoint creates a new ConnectedEndpoint backed by -// a host FD that will pretend to be bound at a given sentry path. +// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host FD +// that will pretend to be bound at a given sentry path. // -// The caller is responsible for calling Init(). Additionaly, Release needs -// to be called twice because host.ConnectedEndpoint is both a -// unix.Receiver and unix.ConnectedEndpoint. +// The caller is responsible for calling Init(). Additionaly, Release needs to +// be called twice because host.ConnectedEndpoint is both a unix.Receiver and +// unix.ConnectedEndpoint. func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *tcpip.Error) { + family, err := syscall.GetsockoptInt(file.FD(), syscall.SOL_SOCKET, syscall.SO_DOMAIN) + if err != nil { + return nil, translateError(err) + } + + if family != syscall.AF_UNIX { + // We only allow Unix sockets. + return nil, tcpip.ErrInvalidEndpointState + } + e := &ConnectedEndpoint{path: path, queue: queue, file: file} // AtomicRefCounters start off with a single reference. We need two. 
-- cgit v1.2.3 From 07aa040842cfd31a0c6e851900173d02dd01c7fe Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 1 Oct 2018 13:54:57 -0700 Subject: Fix possible panic in control.Processes. There was a race where we checked task.Parent() != nil, and then later called task.Parent() again, assuming that it is not nil. If the task is exiting, the parent may have been set to nil in between the two calls, causing a panic. This CL changes the code to only call task.Parent() once. PiperOrigin-RevId: 215274456 Change-Id: Ib5a537312c917773265ec72016014f7bc59a5f59 --- pkg/sentry/control/proc.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 106055e86..faf1168bb 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -278,8 +278,8 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { } ppid := kernel.ThreadID(0) - if tg.Leader().Parent() != nil { - ppid = ts.Root.IDOfThreadGroup(tg.Leader().Parent().ThreadGroup()) + if p := tg.Leader().Parent(); p != nil { + ppid = ts.Root.IDOfThreadGroup(p.ThreadGroup()) } *out = append(*out, &Process{ UID: tg.Leader().Credentials().EffectiveKUID, -- cgit v1.2.3 From 0400e5459288592768af12ab71609c6df6afe3d7 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 1 Oct 2018 14:15:52 -0700 Subject: Add itimer types to linux package, strace PiperOrigin-RevId: 215278262 Change-Id: Icd10384c99802be6097be938196044386441e282 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/timer.go | 23 +++++++++++++++++++++++ pkg/sentry/strace/linux64.go | 4 ++-- pkg/sentry/strace/strace.go | 19 +++++++++++++++++++ pkg/sentry/strace/syscalls.go | 7 +++++-- pkg/sentry/syscalls/linux/sys_timer.go | 27 +++++++-------------------- 6 files changed, 57 insertions(+), 24 deletions(-) create mode 100644 pkg/abi/linux/timer.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 
ac4ceefbc..f8f82c0da 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -44,6 +44,7 @@ go_library( "signal.go", "socket.go", "time.go", + "timer.go", "tty.go", "uio.go", "utsname.go", diff --git a/pkg/abi/linux/timer.go b/pkg/abi/linux/timer.go new file mode 100644 index 000000000..6c4675c35 --- /dev/null +++ b/pkg/abi/linux/timer.go @@ -0,0 +1,23 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// itimer types for getitimer(2) and setitimer(2), from +// include/uapi/linux/time.h. 
+const ( + ITIMER_REAL = 0 + ITIMER_VIRTUAL = 1 + ITIMER_PROF = 2 +) diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 63851246c..1df148e7d 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -53,9 +53,9 @@ var linuxAMD64 = SyscallMap{ 33: makeSyscallInfo("dup2", Hex, Hex), 34: makeSyscallInfo("pause"), 35: makeSyscallInfo("nanosleep", Timespec, PostTimespec), - 36: makeSyscallInfo("getitimer", Hex, PostItimerVal), + 36: makeSyscallInfo("getitimer", ItimerType, PostItimerVal), 37: makeSyscallInfo("alarm", Hex), - 38: makeSyscallInfo("setitimer", Hex, ItimerVal, PostItimerVal), + 38: makeSyscallInfo("setitimer", ItimerType, ItimerVal, PostItimerVal), 39: makeSyscallInfo("getpid"), 40: makeSyscallInfo("sendfile", Hex, Hex, Hex, Hex), 41: makeSyscallInfo("socket", SockFamily, SockType, SockProtocol), diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 539e665d2..c99c33c33 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -24,6 +24,7 @@ import ( "syscall" "time" + "gvisor.googlesource.com/gvisor/pkg/abi" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bits" "gvisor.googlesource.com/gvisor/pkg/eventchannel" @@ -46,6 +47,22 @@ var LogMaximumSize uint = DefaultLogMaximumSize // do anything useful with binary text dump of byte array arguments. var EventMaximumSize uint +// ItimerTypes are the possible itimer types. 
+var ItimerTypes = abi.ValueSet{ + { + Value: linux.ITIMER_REAL, + Name: "ITIMER_REAL", + }, + { + Value: linux.ITIMER_VIRTUAL, + Name: "ITIMER_VIRTUAL", + }, + { + Value: linux.ITIMER_PROF, + Name: "ITIMER_PROF", + }, +} + func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, maxBytes uint64) string { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return fmt.Sprintf("%#x (error decoding iovecs: invalid iovcnt)", addr) @@ -322,6 +339,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, futex(uint64(args[arg].Uint()))) case PtraceRequest: output = append(output, PtraceRequestSet.Parse(args[arg].Uint64())) + case ItimerType: + output = append(output, ItimerTypes.Parse(uint64(args[arg].Int()))) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 770a0d2b9..8be4fa318 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -150,6 +150,9 @@ const ( // Utimbuf is a pointer to a struct utimbuf. Utimbuf + // Rusage is a struct rusage, formatted after syscall execution. + Rusage + // CloneFlags are clone(2) flags. CloneFlags @@ -165,8 +168,8 @@ const ( // PtraceRequest is the ptrace(2) request. PtraceRequest - // Rusage is a struct rusage, formatted after syscall execution. - Rusage + // ItimerType is an itimer type (ITIMER_REAL, etc). + ItimerType ) // defaultFormat is the syscall argument format to use if the actual format is diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index aaed75c81..a12d12d9d 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -25,19 +25,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// ItimerType denotes the type of interval timer. -type ItimerType int - -// Interval timer types from . 
-const ( - // ItimerReal equals to ITIMER_REAL. - ItimerReal ItimerType = iota - // ItimerVirtual equals to ITIMER_VIRTUAL. - ItimerVirtual - // ItimerProf equals to ITIMER_PROF. - ItimerProf -) - const nsecPerSec = int64(time.Second) // copyItimerValIn copies an ItimerVal from the untrusted app range to the @@ -83,13 +70,13 @@ func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) e } } -func findTimer(t *kernel.Task, w ItimerType) (*ktime.Timer, error) { - switch w { - case ItimerReal: +func findTimer(t *kernel.Task, which int32) (*ktime.Timer, error) { + switch which { + case linux.ITIMER_REAL: return t.ThreadGroup().Timer().RealTimer, nil - case ItimerVirtual: + case linux.ITIMER_VIRTUAL: return t.ThreadGroup().Timer().VirtualTimer, nil - case ItimerProf: + case linux.ITIMER_PROF: return t.ThreadGroup().Timer().ProfTimer, nil default: return nil, syscall.EINVAL @@ -98,7 +85,7 @@ func findTimer(t *kernel.Task, w ItimerType) (*ktime.Timer, error) { // Getitimer implements linux syscall getitimer(2). func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - timerID := ItimerType(args[0].Int()) + timerID := args[0].Int() val := args[1].Pointer() timer, err := findTimer(t, timerID) @@ -116,7 +103,7 @@ func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Setitimer implements linux syscall setitimer(2). func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - timerID := ItimerType(args[0].Int()) + timerID := args[0].Int() newVal := args[1].Pointer() oldVal := args[2].Pointer() -- cgit v1.2.3 From f1c01ed88666ea81d8f5cef7931153a9951a6e64 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 1 Oct 2018 22:05:41 -0700 Subject: runsc: Support job control signals in "exec -it". Terminal support in runsc relies on host tty file descriptors that are imported into the sandbox. 
Application tty ioctls are sent directly to the host fd. However, those host tty ioctls are associated in the host kernel with a host process (in this case runsc), and the host kernel intercepts job control characters like ^C and sends signals to the host process. Thus, typing ^C into a "runsc exec" shell will send a SIGINT to the runsc process. This change makes "runsc exec" handle all signals, and forward them into the sandbox via the "ContainerSignal" urpc method. Since the "runsc exec" is associated with a particular container process in the sandbox, the signal must be associated with the same container process. One big difficulty is that the signal should not necessarily be sent to the sandbox process started by "exec", but instead must be sent to the foreground process group for the tty. For example, we may exec "bash", and from bash call "sleep 100". A ^C at this point should SIGINT sleep, not bash. To handle this, tty files inside the sandbox must keep track of their foreground process group, which is set/get via ioctls. When an incoming ContainerSignal urpc comes in, we look up the foreground process group via the tty file. Unfortunately, this means we have to expose and cache the tty file in the Loader. Note that "runsc exec" now handles signals properly, but "runsc run" does not. That will come in a later CL, as this one is complex enough already. 
Example: root@:/usr/local/apache2# sleep 100 ^C root@:/usr/local/apache2# sleep 100 ^Z [1]+ Stopped sleep 100 root@:/usr/local/apache2# fg sleep 100 ^C root@:/usr/local/apache2# PiperOrigin-RevId: 215334554 Change-Id: I53cdce39653027908510a5ba8d08c49f9cf24f39 --- pkg/sentry/control/proc.go | 78 ++++++++++---- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/file.go | 144 +++++-------------------- pkg/sentry/fs/host/inode.go | 18 ++-- pkg/sentry/fs/host/tty.go | 185 ++++++++++++++++++++++++++++++++ pkg/sentry/kernel/sessions.go | 5 + runsc/boot/controller.go | 46 ++++++-- runsc/boot/fds.go | 8 +- runsc/boot/loader.go | 108 ++++++++++++++----- runsc/cmd/exec.go | 13 ++- runsc/container/BUILD | 2 + runsc/container/container.go | 51 ++++++--- runsc/container/container_test.go | 117 ++++++++++++++++++++ runsc/container/multi_container_test.go | 7 +- runsc/sandbox/sandbox.go | 49 ++++++--- runsc/test/integration/exec_test.go | 55 ++++++++++ runsc/test/testutil/BUILD | 1 + runsc/test/testutil/docker.go | 21 ++++ runsc/test/testutil/testutil.go | 36 +++++++ 19 files changed, 732 insertions(+), 213 deletions(-) create mode 100644 pkg/sentry/fs/host/tty.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index faf1168bb..0ba730c1e 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -78,7 +78,7 @@ type ExecArgs struct { Capabilities *auth.TaskCapabilities // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host - // pty fd. + // pty FD. StdioIsPty bool // FilePayload determines the files to give to the new process. @@ -90,7 +90,7 @@ type ExecArgs struct { // Exec runs a new task. 
func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { - newTG, _, err := proc.execAsync(args) + newTG, _, _, err := proc.execAsync(args) if err != nil { return err } @@ -103,18 +103,27 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined // as a function rather than a method to avoid exposing execAsync as an RPC. -func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, error) { +func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { return proc.execAsync(args) } // execAsync runs a new task, but doesn't wait for it to finish. It returns the -// newly created thread group and its PID. -func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, error) { +// newly created thread group and its PID. If the stdio FDs are TTYs, then a +// TTYFileOperations that wraps the TTY is also returned. +func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { // Import file descriptors. l := limits.NewLimitSet() fdm := proc.Kernel.NewFDMap() defer fdm.DecRef() + // No matter what happens, we should close all files in the FilePayload + // before returning. Any files that are imported will be duped. 
+ defer func() { + for _, f := range args.FilePayload.Files { + f.Close() + } + }() + creds := auth.NewUserCredentials( args.KUID, args.KGID, @@ -150,31 +159,62 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI paths := fs.GetPath(initArgs.Envv) f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return nil, 0, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.Filename = f } mounter := fs.FileOwnerFromContext(ctx) - for appFD, f := range args.FilePayload.Files { - enableIoctl := args.StdioIsPty && appFD <= 2 - // Import the given file FD. This dups the FD as well. - file, err := host.ImportFile(ctx, int(f.Fd()), mounter, enableIoctl) - if err != nil { - return nil, 0, err + var ttyFile *fs.File + for appFD, hostFile := range args.FilePayload.Files { + var appFile *fs.File + + if args.StdioIsPty && appFD < 3 { + // Import the file as a host TTY file. + if ttyFile == nil { + var err error + appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, true /* isTTY */) + if err != nil { + return nil, 0, nil, err + } + defer appFile.DecRef() + + // Remember this in the TTY file, as we will + // use it for the other stdio FDs. + ttyFile = appFile + } else { + // Re-use the existing TTY file, as all three + // stdio FDs must point to the same fs.File in + // order to share TTY state, specifically the + // foreground process group id. + appFile = ttyFile + } + } else { + // Import the file as a regular host file. + var err error + appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, false /* isTTY */) + if err != nil { + return nil, 0, nil, err + } + defer appFile.DecRef() } - defer file.DecRef() - - // We're done with this file. 
- f.Close() - if err := fdm.NewFDAt(kdefs.FD(appFD), file, kernel.FDFlags{}, l); err != nil { - return nil, 0, err + // Add the file to the FD map. + if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + return nil, 0, nil, err } } - return proc.Kernel.CreateProcess(initArgs) + tg, tid, err := proc.Kernel.CreateProcess(initArgs) + if err != nil { + return nil, 0, nil, err + } + + if ttyFile == nil { + return tg, tid, nil, nil + } + return tg, tid, ttyFile.FileOperations.(*host.TTYFileOperations), nil } // PsArgs is the set of arguments to ps. diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index d1a6eaf6e..c34f1c26b 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -17,6 +17,7 @@ go_library( "socket.go", "socket_state.go", "socket_unsafe.go", + "tty.go", "util.go", "util_unsafe.go", ], diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 6f469b5cc..22a5d9f12 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -18,15 +18,12 @@ import ( "fmt" "syscall" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/secio" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -39,6 +36,7 @@ import ( // // +stateify savable type fileOperations struct { + fsutil.NoIoctl `state:"nosave"` fsutil.NoopRelease `state:"nosave"` // iops are the Inode operations for this file. @@ -49,49 +47,49 @@ type fileOperations struct { // dirCursor is the directory cursor. 
dirCursor string - - // allowIoctl determines whether ioctls should be passed through to the - // host. - allowIoctl bool } // fileOperations implements fs.FileOperations. var _ fs.FileOperations = (*fileOperations)(nil) // NewFile creates a new File backed by the provided host file descriptor. If -// NewFile succeeds, ownership of the fd is transferred to the returned File. +// NewFile succeeds, ownership of the FD is transferred to the returned File. // // The returned File cannot be saved, since there is no guarantee that the same -// fd will exist or represent the same file at time of restore. If such a +// FD will exist or represent the same file at time of restore. If such a // guarantee does exist, use ImportFile instead. func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) { return newFileFromDonatedFD(ctx, fd, mounter, false, false) } // ImportFile creates a new File backed by the provided host file descriptor. -// Unlike NewFile, the file descriptor used by the File is duped from fd to -// ensure that later changes to fd are not reflected by the fs.File. +// Unlike NewFile, the file descriptor used by the File is duped from FD to +// ensure that later changes to FD are not reflected by the fs.File. // -// If the returned file is saved, it will be restored by re-importing the fd +// If the returned file is saved, it will be restored by re-importing the FD // originally passed to ImportFile. It is the restorer's responsibility to -// ensure that the fd represents the same file. -func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, allowIoctl bool) (*fs.File, error) { - return newFileFromDonatedFD(ctx, fd, mounter, true, allowIoctl) +// ensure that the FD represents the same file. +func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, true, isTTY) } -// newFileFromDonatedFD returns an fs.File from a donated fd. 
If the fd is +// newFileFromDonatedFD returns an fs.File from a donated FD. If the FD is // saveable, then saveable is true. -func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, allowIoctl bool) (*fs.File, error) { +func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) { var s syscall.Stat_t if err := syscall.Fstat(donated, &s); err != nil { return nil, err } + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } switch s.Mode & syscall.S_IFMT { case syscall.S_IFSOCK: - flags, err := fileFlagsFromDonatedFD(donated) - if err != nil { - return nil, err + if isTTY { + return nil, fmt.Errorf("cannot import host socket as TTY") } + s, err := newSocket(ctx, donated, saveable) if err != nil { return nil, err @@ -101,10 +99,6 @@ func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner }) return s, nil default: - flags, err := fileFlagsFromDonatedFD(donated) - if err != nil { - return nil, err - } msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */) inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */) if err != nil { @@ -116,14 +110,18 @@ func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner dirent := fs.NewDirent(inode, name) defer dirent.DecRef() - return newFile(ctx, dirent, flags, iops, allowIoctl), nil + if isTTY { + return newTTYFile(ctx, dirent, flags, iops), nil + } + + return newFile(ctx, dirent, flags, iops), nil } } func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) { flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0) if errno != 0 { - log.Warningf("Failed to get file flags for donated fd %d (errno=%d)", donated, errno) + log.Warningf("Failed to get file flags for donated FD %d (errno=%d)", donated, errno) return fs.FileFlags{}, syscall.EIO 
} accmode := flags & syscall.O_ACCMODE @@ -138,17 +136,14 @@ func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) { } // newFile returns a new fs.File. -func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations, allowIoctl bool) *fs.File { +func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { if !iops.ReturnsWouldBlock() { // Allow reading/writing at an arbitrary offset for files // that support it. flags.Pread = true flags.Pwrite = true } - return fs.NewFile(ctx, dirent, flags, &fileOperations{ - iops: iops, - allowIoctl: allowIoctl, - }) + return fs.NewFile(ctx, dirent, flags, &fileOperations{iops: iops}) } // EventRegister implements waiter.Waitable.EventRegister. @@ -269,7 +264,7 @@ func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, func (f *fileOperations) Flush(context.Context, *fs.File) error { // This is a no-op because flushing the resource backing this // file would mean closing it. We can't do that because other - // open files may depend on the backing host fd. + // open files may depend on the backing host FD. return nil } @@ -285,88 +280,3 @@ func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) } - -// Ioctl implements fs.FileOperations.Iocotl. -func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - if !f.allowIoctl { - return 0, syserror.ENOTTY - } - // Ignore arg[0]. 
This is the real FD: - fd := f.iops.fileState.FD() - ioctl := args[1].Uint64() - switch ioctl { - case linux.TCGETS: - termios, err := ioctlGetTermios(fd) - if err != nil { - return 0, err - } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - - case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: - var termios linux.Termios - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, err - } - err := ioctlSetTermios(fd, ioctl, &termios) - return 0, err - - case linux.TIOCGPGRP: - // Args: pid_t *argp - // When successful, equivalent to *argp = tcgetpgrp(fd). - // Get the process group ID of the foreground process group on - // this terminal. - - t := kernel.TaskFromContext(ctx) - if t == nil { - panic(fmt.Sprintf("cannot get thread group from context %v", ctx)) - } - tid := t.ThreadID() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tid, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - - case linux.TIOCSPGRP: - // Args: const pid_t *argp - // Equivalent to tcsetpgrp(fd, *argp). - // Set the foreground process group ID of this terminal. - - // Not much we can do with this one at the moment, so we just - // lie and pretend everything is great. Bash and Sh seem fine - // with this. - log.Warningf("Ignoring application ioctl(TIOCSPGRP) call") - return 0, nil - - case linux.TIOCGWINSZ: - // Args: struct winsize *argp - // Get window size. - winsize, err := ioctlGetWinsize(fd) - if err != nil { - return 0, err - } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - - case linux.TIOCSWINSZ: - // Args: const struct winsize *argp - // Set window size. 
- var winsize linux.Winsize - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { - return 0, err - } - err := ioctlSetWinsize(fd, &winsize) - return 0, err - - default: - return 0, syserror.ENOTTY - } -} diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index e7254fa7d..c2e8ba62f 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -73,7 +73,7 @@ type inodeFileState struct { // Common file system state. mops *superOperations `state:"wait"` - // descriptor is the backing host fd. + // descriptor is the backing host FD. descriptor *descriptor `state:"wait"` // Event queue for blocking operations. @@ -167,7 +167,7 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err // inodeOperations implements fs.InodeOperations. var _ fs.InodeOperations = (*inodeOperations)(nil) -// newInode returns a new fs.Inode backed by the host fd. +// newInode returns a new fs.Inode backed by the host FD. func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) { // Retrieve metadata. var s syscall.Stat_t @@ -212,8 +212,8 @@ func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { return i.cachingInodeOps } -// ReturnsWouldBlock returns true if this host fd can return EWOULDBLOCK -// for operations that would block. +// ReturnsWouldBlock returns true if this host FD can return EWOULDBLOCK for +// operations that would block. func (i *inodeOperations) ReturnsWouldBlock() bool { return i.fileState.descriptor.wouldBlock } @@ -226,7 +226,7 @@ func (i *inodeOperations) Release(context.Context) { // Lookup implements fs.InodeOperations.Lookup. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - // Get a new fd relative to i at name. + // Get a new FD relative to i at name. 
fd, err := open(i, name) if err != nil { if err == syserror.ENOENT { @@ -321,7 +321,7 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.Bound // GetFile implements fs.InodeOperations.GetFile. func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return newFile(ctx, d, flags, i, false), nil + return newFile(ctx, d, flags, i), nil } // canMap returns true if this fs.Inode can be memory mapped. @@ -362,7 +362,7 @@ func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) err func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool { // Can we use host kernel metadata caches? if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { - // Then just change the timestamps on the fd, the host + // Then just change the timestamps on the FD, the host // will synchronize the metadata update with any host // inode and page cache. return syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode())) == nil @@ -375,7 +375,7 @@ func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { // Can we use host kernel metadata caches? if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { - // Then just change the timestamps on the fd, the host + // Then just change the timestamps on the FD, the host // will synchronize the metadata update with any host // inode and page cache. return setTimestamps(i.fileState.FD(), ts) @@ -388,7 +388,7 @@ func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { // Is the file not memory-mappable? 
if !canMap(inode) { - // Then just change the file size on the fd, the host + // Then just change the file size on the FD, the host // will synchronize the metadata update with any host // inode and page cache. return syscall.Ftruncate(i.fileState.FD(), size) diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go new file mode 100644 index 000000000..ad1323610 --- /dev/null +++ b/pkg/sentry/fs/host/tty.go @@ -0,0 +1,185 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TTYFileOperations implements fs.FileOperations for a host file descriptor +// that wraps a TTY FD. +// +// +stateify savable +type TTYFileOperations struct { + fileOperations + + // mu protects the fields below. + mu sync.Mutex + + // FGProcessGroup is the foreground process group this TTY. Will be + // nil if not set or if this file has been released. + fgProcessGroup *kernel.ProcessGroup +} + +// newTTYFile returns a new fs.File that wraps a TTY FD. 
+func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { + return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{ + fileOperations: fileOperations{iops: iops}, + }) +} + +// ForegroundProcessGroup returns the foreground process for the TTY. This will +// be nil if the foreground process has not been set or if the file has been +// released. +func (t *TTYFileOperations) ForegroundProcessGroup() *kernel.ProcessGroup { + t.mu.Lock() + defer t.mu.Unlock() + return t.fgProcessGroup +} + +// Release implements fs.FileOperations.Release. +func (t *TTYFileOperations) Release() { + t.mu.Lock() + t.fgProcessGroup = nil + t.mu.Unlock() + + t.fileOperations.Release() +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Ignore arg[0]. This is the real FD: + fd := t.fileOperations.iops.fileState.FD() + ioctl := args[1].Uint64() + switch ioctl { + case linux.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + return 0, err + + case linux.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on + // this terminal. + + t.mu.Lock() + defer t.mu.Unlock() + + if t.fgProcessGroup == nil { + // No process group has been set yet. Let's just lie + // and tell it the process group from the current task. 
+ // The app is probably going to set it to something + // else very soon anyways. + t.fgProcessGroup = kernel.TaskFromContext(ctx).ThreadGroup().ProcessGroup() + } + + // Map the ProcessGroup into a ProcessGroupID in the task's PID + // namespace. + pgID := kernel.TaskFromContext(ctx).ThreadGroup().PIDNamespace().IDOfProcessGroup(t.fgProcessGroup) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + var pgID kernel.ProcessGroupID + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + // pgID must be non-negative. + if pgID < 0 { + return 0, syserror.EINVAL + } + + // Process group with pgID must exist in this PID namespace. + task := kernel.TaskFromContext(ctx) + pidns := task.PIDNamespace() + pg := pidns.ProcessGroupWithID(pgID) + if pg == nil { + return 0, syserror.ESRCH + } + + // Process group must be in same session as calling task's + // process group. + curSession := task.ThreadGroup().ProcessGroup().Session() + curSessionID := pidns.IDOfSession(curSession) + if pidns.IDOfSession(pg.Session()) != curSessionID { + return 0, syserror.EPERM + } + + t.mu.Lock() + t.fgProcessGroup = pg + t.mu.Unlock() + return 0, nil + + case linux.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := ioctlGetWinsize(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. 
+ var winsize linux.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetWinsize(fd, &winsize) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index cf4e18805..b44d218d9 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -219,6 +219,11 @@ func (pg *ProcessGroup) handleOrphan() { return } +// Session returns the process group's session without taking a reference. +func (pg *ProcessGroup) Session() *Session { + return pg.session +} + // CreateSession creates a new Session, with the ThreadGroup as the leader. // // EPERM may be returned if either the given ThreadGroup is already a Session diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 98356e8b7..eaeb9e2d8 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -65,6 +65,10 @@ const ( // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" + // ContainerSignalProcess is used to send a signal to a particular + // process in a container. + ContainerSignalProcess = "containerManager.SignalProcess" + // ContainerStart is the URPC endpoint for running a non-root container // within a sandbox. ContainerStart = "containerManager.Start" @@ -92,7 +96,7 @@ const ( SandboxStacks = "debug.Stacks" ) -// ControlSocketAddr generates an abstract unix socket name for the given id. +// ControlSocketAddr generates an abstract unix socket name for the given ID. func ControlSocketAddr(id string) string { return fmt.Sprintf("\x00runsc-sandbox.%s", id) } @@ -248,7 +252,7 @@ func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { } // ExecuteAsync starts running a command on a created or running sandbox. It -// returns the pid of the new process. +// returns the PID of the new process. 
func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { log.Debugf("containerManager.ExecuteAsync: %+v", args) tgid, err := cm.l.executeAsync(args) @@ -373,8 +377,12 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // restore the state of multiple containers, nor exec processes. cm.l.sandboxID = o.SandboxID cm.l.mu.Lock() - key := execID{cid: o.SandboxID} - cm.l.processes = map[execID]*kernel.ThreadGroup{key: cm.l.k.GlobalInit()} + eid := execID{cid: o.SandboxID} + cm.l.processes = map[execID]*execProcess{ + eid: &execProcess{ + tg: cm.l.k.GlobalInit(), + }, + } cm.l.mu.Unlock() // Tell the root container to start and wait for the result. @@ -419,7 +427,7 @@ func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error // SignalArgs are arguments to the Signal method. type SignalArgs struct { - // CID is the container id. + // CID is the container ID. CID string // Signo is the signal to send to the process. @@ -430,9 +438,31 @@ type SignalArgs struct { All bool } -// Signal sends a signal to the init process of the container. -// TODO: Send signal to exec process. +// Signal sends a signal to the root process of the container. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { log.Debugf("containerManager.Signal %q %d, all: %t", args.CID, args.Signo, args.All) - return cm.l.signal(args.CID, args.Signo, args.All) + return cm.l.signalContainer(args.CID, args.Signo, args.All) +} + +// SignalProcessArgs are arguments to the Signal method. +type SignalProcessArgs struct { + // CID is the container ID. + CID string + + // PID is the process ID in the given container that will be signaled. + PID int32 + + // Signo is the signal to send to the process. + Signo int32 + + // SendToForegroundProcess indicates that the signal should be sent to + // the foreground process group in the session that PID belongs to. + // This is only valid if the process is attached to a host TTY. 
+ SendToForegroundProcess bool +} + +// SignalProcess sends a signal to a particular process in the container. +func (cm *containerManager) SignalProcess(args *SignalProcessArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal: %+v", args) + return cm.l.signalProcess(args.CID, args.PID, args.Signo, args.SendToForegroundProcess) } diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 92d641b68..a5a6ba8af 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -25,8 +25,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/limits" ) -// createFDMap creates an fd map that contains stdin, stdout, and stderr. If -// console is true, then ioctl calls will be passed through to the host fd. +// createFDMap creates an FD map that contains stdin, stdout, and stderr. If +// console is true, then ioctl calls will be passed through to the host FD. // Upon success, createFDMap dups then closes stdioFDs. func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { if len(stdioFDs) != 3 { @@ -36,7 +36,7 @@ func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, cons fdm := k.NewFDMap() defer fdm.DecRef() - // Maps sandbox fd to host fd. + // Maps sandbox FD to host FD. 
fdMap := map[int]int{ 0: stdioFDs[0], 1: stdioFDs[1], @@ -45,7 +45,7 @@ func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, cons mounter := fs.FileOwnerFromContext(ctx) for sfd, hfd := range fdMap { - file, err := host.ImportFile(ctx, hfd, mounter, console /* allow ioctls */) + file, err := host.ImportFile(ctx, hfd, mounter, console /* isTTY */) if err != nil { return nil, fmt.Errorf("failed to import fd %d: %v", hfd, err) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9fa9b51a0..766a2e968 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -112,7 +113,7 @@ type Loader struct { // have the corresponding pid set. // // processes is guardded by mu. - processes map[execID]*kernel.ThreadGroup + processes map[execID]*execProcess } // execID uniquely identifies a sentry process. @@ -121,6 +122,14 @@ type execID struct { pid kernel.ThreadID } +// execProcess contains the thread group and host TTY of a sentry process. +type execProcess struct { + tg *kernel.ThreadGroup + + // tty will be nil if the process is not attached to a terminal. + tty *host.TTYFileOperations +} + func init() { // Initialize the random number generator. 
rand.Seed(gtime.Now().UnixNano()) @@ -276,7 +285,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, sandboxID: id, - processes: make(map[execID]*kernel.ThreadGroup), + processes: make(map[execID]*execProcess), } ctrl.manager.l = l return l, nil @@ -330,7 +339,7 @@ func createPlatform(conf *Config, deviceFD int) (platform.Platform, error) { case PlatformKVM: log.Infof("Platform: kvm") if deviceFD < 0 { - return nil, fmt.Errorf("kvm device fd must be provided") + return nil, fmt.Errorf("kvm device FD must be provided") } return kvm.New(os.NewFile(uintptr(deviceFD), "kvm device")) default: @@ -413,8 +422,8 @@ func (l *Loader) run() error { } l.mu.Lock() - key := execID{cid: l.sandboxID} - l.processes[key] = l.k.GlobalInit() + eid := execID{cid: l.sandboxID} + l.processes[eid] = &execProcess{tg: l.k.GlobalInit()} l.mu.Unlock() // Start signal forwarding only after an init process is created. @@ -510,8 +519,8 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config l.mu.Lock() defer l.mu.Unlock() - key := execID{cid: cid} - l.processes[key] = tg + eid := execID{cid: cid} + l.processes[eid] = &execProcess{tg: tg} return nil } @@ -520,7 +529,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // filesystem. func (l *Loader) destroyContainer(cid string) error { // First kill and wait for all processes in the container. - if err := l.signal(cid, int32(linux.SIGKILL), true /*all*/); err != nil { + if err := l.signalContainer(cid, int32(linux.SIGKILL), true /*all*/); err != nil { return fmt.Errorf("failed to SIGKILL all container processes: %v", err) } @@ -549,12 +558,12 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // process with the same Root. 
l.mu.Lock() rootKey := execID{cid: args.ContainerID} - tg, ok := l.processes[rootKey] + ep, ok := l.processes[rootKey] l.mu.Unlock() if !ok { return 0, fmt.Errorf("cannot exec in container %q: no such container", args.ContainerID) } - tg.Leader().WithMuLocked(func(t *kernel.Task) { + ep.tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() }) if args.Root != nil { @@ -563,7 +572,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Start the process. proc := control.Proc{Kernel: l.k} - tg, tgid, err := control.ExecAsync(&proc, args) + tg, tgid, ttyFile, err := control.ExecAsync(&proc, args) if err != nil { return 0, fmt.Errorf("error executing: %+v: %v", args, err) } @@ -573,7 +582,10 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { l.mu.Lock() defer l.mu.Unlock() eid := execID{cid: args.ContainerID, pid: tgid} - l.processes[eid] = tg + l.processes[eid] = &execProcess{ + tg: tg, + tty: ttyFile, + } log.Debugf("updated processes: %v", l.processes) return tgid, nil @@ -584,8 +596,8 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. l.mu.Lock() - key := execID{cid: cid} - tg, ok := l.processes[key] + eid := execID{cid: cid} + ep, ok := l.processes[eid] l.mu.Unlock() if !ok { return fmt.Errorf("can't find process for container %q in %v", cid, l.processes) @@ -593,7 +605,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // If the thread either has already exited or exits during waiting, // consider the container exited. - ws := l.wait(tg) + ws := l.wait(ep.tg) *waitStatus = ws return nil } @@ -610,10 +622,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai // entry in l.processes. 
l.mu.Lock() eid := execID{cid: cid, pid: tgid} - tg, ok := l.processes[eid] + ep, ok := l.processes[eid] l.mu.Unlock() if ok { - ws := l.wait(tg) + ws := l.wait(ep.tg) *waitStatus = ws if clearStatus { // Remove tg from the cache. @@ -626,8 +638,8 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai } // This process wasn't created by runsc exec or start, so just find it - // by pid and hope it hasn't exited yet. - tg = l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) + // by PID and hope it hasn't exited yet. + tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) if tg == nil { return fmt.Errorf("no thread group with ID %d", tgid) } @@ -682,18 +694,66 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { } } -func (l *Loader) signal(cid string, signo int32, all bool) error { +// signalProcess sends a signal to the process with the given PID. If +// sendToFGProcess is true, then the signal will be sent to the foreground +// process group in the same session that PID belongs to. +func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess bool) error { + si := arch.SignalInfo{Signo: signo} + + if pid <= 0 { + return fmt.Errorf("failed to signal container %q PID %d: PID must be positive", cid, pid) + } + + eid := execID{ + cid: cid, + pid: kernel.ThreadID(pid), + } l.mu.Lock() - key := execID{cid: cid} - tg, ok := l.processes[key] + ep, ok := l.processes[eid] l.mu.Unlock() + if !ok { - return fmt.Errorf("failed to signal container %q: no such container", cid) + return fmt.Errorf("failed to signal container %q PID %d: no such PID", cid, pid) + } + + if !sendToFGProcess { + // Send signal directly to exec process. + return ep.tg.SendSignal(&si) } + // Lookup foreground process group from the TTY for the given process, + // and send the signal to it. 
+ if ep.tty == nil { + return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, pid) + } + pg := ep.tty.ForegroundProcessGroup() + if pg == nil { + // No foreground process group has been set. Signal the + // original thread group. + log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, pid, pid) + return ep.tg.SendSignal(&si) + } + + // Send the signal. + return pg.Originator().SendSignal(&si) +} + +// signalContainer sends a signal to the root container process, or to all +// processes in the container if all is true. +func (l *Loader) signalContainer(cid string, signo int32, all bool) error { si := arch.SignalInfo{Signo: signo} + + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + ep, ok := l.processes[eid] + if !ok { + return fmt.Errorf("failed to signal container %q: no such container", cid) + } + if !all { - return tg.Leader().SendSignal(&si) + return ep.tg.SendSignal(&si) } // Pause the kernel to prevent new processes from being created while diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 28229dbcf..336edf3f6 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -158,6 +158,13 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("error getting processes for container: %v", err) } + if e.StdioIsPty { + // Forward signals sent to this process to the foreground + // process in the sandbox. + stopForwarding := c.ForwardSignals(pid, true /* fgProcess */) + defer stopForwarding() + } + // Write the sandbox-internal pid if required. if ex.internalPidFile != "" { pidStr := []byte(strconv.Itoa(int(pid))) @@ -216,9 +223,9 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat cmd.Stderr = os.Stderr // If the console control socket file is provided, then create a new - // pty master/slave pair and set the tty on the sandbox process. 
+ // pty master/slave pair and set the TTY on the sandbox process. if ex.consoleSocket != "" { - // Create a new tty pair and send the master on the provided + // Create a new TTY pair and send the master on the provided // socket. tty, err := console.NewWithSocket(ex.consoleSocket) if err != nil { @@ -226,7 +233,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat } defer tty.Close() - // Set stdio to the new tty slave. + // Set stdio to the new TTY slave. cmd.Stdin = tty cmd.Stdout = tty cmd.Stderr = tty diff --git a/runsc/container/BUILD b/runsc/container/BUILD index e68fb1e8e..bf8b9a2ab 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -50,10 +50,12 @@ go_test( "//pkg/sentry/control", "//pkg/sentry/kernel/auth", "//pkg/unet", + "//pkg/urpc", "//runsc/boot", "//runsc/specutils", "//runsc/test/testutil", "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/runsc/container/container.go b/runsc/container/container.go index be833c03d..4b0037b4e 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -22,6 +22,7 @@ import ( "io/ioutil" "os" "os/exec" + "os/signal" "path/filepath" "regexp" "strconv" @@ -107,14 +108,13 @@ type Container struct { Owner string `json:"owner"` // ConsoleSocket is the path to a unix domain socket that will receive - // the console FD. It is only used during create, so we don't need to - // store it in the metadata. - ConsoleSocket string `json:"-"` + // the console FD. + ConsoleSocket string `json:"consoleSocket"` // Status is the current container Status. Status Status `json:"status"` - // GoferPid is the pid of the gofer running along side the sandbox. May + // GoferPid is the PID of the gofer running along side the sandbox. May // be 0 if the gofer has been killed. 
GoferPid int `json:"goferPid"` @@ -313,12 +313,12 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } - // Write the pid file. Containerd considers the create complete after + // Write the PID file. Containerd considers the create complete after // this file is created, so it must be the last thing we do. if pidFile != "" { if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.Pid())), 0644); err != nil { c.Destroy() - return nil, fmt.Errorf("error writing pid file: %v", err) + return nil, fmt.Errorf("error writing PID file: %v", err) } } @@ -406,7 +406,7 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke return c.Wait() } -// Execute runs the specified command in the container. It returns the pid of +// Execute runs the specified command in the container. It returns the PID of // the newly created process. func (c *Container) Execute(args *control.ExecArgs) (int32, error) { log.Debugf("Execute in container %q, args: %+v", c.ID, args) @@ -429,7 +429,7 @@ func (c *Container) Event() (*boot.Event, error) { // Pid returns the Pid of the sandbox the container is running in, or -1 if the // container is not running. func (c *Container) Pid() int { - if err := c.requireStatus("pid", Created, Running, Paused); err != nil { + if err := c.requireStatus("get PID", Created, Running, Paused); err != nil { return -1 } return c.Sandbox.Pid @@ -449,7 +449,7 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and // returns its WaitStatus. 
func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { - log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) + log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("container is not running") } @@ -459,7 +459,7 @@ func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus // WaitPID waits for process 'pid' in the container's PID namespace and returns // its WaitStatus. func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { - log.Debugf("Wait on pid %d in container %q", pid, c.ID) + log.Debugf("Wait on PID %d in container %q", pid, c.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("container is not running") } @@ -483,7 +483,30 @@ func (c *Container) Signal(sig syscall.Signal, all bool) error { if !c.isSandboxRunning() { return fmt.Errorf("container is not running") } - return c.Sandbox.Signal(c.ID, sig, all) + return c.Sandbox.SignalContainer(c.ID, sig, all) +} + +// ForwardSignals forwards all signals received by the current process to the +// container process inside the sandbox. It returns a function that will stop +// forwarding signals. +func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() { + log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh) + go func() { + for s := range sigCh { + log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", s, c.ID, pid, fgProcess) + if err := c.Sandbox.SignalProcess(c.ID, pid, s.(syscall.Signal), fgProcess); err != nil { + log.Warningf("error forwarding signal %d to container %q: %v", s, c.ID, err) + } + } + log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) + }() + + return func() { + signal.Stop(sigCh) + close(sigCh) + } } // Checkpoint sends the checkpoint call to the container. 
@@ -683,9 +706,9 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund if err != nil { return nil, err } - sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) - goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD") defer goferEnd.Close() goferEnds = append(goferEnds, goferEnd) @@ -710,7 +733,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund if err := specutils.StartInNS(cmd, nss); err != nil { return nil, err } - log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid return sandEnds, nil } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index aebfb2878..84b59ffd8 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -31,6 +31,7 @@ import ( "time" "github.com/cenkalti/backoff" + "github.com/kr/pty" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -38,6 +39,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -1577,6 +1579,121 @@ func TestRootNotMount(t *testing.T) { } } +func TestJobControlSignalExec(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + conf := testutil.TestConfig() + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. 
+ c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Create a pty master/slave. The slave will be passed to the exec + // process. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + t.Fatalf("error opening pty: %v", err) + } + defer ptyMaster.Close() + defer ptySlave.Close() + + // Exec bash and attach a terminal. + args := &control.ExecArgs{ + Filename: "/bin/bash", + // Don't let bash execute from profile or rc files, otherwise + // our PID counts get messed up. + Argv: []string{"/bin/bash", "--noprofile", "--norc"}, + // Pass the pty slave as FD 0, 1, and 2. + FilePayload: urpc.FilePayload{ + Files: []*os.File{ptySlave, ptySlave, ptySlave}, + }, + StdioIsPty: true, + } + + pid, err := c.Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } + if pid != 2 { + t.Fatalf("exec got pid %d, wanted %d", pid, 2) + } + + // Make sure all the processes are running. + expectedPL := []*control.Process{ + // Root container process. + {PID: 1, Cmd: "sleep"}, + // Bash from exec process. + {PID: 2, Cmd: "bash"}, + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Execute sleep. + ptyMaster.Write([]byte("sleep 100\n")) + + // Wait for it to start. Sleep's PPID is bash's PID. + expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"}) + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Send a SIGTERM to the foreground process for the exec PID. Note that + // although we pass in the PID of "bash", it should actually terminate + // "sleep", since that is the foreground process. 
+ if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGTERM, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Sleep process should be gone. + expectedPL = expectedPL[:len(expectedPL)-1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Sleep is dead, but it may take more time for bash to notice and + // change the foreground process back to itself. We know it is done + // when bash writes "Terminated" to the pty. + if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { + t.Fatalf("bash did not take over pty: %v", err) + } + + // Send a SIGKILL to the foreground process again. This time "bash" + // should be killed. We use SIGKILL instead of SIGTERM or SIGINT + // because bash ignores those. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGKILL, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + expectedPL = expectedPL[:1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Make sure the process indicates it was killed by a SIGKILL. + ws, err := c.WaitPID(pid, true) + if err != nil { + t.Errorf("waiting on container failed: %v", err) + } + if !ws.Signaled() { + t.Error("ws.Signaled() got false, want true") + } + if got, want := ws.Signal(), syscall.SIGKILL; got != want { + t.Errorf("ws.Signal() got %v, want %v", got, want) + } +} + // executeSync synchronously executes a new process. 
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index e5f7daf60..ab200b75c 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -477,11 +477,12 @@ func TestMultiContainerDestroy(t *testing.T) { } func TestMultiContainerProcesses(t *testing.T) { - // Note: use 'while true' to keep 'sh' process around. Otherwise, shell will - // just execve into 'sleep' and both containers will look the same. + // Note: use curly braces to keep 'sh' process around. Otherwise, shell + // will just execve into 'sleep' and both containers will look the + // same. specs, ids := createSpecs( []string{"sleep", "100"}, - []string{"sh", "-c", "while true; do sleep 100; done"}) + []string{"sh", "-c", "{ sleep 100; }"}) conf := testutil.TestConfig() containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 4111b1a60..e4853af69 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -80,7 +80,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // StartRoot starts running the root container process inside the sandbox. 
func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { - log.Debugf("Start root sandbox %q, pid: %d", s.ID, s.Pid) + log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid) conn, err := s.sandboxConnect() if err != nil { return err @@ -107,7 +107,7 @@ func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, goferFi defer f.Close() } - log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) + log.Debugf("Start non-root container sandbox %q, PID: %d", s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { return fmt.Errorf("couldn't connect to sandbox: %v", err) @@ -147,7 +147,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str SandboxID: s.ID, } - // If the platform needs a device fd we must pass it in. + // If the platform needs a device FD we must pass it in. if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { return err } else if deviceFile != nil { @@ -192,7 +192,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { return pl, nil } -// Execute runs the specified command in the container. It returns the pid of +// Execute runs the specified command in the container. It returns the PID of // the newly created process. func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) @@ -239,7 +239,7 @@ func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { } func (s *Sandbox) connError(err error) error { - return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + return fmt.Errorf("error connecting to control server at PID %d: %v", s.Pid, err) } // createSandboxProcess starts the sandbox as a subprocess by running the "boot" @@ -322,7 +322,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund nextFD++ } - // If the platform needs a device fd we must pass it in. 
+ // If the platform needs a device FD we must pass it in. if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { return err } else if deviceFile != nil { @@ -338,7 +338,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Stderr = os.Stderr // If the console control socket file is provided, then create a new - // pty master/slave pair and set the tty on the sandbox process. + // pty master/slave pair and set the TTY on the sandbox process. if consoleEnabled { // console.NewWithSocket will send the master on the socket, // and return the slave. @@ -461,7 +461,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Add container as the last argument. cmd.Args = append(cmd.Args, s.ID) - // Log the fds we are donating to the sandbox process. + // Log the FDs we are donating to the sandbox process. for i, f := range cmd.ExtraFiles { log.Debugf("Donating FD %d: %q", i+3, f.Name()) } @@ -472,7 +472,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund return err } s.Pid = cmd.Process.Pid - log.Infof("Sandbox started, pid: %d", s.Pid) + log.Infof("Sandbox started, PID: %d", s.Pid) return nil } @@ -572,9 +572,10 @@ func (s *Sandbox) destroy() error { return nil } -// Signal sends the signal to a container in the sandbox. If all is true and -// signal is SIGKILL, then waits for all processes to exit before returning. -func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error { +// SignalContainer sends the signal to a container in the sandbox. If all is +// true and signal is SIGKILL, then waits for all processes to exit before +// returning. 
+func (s *Sandbox) SignalContainer(cid string, sig syscall.Signal, all bool) error { log.Debugf("Signal sandbox %q", s.ID) conn, err := s.sandboxConnect() if err != nil { @@ -593,6 +594,30 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error { return nil } +// SignalProcess sends the signal to a particular process in the container. If +// fgProcess is true, then the signal is sent to the foreground process group +// in the same session that PID belongs to. This is only valid if the process +// is attached to a host TTY. +func (s *Sandbox) SignalProcess(cid string, pid int32, sig syscall.Signal, fgProcess bool) error { + log.Debugf("Signal sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + args := boot.SignalProcessArgs{ + CID: cid, + Signo: int32(sig), + PID: pid, + SendToForegroundProcess: fgProcess, + } + if err := conn.Call(boot.ContainerSignalProcess, &args, nil); err != nil { + return fmt.Errorf("err signaling container %q PID %d: %v", cid, pid, err) + } + return nil +} + // Checkpoint sends the checkpoint call for a container in the sandbox. // The statefile will be written to f. func (s *Sandbox) Checkpoint(cid string, f *os.File) error { diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index 910c36597..ddd088223 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -27,6 +27,7 @@ package integration import ( + "syscall" "testing" "time" @@ -60,3 +61,57 @@ func TestExecCapabilities(t *testing.T) { t.Errorf("wrong capabilities, got: %q, want: %q", got, want) } } + +func TestExecJobControl(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("exec-test") + + // Start the container. 
+ if _, err := d.Run("alpine", "sleep", "1000"); err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer d.CleanUp() + + // Exec 'sh' with an attached pty. + cmd, ptmx, err := d.ExecWithTerminal("sh") + if err != nil { + t.Fatalf("docker exec failed: %v", err) + } + defer ptmx.Close() + + // Call "sleep 100" in the shell. + if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Give shell a few seconds to start executing the sleep. + time.Sleep(2 * time.Second) + + // Send a ^C to the pty, which should kill sleep, but not the shell. + // \x03 is ASCII "end of text", which is the same as ^C. + if _, err := ptmx.Write([]byte{'\x03'}); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // The shell should still be alive at this point. Sleep should have + // exited with code 2+128=130. We'll exit with 10 plus that number, so + // that we can be sure that the shell did not get signalled. + if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Exec process should exit with code 10+130=140. 
+ ps, err := cmd.Process.Wait() + if err != nil { + t.Fatalf("error waiting for exec process: %v", err) + } + ws := ps.Sys().(syscall.WaitStatus) + if !ws.Exited() { + t.Errorf("ws.Exited got false, want true") + } + if got, want := ws.ExitStatus(), 140; got != want { + t.Errorf("ws.ExitedStatus got %d, want %d", got, want) + } +} diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index ca91e07ff..da2535bfa 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -17,6 +17,7 @@ go_library( "//runsc/boot", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", ], diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 7f5909987..55ca353b8 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -26,6 +26,8 @@ import ( "strconv" "strings" "time" + + "github.com/kr/pty" ) func init() { @@ -131,6 +133,17 @@ func do(args ...string) (string, error) { return string(out), nil } +// doWithPty executes docker command with stdio attached to a pty. +func doWithPty(args ...string) (*exec.Cmd, *os.File, error) { + fmt.Printf("Running with pty: docker %s\n", args) + cmd := exec.Command("docker", args...) + ptmx, err := pty.Start(cmd) + if err != nil { + return nil, nil, fmt.Errorf("error executing docker %s with a pty: %v", args, err) + } + return cmd, ptmx, nil +} + // Pull pulls a docker image. This is used in tests to isolate the // time to pull the image off the network from the time to actually // start the container, to avoid timeouts over slow networks. @@ -197,6 +210,14 @@ func (d *Docker) Exec(args ...string) (string, error) { return do(a...) } +// ExecWithTerminal calls 'docker exec -it' with the arguments provided and +// attaches a pty to stdio. 
+func (d *Docker) ExecWithTerminal(args ...string) (*exec.Cmd, *os.File, error) { + a := []string{"exec", "-it", d.Name} + a = append(a, args...) + return doWithPty(a...) +} + // Pause calls 'docker pause'. func (d *Docker) Pause() error { if _, err := do("pause", d.Name); err != nil { diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 07d66e469..cdc7f78c3 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -16,6 +16,7 @@ package testutil import ( + "bufio" "context" "encoding/json" "fmt" @@ -27,6 +28,8 @@ import ( "os/signal" "path/filepath" "runtime" + "strings" + "sync/atomic" "syscall" "time" @@ -315,3 +318,36 @@ func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { } } } + +// WaitUntilRead reads from the given reader until the wanted string is found +// or until timeout. +func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error { + sc := bufio.NewScanner(r) + if split != nil { + sc.Split(split) + } + // done must be accessed atomically. A value greater than 0 indicates + // that the read loop can exit. 
+ var done uint32 + doneCh := make(chan struct{}) + go func() { + for sc.Scan() { + t := sc.Text() + if strings.Contains(t, want) { + atomic.StoreUint32(&done, 1) + close(doneCh) + break + } + if atomic.LoadUint32(&done) > 0 { + break + } + } + }() + select { + case <-time.After(timeout): + atomic.StoreUint32(&done, 1) + return fmt.Errorf("timeout waiting to read %q", want) + case <-doneCh: + return nil + } +} -- cgit v1.2.3 From 4fef31f96c289d5e58c3c2997ee38fcb22c0378f Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 3 Oct 2018 17:02:05 -0700 Subject: Add S/R support for FIOASYNC PiperOrigin-RevId: 215655197 Change-Id: I668b1bc7c29daaf2999f8f759138bcbb09c4de6f --- pkg/sentry/fs/file.go | 10 +++++++++- pkg/sentry/fs/file_state.go | 11 +++++++++++ pkg/sentry/kernel/fasync/fasync.go | 4 +++- pkg/waiter/waiter.go | 2 ++ 4 files changed, 25 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 904827a3e..36794d378 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -85,6 +85,9 @@ type File struct { // async handles O_ASYNC notifications. async FileAsync + // saving indicates that this file is in the process of being saved. + saving bool `state:"nosave"` + // mu is dual-purpose: first, to make read(2) and write(2) thread-safe // in conformity with POSIX, and second, to cancel operations before they // begin in response to interruptions (i.e. signals). @@ -127,10 +130,15 @@ func (f *File) DecRef() { // Release a reference on the Dirent. f.Dirent.DecRef() + // Only unregister if we are currently registered. There is nothing + // to register if f.async is nil (this happens when async mode is + // enabled without setting an owner). Also, we unregister during + // save. 
f.flagsMu.Lock() - if f.flags.Async && f.async != nil { + if !f.saving && f.flags.Async && f.async != nil { f.async.Unregister(f) } + f.async = nil f.flagsMu.Unlock() }) } diff --git a/pkg/sentry/fs/file_state.go b/pkg/sentry/fs/file_state.go index 3384737ab..f848d1b79 100644 --- a/pkg/sentry/fs/file_state.go +++ b/pkg/sentry/fs/file_state.go @@ -14,7 +14,18 @@ package fs +// beforeSave is invoked by stateify. +func (f *File) beforeSave() { + f.saving = true + if f.flags.Async && f.async != nil { + f.async.Unregister(f) + } +} + // afterLoad is invoked by stateify. func (f *File) afterLoad() { f.mu.Init() + if f.flags.Async && f.async != nil { + f.async.Register(f) + } } diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 69c7970fa..7d01abe90 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -32,8 +32,10 @@ func New() fs.FileAsync { } // FileAsync sends signals when the registered file is ready for IO. +// +// +stateify savable type FileAsync struct { - mu sync.Mutex + mu sync.Mutex `state:"nosave"` e waiter.Entry requester *auth.Credentials diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 9825880ca..832b6a5a9 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -113,6 +113,8 @@ type EntryCallback interface { // Entry represents a waiter that can be add to the a wait queue. It can // only be in one queue at a time, and is added "intrusively" to the queue with // no extra memory allocations. +// +// +stateify savable type Entry struct { // Context stores any state the waiter may wish to store in the entry // itself, which may be used at wake up time. -- cgit v1.2.3 From 213f6688a56e7bcd1205c77dc79c3e5cfee817fe Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 3 Oct 2018 17:28:52 -0700 Subject: Implement TIOCSCTTY ioctl as a noop. 
PiperOrigin-RevId: 215658757 Change-Id: If63b33293f3e53a7f607ae72daa79e2b7ef6fcfd --- pkg/abi/linux/ioctl.go | 1 + pkg/sentry/fs/tty/slave.go | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 1c9dc7b03..afd9ee82b 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -22,6 +22,7 @@ const ( TCSETS = 0x00005402 TCSETSW = 0x00005403 TCSETSF = 0x00005404 + TIOCSCTTY = 0x0000540e TIOCGPGRP = 0x0000540f TIOCSPGRP = 0x00005410 TIOCOUTQ = 0x00005411 diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 963331b9b..4a0d4fdb9 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -154,6 +154,12 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args ar return 0, sf.si.t.ld.windowSize(ctx, io, args) case linux.TIOCSWINSZ: return 0, sf.si.t.ld.setWindowSize(ctx, io, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + // TODO: Implement once we have support for job + // control. + return 0, nil default: return 0, syserror.ENOTTY } -- cgit v1.2.3 From beac59b37a8b0ea834904870e5c236d2627947a2 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 3 Oct 2018 20:21:25 -0700 Subject: Fix panic if FIOASYNC callback is registered and triggered without target PiperOrigin-RevId: 215674589 Change-Id: I4f8871b64c570dc6da448d2fe351cec8a406efeb --- pkg/sentry/kernel/fasync/fasync.go | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 7d01abe90..f77339cae 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -60,6 +60,11 @@ func (a *FileAsync) Callback(e *waiter.Entry) { if tg != nil { t = tg.Leader() } + if t == nil { + // No recipient has been registered. 
+ a.mu.Unlock() + return + } c := t.Credentials() // Logic from sigio_perm in fs/fcntl.c. if a.requester.EffectiveKUID == 0 || -- cgit v1.2.3 From e9e8be661328661b5527f1643727b9a13bbeab48 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 8 Oct 2018 10:19:27 -0700 Subject: Implement shared futexes. - Shared futex objects on shared mappings are represented by Mappable + offset, analogous to Linux's use of inode + offset. Add type futex.Key, and change the futex.Manager bucket API to use futex.Keys instead of addresses. - Extend the futex.Checker interface to be able to return Keys for memory mappings. It returns Keys rather than just mappings because whether the address or the target of the mapping is used in the Key depends on whether the mapping is MAP_SHARED or MAP_PRIVATE; this matters because using mapping target for a futex on a MAP_PRIVATE mapping causes it to stop working across COW-breaking. - futex.Manager.WaitComplete depends on atomic updates to futex.Waiter.addr to determine when it has locked the right bucket, which is much less straightforward for struct futex.Waiter.key. Switch to an atomically-accessed futex.Waiter.bucket pointer. - futex.Manager.Wake now needs to take a futex.Checker to resolve addresses for shared futexes. CLONE_CHILD_CLEARTID requires the exit path to perform a shared futex wakeup (Linux: kernel/fork.c:mm_release() => sys_futex(tsk->clear_child_tid, FUTEX_WAKE, ...)). This is a problem because futexChecker is in the syscalls/linux package. Move it to kernel. 
PiperOrigin-RevId: 216207039 Change-Id: I708d68e2d1f47e526d9afd95e7fed410c84afccf --- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/futex/BUILD | 17 +- pkg/sentry/kernel/futex/futex.go | 382 ++++++++++------ pkg/sentry/kernel/futex/futex_test.go | 765 +++++++++++++++++---------------- pkg/sentry/kernel/kernel.go | 7 + pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_context.go | 15 +- pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/kernel/task_futex.go | 148 +++++++ pkg/sentry/mm/BUILD | 1 + pkg/sentry/mm/syscalls.go | 35 ++ pkg/sentry/syscalls/linux/sys_futex.go | 135 +----- 12 files changed, 880 insertions(+), 630 deletions(-) create mode 100644 pkg/sentry/kernel/task_futex.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 7eb2bffeb..31ad96612 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -95,6 +95,7 @@ go_library( "task_context.go", "task_exec.go", "task_exit.go", + "task_futex.go", "task_identity.go", "task_list.go", "task_log.go", diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 0ff5b0a95..e13fcb5ff 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -3,6 +3,17 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +go_template_instance( + name = "atomicptr_bucket", + out = "atomicptr_bucket.go", + package = "futex", + suffix = "Bucket", + template = "//pkg/sync:generic_atomicptr", + types = { + "Value": "bucket", + }, +) + go_template_instance( name = "waiter_list", out = "waiter_list.go", @@ -18,12 +29,16 @@ go_template_instance( go_library( name = "futex", srcs = [ + "atomicptr_bucket.go", "futex.go", "waiter_list.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex", visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/syserror"], + deps = [ + "//pkg/sentry/memmap", + 
"//pkg/syserror", + ], ) go_test( diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 4a1f2a0ef..54b1982a0 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -19,11 +19,78 @@ package futex import ( "sync" - "sync/atomic" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/syserror" ) +// KeyKind indicates the type of a Key. +type KeyKind int + +const ( + // KindPrivate indicates a private futex (a futex syscall with the + // FUTEX_PRIVATE_FLAG set). + KindPrivate KeyKind = iota + + // KindSharedPrivate indicates a shared futex on a private memory mapping. + // Although KindPrivate and KindSharedPrivate futexes both use memory + // addresses to identify futexes, they do not interoperate (in Linux, the + // two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key + // comparison). + KindSharedPrivate + + // KindSharedMappable indicates a shared futex on a memory mapping other + // than a private anonymous memory mapping. + KindSharedMappable +) + +// Key represents something that a futex waiter may wait on. +type Key struct { + // Kind is the type of the Key. + Kind KeyKind + + // Mappable is the memory-mapped object that is represented by the Key. + // Mappable is always nil if Kind is not KindSharedMappable, and may be nil + // even if it is. + Mappable memmap.Mappable + + // MappingIdentity is the MappingIdentity associated with Mappable. + // MappingIdentity is always nil is Mappable is nil, and may be nil even if + // it isn't. + MappingIdentity memmap.MappingIdentity + + // If Kind is KindPrivate or KindSharedPrivate, Offset is the represented + // memory address. Otherwise, Offset is the represented offset into + // Mappable. 
+ Offset uint64 +} + +func (k *Key) release() { + if k.MappingIdentity != nil { + k.MappingIdentity.DecRef() + } + k.Mappable = nil + k.MappingIdentity = nil +} + +func (k *Key) clone() Key { + if k.MappingIdentity != nil { + k.MappingIdentity.IncRef() + } + return *k +} + +// Preconditions: k.Kind == KindPrivate or KindSharedPrivate. +func (k *Key) addr() uintptr { + return uintptr(k.Offset) +} + +// matches returns true if a wakeup on k2 should wake a waiter waiting on k. +func (k *Key) matches(k2 *Key) bool { + // k.MappingIdentity is ignored; it's only used for reference counting. + return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset +} + // Checker abstracts memory accesses. This is useful because the "addresses" // used in this package may not be real addresses (they could be indices of an // array, for example), or they could be mapped via some special mechanism. @@ -41,6 +108,14 @@ type Checker interface { // Note that op is an opaque operation whose behaviour is defined // outside of the futex manager. Op(addr uintptr, op uint32) (bool, error) + + // GetSharedKey returns a Key with kind KindSharedPrivate or + // KindSharedMappable corresponding to the memory mapped at address addr. + // + // If GetSharedKey returns a Key with a non-nil MappingIdentity, a + // reference is held on the MappingIdentity, which must be dropped by the + // caller when the Key is no longer in use. + GetSharedKey(addr uintptr) (Key, error) } // Waiter is the struct which gets enqueued into buckets for wake up routines @@ -53,11 +128,11 @@ type Waiter struct { // synchronization applies). // // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this, - // waiterEntry, complete, and addr are protected by the bucket.mu ("bucket - // lock") of the containing bucket, and bitmask is immutable. 
complete and - // addr are additionally mutated using atomic memory operations, ensuring - // that they can be read using atomic memory operations without holding the - // bucket lock. + // waiterEntry, bucket, and key are protected by the bucket.mu ("bucket + // lock") of the containing bucket, and bitmask is immutable. Note that + // since bucket is mutated using atomic memory operations, bucket.Load() + // may be called without holding the bucket lock, although it may change + // racily. See WaitComplete(). // // - A Waiter is only guaranteed to be no longer queued after calling // WaitComplete(). @@ -65,15 +140,15 @@ type Waiter struct { // waiterEntry links Waiter into bucket.waiters. waiterEntry - // complete is 1 if the Waiter was removed from its bucket by a wakeup and - // 0 otherwise. - complete int32 + // bucket is the bucket this waiter is queued in. If bucket is nil, the + // waiter is not waiting and is not in any bucket. + bucket AtomicPtrBucket // C is sent to when the Waiter is woken. C chan struct{} - // addr is the address being waited on. - addr uintptr + // key is what this waiter is waiting on. + key Key // The bitmask we're waiting on. // This is used the case of a FUTEX_WAKE_BITSET. @@ -87,7 +162,14 @@ func NewWaiter() *Waiter { } } +// woken returns true if w has been woken since the last call to WaitPrepare. +func (w *Waiter) woken() bool { + return len(w.C) != 0 +} + // bucket holds a list of waiters for a given address hash. +// +// +stateify savable type bucket struct { // mu protects waiters and contained Waiter state. See comment in Waiter. mu sync.Mutex `state:"nosave"` @@ -99,10 +181,10 @@ type bucket struct { // bucket and returns the number of waiters woken. // // Preconditions: b.mu must be locked. 
-func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int { +func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { - if w.addr != addr || w.bitmask&bitmask == 0 { + if !w.key.matches(key) || w.bitmask&bitmask == 0 { // Not matching. w = w.Next() continue @@ -114,15 +196,15 @@ func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int { b.waiters.Remove(woke) woke.C <- struct{}{} - // NOTE: The above channel write establishes a write barrier - // according to the memory model, so nothing may be ordered - // around it. Since we've dequeued w and will never touch it - // again, we can safely store 1 to w.complete here and allow - // the WaitComplete() to short-circuit grabbing the bucket - // lock. If they somehow miss the w.complete, we are still - // holding the lock, so we can know that they won't dequeue w, - // assume it's free and have the below operation afterwards. - atomic.StoreInt32(&woke.complete, 1) + // NOTE: The above channel write establishes a write barrier according + // to the memory model, so nothing may be ordered around it. Since + // we've dequeued woke and will never touch it again, we can safely + // store nil to woke.bucket here and allow the WaitComplete() to + // short-circuit grabbing the bucket lock. If they somehow miss the + // store, we are still holding the lock, so we can know that they won't + // dequeue woke, assume it's free and have the below operation + // afterwards. + woke.bucket.Store(nil) done++ } return done @@ -132,10 +214,10 @@ func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int { // bucket "to". // // Preconditions: b and to must be locked. 
-func (b *bucket) requeueLocked(to *bucket, addr, naddr uintptr, n int) int { +func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { - if w.addr != addr { + if !w.key.matches(key) { // Not matching. w = w.Next() continue @@ -144,8 +226,10 @@ func (b *bucket) requeueLocked(to *bucket, addr, naddr uintptr, n int) int { requeued := w w = w.Next() // Next iteration. b.waiters.Remove(requeued) - atomic.StoreUintptr(&requeued.addr, naddr) + requeued.key.release() + requeued.key = nkey.clone() to.waiters.PushBack(requeued) + requeued.bucket.Store(to) done++ } return done @@ -158,19 +242,22 @@ const ( bucketCountBits = 10 ) -func checkAddr(addr uintptr) error { +// getKey returns a Key representing address addr in c. +func getKey(c Checker, addr uintptr, private bool) (Key, error) { // Ensure the address is aligned. // It must be a DWORD boundary. if addr&0x3 != 0 { - return syserror.EINVAL + return Key{}, syserror.EINVAL } - - return nil + if private { + return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil + } + return c.GetSharedKey(addr) } // bucketIndexForAddr returns the index into Manager.buckets for addr. func bucketIndexForAddr(addr uintptr) uintptr { - // - The bottom 2 bits of addr must be 0, per checkAddr. + // - The bottom 2 bits of addr must be 0, per getKey. // // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 // for a canonical address, and (on all existing platforms) bit 47 must be @@ -199,171 +286,216 @@ func bucketIndexForAddr(addr uintptr) uintptr { // // +stateify savable type Manager struct { - buckets [bucketCount]bucket `state:"zerovalue"` + // privateBuckets holds buckets for KindPrivate and KindSharedPrivate + // futexes. + privateBuckets [bucketCount]bucket `state:"zerovalue"` + + // sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket + // may be shared by multiple Managers. The sharedBucket pointer is + // immutable. 
+ sharedBucket *bucket } // NewManager returns an initialized futex manager. -// N.B. we use virtual address to tag futexes, so it only works for private -// (within a single process) futex. func NewManager() *Manager { - return &Manager{} + return &Manager{ + sharedBucket: &bucket{}, + } } -// lockBucket returns a locked bucket for the given addr. -// -// Preconditions: checkAddr(addr) == nil. -func (m *Manager) lockBucket(addr uintptr) *bucket { - b := &m.buckets[bucketIndexForAddr(addr)] +// Fork returns a new Manager. Shared futex clients using the returned Manager +// may interoperate with those using m. +func (m *Manager) Fork() *Manager { + return &Manager{ + sharedBucket: m.sharedBucket, + } +} + +// lockBucket returns a locked bucket for the given key. +func (m *Manager) lockBucket(k *Key) *bucket { + var b *bucket + if k.Kind == KindSharedMappable { + b = m.sharedBucket + } else { + b = &m.privateBuckets[bucketIndexForAddr(k.addr())] + } b.mu.Lock() return b } -// lockBuckets returns locked buckets for the given addrs. -// -// Preconditions: checkAddr(addr1) == checkAddr(addr2) == nil. -func (m *Manager) lockBuckets(addr1 uintptr, addr2 uintptr) (*bucket, *bucket) { - i1 := bucketIndexForAddr(addr1) - i2 := bucketIndexForAddr(addr2) - b1 := &m.buckets[i1] - b2 := &m.buckets[i2] - - // Ensure that buckets are locked in a consistent order (lowest index - // first) to avoid circular locking. - switch { - case i1 < i2: - b1.mu.Lock() - b2.mu.Lock() - case i2 < i1: - b2.mu.Lock() - b1.mu.Lock() - default: - b1.mu.Lock() +// lockBuckets returns locked buckets for the given keys. +func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { + // Buckets must be consistently ordered to avoid circular lock + // dependencies. We order buckets in m.privateBuckets by index (lowest + // index first), and all buckets in m.privateBuckets precede + // m.sharedBucket. 
+ + // Handle the common case first: + if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { + i1 := bucketIndexForAddr(k1.addr()) + i2 := bucketIndexForAddr(k2.addr()) + b1 := &m.privateBuckets[i1] + b2 := &m.privateBuckets[i2] + switch { + case i1 < i2: + b1.mu.Lock() + b2.mu.Lock() + case i2 < i1: + b2.mu.Lock() + b1.mu.Lock() + default: + b1.mu.Lock() + } + return b1, b2 } + // At least one of b1 or b2 should be m.sharedBucket. + b1 := m.sharedBucket + b2 := m.sharedBucket + if k1.Kind != KindSharedMappable { + b1 = m.lockBucket(k1) + } else if k2.Kind != KindSharedMappable { + b2 = m.lockBucket(k2) + } + m.sharedBucket.mu.Lock() return b1, b2 } // Wake wakes up to n waiters matching the bitmask on the given addr. // The number of waiters woken is returned. -func (m *Manager) Wake(addr uintptr, bitmask uint32, n int) (int, error) { - if err := checkAddr(addr); err != nil { +func (m *Manager) Wake(c Checker, addr uintptr, private bool, bitmask uint32, n int) (int, error) { + // This function is very hot; avoid defer. + k, err := getKey(c, addr, private) + if err != nil { return 0, err } - b := m.lockBucket(addr) - // This function is very hot; avoid defer. 
- r := b.wakeLocked(addr, bitmask, n) + b := m.lockBucket(&k) + r := b.wakeLocked(&k, bitmask, n) + b.mu.Unlock() + k.release() return r, nil } -func (m *Manager) doRequeue(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) { - if err := checkAddr(addr); err != nil { +func (m *Manager) doRequeue(c Checker, addr, naddr uintptr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { + k1, err := getKey(c, addr, private) + if err != nil { return 0, err } - if err := checkAddr(naddr); err != nil { + defer k1.release() + k2, err := getKey(c, naddr, private) + if err != nil { return 0, err } + defer k2.release() - b1, b2 := m.lockBuckets(addr, naddr) + b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() if b2 != b1 { defer b2.mu.Unlock() } - // Check our value. - // This only applied for RequeueCmp(). - if c != nil { + if checkval { if err := c.Check(addr, val); err != nil { return 0, err } } // Wake the number required. - done := b1.wakeLocked(addr, ^uint32(0), nwake) + done := b1.wakeLocked(&k1, ^uint32(0), nwake) // Requeue the number required. - b1.requeueLocked(b2, addr, naddr, nreq) + b1.requeueLocked(b2, &k1, &k2, nreq) return done, nil } // Requeue wakes up to nwake waiters on the given addr, and unconditionally // requeues up to nreq waiters on naddr. -func (m *Manager) Requeue(addr uintptr, naddr uintptr, nwake int, nreq int) (int, error) { - return m.doRequeue(nil, addr, 0, naddr, nwake, nreq) +func (m *Manager) Requeue(c Checker, addr, naddr uintptr, private bool, nwake int, nreq int) (int, error) { + return m.doRequeue(c, addr, naddr, private, false, 0, nwake, nreq) } // RequeueCmp atomically checks that the addr contains val (via the Checker), // wakes up to nwake waiters on addr and then unconditionally requeues nreq // waiters on naddr. 
-func (m *Manager) RequeueCmp(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) { - return m.doRequeue(c, addr, val, naddr, nwake, nreq) +func (m *Manager) RequeueCmp(c Checker, addr, naddr uintptr, private bool, val uint32, nwake int, nreq int) (int, error) { + return m.doRequeue(c, addr, naddr, private, true, val, nwake, nreq) } // WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 // waiters unconditionally from addr1, and, based on the original value at addr2 // and a comparison encoded in op, wakes up to nwake2 waiters from addr2. // It returns the total number of waiters woken. -func (m *Manager) WakeOp(c Checker, addr1 uintptr, addr2 uintptr, nwake1 int, nwake2 int, op uint32) (int, error) { - if err := checkAddr(addr1); err != nil { +func (m *Manager) WakeOp(c Checker, addr1, addr2 uintptr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { + k1, err := getKey(c, addr1, private) + if err != nil { return 0, err } - if err := checkAddr(addr2); err != nil { + defer k1.release() + k2, err := getKey(c, addr2, private) + if err != nil { return 0, err } + defer k2.release() - b1, b2 := m.lockBuckets(addr1, addr2) + b1, b2 := m.lockBuckets(&k1, &k2) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } done := 0 cond, err := c.Op(addr2, op) - if err == nil { - // Wake up up to nwake1 entries from the first bucket. - done = b1.wakeLocked(addr1, ^uint32(0), nwake1) - - // Wake up up to nwake2 entries from the second bucket if the - // operation yielded true. - if cond { - done += b2.wakeLocked(addr2, ^uint32(0), nwake2) - } + if err != nil { + return 0, err } - b1.mu.Unlock() - if b2 != b1 { - b2.mu.Unlock() + // Wake up up to nwake1 entries from the first bucket. + done = b1.wakeLocked(&k1, ^uint32(0), nwake1) + + // Wake up up to nwake2 entries from the second bucket if the + // operation yielded true. 
+ if cond { + done += b2.wakeLocked(&k2, ^uint32(0), nwake2) } - return done, err + + return done, nil } // WaitPrepare atomically checks that addr contains val (via the Checker), then // enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the // Waiter must be subsequently removed by calling WaitComplete, whether or not // a wakeup is received on w.C. -func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, val uint32, bitmask uint32) error { - if err := checkAddr(addr); err != nil { +func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, private bool, val uint32, bitmask uint32) error { + k, err := getKey(c, addr, private) + if err != nil { return err } + // Ownership of k is transferred to w below. // Prepare the Waiter before taking the bucket lock. - w.complete = 0 select { case <-w.C: default: } - w.addr = addr + w.key = k w.bitmask = bitmask - b := m.lockBucket(addr) + b := m.lockBucket(&k) // This function is very hot; avoid defer. // Perform our atomic check. if err := c.Check(addr, val); err != nil { b.mu.Unlock() + w.key.release() return err } // Add the waiter to the bucket. b.waiters.PushBack(w) + w.bucket.Store(b) b.mu.Unlock() return nil @@ -372,36 +504,36 @@ func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, val uint32, bi // WaitComplete must be called when a Waiter previously added by WaitPrepare is // no longer eligible to be woken. func (m *Manager) WaitComplete(w *Waiter) { - // Can we short-circuit acquiring the lock? - // This is the happy path where a notification - // was received and we don't need to dequeue this - // waiter from any list (or take any locks). - if atomic.LoadInt32(&w.complete) != 0 { - return - } - - // Take the bucket lock. 
Note that without holding the bucket lock, the - // waiter is not guaranteed to stay in that bucket, so after we take the - // bucket lock, we must ensure that the bucket hasn't changed: if it - // happens to have changed, we release the old bucket lock and try again - // with the new bucket; if it hasn't changed, we know it won't change now - // because we hold the lock. - var b *bucket + // Remove w from the bucket it's in. for { - addr := atomic.LoadUintptr(&w.addr) - b = m.lockBucket(addr) - // We still have to use an atomic load here, because if w was racily - // requeued then w.addr is not protected by b.mu. - if addr == atomic.LoadUintptr(&w.addr) { + b := w.bucket.Load() + + // If b is nil, the waiter isn't in any bucket anymore. This can't be + // racy because the waiter can't be concurrently re-queued in another + // bucket. + if b == nil { break } - b.mu.Unlock() - } - // Remove waiter from the bucket. w.complete can only be stored with b.mu - // locked, so this load doesn't need to use sync/atomic. - if w.complete == 0 { + // Take the bucket lock. Note that without holding the bucket lock, the + // waiter is not guaranteed to stay in that bucket, so after we take + // the bucket lock, we must ensure that the bucket hasn't changed: if + // it happens to have changed, we release the old bucket lock and try + // again with the new bucket; if it hasn't changed, we know it won't + // change now because we hold the lock. + b.mu.Lock() + if b != w.bucket.Load() { + b.mu.Unlock() + continue + } + + // Remove w from b. b.waiters.Remove(w) + w.bucket.Store(nil) + b.mu.Unlock() + break } - b.mu.Unlock() + + // Release references held by the waiter. 
+ w.key.release() } diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 7b81358ec..726c26990 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -24,17 +24,13 @@ import ( "unsafe" ) -const ( - testMutexSize = 4 - testMutexLocked uint32 = 1 - testMutexUnlocked uint32 = 0 -) - // testData implements the Checker interface, and allows us to // treat the address passed for futex operations as an index in // a byte slice for testing simplicity. type testData []byte +const sizeofInt32 = 4 + func newTestData(size uint) testData { return make([]byte, size) } @@ -50,451 +46,478 @@ func (t testData) Op(addr uintptr, val uint32) (bool, error) { return val == 0, nil } -// testMutex ties together a testData slice, an address, and a -// futex manager in order to implement the sync.Locker interface. -// Beyond being used as a Locker, this is a simple mechanism for -// changing the underlying values for simpler tests. -type testMutex struct { - a uintptr - d testData - m *Manager -} - -func newTestMutex(addr uintptr, d testData, m *Manager) *testMutex { - return &testMutex{a: addr, d: d, m: m} +func (t testData) GetSharedKey(addr uintptr) (Key, error) { + return Key{ + Kind: KindSharedMappable, + Offset: uint64(addr), + }, nil } -// Lock acquires the testMutex. -// This may wait for it to be available via the futex manager. -func (t *testMutex) Lock() { - for { - // Attempt to grab the lock. - if atomic.CompareAndSwapUint32( - ((*uint32)(unsafe.Pointer(&t.d[t.a]))), - testMutexUnlocked, - testMutexLocked) { - // Lock held. - return - } - - // Wait for it to be "not locked". - w := NewWaiter() - err := t.m.WaitPrepare(w, t.d, t.a, testMutexLocked, ^uint32(0)) - if err == syscall.EAGAIN { - continue - } - if err != nil { - // Should never happen. 
- panic("WaitPrepare returned unexpected error: " + err.Error()) - } - <-w.C - t.m.WaitComplete(w) +func futexKind(private bool) string { + if private { + return "private" } + return "shared" } -// Unlock releases the testMutex. -// This will notify any waiters via the futex manager. -func (t *testMutex) Unlock() { - // Unlock. - atomic.StoreUint32(((*uint32)(unsafe.Pointer(&t.d[t.a]))), testMutexUnlocked) - - // Notify all waiters. - t.m.Wake(t.a, ^uint32(0), math.MaxInt32) +func newPreparedTestWaiter(t *testing.T, m *Manager, c Checker, addr uintptr, private bool, val uint32, bitmask uint32) *Waiter { + w := NewWaiter() + if err := m.WaitPrepare(w, c, addr, private, val, bitmask); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + return w } func TestFutexWake(t *testing.T) { - m := NewManager() - d := newTestData(testMutexSize) - - // Wait for it to be locked. - // (This won't trigger the wake in testMutex) - w := NewWaiter() - m.WaitPrepare(w, d, 0, testMutexUnlocked, ^uint32(0)) + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start waiting for wakeup. + w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w) + + // Perform a wakeup. + if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 { + t.Errorf("Wake: got (%d, %v), wanted (1, nil)", n, err) + } - // Wake the single thread. - if _, err := m.Wake(0, ^uint32(0), 1); err != nil { - t.Error("wake error:", err) + // Expect the waiter to have been woken. + if !w.woken() { + t.Error("waiter not woken") + } + }) } - - <-w.C - m.WaitComplete(w) } func TestFutexWakeBitmask(t *testing.T) { - m := NewManager() - d := newTestData(testMutexSize) - - // Wait for it to be locked. 
- // (This won't trigger the wake in testMutex) - w := NewWaiter() - m.WaitPrepare(w, d, 0, testMutexUnlocked, 0x0000ffff) + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start waiting for wakeup. + w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff) + defer m.WaitComplete(w) + + // Perform a wakeup using the wrong bitmask. + if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 { + t.Errorf("Wake with non-matching bitmask: got (%d, %v), wanted (0, nil)", n, err) + } - // Wake the single thread, not using the bitmask. - if _, err := m.Wake(0, 0xffff0000, 1); err != nil { - t.Error("wake non-matching bitmask error:", err) - } + // Expect the waiter to still be waiting. + if w.woken() { + t.Error("waiter woken unexpectedly") + } - select { - case <-w.C: - t.Error("w is alive?") - default: - } + // Perform a wakeup using the right bitmask. + if n, err := m.Wake(d, 0, private, 0x00000001, 1); err != nil || n != 1 { + t.Errorf("Wake with matching bitmask: got (%d, %v), wanted (1, nil)", n, err) + } - // Now use a matching bitmask. - if _, err := m.Wake(0, 0x00000001, 1); err != nil { - t.Error("wake matching bitmask error:", err) + // Expect that the waiter was woken. + if !w.woken() { + t.Error("waiter not woken") + } + }) } - - <-w.C - m.WaitComplete(w) } func TestFutexWakeTwo(t *testing.T) { - m := NewManager() - d := newTestData(testMutexSize) + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start three waiters waiting for wakeup. + var ws [3]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } - // Wait for it to be locked. 
- // (This won't trigger the wake in testMutex) - w1 := NewWaiter() - w2 := NewWaiter() - w3 := NewWaiter() - m.WaitPrepare(w1, d, 0, testMutexUnlocked, ^uint32(0)) - m.WaitPrepare(w2, d, 0, testMutexUnlocked, ^uint32(0)) - m.WaitPrepare(w3, d, 0, testMutexUnlocked, ^uint32(0)) - - // Wake exactly two threads. - if _, err := m.Wake(0, ^uint32(0), 2); err != nil { - t.Error("wake error:", err) - } + // Perform two wakeups. + const wakeups = 2 + if n, err := m.Wake(d, 0, private, ^uint32(0), 2); err != nil || n != wakeups { + t.Errorf("Wake: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } - // Ensure exactly two are alive. - // We don't get guarantees about exactly which two, - // (although we expect them to be w1 and w2). - awake := 0 - for { - select { - case <-w1.C: - awake++ - case <-w2.C: - awake++ - case <-w3.C: - awake++ - default: - if awake != 2 { - t.Error("awake != 2?") - } - - // Success. - return - } + // Expect that exactly two waiters were woken. + // We don't get guarantees about exactly which two, + // (although we expect them to be w1 and w2). + awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) } } func TestFutexWakeUnrelated(t *testing.T) { - m := NewManager() - d := newTestData(2 * testMutexSize) - - // Wait for it to be locked. - w1 := NewWaiter() - w2 := NewWaiter() - m.WaitPrepare(w1, d, 0*testMutexSize, testMutexUnlocked, ^uint32(0)) - m.WaitPrepare(w2, d, 1*testMutexSize, testMutexUnlocked, ^uint32(0)) - - // Wake only the second one. - if _, err := m.Wake(1*testMutexSize, ^uint32(0), 2); err != nil { - t.Error("wake error:", err) - } - - // Ensure only r2 is alive. - select { - case <-w1.C: - t.Error("w1 is alive?") - default: - } - <-w2.C -} - -// This function was shamelessly stolen from mutex_test.go. 
-func HammerMutex(l sync.Locker, loops int, cdone chan bool) { - for i := 0; i < loops; i++ { - l.Lock() - runtime.Gosched() - l.Unlock() - } - cdone <- true -} - -func TestFutexStress(t *testing.T) { - m := NewManager() - d := newTestData(testMutexSize) - tm := newTestMutex(0*testMutexSize, d, m) - c := make(chan bool) - - for i := 0; i < 10; i++ { - go HammerMutex(tm, 1000, c) - } + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(2 * sizeofInt32) + + // Start two waiters waiting for wakeup on different addresses. + w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform two wakeups on the second address. + if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 { + t.Errorf("Wake: got (%d, %v), wanted (1, nil)", n, err) + } - for i := 0; i < 10; i++ { - <-c + // Expect that only the second waiter was woken. + if w1.woken() { + t.Error("w1 woken unexpectedly") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) } } func TestWakeOpEmpty(t *testing.T) { - m := NewManager() - d := newTestData(8) - - n, err := m.WakeOp(d, 0, 4, 10, 10, 0) - if err != nil { - t.Fatalf("WakeOp failed: %v", err) - } - - if n != 0 { - t.Fatalf("Invalid number of wakes: want 0, got %d", n) + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(2 * sizeofInt32) + + // Perform wakeups with no waiters. + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 0); err != nil || n != 0 { + t.Fatalf("WakeOp: got (%d, %v), wanted (0, nil)", n, err) + } + }) } } func TestWakeOpFirstNonEmpty(t *testing.T) { - m := NewManager() - d := newTestData(8) - - // Add two waiters on address 0. 
- w1 := NewWaiter() - if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w1) - - w2 := NewWaiter() - if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w2) - - // Wake up all waiters on address 0. - n, err := m.WakeOp(d, 0, 4, 10, 10, 0) - if err != nil { - t.Fatalf("WakeOp failed: %v", err) - } + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address 0. + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } - if n != 2 { - t.Fatalf("Invalid number of wakes: want 2, got %d", n) + // Expect that both waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) } } func TestWakeOpSecondNonEmpty(t *testing.T) { - m := NewManager() - d := newTestData(8) - - // Add two waiters on address 4. - w1 := NewWaiter() - if err := m.WaitPrepare(w1, d, 4, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w1) - - w2 := NewWaiter() - if err := m.WaitPrepare(w2, d, 4, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w2) - - // Wake up all waiters on address 4. 
- n, err := m.WakeOp(d, 0, 4, 10, 10, 0) - if err != nil { - t.Fatalf("WakeOp failed: %v", err) - } + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address sizeofInt32. + w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address sizeofInt32 (contingent on + // d.Op(0), which should succeed). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 0, 10, 0); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } - if n != 2 { - t.Fatalf("Invalid number of wakes: want 2, got %d", n) + // Expect that both waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) } } func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { - m := NewManager() - d := newTestData(8) - - // Add two waiters on address 4. - w1 := NewWaiter() - if err := m.WaitPrepare(w1, d, 4, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w1) - - w2 := NewWaiter() - if err := m.WaitPrepare(w2, d, 4, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w2) - - // Wake up all waiters on address 4. - n, err := m.WakeOp(d, 0, 4, 10, 10, 1) - if err != nil { - t.Fatalf("WakeOp failed: %v", err) - } + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address sizeofInt32. 
+ w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address sizeofInt32 (contingent on + // d.Op(1), which should fail). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 0, 10, 1); err != nil || n != 0 { + t.Errorf("WakeOp: got (%d, %v), wanted (0, nil)", n, err) + } - if n != 0 { - t.Fatalf("Invalid number of wakes: want 0, got %d", n) + // Expect that neither waiter was woken. + if w1.woken() { + t.Error("w1 woken unexpectedly") + } + if w2.woken() { + t.Error("w2 woken unexpectedly") + } + }) } } func TestWakeOpAllNonEmpty(t *testing.T) { - m := NewManager() - d := newTestData(8) + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Add two waiters on address sizeofInt32. + w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w3) + w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w4) + + // Perform 10 wakeups on address 0 (unconditionally), and 10 + // wakeups on address sizeofInt32 (contingent on d.Op(0), which + // should succeed). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 0); err != nil || n != 4 { + t.Errorf("WakeOp: got (%d, %v), wanted (4, nil)", n, err) + } - // Add two waiters on address 0. - w1 := NewWaiter() - if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) + // Expect that all waiters were woken. 
+ if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + if !w3.woken() { + t.Error("w3 not woken") + } + if !w4.woken() { + t.Error("w4 not woken") + } + }) } - defer m.WaitComplete(w1) +} - w2 := NewWaiter() - if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w2) +func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Add two waiters on address sizeofInt32. + w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w3) + w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w4) + + // Perform 10 wakeups on address 0 (unconditionally), and 10 + // wakeups on address sizeofInt32 (contingent on d.Op(1), which + // should fail). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 1); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } - // Add two waiters on address 4. - w3 := NewWaiter() - if err := m.WaitPrepare(w3, d, 4, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) + // Expect that only the first two waiters were woken. 
+ if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + if w3.woken() { + t.Error("w3 woken unexpectedly") + } + if w4.woken() { + t.Error("w4 woken unexpectedly") + } + }) } - defer m.WaitComplete(w3) +} - w4 := NewWaiter() - if err := m.WaitPrepare(w4, d, 4, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w4) +func TestWakeOpSameAddress(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + var ws [4]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } - // Wake up all waiters on both addresses. - n, err := m.WakeOp(d, 0, 4, 10, 10, 0) - if err != nil { - t.Fatalf("WakeOp failed: %v", err) - } + // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup + // on address 0 (contingent on d.Op(0), which should succeed). + const wakeups = 2 + if n, err := m.WakeOp(d, 0, 0, private, 1, 1, 0); err != nil || n != wakeups { + t.Errorf("WakeOp: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } - if n != 4 { - t.Fatalf("Invalid number of wakes: want 4, got %d", n) + // Expect that exactly two waiters were woken. + awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) } } -func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { - m := NewManager() - d := newTestData(8) - - // Add two waiters on address 0. 
- w1 := NewWaiter() - if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w1) - - w2 := NewWaiter() - if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w2) +func TestWakeOpSameAddressFailingOp(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + var ws [4]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } - // Add two waiters on address 4. - w3 := NewWaiter() - if err := m.WaitPrepare(w3, d, 4, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w3) + // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup + // on address 0 (contingent on d.Op(1), which should fail). + const wakeups = 1 + if n, err := m.WakeOp(d, 0, 0, private, 1, 1, 1); err != nil || n != wakeups { + t.Errorf("WakeOp: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } - w4 := NewWaiter() - if err := m.WaitPrepare(w4, d, 4, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) + // Expect that exactly one waiter was woken. + awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) } - defer m.WaitComplete(w4) +} - // Wake up all waiters on both addresses. 
- n, err := m.WakeOp(d, 0, 4, 10, 10, 1) - if err != nil { - t.Fatalf("WakeOp failed: %v", err) - } +const ( + testMutexSize = sizeofInt32 + testMutexLocked uint32 = 1 + testMutexUnlocked uint32 = 0 +) - if n != 2 { - t.Fatalf("Invalid number of wakes: want 2, got %d", n) - } +// testMutex ties together a testData slice, an address, and a +// futex manager in order to implement the sync.Locker interface. +// Beyond being used as a Locker, this is a simple mechanism for +// changing the underlying values for simpler tests. +type testMutex struct { + a uintptr + d testData + m *Manager } -func TestWakeOpSameAddress(t *testing.T) { - m := NewManager() - d := newTestData(8) - - // Add four waiters on address 0. - w1 := NewWaiter() - if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w1) +func newTestMutex(addr uintptr, d testData, m *Manager) *testMutex { + return &testMutex{a: addr, d: d, m: m} +} - w2 := NewWaiter() - if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w2) +// Lock acquires the testMutex. +// This may wait for it to be available via the futex manager. +func (t *testMutex) Lock() { + for { + // Attempt to grab the lock. + if atomic.CompareAndSwapUint32( + (*uint32)(unsafe.Pointer(&t.d[t.a])), + testMutexUnlocked, + testMutexLocked) { + // Lock held. + return + } - w3 := NewWaiter() - if err := m.WaitPrepare(w3, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) + // Wait for it to be "not locked". + w := NewWaiter() + err := t.m.WaitPrepare(w, t.d, t.a, true, testMutexLocked, ^uint32(0)) + if err == syscall.EAGAIN { + continue + } + if err != nil { + // Should never happen. 
+ panic("WaitPrepare returned unexpected error: " + err.Error()) + } + <-w.C + t.m.WaitComplete(w) } - defer m.WaitComplete(w3) +} - w4 := NewWaiter() - if err := m.WaitPrepare(w4, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w4) +// Unlock releases the testMutex. +// This will notify any waiters via the futex manager. +func (t *testMutex) Unlock() { + // Unlock. + atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked) - // Use the same address, with one at most one waiter from each. - n, err := m.WakeOp(d, 0, 0, 1, 1, 0) - if err != nil { - t.Fatalf("WakeOp failed: %v", err) - } + // Notify all waiters. + t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32) +} - if n != 2 { - t.Fatalf("Invalid number of wakes: want 2, got %d", n) +// This function was shamelessly stolen from mutex_test.go. +func HammerMutex(l sync.Locker, loops int, cdone chan bool) { + for i := 0; i < loops; i++ { + l.Lock() + runtime.Gosched() + l.Unlock() } + cdone <- true } -func TestWakeOpSameAddressFailingOp(t *testing.T) { +func TestMutexStress(t *testing.T) { m := NewManager() - d := newTestData(8) - - // Add four waiters on address 0. 
- w1 := NewWaiter() - if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w1) - - w2 := NewWaiter() - if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w2) - - w3 := NewWaiter() - if err := m.WaitPrepare(w3, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w3) - - w4 := NewWaiter() - if err := m.WaitPrepare(w4, d, 0, 0, ^uint32(0)); err != nil { - t.Fatalf("WaitPrepare failed: %v", err) - } - defer m.WaitComplete(w4) + d := newTestData(testMutexSize) + tm := newTestMutex(0*testMutexSize, d, m) + c := make(chan bool) - // Use the same address, with one at most one waiter from each. - n, err := m.WakeOp(d, 0, 0, 1, 1, 1) - if err != nil { - t.Fatalf("WakeOp failed: %v", err) + for i := 0; i < 10; i++ { + go HammerMutex(tm, 1000, c) } - if n != 1 { - t.Fatalf("Invalid number of wakes: want 1, got %d", n) + for i := 0; i < 10; i++ { + <-c } } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1ace0b501..238fd127b 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -49,6 +49,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" @@ -108,6 +109,11 @@ type Kernel struct { // Kernel.CreateProcess can succeed. mounts *fs.MountNamespace + // futexes is the "root" futex.Manager, from which all others are forked. + // This is necessary to ensure that shared futexes are coherent across all + // tasks, including those created by CreateProcess. 
+ futexes *futex.Manager + // globalInit is the thread group whose leader has ID 1 in the root PID // namespace. globalInit is stored separately so that it is accessible even // after all tasks in the thread group have exited, such that ID 1 is no @@ -254,6 +260,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.vdso = args.Vdso k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.futexes = futex.NewManager() k.netlinkPorts = port.New() return nil diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 130bd652b..7c469ec46 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -200,7 +200,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ipcns = NewIPCNamespace(userns) } - tc, err := t.tc.Fork(t, !opts.NewAddressSpace) + tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 9a59cbd33..d2df7e9d1 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -72,7 +72,7 @@ func (tc *TaskContext) release() { // TaskContext shares an address space with the original; otherwise, the copied // TaskContext has an independent address space that is initially a duplicate // of the original's. -func (tc *TaskContext) Fork(ctx context.Context, shareAddressSpace bool) (*TaskContext, error) { +func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) { newTC := &TaskContext{ Arch: tc.Arch.Fork(), st: tc.st, @@ -93,8 +93,7 @@ func (tc *TaskContext) Fork(ctx context.Context, shareAddressSpace bool) (*TaskC return nil, err } newTC.MemoryManager = newMM - // TODO: revisit when shmem is supported. 
- newTC.fu = futex.NewManager() + newTC.fu = k.futexes.Fork() } return newTC, nil } @@ -116,14 +115,6 @@ func (t *Task) MemoryManager() *mm.MemoryManager { return t.tc.MemoryManager } -// Futex returns t's futex manager. -// -// Preconditions: The caller must be running on the task goroutine, or t.mu -// must be locked. -func (t *Task) Futex() *futex.Manager { - return t.tc.fu -} - // SyscallTable returns t's syscall table. // // Preconditions: The caller must be running on the task goroutine, or t.mu @@ -175,7 +166,7 @@ func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, r Name: name, Arch: ac, MemoryManager: m, - fu: futex.NewManager(), + fu: k.futexes.Fork(), st: st, }, nil } diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index a1b24e1c6..f5b45fb17 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -247,7 +247,7 @@ func (*runExitMain) execute(t *Task) taskRunState { t.tg.signalHandlers.mu.Unlock() if !signaled { if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { - t.Futex().Wake(uintptr(t.cleartid), ^uint32(0), 1) + t.Futex().Wake(t.FutexChecker(), uintptr(t.cleartid), false, ^uint32(0), 1) } // If the CopyOut fails, there's nothing we can do. } diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go new file mode 100644 index 000000000..62ebbcb0d --- /dev/null +++ b/pkg/sentry/kernel/task_futex.go @@ -0,0 +1,148 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Futex returns t's futex manager. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Futex() *futex.Manager { + return t.tc.fu +} + +// FutexChecker returns a futex.Checker that interprets addresses in t's +// address space. +// +// Preconditions: All uses of the returned futex.Checker must be on the task +// goroutine. +func (t *Task) FutexChecker() futex.Checker { + return futexChecker{t} +} + +type futexChecker struct { + t *Task +} + +// Check implements futex.Checker.Check. +func (f futexChecker) Check(addr uintptr, val uint32) error { + // FIXME + in := f.t.CopyScratchBuffer(4) + _, err := f.t.CopyInBytes(usermem.Addr(addr), in) + if err != nil { + return err + } + nval := usermem.ByteOrder.Uint32(in) + if val != nval { + return syserror.EAGAIN + } + return nil +} + +func (f futexChecker) atomicOp(addr uintptr, op func(uint32) uint32) (uint32, error) { + // FIXME + in := f.t.CopyScratchBuffer(4) + _, err := f.t.CopyInBytes(usermem.Addr(addr), in) + if err != nil { + return 0, err + } + o := usermem.ByteOrder.Uint32(in) + mm := f.t.MemoryManager() + for { + n := op(o) + r, err := mm.CompareAndSwapUint32(f.t, usermem.Addr(addr), o, n, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + if r == o { + return o, nil + } + o = r + } +} + +// Op implements futex.Checker.Op, interpreting opIn consistently with Linux. 
+func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) { + op := (opIn >> 28) & 0xf + cmp := (opIn >> 24) & 0xf + opArg := (opIn >> 12) & 0xfff + cmpArg := opIn & 0xfff + + if op&linux.FUTEX_OP_OPARG_SHIFT != 0 { + opArg = 1 << opArg + op &^= linux.FUTEX_OP_OPARG_SHIFT // clear flag + } + + var oldVal uint32 + var err error + switch op { + case linux.FUTEX_OP_SET: + oldVal, err = f.t.MemoryManager().SwapUint32(f.t, usermem.Addr(addr), opArg, usermem.IOOpts{ + AddressSpaceActive: true, + }) + case linux.FUTEX_OP_ADD: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a + opArg + }) + case linux.FUTEX_OP_OR: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a | opArg + }) + case linux.FUTEX_OP_ANDN: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a &^ opArg + }) + case linux.FUTEX_OP_XOR: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a ^ opArg + }) + default: + return false, syserror.ENOSYS + } + if err != nil { + return false, err + } + + switch cmp { + case linux.FUTEX_OP_CMP_EQ: + return oldVal == cmpArg, nil + case linux.FUTEX_OP_CMP_NE: + return oldVal != cmpArg, nil + case linux.FUTEX_OP_CMP_LT: + return oldVal < cmpArg, nil + case linux.FUTEX_OP_CMP_LE: + return oldVal <= cmpArg, nil + case linux.FUTEX_OP_CMP_GT: + return oldVal > cmpArg, nil + case linux.FUTEX_OP_CMP_GE: + return oldVal >= cmpArg, nil + default: + return false, syserror.ENOSYS + } +} + +// GetSharedKey implements futex.Checker.GetSharedKey. 
+func (f futexChecker) GetSharedKey(addr uintptr) (futex.Key, error) { + return f.t.MemoryManager().GetSharedFutexKey(f.t, usermem.Addr(addr)) +} diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index ad9231774..744e73a39 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,6 +106,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", "//pkg/sentry/memmap", diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 21aeabde8..b0622b0c3 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,6 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" @@ -793,6 +794,40 @@ func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uin return nil } +// GetSharedFutexKey is used by kernel.futexChecker.GetSharedKey to implement +// futex.Checker.GetSharedKey. 
+func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr usermem.Addr) (futex.Key, error) { + ar, ok := addr.ToRange(4) // sizeof(int32) + if !ok { + return futex.Key{}, syserror.EFAULT + } + + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + vseg, _, err := mm.getVMAsLocked(ctx, ar, usermem.Read, false) + if err != nil { + return futex.Key{}, err + } + vma := vseg.ValuePtr() + + if vma.private { + return futex.Key{ + Kind: futex.KindSharedPrivate, + Offset: uint64(addr), + }, nil + } + + if vma.id != nil { + vma.id.IncRef() + } + return futex.Key{ + Kind: futex.KindSharedMappable, + Mappable: vma.mappable, + MappingIdentity: vma.id, + Offset: vseg.mappableOffsetAt(addr), + }, nil +} + // VirtualMemorySize returns the combined length in bytes of all mappings in // mm. func (mm *MemoryManager) VirtualMemorySize() uint64 { diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 1a0e1f5fb..d35dcecbe 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -21,115 +21,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) -// futexChecker is a futex.Checker that uses a Task's MemoryManager. -type futexChecker struct { - t *kernel.Task -} - -// Check checks if the address contains the given value, and returns -// syserror.EAGAIN if it doesn't. See Checker interface in futex package -// for more information. 
-func (f futexChecker) Check(addr uintptr, val uint32) error { - in := f.t.CopyScratchBuffer(4) - _, err := f.t.CopyInBytes(usermem.Addr(addr), in) - if err != nil { - return err - } - nval := usermem.ByteOrder.Uint32(in) - if val != nval { - return syserror.EAGAIN - } - return nil -} - -func (f futexChecker) atomicOp(addr uintptr, op func(uint32) uint32) (uint32, error) { - in := f.t.CopyScratchBuffer(4) - _, err := f.t.CopyInBytes(usermem.Addr(addr), in) - if err != nil { - return 0, err - } - o := usermem.ByteOrder.Uint32(in) - mm := f.t.MemoryManager() - for { - n := op(o) - r, err := mm.CompareAndSwapUint32(f.t, usermem.Addr(addr), o, n, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, err - } - - if r == o { - return o, nil - } - o = r - } -} - -// Op performs an operation on addr and returns a result based on the operation. -func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) { - op := (opIn >> 28) & 0xf - cmp := (opIn >> 24) & 0xf - opArg := (opIn >> 12) & 0xfff - cmpArg := opIn & 0xfff - - if op&linux.FUTEX_OP_OPARG_SHIFT != 0 { - opArg = 1 << opArg - op &^= linux.FUTEX_OP_OPARG_SHIFT // clear flag - } - - var oldVal uint32 - var err error - switch op { - case linux.FUTEX_OP_SET: - oldVal, err = f.t.MemoryManager().SwapUint32(f.t, usermem.Addr(addr), opArg, usermem.IOOpts{ - AddressSpaceActive: true, - }) - case linux.FUTEX_OP_ADD: - oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { - return a + opArg - }) - case linux.FUTEX_OP_OR: - oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { - return a | opArg - }) - case linux.FUTEX_OP_ANDN: - oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { - return a & ^opArg - }) - case linux.FUTEX_OP_XOR: - oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { - return a ^ opArg - }) - default: - return false, syserror.ENOSYS - } - if err != nil { - return false, err - } - - switch cmp { - case linux.FUTEX_OP_CMP_EQ: - return oldVal == cmpArg, nil - case 
linux.FUTEX_OP_CMP_NE: - return oldVal != cmpArg, nil - case linux.FUTEX_OP_CMP_LT: - return oldVal < cmpArg, nil - case linux.FUTEX_OP_CMP_LE: - return oldVal <= cmpArg, nil - case linux.FUTEX_OP_CMP_GT: - return oldVal > cmpArg, nil - case linux.FUTEX_OP_CMP_GE: - return oldVal >= cmpArg, nil - default: - return false, syserror.ENOSYS - } -} - // futexWaitRestartBlock encapsulates the state required to restart futex(2) // via restart_syscall(2). // @@ -140,13 +34,14 @@ type futexWaitRestartBlock struct { // addr stored as uint64 since uintptr is not save-able. addr uint64 - val uint32 - mask uint32 + private bool + val uint32 + mask uint32 } // Restart implements kernel.SyscallRestartBlock.Restart. func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { - return futexWaitDuration(t, f.duration, false, uintptr(f.addr), f.val, f.mask) + return futexWaitDuration(t, f.duration, false, uintptr(f.addr), f.private, f.val, f.mask) } // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is @@ -156,9 +51,9 @@ func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { // // If blocking is interrupted, the syscall is restarted with the original // arguments. -func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr uintptr, val, mask uint32) (uintptr, error) { +func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr uintptr, private bool, val, mask uint32) (uintptr, error) { w := t.FutexWaiter() - err := t.Futex().WaitPrepare(w, futexChecker{t}, addr, val, mask) + err := t.Futex().WaitPrepare(w, t.FutexChecker(), addr, private, val, mask) if err != nil { return 0, err } @@ -192,9 +87,9 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo // syscall. If forever is true, the syscall is restarted with the original // arguments. 
If forever is false, duration is a relative timeout and the // syscall is restarted with the remaining timeout. -func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr uintptr, val, mask uint32) (uintptr, error) { +func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr uintptr, private bool, val, mask uint32) (uintptr, error) { w := t.FutexWaiter() - err := t.Futex().WaitPrepare(w, futexChecker{t}, addr, val, mask) + err := t.Futex().WaitPrepare(w, t.FutexChecker(), addr, private, val, mask) if err != nil { return 0, err } @@ -222,6 +117,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add t.SetSyscallRestartBlock(&futexWaitRestartBlock{ duration: remaining, addr: uint64(addr), + private: private, val: val, mask: mask, }) @@ -243,6 +139,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall addr := uintptr(uaddr) naddr := uintptr(uaddr2) cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) + private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0 clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME mask := uint32(val3) @@ -268,7 +165,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if !forever { timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond } - n, err := futexWaitDuration(t, timeoutDur, forever, addr, uint32(val), mask) + n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask) return n, nil, err case linux.FUTEX_WAIT_BITSET: @@ -277,7 +174,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if mask == 0 { return 0, nil, syserror.EINVAL } - n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, uint32(val), mask) + n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask) return n, nil, err default: panic("unreachable") 
@@ -291,23 +188,23 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if mask == 0 { return 0, nil, syserror.EINVAL } - n, err := t.Futex().Wake(addr, mask, val) + n, err := t.Futex().Wake(t.FutexChecker(), addr, private, mask, val) return uintptr(n), nil, err case linux.FUTEX_REQUEUE: - n, err := t.Futex().Requeue(addr, naddr, val, nreq) + n, err := t.Futex().Requeue(t.FutexChecker(), addr, naddr, private, val, nreq) return uintptr(n), nil, err case linux.FUTEX_CMP_REQUEUE: // 'val3' contains the value to be checked at 'addr' and // 'val' is the number of waiters that should be woken up. nval := uint32(val3) - n, err := t.Futex().RequeueCmp(futexChecker{t}, addr, nval, naddr, val, nreq) + n, err := t.Futex().RequeueCmp(t.FutexChecker(), addr, naddr, private, nval, val, nreq) return uintptr(n), nil, err case linux.FUTEX_WAKE_OP: op := uint32(val3) - n, err := t.Futex().WakeOp(futexChecker{t}, addr, naddr, val, nreq, op) + n, err := t.Futex().WakeOp(t.FutexChecker(), addr, naddr, private, val, nreq, op) return uintptr(n), nil, err case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: -- cgit v1.2.3 From 569c2b06c47d269d961405fa652d45e51860d005 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 8 Oct 2018 11:38:02 -0700 Subject: Statfs Namelen should be NAME_MAX not PATH_MAX We accidentally set the wrong maximum. I've also added PATH_MAX and NAME_MAX to the linux abi package. 
PiperOrigin-RevId: 216221311 Change-Id: I44805fcf21508831809692184a0eba4cee469633 --- pkg/abi/linux/fs.go | 6 ++++++ pkg/sentry/loader/elf.go | 3 +-- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 4 ++-- pkg/sentry/syscalls/linux/sys_stat.go | 4 +--- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- 6 files changed, 12 insertions(+), 9 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 00b239351..32a0812b4 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -28,6 +28,12 @@ const ( V9FS_MAGIC = 0x01021997 ) +// Filesystem path limits, from uapi/linux/limits.h. +const ( + NAME_MAX = 255 + PATH_MAX = 4096 +) + // Statfs is struct statfs, from uapi/asm-generic/statfs.h. type Statfs struct { // Type is one of the filesystem magic values, defined above. diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index f4deaa905..849be5a3d 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -19,7 +19,6 @@ import ( "debug/elf" "fmt" "io" - "syscall" "gvisor.googlesource.com/gvisor/pkg/abi" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -409,7 +408,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el ctx.Infof("PT_INTERP path too small: %v", phdr.Filesz) return loadedELF{}, syserror.ENOEXEC } - if phdr.Filesz > syscall.PathMax { + if phdr.Filesz > linux.PATH_MAX { ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz) return loadedELF{}, syserror.ENOEXEC } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index c99c33c33..f2a22aaa5 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -133,7 +133,7 @@ func dump(t *kernel.Task, addr usermem.Addr, size uint, maximumBlobSize uint) st } func path(t *kernel.Task, addr usermem.Addr) string { - path, err := t.CopyInString(addr, syscall.PathMax) + path, err := t.CopyInString(addr, linux.PATH_MAX) if err != nil { return fmt.Sprintf("%#x (error 
decoding path: %s)", addr, err) } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 97881a1f5..015afda9b 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -115,7 +115,7 @@ func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func // copyInPath copies a path in. func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) { - path, err = t.CopyInString(addr, syscall.PathMax) + path, err = t.CopyInString(addr, linux.PATH_MAX) if err != nil { return "", false, err } @@ -1080,7 +1080,7 @@ func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr use // The oldPath is copied in verbatim. This is because the symlink // will include all details, including trailing slashes. - oldPath, err := t.CopyInString(oldAddr, syscall.PathMax) + oldPath, err := t.CopyInString(oldAddr, linux.PATH_MAX) if err != nil { return err } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 6e21b34fd..619a14d7c 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -15,8 +15,6 @@ package linux import ( - "syscall" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -198,7 +196,7 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error { Files: info.TotalFiles, FilesFree: info.FreeFiles, // Same as Linux for simple_statfs, see fs/libfs.c. - NameLength: syscall.PathMax, + NameLength: linux.NAME_MAX, FragmentSize: d.Inode.StableAttr.BlockSize, // Leave other fields 0 like simple_statfs does. 
} diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 0adbf160f..550f63a43 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -76,7 +76,7 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal envvAddr := args[2].Pointer() // Extract our arguments. - filename, err := t.CopyInString(filenameAddr, syscall.PathMax) + filename, err := t.CopyInString(filenameAddr, linux.PATH_MAX) if err != nil { return 0, nil, err } -- cgit v1.2.3 From acf7a951894a1b445ff61e945e32c989892f476f Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 9 Oct 2018 09:51:01 -0700 Subject: Add memunit to sysinfo(2). Also properly add padding after Procs in the linux.Sysinfo structure. This will be implicitly padded to 64bits so we need to do the same. PiperOrigin-RevId: 216372907 Change-Id: I6eb6a27800da61d8f7b7b6e87bf0391a48fdb475 --- pkg/abi/linux/linux.go | 1 + pkg/sentry/syscalls/linux/sys_sysinfo.go | 1 + 2 files changed, 2 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/linux.go b/pkg/abi/linux/linux.go index a946849c5..de2af80dc 100644 --- a/pkg/abi/linux/linux.go +++ b/pkg/abi/linux/linux.go @@ -31,6 +31,7 @@ type Sysinfo struct { TotalSwap uint64 FreeSwap uint64 Procs uint16 + _ [6]byte // Pad Procs to 64bits. 
TotalHigh uint64 FreeHigh uint64 Unit uint32 diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index bd0ffcd5c..6560bac57 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -36,6 +36,7 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca Uptime: t.Kernel().MonotonicClock().Now().Seconds(), TotalRAM: totalSize, FreeRAM: totalSize - totalUsage, + Unit: 1, } _, err := t.CopyOut(addr, si) return 0, nil, err -- cgit v1.2.3 From c36d2ef3733a0619b992f8ddc23b072474b04044 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 9 Oct 2018 15:11:46 -0700 Subject: Add new netstack metrics to the sentry PiperOrigin-RevId: 216431260 Change-Id: Ia6e5c8d506940148d10ff2884cf4440f470e5820 --- pkg/metric/metric.go | 54 ++++++++++++++++++++++++---------- pkg/sentry/socket/epsocket/BUILD | 1 + pkg/sentry/socket/epsocket/epsocket.go | 38 ++++++++++++++++++++++++ runsc/boot/loader.go | 7 +++-- 4 files changed, 83 insertions(+), 17 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 0743612f0..763cd6bc2 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -48,9 +48,6 @@ var ( // TODO: Support metric fields. // type Uint64Metric struct { - // metadata describes the metric. It is immutable. - metadata *pb.MetricMetadata - // value is the actual value of the metric. It must be accessed // atomically. value uint64 @@ -101,24 +98,35 @@ func Disable() { } } -// NewUint64Metric creates a new metric with the given name. +type customUint64Metric struct { + // metadata describes the metric. It is immutable. + metadata *pb.MetricMetadata + + // value returns the current value of the metric. + value func() uint64 +} + +// RegisterCustomUint64Metric registers a metric with the given name. +// +// Register must only be called at init and will return and error if called +// after Initialized. 
// -// Metrics must be statically defined (i.e., at startup). NewUint64Metric will -// return an error if called after Initialized. +// All metrics must be cumulative, meaning that the return values of value must +// only increase over time. // // Preconditions: // * name must be globally unique. // * Initialize/Disable have not been called. -func NewUint64Metric(name string, sync bool, description string) (*Uint64Metric, error) { +func RegisterCustomUint64Metric(name string, sync bool, description string, value func() uint64) error { if initialized { - return nil, ErrInitializationDone + return ErrInitializationDone } if _, ok := allMetrics.m[name]; ok { - return nil, ErrNameInUse + return ErrNameInUse } - m := &Uint64Metric{ + allMetrics.m[name] = customUint64Metric{ metadata: &pb.MetricMetadata{ Name: name, Description: description, @@ -126,9 +134,25 @@ func NewUint64Metric(name string, sync bool, description string) (*Uint64Metric, Sync: sync, Type: pb.MetricMetadata_UINT64, }, + value: value, + } + return nil +} + +// MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric and panics +// if it returns an error. +func MustRegisterCustomUint64Metric(name string, sync bool, description string, value func() uint64) { + if err := RegisterCustomUint64Metric(name, sync, description, value); err != nil { + panic(fmt.Sprintf("Unable to register metric %q: %v", name, err)) } - allMetrics.m[name] = m - return m, nil +} + +// NewUint64Metric creates and registers a new metric with the given name. +// +// Metrics must be statically defined (i.e., at init). +func NewUint64Metric(name string, sync bool, description string) (*Uint64Metric, error) { + var m Uint64Metric + return &m, RegisterCustomUint64Metric(name, sync, description, m.Value) } // MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns an @@ -158,13 +182,13 @@ func (m *Uint64Metric) IncrementBy(v uint64) { // metricSet holds named metrics. 
type metricSet struct { - m map[string]*Uint64Metric + m map[string]customUint64Metric } // makeMetricSet returns a new metricSet. func makeMetricSet() metricSet { return metricSet{ - m: make(map[string]*Uint64Metric), + m: make(map[string]customUint64Metric), } } @@ -172,7 +196,7 @@ func makeMetricSet() metricSet { func (m *metricSet) Values() metricValues { vals := make(metricValues) for k, v := range m.m { - vals[k] = v.Value() + vals[k] = v.value() } return vals } diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 49af8db85..7f9ea9edc 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/log", + "//pkg/metric", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 550569b4c..c5da18b0e 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -33,6 +33,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -53,6 +54,43 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) +func mustCreateMetric(name, description string) *tcpip.StatCounter { + var cm tcpip.StatCounter + metric.MustRegisterCustomUint64Metric(name, false /* sync */, description, cm.Value) + return &cm +} + +// Metrics contains metrics exported by netstack. 
+var Metrics = tcpip.Stats{ + UnknownProtocolRcvdPackets: mustCreateMetric("/netstack/unknown_protocol_received_packets", "Number of packets received by netstack that were for an unknown or unsupported protocol."), + MalformedRcvdPackets: mustCreateMetric("/netstack/malformed_received_packets", "Number of packets received by netstack that were deemed malformed."), + DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped by netstack due to full queues."), + IP: tcpip.IPStats{ + PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."), + InvalidAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."), + PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Total number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), + PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Total number of IP packets sent via WritePacket."), + OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), + }, + TCP: tcpip.TCPStats{ + ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), + PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), + FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), + ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), + 
InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), + SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), + ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), + ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), + }, + UDP: tcpip.UDPStats{ + PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), + UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."), + ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."), + MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), + PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent via sendUDP."), + }, +} + const sizeOfInt32 int = 4 var errStackType = syserr.New("expected but did not receive an epsocket.Stack", linux.EINVAL) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 5716ef217..1ad6b09f4 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -683,11 +683,14 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { // NetworkNone sets up loopback using netstack. 
netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4} - s := &epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{Clock: clock})} + s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{ + Clock: clock, + Stats: epsocket.Metrics, + })} if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { return nil, fmt.Errorf("failed to enable SACK: %v", err) } - return s, nil + return &s, nil default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) -- cgit v1.2.3 From b78552d30e0af4122710e01bc86cbde6bb412686 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 10 Oct 2018 10:41:18 -0700 Subject: When creating a new process group, add it to the session. PiperOrigin-RevId: 216554791 Change-Id: Ia6b7a2e6eaad80a81b2a8f2e3241e93ebc2bda35 --- pkg/sentry/kernel/sessions.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index b44d218d9..a9b4e7647 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -367,6 +367,9 @@ func (tg *ThreadGroup) CreateProcessGroup() error { tg.processGroup.decRefWithParent(oldParentPG) tg.processGroup = pg + // Add the new process group to the session. + pg.session.processGroups.PushBack(pg) + // Ensure this translation is added to all namespaces. for ns := tg.pidns; ns != nil; ns = ns.parent { local := ns.tids[tg.leader] -- cgit v1.2.3 From ddb34b3690c07f6c8efe2b96f89166145c4a7d3c Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 10 Oct 2018 14:09:24 -0700 Subject: Enforce message size limits and avoid host calls with too many iovecs Currently, in the face of FileMem fragmentation and a large sendmsg or recvmsg call, host sockets may pass > 1024 iovecs to the host, which will immediately cause the host to return EMSGSIZE. 
When we detect this case, use a single intermediate buffer to pass to the kernel, copying to/from the src/dst buffer. To avoid creating unbounded intermediate buffers, enforce message size checks and truncation w.r.t. the send buffer size. The same functionality is added to netstack unix sockets for feature parity. PiperOrigin-RevId: 216590198 Change-Id: I719a32e71c7b1098d5097f35e6daf7dd5190eff7 --- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/socket.go | 145 +++++++++++++++++++++-------- pkg/sentry/fs/host/socket_iovec.go | 113 ++++++++++++++++++++++ pkg/sentry/fs/host/socket_unsafe.go | 64 ++++++++----- pkg/sentry/socket/unix/unix.go | 17 +++- pkg/syserr/netstack.go | 2 + pkg/syserror/syserror.go | 1 + pkg/tcpip/link/rawfile/errors.go | 2 + pkg/tcpip/tcpip.go | 2 + pkg/tcpip/transport/queue/queue.go | 69 +++++++++++--- pkg/tcpip/transport/tcp/endpoint_state.go | 2 + pkg/tcpip/transport/udp/endpoint.go | 6 ++ pkg/tcpip/transport/unix/connectionless.go | 6 +- pkg/tcpip/transport/unix/unix.go | 49 ++++++---- runsc/boot/filter/config.go | 10 +- 15 files changed, 386 insertions(+), 103 deletions(-) create mode 100644 pkg/sentry/fs/host/socket_iovec.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index c34f1c26b..6d5640f0a 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -15,6 +15,7 @@ go_library( "inode_state.go", "ioctl_unsafe.go", "socket.go", + "socket_iovec.go", "socket_state.go", "socket_unsafe.go", "tty.go", diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index e11772946..68ebf6402 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -33,6 +34,11 @@ import ( 
"gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) +// maxSendBufferSize is the maximum host send buffer size allowed for endpoint. +// +// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). +const maxSendBufferSize = 8 << 20 + // endpoint encapsulates the state needed to represent a host Unix socket. // // TODO: Remove/merge with ConnectedEndpoint. @@ -41,15 +47,17 @@ import ( type endpoint struct { queue waiter.Queue `state:"zerovalue"` - // stype is the type of Unix socket. (Ex: unix.SockStream, - // unix.SockSeqpacket, unix.SockDgram) - stype unix.SockType `state:"nosave"` - // fd is the host fd backing this file. fd int `state:"nosave"` // If srfd >= 0, it is the host fd that fd was imported from. srfd int `state:"wait"` + + // stype is the type of Unix socket. + stype unix.SockType `state:"nosave"` + + // sndbuf is the size of the send buffer. + sndbuf int `state:"nosave"` } func (e *endpoint) init() error { @@ -67,12 +75,21 @@ func (e *endpoint) init() error { if err != nil { return err } + e.stype = unix.SockType(stype) + + e.sndbuf, err = syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return err + } + if e.sndbuf > maxSendBufferSize { + log.Warningf("Socket send buffer too large: %d", e.sndbuf) + return syserror.EINVAL + } if err := syscall.SetNonblock(e.fd, true); err != nil { return err } - e.stype = unix.SockType(stype) return fdnotifier.AddFD(int32(e.fd), &e.queue) } @@ -189,13 +206,13 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = 0 return nil case *tcpip.SendBufferSizeOption: - v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) - *o = tcpip.SendBufferSizeOption(v) - return translateError(err) + *o = tcpip.SendBufferSizeOption(e.sndbuf) + return nil case *tcpip.ReceiveBufferSizeOption: - v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF) - *o = tcpip.ReceiveBufferSizeOption(v) - return translateError(err) + // N.B. 
Unix sockets don't use the receive buffer. We'll claim it is + // the same size as the send buffer. + *o = tcpip.ReceiveBufferSizeOption(e.sndbuf) + return nil case *tcpip.ReuseAddressOption: v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR) *o = tcpip.ReuseAddressOption(v) @@ -240,33 +257,47 @@ func (e *endpoint) SendMsg(data [][]byte, controlMessages unix.ControlMessages, if to != nil { return 0, tcpip.ErrInvalidEndpointState } - return sendMsg(e.fd, data, controlMessages) + + // Since stream sockets don't preserve message boundaries, we can write + // only as much of the message as fits in the send buffer. + truncate := e.stype == unix.SockStream + + return sendMsg(e.fd, data, controlMessages, e.sndbuf, truncate) } -func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages) (uintptr, *tcpip.Error) { +func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages, maxlen int, truncate bool) (uintptr, *tcpip.Error) { if !controlMessages.Empty() { return 0, tcpip.ErrInvalidEndpointState } - n, err := fdWriteVec(fd, data) + n, totalLen, err := fdWriteVec(fd, data, maxlen, truncate) + if n < totalLen && err == nil { + // The host only returns a short write if it would otherwise + // block (and only for stream sockets). + err = syserror.EAGAIN + } return n, translateError(err) } // RecvMsg implements unix.Endpoint.RecvMsg. func (e *endpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { - return recvMsg(e.fd, data, numRights, peek, addr) + // N.B. Unix sockets don't have a receive buffer, the send buffer + // serves both purposes. + rl, ml, cm, err := recvMsg(e.fd, data, numRights, peek, addr, e.sndbuf) + if rl > 0 && err == tcpip.ErrWouldBlock { + // Message did not fill buffer; that's fine, no need to block. 
+ err = nil + } + return rl, ml, cm, err } -func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { +func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress, maxlen int) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { var cm unet.ControlMessage if numRights > 0 { cm.EnableFDs(int(numRights)) } - rl, ml, cl, err := fdReadVec(fd, data, []byte(cm), peek) - if err == syscall.EAGAIN { - return 0, 0, unix.ControlMessages{}, tcpip.ErrWouldBlock - } - if err != nil { - return 0, 0, unix.ControlMessages{}, translateError(err) + rl, ml, cl, rerr := fdReadVec(fd, data, []byte(cm), peek, maxlen) + if rl == 0 && rerr != nil { + return 0, 0, unix.ControlMessages{}, translateError(rerr) } // Trim the control data if we received less than the full amount. @@ -276,7 +307,7 @@ func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.Fu // Avoid extra allocations in the case where there isn't any control data. 
if len(cm) == 0 { - return rl, ml, unix.ControlMessages{}, nil + return rl, ml, unix.ControlMessages{}, translateError(rerr) } fds, err := cm.ExtractFDs() @@ -285,9 +316,9 @@ func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.Fu } if len(fds) == 0 { - return rl, ml, unix.ControlMessages{}, nil + return rl, ml, unix.ControlMessages{}, translateError(rerr) } - return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil + return rl, ml, control.New(nil, nil, newSCMRights(fds)), translateError(rerr) } // NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host FD @@ -307,7 +338,27 @@ func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*Conne return nil, tcpip.ErrInvalidEndpointState } - e := &ConnectedEndpoint{path: path, queue: queue, file: file} + stype, err := syscall.GetsockoptInt(file.FD(), syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return nil, translateError(err) + } + + sndbuf, err := syscall.GetsockoptInt(file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return nil, translateError(err) + } + if sndbuf > maxSendBufferSize { + log.Warningf("Socket send buffer too large: %d", sndbuf) + return nil, tcpip.ErrInvalidEndpointState + } + + e := &ConnectedEndpoint{ + path: path, + queue: queue, + file: file, + stype: unix.SockType(stype), + sndbuf: sndbuf, + } // AtomicRefCounters start off with a single reference. We need two. e.ref.IncRef() @@ -346,6 +397,17 @@ type ConnectedEndpoint struct { // writeClosed is true if the FD has write shutdown or if it has been // closed. writeClosed bool + + // stype is the type of Unix socket. + stype unix.SockType + + // sndbuf is the size of the send buffer. + // + // N.B. When this is smaller than the host size, we present it via + // GetSockOpt and message splitting/rejection in SendMsg, but do not + // prevent lots of small messages from filling the real send buffer + // size on the host. 
+ sndbuf int } // Send implements unix.ConnectedEndpoint.Send. @@ -355,7 +417,12 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMess if c.writeClosed { return 0, false, tcpip.ErrClosedForSend } - n, err := sendMsg(c.file.FD(), data, controlMessages) + + // Since stream sockets don't preserve message boundaries, we can write + // only as much of the message as fits in the send buffer. + truncate := c.stype == unix.SockStream + + n, err := sendMsg(c.file.FD(), data, controlMessages, c.sndbuf, truncate) // There is no need for the callee to call SendNotify because sendMsg uses // the host's sendmsg(2) and the host kernel's queue. return n, false, err @@ -411,7 +478,15 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p if c.readClosed { return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive } - rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil) + + // N.B. Unix sockets don't have a receive buffer, the send buffer + // serves both purposes. + rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil, c.sndbuf) + if rl > 0 && err == tcpip.ErrWouldBlock { + // Message did not fill buffer; that's fine, no need to block. + err = nil + } + // There is no need for the callee to call RecvNotify because recvMsg uses // the host's recvmsg(2) and the host kernel's queue. return rl, ml, cm, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, err @@ -460,20 +535,14 @@ func (c *ConnectedEndpoint) RecvQueuedSize() int64 { // SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize. func (c *ConnectedEndpoint) SendMaxQueueSize() int64 { - v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) - if err != nil { - return -1 - } - return int64(v) + return int64(c.sndbuf) } // RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize. 
func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { - v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_RCVBUF) - if err != nil { - return -1 - } - return int64(v) + // N.B. Unix sockets don't use the receive buffer. We'll claim it is + // the same size as the send buffer. + return int64(c.sndbuf) } // Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release. diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go new file mode 100644 index 000000000..1a9587b90 --- /dev/null +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -0,0 +1,113 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// maxIovs is the maximum number of iovecs to pass to the host. +var maxIovs = linux.UIO_MAXIOV + +// copyToMulti copies as many bytes from src to dst as possible. +func copyToMulti(dst [][]byte, src []byte) { + for _, d := range dst { + done := copy(d, src) + src = src[done:] + if len(src) == 0 { + break + } + } +} + +// copyFromMulti copies as many bytes from src to dst as possible. +func copyFromMulti(dst []byte, src [][]byte) { + for _, s := range src { + done := copy(dst, s) + dst = dst[done:] + if len(dst) == 0 { + break + } + } +} + +// buildIovec builds an iovec slice from the given []byte slice. 
+// +// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error. +// +// If length < the total length of bufs, err indicates why, even when returning +// a truncated iovec. +// +// If intermediate != nil, iovecs references intermediate rather than bufs and +// the caller must copy to/from bufs as necessary. +func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovecs []syscall.Iovec, intermediate []byte, err error) { + var iovsRequired int + for _, b := range bufs { + length += uintptr(len(b)) + if len(b) > 0 { + iovsRequired++ + } + } + + stopLen := length + if length > uintptr(maxlen) { + if truncate { + stopLen = uintptr(maxlen) + err = syserror.EAGAIN + } else { + return 0, nil, nil, syserror.EMSGSIZE + } + } + + if iovsRequired > maxIovs { + // The kernel will reject our call if we pass this many iovs. + // Use a single intermediate buffer instead. + b := make([]byte, stopLen) + + return stopLen, []syscall.Iovec{{ + Base: &b[0], + Len: uint64(stopLen), + }}, b, err + } + + var total uintptr + iovecs = make([]syscall.Iovec, 0, iovsRequired) + for i := range bufs { + l := len(bufs[i]) + if l == 0 { + continue + } + + stop := l + if total+uintptr(stop) > stopLen { + stop = int(stopLen - total) + } + + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(stop), + }) + + total += uintptr(stop) + if total >= stopLen { + break + } + } + + return total, iovecs, nil, err +} diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index bf8da6867..5e4c5feed 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -19,29 +19,23 @@ import ( "unsafe" ) -// buildIovec builds an iovec slice from the given []byte slice. 
-func buildIovec(bufs [][]byte) (uintptr, []syscall.Iovec) { - var length uintptr - iovecs := make([]syscall.Iovec, 0, 10) - for i := range bufs { - if l := len(bufs[i]); l > 0 { - length += uintptr(l) - iovecs = append(iovecs, syscall.Iovec{ - Base: &bufs[i][0], - Len: uint64(l), - }) - } - } - return length, iovecs -} - -func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) { +// fdReadVec receives from fd to bufs. +// +// If the total length of bufs is > maxlen, fdReadVec will do a partial read +// and err will indicate why the message was truncated. +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) { flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) if peek { flags |= syscall.MSG_PEEK } - length, iovecs := buildIovec(bufs) + // Always truncate the receive buffer. All socket types will truncate + // received messages. + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, 0, 0, err + } var msg syscall.Msghdr if len(control) != 0 { @@ -53,30 +47,52 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintpt msg.Iov = &iovecs[0] msg.Iovlen = uint64(len(iovecs)) } + n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) if e != 0 { + // N.B. prioritize the syscall error over the buildIovec error. return 0, 0, 0, e } + // Copy data back to bufs. + if intermediate != nil { + copyToMulti(bufs, intermediate) + } + if n > length { - return length, n, msg.Controllen, nil + return length, n, msg.Controllen, err } - return n, n, msg.Controllen, nil + return n, n, msg.Controllen, err } -func fdWriteVec(fd int, bufs [][]byte) (uintptr, error) { - _, iovecs := buildIovec(bufs) +// fdWriteVec sends from bufs to fd. 
+// +// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a +// partial write and err will indicate why the message was truncated. +func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uintptr, error) { + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, length, err + } + + // Copy data to intermediate buf. + if intermediate != nil { + copyFromMulti(intermediate, bufs) + } var msg syscall.Msghdr if len(iovecs) > 0 { msg.Iov = &iovecs[0] msg.Iovlen = uint64(len(iovecs)) } + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) if e != 0 { - return 0, e + // N.B. prioritize the syscall error over the buildIovec error. + return 0, length, e } - return n, nil + return n, length, err } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 1c22e78b3..e30378e60 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -378,7 +378,8 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] w.To = ep } - if n, err := src.CopyInTo(t, &w); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + n, err := src.CopyInTo(t, &w) + if err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return int(n), syserr.FromError(err) } @@ -388,15 +389,23 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] s.EventRegister(&e, waiter.EventOut) defer s.EventUnregister(&e) + total := n for { - if n, err := src.CopyInTo(t, &w); err != syserror.ErrWouldBlock { - return int(n), syserr.FromError(err) + // Shorten src to reflect bytes previously written. 
+ src = src.DropFirst64(n) + + n, err = src.CopyInTo(t, &w) + total += n + if err != syserror.ErrWouldBlock { + break } if err := t.Block(ch); err != nil { - return 0, syserr.FromError(err) + break } } + + return int(total), syserr.FromError(err) } // Passcred implements unix.Credentialer.Passcred. diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index c40fb7dbf..b9786b48f 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -78,6 +78,8 @@ var netstackErrorTranslations = map[*tcpip.Error]*Error{ tcpip.ErrNoLinkAddress: ErrHostDown, tcpip.ErrBadAddress: ErrBadAddress, tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable, + tcpip.ErrMessageTooLong: ErrMessageTooLong, + tcpip.ErrNoBufferSpace: ErrNoBufferSpace, } // TranslateNetstackError converts an error from the tcpip package to a sentry diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 6f8a7a319..5bc74e65e 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -44,6 +44,7 @@ var ( ELIBBAD = error(syscall.ELIBBAD) ELOOP = error(syscall.ELOOP) EMFILE = error(syscall.EMFILE) + EMSGSIZE = error(syscall.EMSGSIZE) ENAMETOOLONG = error(syscall.ENAMETOOLONG) ENOATTR = ENODATA ENODATA = error(syscall.ENODATA) diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index 7f213793e..de7593d9c 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -41,6 +41,8 @@ var translations = map[syscall.Errno]*tcpip.Error{ syscall.ENOTCONN: tcpip.ErrNotConnected, syscall.ECONNRESET: tcpip.ErrConnectionReset, syscall.ECONNABORTED: tcpip.ErrConnectionAborted, + syscall.EMSGSIZE: tcpip.ErrMessageTooLong, + syscall.ENOBUFS: tcpip.ErrNoBufferSpace, } // TranslateErrno translate an errno from the syscall package into a diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index f5b5ec86b..cef27948c 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -98,6 +98,8 @@ var ( ErrNoLinkAddress = &Error{msg: "no remote link 
address"} ErrBadAddress = &Error{msg: "bad address"} ErrNetworkUnreachable = &Error{msg: "network is unreachable"} + ErrMessageTooLong = &Error{msg: "message too long"} + ErrNoBufferSpace = &Error{msg: "no buffer space available"} ) // Errors related to Subnet diff --git a/pkg/tcpip/transport/queue/queue.go b/pkg/tcpip/transport/queue/queue.go index eb9ee8a3f..b3d2ea68b 100644 --- a/pkg/tcpip/transport/queue/queue.go +++ b/pkg/tcpip/transport/queue/queue.go @@ -24,12 +24,23 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// Entry implements Linker interface and has both Length and Release methods. +// Entry implements Linker interface and has additional required methods. type Entry interface { ilist.Linker + + // Length returns the number of bytes stored in the entry. Length() int64 + + // Release releases any resources held by the entry. Release() + + // Peek returns a copy of the entry. It must be Released separately. Peek() Entry + + // Truncate reduces the number of bytes stored in the entry to n bytes. + // + // Preconditions: n <= Length(). + Truncate(n int64) } // Queue is a buffer queue. @@ -52,7 +63,7 @@ func New(ReaderQueue *waiter.Queue, WriterQueue *waiter.Queue, limit int64) *Que } // Close closes q for reading and writing. It is immediately not writable and -// will become unreadble will no more data is pending. +// will become unreadable when no more data is pending. // // Both the read and write queues must be notified after closing: // q.ReaderQueue.Notify(waiter.EventIn) @@ -86,38 +97,74 @@ func (q *Queue) IsReadable() bool { return q.closed || q.dataList.Front() != nil } +// bufWritable returns true if there is space for writing. +// +// N.B. Linux only considers a unix socket "writable" if >75% of the buffer is +// free. +// +// See net/unix/af_unix.c:unix_writeable. +func (q *Queue) bufWritable() bool { + return 4*q.used < q.limit +} + // IsWritable determines if q is currently writable. 
func (q *Queue) IsWritable() bool { q.mu.Lock() defer q.mu.Unlock() - return q.closed || q.used < q.limit + return q.closed || q.bufWritable() } // Enqueue adds an entry to the data queue if room is available. // +// If truncate is true, Enqueue may truncate the message beforing enqueuing it. +// Otherwise, the entire message must fit. If n < e.Length(), err indicates why. +// // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.EventIn) -func (q *Queue) Enqueue(e Entry) (notify bool, err *tcpip.Error) { +func (q *Queue) Enqueue(e Entry, truncate bool) (l int64, notify bool, err *tcpip.Error) { q.mu.Lock() if q.closed { q.mu.Unlock() - return false, tcpip.ErrClosedForSend + return 0, false, tcpip.ErrClosedForSend + } + + free := q.limit - q.used + + l = e.Length() + + if l > free && truncate { + if free == 0 { + // Message can't fit right now. + q.mu.Unlock() + return 0, false, tcpip.ErrWouldBlock + } + + e.Truncate(free) + l = e.Length() + err = tcpip.ErrWouldBlock + } + + if l > q.limit { + // Message is too big to ever fit. + q.mu.Unlock() + return 0, false, tcpip.ErrMessageTooLong } - if q.used >= q.limit { + if l > free { + // Message can't fit right now. q.mu.Unlock() - return false, tcpip.ErrWouldBlock + return 0, false, tcpip.ErrWouldBlock } notify = q.dataList.Front() == nil - q.used += e.Length() + q.used += l q.dataList.PushBack(e) q.mu.Unlock() - return notify, nil + return l, notify, err } // Dequeue removes the first entry in the data queue, if one exists. 
@@ -137,13 +184,13 @@ func (q *Queue) Dequeue() (e Entry, notify bool, err *tcpip.Error) { return nil, false, err } - notify = q.used >= q.limit + notify = !q.bufWritable() e = q.dataList.Front().(Entry) q.dataList.Remove(e) q.used -= e.Length() - notify = notify && q.used < q.limit + notify = notify && q.bufWritable() q.mu.Unlock() diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 6143390b3..bed7ec6a6 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -315,6 +315,8 @@ func loadError(s string) *tcpip.Error { tcpip.ErrNoLinkAddress, tcpip.ErrBadAddress, tcpip.ErrNetworkUnreachable, + tcpip.ErrMessageTooLong, + tcpip.ErrNoBufferSpace, } messageToError = make(map[string]*tcpip.Error) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 6ed805357..840e95302 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -15,6 +15,7 @@ package udp import ( + "math" "sync" "gvisor.googlesource.com/gvisor/pkg/sleep" @@ -264,6 +265,11 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c return 0, nil, tcpip.ErrInvalidOptionValue } + if p.Size() > math.MaxUint16 { + // Payload can't possibly fit in a packet. + return 0, nil, tcpip.ErrMessageTooLong + } + to := opts.To e.mu.RLock() diff --git a/pkg/tcpip/transport/unix/connectionless.go b/pkg/tcpip/transport/unix/connectionless.go index ebd4802b0..ae93c61d7 100644 --- a/pkg/tcpip/transport/unix/connectionless.go +++ b/pkg/tcpip/transport/unix/connectionless.go @@ -105,14 +105,12 @@ func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to Bo e.Lock() n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() - if err != nil { - return 0, err - } + if notify { connected.SendNotify() } - return n, nil + return n, err } // Type implements Endpoint.Type. 
diff --git a/pkg/tcpip/transport/unix/unix.go b/pkg/tcpip/transport/unix/unix.go index 0bb00df42..718606cd1 100644 --- a/pkg/tcpip/transport/unix/unix.go +++ b/pkg/tcpip/transport/unix/unix.go @@ -260,20 +260,28 @@ type message struct { Address tcpip.FullAddress } -// Length returns number of bytes stored in the Message. +// Length returns number of bytes stored in the message. func (m *message) Length() int64 { return int64(len(m.Data)) } -// Release releases any resources held by the Message. +// Release releases any resources held by the message. func (m *message) Release() { m.Control.Release() } +// Peek returns a copy of the message. func (m *message) Peek() queue.Entry { return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} } +// Truncate reduces the length of the message payload to n bytes. +// +// Preconditions: n <= m.Length(). +func (m *message) Truncate(n int64) { + m.Data.CapLength(int(n)) +} + // A Receiver can be used to receive Messages. type Receiver interface { // Recv receives a single message. This method does not block. @@ -623,23 +631,33 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) // Send implements ConnectedEndpoint.Send. func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { - var l int + var l int64 for _, d := range data { - l += len(d) - } - // Discard empty stream packets. Since stream sockets don't preserve - // message boundaries, sending zero bytes is a no-op. In Linux, the - // receiver actually uses a zero-length receive as an indication that the - // stream was closed. - if l == 0 && e.endpoint.Type() == SockStream { - controlMessages.Release() - return 0, false, nil + l += int64(len(d)) + } + + truncate := false + if e.endpoint.Type() == SockStream { + // Since stream sockets don't preserve message boundaries, we + // can write only as much of the message as fits in the queue. 
+ truncate = true + + // Discard empty stream packets. Since stream sockets don't + // preserve message boundaries, sending zero bytes is a no-op. + // In Linux, the receiver actually uses a zero-length receive + // as an indication that the stream was closed. + if l == 0 { + controlMessages.Release() + return 0, false, nil + } } + v := make([]byte, 0, l) for _, d := range data { v = append(v, d...) } - notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}) + + l, notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}, truncate) return uintptr(l), notify, err } @@ -793,15 +811,12 @@ func (e *baseEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoin n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() - if err != nil { - return 0, err - } if notify { e.connected.SendNotify() } - return n, nil + return n, err } // SetSockOpt sets a socket option. Currently not supported. 
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 06c04e3bb..92a73db9a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -121,11 +121,6 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(syscall.SOL_SOCKET), seccomp.AllowValue(syscall.SO_SNDBUF), }, - { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), - }, { seccomp.AllowAny{}, seccomp.AllowValue(syscall.SOL_SOCKET), @@ -304,6 +299,11 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.AllowValue(syscall.SOL_SOCKET), seccomp.AllowValue(syscall.SO_SNDBUF), }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_RCVBUF), + }, { seccomp.AllowAny{}, seccomp.AllowValue(syscall.SOL_SOCKET), -- cgit v1.2.3 From 463e73d46d76042c39050d02cf3b0f875e55eb01 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 10 Oct 2018 22:39:32 -0700 Subject: Add seccomp filter configuration to ptrace stubs. This is a defense-in-depth measure. If the sentry is compromised, this prevents system call injection to the stubs. There is some complexity with respect to ptrace and seccomp interactions, so this protection is not really available for kernel versions < 4.8; this is detected dynamically. Note that this also solves the vsyscall emulation issue by adding in appropriate trapping for those system calls. It does mean that a compromised sentry could theoretically inject these into the stub (ignoring the trap and resume, thereby allowing execution), but they are harmless. 
PiperOrigin-RevId: 216647581 Change-Id: Id06c232cbac1f9489b1803ec97f83097fcba8eb8 --- pkg/abi/BUILD | 1 + pkg/abi/abi_linux.go | 20 +++ pkg/seccomp/BUILD | 3 - pkg/seccomp/seccomp.go | 224 +++++++++++++++++-------- pkg/seccomp/seccomp_rules.go | 8 +- pkg/seccomp/seccomp_test.go | 172 ++++++++++++++----- pkg/seccomp/seccomp_unsafe.go | 24 ++- pkg/sentry/arch/arch_amd64.go | 5 + pkg/sentry/platform/ptrace/BUILD | 2 + pkg/sentry/platform/ptrace/ptrace_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 150 +++++++++++------ pkg/sentry/platform/ptrace/subprocess_amd64.go | 16 +- pkg/sentry/platform/ptrace/subprocess_linux.go | 175 +++++++++++++++++-- pkg/sentry/strace/BUILD | 1 + pkg/sentry/strace/strace.go | 11 ++ 15 files changed, 620 insertions(+), 194 deletions(-) create mode 100644 pkg/abi/abi_linux.go (limited to 'pkg/sentry') diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index c014d2c4b..1ba4f3a46 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -6,6 +6,7 @@ go_library( name = "abi", srcs = [ "abi.go", + "abi_linux.go", "flag.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/abi", diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go new file mode 100644 index 000000000..dd5d67b51 --- /dev/null +++ b/pkg/abi/abi_linux.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +package abi + +// Host specifies the host ABI. 
+const Host = Linux diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index b3e2f0b38..1975d17a6 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -28,12 +28,9 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/seccomp", visibility = ["//visibility:public"], deps = [ - "//pkg/abi", "//pkg/abi/linux", "//pkg/bpf", "//pkg/log", - "//pkg/sentry/arch", - "//pkg/sentry/strace", ], ) diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 49da3c775..a746dc9b3 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -20,31 +20,36 @@ import ( "reflect" "sort" - "gvisor.googlesource.com/gvisor/pkg/abi" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bpf" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/strace" ) const ( - // violationLabel is added to the program to take action on a violation. - violationLabel = "violation" - // skipOneInst is the offset to take for skipping one instruction. skipOneInst = 1 + + // defaultLabel is the label for the default action. + defaultLabel = "default_action" ) // Install generates BPF code based on the set of syscalls provided. It only -// allows syscalls that conform to the specification (*) and generates SIGSYS +// allows syscalls that conform to the specification and generates SIGSYS // trap unless kill is set. // -// (*) The current implementation only checks the syscall number. It does NOT -// validate any of the arguments. +// This is a convenience wrapper around BuildProgram and SetFilter. 
func Install(rules SyscallRules, kill bool) error { log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill) - instrs, err := buildProgram(rules, kill) + defaultAction := uint32(linux.SECCOMP_RET_TRAP) + if kill { + defaultAction = uint32(linux.SECCOMP_RET_KILL) + } + instrs, err := BuildProgram([]RuleSet{ + RuleSet{ + Rules: rules, + Action: uint32(linux.SECCOMP_RET_ALLOW), + }, + }, defaultAction) if log.IsLogging(log.Debug) { programStr, errDecode := bpf.DecodeProgram(instrs) if errDecode != nil { @@ -56,60 +61,84 @@ func Install(rules SyscallRules, kill bool) error { return err } - if err := seccomp(instrs); err != nil { - return err + // Perform the actual installation. + if errno := SetFilter(instrs); errno != 0 { + return fmt.Errorf("Failed to set filter: %v", errno) } log.Infof("Seccomp filters installed.") return nil } -// buildProgram builds a BPF program that whitelists all given syscall rules. -func buildProgram(rules SyscallRules, kill bool) ([]linux.BPFInstruction, error) { +// RuleSet is a set of rules and associated action. +type RuleSet struct { + Rules SyscallRules + Action uint32 + + // Vsyscall indicates that a check is made for a function being called + // from kernel mappings. This is where the vsyscall page is located + // (and typically) emulated, so this RuleSet will not match any + // functions not dispatched from the vsyscall page. + Vsyscall bool +} + +// SyscallName gives names to system calls. It is used purely for debugging purposes. +// +// An alternate namer can be provided to the package at initialization time. +var SyscallName = func(sysno uintptr) string { + return fmt.Sprintf("syscall_%d", sysno) +} + +// BuildProgram builds a BPF program from the given map of actions to matching +// SyscallRules. The single generated program covers all provided RuleSets. 
+func BuildProgram(rules []RuleSet, defaultAction uint32) ([]linux.BPFInstruction, error) { program := bpf.NewProgramBuilder() - violationAction := uint32(linux.SECCOMP_RET_KILL) - if !kill { - violationAction = linux.SECCOMP_RET_TRAP - } // Be paranoid and check that syscall is done in the expected architecture. // // A = seccomp_data.arch - // if (A != AUDIT_ARCH_X86_64) goto violation + // if (A != AUDIT_ARCH_X86_64) goto defaultAction. program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch) - // violationLabel is at the bottom of the program. The size of program + // defaultLabel is at the bottom of the program. The size of program // may exceeds 255 lines, which is the limit of a condition jump. program.AddJump(bpf.Jmp|bpf.Jeq|bpf.K, linux.AUDIT_ARCH_X86_64, skipOneInst, 0) - program.AddDirectJumpLabel(violationLabel) - + program.AddDirectJumpLabel(defaultLabel) if err := buildIndex(rules, program); err != nil { return nil, err } - // violation: return violationAction - if err := program.AddLabel(violationLabel); err != nil { + // Exhausted: return defaultAction. + if err := program.AddLabel(defaultLabel); err != nil { return nil, err } - program.AddStmt(bpf.Ret|bpf.K, violationAction) + program.AddStmt(bpf.Ret|bpf.K, defaultAction) return program.Instructions() } -// buildIndex builds a BST to quickly search through all syscalls that are whitelisted. -func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error { - syscalls := []uintptr{} - for sysno := range rules { - syscalls = append(syscalls, sysno) +// buildIndex builds a BST to quickly search through all syscalls. +func buildIndex(rules []RuleSet, program *bpf.ProgramBuilder) error { + // Build a list of all application system calls, across all given rule + // sets. We have a simple BST, but may dispatch individual matchers + // with different actions. The matchers are evaluated linearly. 
+ requiredSyscalls := make(map[uintptr]struct{}) + for _, rs := range rules { + for sysno := range rs.Rules { + requiredSyscalls[sysno] = struct{}{} + } } - - t, ok := strace.Lookup(abi.Linux, arch.AMD64) - if !ok { - panic("Can't find amd64 Linux syscall table") + syscalls := make([]uintptr, 0, len(requiredSyscalls)) + for sysno, _ := range requiredSyscalls { + syscalls = append(syscalls, sysno) } - sort.Slice(syscalls, func(i, j int) bool { return syscalls[i] < syscalls[j] }) - for _, s := range syscalls { - log.Infof("syscall filter: %v (%v): %s", s, t.Name(s), rules[s]) + for _, sysno := range syscalls { + for _, rs := range rules { + // Print only if there is a corresponding set of rules. + if _, ok := rs.Rules[sysno]; ok { + log.Debugf("syscall filter %v: %s => 0x%x", SyscallName(sysno), rs.Rules[sysno], rs.Action) + } + } } root := createBST(syscalls) @@ -119,7 +148,7 @@ func buildIndex(rules SyscallRules, program *bpf.ProgramBuilder) error { // // A = seccomp_data.nr program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR) - return root.traverse(buildBSTProgram, program, rules) + return root.traverse(buildBSTProgram, rules, program) } // createBST converts sorted syscall slice into a balanced BST. @@ -136,15 +165,23 @@ func createBST(syscalls []uintptr) *node { return &parent } -func ruleViolationLabel(sysno uintptr, idx int) string { - return fmt.Sprintf("ruleViolation_%v_%v", sysno, idx) +func vsyscallViolationLabel(ruleSetIdx int, sysno uintptr) string { + return fmt.Sprintf("vsyscallViolation_%v_%v", ruleSetIdx, sysno) +} + +func ruleViolationLabel(ruleSetIdx int, sysno uintptr, idx int) string { + return fmt.Sprintf("ruleViolation_%v_%v_%v", ruleSetIdx, sysno, idx) } func checkArgsLabel(sysno uintptr) string { return fmt.Sprintf("checkArgs_%v", sysno) } -func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) error { +// addSyscallArgsCheck adds argument checks for a single system call. 
It does +// not insert a jump to the default action at the end and it is the +// responsibility of the caller to insert an appropriate jump after calling +// this function. +func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action uint32, ruleSetIdx int, sysno uintptr) error { for ruleidx, rule := range rules { labelled := false for i, arg := range rule { @@ -155,28 +192,29 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) err high, low := uint32(a>>32), uint32(a) // assert arg_low == low p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgLow(i)) - p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(sysno, ruleidx)) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, low, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) // assert arg_high == high p.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArgHigh(i)) - p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(sysno, ruleidx)) + p.AddJumpFalseLabel(bpf.Jmp|bpf.Jeq|bpf.K, high, 0, ruleViolationLabel(ruleSetIdx, sysno, ruleidx)) labelled = true - default: return fmt.Errorf("unknown syscall rule type: %v", reflect.TypeOf(a)) } } } - // Matched, allow the syscall. - p.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW) - // Label the end of the rule if necessary. + + // Matched, emit the given action. + p.AddStmt(bpf.Ret|bpf.K, action) + + // Label the end of the rule if necessary. This is added for + // the jumps above when the argument check fails. if labelled { - if err := p.AddLabel(ruleViolationLabel(sysno, ruleidx)); err != nil { + if err := p.AddLabel(ruleViolationLabel(ruleSetIdx, sysno, ruleidx)); err != nil { return err } } } - // Not matched? - p.AddDirectJumpLabel(violationLabel) + return nil } @@ -188,16 +226,16 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, sysno uintptr) err // (A > 22) ? goto index_35 : goto index_9 // // index_9: // SYS_MMAP(9), leaf -// A == 9) ? goto argument check : violation +// A == 9) ? 
goto argument check : defaultLabel // // index_35: // SYS_NANOSLEEP(35), single child // (A == 35) ? goto argument check : continue -// (A > 35) ? goto index_50 : goto violation +// (A > 35) ? goto index_50 : goto defaultLabel // // index_50: // SYS_LISTEN(50), leaf -// (A == 50) ? goto argument check : goto violation +// (A == 50) ? goto argument check : goto defaultLabel // -func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) error { +func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) error { // Root node is never referenced by label, skip it. if !n.root { if err := program.AddLabel(n.label()); err != nil { @@ -209,11 +247,10 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) e program.AddJumpTrueLabel(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), checkArgsLabel(sysno), 0) if n.left == nil && n.right == nil { // Leaf nodes don't require extra check. - program.AddDirectJumpLabel(violationLabel) + program.AddDirectJumpLabel(defaultLabel) } else { // Non-leaf node. Check which turn to take otherwise. Using direct jumps // in case that the offset may exceed the limit of a conditional jump (255) - // Note that 'violationLabel' is returned for nil children. program.AddJump(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), 0, skipOneInst) program.AddDirectJumpLabel(n.right.label()) program.AddDirectJumpLabel(n.left.label()) @@ -222,12 +259,60 @@ func buildBSTProgram(program *bpf.ProgramBuilder, rules SyscallRules, n *node) e if err := program.AddLabel(checkArgsLabel(sysno)); err != nil { return err } - // No rules, just allow it and save one jmp. - if len(rules[sysno]) == 0 { - program.AddStmt(bpf.Ret|bpf.K, linux.SECCOMP_RET_ALLOW) - return nil + + emitted := false + for ruleSetIdx, rs := range rules { + if _, ok := rs.Rules[sysno]; ok { + // If there are no rules, then this will always match. + // Remember we've done this so that we can emit a + // sensible error. 
We can't catch all overlaps, but we + // can catch this one at least. + if emitted { + return fmt.Errorf("unreachable action for %v: 0x%x (rule set %d)", SyscallName(sysno), rs.Action, ruleSetIdx) + } + + // Emit a vsyscall check if this rule requires a + // Vsyscall match. This rule ensures that the top bit + // is set in the instruction pointer, which is where + // the vsyscall page will be mapped. + if rs.Vsyscall { + program.AddStmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh) + program.AddJumpFalseLabel(bpf.Jmp|bpf.Jset|bpf.K, 0x80000000, 0, vsyscallViolationLabel(ruleSetIdx, sysno)) + } + + // Emit matchers. + if len(rs.Rules[sysno]) == 0 { + // This is a blanket action. + program.AddStmt(bpf.Ret|bpf.K, rs.Action) + emitted = true + } else { + // Add an argument check for these particular + // arguments. This will continue execution and + // check the next rule set. We need to ensure + // that at the very end, we insert a direct + // jump label for the unmatched case. + if err := addSyscallArgsCheck(program, rs.Rules[sysno], rs.Action, ruleSetIdx, sysno); err != nil { + return err + } + } + + // If there was a Vsyscall check for this rule, then we + // need to add an appropriate label for the jump above. + if rs.Vsyscall { + if err := program.AddLabel(vsyscallViolationLabel(ruleSetIdx, sysno)); err != nil { + return err + } + } + } } - return addSyscallArgsCheck(program, rules[sysno], sysno) + + // Not matched? We only need to insert a jump to the default label if + // not default action has been emitted for this call. + if !emitted { + program.AddDirectJumpLabel(defaultLabel) + } + + return nil } // node represents a tree node. @@ -238,26 +323,27 @@ type node struct { root bool } -// label returns the label corresponding to this node. If node is nil (syscall not present), -// violationLabel is returned for convenience. +// label returns the label corresponding to this node. +// +// If n is nil, then the defaultLabel is returned. 
func (n *node) label() string { if n == nil { - return violationLabel + return defaultLabel } return fmt.Sprintf("index_%v", n.value) } -type traverseFunc func(*bpf.ProgramBuilder, SyscallRules, *node) error +type traverseFunc func(*node, []RuleSet, *bpf.ProgramBuilder) error -func (n *node) traverse(fn traverseFunc, p *bpf.ProgramBuilder, rules SyscallRules) error { +func (n *node) traverse(fn traverseFunc, rules []RuleSet, p *bpf.ProgramBuilder) error { if n == nil { return nil } - if err := fn(p, rules, n); err != nil { + if err := fn(n, rules, p); err != nil { return err } - if err := n.left.traverse(fn, p, rules); err != nil { + if err := n.left.traverse(fn, rules, p); err != nil { return err } - return n.right.traverse(fn, p, rules) + return n.right.traverse(fn, rules, p) } diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index 9215e5c90..6b707f195 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -24,9 +24,11 @@ import "fmt" // __u64 args[6]; // }; const ( - seccompDataOffsetNR = 0 - seccompDataOffsetArch = 4 - seccompDataOffsetArgs = 16 + seccompDataOffsetNR = 0 + seccompDataOffsetArch = 4 + seccompDataOffsetIPLow = 8 + seccompDataOffsetIPHigh = 12 + seccompDataOffsetArgs = 16 ) func seccompDataOffsetArgLow(i int) uint32 { diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 42cf85c03..0188ad4f3 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -76,14 +76,18 @@ func TestBasic(t *testing.T) { } for _, test := range []struct { - // filters are the set of syscall that are allowed. 
- filters SyscallRules - kill bool - specs []spec + ruleSets []RuleSet + defaultAction uint32 + specs []spec }{ { - filters: SyscallRules{1: {}}, - kill: false, + ruleSets: []RuleSet{ + { + Rules: SyscallRules{1: {}}, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Single syscall allowed", @@ -98,12 +102,61 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{ - 1: {}, - 3: {}, - 5: {}, + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + AllowValue(0x1), + }, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + { + Rules: SyscallRules{ + 1: {}, + 2: {}, + }, + Action: linux.SECCOMP_RET_TRAP, + }, }, - kill: false, + defaultAction: linux.SECCOMP_RET_KILL, + specs: []spec{ + { + desc: "Multiple rulesets allowed (1a)", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64, args: [6]uint64{0x1}}, + want: linux.SECCOMP_RET_ALLOW, + }, + { + desc: "Multiple rulesets allowed (1b)", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64}, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "Multiple rulesets allowed (2)", + data: seccompData{nr: 1, arch: linux.AUDIT_ARCH_X86_64}, + want: linux.SECCOMP_RET_TRAP, + }, + { + desc: "Multiple rulesets allowed (2)", + data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64}, + want: linux.SECCOMP_RET_KILL, + }, + }, + }, + { + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: {}, + 3: {}, + 5: {}, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Multiple syscalls allowed (1)", @@ -148,8 +201,15 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{1: {}}, - kill: false, + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: {}, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Wrong architecture", @@ -159,26 +219,38 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{1: {}}, - 
kill: true, + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: {}, + }, + Action: linux.SECCOMP_RET_ALLOW, + }, + }, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { - desc: "Syscall disallowed, action kill", + desc: "Syscall disallowed, action trap", data: seccompData{nr: 2, arch: linux.AUDIT_ARCH_X86_64}, - want: linux.SECCOMP_RET_KILL, + want: linux.SECCOMP_RET_TRAP, }, }, }, { - filters: SyscallRules{ - 1: []Rule{ - { - AllowAny{}, - AllowValue(0xf), + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + AllowAny{}, + AllowValue(0xf), + }, + }, }, + Action: linux.SECCOMP_RET_ALLOW, }, }, - kill: false, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Syscall argument allowed", @@ -193,17 +265,22 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{ - 1: []Rule{ - { - AllowValue(0xf), - }, - { - AllowValue(0xe), + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + AllowValue(0xf), + }, + { + AllowValue(0xe), + }, + }, }, + Action: linux.SECCOMP_RET_ALLOW, }, }, - kill: false, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "Syscall argument allowed, two rules", @@ -218,16 +295,21 @@ func TestBasic(t *testing.T) { }, }, { - filters: SyscallRules{ - 1: []Rule{ - { - AllowValue(0), - AllowValue(math.MaxUint64 - 1), - AllowValue(math.MaxUint32), + ruleSets: []RuleSet{ + { + Rules: SyscallRules{ + 1: []Rule{ + { + AllowValue(0), + AllowValue(math.MaxUint64 - 1), + AllowValue(math.MaxUint32), + }, + }, }, + Action: linux.SECCOMP_RET_ALLOW, }, }, - kill: false, + defaultAction: linux.SECCOMP_RET_TRAP, specs: []spec{ { desc: "64bit syscall argument allowed", @@ -259,7 +341,7 @@ func TestBasic(t *testing.T) { }, }, } { - instrs, err := buildProgram(test.filters, test.kill) + instrs, err := BuildProgram(test.ruleSets, test.defaultAction) if err != nil { t.Errorf("%s: buildProgram() got error: %v", test.specs[0].desc, err) continue @@ -282,6 +364,7 @@ func TestBasic(t *testing.T) { } } 
+// TestRandom tests that randomly generated rules are encoded correctly. func TestRandom(t *testing.T) { rand.Seed(time.Now().UnixNano()) size := rand.Intn(50) + 1 @@ -294,7 +377,12 @@ func TestRandom(t *testing.T) { } fmt.Printf("Testing filters: %v", syscallRules) - instrs, err := buildProgram(syscallRules, false) + instrs, err := BuildProgram([]RuleSet{ + RuleSet{ + Rules: syscallRules, + Action: uint32(linux.SECCOMP_RET_ALLOW), + }, + }, uint32(linux.SECCOMP_RET_TRAP)) if err != nil { t.Fatalf("buildProgram() got error: %v", err) } @@ -319,8 +407,8 @@ func TestRandom(t *testing.T) { } } -// TestReadDeal checks that a process dies when it trips over the filter and that it -// doesn't die when the filter is not triggered. +// TestReadDeal checks that a process dies when it trips over the filter and +// that it doesn't die when the filter is not triggered. func TestRealDeal(t *testing.T) { for _, test := range []struct { die bool diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index 6682f8d9b..ae18534bf 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -17,7 +17,6 @@ package seccomp import ( - "fmt" "syscall" "unsafe" @@ -31,19 +30,28 @@ type sockFprog struct { Filter *linux.BPFInstruction } -func seccomp(instrs []linux.BPFInstruction) error { +// SetFilter installs the given BPF program. +// +// This is safe to call from an afterFork context. +// +//go:nosplit +func SetFilter(instrs []linux.BPFInstruction) syscall.Errno { // SYS_SECCOMP is not available in syscall package. const SYS_SECCOMP = 317 // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details. 
- if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); err != 0 { - return fmt.Errorf("failed to set PR_SET_NO_NEW_PRIVS: %v", err) + if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 { + return errno } - sockProg := sockFprog{Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0]))} // TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available. - if _, _, err := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); err != 0 { - return fmt.Errorf("failed to set seccomp filter: %v", err) + sockProg := sockFprog{ + Len: uint16(len(instrs)), + Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } - return nil + if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); errno != 0 { + return errno + } + + return 0 } diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index f1e408af9..5ba6c19ea 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build amd64 + package arch import ( @@ -26,6 +28,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// Host specifies the host architecture. +const Host = AMD64 + // These constants come directly from Linux. const ( // maxAddr64 is the maximum userspace address. 
It is TASK_SIZE in Linux diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index ceee895dc..debae058b 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -19,6 +19,8 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/log", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/filemem", diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index b55b2795a..46a8bda8e 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -136,7 +136,7 @@ func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) { return nil, syscall.EINVAL } rval, err := t.syscallIgnoreInterrupt( - initRegs, + &t.initRegs, syscall.SYS_CLONE, arch.SyscallArgument{Value: uintptr( syscall.CLONE_FILES | diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 035ebc332..6d5ad6b71 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -47,6 +47,11 @@ type thread struct { tgid int32 tid int32 cpu uint32 + + // initRegs are the initial registers for the first thread. + // + // These are used for the register set for system calls. + initRegs syscall.PtraceRegs } // threadPool is a collection of threads. @@ -99,11 +104,6 @@ func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread) type subprocess struct { platform.NoAddressSpaceIO - // initRegs are the initial registers for the first thread. - // - // These are used for the register set for system calls. - initRegs syscall.PtraceRegs - // requests is used to signal creation of new threads. requests chan chan *thread @@ -142,7 +142,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // thread, and responding to requests to make additional threads in the // traced process. 
The process will be killed and reaped when the // request channel is closed, which happens in Release below. - var initRegs syscall.PtraceRegs errChan := make(chan error) requests := make(chan chan *thread) go func() { // S/R-SAFE: Platform-related. @@ -156,22 +155,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { return } - // Grab registers. - // - // Note that we adjust the current register RIP value to be - // just before the current system call executed. This depends - // on the definition of the stub itself. - if err := firstThread.getRegs(&initRegs); err != nil { - panic(fmt.Sprintf("ptrace get regs failed: %v", err)) - } - initRegs.Rip -= initRegsRipAdjustment - // Ready to handle requests. errChan <- nil // Wait for requests to create threads. for r := range requests { - t, err := firstThread.clone(&initRegs) + t, err := firstThread.clone(&firstThread.initRegs) if err != nil { // Should not happen: not recoverable. panic(fmt.Sprintf("error initializing first thread: %v", err)) @@ -183,15 +172,12 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // (Hopefully nobody tgkilled it with a signal < // SIGSTOP before the SIGSTOP was delivered, in which // case that signal would be delivered before SIGSTOP.) - if sig := t.wait(); sig != syscall.SIGSTOP { + if sig := t.wait(stopped); sig != syscall.SIGSTOP { panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig)) } - // Detach the thread without suppressing the SIGSTOP, - // causing it to enter group-stop. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { - panic(fmt.Sprintf("can't detach new clone: %v", errno)) - } + // Detach the thread. + t.detach() // Return the thread. r <- t @@ -208,7 +194,6 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // Ready. 
sp := &subprocess{ - initRegs: initRegs, requests: requests, sysemuThreads: threadPool{ threads: make(map[int32]*thread), @@ -277,16 +262,48 @@ func (t *thread) attach() { // stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of // newSubprocess), so we always expect to see signal-delivery-stop with // SIGSTOP. - if sig := t.wait(); sig != syscall.SIGSTOP { + if sig := t.wait(stopped); sig != syscall.SIGSTOP { panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig)) } // Initialize options. t.init() + + // Grab registers. + // + // Note that we adjust the current register RIP value to be just before + // the current system call executed. This depends on the definition of + // the stub itself. + if err := t.getRegs(&t.initRegs); err != nil { + panic(fmt.Sprintf("ptrace get regs failed: %v", err)) + } + t.initRegs.Rip -= initRegsRipAdjustment } +// detach detachs from the thread. +// +// Because the SIGSTOP is not supressed, the thread will enter group-stop. +func (t *thread) detach() { + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 { + panic(fmt.Sprintf("can't detach new clone: %v", errno)) + } +} + +// waitOutcome is used for wait below. +type waitOutcome int + +const ( + // stopped indicates that the process was stopped. + stopped waitOutcome = iota + + // killed indicates that the process was killed. + killed +) + // wait waits for a stop event. -func (t *thread) wait() syscall.Signal { +// +// Precondition: outcome is a valid waitOutcome. 
+func (t *thread) wait(outcome waitOutcome) syscall.Signal { var status syscall.WaitStatus for { @@ -300,25 +317,55 @@ func (t *thread) wait() syscall.Signal { if int(r) != int(t.tid) { panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid)) } - if !status.Stopped() { - panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) - } - if status.StopSignal() == 0 { - continue // Spurious stop. + switch outcome { + case stopped: + if !status.Stopped() { + panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) + } + stopSig := status.StopSignal() + if stopSig == 0 { + continue // Spurious stop. + } + if stopSig == syscall.SIGTRAP { + // Re-encode the trap cause the way it's expected. + return stopSig | syscall.Signal(status.TrapCause()<<8) + } + // Not a trap signal. + return stopSig + case killed: + if !status.Exited() && !status.Signaled() { + panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status)) + } + return syscall.Signal(status.ExitStatus()) + default: + // Should not happen. + panic(fmt.Sprintf("unknown outcome: %v", outcome)) } - return status.StopSignal() } } +// destroy kills the thread. +// +// Note that this should not be used in the general case; the death of threads +// will typically cause the death of the parent. This is a utility method for +// manually created threads. +func (t *thread) destroy() { + t.detach() + syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(syscall.SIGKILL)) + t.wait(killed) +} + // init initializes trace options. func (t *thread) init() { - // Set our TRACESYSGOOD option to differeniate real SIGTRAP. + // Set our TRACESYSGOOD option to differeniate real SIGTRAP. Also, we + // require the SECCOMP option to ensure that seccomp violations + // generate a ptrace event. 
_, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, syscall.PTRACE_SETOPTIONS, uintptr(t.tid), 0, - syscall.PTRACE_O_TRACESYSGOOD, + syscall.PTRACE_O_TRACESYSGOOD|_PTRACE_O_TRACESECCOMP, 0, 0) if errno != 0 { panic(fmt.Sprintf("ptrace set options failed: %v", errno)) @@ -342,8 +389,8 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } - sig := t.wait() - if sig == (0x80 | syscall.SIGTRAP) { + sig := t.wait(stopped) + if sig == (syscallEvent | syscall.SIGTRAP) { // Reached syscall-enter-stop. break } else { @@ -360,7 +407,7 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) { // Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens // between syscall-enter-stop and syscall-exit-stop; it happens *after* // syscall-exit-stop.)" - ptrace(2), "Syscall-stops" - if sig := t.wait(); sig != (0x80 | syscall.SIGTRAP) { + if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) { panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig)) } @@ -403,22 +450,23 @@ func (t *thread) NotifyInterrupt() { // // This function returns true on a system call, false on a signal. func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { - regs := &ac.StateData().Regs - s.resetSysemuRegs(regs) + // Lock the thread for ptrace operations. + runtime.LockOSThread() + defer runtime.UnlockOSThread() // Extract floating point state. fpState := ac.FloatingPointData() fpLen, _ := ac.FeatureSet().ExtendedStateSize() useXsave := ac.FeatureSet().UseXsave() - // Lock the thread for ptrace operations. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - // Grab our thread from the pool. currentTID := int32(procid.Current()) t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread) + // Reset necessary registers. 
+ regs := &ac.StateData().Regs + t.resetSysemuRegs(regs) + // Check for interrupts, and ensure that future interrupts will signal t. if !c.interrupt.Enable(t) { // Pending interrupt; simulate. @@ -459,7 +507,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { } // Wait for the syscall-enter stop. - sig := t.wait() + sig := t.wait(stopped) // Refresh all registers. if err := t.getRegs(regs); err != nil { @@ -470,13 +518,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { } // Is it a system call? - if sig == (0x80 | syscall.SIGTRAP) { + if sig == (syscallEvent | syscall.SIGTRAP) { // Ensure registers are sane. updateSyscallRegs(regs) return true - } - - if sig == syscall.SIGSTOP { + } else if sig == (seccompEvent | syscall.SIGTRAP) { + // Seccomp is enabled, and caught the system call. This + // is an emulated vsyscall call, since those are caught + // only by seccomp and explicitly set to trace. + updateSyscallRegs(regs) + return true + } else if sig == syscall.SIGSTOP { // SIGSTOP was delivered to another thread in the same thread // group, which initiated another group stop. Just ignore it. continue @@ -507,7 +559,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp currentTID := int32(procid.Current()) t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) - return t.syscallIgnoreInterrupt(&s.initRegs, sysno, args...) + return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...) } // MapFile implements platform.AddressSpace.MapFile. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 8211215df..c38dc1ff8 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -43,20 +43,20 @@ const ( // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. 
-func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) { - regs.Cs = s.initRegs.Cs - regs.Ss = s.initRegs.Ss - regs.Ds = s.initRegs.Ds - regs.Es = s.initRegs.Es - regs.Fs = s.initRegs.Fs - regs.Gs = s.initRegs.Gs +func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) { + regs.Cs = t.initRegs.Cs + regs.Ss = t.initRegs.Ss + regs.Ds = t.initRegs.Ds + regs.Es = t.initRegs.Es + regs.Fs = t.initRegs.Fs + regs.Gs = t.initRegs.Gs } // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs { - // Copy initial registers (RIP, segments, etc.). + // Copy initial registers. regs := *initRegs // Set our syscall number. diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index b212bbdfe..53adadadd 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -21,14 +21,167 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/seccomp" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" ) +const ( + syscallEvent syscall.Signal = 0x80 + seccompEvent syscall.Signal = 0x700 // 0x7 (PTRACE_SECCOMP_EVENT) << 8 + _PTRACE_O_TRACESECCOMP = 0x80 // 1 << 0x7 (PTRACE_SECCOMP_EVENT) +) + +// probeSeccomp returns true iff seccomp is run after ptrace notifications, +// which is generally the case for kernel version >= 4.8. This check is dynamic +// because kernels have be backported behavior. +// +// See createStub for more information. +// +// Precondition: the runtime OS thread must be locked. +func probeSeccomp() bool { + // Create a completely new, destroyable process. 
+ t, err := attachedThread(0, uint32(linux.SECCOMP_RET_ERRNO)) + if err != nil { + panic(fmt.Sprintf("seccomp probe failed: %v", err)) + } + defer t.destroy() + + // Set registers to the yield system call. This call is not allowed + // by the filters specified in the attachThread function. + regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD) + if err := t.setRegs(®s); err != nil { + panic(fmt.Sprintf("ptrace set regs failed: %v", err)) + } + + for { + // Attempt an emulation. + if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 { + panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) + } + + sig := t.wait(stopped) + if sig == (syscallEvent | syscall.SIGTRAP) { + // Did the seccomp errno hook already run? This would + // indicate that seccomp is first in line and we're + // less than 4.8. + if err := t.getRegs(®s); err != nil { + panic(fmt.Sprintf("ptrace get-regs failed: %v", err)) + } + if _, err := syscallReturnValue(®s); err == nil { + // The seccomp errno mode ran first, and reset + // the error in the registers. + return false + } + // The seccomp hook did not run yet, and therefore it + // is safe to use RET_KILL mode for dispatched calls. + return true + } + } +} + // createStub creates a fresh stub processes. // // Precondition: the runtime OS thread must be locked. func createStub() (*thread, error) { + // The exact interactions of ptrace and seccomp are complex, and + // changed in recent kernel versions. Before commit 93e35efb8de45, the + // seccomp check is done before the ptrace emulation check. This means + // that any calls not matching this list will trigger the seccomp + // default action instead of notifying ptrace. + // + // After commit 93e35efb8de45, the seccomp check is done after the + // ptrace emulation check. This simplifies using SYSEMU, since seccomp + // will never run for emulation. 
Seccomp will only run for injected + // system calls, and thus we can use RET_KILL as our violation action. + var defaultAction uint32 + if probeSeccomp() { + log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") + defaultAction = uint32(linux.SECCOMP_RET_KILL) + } else { + // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. + log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") + defaultAction = uint32(linux.SECCOMP_RET_ALLOW) + } + + // When creating the new child process, we specify SIGKILL as the + // signal to deliver when the child exits. We never expect a subprocess + // to exit; they are pooled and reused. This is done to ensure that if + // a subprocess is OOM-killed, this process (and all other stubs, + // transitively) will be killed as well. It's simply not possible to + // safely handle a single stub getting killed: the exact state of + // execution is unknown and not recoverable. + return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction) +} + +// attachedThread returns a new attached thread. +// +// Precondition: the runtime OS thread must be locked. +func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) { + // Create a BPF program that allows only the system calls needed by the + // stub and all its children. This is used to create child stubs + // (below), so we must include the ability to fork, but otherwise lock + // down available calls only to what is needed. + rules := []seccomp.RuleSet{ + // Rules for trapping vsyscall access. + seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_GETTIMEOFDAY: {}, + syscall.SYS_TIME: {}, + 309: {}, // SYS_GETCPU. + }, + Action: uint32(linux.SECCOMP_RET_TRACE), + Vsyscall: true, + }, + } + if defaultAction != uint32(linux.SECCOMP_RET_ALLOW) { + rules = append(rules, seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_CLONE: []seccomp.Rule{ + // Allow creation of new subprocesses (used by the master). 
+ {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)}, + // Allow creation of new threads within a single address space (used by addresss spaces). + {seccomp.AllowValue( + syscall.CLONE_FILES | + syscall.CLONE_FS | + syscall.CLONE_SIGHAND | + syscall.CLONE_THREAD | + syscall.CLONE_PTRACE | + syscall.CLONE_VM)}, + }, + + // For the initial process creation. + syscall.SYS_WAIT4: {}, + syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, + }, + syscall.SYS_EXIT: {}, + + // For the stub prctl dance (all). + syscall.SYS_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)}, + }, + syscall.SYS_GETPPID: {}, + + // For the stub to stop itself (all). + syscall.SYS_GETPID: {}, + syscall.SYS_KILL: []seccomp.Rule{ + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)}, + }, + + // Injected to support the address space operations. + syscall.SYS_MMAP: {}, + syscall.SYS_MUNMAP: {}, + }, + Action: uint32(linux.SECCOMP_RET_ALLOW), + }) + } + instrs, err := seccomp.BuildProgram(rules, defaultAction) + if err != nil { + return nil, err + } + // Declare all variables up front in order to ensure that there's no // need for allocations between beforeFork & afterFork. var ( @@ -43,14 +196,8 @@ func createStub() (*thread, error) { // Among other things, beforeFork masks all signals. beforeFork() - // When creating the new child process, we specify SIGKILL as the - // signal to deliver when the child exits. We never expect a subprocess - // to exit; they are pooled and reused. This is done to ensure that if - // a subprocess is OOM-killed, this process (and all other stubs, - // transitively) will be killed as well. It's simply not possible to - // safely handle a single stub getting killed: the exact state of - // execution is unknown and not recoverable. 
- pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0) + // Do the clone. + pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0) if errno != 0 { afterFork() return nil, errno @@ -67,7 +214,7 @@ func createStub() (*thread, error) { tid: int32(pid), cpu: ^uint32(0), } - if sig := t.wait(); sig != syscall.SIGSTOP { + if sig := t.wait(stopped); sig != syscall.SIGSTOP { return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) } t.attach() @@ -86,6 +233,12 @@ func createStub() (*thread, error) { syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) } + // Set an aggressive BPF filter for the stub and all it's children. See + // the description of the BPF program built above. + if errno := seccomp.SetFilter(instrs); errno != 0 { + syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) + } + // Enable cpuid-faulting; this may fail on older kernels or hardware, // so we just disregard the result. Host CPUID will be enabled. syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0) @@ -105,7 +258,7 @@ func (s *subprocess) createStub() (*thread, error) { t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) // Pass the expected PPID to the child via R15. - regs := s.initRegs + regs := t.initRegs regs.R15 = uint64(t.tgid) // Call fork in a subprocess. @@ -138,7 +291,7 @@ func (s *subprocess) createStub() (*thread, error) { // status. If the wait succeeds, we'll assume that it was the SIGSTOP. // If the child actually exited, the attach below will fail. 
_, err = t.syscallIgnoreInterrupt( - &s.initRegs, + &t.initRegs, syscall.SYS_WAIT4, arch.SyscallArgument{Value: uintptr(pid)}, arch.SyscallArgument{Value: 0}, diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index e1c8db67a..674554081 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -24,6 +24,7 @@ go_library( "//pkg/binary", "//pkg/bits", "//pkg/eventchannel", + "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", "//pkg/sentry/socket/control", diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index f2a22aaa5..a16f5490e 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -28,6 +28,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bits" "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/seccomp" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" pb "gvisor.googlesource.com/gvisor/pkg/sentry/strace/strace_go_proto" @@ -699,3 +700,13 @@ func EnableAll(sinks SinkType) { table.FeatureEnable.EnableAll(flags) } } + +func init() { + t, ok := Lookup(abi.Host, arch.Host) + if ok { + // Provide the native table as the lookup for seccomp + // debugging. This is best-effort. This is provided this way to + // avoid dependencies from seccomp to this package. + seccomp.SyscallName = t.Name + } +} -- cgit v1.2.3 From 0bfa03d61c7791aad03da5ac021bc60e4578858e Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Thu, 11 Oct 2018 11:40:34 -0700 Subject: sentry: allow saving of unlinked files with open fds on virtual fs. 
PiperOrigin-RevId: 216733414 Change-Id: I33cd3eb818f0c39717d6656fcdfff6050b37ebb0 --- pkg/sentry/fs/dirent_state.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index 58dd01202..04ab197b9 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -23,16 +23,20 @@ import ( // beforeSave is invoked by stateify. func (d *Dirent) beforeSave() { - // Refuse to save if the file has already been deleted (but still has - // open fds, which is why the Dirent is still accessible). We know the - // the restore opening of the file will always fail. This condition will - // last until all the open fds and this Dirent are closed and released. + // Refuse to save if the file is on a non-virtual file system and has + // already been deleted (but still has open fds, which is why the Dirent + // is still accessible). We know the the restore re-opening of the file + // will always fail. This condition will last until all the open fds and + // this Dirent are closed and released. + // + // Such "dangling" open files on virtual file systems (e.g., tmpfs) is + // OK to save as their restore does not require re-opening the files. // // Note that this is rejection rather than failure---it would be // perfectly OK to save---we are simply disallowing it here to prevent // generating non-restorable state dumps. As the program continues its // execution, it may become allowed to save again. - if atomic.LoadInt32(&d.deleted) != 0 { + if !d.Inode.IsVirtual() && atomic.LoadInt32(&d.deleted) != 0 { n, _ := d.FullName(nil /* root */) panic(ErrSaveRejection{fmt.Errorf("deleted file %q still has open fds", n)}) } -- cgit v1.2.3 From 47d3862c33b7b74b451ea71139abdea34d5b46bd Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 12 Oct 2018 13:57:10 -0700 Subject: runsc: Support retrieving MTU via netdevice ioctl. This enables ifconfig to display MTU. 
PiperOrigin-RevId: 216917021 Change-Id: Id513b23d9d76899bcb71b0b6a25036f41629a923 --- pkg/sentry/socket/epsocket/epsocket.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index c5da18b0e..a44679f0b 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1266,9 +1266,10 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe // Gets the metric of the device. As per netdevice(7), this // always just sets ifr_metric to 0. usermem.ByteOrder.PutUint32(ifr.Data[:4], 0) + case syscall.SIOCGIFMTU: // Gets the MTU of the device. - // TODO: Implement. + usermem.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) case syscall.SIOCGIFMAP: // Gets the hardware parameters of the device. -- cgit v1.2.3 From 4ea69fce8def9e030cbbc4d803b95e632175750c Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Mon, 15 Oct 2018 09:30:49 -0700 Subject: sentry: save fs.Dirent deleted info. PiperOrigin-RevId: 217155458 Change-Id: Id3265b1ec784787039e2131c80254ac4937330c7 --- pkg/sentry/fs/dirent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index dd2b4e589..7191c5c30 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -119,7 +119,7 @@ type Dirent struct { parent *Dirent // deleted may be set atomically when removed. - deleted int32 `state:"nosave"` + deleted int32 // frozen indicates this entry can't walk to unknown nodes. frozen bool -- cgit v1.2.3 From ecd94ea7a693d49a0edce8607241a8e2ac22bfe0 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 15 Oct 2018 17:41:34 -0700 Subject: Clean up Rename and Unlink checks for EBUSY. - Change Dirent.Busy => Dirent.isMountPoint. The function body is unchanged, and it is no longer exported. 
- fs.MayDelete now checks that the victim is not the process root. This aligns with Linux's namei.c:may_delete(). - Fix "is-ancestor" checks to actually compare all ancestors, not just the parents. - Fix handling of paths that end in dots, which are handled differently in Rename vs. Unlink. PiperOrigin-RevId: 217239274 Change-Id: I7a0eb768e70a1b2915017ce54f7f95cbf8edf1fb --- pkg/sentry/fs/dirent.go | 75 ++++++++++++++++++++++++----------- pkg/sentry/syscalls/linux/sys_file.go | 43 +++++++++++--------- 2 files changed, 75 insertions(+), 43 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 7191c5c30..a42c03e98 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1027,11 +1027,14 @@ func (d *Dirent) flush() { } } -// Busy indicates whether this Dirent is a mount point or root dirent. -func (d *Dirent) Busy() bool { +// isMountPoint returns true if the dirent is a mount point or the root. +func (d *Dirent) isMountPoint() bool { d.mu.Lock() defer d.mu.Unlock() + return d.isMountPointLocked() +} +func (d *Dirent) isMountPointLocked() bool { return d.mounted || d.parent == nil } @@ -1137,7 +1140,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string) error { } // Remove cannot remove a mount point. - if child.Busy() { + if child.isMountPoint() { return syscall.EBUSY } @@ -1211,7 +1214,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) } // Remove cannot remove a mount point. 
- if child.Busy() { + if child.isMountPoint() { return syscall.EBUSY } @@ -1457,12 +1460,20 @@ func MayDelete(ctx context.Context, root, dir *Dirent, name string) error { return mayDelete(ctx, dir, victim) } -func mayDelete(ctx context.Context, dir *Dirent, victim *Dirent) error { +func mayDelete(ctx context.Context, dir, victim *Dirent) error { if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { return err } - return checkSticky(ctx, dir, victim) + if err := checkSticky(ctx, dir, victim); err != nil { + return err + } + + if victim.IsRoot() { + return syserror.EBUSY + } + + return nil } // Rename atomically converts the child of oldParent named oldName to a @@ -1491,33 +1502,28 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string return syscall.ENOENT } - // Check constraints on the object being renamed. + // renamed is the dirent that will be renamed to something else. renamed, err := oldParent.walk(ctx, root, oldName, false /* may unlock */) if err != nil { return err } defer renamed.DecRef() - // Make sure we have write permissions on old and new parent. + // Check that the renamed dirent is deletable. if err := mayDelete(ctx, oldParent, renamed); err != nil { return err } - if newParent != oldParent { - if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { - return err - } + + // Check that the renamed dirent is not a mount point. + if renamed.isMountPointLocked() { + return syscall.EBUSY } // Source should not be an ancestor of the target. - if renamed == newParent { + if newParent.descendantOf(renamed) { return syscall.EINVAL } - // Is the thing we're trying to rename busy? - if renamed.Busy() { - return syscall.EBUSY - } - // Per rename(2): "... EACCES: ... or oldpath is a directory and does not // allow write permission (needed to update the .. entry)." 
if IsDir(renamed.Inode.StableAttr) { @@ -1526,21 +1532,42 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } } - // Check constraints on the object being replaced, if any. + // replaced is the dirent that is being overwritten by rename. replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */) - if err == nil { + if err != nil { + if err != syserror.ENOENT { + return err + } + + // Make sure we can create a new child in the new parent. + if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { + return err + } + } else { + // Check constraints on the dirent being replaced. + // NOTE: We don't want to keep replaced alive // across the Rename, so must call DecRef manually (no defer). + // Check that we can delete replaced. + if err := mayDelete(ctx, oldParent, renamed); err != nil { + replaced.DecRef() + return err + } + // Target should not be an ancestor of source. - if replaced == oldParent { + if oldParent.descendantOf(replaced) { replaced.DecRef() - // Why is this not EINVAL? See fs/namei.c. + + // Note that Linux returns EINVAL if the source is an + // ancestor of target, but ENOTEMPTY if the target is + // an ancestor of source (unless RENAME_EXCHANGE flag + // is present). See fs/namei.c:renameat2. return syscall.ENOTEMPTY } - // Is the thing we're trying to replace busy? - if replaced.Busy() { + // Check that replaced is not a mount point. + if replaced.isMountPointLocked() { replaced.DecRef() return syscall.EBUSY } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 015afda9b..64704bb88 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1042,11 +1042,9 @@ func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { return err } - // Special case: rmdir rejects anything with '.' as last component. 
- // This would be handled by the busy check for the current working - // directory, but this is how it's done. - if (len(path) == 1 && path == ".") || (len(path) > 1 && path[len(path)-2:] == "/.") { - return syserror.EINVAL + // Special case: removing the root always returns EBUSY. + if path == "/" { + return syserror.EBUSY } return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { @@ -1054,6 +1052,15 @@ func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { return syserror.ENOTDIR } + // Linux returns different ernos when the path ends in single + // dot vs. double dots. + switch name { + case ".": + return syserror.EINVAL + case "..": + return syserror.ENOTEMPTY + } + if err := fs.MayDelete(t, root, d, name); err != nil { return err } @@ -1829,27 +1836,25 @@ func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD return syserror.ENOTDIR } - // Root cannot be renamed to anything. - // - // TODO: This catches the case when the rename - // argument is exactly "/", but we should return EBUSY when - // renaming any mount point, or when the argument is not - // exactly "/" but still resolves to the root, like "/.." or - // "/bin/..". - if oldParent == root && oldName == "." { - return syscall.EBUSY + // Rename rejects paths that end in ".", "..", or empty (i.e. + // the root) with EBUSY. + switch oldName { + case "", ".", "..": + return syserror.EBUSY } + return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error { if !fs.IsDir(newParent.Inode.StableAttr) { return syserror.ENOTDIR } - // Nothing can be renamed to root. - // - // TODO: Same as above. - if newParent == root && newName == "." { - return syscall.EBUSY + // Rename rejects paths that end in ".", "..", or empty + // (i.e. the root) with EBUSY. 
+ switch newName { + case "", ".", "..": + return syserror.EBUSY } + return fs.Rename(t, root, oldParent, oldName, newParent, newName) }) }) -- cgit v1.2.3 From 167f2401c4abb1ebda1f4536a04d9854e9008e0b Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 15 Oct 2018 17:47:24 -0700 Subject: Merge host.endpoint into host.ConnectedEndpoint host.endpoint contained duplicated logic from the sockerpair implementation and host.ConnectedEndpoint. Remove host.endpoint in favor of a host.ConnectedEndpoint wrapped in a socketpair end. PiperOrigin-RevId: 217240096 Change-Id: I4a3d51e3fe82bdf30e2d0152458b8499ab4c987c --- pkg/sentry/context/contexttest/contexttest.go | 12 +- pkg/sentry/fs/host/BUILD | 2 + pkg/sentry/fs/host/socket.go | 319 +++++++------------------- pkg/sentry/fs/host/socket_state.go | 19 +- pkg/sentry/kernel/kernel.go | 4 + pkg/sentry/kernel/task.go | 2 + pkg/sentry/uniqueid/BUILD | 5 +- pkg/sentry/uniqueid/context.go | 10 + pkg/tcpip/transport/unix/connectioned.go | 11 + 9 files changed, 134 insertions(+), 250 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index 193ce3440..b3c6a566b 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -53,6 +53,14 @@ type testContext struct { // globalUniqueID tracks incremental unique identifiers for tests. var globalUniqueID uint64 +// globalUniqueIDProvider implements unix.UniqueIDProvider. +type globalUniqueIDProvider struct{} + +// UniqueID implements unix.UniqueIDProvider.UniqueID. +func (*globalUniqueIDProvider) UniqueID() uint64 { + return atomic.AddUint64(&globalUniqueID, 1) +} + // lastInotifyCookie is a monotonically increasing counter for generating unique // inotify cookies. Must be accessed using atomic ops. 
var lastInotifyCookie uint32 @@ -76,7 +84,9 @@ func (t *testContext) Value(key interface{}) interface{} { case platform.CtxPlatform: return t.platform case uniqueid.CtxGlobalUniqueID: - return atomic.AddUint64(&globalUniqueID, 1) + return (*globalUniqueIDProvider).UniqueID(nil) + case uniqueid.CtxGlobalUniqueIDProvider: + return &globalUniqueIDProvider{} case uniqueid.CtxInotifyCookie: return atomic.AddUint32(&lastInotifyCookie, 1) case ktime.CtxRealtimeClock: diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 6d5640f0a..5ada32ee1 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -42,7 +42,9 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", + "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", + "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/link/rawfile", diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 68ebf6402..577e9e272 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -25,6 +25,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" @@ -39,88 +41,35 @@ import ( // N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). const maxSendBufferSize = 8 << 20 -// endpoint encapsulates the state needed to represent a host Unix socket. -// -// TODO: Remove/merge with ConnectedEndpoint. -// -// +stateify savable -type endpoint struct { - queue waiter.Queue `state:"zerovalue"` - - // fd is the host fd backing this file. - fd int `state:"nosave"` - - // If srfd >= 0, it is the host fd that fd was imported from. 
- srfd int `state:"wait"` - - // stype is the type of Unix socket. - stype unix.SockType `state:"nosave"` - - // sndbuf is the size of the send buffer. - sndbuf int `state:"nosave"` -} - -func (e *endpoint) init() error { - family, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN) - if err != nil { - return err - } - - if family != syscall.AF_UNIX { - // We only allow Unix sockets. - return syserror.EINVAL - } - - stype, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_TYPE) - if err != nil { - return err - } - e.stype = unix.SockType(stype) - - e.sndbuf, err = syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) - if err != nil { - return err - } - if e.sndbuf > maxSendBufferSize { - log.Warningf("Socket send buffer too large: %d", e.sndbuf) - return syserror.EINVAL - } - - if err := syscall.SetNonblock(e.fd, true); err != nil { - return err - } - - return fdnotifier.AddFD(int32(e.fd), &e.queue) -} - -// newEndpoint creates a new host endpoint. -func newEndpoint(fd int, srfd int) (*endpoint, error) { - ep := &endpoint{fd: fd, srfd: srfd} - if err := ep.init(); err != nil { - return nil, err - } - return ep, nil -} - // newSocket allocates a new unix socket with host endpoint. 
-func newSocket(ctx context.Context, fd int, saveable bool) (*fs.File, error) { - ownedfd := fd +func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) { + ownedfd := orgfd srfd := -1 if saveable { var err error - ownedfd, err = syscall.Dup(fd) + ownedfd, err = syscall.Dup(orgfd) if err != nil { return nil, err } - srfd = fd + srfd = orgfd } - ep, err := newEndpoint(ownedfd, srfd) + f := fd.New(ownedfd) + var q waiter.Queue + e, err := NewConnectedEndpoint(f, &q, "" /* path */) if err != nil { if saveable { - syscall.Close(ownedfd) + f.Close() + } else { + f.Release() } - return nil, err + return nil, syserr.TranslateNetstackError(err).ToError() } + + e.srfd = srfd + e.Init() + + ep := unix.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + return unixsocket.New(ctx, ep), nil } @@ -130,139 +79,22 @@ func newSocket(ctx context.Context, fd int, saveable bool) (*fs.File, error) { // // NewSocketWithDirent takes ownership of f on success. func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) { - ep, err := newEndpoint(f.FD(), -1) + f2 := fd.New(f.FD()) + var q waiter.Queue + e, err := NewConnectedEndpoint(f2, &q, "" /* path */) if err != nil { - return nil, err + f2.Release() + return nil, syserr.TranslateNetstackError(err).ToError() } // Take ownship of the FD. f.Release() - return unixsocket.NewWithDirent(ctx, d, ep, flags), nil -} - -// Close implements unix.Endpoint.Close. -func (e *endpoint) Close() { - fdnotifier.RemoveFD(int32(e.fd)) - syscall.Close(e.fd) - e.fd = -1 -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (e *endpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { - e.queue.EventRegister(we, mask) - fdnotifier.UpdateFD(int32(e.fd)) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. 
-func (e *endpoint) EventUnregister(we *waiter.Entry) { - e.queue.EventUnregister(we) - fdnotifier.UpdateFD(int32(e.fd)) -} - -// Readiness implements unix.Endpoint.Readiness. -func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { - return fdnotifier.NonBlockingPoll(int32(e.fd), mask) -} - -// Type implements unix.Endpoint.Type. -func (e *endpoint) Type() unix.SockType { - return e.stype -} - -// Connect implements unix.Endpoint.Connect. -func (e *endpoint) Connect(server unix.BoundEndpoint) *tcpip.Error { - return tcpip.ErrInvalidEndpointState -} - -// Bind implements unix.Endpoint.Bind. -func (e *endpoint) Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { - return tcpip.ErrInvalidEndpointState -} - -// Listen implements unix.Endpoint.Listen. -func (e *endpoint) Listen(backlog int) *tcpip.Error { - return tcpip.ErrInvalidEndpointState -} - -// Accept implements unix.Endpoint.Accept. -func (e *endpoint) Accept() (unix.Endpoint, *tcpip.Error) { - return nil, tcpip.ErrInvalidEndpointState -} - -// Shutdown implements unix.Endpoint.Shutdown. -func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { - return tcpip.ErrInvalidEndpointState -} - -// GetSockOpt implements unix.Endpoint.GetSockOpt. -func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch o := opt.(type) { - case tcpip.ErrorOption: - _, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_ERROR) - return translateError(err) - case *tcpip.PasscredOption: - // We don't support passcred on host sockets. - *o = 0 - return nil - case *tcpip.SendBufferSizeOption: - *o = tcpip.SendBufferSizeOption(e.sndbuf) - return nil - case *tcpip.ReceiveBufferSizeOption: - // N.B. Unix sockets don't use the receive buffer. We'll claim it is - // the same size as the send buffer. 
- *o = tcpip.ReceiveBufferSizeOption(e.sndbuf) - return nil - case *tcpip.ReuseAddressOption: - v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR) - *o = tcpip.ReuseAddressOption(v) - return translateError(err) - case *tcpip.ReceiveQueueSizeOption: - return tcpip.ErrQueueSizeNotSupported - } - return tcpip.ErrInvalidEndpointState -} - -// SetSockOpt implements unix.Endpoint.SetSockOpt. -func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { - return nil -} - -// GetLocalAddress implements unix.Endpoint.GetLocalAddress. -func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { - return tcpip.FullAddress{}, nil -} - -// GetRemoteAddress implements unix.Endpoint.GetRemoteAddress. -func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { - return tcpip.FullAddress{}, nil -} - -// Passcred returns whether or not the SO_PASSCRED socket option is -// enabled on this end. -func (e *endpoint) Passcred() bool { - // We don't support credential passing for host sockets. - return false -} - -// ConnectedPasscred returns whether or not the SO_PASSCRED socket option -// is enabled on the connected end. -func (e *endpoint) ConnectedPasscred() bool { - // We don't support credential passing for host sockets. - return false -} + e.Init() -// SendMsg implements unix.Endpoint.SendMsg. -func (e *endpoint) SendMsg(data [][]byte, controlMessages unix.ControlMessages, to unix.BoundEndpoint) (uintptr, *tcpip.Error) { - if to != nil { - return 0, tcpip.ErrInvalidEndpointState - } - - // Since stream sockets don't preserve message boundaries, we can write - // only as much of the message as fits in the send buffer. 
- truncate := e.stype == unix.SockStream + ep := unix.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - return sendMsg(e.fd, data, controlMessages, e.sndbuf, truncate) + return unixsocket.NewWithDirent(ctx, d, ep, flags), nil } func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages, maxlen int, truncate bool) (uintptr, *tcpip.Error) { @@ -278,18 +110,6 @@ func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages, maxlen return n, translateError(err) } -// RecvMsg implements unix.Endpoint.RecvMsg. -func (e *endpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { - // N.B. Unix sockets don't have a receive buffer, the send buffer - // serves both purposes. - rl, ml, cm, err := recvMsg(e.fd, data, numRights, peek, addr, e.sndbuf) - if rl > 0 && err == tcpip.ErrWouldBlock { - // Message did not fill buffer; that's fine, no need to block. - err = nil - } - return rl, ml, cm, err -} - func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress, maxlen int) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { var cm unet.ControlMessage if numRights > 0 { @@ -328,42 +148,21 @@ func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.Fu // be called twice because host.ConnectedEndpoint is both a unix.Receiver and // unix.ConnectedEndpoint. func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *tcpip.Error) { - family, err := syscall.GetsockoptInt(file.FD(), syscall.SOL_SOCKET, syscall.SO_DOMAIN) - if err != nil { - return nil, translateError(err) - } - - if family != syscall.AF_UNIX { - // We only allow Unix sockets. 
- return nil, tcpip.ErrInvalidEndpointState - } - - stype, err := syscall.GetsockoptInt(file.FD(), syscall.SOL_SOCKET, syscall.SO_TYPE) - if err != nil { - return nil, translateError(err) - } - - sndbuf, err := syscall.GetsockoptInt(file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) - if err != nil { - return nil, translateError(err) - } - if sndbuf > maxSendBufferSize { - log.Warningf("Socket send buffer too large: %d", sndbuf) - return nil, tcpip.ErrInvalidEndpointState + e := ConnectedEndpoint{ + path: path, + queue: queue, + file: file, + srfd: -1, } - e := &ConnectedEndpoint{ - path: path, - queue: queue, - file: file, - stype: unix.SockType(stype), - sndbuf: sndbuf, + if err := e.init(); err != nil { + return nil, err } // AtomicRefCounters start off with a single reference. We need two. e.ref.IncRef() - return e, nil + return &e, nil } // Init will do initialization required without holding other locks. @@ -376,7 +175,7 @@ func (c *ConnectedEndpoint) Init() { // ConnectedEndpoint is a host FD backed implementation of // unix.ConnectedEndpoint and unix.Receiver. // -// ConnectedEndpoint does not support save/restore for now. +// +stateify savable type ConnectedEndpoint struct { queue *waiter.Queue path string @@ -385,11 +184,11 @@ type ConnectedEndpoint struct { ref refs.AtomicRefCount // mu protects fd, readClosed and writeClosed. - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` // file is an *fd.FD containing the FD backing this endpoint. It must be // set to nil if it has been closed. - file *fd.FD + file *fd.FD `state:"nosave"` // readClosed is true if the FD has read shutdown or if it has been closed. readClosed bool @@ -398,6 +197,9 @@ type ConnectedEndpoint struct { // closed. writeClosed bool + // If srfd >= 0, it is the host FD that file was imported from. + srfd int `state:"wait"` + // stype is the type of Unix socket. 
stype unix.SockType @@ -407,7 +209,44 @@ type ConnectedEndpoint struct { // GetSockOpt and message splitting/rejection in SendMsg, but do not // prevent lots of small messages from filling the real send buffer // size on the host. - sndbuf int + sndbuf int `state:"nosave"` +} + +// init performs initialization required for creating new ConnectedEndpoints and +// for restoring them. +func (c *ConnectedEndpoint) init() *tcpip.Error { + family, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_DOMAIN) + if err != nil { + return translateError(err) + } + + if family != syscall.AF_UNIX { + // We only allow Unix sockets. + return tcpip.ErrInvalidEndpointState + } + + stype, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return translateError(err) + } + + if err := syscall.SetNonblock(c.file.FD(), true); err != nil { + return translateError(err) + } + + sndbuf, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return translateError(err) + } + if sndbuf > maxSendBufferSize { + log.Warningf("Socket send buffer too large: %d", sndbuf) + return tcpip.ErrInvalidEndpointState + } + + c.stype = unix.SockType(stype) + c.sndbuf = sndbuf + + return nil } // Send implements unix.ConnectedEndpoint.Send. diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go index 6acabd55a..7fa500bfb 100644 --- a/pkg/sentry/fs/host/socket_state.go +++ b/pkg/sentry/fs/host/socket_state.go @@ -17,23 +17,26 @@ package host import ( "fmt" "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" ) // beforeSave is invoked by stateify. -func (ep *endpoint) beforeSave() { - if ep.srfd < 0 { +func (c *ConnectedEndpoint) beforeSave() { + if c.srfd < 0 { panic("only host file descriptors provided at sentry startup can be saved") } } // afterLoad is invoked by stateify. 
-func (ep *endpoint) afterLoad() { - fd, err := syscall.Dup(ep.srfd) +func (c *ConnectedEndpoint) afterLoad() { + f, err := syscall.Dup(c.srfd) if err != nil { - panic(fmt.Sprintf("failed to dup restored fd %d: %v", ep.srfd, err)) + panic(fmt.Sprintf("failed to dup restored FD %d: %v", c.srfd, err)) } - ep.fd = fd - if err := ep.init(); err != nil { - panic(fmt.Sprintf("Could not restore host socket fd %d: %v", ep.srfd, err)) + c.file = fd.New(f) + if err := c.init(); err != nil { + panic(fmt.Sprintf("Could not restore host socket FD %d: %v", c.srfd, err)) } + c.Init() } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 238fd127b..cc664deec 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -591,6 +591,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k case uniqueid.CtxGlobalUniqueID: return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() default: @@ -1045,6 +1047,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k case uniqueid.CtxGlobalUniqueID: return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() default: diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 07ad1614c..4f0b7fe3f 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -590,6 +590,8 @@ func (t *Task) Value(key interface{}) interface{} { return t.k case uniqueid.CtxGlobalUniqueID: return t.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return t.k case uniqueid.CtxInotifyCookie: return t.k.GenerateInotifyCookie() default: diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index 8eba3609e..ff50b9925 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -7,5 +7,8 @@ go_library( srcs = ["context.go"], 
importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid", visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/sentry/context"], + deps = [ + "//pkg/sentry/context", + "//pkg/tcpip/transport/unix", + ], ) diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go index eeb8c4286..541e0611d 100644 --- a/pkg/sentry/uniqueid/context.go +++ b/pkg/sentry/uniqueid/context.go @@ -18,6 +18,7 @@ package uniqueid import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) // contextID is the kernel package's type for context.Context.Value keys. @@ -28,6 +29,10 @@ const ( // unique identifier. CtxGlobalUniqueID contextID = iota + // CtxGlobalUniqueIDProvider is a Context.Value key for a + // system-wide unique identifier generator. + CtxGlobalUniqueIDProvider + // CtxInotifyCookie is a Context.Value key for a unique inotify // event cookie. CtxInotifyCookie @@ -38,6 +43,11 @@ func GlobalFromContext(ctx context.Context) uint64 { return ctx.Value(CtxGlobalUniqueID).(uint64) } +// GlobalProviderFromContext returns a system-wide unique identifier from ctx. +func GlobalProviderFromContext(ctx context.Context) unix.UniqueIDProvider { + return ctx.Value(CtxGlobalUniqueIDProvider).(unix.UniqueIDProvider) +} + // InotifyCookie generates a unique inotify event cookie from ctx. func InotifyCookie(ctx context.Context) uint32 { return ctx.Value(CtxInotifyCookie).(uint32) diff --git a/pkg/tcpip/transport/unix/connectioned.go b/pkg/tcpip/transport/unix/connectioned.go index dd7c03cf1..e319b3bb8 100644 --- a/pkg/tcpip/transport/unix/connectioned.go +++ b/pkg/tcpip/transport/unix/connectioned.go @@ -158,6 +158,17 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { return a, b } +// NewExternal creates a new externally backed Endpoint. It behaves like a +// socketpair. 
+func NewExternal(stype SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { + return &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } +} + // ID implements ConnectingEndpoint.ID. func (e *connectionedEndpoint) ID() uint64 { return e.id -- cgit v1.2.3 From 324ad3564ba42a5106be77a06d0cd52290e1cd22 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 15 Oct 2018 20:21:06 -0700 Subject: Refactor host.ConnectedEndpoint * Integrate recvMsg and sendMsg functions into Recv and Send respectively as they are no longer shared. * Clean up partial read/write error handling code. * Re-order code to make sense given that there is no longer a host.endpoint type. PiperOrigin-RevId: 217255072 Change-Id: Ib43fe9286452f813b8309d969be11f5fa40694cd --- pkg/sentry/fs/host/socket.go | 281 ++++++++++++++++++++------------------- pkg/tcpip/transport/unix/unix.go | 3 + 2 files changed, 144 insertions(+), 140 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 577e9e272..e454b6fe5 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -41,137 +41,6 @@ import ( // N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). const maxSendBufferSize = 8 << 20 -// newSocket allocates a new unix socket with host endpoint. 
-func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) { - ownedfd := orgfd - srfd := -1 - if saveable { - var err error - ownedfd, err = syscall.Dup(orgfd) - if err != nil { - return nil, err - } - srfd = orgfd - } - f := fd.New(ownedfd) - var q waiter.Queue - e, err := NewConnectedEndpoint(f, &q, "" /* path */) - if err != nil { - if saveable { - f.Close() - } else { - f.Release() - } - return nil, syserr.TranslateNetstackError(err).ToError() - } - - e.srfd = srfd - e.Init() - - ep := unix.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - - return unixsocket.New(ctx, ep), nil -} - -// NewSocketWithDirent allocates a new unix socket with host endpoint. -// -// This is currently only used by unsaveable Gofer nodes. -// -// NewSocketWithDirent takes ownership of f on success. -func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) { - f2 := fd.New(f.FD()) - var q waiter.Queue - e, err := NewConnectedEndpoint(f2, &q, "" /* path */) - if err != nil { - f2.Release() - return nil, syserr.TranslateNetstackError(err).ToError() - } - - // Take ownship of the FD. - f.Release() - - e.Init() - - ep := unix.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - - return unixsocket.NewWithDirent(ctx, d, ep, flags), nil -} - -func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages, maxlen int, truncate bool) (uintptr, *tcpip.Error) { - if !controlMessages.Empty() { - return 0, tcpip.ErrInvalidEndpointState - } - n, totalLen, err := fdWriteVec(fd, data, maxlen, truncate) - if n < totalLen && err == nil { - // The host only returns a short write if it would otherwise - // block (and only for stream sockets). 
- err = syserror.EAGAIN - } - return n, translateError(err) -} - -func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress, maxlen int) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { - var cm unet.ControlMessage - if numRights > 0 { - cm.EnableFDs(int(numRights)) - } - rl, ml, cl, rerr := fdReadVec(fd, data, []byte(cm), peek, maxlen) - if rl == 0 && rerr != nil { - return 0, 0, unix.ControlMessages{}, translateError(rerr) - } - - // Trim the control data if we received less than the full amount. - if cl < uint64(len(cm)) { - cm = cm[:cl] - } - - // Avoid extra allocations in the case where there isn't any control data. - if len(cm) == 0 { - return rl, ml, unix.ControlMessages{}, translateError(rerr) - } - - fds, err := cm.ExtractFDs() - if err != nil { - return 0, 0, unix.ControlMessages{}, translateError(err) - } - - if len(fds) == 0 { - return rl, ml, unix.ControlMessages{}, translateError(rerr) - } - return rl, ml, control.New(nil, nil, newSCMRights(fds)), translateError(rerr) -} - -// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host FD -// that will pretend to be bound at a given sentry path. -// -// The caller is responsible for calling Init(). Additionaly, Release needs to -// be called twice because host.ConnectedEndpoint is both a unix.Receiver and -// unix.ConnectedEndpoint. -func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *tcpip.Error) { - e := ConnectedEndpoint{ - path: path, - queue: queue, - file: file, - srfd: -1, - } - - if err := e.init(); err != nil { - return nil, err - } - - // AtomicRefCounters start off with a single reference. We need two. - e.ref.IncRef() - - return &e, nil -} - -// Init will do initialization required without holding other locks. 
-func (c *ConnectedEndpoint) Init() { - if err := fdnotifier.AddFD(int32(c.file.FD()), c.queue); err != nil { - panic(err) - } -} - // ConnectedEndpoint is a host FD backed implementation of // unix.ConnectedEndpoint and unix.Receiver. // @@ -249,6 +118,93 @@ func (c *ConnectedEndpoint) init() *tcpip.Error { return nil } +// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host FD +// that will pretend to be bound at a given sentry path. +// +// The caller is responsible for calling Init(). Additionaly, Release needs to +// be called twice because ConnectedEndpoint is both a unix.Receiver and +// unix.ConnectedEndpoint. +func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *tcpip.Error) { + e := ConnectedEndpoint{ + path: path, + queue: queue, + file: file, + srfd: -1, + } + + if err := e.init(); err != nil { + return nil, err + } + + // AtomicRefCounters start off with a single reference. We need two. + e.ref.IncRef() + + return &e, nil +} + +// Init will do initialization required without holding other locks. +func (c *ConnectedEndpoint) Init() { + if err := fdnotifier.AddFD(int32(c.file.FD()), c.queue); err != nil { + panic(err) + } +} + +// NewSocketWithDirent allocates a new unix socket with host endpoint. +// +// This is currently only used by unsaveable Gofer nodes. +// +// NewSocketWithDirent takes ownership of f on success. +func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) { + f2 := fd.New(f.FD()) + var q waiter.Queue + e, err := NewConnectedEndpoint(f2, &q, "" /* path */) + if err != nil { + f2.Release() + return nil, syserr.TranslateNetstackError(err).ToError() + } + + // Take ownship of the FD. + f.Release() + + e.Init() + + ep := unix.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + + return unixsocket.NewWithDirent(ctx, d, ep, flags), nil +} + +// newSocket allocates a new unix socket with host endpoint. 
+func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) { + ownedfd := orgfd + srfd := -1 + if saveable { + var err error + ownedfd, err = syscall.Dup(orgfd) + if err != nil { + return nil, err + } + srfd = orgfd + } + f := fd.New(ownedfd) + var q waiter.Queue + e, err := NewConnectedEndpoint(f, &q, "" /* path */) + if err != nil { + if saveable { + f.Close() + } else { + f.Release() + } + return nil, syserr.TranslateNetstackError(err).ToError() + } + + e.srfd = srfd + e.Init() + + ep := unix.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + + return unixsocket.New(ctx, ep), nil +} + // Send implements unix.ConnectedEndpoint.Send. func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { c.mu.RLock() @@ -257,14 +213,30 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMess return 0, false, tcpip.ErrClosedForSend } + if !controlMessages.Empty() { + return 0, false, tcpip.ErrInvalidEndpointState + } + // Since stream sockets don't preserve message boundaries, we can write // only as much of the message as fits in the send buffer. truncate := c.stype == unix.SockStream - n, err := sendMsg(c.file.FD(), data, controlMessages, c.sndbuf, truncate) - // There is no need for the callee to call SendNotify because sendMsg uses - // the host's sendmsg(2) and the host kernel's queue. - return n, false, err + n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate) + if n < totalLen && err == nil { + // The host only returns a short write if it would otherwise + // block (and only for stream sockets). + err = syserror.EAGAIN + } + if n > 0 && err != syserror.EAGAIN { + // The caller may need to block to send more data, but + // otherwise there isn't anything that can be done about an + // error with a partial write. 
+ err = nil + } + + // There is no need for the callee to call SendNotify because fdWriteVec + // uses the host's sendmsg(2) and the host kernel's queue. + return n, false, translateError(err) } // SendNotify implements unix.ConnectedEndpoint.SendNotify. @@ -318,17 +290,46 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive } + var cm unet.ControlMessage + if numRights > 0 { + cm.EnableFDs(int(numRights)) + } + // N.B. Unix sockets don't have a receive buffer, the send buffer // serves both purposes. - rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil, c.sndbuf) - if rl > 0 && err == tcpip.ErrWouldBlock { - // Message did not fill buffer; that's fine, no need to block. + rl, ml, cl, err := fdReadVec(c.file.FD(), data, []byte(cm), peek, c.sndbuf) + if rl > 0 && err != nil { + // We got some data, so all we need to do on error is return + // the data that we got. Short reads are fine, no need to + // block. err = nil } + if err != nil { + return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, translateError(err) + } - // There is no need for the callee to call RecvNotify because recvMsg uses + // There is no need for the callee to call RecvNotify because fdReadVec uses // the host's recvmsg(2) and the host kernel's queue. - return rl, ml, cm, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, err + + // Trim the control data if we received less than the full amount. + if cl < uint64(len(cm)) { + cm = cm[:cl] + } + + // Avoid extra allocations in the case where there isn't any control data. 
+ if len(cm) == 0 { + return rl, ml, unix.ControlMessages{}, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + } + + fds, err := cm.ExtractFDs() + if err != nil { + return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, translateError(err) + } + + if len(fds) == 0 { + return rl, ml, unix.ControlMessages{}, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + } + return rl, ml, control.New(nil, nil, newSCMRights(fds)), tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil } // close releases all resources related to the endpoint. diff --git a/pkg/tcpip/transport/unix/unix.go b/pkg/tcpip/transport/unix/unix.go index 718606cd1..1bca4b0b4 100644 --- a/pkg/tcpip/transport/unix/unix.go +++ b/pkg/tcpip/transport/unix/unix.go @@ -562,6 +562,9 @@ type ConnectedEndpoint interface { // Send sends a single message. This method does not block. // // notify indicates if SendNotify should be called. + // + // tcpip.ErrWouldBlock can be returned along with a partial write if + // the caller should block to send the rest of the data. Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n uintptr, notify bool, err *tcpip.Error) // SendNotify notifies the ConnectedEndpoint of a successful Send. 
This -- cgit v1.2.3 From 6cba410df0ea2eabb87bad5074a8a79ed89312b8 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 17 Oct 2018 11:36:32 -0700 Subject: Move Unix transport out of netstack PiperOrigin-RevId: 217557656 Change-Id: I63d27635b1a6c12877279995d2d9847b6a19da9b --- pkg/sentry/fs/BUILD | 2 +- pkg/sentry/fs/ashmem/BUILD | 1 - pkg/sentry/fs/dirent.go | 4 +- pkg/sentry/fs/fsutil/BUILD | 2 +- pkg/sentry/fs/fsutil/inode.go | 6 +- pkg/sentry/fs/gofer/BUILD | 2 +- pkg/sentry/fs/gofer/path.go | 4 +- pkg/sentry/fs/gofer/session.go | 22 +- pkg/sentry/fs/gofer/socket.go | 25 +- pkg/sentry/fs/host/BUILD | 4 +- pkg/sentry/fs/host/control.go | 8 +- pkg/sentry/fs/host/inode.go | 6 +- pkg/sentry/fs/host/socket.go | 66 +- pkg/sentry/fs/host/socket_test.go | 12 +- pkg/sentry/fs/inode.go | 6 +- pkg/sentry/fs/inode_operations.go | 6 +- pkg/sentry/fs/inode_overlay.go | 6 +- pkg/sentry/fs/ramfs/BUILD | 2 +- pkg/sentry/fs/ramfs/dir.go | 6 +- pkg/sentry/fs/ramfs/ramfs.go | 4 +- pkg/sentry/fs/ramfs/socket.go | 8 +- pkg/sentry/fs/tmpfs/BUILD | 2 +- pkg/sentry/fs/tmpfs/tmpfs.go | 6 +- pkg/sentry/fs/tty/BUILD | 2 +- pkg/sentry/fs/tty/dir.go | 4 +- pkg/sentry/kernel/BUILD | 2 +- pkg/sentry/kernel/abstract_socket_namespace.go | 16 +- pkg/sentry/socket/BUILD | 2 +- pkg/sentry/socket/control/BUILD | 2 +- pkg/sentry/socket/control/control.go | 46 +- pkg/sentry/socket/epsocket/BUILD | 2 +- pkg/sentry/socket/epsocket/epsocket.go | 20 +- pkg/sentry/socket/epsocket/provider.go | 8 +- pkg/sentry/socket/hostinet/BUILD | 2 +- pkg/sentry/socket/hostinet/socket.go | 6 +- pkg/sentry/socket/netlink/BUILD | 2 +- pkg/sentry/socket/netlink/provider.go | 8 +- pkg/sentry/socket/netlink/socket.go | 20 +- pkg/sentry/socket/rpcinet/BUILD | 2 +- pkg/sentry/socket/rpcinet/socket.go | 6 +- pkg/sentry/socket/socket.go | 12 +- pkg/sentry/socket/unix/BUILD | 2 +- pkg/sentry/socket/unix/io.go | 21 +- pkg/sentry/socket/unix/transport/BUILD | 22 + pkg/sentry/socket/unix/transport/connectioned.go | 454 ++++++++++ 
.../socket/unix/transport/connectioned_state.go | 53 ++ pkg/sentry/socket/unix/transport/connectionless.go | 192 +++++ pkg/sentry/socket/unix/transport/queue/BUILD | 15 + pkg/sentry/socket/unix/transport/queue/queue.go | 227 +++++ pkg/sentry/socket/unix/transport/unix.go | 953 +++++++++++++++++++++ pkg/sentry/socket/unix/unix.go | 62 +- pkg/sentry/syscalls/linux/BUILD | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 8 +- pkg/sentry/uniqueid/BUILD | 2 +- pkg/sentry/uniqueid/context.go | 6 +- pkg/tcpip/transport/queue/BUILD | 15 - pkg/tcpip/transport/queue/queue.go | 227 ----- pkg/tcpip/transport/unix/BUILD | 22 - pkg/tcpip/transport/unix/connectioned.go | 454 ---------- pkg/tcpip/transport/unix/connectioned_state.go | 53 -- pkg/tcpip/transport/unix/connectionless.go | 192 ----- pkg/tcpip/transport/unix/unix.go | 953 --------------------- 62 files changed, 2154 insertions(+), 2153 deletions(-) create mode 100644 pkg/sentry/socket/unix/transport/BUILD create mode 100644 pkg/sentry/socket/unix/transport/connectioned.go create mode 100644 pkg/sentry/socket/unix/transport/connectioned_state.go create mode 100644 pkg/sentry/socket/unix/transport/connectionless.go create mode 100644 pkg/sentry/socket/unix/transport/queue/BUILD create mode 100644 pkg/sentry/socket/unix/transport/queue/queue.go create mode 100644 pkg/sentry/socket/unix/transport/unix.go delete mode 100644 pkg/tcpip/transport/queue/BUILD delete mode 100644 pkg/tcpip/transport/queue/queue.go delete mode 100644 pkg/tcpip/transport/unix/BUILD delete mode 100644 pkg/tcpip/transport/unix/connectioned.go delete mode 100644 pkg/tcpip/transport/unix/connectioned_state.go delete mode 100644 pkg/tcpip/transport/unix/connectionless.go delete mode 100644 pkg/tcpip/transport/unix/unix.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index a949fffbf..548898aaa 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -59,13 +59,13 @@ go_library( "//pkg/sentry/limits", 
"//pkg/sentry/memmap", "//pkg/sentry/platform", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", "//pkg/syserror", "//pkg/tcpip", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index dc893d22f..44ef82e64 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -28,7 +28,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", - "//pkg/tcpip/transport/unix", ], ) diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index a42c03e98..27fea0019 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -26,9 +26,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) type globalDirentMap struct { @@ -800,7 +800,7 @@ func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, } // Bind satisfies the InodeOperations interface; otherwise same as GetFile. 
-func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data unix.BoundEndpoint, perms FilePermissions) (*Dirent, error) { +func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data transport.BoundEndpoint, perms FilePermissions) (*Dirent, error) { var childDir *Dirent err := d.genericCreate(ctx, root, name, func() error { var e error diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 3512bae6f..6834e1272 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -87,11 +87,11 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/safemem", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", "//pkg/syserror", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 3479f2fad..3acc32752 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -19,9 +19,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -254,7 +254,7 @@ func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs. } // Bind implements fs.InodeOperations.Bind. 
-func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { +func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, transport.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { return nil, syserror.ENOTDIR } @@ -277,7 +277,7 @@ func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) err type InodeNotSocket struct{} // BoundEndpoint implements fs.InodeOperations.BoundEndpoint. -func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) unix.BoundEndpoint { +func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint { return nil } diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index cb17339c9..cef01829a 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -41,10 +41,10 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/safemem", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", "//pkg/tcpip", - "//pkg/tcpip/transport/unix", "//pkg/unet", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index bec9680f8..0bf7881da 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -22,8 +22,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) // Lookup loads an Inode at name into a Dirent based on the session's cache @@ -180,7 +180,7 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s } // Bind implements InodeOperations.Bind. 
-func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { if i.session().endpoints == nil { return nil, syscall.EOPNOTSUPP } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 49d27ee88..4e2293398 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -24,7 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/unet" ) @@ -36,23 +36,23 @@ type endpointMaps struct { // direntMap links sockets to their dirents. // It is filled concurrently with the keyMap and is stored upon save. // Before saving, this map is used to populate the pathMap. - direntMap map[unix.BoundEndpoint]*fs.Dirent + direntMap map[transport.BoundEndpoint]*fs.Dirent // keyMap links MultiDeviceKeys (containing inode IDs) to their sockets. // It is not stored during save because the inode ID may change upon restore. - keyMap map[device.MultiDeviceKey]unix.BoundEndpoint `state:"nosave"` + keyMap map[device.MultiDeviceKey]transport.BoundEndpoint `state:"nosave"` // pathMap links the sockets to their paths. // It is filled before saving from the direntMap and is stored upon save. // Upon restore, this map is used to re-populate the keyMap. - pathMap map[unix.BoundEndpoint]string + pathMap map[transport.BoundEndpoint]string } // add adds the endpoint to the maps. // A reference is taken on the dirent argument. // // Precondition: maps must have been locked with 'lock'. 
-func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep unix.BoundEndpoint) { +func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) { e.keyMap[key] = ep d.IncRef() e.direntMap[ep] = d @@ -81,7 +81,7 @@ func (e *endpointMaps) lock() func() { // get returns the endpoint mapped to the given key. // // Precondition: maps must have been locked for reading. -func (e *endpointMaps) get(key device.MultiDeviceKey) unix.BoundEndpoint { +func (e *endpointMaps) get(key device.MultiDeviceKey) transport.BoundEndpoint { return e.keyMap[key] } @@ -285,9 +285,9 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF // newEndpointMaps creates a new endpointMaps. func newEndpointMaps() *endpointMaps { return &endpointMaps{ - direntMap: make(map[unix.BoundEndpoint]*fs.Dirent), - keyMap: make(map[device.MultiDeviceKey]unix.BoundEndpoint), - pathMap: make(map[unix.BoundEndpoint]string), + direntMap: make(map[transport.BoundEndpoint]*fs.Dirent), + keyMap: make(map[device.MultiDeviceKey]transport.BoundEndpoint), + pathMap: make(map[transport.BoundEndpoint]string), } } @@ -341,7 +341,7 @@ func (s *session) fillPathMap() error { func (s *session) restoreEndpointMaps(ctx context.Context) error { // When restoring, only need to create the keyMap because the dirent and path // maps got stored through the save. - s.endpoints.keyMap = make(map[device.MultiDeviceKey]unix.BoundEndpoint) + s.endpoints.keyMap = make(map[device.MultiDeviceKey]transport.BoundEndpoint) if err := s.fillKeyMap(ctx); err != nil { return fmt.Errorf("failed to insert sockets into endpoint map: %v", err) } @@ -349,6 +349,6 @@ func (s *session) restoreEndpointMaps(ctx context.Context) error { // Re-create pathMap because it can no longer be trusted as socket paths can // change while process continues to run. Empty pathMap will be re-filled upon // next save. 
- s.endpoints.pathMap = make(map[unix.BoundEndpoint]string) + s.endpoints.pathMap = make(map[transport.BoundEndpoint]string) return nil } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 0190bc006..d072da624 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -19,13 +19,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// BoundEndpoint returns a gofer-backed unix.BoundEndpoint. -func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint { +// BoundEndpoint returns a gofer-backed transport.BoundEndpoint. +func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.BoundEndpoint { if !fs.IsSocket(i.fileState.sattr) { return nil } @@ -45,7 +45,7 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.Bound return &endpoint{inode, i.fileState.file.file, path} } -// endpoint is a Gofer-backed unix.BoundEndpoint. +// endpoint is a Gofer-backed transport.BoundEndpoint. 
// // An endpoint's lifetime is the time between when InodeOperations.BoundEndpoint() // is called and either BoundEndpoint.BidirectionalConnect or @@ -61,20 +61,20 @@ type endpoint struct { path string } -func unixSockToP9(t unix.SockType) (p9.ConnectFlags, bool) { +func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) { switch t { - case unix.SockStream: + case transport.SockStream: return p9.StreamSocket, true - case unix.SockSeqpacket: + case transport.SockSeqpacket: return p9.SeqpacketSocket, true - case unix.SockDgram: + case transport.SockDgram: return p9.DgramSocket, true } return 0, false } // BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. -func (e *endpoint) BidirectionalConnect(ce unix.ConnectingEndpoint, returnConnect func(unix.Receiver, unix.ConnectedEndpoint)) *tcpip.Error { +func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *tcpip.Error { cf, ok := unixSockToP9(ce.Type()) if !ok { return tcpip.ErrConnectionRefused @@ -113,8 +113,9 @@ func (e *endpoint) BidirectionalConnect(ce unix.ConnectingEndpoint, returnConnec return nil } -// UnidirectionalConnect implements unix.BoundEndpoint.UnidirectionalConnect. -func (e *endpoint) UnidirectionalConnect() (unix.ConnectedEndpoint, *tcpip.Error) { +// UnidirectionalConnect implements +// transport.BoundEndpoint.UnidirectionalConnect. +func (e *endpoint) UnidirectionalConnect() (transport.ConnectedEndpoint, *tcpip.Error) { hostFile, err := e.file.Connect(p9.DgramSocket) if err != nil { return nil, tcpip.ErrConnectionRefused @@ -134,7 +135,7 @@ func (e *endpoint) UnidirectionalConnect() (unix.ConnectedEndpoint, *tcpip.Error return c, nil } -// Release implements unix.BoundEndpoint.Release. +// Release implements transport.BoundEndpoint.Release. 
func (e *endpoint) Release() { e.inode.DecRef() } diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 5ada32ee1..4f264a024 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -42,13 +42,13 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/link/rawfile", - "//pkg/tcpip/transport/unix", "//pkg/unet", "//pkg/waiter", "//pkg/waiter/fdnotifier", @@ -72,10 +72,10 @@ go_test( "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/socket", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/tcpip", - "//pkg/tcpip/transport/unix", "//pkg/waiter", "//pkg/waiter/fdnotifier", ], diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index d2b007ab2..d2e34a69d 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -20,7 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" ) type scmRights struct { @@ -45,13 +45,13 @@ func (c *scmRights) Files(ctx context.Context, max int) control.RightsFiles { return rf } -// Clone implements unix.RightsControlMessage.Clone. -func (c *scmRights) Clone() unix.RightsControlMessage { +// Clone implements transport.RightsControlMessage.Clone. +func (c *scmRights) Clone() transport.RightsControlMessage { // Host rights never need to be cloned. return nil } -// Release implements unix.RightsControlMessage.Release. +// Release implements transport.RightsControlMessage.Release. 
func (c *scmRights) Release() { for _, fd := range c.fds { syscall.Close(fd) diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index c2e8ba62f..e32497203 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -27,8 +27,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -310,12 +310,12 @@ func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldNa } // Bind implements fs.InodeOperations.Bind. -func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { return nil, syserror.EOPNOTSUPP } // BoundEndpoint implements fs.InodeOperations.BoundEndpoint. 
-func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint { +func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport.BoundEndpoint { return nil } diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index e454b6fe5..0eb267c00 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -25,12 +25,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/pkg/waiter" "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" @@ -42,7 +42,7 @@ import ( const maxSendBufferSize = 8 << 20 // ConnectedEndpoint is a host FD backed implementation of -// unix.ConnectedEndpoint and unix.Receiver. +// transport.ConnectedEndpoint and transport.Receiver. // // +stateify savable type ConnectedEndpoint struct { @@ -70,7 +70,7 @@ type ConnectedEndpoint struct { srfd int `state:"wait"` // stype is the type of Unix socket. - stype unix.SockType + stype transport.SockType // sndbuf is the size of the send buffer. // @@ -112,7 +112,7 @@ func (c *ConnectedEndpoint) init() *tcpip.Error { return tcpip.ErrInvalidEndpointState } - c.stype = unix.SockType(stype) + c.stype = transport.SockType(stype) c.sndbuf = sndbuf return nil @@ -122,8 +122,8 @@ func (c *ConnectedEndpoint) init() *tcpip.Error { // that will pretend to be bound at a given sentry path. // // The caller is responsible for calling Init(). 
Additionaly, Release needs to -// be called twice because ConnectedEndpoint is both a unix.Receiver and -// unix.ConnectedEndpoint. +// be called twice because ConnectedEndpoint is both a transport.Receiver and +// transport.ConnectedEndpoint. func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *tcpip.Error) { e := ConnectedEndpoint{ path: path, @@ -168,7 +168,7 @@ func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.F e.Init() - ep := unix.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) return unixsocket.NewWithDirent(ctx, d, ep, flags), nil } @@ -200,13 +200,13 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) e.srfd = srfd e.Init() - ep := unix.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) return unixsocket.New(ctx, ep), nil } -// Send implements unix.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { +// Send implements transport.ConnectedEndpoint.Send. +func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { c.mu.RLock() defer c.mu.RUnlock() if c.writeClosed { @@ -219,7 +219,7 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMess // Since stream sockets don't preserve message boundaries, we can write // only as much of the message as fits in the send buffer. 
- truncate := c.stype == unix.SockStream + truncate := c.stype == transport.SockStream n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate) if n < totalLen && err == nil { @@ -239,20 +239,20 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMess return n, false, translateError(err) } -// SendNotify implements unix.ConnectedEndpoint.SendNotify. +// SendNotify implements transport.ConnectedEndpoint.SendNotify. func (c *ConnectedEndpoint) SendNotify() {} -// CloseSend implements unix.ConnectedEndpoint.CloseSend. +// CloseSend implements transport.ConnectedEndpoint.CloseSend. func (c *ConnectedEndpoint) CloseSend() { c.mu.Lock() c.writeClosed = true c.mu.Unlock() } -// CloseNotify implements unix.ConnectedEndpoint.CloseNotify. +// CloseNotify implements transport.ConnectedEndpoint.CloseNotify. func (c *ConnectedEndpoint) CloseNotify() {} -// Writable implements unix.ConnectedEndpoint.Writable. +// Writable implements transport.ConnectedEndpoint.Writable. func (c *ConnectedEndpoint) Writable() bool { c.mu.RLock() defer c.mu.RUnlock() @@ -262,18 +262,18 @@ func (c *ConnectedEndpoint) Writable() bool { return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0 } -// Passcred implements unix.ConnectedEndpoint.Passcred. +// Passcred implements transport.ConnectedEndpoint.Passcred. func (c *ConnectedEndpoint) Passcred() bool { // We don't support credential passing for host sockets. return false } -// GetLocalAddress implements unix.ConnectedEndpoint.GetLocalAddress. +// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress. func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil } -// EventUpdate implements unix.ConnectedEndpoint.EventUpdate. +// EventUpdate implements transport.ConnectedEndpoint.EventUpdate. 
func (c *ConnectedEndpoint) EventUpdate() { c.mu.RLock() defer c.mu.RUnlock() @@ -282,12 +282,12 @@ func (c *ConnectedEndpoint) EventUpdate() { } } -// Recv implements unix.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, unix.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { +// Recv implements transport.Receiver.Recv. +func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { c.mu.RLock() defer c.mu.RUnlock() if c.readClosed { - return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive + return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive } var cm unet.ControlMessage @@ -305,7 +305,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p err = nil } if err != nil { - return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, translateError(err) + return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, translateError(err) } // There is no need for the callee to call RecvNotify because fdReadVec uses @@ -318,16 +318,16 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p // Avoid extra allocations in the case where there isn't any control data. 
if len(cm) == 0 { - return rl, ml, unix.ControlMessages{}, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + return rl, ml, transport.ControlMessages{}, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil } fds, err := cm.ExtractFDs() if err != nil { - return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, translateError(err) + return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, translateError(err) } if len(fds) == 0 { - return rl, ml, unix.ControlMessages{}, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + return rl, ml, transport.ControlMessages{}, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil } return rl, ml, control.New(nil, nil, newSCMRights(fds)), tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil } @@ -339,17 +339,17 @@ func (c *ConnectedEndpoint) close() { c.file = nil } -// RecvNotify implements unix.Receiver.RecvNotify. +// RecvNotify implements transport.Receiver.RecvNotify. func (c *ConnectedEndpoint) RecvNotify() {} -// CloseRecv implements unix.Receiver.CloseRecv. +// CloseRecv implements transport.Receiver.CloseRecv. func (c *ConnectedEndpoint) CloseRecv() { c.mu.Lock() c.readClosed = true c.mu.Unlock() } -// Readable implements unix.Receiver.Readable. +// Readable implements transport.Receiver.Readable. func (c *ConnectedEndpoint) Readable() bool { c.mu.RLock() defer c.mu.RUnlock() @@ -359,33 +359,33 @@ func (c *ConnectedEndpoint) Readable() bool { return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0 } -// SendQueuedSize implements unix.Receiver.SendQueuedSize. +// SendQueuedSize implements transport.Receiver.SendQueuedSize. func (c *ConnectedEndpoint) SendQueuedSize() int64 { // SendQueuedSize isn't supported for host sockets because we don't allow the // sentry to call ioctl(2). return -1 } -// RecvQueuedSize implements unix.Receiver.RecvQueuedSize. +// RecvQueuedSize implements transport.Receiver.RecvQueuedSize. 
func (c *ConnectedEndpoint) RecvQueuedSize() int64 { // RecvQueuedSize isn't supported for host sockets because we don't allow the // sentry to call ioctl(2). return -1 } -// SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize. +// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize. func (c *ConnectedEndpoint) SendMaxQueueSize() int64 { return int64(c.sndbuf) } -// RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize. +// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize. func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { // N.B. Unix sockets don't use the receive buffer. We'll claim it is // the same size as the send buffer. return int64(c.sndbuf) } -// Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release. +// Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release. func (c *ConnectedEndpoint) Release() { c.ref.DecRefWithDestructor(c.close) } diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 8b752737d..1c6f9ddb1 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -22,20 +22,20 @@ import ( "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) var ( - // Make sure that ConnectedEndpoint implements unix.ConnectedEndpoint. - _ = unix.ConnectedEndpoint(new(ConnectedEndpoint)) + // Make sure that ConnectedEndpoint implements transport.ConnectedEndpoint. 
+ _ = transport.ConnectedEndpoint(new(ConnectedEndpoint)) - // Make sure that ConnectedEndpoint implements unix.Receiver. - _ = unix.Receiver(new(ConnectedEndpoint)) + // Make sure that ConnectedEndpoint implements transport.Receiver. + _ = transport.Receiver(new(ConnectedEndpoint)) ) func getFl(fd int) (uint32, error) { @@ -199,7 +199,7 @@ func TestListen(t *testing.T) { func TestSend(t *testing.T) { e := ConnectedEndpoint{writeClosed: true} - if _, _, err := e.Send(nil, unix.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend { + if _, _, err := e.Send(nil, transport.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend { t.Errorf("Got %#v.Send() = %v, want = %v", e, err, tcpip.ErrClosedForSend) } } diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index db7240dca..409c81a97 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -22,8 +22,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) // Inode is a file system object that can be simultaneously referenced by different @@ -223,7 +223,7 @@ func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, } // Bind calls i.InodeOperations.Bind with i as the directory. 
-func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) { +func (i *Inode) Bind(ctx context.Context, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { if i.overlay != nil { return overlayBind(ctx, i.overlay, name, data, perm) } @@ -231,7 +231,7 @@ func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, } // BoundEndpoint calls i.InodeOperations.BoundEndpoint with i as the Inode. -func (i *Inode) BoundEndpoint(path string) unix.BoundEndpoint { +func (i *Inode) BoundEndpoint(path string) transport.BoundEndpoint { if i.overlay != nil { return overlayBoundEndpoint(i.overlay, path) } diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 952f9704d..3ee3de10e 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -20,8 +20,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -146,7 +146,7 @@ type InodeOperations interface { // Implementations must ensure that name does not already exist. // // The caller must ensure that this operation is permitted. - Bind(ctx context.Context, dir *Inode, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) + Bind(ctx context.Context, dir *Inode, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) // BoundEndpoint returns the socket endpoint at path stored in // or generated by an Inode. @@ -160,7 +160,7 @@ type InodeOperations interface { // generally implies that this Inode was created via CreateSocket. 
// // If there is no socket endpoint available, nil will be returned. - BoundEndpoint(inode *Inode, path string) unix.BoundEndpoint + BoundEndpoint(inode *Inode, path string) transport.BoundEndpoint // GetFile returns a new open File backed by a Dirent and FileFlags. // It may block as long as it is done with ctx. diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index e18e095a0..cf698a4da 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -20,8 +20,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) func overlayHasWhiteout(parent *Inode, name string) bool { @@ -356,7 +356,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return nil } -func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.BoundEndpoint, perm FilePermissions) (*Dirent, error) { +func overlayBind(ctx context.Context, o *overlayEntry, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() // We do not support doing anything exciting with sockets unless there @@ -383,7 +383,7 @@ func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.Bo return NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil } -func overlayBoundEndpoint(o *overlayEntry, path string) unix.BoundEndpoint { +func overlayBoundEndpoint(o *overlayEntry, path string) transport.BoundEndpoint { o.copyMu.RLock() defer o.copyMu.RUnlock() diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 5230157fe..a93ad6240 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -23,9 +23,9 @@ go_library( 
"//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/safemem", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index d8333194b..075e13b01 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -20,9 +20,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) // CreateOps represents operations to create different file types. @@ -37,7 +37,7 @@ type CreateOps struct { NewSymlink func(ctx context.Context, dir *fs.Inode, target string) (*fs.Inode, error) // NewBoundEndpoint creates a new socket. - NewBoundEndpoint func(ctx context.Context, dir *fs.Inode, ep unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Inode, error) + NewBoundEndpoint func(ctx context.Context, dir *fs.Inode, ep transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Inode, error) // NewFifo creates a new fifo. NewFifo func(ctx context.Context, dir *fs.Inode, perm fs.FilePermissions) (*fs.Inode, error) @@ -314,7 +314,7 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p } // Bind implements fs.InodeOperations.Bind. 
-func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { +func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { return nil, ErrDenied } diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index 1028b5f1d..83cbcab23 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -26,9 +26,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -279,7 +279,7 @@ func (*Entry) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermiss } // Bind is not supported by default. -func (*Entry) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { +func (*Entry) Bind(context.Context, *fs.Inode, string, transport.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { return nil, ErrInvalidOp } diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 93427a1ff..9ac00eb18 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -17,7 +17,7 @@ package ramfs import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" ) // Socket represents a socket. @@ -27,17 +27,17 @@ type Socket struct { Entry // ep is the bound endpoint. 
- ep unix.BoundEndpoint + ep transport.BoundEndpoint } // InitSocket initializes a socket. -func (s *Socket) InitSocket(ctx context.Context, ep unix.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions) { +func (s *Socket) InitSocket(ctx context.Context, ep transport.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions) { s.InitEntry(ctx, owner, perms) s.ep = ep } // BoundEndpoint returns the socket data. -func (s *Socket) BoundEndpoint(*fs.Inode, string) unix.BoundEndpoint { +func (s *Socket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint { // ramfs only supports stored sentry internal sockets. Only gofer sockets // care about the path argument. return s.ep diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index cfe11ab02..277583113 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -25,9 +25,9 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/safemem", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 10cb5451d..38be6db46 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -22,9 +22,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) var fsInfo = fs.Info{ @@ -104,7 +104,7 @@ func (d *Dir) newCreateOps() *ramfs.CreateOps { NewSymlink: func(ctx context.Context, dir *fs.Inode, target string) (*fs.Inode, error) { return NewSymlink(ctx, target, fs.FileOwnerFromContext(ctx), dir.MountSource), nil }, - NewBoundEndpoint: func(ctx 
context.Context, dir *fs.Inode, socket unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Inode, error) { + NewBoundEndpoint: func(ctx context.Context, dir *fs.Inode, socket transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Inode, error) { return NewSocket(ctx, socket, fs.FileOwnerFromContext(ctx), perms, dir.MountSource), nil }, NewFifo: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { @@ -160,7 +160,7 @@ type Socket struct { } // NewSocket returns a new socket with the provided permissions. -func NewSocket(ctx context.Context, socket unix.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { +func NewSocket(ctx context.Context, socket transport.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { s := &Socket{} s.InitSocket(ctx, socket, owner, perms) return fs.NewInode(s, msrc, fs.StableAttr{ diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 3c446eef4..d4dd20e30 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -26,9 +26,9 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index c6f39fce3..7c0c0b0c1 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -26,9 +26,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -215,7 +215,7 @@ func (d 
*dirInodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, } // Bind implements fs.InodeOperations.Bind. -func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { +func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { return nil, syserror.EPERM } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 31ad96612..acc61cb09 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -156,6 +156,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/safemem", "//pkg/sentry/socket/netlink/port", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/time", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", @@ -166,7 +167,6 @@ go_library( "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index d6d1d341d..45088c988 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -19,12 +19,12 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/refs" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" ) // +stateify savable type abstractEndpoint struct { - ep unix.BoundEndpoint + ep transport.BoundEndpoint wr *refs.WeakRef name string ns *AbstractSocketNamespace @@ -56,14 +56,14 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { } } -// A boundEndpoint wraps a unix.BoundEndpoint to maintain a reference on its -// backing object. +// A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on +// its backing object. 
type boundEndpoint struct { - unix.BoundEndpoint + transport.BoundEndpoint rc refs.RefCounter } -// Release implements unix.BoundEndpoint.Release. +// Release implements transport.BoundEndpoint.Release. func (e *boundEndpoint) Release() { e.rc.DecRef() e.BoundEndpoint.Release() @@ -71,7 +71,7 @@ func (e *boundEndpoint) Release() { // BoundEndpoint retrieves the endpoint bound to the given name. The return // value is nil if no endpoint was bound. -func (a *AbstractSocketNamespace) BoundEndpoint(name string) unix.BoundEndpoint { +func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndpoint { a.mu.Lock() defer a.mu.Unlock() @@ -93,7 +93,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) unix.BoundEndpoint // // When the last reference managed by rc is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(name string, ep unix.BoundEndpoint, rc refs.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index a320fca0b..3a8044b5f 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -16,9 +16,9 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/tcpip", - "//pkg/tcpip/transport/unix", ], ) diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index c4874fdfb..d3a63f15f 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -18,8 +18,8 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", - "//pkg/tcpip/transport/unix", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go 
index c31182e69..db97e95f2 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -24,16 +24,16 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) const maxInt = int(^uint(0) >> 1) // SCMCredentials represents a SCM_CREDENTIALS socket control message. type SCMCredentials interface { - unix.CredentialsControlMessage + transport.CredentialsControlMessage // Credentials returns properly namespaced values for the sender's pid, uid // and gid. @@ -42,7 +42,7 @@ type SCMCredentials interface { // SCMRights represents a SCM_RIGHTS socket control message. type SCMRights interface { - unix.RightsControlMessage + transport.RightsControlMessage // Files returns up to max RightsFiles. Files(ctx context.Context, max int) RightsFiles @@ -81,8 +81,8 @@ func (fs *RightsFiles) Files(ctx context.Context, max int) RightsFiles { return rf } -// Clone implements unix.RightsControlMessage.Clone. -func (fs *RightsFiles) Clone() unix.RightsControlMessage { +// Clone implements transport.RightsControlMessage.Clone. +func (fs *RightsFiles) Clone() transport.RightsControlMessage { nfs := append(RightsFiles(nil), *fs...) for _, nf := range nfs { nf.IncRef() @@ -90,7 +90,7 @@ func (fs *RightsFiles) Clone() unix.RightsControlMessage { return &nfs } -// Release implements unix.RightsControlMessage.Release. +// Release implements transport.RightsControlMessage.Release. 
func (fs *RightsFiles) Release() { for _, f := range *fs { f.DecRef() @@ -156,8 +156,8 @@ func NewSCMCredentials(t *kernel.Task, cred linux.ControlMessageCredentials) (SC return &scmCredentials{t, kuid, kgid}, nil } -// Equals implements unix.CredentialsControlMessage.Equals. -func (c *scmCredentials) Equals(oc unix.CredentialsControlMessage) bool { +// Equals implements transport.CredentialsControlMessage.Equals. +func (c *scmCredentials) Equals(oc transport.CredentialsControlMessage) bool { if oc, _ := oc.(*scmCredentials); oc != nil && *c == *oc { return true } @@ -301,7 +301,7 @@ func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { } // Parse parses a raw socket control message into portable objects. -func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.ControlMessages, error) { +func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) { var ( fds linux.ControlMessageRights @@ -311,20 +311,20 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.Contr for i := 0; i < len(buf); { if i+linux.SizeOfControlMessageHeader > len(buf) { - return unix.ControlMessages{}, syserror.EINVAL + return transport.ControlMessages{}, syserror.EINVAL } var h linux.ControlMessageHeader binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h) if h.Length < uint64(linux.SizeOfControlMessageHeader) { - return unix.ControlMessages{}, syserror.EINVAL + return transport.ControlMessages{}, syserror.EINVAL } if h.Length > uint64(len(buf)-i) { - return unix.ControlMessages{}, syserror.EINVAL + return transport.ControlMessages{}, syserror.EINVAL } if h.Level != linux.SOL_SOCKET { - return unix.ControlMessages{}, syserror.EINVAL + return transport.ControlMessages{}, syserror.EINVAL } i += linux.SizeOfControlMessageHeader @@ -340,7 +340,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.Contr numRights := rightsSize / 
linux.SizeOfControlMessageRight if len(fds)+numRights > linux.SCM_MAX_FD { - return unix.ControlMessages{}, syserror.EINVAL + return transport.ControlMessages{}, syserror.EINVAL } for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { @@ -351,7 +351,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.Contr case linux.SCM_CREDENTIALS: if length < linux.SizeOfControlMessageCredentials { - return unix.ControlMessages{}, syserror.EINVAL + return transport.ControlMessages{}, syserror.EINVAL } binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds) @@ -360,7 +360,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.Contr default: // Unknown message type. - return unix.ControlMessages{}, syserror.EINVAL + return transport.ControlMessages{}, syserror.EINVAL } } @@ -368,7 +368,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.Contr if haveCreds { var err error if credentials, err = NewSCMCredentials(t, creds); err != nil { - return unix.ControlMessages{}, err + return transport.ControlMessages{}, err } } else { credentials = makeCreds(t, socketOrEndpoint) @@ -378,22 +378,22 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (unix.Contr if len(fds) > 0 { var err error if rights, err = NewSCMRights(t, fds); err != nil { - return unix.ControlMessages{}, err + return transport.ControlMessages{}, err } } if credentials == nil && rights == nil { - return unix.ControlMessages{}, nil + return transport.ControlMessages{}, nil } - return unix.ControlMessages{Credentials: credentials, Rights: rights}, nil + return transport.ControlMessages{Credentials: credentials, Rights: rights}, nil } func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { if t == nil || socketOrEndpoint == nil { return nil } - if cr, ok := socketOrEndpoint.(unix.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) { + if cr, ok 
:= socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) { tcred := t.Credentials() return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} } @@ -401,8 +401,8 @@ func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { } // New creates default control messages if needed. -func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) unix.ControlMessages { - return unix.ControlMessages{ +func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages { + return transport.ControlMessages{ Credentials: makeCreds(t, socketOrEndpoint), Rights: rights, } diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 7f9ea9edc..dbabc931c 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", @@ -42,7 +43,6 @@ go_library( "//pkg/tcpip/stack", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index a44679f0b..47c575e7b 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -44,13 +44,13 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - 
"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -108,26 +108,26 @@ func htons(v uint16) uint16 { } // commonEndpoint represents the intersection of a tcpip.Endpoint and a -// unix.Endpoint. +// transport.Endpoint. type commonEndpoint interface { // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress and - // unix.Endpoint.GetLocalAddress. + // transport.Endpoint.GetLocalAddress. GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress and - // unix.Endpoint.GetRemoteAddress. + // transport.Endpoint.GetRemoteAddress. GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) // Readiness implements tcpip.Endpoint.Readiness and - // unix.Endpoint.Readiness. + // transport.Endpoint.Readiness. Readiness(mask waiter.EventMask) waiter.EventMask // SetSockOpt implements tcpip.Endpoint.SetSockOpt and - // unix.Endpoint.SetSockOpt. + // transport.Endpoint.SetSockOpt. SetSockOpt(interface{}) *tcpip.Error // GetSockOpt implements tcpip.Endpoint.GetSockOpt and - // unix.Endpoint.GetSockOpt. + // transport.Endpoint.GetSockOpt. GetSockOpt(interface{}) *tcpip.Error } @@ -146,7 +146,7 @@ type SocketOperations struct { family int Endpoint tcpip.Endpoint - skType unix.SockType + skType transport.SockType // readMu protects access to readView, control, and sender. readMu sync.Mutex `state:"nosave"` @@ -156,7 +156,7 @@ type SocketOperations struct { } // New creates a new endpoint socket. 
-func New(t *kernel.Task, family int, skType unix.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) *fs.File { +func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) *fs.File { dirent := socket.NewDirent(t, epsocketDevice) defer dirent.DecRef() return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true}, &SocketOperations{ @@ -502,7 +502,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. -func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType unix.SockType, level, name, outLen int) (interface{}, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: switch name { diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 6c1e3b6b9..dbc232d26 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" @@ -28,7 +29,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -40,7 +40,7 @@ type provider struct { // GetTransportProtocol figures out transport protocol. 
Currently only TCP, // UDP, and ICMP are supported. -func GetTransportProtocol(stype unix.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { +func GetTransportProtocol(stype transport.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { switch stype { case linux.SOCK_STREAM: if protocol != 0 && protocol != syscall.IPPROTO_TCP { @@ -62,7 +62,7 @@ func GetTransportProtocol(stype unix.SockType, protocol int) (tcpip.TransportPro } // Socket creates a new socket object for the AF_INET or AF_INET6 family. -func (p *provider) Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { // Fail right away if we don't have a stack. stack := t.NetworkContext() if stack == nil { @@ -92,7 +92,7 @@ func (p *provider) Socket(t *kernel.Task, stype unix.SockType, protocol int) (*f } // Pair just returns nil sockets (not supported). 
-func (*provider) Pair(*kernel.Task, unix.SockType, int) (*fs.File, *fs.File, *syserr.Error) { +func (*provider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) { return nil, nil, nil } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index d623718b3..c30220a46 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -29,10 +29,10 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", - "//pkg/tcpip/transport/unix", "//pkg/waiter", "//pkg/waiter/fdnotifier", ], diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index d0f3054dc..e82624b44 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -27,10 +27,10 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) @@ -511,7 +511,7 @@ type socketProvider struct { } // Socket implements socket.Provider.Socket. -func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) { // Check that we are using the host network stack. 
stack := t.NetworkContext() if stack == nil { @@ -553,7 +553,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protoc } // Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index b852165f7..cff922cb8 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -25,11 +25,11 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index e874216f4..5d0a04a07 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -22,8 +22,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserr" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) // Protocol is the implementation of a netlink socket protocol. @@ -66,10 +66,10 @@ type socketProvider struct { } // Socket implements socket.Provider.Socket. 
-func (*socketProvider) Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) { +func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { // Netlink sockets must be specified as datagram or raw, but they // behave the same regardless of type. - if stype != unix.SockDgram && stype != unix.SockRaw { + if stype != transport.SockDgram && stype != transport.SockRaw { return nil, syserr.ErrSocketNotSupported } @@ -94,7 +94,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype unix.SockType, protocol int) } // Pair implements socket.Provider.Pair by returning an error. -func (*socketProvider) Pair(*kernel.Task, unix.SockType, int) (*fs.File, *fs.File, *syserr.Error) { +func (*socketProvider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) { // Netlink sockets never supports creating socket pairs. return nil, nil, syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index f3b2c7256..0c03997f2 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -31,12 +31,12 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" - sunix "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -80,11 +80,11 @@ type Socket struct { // ep is a datagram unix endpoint used to buffer messages sent from the // kernel to userspace. 
RecvMsg reads messages from this endpoint. - ep unix.Endpoint + ep transport.Endpoint // connection is the kernel's connection to ep, used to write messages // sent to userspace. - connection unix.ConnectedEndpoint + connection transport.ConnectedEndpoint // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -105,7 +105,7 @@ var _ socket.Socket = (*Socket)(nil) // NewSocket creates a new Socket. func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) { // Datagram endpoint used to buffer kernel -> user messages. - ep := unix.NewConnectionless() + ep := transport.NewConnectionless() // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. @@ -115,7 +115,7 @@ func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) { } // Create a connection from which the kernel can write messages. - connection, terr := ep.(unix.BoundEndpoint).UnidirectionalConnect() + connection, terr := ep.(transport.BoundEndpoint).UnidirectionalConnect() if terr != nil { ep.Close() return nil, syserr.TranslateNetstackError(terr) @@ -368,7 +368,7 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have trunc := flags&linux.MSG_TRUNC != 0 - r := sunix.EndpointReader{ + r := unix.EndpointReader{ Endpoint: s.ep, Peek: flags&linux.MSG_PEEK != 0, } @@ -408,7 +408,7 @@ func (s *Socket) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ if dst.NumBytes() == 0 { return 0, nil } - return dst.CopyOutFrom(ctx, &sunix.EndpointReader{ + return dst.CopyOutFrom(ctx, &unix.EndpointReader{ Endpoint: s.ep, }) } @@ -424,7 +424,7 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error if len(bufs) > 0 { // RecvMsg never receives the address, so we don't need to send // one. 
- _, notify, terr := s.connection.Send(bufs, unix.ControlMessages{}, tcpip.FullAddress{}) + _, notify, terr := s.connection.Send(bufs, transport.ControlMessages{}, tcpip.FullAddress{}) // If the buffer is full, we simply drop messages, just like // Linux. if terr != nil && terr != tcpip.ErrWouldBlock { @@ -448,7 +448,7 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error PortID: uint32(ms.PortID), }) - _, notify, terr := s.connection.Send([][]byte{m.Finalize()}, unix.ControlMessages{}, tcpip.FullAddress{}) + _, notify, terr := s.connection.Send([][]byte{m.Finalize()}, transport.ControlMessages{}, tcpip.FullAddress{}) if terr != nil && terr != tcpip.ErrWouldBlock { return syserr.TranslateNetstackError(terr) } diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 288199779..3ea433360 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -31,12 +31,12 @@ go_library( "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/rpcinet/conn", "//pkg/sentry/socket/rpcinet/notifier", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/buffer", - "//pkg/tcpip/transport/unix", "//pkg/unet", "//pkg/waiter", ], diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 72fa1ca8f..c7e761d54 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -31,12 +31,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" 
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -763,7 +763,7 @@ type socketProvider struct { } // Socket implements socket.Provider.Socket. -func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) { // Check that we are using the RPC network stack. stack := t.NetworkContext() if stack == nil { @@ -803,7 +803,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protoc } // Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 54fe64595..31f8d42d7 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -29,16 +29,16 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) // ControlMessages represents the union of unix control messages and tcpip // control messages. 
type ControlMessages struct { - Unix unix.ControlMessages + Unix transport.ControlMessages IP tcpip.ControlMessages } @@ -109,12 +109,12 @@ type Provider interface { // If a nil Socket _and_ a nil error is returned, it means that the // protocol is not supported. A non-nil error should only be returned // if the protocol is supported, but an error occurs during creation. - Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) + Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) // Pair creates a pair of connected sockets. // // See Socket for error information. - Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) + Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) } // families holds a map of all known address families and their providers. @@ -128,7 +128,7 @@ func RegisterProvider(family int, provider Provider) { } // New creates a new socket with the given family, type and protocol. -func New(t *kernel.Task, family int, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) { +func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { for _, p := range families[family] { s, err := p.Socket(t, stype, protocol) if err != nil { @@ -144,7 +144,7 @@ func New(t *kernel.Task, family int, stype unix.SockType, protocol int) (*fs.Fil // Pair creates a new connected socket pair with the given family, type and // protocol. 
-func Pair(t *kernel.Task, family int, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { providers, ok := families[family] if !ok { return nil, nil, syserr.ErrAddressFamilyNotSupported diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 9fe681e9a..a12fa93db 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -26,11 +26,11 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/epsocket", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 0ca2e35d0..06333e14b 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -16,23 +16,23 @@ package unix import ( "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) -// EndpointWriter implements safemem.Writer that writes to a unix.Endpoint. +// EndpointWriter implements safemem.Writer that writes to a transport.Endpoint. // // EndpointWriter is not thread-safe. type EndpointWriter struct { - // Endpoint is the unix.Endpoint to write to. - Endpoint unix.Endpoint + // Endpoint is the transport.Endpoint to write to. + Endpoint transport.Endpoint // Control is the control messages to send. - Control unix.ControlMessages + Control transport.ControlMessages // To is the endpoint to send to. May be nil. - To unix.BoundEndpoint + To transport.BoundEndpoint } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. 
@@ -46,12 +46,13 @@ func (w *EndpointWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) }}.WriteFromBlocks(srcs) } -// EndpointReader implements safemem.Reader that reads from a unix.Endpoint. +// EndpointReader implements safemem.Reader that reads from a +// transport.Endpoint. // // EndpointReader is not thread-safe. type EndpointReader struct { - // Endpoint is the unix.Endpoint to read from. - Endpoint unix.Endpoint + // Endpoint is the transport.Endpoint to read from. + Endpoint transport.Endpoint // Creds indicates if credential control messages are requested. Creds bool @@ -71,7 +72,7 @@ type EndpointReader struct { From *tcpip.FullAddress // Control contains the received control messages. - Control unix.ControlMessages + Control transport.ControlMessages } // ReadToBlocks implements safemem.Reader.ReadToBlocks. diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD new file mode 100644 index 000000000..04ef0d438 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -0,0 +1,22 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "transport", + srcs = [ + "connectioned.go", + "connectioned_state.go", + "connectionless.go", + "unix.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport", + visibility = ["//:sandbox"], + deps = [ + "//pkg/ilist", + "//pkg/sentry/socket/unix/transport/queue", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go new file mode 100644 index 000000000..f09935765 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -0,0 +1,454 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package transport + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport/queue" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// UniqueIDProvider generates a sequence of unique identifiers useful for, +// among other things, lock ordering. +type UniqueIDProvider interface { + // UniqueID returns a new unique identifier. + UniqueID() uint64 +} + +// A ConnectingEndpoint is a connectioned unix endpoint that is attempting to +// establish a bidirectional connection with a BoundEndpoint. +type ConnectingEndpoint interface { + // ID returns the endpoint's globally unique identifier. This identifier + // must be used to determine locking order if more than one endpoint is + // to be locked in the same codepath. The endpoint with the smaller + // identifier must be locked before endpoints with larger identifiers. + ID() uint64 + + // Passcred implements socket.Credentialer.Passcred. + Passcred() bool + + // Type returns the socket type, typically either SockStream or + // SockSeqpacket. The connection attempt must be aborted if this + // value doesn't match the ConnectableEndpoint's type. + Type() SockType + + // GetLocalAddress returns the bound path. + GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + + // Locker protects the following methods. While locked, only the holder of + // the lock can change the return value of the protected methods. + sync.Locker + + // Connected returns true iff the ConnectingEndpoint is in the connected + // state. 
ConnectingEndpoints can only be connected to a single endpoint, + // so the connection attempt must be aborted if this returns true. + Connected() bool + + // Listening returns true iff the ConnectingEndpoint is in the listening + // state. ConnectingEndpoints cannot make connections while listening, so + // the connection attempt must be aborted if this returns true. + Listening() bool + + // WaiterQueue returns a pointer to the endpoint's waiter queue. + WaiterQueue() *waiter.Queue +} + +// connectionedEndpoint is a Unix-domain connected or connectable endpoint and implements +// ConnectingEndpoint, ConnectableEndpoint and tcpip.Endpoint. +// +// connectionedEndpoints must be in connected state in order to transfer data. +// +// This implementation includes STREAM and SEQPACKET Unix sockets created with +// socket(2), accept(2) or socketpair(2) and dgram unix sockets created with +// socketpair(2). See unix_connectionless.go for the implementation of DGRAM +// Unix sockets created with socket(2). +// +// The state is much simpler than a TCP endpoint, so it is not encoded +// explicitly. Instead we enforce the following invariants: +// +// receiver != nil, connected != nil => connected. +// path != "" && acceptedChan == nil => bound, not listening. +// path != "" && acceptedChan != nil => bound and listening. +// +// Only one of these will be true at any moment. +// +// +stateify savable +type connectionedEndpoint struct { + baseEndpoint + + // id is the unique endpoint identifier. This is used exclusively for + // lock ordering within connect. + id uint64 + + // idGenerator is used to generate new unique endpoint identifiers. + idGenerator UniqueIDProvider + + // stype is used by connecting sockets to ensure that they are the + // same type. The value is typically either tcpip.SockSeqpacket or + // tcpip.SockStream. + stype SockType + + // acceptedChan is per the TCP endpoint implementation. 
Note that the + // sockets in this channel are _already in the connected state_, and + // have another associated connectionedEndpoint. + // + // If nil, then no listen call has been made. + acceptedChan chan *connectionedEndpoint `state:".([]*connectionedEndpoint)"` +} + +// NewConnectioned creates a new unbound connectionedEndpoint. +func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint { + return &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } +} + +// NewPair allocates a new pair of connected unix-domain connectionedEndpoints. +func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { + a := &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } + b := &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } + + q1 := queue.New(a.Queue, b.Queue, initialLimit) + q2 := queue.New(b.Queue, a.Queue, initialLimit) + + if stype == SockStream { + a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} + b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}} + } else { + a.receiver = &queueReceiver{q1} + b.receiver = &queueReceiver{q2} + } + + a.connected = &connectedEndpoint{ + endpoint: b, + writeQueue: q2, + } + b.connected = &connectedEndpoint{ + endpoint: a, + writeQueue: q1, + } + + return a, b +} + +// NewExternal creates a new externally backed Endpoint. It behaves like a +// socketpair. +func NewExternal(stype SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { + return &connectionedEndpoint{ + baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected}, + id: uid.UniqueID(), + idGenerator: uid, + stype: stype, + } +} + +// ID implements ConnectingEndpoint.ID. 
+func (e *connectionedEndpoint) ID() uint64 { + return e.id +} + +// Type implements ConnectingEndpoint.Type and Endpoint.Type. +func (e *connectionedEndpoint) Type() SockType { + return e.stype +} + +// WaiterQueue implements ConnectingEndpoint.WaiterQueue. +func (e *connectionedEndpoint) WaiterQueue() *waiter.Queue { + return e.Queue +} + +// isBound returns true iff the connectionedEndpoint is bound (but not +// listening). +func (e *connectionedEndpoint) isBound() bool { + return e.path != "" && e.acceptedChan == nil +} + +// Listening implements ConnectingEndpoint.Listening. +func (e *connectionedEndpoint) Listening() bool { + return e.acceptedChan != nil +} + +// Close puts the connectionedEndpoint in a closed state and frees all +// resources associated with it. +// +// The socket will be a fresh state after a call to close and may be reused. +// That is, close may be used to "unbind" or "disconnect" the socket in error +// paths. +func (e *connectionedEndpoint) Close() { + e.Lock() + var c ConnectedEndpoint + var r Receiver + switch { + case e.Connected(): + e.connected.CloseSend() + e.receiver.CloseRecv() + c = e.connected + r = e.receiver + e.connected = nil + e.receiver = nil + case e.isBound(): + e.path = "" + case e.Listening(): + close(e.acceptedChan) + for n := range e.acceptedChan { + n.Close() + } + e.acceptedChan = nil + e.path = "" + } + e.Unlock() + if c != nil { + c.CloseNotify() + c.Release() + } + if r != nil { + r.CloseNotify() + r.Release() + } +} + +// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. +func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error { + if ce.Type() != e.stype { + return tcpip.ErrConnectionRefused + } + + // Check if ce is e to avoid a deadlock. + if ce, ok := ce.(*connectionedEndpoint); ok && ce == e { + return tcpip.ErrInvalidEndpointState + } + + // Do a dance to safely acquire locks on both endpoints. 
+ if e.id < ce.ID() { + e.Lock() + ce.Lock() + } else { + ce.Lock() + e.Lock() + } + + // Check connecting state. + if ce.Connected() { + e.Unlock() + ce.Unlock() + return tcpip.ErrAlreadyConnected + } + if ce.Listening() { + e.Unlock() + ce.Unlock() + return tcpip.ErrInvalidEndpointState + } + + // Check bound state. + if !e.Listening() { + e.Unlock() + ce.Unlock() + return tcpip.ErrConnectionRefused + } + + // Create a newly bound connectionedEndpoint. + ne := &connectionedEndpoint{ + baseEndpoint: baseEndpoint{ + path: e.path, + Queue: &waiter.Queue{}, + }, + id: e.idGenerator.UniqueID(), + idGenerator: e.idGenerator, + stype: e.stype, + } + readQueue := queue.New(ce.WaiterQueue(), ne.Queue, initialLimit) + writeQueue := queue.New(ne.Queue, ce.WaiterQueue(), initialLimit) + ne.connected = &connectedEndpoint{ + endpoint: ce, + writeQueue: readQueue, + } + if e.stype == SockStream { + ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} + } else { + ne.receiver = &queueReceiver{readQueue: writeQueue} + } + + select { + case e.acceptedChan <- ne: + // Commit state. + connected := &connectedEndpoint{ + endpoint: ne, + writeQueue: writeQueue, + } + if e.stype == SockStream { + returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected) + } else { + returnConnect(&queueReceiver{readQueue: readQueue}, connected) + } + + // Notify can deadlock if we are holding these locks. + e.Unlock() + ce.Unlock() + + // Notify on both ends. + e.Notify(waiter.EventIn) + ce.WaiterQueue().Notify(waiter.EventOut) + + return nil + default: + // Busy; return ECONNREFUSED per spec. + ne.Close() + e.Unlock() + ce.Unlock() + return tcpip.ErrConnectionRefused + } +} + +// UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. 
+func (e *connectionedEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) { + return nil, tcpip.ErrConnectionRefused +} + +// Connect attempts to directly connect to another Endpoint. +// Implements Endpoint.Connect. +func (e *connectionedEndpoint) Connect(server BoundEndpoint) *tcpip.Error { + returnConnect := func(r Receiver, ce ConnectedEndpoint) { + e.receiver = r + e.connected = ce + } + + return server.BidirectionalConnect(e, returnConnect) +} + +// Listen starts listening on the connection. +func (e *connectionedEndpoint) Listen(backlog int) *tcpip.Error { + e.Lock() + defer e.Unlock() + if e.Listening() { + // Adjust the size of the channel iff we can fit existing + // pending connections into the new one. + if len(e.acceptedChan) > backlog { + return tcpip.ErrInvalidEndpointState + } + origChan := e.acceptedChan + e.acceptedChan = make(chan *connectionedEndpoint, backlog) + close(origChan) + for ep := range origChan { + e.acceptedChan <- ep + } + return nil + } + if !e.isBound() { + return tcpip.ErrInvalidEndpointState + } + + // Normal case. + e.acceptedChan = make(chan *connectionedEndpoint, backlog) + return nil +} + +// Accept accepts a new connection. +func (e *connectionedEndpoint) Accept() (Endpoint, *tcpip.Error) { + e.Lock() + defer e.Unlock() + + if !e.Listening() { + return nil, tcpip.ErrInvalidEndpointState + } + + select { + case ne := <-e.acceptedChan: + return ne, nil + + default: + // Nothing left. + return nil, tcpip.ErrWouldBlock + } +} + +// Bind binds the connection. +// +// For Unix connectionedEndpoints, this _only sets the address associated with +// the socket_. Work associated with sockets in the filesystem or finding those +// sockets must be done by a higher level. +// +// Bind will fail only if the socket is connected, bound or the passed address +// is invalid (the empty string). 
+func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + e.Lock() + defer e.Unlock() + if e.isBound() || e.Listening() { + return tcpip.ErrAlreadyBound + } + if addr.Addr == "" { + // The empty string is not permitted. + return tcpip.ErrBadLocalAddress + } + if commit != nil { + if err := commit(); err != nil { + return err + } + } + + // Save the bound address. + e.path = string(addr.Addr) + return nil +} + +// SendMsg writes data and a control message to the endpoint's peer. +// This method does not block if the data cannot be written. +func (e *connectionedEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) { + // Stream sockets do not support specifying the endpoint. Seqpacket + // sockets ignore the passed endpoint. + if e.stype == SockStream && to != nil { + return 0, tcpip.ErrNotSupported + } + return e.baseEndpoint.SendMsg(data, c, to) +} + +// Readiness returns the current readiness of the connectionedEndpoint. For +// example, if waiter.EventIn is set, the connectionedEndpoint is immediately +// readable. +func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + e.Lock() + defer e.Unlock() + + ready := waiter.EventMask(0) + switch { + case e.Connected(): + if mask&waiter.EventIn != 0 && e.receiver.Readable() { + ready |= waiter.EventIn + } + if mask&waiter.EventOut != 0 && e.connected.Writable() { + ready |= waiter.EventOut + } + case e.Listening(): + if mask&waiter.EventIn != 0 && len(e.acceptedChan) > 0 { + ready |= waiter.EventIn + } + } + + return ready +} diff --git a/pkg/sentry/socket/unix/transport/connectioned_state.go b/pkg/sentry/socket/unix/transport/connectioned_state.go new file mode 100644 index 000000000..7e6c73dcc --- /dev/null +++ b/pkg/sentry/socket/unix/transport/connectioned_state.go @@ -0,0 +1,53 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package transport + +// saveAcceptedChan is invoked by stateify. +func (e *connectionedEndpoint) saveAcceptedChan() []*connectionedEndpoint { + // If acceptedChan is nil (i.e. we are not listening) then we will save nil. + // Otherwise we create a (possibly empty) slice of the values in acceptedChan and + // save that. + var acceptedSlice []*connectionedEndpoint + if e.acceptedChan != nil { + // Swap out acceptedChan with a new empty channel of the same capacity. + saveChan := e.acceptedChan + e.acceptedChan = make(chan *connectionedEndpoint, cap(saveChan)) + + // Create a new slice with the same len and capacity as the channel. + acceptedSlice = make([]*connectionedEndpoint, len(saveChan), cap(saveChan)) + // Drain acceptedChan into saveSlice, and fill up the new acceptChan at the + // same time. + for i := range acceptedSlice { + ep := <-saveChan + acceptedSlice[i] = ep + e.acceptedChan <- ep + } + close(saveChan) + } + return acceptedSlice +} + +// loadAcceptedChan is invoked by stateify. +func (e *connectionedEndpoint) loadAcceptedChan(acceptedSlice []*connectionedEndpoint) { + // If acceptedSlice is nil, then acceptedChan should also be nil. + if acceptedSlice != nil { + // Otherwise, create a new channel with the same capacity as acceptedSlice. + e.acceptedChan = make(chan *connectionedEndpoint, cap(acceptedSlice)) + // Seed the channel with values from acceptedSlice. 
+	for _, ep := range acceptedSlice {
+		e.acceptedChan <- ep
+	}
+	}
+}
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
new file mode 100644
index 000000000..fb2728010
--- /dev/null
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -0,0 +1,192 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package transport
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport/queue"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// connectionlessEndpoint is a unix endpoint for unix sockets that support operating in
+// a connectionless fashion.
+//
+// Specifically, this means datagram unix sockets not created with
+// socketpair(2).
+//
+// +stateify savable
+type connectionlessEndpoint struct {
+	baseEndpoint
+}
+
+// NewConnectionless creates a new unbound dgram endpoint.
+func NewConnectionless() Endpoint {
+	ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}}
+	ep.receiver = &queueReceiver{readQueue: queue.New(&waiter.Queue{}, ep.Queue, initialLimit)}
+	return ep
+}
+
+// isBound returns true iff the endpoint is bound.
+func (e *connectionlessEndpoint) isBound() bool {
+	return e.path != ""
+}
+
+// Close puts the endpoint in a closed state and frees all resources associated
+// with it.
+//
+// The socket will be in a fresh state after a call to close and may be reused.
+// That is, close may be used to "unbind" or "disconnect" the socket in error
+// paths.
+func (e *connectionlessEndpoint) Close() {
+	e.Lock()
+	var r Receiver
+	if e.Connected() {
+		e.receiver.CloseRecv()
+		r = e.receiver
+		e.receiver = nil
+
+		e.connected.Release()
+		e.connected = nil
+	}
+	if e.isBound() {
+		e.path = ""
+	}
+	e.Unlock()
+	if r != nil {
+		r.CloseNotify()
+		r.Release()
+	}
+}
+
+// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
+func (e *connectionlessEndpoint) BidirectionalConnect(ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error {
+	return tcpip.ErrConnectionRefused
+}
+
+// UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect.
+func (e *connectionlessEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) {
+	e.Lock()
+	r := e.receiver
+	e.Unlock()
+	if r == nil {
+		return nil, tcpip.ErrConnectionRefused
+	}
+	return &connectedEndpoint{
+		endpoint:   e,
+		writeQueue: r.(*queueReceiver).readQueue,
+	}, nil
+}
+
+// SendMsg writes data and a control message to the specified endpoint.
+// This method does not block if the data cannot be written.
+func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) {
+	if to == nil {
+		return e.baseEndpoint.SendMsg(data, c, nil)
+	}
+
+	connected, err := to.UnidirectionalConnect()
+	if err != nil {
+		return 0, tcpip.ErrInvalidEndpointState
+	}
+	defer connected.Release()
+
+	e.Lock()
+	n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
+	e.Unlock()
+
+	if notify {
+		connected.SendNotify()
+	}
+
+	return n, err
+}
+
+// Type implements Endpoint.Type.
+func (e *connectionlessEndpoint) Type() SockType {
+	return SockDgram
+}
+
+// Connect attempts to connect directly to server.
+func (e *connectionlessEndpoint) Connect(server BoundEndpoint) *tcpip.Error { + connected, err := server.UnidirectionalConnect() + if err != nil { + return err + } + + e.Lock() + e.connected = connected + e.Unlock() + + return nil +} + +// Listen starts listening on the connection. +func (e *connectionlessEndpoint) Listen(int) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// Accept accepts a new connection. +func (e *connectionlessEndpoint) Accept() (Endpoint, *tcpip.Error) { + return nil, tcpip.ErrNotSupported +} + +// Bind binds the connection. +// +// For Unix endpoints, this _only sets the address associated with the socket_. +// Work associated with sockets in the filesystem or finding those sockets must +// be done by a higher level. +// +// Bind will fail only if the socket is connected, bound or the passed address +// is invalid (the empty string). +func (e *connectionlessEndpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + e.Lock() + defer e.Unlock() + if e.isBound() { + return tcpip.ErrAlreadyBound + } + if addr.Addr == "" { + // The empty string is not permitted. + return tcpip.ErrBadLocalAddress + } + if commit != nil { + if err := commit(); err != nil { + return err + } + } + + // Save the bound address. + e.path = string(addr.Addr) + return nil +} + +// Readiness returns the current readiness of the endpoint. For example, if +// waiter.EventIn is set, the endpoint is immediately readable. 
+func (e *connectionlessEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + e.Lock() + defer e.Unlock() + + ready := waiter.EventMask(0) + if mask&waiter.EventIn != 0 && e.receiver.Readable() { + ready |= waiter.EventIn + } + + if e.Connected() { + if mask&waiter.EventOut != 0 && e.connected.Writable() { + ready |= waiter.EventOut + } + } + + return ready +} diff --git a/pkg/sentry/socket/unix/transport/queue/BUILD b/pkg/sentry/socket/unix/transport/queue/BUILD new file mode 100644 index 000000000..d914ecc23 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/queue/BUILD @@ -0,0 +1,15 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "queue", + srcs = ["queue.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport/queue", + visibility = ["//:sandbox"], + deps = [ + "//pkg/ilist", + "//pkg/tcpip", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/socket/unix/transport/queue/queue.go b/pkg/sentry/socket/unix/transport/queue/queue.go new file mode 100644 index 000000000..b3d2ea68b --- /dev/null +++ b/pkg/sentry/socket/unix/transport/queue/queue.go @@ -0,0 +1,227 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package queue provides the implementation of buffer queue +// and interface of queue entry with Length method. 
+package queue + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Entry implements Linker interface and has additional required methods. +type Entry interface { + ilist.Linker + + // Length returns the number of bytes stored in the entry. + Length() int64 + + // Release releases any resources held by the entry. + Release() + + // Peek returns a copy of the entry. It must be Released separately. + Peek() Entry + + // Truncate reduces the number of bytes stored in the entry to n bytes. + // + // Preconditions: n <= Length(). + Truncate(n int64) +} + +// Queue is a buffer queue. +// +// +stateify savable +type Queue struct { + ReaderQueue *waiter.Queue + WriterQueue *waiter.Queue + + mu sync.Mutex `state:"nosave"` + closed bool + used int64 + limit int64 + dataList ilist.List +} + +// New allocates and initializes a new queue. +func New(ReaderQueue *waiter.Queue, WriterQueue *waiter.Queue, limit int64) *Queue { + return &Queue{ReaderQueue: ReaderQueue, WriterQueue: WriterQueue, limit: limit} +} + +// Close closes q for reading and writing. It is immediately not writable and +// will become unreadable when no more data is pending. +// +// Both the read and write queues must be notified after closing: +// q.ReaderQueue.Notify(waiter.EventIn) +// q.WriterQueue.Notify(waiter.EventOut) +func (q *Queue) Close() { + q.mu.Lock() + q.closed = true + q.mu.Unlock() +} + +// Reset empties the queue and Releases all of the Entries. +// +// Both the read and write queues must be notified after resetting: +// q.ReaderQueue.Notify(waiter.EventIn) +// q.WriterQueue.Notify(waiter.EventOut) +func (q *Queue) Reset() { + q.mu.Lock() + for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { + cur.(Entry).Release() + } + q.dataList.Reset() + q.used = 0 + q.mu.Unlock() +} + +// IsReadable determines if q is currently readable. 
+func (q *Queue) IsReadable() bool {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	return q.closed || q.dataList.Front() != nil
+}
+
+// bufWritable returns true if there is space for writing.
+//
+// N.B. Linux only considers a unix socket "writable" if >75% of the buffer is
+// free.
+//
+// See net/unix/af_unix.c:unix_writeable.
+func (q *Queue) bufWritable() bool {
+	return 4*q.used < q.limit
+}
+
+// IsWritable determines if q is currently writable.
+func (q *Queue) IsWritable() bool {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	return q.closed || q.bufWritable()
+}
+
+// Enqueue adds an entry to the data queue if room is available.
+//
+// If truncate is true, Enqueue may truncate the message before enqueuing it.
+// Otherwise, the entire message must fit. If n < e.Length(), err indicates why.
+//
+// If notify is true, ReaderQueue.Notify must be called:
+// q.ReaderQueue.Notify(waiter.EventIn)
+func (q *Queue) Enqueue(e Entry, truncate bool) (l int64, notify bool, err *tcpip.Error) {
+	q.mu.Lock()
+
+	if q.closed {
+		q.mu.Unlock()
+		return 0, false, tcpip.ErrClosedForSend
+	}
+
+	free := q.limit - q.used
+
+	l = e.Length()
+
+	if l > free && truncate {
+		if free == 0 {
+			// Message can't fit right now.
+			q.mu.Unlock()
+			return 0, false, tcpip.ErrWouldBlock
+		}
+
+		e.Truncate(free)
+		l = e.Length()
+		err = tcpip.ErrWouldBlock
+	}
+
+	if l > q.limit {
+		// Message is too big to ever fit.
+		q.mu.Unlock()
+		return 0, false, tcpip.ErrMessageTooLong
+	}
+
+	if l > free {
+		// Message can't fit right now.
+		q.mu.Unlock()
+		return 0, false, tcpip.ErrWouldBlock
+	}
+
+	notify = q.dataList.Front() == nil
+	q.used += l
+	q.dataList.PushBack(e)
+
+	q.mu.Unlock()
+
+	return l, notify, err
+}
+
+// Dequeue removes the first entry in the data queue, if one exists.
+// +// If notify is true, WriterQueue.Notify must be called: +// q.WriterQueue.Notify(waiter.EventOut) +func (q *Queue) Dequeue() (e Entry, notify bool, err *tcpip.Error) { + q.mu.Lock() + + if q.dataList.Front() == nil { + err := tcpip.ErrWouldBlock + if q.closed { + err = tcpip.ErrClosedForReceive + } + q.mu.Unlock() + + return nil, false, err + } + + notify = !q.bufWritable() + + e = q.dataList.Front().(Entry) + q.dataList.Remove(e) + q.used -= e.Length() + + notify = notify && q.bufWritable() + + q.mu.Unlock() + + return e, notify, nil +} + +// Peek returns the first entry in the data queue, if one exists. +func (q *Queue) Peek() (Entry, *tcpip.Error) { + q.mu.Lock() + defer q.mu.Unlock() + + if q.dataList.Front() == nil { + err := tcpip.ErrWouldBlock + if q.closed { + err = tcpip.ErrClosedForReceive + } + return nil, err + } + + return q.dataList.Front().(Entry).Peek(), nil +} + +// QueuedSize returns the number of bytes currently in the queue, that is, the +// number of readable bytes. +func (q *Queue) QueuedSize() int64 { + q.mu.Lock() + defer q.mu.Unlock() + return q.used +} + +// MaxQueueSize returns the maximum number of bytes storable in the queue. +func (q *Queue) MaxQueueSize() int64 { + return q.limit +} diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go new file mode 100644 index 000000000..577aa87d5 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -0,0 +1,953 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package transport contains the implementation of Unix endpoints. +package transport + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport/queue" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// initialLimit is the starting limit for the socket buffers. +const initialLimit = 16 * 1024 + +// A SockType is a type (as opposed to family) of sockets. These are enumerated +// in the syscall package as syscall.SOCK_* constants. +type SockType int + +const ( + // SockStream corresponds to syscall.SOCK_STREAM. + SockStream SockType = 1 + // SockDgram corresponds to syscall.SOCK_DGRAM. + SockDgram SockType = 2 + // SockRaw corresponds to syscall.SOCK_RAW. + SockRaw SockType = 3 + // SockSeqpacket corresponds to syscall.SOCK_SEQPACKET. + SockSeqpacket SockType = 5 +) + +// A RightsControlMessage is a control message containing FDs. +type RightsControlMessage interface { + // Clone returns a copy of the RightsControlMessage. + Clone() RightsControlMessage + + // Release releases any resources owned by the RightsControlMessage. + Release() +} + +// A CredentialsControlMessage is a control message containing Unix credentials. +type CredentialsControlMessage interface { + // Equals returns true iff the two messages are equal. + Equals(CredentialsControlMessage) bool +} + +// A ControlMessages represents a collection of socket control messages. +// +// +stateify savable +type ControlMessages struct { + // Rights is a control message containing FDs. + Rights RightsControlMessage + + // Credentials is a control message containing Unix credentials. 
+ Credentials CredentialsControlMessage +} + +// Empty returns true iff the ControlMessages does not contain either +// credentials or rights. +func (c *ControlMessages) Empty() bool { + return c.Rights == nil && c.Credentials == nil +} + +// Clone clones both the credentials and the rights. +func (c *ControlMessages) Clone() ControlMessages { + cm := ControlMessages{} + if c.Rights != nil { + cm.Rights = c.Rights.Clone() + } + cm.Credentials = c.Credentials + return cm +} + +// Release releases both the credentials and the rights. +func (c *ControlMessages) Release() { + if c.Rights != nil { + c.Rights.Release() + } + *c = ControlMessages{} +} + +// Endpoint is the interface implemented by Unix transport protocol +// implementations that expose functionality like sendmsg, recvmsg, connect, +// etc. to Unix socket implementations. +type Endpoint interface { + Credentialer + waiter.Waitable + + // Close puts the endpoint in a closed state and frees all resources + // associated with it. + Close() + + // RecvMsg reads data and a control message from the endpoint. This method + // does not block if there is no data pending. + // + // creds indicates if credential control messages are requested by the + // caller. This is useful for determining if control messages can be + // coalesced. creds is a hint and can be safely ignored by the + // implementation if no coalescing is possible. It is fine to return + // credential control messages when none were requested or to not return + // credential control messages when they were requested. + // + // numRights is the number of SCM_RIGHTS FDs requested by the caller. This + // is useful if one must allocate a buffer to receive a SCM_RIGHTS message + // or determine if control messages can be coalesced. numRights is a hint + // and can be safely ignored by the implementation if the number of + // available SCM_RIGHTS FDs is known and no coalescing is possible. 
It is + // fine for the returned number of SCM_RIGHTS FDs to be either higher or + // lower than the requested number. + // + // If peek is true, no data should be consumed from the Endpoint. Any and + // all data returned from a peek should be available in the next call to + // RecvMsg. + // + // recvLen is the number of bytes copied into data. + // + // msgLen is the length of the read message consumed for datagram Endpoints. + // msgLen is always the same as recvLen for stream Endpoints. + RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen uintptr, cm ControlMessages, err *tcpip.Error) + + // SendMsg writes data and a control message to the endpoint's peer. + // This method does not block if the data cannot be written. + // + // SendMsg does not take ownership of any of its arguments on error. + SendMsg([][]byte, ControlMessages, BoundEndpoint) (uintptr, *tcpip.Error) + + // Connect connects this endpoint directly to another. + // + // This should be called on the client endpoint, and the (bound) + // endpoint passed in as a parameter. + // + // The error codes are the same as Connect. + Connect(server BoundEndpoint) *tcpip.Error + + // Shutdown closes the read and/or write end of the endpoint connection + // to its peer. + Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error + + // Listen puts the endpoint in "listen" mode, which allows it to accept + // new connections. + Listen(backlog int) *tcpip.Error + + // Accept returns a new endpoint if a peer has established a connection + // to an endpoint previously set to listen mode. This method does not + // block if no new connections are available. + // + // The returned Queue is the wait queue for the newly created endpoint. + Accept() (Endpoint, *tcpip.Error) + + // Bind binds the endpoint to a specific local address and port. + // Specifying a NIC is optional. 
+ // + // An optional commit function will be executed atomically with respect + // to binding the endpoint. If this returns an error, the bind will not + // occur and the error will be propagated back to the caller. + Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error + + // Type return the socket type, typically either SockStream, SockDgram + // or SockSeqpacket. + Type() SockType + + // GetLocalAddress returns the address to which the endpoint is bound. + GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) + + // GetRemoteAddress returns the address to which the endpoint is + // connected. + GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) + + // SetSockOpt sets a socket option. opt should be one of the tcpip.*Option + // types. + SetSockOpt(opt interface{}) *tcpip.Error + + // GetSockOpt gets a socket option. opt should be a pointer to one of the + // tcpip.*Option types. + GetSockOpt(opt interface{}) *tcpip.Error +} + +// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket +// option. +type Credentialer interface { + // Passcred returns whether or not the SO_PASSCRED socket option is + // enabled on this end. + Passcred() bool + + // ConnectedPasscred returns whether or not the SO_PASSCRED socket option + // is enabled on the connected end. + ConnectedPasscred() bool +} + +// A BoundEndpoint is a unix endpoint that can be connected to. +type BoundEndpoint interface { + // BidirectionalConnect establishes a bi-directional connection between two + // unix endpoints in an all-or-nothing manner. If an error occurs during + // connecting, the state of neither endpoint should be modified. 
+ // + // In order for an endpoint to establish such a bidirectional connection + // with a BoundEndpoint, the endpoint calls the BidirectionalConnect method + // on the BoundEndpoint and sends a representation of itself (the + // ConnectingEndpoint) and a callback (returnConnect) to receive the + // connection information (Receiver and ConnectedEndpoint) upon a + // successful connect. The callback should only be called on a successful + // connect. + // + // For a connection attempt to be successful, the ConnectingEndpoint must + // be unconnected and not listening and the BoundEndpoint whose + // BidirectionalConnect method is being called must be listening. + // + // This method will return tcpip.ErrConnectionRefused on endpoints with a + // type that isn't SockStream or SockSeqpacket. + BidirectionalConnect(ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error + + // UnidirectionalConnect establishes a write-only connection to a unix + // endpoint. + // + // An endpoint which calls UnidirectionalConnect and supports it itself must + // not hold its own lock when calling UnidirectionalConnect. + // + // This method will return tcpip.ErrConnectionRefused on a non-SockDgram + // endpoint. + UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) + + // Release releases any resources held by the BoundEndpoint. It must be + // called before dropping all references to a BoundEndpoint returned by a + // function. + Release() +} + +// message represents a message passed over a Unix domain socket. +// +// +stateify savable +type message struct { + ilist.Entry + + // Data is the Message payload. + Data buffer.View + + // Control is auxiliary control message data that goes along with the + // data. + Control ControlMessages + + // Address is the bound address of the endpoint that sent the message. + // + // If the endpoint that sent the message is not bound, the Address is + // the empty string. 
+ Address tcpip.FullAddress +} + +// Length returns number of bytes stored in the message. +func (m *message) Length() int64 { + return int64(len(m.Data)) +} + +// Release releases any resources held by the message. +func (m *message) Release() { + m.Control.Release() +} + +// Peek returns a copy of the message. +func (m *message) Peek() queue.Entry { + return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} +} + +// Truncate reduces the length of the message payload to n bytes. +// +// Preconditions: n <= m.Length(). +func (m *message) Truncate(n int64) { + m.Data.CapLength(int(n)) +} + +// A Receiver can be used to receive Messages. +type Receiver interface { + // Recv receives a single message. This method does not block. + // + // See Endpoint.RecvMsg for documentation on shared arguments. + // + // notify indicates if RecvNotify should be called. + Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (recvLen, msgLen uintptr, cm ControlMessages, source tcpip.FullAddress, notify bool, err *tcpip.Error) + + // RecvNotify notifies the Receiver of a successful Recv. This must not be + // called while holding any endpoint locks. + RecvNotify() + + // CloseRecv prevents the receiving of additional Messages. + // + // After CloseRecv is called, CloseNotify must also be called. + CloseRecv() + + // CloseNotify notifies the Receiver of recv being closed. This must not be + // called while holding any endpoint locks. + CloseNotify() + + // Readable returns if messages should be attempted to be received. This + // includes when read has been shutdown. + Readable() bool + + // RecvQueuedSize returns the total amount of data currently receivable. + // RecvQueuedSize should return -1 if the operation isn't supported. + RecvQueuedSize() int64 + + // RecvMaxQueueSize returns maximum value for RecvQueuedSize. + // RecvMaxQueueSize should return -1 if the operation isn't supported. 
+	RecvMaxQueueSize() int64
+
+	// Release releases any resources owned by the Receiver. It should be
+	// called before dropping all references to a Receiver.
+	Release()
+}
+
+// queueReceiver implements Receiver for datagram sockets.
+//
+// +stateify savable
+type queueReceiver struct {
+	readQueue *queue.Queue
+}
+
+// Recv implements Receiver.Recv.
+func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) {
+	var m queue.Entry
+	var notify bool
+	var err *tcpip.Error
+	if peek {
+		m, err = q.readQueue.Peek()
+	} else {
+		m, notify, err = q.readQueue.Dequeue()
+	}
+	if err != nil {
+		return 0, 0, ControlMessages{}, tcpip.FullAddress{}, false, err
+	}
+	msg := m.(*message)
+	src := []byte(msg.Data)
+	var copied uintptr
+	for i := 0; i < len(data) && len(src) > 0; i++ {
+		n := copy(data[i], src)
+		copied += uintptr(n)
+		src = src[n:]
+	}
+	return copied, uintptr(len(msg.Data)), msg.Control, msg.Address, notify, nil
+}
+
+// RecvNotify implements Receiver.RecvNotify.
+func (q *queueReceiver) RecvNotify() {
+	q.readQueue.WriterQueue.Notify(waiter.EventOut)
+}
+
+// CloseNotify implements Receiver.CloseNotify.
+func (q *queueReceiver) CloseNotify() {
+	q.readQueue.ReaderQueue.Notify(waiter.EventIn)
+	q.readQueue.WriterQueue.Notify(waiter.EventOut)
+}
+
+// CloseRecv implements Receiver.CloseRecv.
+func (q *queueReceiver) CloseRecv() {
+	q.readQueue.Close()
+}
+
+// Readable implements Receiver.Readable.
+func (q *queueReceiver) Readable() bool {
+	return q.readQueue.IsReadable()
+}
+
+// RecvQueuedSize implements Receiver.RecvQueuedSize.
+func (q *queueReceiver) RecvQueuedSize() int64 {
+	return q.readQueue.QueuedSize()
+}
+
+// RecvMaxQueueSize implements Receiver.RecvMaxQueueSize.
+func (q *queueReceiver) RecvMaxQueueSize() int64 {
+	return q.readQueue.MaxQueueSize()
+}
+
+// Release implements Receiver.Release.
+func (*queueReceiver) Release() {} + +// streamQueueReceiver implements Receiver for stream sockets. +// +// +stateify savable +type streamQueueReceiver struct { + queueReceiver + + mu sync.Mutex `state:"nosave"` + buffer []byte + control ControlMessages + addr tcpip.FullAddress +} + +func vecCopy(data [][]byte, buf []byte) (uintptr, [][]byte, []byte) { + var copied uintptr + for len(data) > 0 && len(buf) > 0 { + n := copy(data[0], buf) + copied += uintptr(n) + buf = buf[n:] + data[0] = data[0][n:] + if len(data[0]) == 0 { + data = data[1:] + } + } + return copied, data, buf +} + +// Readable implements Receiver.Readable. +func (q *streamQueueReceiver) Readable() bool { + q.mu.Lock() + bl := len(q.buffer) + r := q.readQueue.IsReadable() + q.mu.Unlock() + // We're readable if we have data in our buffer or if the queue receiver is + // readable. + return bl > 0 || r +} + +// RecvQueuedSize implements Receiver.RecvQueuedSize. +func (q *streamQueueReceiver) RecvQueuedSize() int64 { + q.mu.Lock() + bl := len(q.buffer) + qs := q.readQueue.QueuedSize() + q.mu.Unlock() + return int64(bl) + qs +} + +// RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. +func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { + // The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest + // message we can buffer which is also the largest message we can receive. + return 2 * q.readQueue.MaxQueueSize() +} + +// Recv implements Receiver.Recv. +func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { + q.mu.Lock() + defer q.mu.Unlock() + + var notify bool + + // If we have no data in the endpoint, we need to get some. + if len(q.buffer) == 0 { + // Load the next message into a buffer, even if we are peeking. Peeking + // won't consume the message, so it will be still available to be read + // the next time Recv() is called. 
+ m, n, err := q.readQueue.Dequeue() + if err != nil { + return 0, 0, ControlMessages{}, tcpip.FullAddress{}, false, err + } + notify = n + msg := m.(*message) + q.buffer = []byte(msg.Data) + q.control = msg.Control + q.addr = msg.Address + } + + var copied uintptr + if peek { + // Don't consume control message if we are peeking. + c := q.control.Clone() + + // Don't consume data since we are peeking. + copied, data, _ = vecCopy(data, q.buffer) + + return copied, copied, c, q.addr, notify, nil + } + + // Consume data and control message since we are not peeking. + copied, data, q.buffer = vecCopy(data, q.buffer) + + // Save the original state of q.control. + c := q.control + + // Remove rights from q.control and leave behind just the creds. + q.control.Rights = nil + if !wantCreds { + c.Credentials = nil + } + + if c.Rights != nil && numRights == 0 { + c.Rights.Release() + c.Rights = nil + } + + haveRights := c.Rights != nil + + // If we have more capacity for data and haven't received any usable + // rights. + // + // Linux never coalesces rights control messages. + for !haveRights && len(data) > 0 { + // Get a message from the readQueue. + m, n, err := q.readQueue.Dequeue() + if err != nil { + // We already got some data, so ignore this error. This will + // manifest as a short read to the user, which is what Linux + // does. + break + } + notify = notify || n + msg := m.(*message) + q.buffer = []byte(msg.Data) + q.control = msg.Control + q.addr = msg.Address + + if wantCreds { + if (q.control.Credentials == nil) != (c.Credentials == nil) { + // One message has credentials, the other does not. + break + } + + if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) { + // Both messages have credentials, but they don't match. + break + } + } + + if numRights != 0 && c.Rights != nil && q.control.Rights != nil { + // Both messages have rights. 
+			break
+		}
+
+		var cpd uintptr
+		cpd, data, q.buffer = vecCopy(data, q.buffer)
+		copied += cpd
+
+		if cpd == 0 {
+			// data was actually full.
+			break
+		}
+
+		if q.control.Rights != nil {
+			// Consume rights.
+			if numRights == 0 {
+				q.control.Rights.Release()
+			} else {
+				c.Rights = q.control.Rights
+				haveRights = true
+			}
+			q.control.Rights = nil
+		}
+	}
+	return copied, copied, c, q.addr, notify, nil
+}
+
+// A ConnectedEndpoint is an Endpoint that can be used to send Messages.
+type ConnectedEndpoint interface {
+	// Passcred implements Endpoint.Passcred.
+	Passcred() bool
+
+	// GetLocalAddress implements Endpoint.GetLocalAddress.
+	GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
+
+	// Send sends a single message. This method does not block.
+	//
+	// notify indicates if SendNotify should be called.
+	//
+	// tcpip.ErrWouldBlock can be returned along with a partial write if
+	// the caller should block to send the rest of the data.
+	Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n uintptr, notify bool, err *tcpip.Error)
+
+	// SendNotify notifies the ConnectedEndpoint of a successful Send. This
+	// must not be called while holding any endpoint locks.
+	SendNotify()
+
+	// CloseSend prevents the sending of additional Messages.
+	//
+	// After CloseSend is called, CloseNotify must also be called.
+	CloseSend()
+
+	// CloseNotify notifies the ConnectedEndpoint of send being closed. This
+	// must not be called while holding any endpoint locks.
+	CloseNotify()
+
+	// Writable returns if messages should be attempted to be sent. This
+	// includes when write has been shutdown.
+	Writable() bool
+
+	// EventUpdate lets the ConnectedEndpoint know that event registrations
+	// have changed.
+	EventUpdate()
+
+	// SendQueuedSize returns the total amount of data currently queued for
+	// sending. SendQueuedSize should return -1 if the operation isn't
+	// supported.
+	SendQueuedSize() int64
+
+	// SendMaxQueueSize returns maximum value for SendQueuedSize.
+	// SendMaxQueueSize should return -1 if the operation isn't supported.
+	SendMaxQueueSize() int64
+
+	// Release releases any resources owned by the ConnectedEndpoint. It should
+	// be called before dropping all references to a ConnectedEndpoint.
+	Release()
+}
+
+// +stateify savable
+type connectedEndpoint struct {
+	// endpoint represents the subset of the Endpoint functionality needed by
+	// the connectedEndpoint. It is implemented by both connectionedEndpoint
+	// and connectionlessEndpoint and allows the use of types which don't
+	// fully implement Endpoint.
+	endpoint interface {
+		// Passcred implements Endpoint.Passcred.
+		Passcred() bool
+
+		// GetLocalAddress implements Endpoint.GetLocalAddress.
+		GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
+
+		// Type implements Endpoint.Type.
+		Type() SockType
+	}
+
+	writeQueue *queue.Queue
+}
+
+// Passcred implements ConnectedEndpoint.Passcred.
+func (e *connectedEndpoint) Passcred() bool {
+	return e.endpoint.Passcred()
+}
+
+// GetLocalAddress implements ConnectedEndpoint.GetLocalAddress.
+func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return e.endpoint.GetLocalAddress()
+}
+
+// Send implements ConnectedEndpoint.Send.
+func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) {
+	var l int64
+	for _, d := range data {
+		l += int64(len(d))
+	}
+
+	truncate := false
+	if e.endpoint.Type() == SockStream {
+		// Since stream sockets don't preserve message boundaries, we
+		// can write only as much of the message as fits in the queue.
+		truncate = true
+
+		// Discard empty stream packets. Since stream sockets don't
+		// preserve message boundaries, sending zero bytes is a no-op.
+		// In Linux, the receiver actually uses a zero-length receive
+		// as an indication that the stream was closed.
+ if l == 0 { + controlMessages.Release() + return 0, false, nil + } + } + + v := make([]byte, 0, l) + for _, d := range data { + v = append(v, d...) + } + + l, notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}, truncate) + return uintptr(l), notify, err +} + +// SendNotify implements ConnectedEndpoint.SendNotify. +func (e *connectedEndpoint) SendNotify() { + e.writeQueue.ReaderQueue.Notify(waiter.EventIn) +} + +// CloseNotify implements ConnectedEndpoint.CloseNotify. +func (e *connectedEndpoint) CloseNotify() { + e.writeQueue.ReaderQueue.Notify(waiter.EventIn) + e.writeQueue.WriterQueue.Notify(waiter.EventOut) +} + +// CloseSend implements ConnectedEndpoint.CloseSend. +func (e *connectedEndpoint) CloseSend() { + e.writeQueue.Close() +} + +// Writable implements ConnectedEndpoint.Writable. +func (e *connectedEndpoint) Writable() bool { + return e.writeQueue.IsWritable() +} + +// EventUpdate implements ConnectedEndpoint.EventUpdate. +func (*connectedEndpoint) EventUpdate() {} + +// SendQueuedSize implements ConnectedEndpoint.SendQueuedSize. +func (e *connectedEndpoint) SendQueuedSize() int64 { + return e.writeQueue.QueuedSize() +} + +// SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize. +func (e *connectedEndpoint) SendMaxQueueSize() int64 { + return e.writeQueue.MaxQueueSize() +} + +// Release implements ConnectedEndpoint.Release. +func (*connectedEndpoint) Release() {} + +// baseEndpoint is an embeddable unix endpoint base used in both the connected and connectionless +// unix domain socket Endpoint implementations. +// +// Not to be used on its own. +// +// +stateify savable +type baseEndpoint struct { + *waiter.Queue + + // passcred specifies whether SCM_CREDENTIALS socket control messages are + // enabled on this endpoint. Must be accessed atomically. + passcred int32 + + // Mutex protects the below fields. + sync.Mutex `state:"nosave"` + + // receiver allows Messages to be received. 
+ receiver Receiver + + // connected allows messages to be sent and state information about the + // connected endpoint to be read. + connected ConnectedEndpoint + + // path is not empty if the endpoint has been bound, + // or may be used if the endpoint is connected. + path string +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (e *baseEndpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { + e.Queue.EventRegister(we, mask) + e.Lock() + if e.connected != nil { + e.connected.EventUpdate() + } + e.Unlock() +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { + e.Queue.EventUnregister(we) + e.Lock() + if e.connected != nil { + e.connected.EventUpdate() + } + e.Unlock() +} + +// Passcred implements Credentialer.Passcred. +func (e *baseEndpoint) Passcred() bool { + return atomic.LoadInt32(&e.passcred) != 0 +} + +// ConnectedPasscred implements Credentialer.ConnectedPasscred. +func (e *baseEndpoint) ConnectedPasscred() bool { + e.Lock() + defer e.Unlock() + return e.connected != nil && e.connected.Passcred() +} + +func (e *baseEndpoint) setPasscred(pc bool) { + if pc { + atomic.StoreInt32(&e.passcred, 1) + } else { + atomic.StoreInt32(&e.passcred, 0) + } +} + +// Connected implements ConnectingEndpoint.Connected. +func (e *baseEndpoint) Connected() bool { + return e.receiver != nil && e.connected != nil +} + +// RecvMsg reads data and a control message from the endpoint. 
+func (e *baseEndpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, ControlMessages, *tcpip.Error) { + e.Lock() + + if e.receiver == nil { + e.Unlock() + return 0, 0, ControlMessages{}, tcpip.ErrNotConnected + } + + recvLen, msgLen, cms, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) + e.Unlock() + if err != nil { + return 0, 0, ControlMessages{}, err + } + + if notify { + e.receiver.RecvNotify() + } + + if addr != nil { + *addr = a + } + return recvLen, msgLen, cms, nil +} + +// SendMsg writes data and a control message to the endpoint's peer. +// This method does not block if the data cannot be written. +func (e *baseEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) { + e.Lock() + if !e.Connected() { + e.Unlock() + return 0, tcpip.ErrNotConnected + } + if to != nil { + e.Unlock() + return 0, tcpip.ErrAlreadyConnected + } + + n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + e.Unlock() + + if notify { + e.connected.SendNotify() + } + + return n, err +} + +// SetSockOpt sets a socket option. Currently not supported. +func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error { + switch v := opt.(type) { + case tcpip.PasscredOption: + e.setPasscred(v != 0) + return nil + } + return nil +} + +// GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
+func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + return nil + case *tcpip.SendQueueSizeOption: + e.Lock() + if !e.Connected() { + e.Unlock() + return tcpip.ErrNotConnected + } + qs := tcpip.SendQueueSizeOption(e.connected.SendQueuedSize()) + e.Unlock() + if qs < 0 { + return tcpip.ErrQueueSizeNotSupported + } + *o = qs + return nil + case *tcpip.ReceiveQueueSizeOption: + e.Lock() + if !e.Connected() { + e.Unlock() + return tcpip.ErrNotConnected + } + qs := tcpip.ReceiveQueueSizeOption(e.receiver.RecvQueuedSize()) + e.Unlock() + if qs < 0 { + return tcpip.ErrQueueSizeNotSupported + } + *o = qs + return nil + case *tcpip.PasscredOption: + if e.Passcred() { + *o = tcpip.PasscredOption(1) + } else { + *o = tcpip.PasscredOption(0) + } + return nil + case *tcpip.SendBufferSizeOption: + e.Lock() + if !e.Connected() { + e.Unlock() + return tcpip.ErrNotConnected + } + qs := tcpip.SendBufferSizeOption(e.connected.SendMaxQueueSize()) + e.Unlock() + if qs < 0 { + return tcpip.ErrQueueSizeNotSupported + } + *o = qs + return nil + case *tcpip.ReceiveBufferSizeOption: + e.Lock() + if e.receiver == nil { + e.Unlock() + return tcpip.ErrNotConnected + } + qs := tcpip.ReceiveBufferSizeOption(e.receiver.RecvMaxQueueSize()) + e.Unlock() + if qs < 0 { + return tcpip.ErrQueueSizeNotSupported + } + *o = qs + return nil + } + return tcpip.ErrUnknownProtocolOption +} + +// Shutdown closes the read and/or write end of the endpoint connection to its +// peer. 
+func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + e.Lock() + if !e.Connected() { + e.Unlock() + return tcpip.ErrNotConnected + } + + if flags&tcpip.ShutdownRead != 0 { + e.receiver.CloseRecv() + } + + if flags&tcpip.ShutdownWrite != 0 { + e.connected.CloseSend() + } + + e.Unlock() + + if flags&tcpip.ShutdownRead != 0 { + e.receiver.CloseNotify() + } + + if flags&tcpip.ShutdownWrite != 0 { + e.connected.CloseNotify() + } + + return nil +} + +// GetLocalAddress returns the bound path. +func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + e.Lock() + defer e.Unlock() + return tcpip.FullAddress{Addr: tcpip.Address(e.path)}, nil +} + +// GetRemoteAddress returns the local address of the connected endpoint (if +// available). +func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + e.Lock() + c := e.connected + e.Unlock() + if c != nil { + return c.GetLocalAddress() + } + return tcpip.FullAddress{}, tcpip.ErrNotConnected +} + +// Release implements BoundEndpoint.Release. +func (*baseEndpoint) Release() {} diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index e30378e60..668363864 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -32,16 +32,16 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// SocketOperations is a Unix socket. It is similar to an epsocket, except it is backed -// by a unix.Endpoint instead of a tcpip.Endpoint. 
+// SocketOperations is a Unix socket. It is similar to an epsocket, except it +// is backed by a transport.Endpoint instead of a tcpip.Endpoint. // // +stateify savable type SocketOperations struct { @@ -52,18 +52,18 @@ type SocketOperations struct { fsutil.NoFsync `state:"nosave"` fsutil.NoopFlush `state:"nosave"` fsutil.NoMMap `state:"nosave"` - ep unix.Endpoint + ep transport.Endpoint } // New creates a new unix socket. -func New(ctx context.Context, endpoint unix.Endpoint) *fs.File { +func New(ctx context.Context, endpoint transport.Endpoint) *fs.File { dirent := socket.NewDirent(ctx, unixSocketDevice) defer dirent.DecRef() return NewWithDirent(ctx, dirent, endpoint, fs.FileFlags{Read: true, Write: true}) } // NewWithDirent creates a new unix socket using an existing dirent. -func NewWithDirent(ctx context.Context, d *fs.Dirent, ep unix.Endpoint, flags fs.FileFlags) *fs.File { +func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, flags fs.FileFlags) *fs.File { return fs.NewFile(ctx, d, flags, &SocketOperations{ ep: ep, }) @@ -83,8 +83,8 @@ func (s *SocketOperations) Release() { s.DecRef() } -// Endpoint extracts the unix.Endpoint. -func (s *SocketOperations) Endpoint() unix.Endpoint { +// Endpoint extracts the transport.Endpoint. +func (s *SocketOperations) Endpoint() transport.Endpoint { return s.ep } @@ -110,7 +110,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) { } // GetPeerName implements the linux syscall getpeername(2) for sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { addr, err := s.ep.GetRemoteAddress() if err != nil { @@ -122,7 +122,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *sy } // GetSockName implements the linux syscall getsockname(2) for sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. 
func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { addr, err := s.ep.GetLocalAddress() if err != nil { @@ -139,20 +139,20 @@ func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.S } // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) { return epsocket.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) } // Listen implements the linux syscall listen(2) for sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { return syserr.TranslateNetstackError(s.ep.Listen(backlog)) } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *SocketOperations) blockingAccept(t *kernel.Task) (unix.Endpoint, *syserr.Error) { +func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -172,7 +172,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (unix.Endpoint, *syser } // Accept implements the linux syscall accept(2) for sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. ep, err := s.ep.Accept() @@ -226,7 +226,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { return e } - bep, ok := s.ep.(unix.BoundEndpoint) + bep, ok := s.ep.(transport.BoundEndpoint) if !ok { // This socket can't be bound. 
return syserr.ErrInvalidArgument @@ -287,10 +287,10 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { })) } -// extractEndpoint retrieves the unix.BoundEndpoint associated with a Unix -// socket path. The Release must be called on the unix.BoundEndpoint when the -// caller is done with it. -func extractEndpoint(t *kernel.Task, sockaddr []byte) (unix.BoundEndpoint, *syserr.Error) { +// extractEndpoint retrieves the transport.BoundEndpoint associated with a Unix +// socket path. The Release must be called on the transport.BoundEndpoint when +// the caller is done with it. +func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, *syserr.Error) { path, err := extractPath(sockaddr) if err != nil { return nil, err @@ -362,7 +362,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Endpoint: s.ep, @@ -408,12 +408,12 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return int(total), syserr.FromError(err) } -// Passcred implements unix.Credentialer.Passcred. +// Passcred implements transport.Credentialer.Passcred. func (s *SocketOperations) Passcred() bool { return s.ep.Passcred() } -// ConnectedPasscred implements unix.Credentialer.ConnectedPasscred. +// ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. func (s *SocketOperations) ConnectedPasscred() bool { return s.ep.ConnectedPasscred() } @@ -434,13 +434,13 @@ func (s *SocketOperations) EventUnregister(e *waiter.Entry) { } // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. 
func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { return epsocket.SetSockOpt(t, s, s.ep, level, name, optVal) } // Shutdown implements the linux syscall shutdown(2) for sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := epsocket.ConvertShutdown(how) if err != nil { @@ -465,7 +465,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by -// a unix.Endpoint. +// a transport.Endpoint. func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 @@ -539,19 +539,19 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags type provider struct{} // Socket returns a new unix domain socket. -func (*provider) Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *syserr.Error) { +func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { // Check arguments. if protocol != 0 { return nil, syserr.ErrInvalidArgument } // Create the endpoint and socket. - var ep unix.Endpoint + var ep transport.Endpoint switch stype { case linux.SOCK_DGRAM: - ep = unix.NewConnectionless() + ep = transport.NewConnectionless() case linux.SOCK_STREAM, linux.SOCK_SEQPACKET: - ep = unix.NewConnectioned(stype, t.Kernel()) + ep = transport.NewConnectioned(stype, t.Kernel()) default: return nil, syserr.ErrInvalidArgument } @@ -560,7 +560,7 @@ func (*provider) Socket(t *kernel.Task, stype unix.SockType, protocol int) (*fs. 
} // Pair creates a new pair of AF_UNIX connected sockets. -func (*provider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (*provider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Check arguments. if protocol != 0 { return nil, nil, syserr.ErrInvalidArgument @@ -573,7 +573,7 @@ func (*provider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.Fi } // Create the endpoints and sockets. - ep1, ep2 := unix.NewPair(stype, t.Kernel()) + ep1, ep2 := transport.NewPair(stype, t.Kernel()) s1 := New(t, ep1) s2 := New(t, ep2) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index bbdfad9da..7621bfdbd 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -79,11 +79,11 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", - "//pkg/tcpip/transport/unix", "//pkg/waiter", ], ) diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 867fec468..5fa5ddce6 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -27,9 +27,9 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" ) // minListenBacklog is the minimum reasonable backlog for listening sockets. @@ -180,7 +180,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // Create the new socket. 
- s, e := socket.New(t, domain, unix.SockType(stype&0xf), protocol) + s, e := socket.New(t, domain, transport.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } @@ -219,7 +219,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Create the socket pair. - s1, s2, e := socket.Pair(t, domain, unix.SockType(stype&0xf), protocol) + s1, s2, e := socket.Pair(t, domain, transport.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } @@ -750,7 +750,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i controlData := make([]byte, 0, msg.ControlLen) - if cr, ok := s.(unix.Credentialer); ok && cr.Passcred() { + if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { creds, _ := cms.Unix.Credentials.(control.SCMCredentials) controlData = control.PackCredentials(t, creds, controlData) } diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index ff50b9925..68b82af47 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -9,6 +9,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", - "//pkg/tcpip/transport/unix", + "//pkg/sentry/socket/unix/transport", ], ) diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go index 541e0611d..e48fabc2d 100644 --- a/pkg/sentry/uniqueid/context.go +++ b/pkg/sentry/uniqueid/context.go @@ -18,7 +18,7 @@ package uniqueid import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" ) // contextID is the kernel package's type for context.Context.Value keys. @@ -44,8 +44,8 @@ func GlobalFromContext(ctx context.Context) uint64 { } // GlobalProviderFromContext returns a system-wide unique identifier from ctx. 
-func GlobalProviderFromContext(ctx context.Context) unix.UniqueIDProvider { - return ctx.Value(CtxGlobalUniqueIDProvider).(unix.UniqueIDProvider) +func GlobalProviderFromContext(ctx context.Context) transport.UniqueIDProvider { + return ctx.Value(CtxGlobalUniqueIDProvider).(transport.UniqueIDProvider) } // InotifyCookie generates a unique inotify event cookie from ctx. diff --git a/pkg/tcpip/transport/queue/BUILD b/pkg/tcpip/transport/queue/BUILD deleted file mode 100644 index 6dcec312e..000000000 --- a/pkg/tcpip/transport/queue/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -package(licenses = ["notice"]) # Apache 2.0 - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "queue", - srcs = ["queue.go"], - importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/queue", - visibility = ["//:sandbox"], - deps = [ - "//pkg/ilist", - "//pkg/tcpip", - "//pkg/waiter", - ], -) diff --git a/pkg/tcpip/transport/queue/queue.go b/pkg/tcpip/transport/queue/queue.go deleted file mode 100644 index b3d2ea68b..000000000 --- a/pkg/tcpip/transport/queue/queue.go +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package queue provides the implementation of buffer queue -// and interface of queue entry with Length method. 
-package queue - -import ( - "sync" - - "gvisor.googlesource.com/gvisor/pkg/ilist" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// Entry implements Linker interface and has additional required methods. -type Entry interface { - ilist.Linker - - // Length returns the number of bytes stored in the entry. - Length() int64 - - // Release releases any resources held by the entry. - Release() - - // Peek returns a copy of the entry. It must be Released separately. - Peek() Entry - - // Truncate reduces the number of bytes stored in the entry to n bytes. - // - // Preconditions: n <= Length(). - Truncate(n int64) -} - -// Queue is a buffer queue. -// -// +stateify savable -type Queue struct { - ReaderQueue *waiter.Queue - WriterQueue *waiter.Queue - - mu sync.Mutex `state:"nosave"` - closed bool - used int64 - limit int64 - dataList ilist.List -} - -// New allocates and initializes a new queue. -func New(ReaderQueue *waiter.Queue, WriterQueue *waiter.Queue, limit int64) *Queue { - return &Queue{ReaderQueue: ReaderQueue, WriterQueue: WriterQueue, limit: limit} -} - -// Close closes q for reading and writing. It is immediately not writable and -// will become unreadable when no more data is pending. -// -// Both the read and write queues must be notified after closing: -// q.ReaderQueue.Notify(waiter.EventIn) -// q.WriterQueue.Notify(waiter.EventOut) -func (q *Queue) Close() { - q.mu.Lock() - q.closed = true - q.mu.Unlock() -} - -// Reset empties the queue and Releases all of the Entries. -// -// Both the read and write queues must be notified after resetting: -// q.ReaderQueue.Notify(waiter.EventIn) -// q.WriterQueue.Notify(waiter.EventOut) -func (q *Queue) Reset() { - q.mu.Lock() - for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { - cur.(Entry).Release() - } - q.dataList.Reset() - q.used = 0 - q.mu.Unlock() -} - -// IsReadable determines if q is currently readable. 
-func (q *Queue) IsReadable() bool { - q.mu.Lock() - defer q.mu.Unlock() - - return q.closed || q.dataList.Front() != nil -} - -// bufWritable returns true if there is space for writing. -// -// N.B. Linux only considers a unix socket "writable" if >75% of the buffer is -// free. -// -// See net/unix/af_unix.c:unix_writeable. -func (q *Queue) bufWritable() bool { - return 4*q.used < q.limit -} - -// IsWritable determines if q is currently writable. -func (q *Queue) IsWritable() bool { - q.mu.Lock() - defer q.mu.Unlock() - - return q.closed || q.bufWritable() -} - -// Enqueue adds an entry to the data queue if room is available. -// -// If truncate is true, Enqueue may truncate the message beforing enqueuing it. -// Otherwise, the entire message must fit. If n < e.Length(), err indicates why. -// -// If notify is true, ReaderQueue.Notify must be called: -// q.ReaderQueue.Notify(waiter.EventIn) -func (q *Queue) Enqueue(e Entry, truncate bool) (l int64, notify bool, err *tcpip.Error) { - q.mu.Lock() - - if q.closed { - q.mu.Unlock() - return 0, false, tcpip.ErrClosedForSend - } - - free := q.limit - q.used - - l = e.Length() - - if l > free && truncate { - if free == 0 { - // Message can't fit right now. - q.mu.Unlock() - return 0, false, tcpip.ErrWouldBlock - } - - e.Truncate(free) - l = e.Length() - err = tcpip.ErrWouldBlock - } - - if l > q.limit { - // Message is too big to ever fit. - q.mu.Unlock() - return 0, false, tcpip.ErrMessageTooLong - } - - if l > free { - // Message can't fit right now. - q.mu.Unlock() - return 0, false, tcpip.ErrWouldBlock - } - - notify = q.dataList.Front() == nil - q.used += l - q.dataList.PushBack(e) - - q.mu.Unlock() - - return l, notify, err -} - -// Dequeue removes the first entry in the data queue, if one exists. 
-// -// If notify is true, WriterQueue.Notify must be called: -// q.WriterQueue.Notify(waiter.EventOut) -func (q *Queue) Dequeue() (e Entry, notify bool, err *tcpip.Error) { - q.mu.Lock() - - if q.dataList.Front() == nil { - err := tcpip.ErrWouldBlock - if q.closed { - err = tcpip.ErrClosedForReceive - } - q.mu.Unlock() - - return nil, false, err - } - - notify = !q.bufWritable() - - e = q.dataList.Front().(Entry) - q.dataList.Remove(e) - q.used -= e.Length() - - notify = notify && q.bufWritable() - - q.mu.Unlock() - - return e, notify, nil -} - -// Peek returns the first entry in the data queue, if one exists. -func (q *Queue) Peek() (Entry, *tcpip.Error) { - q.mu.Lock() - defer q.mu.Unlock() - - if q.dataList.Front() == nil { - err := tcpip.ErrWouldBlock - if q.closed { - err = tcpip.ErrClosedForReceive - } - return nil, err - } - - return q.dataList.Front().(Entry).Peek(), nil -} - -// QueuedSize returns the number of bytes currently in the queue, that is, the -// number of readable bytes. -func (q *Queue) QueuedSize() int64 { - q.mu.Lock() - defer q.mu.Unlock() - return q.used -} - -// MaxQueueSize returns the maximum number of bytes storable in the queue. 
-func (q *Queue) MaxQueueSize() int64 { - return q.limit -} diff --git a/pkg/tcpip/transport/unix/BUILD b/pkg/tcpip/transport/unix/BUILD deleted file mode 100644 index dae0bd079..000000000 --- a/pkg/tcpip/transport/unix/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -package(licenses = ["notice"]) # Apache 2.0 - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "unix", - srcs = [ - "connectioned.go", - "connectioned_state.go", - "connectionless.go", - "unix.go", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix", - visibility = ["//:sandbox"], - deps = [ - "//pkg/ilist", - "//pkg/tcpip", - "//pkg/tcpip/buffer", - "//pkg/tcpip/transport/queue", - "//pkg/waiter", - ], -) diff --git a/pkg/tcpip/transport/unix/connectioned.go b/pkg/tcpip/transport/unix/connectioned.go deleted file mode 100644 index e319b3bb8..000000000 --- a/pkg/tcpip/transport/unix/connectioned.go +++ /dev/null @@ -1,454 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package unix - -import ( - "sync" - - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/queue" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// UniqueIDProvider generates a sequence of unique identifiers useful for, -// among other things, lock ordering. -type UniqueIDProvider interface { - // UniqueID returns a new unique identifier. 
- UniqueID() uint64 -} - -// A ConnectingEndpoint is a connectioned unix endpoint that is attempting to -// establish a bidirectional connection with a BoundEndpoint. -type ConnectingEndpoint interface { - // ID returns the endpoint's globally unique identifier. This identifier - // must be used to determine locking order if more than one endpoint is - // to be locked in the same codepath. The endpoint with the smaller - // identifier must be locked before endpoints with larger identifiers. - ID() uint64 - - // Passcred implements socket.Credentialer.Passcred. - Passcred() bool - - // Type returns the socket type, typically either SockStream or - // SockSeqpacket. The connection attempt must be aborted if this - // value doesn't match the ConnectableEndpoint's type. - Type() SockType - - // GetLocalAddress returns the bound path. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) - - // Locker protects the following methods. While locked, only the holder of - // the lock can change the return value of the protected methods. - sync.Locker - - // Connected returns true iff the ConnectingEndpoint is in the connected - // state. ConnectingEndpoints can only be connected to a single endpoint, - // so the connection attempt must be aborted if this returns true. - Connected() bool - - // Listening returns true iff the ConnectingEndpoint is in the listening - // state. ConnectingEndpoints cannot make connections while listening, so - // the connection attempt must be aborted if this returns true. - Listening() bool - - // WaiterQueue returns a pointer to the endpoint's waiter queue. - WaiterQueue() *waiter.Queue -} - -// connectionedEndpoint is a Unix-domain connected or connectable endpoint and implements -// ConnectingEndpoint, ConnectableEndpoint and tcpip.Endpoint. -// -// connectionedEndpoints must be in connected state in order to transfer data. 
-// -// This implementation includes STREAM and SEQPACKET Unix sockets created with -// socket(2), accept(2) or socketpair(2) and dgram unix sockets created with -// socketpair(2). See unix_connectionless.go for the implementation of DGRAM -// Unix sockets created with socket(2). -// -// The state is much simpler than a TCP endpoint, so it is not encoded -// explicitly. Instead we enforce the following invariants: -// -// receiver != nil, connected != nil => connected. -// path != "" && acceptedChan == nil => bound, not listening. -// path != "" && acceptedChan != nil => bound and listening. -// -// Only one of these will be true at any moment. -// -// +stateify savable -type connectionedEndpoint struct { - baseEndpoint - - // id is the unique endpoint identifier. This is used exclusively for - // lock ordering within connect. - id uint64 - - // idGenerator is used to generate new unique endpoint identifiers. - idGenerator UniqueIDProvider - - // stype is used by connecting sockets to ensure that they are the - // same type. The value is typically either tcpip.SockSeqpacket or - // tcpip.SockStream. - stype SockType - - // acceptedChan is per the TCP endpoint implementation. Note that the - // sockets in this channel are _already in the connected state_, and - // have another associated connectionedEndpoint. - // - // If nil, then no listen call has been made. - acceptedChan chan *connectionedEndpoint `state:".([]*connectionedEndpoint)"` -} - -// NewConnectioned creates a new unbound connectionedEndpoint. -func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint { - return &connectionedEndpoint{ - baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, - id: uid.UniqueID(), - idGenerator: uid, - stype: stype, - } -} - -// NewPair allocates a new pair of connected unix-domain connectionedEndpoints. 
-func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { - a := &connectionedEndpoint{ - baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, - id: uid.UniqueID(), - idGenerator: uid, - stype: stype, - } - b := &connectionedEndpoint{ - baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, - id: uid.UniqueID(), - idGenerator: uid, - stype: stype, - } - - q1 := queue.New(a.Queue, b.Queue, initialLimit) - q2 := queue.New(b.Queue, a.Queue, initialLimit) - - if stype == SockStream { - a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} - b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}} - } else { - a.receiver = &queueReceiver{q1} - b.receiver = &queueReceiver{q2} - } - - a.connected = &connectedEndpoint{ - endpoint: b, - writeQueue: q2, - } - b.connected = &connectedEndpoint{ - endpoint: a, - writeQueue: q1, - } - - return a, b -} - -// NewExternal creates a new externally backed Endpoint. It behaves like a -// socketpair. -func NewExternal(stype SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { - return &connectionedEndpoint{ - baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected}, - id: uid.UniqueID(), - idGenerator: uid, - stype: stype, - } -} - -// ID implements ConnectingEndpoint.ID. -func (e *connectionedEndpoint) ID() uint64 { - return e.id -} - -// Type implements ConnectingEndpoint.Type and Endpoint.Type. -func (e *connectionedEndpoint) Type() SockType { - return e.stype -} - -// WaiterQueue implements ConnectingEndpoint.WaiterQueue. -func (e *connectionedEndpoint) WaiterQueue() *waiter.Queue { - return e.Queue -} - -// isBound returns true iff the connectionedEndpoint is bound (but not -// listening). -func (e *connectionedEndpoint) isBound() bool { - return e.path != "" && e.acceptedChan == nil -} - -// Listening implements ConnectingEndpoint.Listening. 
-func (e *connectionedEndpoint) Listening() bool { - return e.acceptedChan != nil -} - -// Close puts the connectionedEndpoint in a closed state and frees all -// resources associated with it. -// -// The socket will be a fresh state after a call to close and may be reused. -// That is, close may be used to "unbind" or "disconnect" the socket in error -// paths. -func (e *connectionedEndpoint) Close() { - e.Lock() - var c ConnectedEndpoint - var r Receiver - switch { - case e.Connected(): - e.connected.CloseSend() - e.receiver.CloseRecv() - c = e.connected - r = e.receiver - e.connected = nil - e.receiver = nil - case e.isBound(): - e.path = "" - case e.Listening(): - close(e.acceptedChan) - for n := range e.acceptedChan { - n.Close() - } - e.acceptedChan = nil - e.path = "" - } - e.Unlock() - if c != nil { - c.CloseNotify() - c.Release() - } - if r != nil { - r.CloseNotify() - r.Release() - } -} - -// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. -func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error { - if ce.Type() != e.stype { - return tcpip.ErrConnectionRefused - } - - // Check if ce is e to avoid a deadlock. - if ce, ok := ce.(*connectionedEndpoint); ok && ce == e { - return tcpip.ErrInvalidEndpointState - } - - // Do a dance to safely acquire locks on both endpoints. - if e.id < ce.ID() { - e.Lock() - ce.Lock() - } else { - ce.Lock() - e.Lock() - } - - // Check connecting state. - if ce.Connected() { - e.Unlock() - ce.Unlock() - return tcpip.ErrAlreadyConnected - } - if ce.Listening() { - e.Unlock() - ce.Unlock() - return tcpip.ErrInvalidEndpointState - } - - // Check bound state. - if !e.Listening() { - e.Unlock() - ce.Unlock() - return tcpip.ErrConnectionRefused - } - - // Create a newly bound connectionedEndpoint. 
- ne := &connectionedEndpoint{ - baseEndpoint: baseEndpoint{ - path: e.path, - Queue: &waiter.Queue{}, - }, - id: e.idGenerator.UniqueID(), - idGenerator: e.idGenerator, - stype: e.stype, - } - readQueue := queue.New(ce.WaiterQueue(), ne.Queue, initialLimit) - writeQueue := queue.New(ne.Queue, ce.WaiterQueue(), initialLimit) - ne.connected = &connectedEndpoint{ - endpoint: ce, - writeQueue: readQueue, - } - if e.stype == SockStream { - ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} - } else { - ne.receiver = &queueReceiver{readQueue: writeQueue} - } - - select { - case e.acceptedChan <- ne: - // Commit state. - connected := &connectedEndpoint{ - endpoint: ne, - writeQueue: writeQueue, - } - if e.stype == SockStream { - returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected) - } else { - returnConnect(&queueReceiver{readQueue: readQueue}, connected) - } - - // Notify can deadlock if we are holding these locks. - e.Unlock() - ce.Unlock() - - // Notify on both ends. - e.Notify(waiter.EventIn) - ce.WaiterQueue().Notify(waiter.EventOut) - - return nil - default: - // Busy; return ECONNREFUSED per spec. - ne.Close() - e.Unlock() - ce.Unlock() - return tcpip.ErrConnectionRefused - } -} - -// UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. -func (e *connectionedEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) { - return nil, tcpip.ErrConnectionRefused -} - -// Connect attempts to directly connect to another Endpoint. -// Implements Endpoint.Connect. -func (e *connectionedEndpoint) Connect(server BoundEndpoint) *tcpip.Error { - returnConnect := func(r Receiver, ce ConnectedEndpoint) { - e.receiver = r - e.connected = ce - } - - return server.BidirectionalConnect(e, returnConnect) -} - -// Listen starts listening on the connection. 
-func (e *connectionedEndpoint) Listen(backlog int) *tcpip.Error { - e.Lock() - defer e.Unlock() - if e.Listening() { - // Adjust the size of the channel iff we can fix existing - // pending connections into the new one. - if len(e.acceptedChan) > backlog { - return tcpip.ErrInvalidEndpointState - } - origChan := e.acceptedChan - e.acceptedChan = make(chan *connectionedEndpoint, backlog) - close(origChan) - for ep := range origChan { - e.acceptedChan <- ep - } - return nil - } - if !e.isBound() { - return tcpip.ErrInvalidEndpointState - } - - // Normal case. - e.acceptedChan = make(chan *connectionedEndpoint, backlog) - return nil -} - -// Accept accepts a new connection. -func (e *connectionedEndpoint) Accept() (Endpoint, *tcpip.Error) { - e.Lock() - defer e.Unlock() - - if !e.Listening() { - return nil, tcpip.ErrInvalidEndpointState - } - - select { - case ne := <-e.acceptedChan: - return ne, nil - - default: - // Nothing left. - return nil, tcpip.ErrWouldBlock - } -} - -// Bind binds the connection. -// -// For Unix connectionedEndpoints, this _only sets the address associated with -// the socket_. Work associated with sockets in the filesystem or finding those -// sockets must be done by a higher level. -// -// Bind will fail only if the socket is connected, bound or the passed address -// is invalid (the empty string). -func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { - e.Lock() - defer e.Unlock() - if e.isBound() || e.Listening() { - return tcpip.ErrAlreadyBound - } - if addr.Addr == "" { - // The empty string is not permitted. - return tcpip.ErrBadLocalAddress - } - if commit != nil { - if err := commit(); err != nil { - return err - } - } - - // Save the bound address. - e.path = string(addr.Addr) - return nil -} - -// SendMsg writes data and a control message to the endpoint's peer. -// This method does not block if the data cannot be written. 
-func (e *connectionedEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) { - // Stream sockets do not support specifying the endpoint. Seqpacket - // sockets ignore the passed endpoint. - if e.stype == SockStream && to != nil { - return 0, tcpip.ErrNotSupported - } - return e.baseEndpoint.SendMsg(data, c, to) -} - -// Readiness returns the current readiness of the connectionedEndpoint. For -// example, if waiter.EventIn is set, the connectionedEndpoint is immediately -// readable. -func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { - e.Lock() - defer e.Unlock() - - ready := waiter.EventMask(0) - switch { - case e.Connected(): - if mask&waiter.EventIn != 0 && e.receiver.Readable() { - ready |= waiter.EventIn - } - if mask&waiter.EventOut != 0 && e.connected.Writable() { - ready |= waiter.EventOut - } - case e.Listening(): - if mask&waiter.EventIn != 0 && len(e.acceptedChan) > 0 { - ready |= waiter.EventIn - } - } - - return ready -} diff --git a/pkg/tcpip/transport/unix/connectioned_state.go b/pkg/tcpip/transport/unix/connectioned_state.go deleted file mode 100644 index 39e0ca2d6..000000000 --- a/pkg/tcpip/transport/unix/connectioned_state.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package unix - -// saveAcceptedChan is invoked by stateify. 
-func (e *connectionedEndpoint) saveAcceptedChan() []*connectionedEndpoint { - // If acceptedChan is nil (i.e. we are not listening) then we will save nil. - // Otherwise we create a (possibly empty) slice of the values in acceptedChan and - // save that. - var acceptedSlice []*connectionedEndpoint - if e.acceptedChan != nil { - // Swap out acceptedChan with a new empty channel of the same capacity. - saveChan := e.acceptedChan - e.acceptedChan = make(chan *connectionedEndpoint, cap(saveChan)) - - // Create a new slice with the same len and capacity as the channel. - acceptedSlice = make([]*connectionedEndpoint, len(saveChan), cap(saveChan)) - // Drain acceptedChan into saveSlice, and fill up the new acceptChan at the - // same time. - for i := range acceptedSlice { - ep := <-saveChan - acceptedSlice[i] = ep - e.acceptedChan <- ep - } - close(saveChan) - } - return acceptedSlice -} - -// loadAcceptedChan is invoked by stateify. -func (e *connectionedEndpoint) loadAcceptedChan(acceptedSlice []*connectionedEndpoint) { - // If acceptedSlice is nil, then acceptedChan should also be nil. - if acceptedSlice != nil { - // Otherwise, create a new channel with the same capacity as acceptedSlice. - e.acceptedChan = make(chan *connectionedEndpoint, cap(acceptedSlice)) - // Seed the channel with values from acceptedSlice. - for _, ep := range acceptedSlice { - e.acceptedChan <- ep - } - } -} diff --git a/pkg/tcpip/transport/unix/connectionless.go b/pkg/tcpip/transport/unix/connectionless.go deleted file mode 100644 index ae93c61d7..000000000 --- a/pkg/tcpip/transport/unix/connectionless.go +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package unix - -import ( - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/queue" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// connectionlessEndpoint is a unix endpoint for unix sockets that support operating in -// a conectionless fashon. -// -// Specifically, this means datagram unix sockets not created with -// socketpair(2). -// -// +stateify savable -type connectionlessEndpoint struct { - baseEndpoint -} - -// NewConnectionless creates a new unbound dgram endpoint. -func NewConnectionless() Endpoint { - ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} - ep.receiver = &queueReceiver{readQueue: queue.New(&waiter.Queue{}, ep.Queue, initialLimit)} - return ep -} - -// isBound returns true iff the endpoint is bound. -func (e *connectionlessEndpoint) isBound() bool { - return e.path != "" -} - -// Close puts the endpoint in a closed state and frees all resources associated -// with it. -// -// The socket will be a fresh state after a call to close and may be reused. -// That is, close may be used to "unbind" or "disconnect" the socket in error -// paths. -func (e *connectionlessEndpoint) Close() { - e.Lock() - var r Receiver - if e.Connected() { - e.receiver.CloseRecv() - r = e.receiver - e.receiver = nil - - e.connected.Release() - e.connected = nil - } - if e.isBound() { - e.path = "" - } - e.Unlock() - if r != nil { - r.CloseNotify() - r.Release() - } -} - -// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. 
-func (e *connectionlessEndpoint) BidirectionalConnect(ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error { - return tcpip.ErrConnectionRefused -} - -// UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. -func (e *connectionlessEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) { - e.Lock() - r := e.receiver - e.Unlock() - if r == nil { - return nil, tcpip.ErrConnectionRefused - } - return &connectedEndpoint{ - endpoint: e, - writeQueue: r.(*queueReceiver).readQueue, - }, nil -} - -// SendMsg writes data and a control message to the specified endpoint. -// This method does not block if the data cannot be written. -func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) { - if to == nil { - return e.baseEndpoint.SendMsg(data, c, nil) - } - - connected, err := to.UnidirectionalConnect() - if err != nil { - return 0, tcpip.ErrInvalidEndpointState - } - defer connected.Release() - - e.Lock() - n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) - e.Unlock() - - if notify { - connected.SendNotify() - } - - return n, err -} - -// Type implements Endpoint.Type. -func (e *connectionlessEndpoint) Type() SockType { - return SockDgram -} - -// Connect attempts to connect directly to server. -func (e *connectionlessEndpoint) Connect(server BoundEndpoint) *tcpip.Error { - connected, err := server.UnidirectionalConnect() - if err != nil { - return err - } - - e.Lock() - e.connected = connected - e.Unlock() - - return nil -} - -// Listen starts listening on the connection. -func (e *connectionlessEndpoint) Listen(int) *tcpip.Error { - return tcpip.ErrNotSupported -} - -// Accept accepts a new connection. -func (e *connectionlessEndpoint) Accept() (Endpoint, *tcpip.Error) { - return nil, tcpip.ErrNotSupported -} - -// Bind binds the connection. 
-// -// For Unix endpoints, this _only sets the address associated with the socket_. -// Work associated with sockets in the filesystem or finding those sockets must -// be done by a higher level. -// -// Bind will fail only if the socket is connected, bound or the passed address -// is invalid (the empty string). -func (e *connectionlessEndpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { - e.Lock() - defer e.Unlock() - if e.isBound() { - return tcpip.ErrAlreadyBound - } - if addr.Addr == "" { - // The empty string is not permitted. - return tcpip.ErrBadLocalAddress - } - if commit != nil { - if err := commit(); err != nil { - return err - } - } - - // Save the bound address. - e.path = string(addr.Addr) - return nil -} - -// Readiness returns the current readiness of the endpoint. For example, if -// waiter.EventIn is set, the endpoint is immediately readable. -func (e *connectionlessEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { - e.Lock() - defer e.Unlock() - - ready := waiter.EventMask(0) - if mask&waiter.EventIn != 0 && e.receiver.Readable() { - ready |= waiter.EventIn - } - - if e.Connected() { - if mask&waiter.EventOut != 0 && e.connected.Writable() { - ready |= waiter.EventOut - } - } - - return ready -} diff --git a/pkg/tcpip/transport/unix/unix.go b/pkg/tcpip/transport/unix/unix.go deleted file mode 100644 index 1bca4b0b4..000000000 --- a/pkg/tcpip/transport/unix/unix.go +++ /dev/null @@ -1,953 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -// Package unix contains the implementation of Unix endpoints. -package unix - -import ( - "sync" - "sync/atomic" - - "gvisor.googlesource.com/gvisor/pkg/ilist" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/queue" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// initialLimit is the starting limit for the socket buffers. -const initialLimit = 16 * 1024 - -// A SockType is a type (as opposed to family) of sockets. These are enumerated -// in the syscall package as syscall.SOCK_* constants. -type SockType int - -const ( - // SockStream corresponds to syscall.SOCK_STREAM. - SockStream SockType = 1 - // SockDgram corresponds to syscall.SOCK_DGRAM. - SockDgram SockType = 2 - // SockRaw corresponds to syscall.SOCK_RAW. - SockRaw SockType = 3 - // SockSeqpacket corresponds to syscall.SOCK_SEQPACKET. - SockSeqpacket SockType = 5 -) - -// A RightsControlMessage is a control message containing FDs. -type RightsControlMessage interface { - // Clone returns a copy of the RightsControlMessage. - Clone() RightsControlMessage - - // Release releases any resources owned by the RightsControlMessage. - Release() -} - -// A CredentialsControlMessage is a control message containing Unix credentials. -type CredentialsControlMessage interface { - // Equals returns true iff the two messages are equal. - Equals(CredentialsControlMessage) bool -} - -// A ControlMessages represents a collection of socket control messages. -// -// +stateify savable -type ControlMessages struct { - // Rights is a control message containing FDs. - Rights RightsControlMessage - - // Credentials is a control message containing Unix credentials. - Credentials CredentialsControlMessage -} - -// Empty returns true iff the ControlMessages does not contain either -// credentials or rights. 
-func (c *ControlMessages) Empty() bool { - return c.Rights == nil && c.Credentials == nil -} - -// Clone clones both the credentials and the rights. -func (c *ControlMessages) Clone() ControlMessages { - cm := ControlMessages{} - if c.Rights != nil { - cm.Rights = c.Rights.Clone() - } - cm.Credentials = c.Credentials - return cm -} - -// Release releases both the credentials and the rights. -func (c *ControlMessages) Release() { - if c.Rights != nil { - c.Rights.Release() - } - *c = ControlMessages{} -} - -// Endpoint is the interface implemented by Unix transport protocol -// implementations that expose functionality like sendmsg, recvmsg, connect, -// etc. to Unix socket implementations. -type Endpoint interface { - Credentialer - waiter.Waitable - - // Close puts the endpoint in a closed state and frees all resources - // associated with it. - Close() - - // RecvMsg reads data and a control message from the endpoint. This method - // does not block if there is no data pending. - // - // creds indicates if credential control messages are requested by the - // caller. This is useful for determining if control messages can be - // coalesced. creds is a hint and can be safely ignored by the - // implementation if no coalescing is possible. It is fine to return - // credential control messages when none were requested or to not return - // credential control messages when they were requested. - // - // numRights is the number of SCM_RIGHTS FDs requested by the caller. This - // is useful if one must allocate a buffer to receive a SCM_RIGHTS message - // or determine if control messages can be coalesced. numRights is a hint - // and can be safely ignored by the implementation if the number of - // available SCM_RIGHTS FDs is known and no coalescing is possible. It is - // fine for the returned number of SCM_RIGHTS FDs to be either higher or - // lower than the requested number. - // - // If peek is true, no data should be consumed from the Endpoint. 
Any and - // all data returned from a peek should be available in the next call to - // RecvMsg. - // - // recvLen is the number of bytes copied into data. - // - // msgLen is the length of the read message consumed for datagram Endpoints. - // msgLen is always the same as recvLen for stream Endpoints. - RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen uintptr, cm ControlMessages, err *tcpip.Error) - - // SendMsg writes data and a control message to the endpoint's peer. - // This method does not block if the data cannot be written. - // - // SendMsg does not take ownership of any of its arguments on error. - SendMsg([][]byte, ControlMessages, BoundEndpoint) (uintptr, *tcpip.Error) - - // Connect connects this endpoint directly to another. - // - // This should be called on the client endpoint, and the (bound) - // endpoint passed in as a parameter. - // - // The error codes are the same as Connect. - Connect(server BoundEndpoint) *tcpip.Error - - // Shutdown closes the read and/or write end of the endpoint connection - // to its peer. - Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error - - // Listen puts the endpoint in "listen" mode, which allows it to accept - // new connections. - Listen(backlog int) *tcpip.Error - - // Accept returns a new endpoint if a peer has established a connection - // to an endpoint previously set to listen mode. This method does not - // block if no new connections are available. - // - // The returned Queue is the wait queue for the newly created endpoint. - Accept() (Endpoint, *tcpip.Error) - - // Bind binds the endpoint to a specific local address and port. - // Specifying a NIC is optional. - // - // An optional commit function will be executed atomically with respect - // to binding the endpoint. If this returns an error, the bind will not - // occur and the error will be propagated back to the caller. 
- Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error - - // Type return the socket type, typically either SockStream, SockDgram - // or SockSeqpacket. - Type() SockType - - // GetLocalAddress returns the address to which the endpoint is bound. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) - - // GetRemoteAddress returns the address to which the endpoint is - // connected. - GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) - - // SetSockOpt sets a socket option. opt should be one of the tcpip.*Option - // types. - SetSockOpt(opt interface{}) *tcpip.Error - - // GetSockOpt gets a socket option. opt should be a pointer to one of the - // tcpip.*Option types. - GetSockOpt(opt interface{}) *tcpip.Error -} - -// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket -// option. -type Credentialer interface { - // Passcred returns whether or not the SO_PASSCRED socket option is - // enabled on this end. - Passcred() bool - - // ConnectedPasscred returns whether or not the SO_PASSCRED socket option - // is enabled on the connected end. - ConnectedPasscred() bool -} - -// A BoundEndpoint is a unix endpoint that can be connected to. -type BoundEndpoint interface { - // BidirectionalConnect establishes a bi-directional connection between two - // unix endpoints in an all-or-nothing manner. If an error occurs during - // connecting, the state of neither endpoint should be modified. - // - // In order for an endpoint to establish such a bidirectional connection - // with a BoundEndpoint, the endpoint calls the BidirectionalConnect method - // on the BoundEndpoint and sends a representation of itself (the - // ConnectingEndpoint) and a callback (returnConnect) to receive the - // connection information (Receiver and ConnectedEndpoint) upon a - // successful connect. The callback should only be called on a successful - // connect. 
- // - // For a connection attempt to be successful, the ConnectingEndpoint must - // be unconnected and not listening and the BoundEndpoint whose - // BidirectionalConnect method is being called must be listening. - // - // This method will return tcpip.ErrConnectionRefused on endpoints with a - // type that isn't SockStream or SockSeqpacket. - BidirectionalConnect(ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error - - // UnidirectionalConnect establishes a write-only connection to a unix - // endpoint. - // - // An endpoint which calls UnidirectionalConnect and supports it itself must - // not hold its own lock when calling UnidirectionalConnect. - // - // This method will return tcpip.ErrConnectionRefused on a non-SockDgram - // endpoint. - UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) - - // Release releases any resources held by the BoundEndpoint. It must be - // called before dropping all references to a BoundEndpoint returned by a - // function. - Release() -} - -// message represents a message passed over a Unix domain socket. -// -// +stateify savable -type message struct { - ilist.Entry - - // Data is the Message payload. - Data buffer.View - - // Control is auxiliary control message data that goes along with the - // data. - Control ControlMessages - - // Address is the bound address of the endpoint that sent the message. - // - // If the endpoint that sent the message is not bound, the Address is - // the empty string. - Address tcpip.FullAddress -} - -// Length returns number of bytes stored in the message. -func (m *message) Length() int64 { - return int64(len(m.Data)) -} - -// Release releases any resources held by the message. -func (m *message) Release() { - m.Control.Release() -} - -// Peek returns a copy of the message. 
-func (m *message) Peek() queue.Entry { - return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} -} - -// Truncate reduces the length of the message payload to n bytes. -// -// Preconditions: n <= m.Length(). -func (m *message) Truncate(n int64) { - m.Data.CapLength(int(n)) -} - -// A Receiver can be used to receive Messages. -type Receiver interface { - // Recv receives a single message. This method does not block. - // - // See Endpoint.RecvMsg for documentation on shared arguments. - // - // notify indicates if RecvNotify should be called. - Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (recvLen, msgLen uintptr, cm ControlMessages, source tcpip.FullAddress, notify bool, err *tcpip.Error) - - // RecvNotify notifies the Receiver of a successful Recv. This must not be - // called while holding any endpoint locks. - RecvNotify() - - // CloseRecv prevents the receiving of additional Messages. - // - // After CloseRecv is called, CloseNotify must also be called. - CloseRecv() - - // CloseNotify notifies the Receiver of recv being closed. This must not be - // called while holding any endpoint locks. - CloseNotify() - - // Readable returns if messages should be attempted to be received. This - // includes when read has been shutdown. - Readable() bool - - // RecvQueuedSize returns the total amount of data currently receivable. - // RecvQueuedSize should return -1 if the operation isn't supported. - RecvQueuedSize() int64 - - // RecvMaxQueueSize returns maximum value for RecvQueuedSize. - // RecvMaxQueueSize should return -1 if the operation isn't supported. - RecvMaxQueueSize() int64 - - // Release releases any resources owned by the Receiver. It should be - // called before droping all references to a Receiver. - Release() -} - -// queueReceiver implements Receiver for datagram sockets. -// -// +stateify savable -type queueReceiver struct { - readQueue *queue.Queue -} - -// Recv implements Receiver.Recv. 
-func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { - var m queue.Entry - var notify bool - var err *tcpip.Error - if peek { - m, err = q.readQueue.Peek() - } else { - m, notify, err = q.readQueue.Dequeue() - } - if err != nil { - return 0, 0, ControlMessages{}, tcpip.FullAddress{}, false, err - } - msg := m.(*message) - src := []byte(msg.Data) - var copied uintptr - for i := 0; i < len(data) && len(src) > 0; i++ { - n := copy(data[i], src) - copied += uintptr(n) - src = src[n:] - } - return copied, uintptr(len(msg.Data)), msg.Control, msg.Address, notify, nil -} - -// RecvNotify implements Receiver.RecvNotify. -func (q *queueReceiver) RecvNotify() { - q.readQueue.WriterQueue.Notify(waiter.EventOut) -} - -// CloseNotify implements Receiver.CloseNotify. -func (q *queueReceiver) CloseNotify() { - q.readQueue.ReaderQueue.Notify(waiter.EventIn) - q.readQueue.WriterQueue.Notify(waiter.EventOut) -} - -// CloseRecv implements Receiver.CloseRecv. -func (q *queueReceiver) CloseRecv() { - q.readQueue.Close() -} - -// Readable implements Receiver.Readable. -func (q *queueReceiver) Readable() bool { - return q.readQueue.IsReadable() -} - -// RecvQueuedSize implements Receiver.RecvQueuedSize. -func (q *queueReceiver) RecvQueuedSize() int64 { - return q.readQueue.QueuedSize() -} - -// RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. -func (q *queueReceiver) RecvMaxQueueSize() int64 { - return q.readQueue.MaxQueueSize() -} - -// Release implements Receiver.Release. -func (*queueReceiver) Release() {} - -// streamQueueReceiver implements Receiver for stream sockets. 
-// -// +stateify savable -type streamQueueReceiver struct { - queueReceiver - - mu sync.Mutex `state:"nosave"` - buffer []byte - control ControlMessages - addr tcpip.FullAddress -} - -func vecCopy(data [][]byte, buf []byte) (uintptr, [][]byte, []byte) { - var copied uintptr - for len(data) > 0 && len(buf) > 0 { - n := copy(data[0], buf) - copied += uintptr(n) - buf = buf[n:] - data[0] = data[0][n:] - if len(data[0]) == 0 { - data = data[1:] - } - } - return copied, data, buf -} - -// Readable implements Receiver.Readable. -func (q *streamQueueReceiver) Readable() bool { - q.mu.Lock() - bl := len(q.buffer) - r := q.readQueue.IsReadable() - q.mu.Unlock() - // We're readable if we have data in our buffer or if the queue receiver is - // readable. - return bl > 0 || r -} - -// RecvQueuedSize implements Receiver.RecvQueuedSize. -func (q *streamQueueReceiver) RecvQueuedSize() int64 { - q.mu.Lock() - bl := len(q.buffer) - qs := q.readQueue.QueuedSize() - q.mu.Unlock() - return int64(bl) + qs -} - -// RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. -func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { - // The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest - // message we can buffer which is also the largest message we can receive. - return 2 * q.readQueue.MaxQueueSize() -} - -// Recv implements Receiver.Recv. -func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { - q.mu.Lock() - defer q.mu.Unlock() - - var notify bool - - // If we have no data in the endpoint, we need to get some. - if len(q.buffer) == 0 { - // Load the next message into a buffer, even if we are peeking. Peeking - // won't consume the message, so it will be still available to be read - // the next time Recv() is called. 
- m, n, err := q.readQueue.Dequeue() - if err != nil { - return 0, 0, ControlMessages{}, tcpip.FullAddress{}, false, err - } - notify = n - msg := m.(*message) - q.buffer = []byte(msg.Data) - q.control = msg.Control - q.addr = msg.Address - } - - var copied uintptr - if peek { - // Don't consume control message if we are peeking. - c := q.control.Clone() - - // Don't consume data since we are peeking. - copied, data, _ = vecCopy(data, q.buffer) - - return copied, copied, c, q.addr, notify, nil - } - - // Consume data and control message since we are not peeking. - copied, data, q.buffer = vecCopy(data, q.buffer) - - // Save the original state of q.control. - c := q.control - - // Remove rights from q.control and leave behind just the creds. - q.control.Rights = nil - if !wantCreds { - c.Credentials = nil - } - - if c.Rights != nil && numRights == 0 { - c.Rights.Release() - c.Rights = nil - } - - haveRights := c.Rights != nil - - // If we have more capacity for data and haven't received any usable - // rights. - // - // Linux never coalesces rights control messages. - for !haveRights && len(data) > 0 { - // Get a message from the readQueue. - m, n, err := q.readQueue.Dequeue() - if err != nil { - // We already got some data, so ignore this error. This will - // manifest as a short read to the user, which is what Linux - // does. - break - } - notify = notify || n - msg := m.(*message) - q.buffer = []byte(msg.Data) - q.control = msg.Control - q.addr = msg.Address - - if wantCreds { - if (q.control.Credentials == nil) != (c.Credentials == nil) { - // One message has credentials, the other does not. - break - } - - if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) { - // Both messages have credentials, but they don't match. - break - } - } - - if numRights != 0 && c.Rights != nil && q.control.Rights != nil { - // Both messages have rights. 
- break - } - - var cpd uintptr - cpd, data, q.buffer = vecCopy(data, q.buffer) - copied += cpd - - if cpd == 0 { - // data was actually full. - break - } - - if q.control.Rights != nil { - // Consume rights. - if numRights == 0 { - q.control.Rights.Release() - } else { - c.Rights = q.control.Rights - haveRights = true - } - q.control.Rights = nil - } - } - return copied, copied, c, q.addr, notify, nil -} - -// A ConnectedEndpoint is an Endpoint that can be used to send Messages. -type ConnectedEndpoint interface { - // Passcred implements Endpoint.Passcred. - Passcred() bool - - // GetLocalAddress implements Endpoint.GetLocalAddress. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) - - // Send sends a single message. This method does not block. - // - // notify indicates if SendNotify should be called. - // - // tcpip.ErrWouldBlock can be returned along with a partial write if - // the caller should block to send the rest of the data. - Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n uintptr, notify bool, err *tcpip.Error) - - // SendNotify notifies the ConnectedEndpoint of a successful Send. This - // must not be called while holding any endpoint locks. - SendNotify() - - // CloseSend prevents the sending of additional Messages. - // - // After CloseSend is call, CloseNotify must also be called. - CloseSend() - - // CloseNotify notifies the ConnectedEndpoint of send being closed. This - // must not be called while holding any endpoint locks. - CloseNotify() - - // Writable returns if messages should be attempted to be sent. This - // includes when write has been shutdown. - Writable() bool - - // EventUpdate lets the ConnectedEndpoint know that event registrations - // have changed. - EventUpdate() - - // SendQueuedSize returns the total amount of data currently queued for - // sending. SendQueuedSize should return -1 if the operation isn't - // supported. 
- SendQueuedSize() int64 - - // SendMaxQueueSize returns maximum value for SendQueuedSize. - // SendMaxQueueSize should return -1 if the operation isn't supported. - SendMaxQueueSize() int64 - - // Release releases any resources owned by the ConnectedEndpoint. It should - // be called before droping all references to a ConnectedEndpoint. - Release() -} - -// +stateify savable -type connectedEndpoint struct { - // endpoint represents the subset of the Endpoint functionality needed by - // the connectedEndpoint. It is implemented by both connectionedEndpoint - // and connectionlessEndpoint and allows the use of types which don't - // fully implement Endpoint. - endpoint interface { - // Passcred implements Endpoint.Passcred. - Passcred() bool - - // GetLocalAddress implements Endpoint.GetLocalAddress. - GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) - - // Type implements Endpoint.Type. - Type() SockType - } - - writeQueue *queue.Queue -} - -// Passcred implements ConnectedEndpoint.Passcred. -func (e *connectedEndpoint) Passcred() bool { - return e.endpoint.Passcred() -} - -// GetLocalAddress implements ConnectedEndpoint.GetLocalAddress. -func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { - return e.endpoint.GetLocalAddress() -} - -// Send implements ConnectedEndpoint.Send. -func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { - var l int64 - for _, d := range data { - l += int64(len(d)) - } - - truncate := false - if e.endpoint.Type() == SockStream { - // Since stream sockets don't preserve message boundaries, we - // can write only as much of the message as fits in the queue. - truncate = true - - // Discard empty stream packets. Since stream sockets don't - // preserve message boundaries, sending zero bytes is a no-op. - // In Linux, the receiver actually uses a zero-length receive - // as an indication that the stream was closed. 
- if l == 0 { - controlMessages.Release() - return 0, false, nil - } - } - - v := make([]byte, 0, l) - for _, d := range data { - v = append(v, d...) - } - - l, notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}, truncate) - return uintptr(l), notify, err -} - -// SendNotify implements ConnectedEndpoint.SendNotify. -func (e *connectedEndpoint) SendNotify() { - e.writeQueue.ReaderQueue.Notify(waiter.EventIn) -} - -// CloseNotify implements ConnectedEndpoint.CloseNotify. -func (e *connectedEndpoint) CloseNotify() { - e.writeQueue.ReaderQueue.Notify(waiter.EventIn) - e.writeQueue.WriterQueue.Notify(waiter.EventOut) -} - -// CloseSend implements ConnectedEndpoint.CloseSend. -func (e *connectedEndpoint) CloseSend() { - e.writeQueue.Close() -} - -// Writable implements ConnectedEndpoint.Writable. -func (e *connectedEndpoint) Writable() bool { - return e.writeQueue.IsWritable() -} - -// EventUpdate implements ConnectedEndpoint.EventUpdate. -func (*connectedEndpoint) EventUpdate() {} - -// SendQueuedSize implements ConnectedEndpoint.SendQueuedSize. -func (e *connectedEndpoint) SendQueuedSize() int64 { - return e.writeQueue.QueuedSize() -} - -// SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize. -func (e *connectedEndpoint) SendMaxQueueSize() int64 { - return e.writeQueue.MaxQueueSize() -} - -// Release implements ConnectedEndpoint.Release. -func (*connectedEndpoint) Release() {} - -// baseEndpoint is an embeddable unix endpoint base used in both the connected and connectionless -// unix domain socket Endpoint implementations. -// -// Not to be used on its own. -// -// +stateify savable -type baseEndpoint struct { - *waiter.Queue - - // passcred specifies whether SCM_CREDENTIALS socket control messages are - // enabled on this endpoint. Must be accessed atomically. - passcred int32 - - // Mutex protects the below fields. - sync.Mutex `state:"nosave"` - - // receiver allows Messages to be received. 
- receiver Receiver - - // connected allows messages to be sent and state information about the - // connected endpoint to be read. - connected ConnectedEndpoint - - // path is not empty if the endpoint has been bound, - // or may be used if the endpoint is connected. - path string -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (e *baseEndpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { - e.Queue.EventRegister(we, mask) - e.Lock() - if e.connected != nil { - e.connected.EventUpdate() - } - e.Unlock() -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { - e.Queue.EventUnregister(we) - e.Lock() - if e.connected != nil { - e.connected.EventUpdate() - } - e.Unlock() -} - -// Passcred implements Credentialer.Passcred. -func (e *baseEndpoint) Passcred() bool { - return atomic.LoadInt32(&e.passcred) != 0 -} - -// ConnectedPasscred implements Credentialer.ConnectedPasscred. -func (e *baseEndpoint) ConnectedPasscred() bool { - e.Lock() - defer e.Unlock() - return e.connected != nil && e.connected.Passcred() -} - -func (e *baseEndpoint) setPasscred(pc bool) { - if pc { - atomic.StoreInt32(&e.passcred, 1) - } else { - atomic.StoreInt32(&e.passcred, 0) - } -} - -// Connected implements ConnectingEndpoint.Connected. -func (e *baseEndpoint) Connected() bool { - return e.receiver != nil && e.connected != nil -} - -// RecvMsg reads data and a control message from the endpoint. 
-func (e *baseEndpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, ControlMessages, *tcpip.Error) { - e.Lock() - - if e.receiver == nil { - e.Unlock() - return 0, 0, ControlMessages{}, tcpip.ErrNotConnected - } - - recvLen, msgLen, cms, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) - e.Unlock() - if err != nil { - return 0, 0, ControlMessages{}, err - } - - if notify { - e.receiver.RecvNotify() - } - - if addr != nil { - *addr = a - } - return recvLen, msgLen, cms, nil -} - -// SendMsg writes data and a control message to the endpoint's peer. -// This method does not block if the data cannot be written. -func (e *baseEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) { - e.Lock() - if !e.Connected() { - e.Unlock() - return 0, tcpip.ErrNotConnected - } - if to != nil { - e.Unlock() - return 0, tcpip.ErrAlreadyConnected - } - - n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) - e.Unlock() - - if notify { - e.connected.SendNotify() - } - - return n, err -} - -// SetSockOpt sets a socket option. Currently not supported. -func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error { - switch v := opt.(type) { - case tcpip.PasscredOption: - e.setPasscred(v != 0) - return nil - } - return nil -} - -// GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
-func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch o := opt.(type) { - case tcpip.ErrorOption: - return nil - case *tcpip.SendQueueSizeOption: - e.Lock() - if !e.Connected() { - e.Unlock() - return tcpip.ErrNotConnected - } - qs := tcpip.SendQueueSizeOption(e.connected.SendQueuedSize()) - e.Unlock() - if qs < 0 { - return tcpip.ErrQueueSizeNotSupported - } - *o = qs - return nil - case *tcpip.ReceiveQueueSizeOption: - e.Lock() - if !e.Connected() { - e.Unlock() - return tcpip.ErrNotConnected - } - qs := tcpip.ReceiveQueueSizeOption(e.receiver.RecvQueuedSize()) - e.Unlock() - if qs < 0 { - return tcpip.ErrQueueSizeNotSupported - } - *o = qs - return nil - case *tcpip.PasscredOption: - if e.Passcred() { - *o = tcpip.PasscredOption(1) - } else { - *o = tcpip.PasscredOption(0) - } - return nil - case *tcpip.SendBufferSizeOption: - e.Lock() - if !e.Connected() { - e.Unlock() - return tcpip.ErrNotConnected - } - qs := tcpip.SendBufferSizeOption(e.connected.SendMaxQueueSize()) - e.Unlock() - if qs < 0 { - return tcpip.ErrQueueSizeNotSupported - } - *o = qs - return nil - case *tcpip.ReceiveBufferSizeOption: - e.Lock() - if e.receiver == nil { - e.Unlock() - return tcpip.ErrNotConnected - } - qs := tcpip.ReceiveBufferSizeOption(e.receiver.RecvMaxQueueSize()) - e.Unlock() - if qs < 0 { - return tcpip.ErrQueueSizeNotSupported - } - *o = qs - return nil - } - return tcpip.ErrUnknownProtocolOption -} - -// Shutdown closes the read and/or write end of the endpoint connection to its -// peer. 
-func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { - e.Lock() - if !e.Connected() { - e.Unlock() - return tcpip.ErrNotConnected - } - - if flags&tcpip.ShutdownRead != 0 { - e.receiver.CloseRecv() - } - - if flags&tcpip.ShutdownWrite != 0 { - e.connected.CloseSend() - } - - e.Unlock() - - if flags&tcpip.ShutdownRead != 0 { - e.receiver.CloseNotify() - } - - if flags&tcpip.ShutdownWrite != 0 { - e.connected.CloseNotify() - } - - return nil -} - -// GetLocalAddress returns the bound path. -func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { - e.Lock() - defer e.Unlock() - return tcpip.FullAddress{Addr: tcpip.Address(e.path)}, nil -} - -// GetRemoteAddress returns the local address of the connected endpoint (if -// available). -func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { - e.Lock() - c := e.connected - e.Unlock() - if c != nil { - return c.GetLocalAddress() - } - return tcpip.FullAddress{}, tcpip.ErrNotConnected -} - -// Release implements BoundEndpoint.Release. -func (*baseEndpoint) Release() {} -- cgit v1.2.3 From 578fe5a50dcf8e104b6bce3802987b0f8c069ade Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 17 Oct 2018 11:51:43 -0700 Subject: Fix PTRACE_GETREGSET write size The existing logic is backwards and writes iov_len == 0 for a full write. PiperOrigin-RevId: 217560377 Change-Id: I5a39c31bf0ba9063a8495993bfef58dc8ab7c5fa --- pkg/sentry/kernel/ptrace.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index e21a25ae6..9fe28f435 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -921,7 +921,13 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } - ar.End -= usermem.Addr(n) + + // Update iovecs to represent the range of the written register set. 
+ end, ok := ar.Start.AddLength(uint64(n)) + if !ok { + panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length())) + } + ar.End = end return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) case linux.PTRACE_SETREGS: -- cgit v1.2.3 From 4e6f0892c96c374b1abcf5c39b75ba52d98c97f8 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 17 Oct 2018 12:27:58 -0700 Subject: runsc: Support job control signals for the root container. Now containers run with "docker run -it" support control characters like ^C and ^Z. This required refactoring our signal handling a bit. Signals delivered to the "runsc boot" process are turned into loader.Signal calls with the appropriate delivery mode. Previously they were always sent directly to PID 1. PiperOrigin-RevId: 217566770 Change-Id: I5b7220d9a0f2b591a56335479454a200c6de8732 --- pkg/sentry/kernel/kernel.go | 27 +- pkg/sentry/sighandling/BUILD | 6 +- pkg/sentry/sighandling/sighandling.go | 25 +- runsc/boot/controller.go | 58 ++-- runsc/boot/fds.go | 45 ++- runsc/boot/loader.go | 221 +++++++------- runsc/container/BUILD | 1 + runsc/container/console_test.go | 452 +++++++++++++++++++++++++++++ runsc/container/container_test.go | 203 ------------- runsc/sandbox/sandbox.go | 24 +- runsc/test/integration/exec_test.go | 2 +- runsc/test/integration/integration_test.go | 48 +++ runsc/test/testutil/docker.go | 39 +++ 13 files changed, 776 insertions(+), 375 deletions(-) create mode 100644 runsc/container/console_test.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index cc664deec..84afdb530 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -839,17 +839,40 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() + var lastErr error for t := range k.tasks.Root.tids { if t == t.tg.leader && t.ContainerID() == cid { t.tg.signalHandlers.mu.Lock() defer 
t.tg.signalHandlers.mu.Unlock() infoCopy := *info if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { - return err + lastErr = err } } } - return nil + return lastErr +} + +// SendProcessGroupSignal sends a signal to all processes inside the process +// group. It is analagous to kernel/signal.c:kill_pgrp. +func (k *Kernel) SendProcessGroupSignal(pg *ProcessGroup, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + var lastErr error + for t := range k.tasks.Root.tids { + if t == t.tg.leader && t.tg.ProcessGroup() == pg { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + infoCopy := *info + if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + } + } + return lastErr } // FeatureSet returns the FeatureSet. diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index f480f0735..751176747 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sentry/sighandling/BUILD @@ -10,9 +10,5 @@ go_library( ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling", visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/arch", - "//pkg/sentry/kernel", - ], + deps = ["//pkg/abi/linux"], ) diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 0946ab075..29bcf55ab 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -23,18 +23,17 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" ) // numSignals is the number of normal (non-realtime) signals on Linux. const numSignals = 32 -// forwardSignals listens for incoming signals and delivers them to k. +// handleSignals listens for incoming signals and calls the given handler +// function. 
// // It starts when the start channel is closed, stops when the stop channel // is closed, and closes done once it will no longer deliver signals to k. -func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop, done chan struct{}) { +func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, stop, done chan struct{}) { // Build a select case. sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}} for _, sigchan := range sigchans { @@ -98,18 +97,19 @@ func forwardSignals(k *kernel.Kernel, sigchans []chan os.Signal, start, stop, do } } - k.SendExternalSignal(&arch.SignalInfo{Signo: int32(signal)}, "sentry") + // Pass the signal to the handler. + handler(signal) } } -// PrepareForwarding ensures that synchronous signals are forwarded to k and -// returns a callback that starts signal delivery, which itself returns a -// callback that stops signal forwarding. +// PrepareHandler ensures that synchronous signals are passed to the given +// handler function and returns a callback that starts signal delivery, which +// itself returns a callback that stops signal handling. // // Note that this function permanently takes over signal handling. After the // stop callback, signals revert to the default Go runtime behavior, which // cannot be overridden with external calls to signal.Notify. -func PrepareForwarding(k *kernel.Kernel, skipSignal syscall.Signal) func() func() { +func PrepareHandler(handler func(linux.Signal)) func() func() { start := make(chan struct{}) stop := make(chan struct{}) done := make(chan struct{}) @@ -125,15 +125,10 @@ func PrepareForwarding(k *kernel.Kernel, skipSignal syscall.Signal) func() func( for sig := 1; sig <= numSignals+1; sig++ { sigchan := make(chan os.Signal, 1) sigchans = append(sigchans, sigchan) - - if syscall.Signal(sig) == skipSignal { - continue - } - signal.Notify(sigchan, syscall.Signal(sig)) } // Start up our listener. 
- go forwardSignals(k, sigchans, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. + go handleSignals(sigchans, handler, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. return func() func() { close(start) diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index eaeb9e2d8..bee82f344 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -425,6 +425,26 @@ func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus) } +// SignalDeliveryMode enumerates different signal delivery modes. +type SignalDeliveryMode int + +const ( + // DeliverToProcess delivers the signal to the container process with + // the specified PID. If PID is 0, then the container init process is + // signaled. + DeliverToProcess SignalDeliveryMode = iota + + // DeliverToAllProcesses delivers the signal to all processes in the + // container. PID must be 0. + DeliverToAllProcesses + + // DeliverToForegroundProcessGroup delivers the signal to the + // foreground process group in the same TTY session as the specified + // process. If PID is 0, then the signal is delivered to the foreground + // process group for the TTY for the init process. + DeliverToForegroundProcessGroup +) + // SignalArgs are arguments to the Signal method. type SignalArgs struct { // CID is the container ID. @@ -433,36 +453,20 @@ type SignalArgs struct { // Signo is the signal to send to the process. Signo int32 - // All is set when signal should be sent to all processes in the container. - // When false, the signal is sent to the root container process only. - All bool -} - -// Signal sends a signal to the root process of the container. 
-func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { - log.Debugf("containerManager.Signal %q %d, all: %t", args.CID, args.Signo, args.All) - return cm.l.signalContainer(args.CID, args.Signo, args.All) -} - -// SignalProcessArgs are arguments to the Signal method. -type SignalProcessArgs struct { - // CID is the container ID. - CID string - // PID is the process ID in the given container that will be signaled. + // If 0, the root container will be signalled. PID int32 - // Signo is the signal to send to the process. - Signo int32 - - // SendToForegroundProcess indicates that the signal should be sent to - // the foreground process group in the session that PID belongs to. - // This is only valid if the process is attached to a host TTY. - SendToForegroundProcess bool + // Mode is the signal delivery mode. + Mode SignalDeliveryMode } -// SignalProcess sends a signal to a particular process in the container. -func (cm *containerManager) SignalProcess(args *SignalProcessArgs, _ *struct{}) error { - log.Debugf("containerManager.Signal: %+v", args) - return cm.l.signalProcess(args.CID, args.PID, args.Signo, args.SendToForegroundProcess) +// Signal sends a signal to one or more processes in a container. If args.PID +// is 0, then the container init process is used. Depending on the +// args.SignalDeliveryMode option, the signal may be sent directly to the +// indicated process, to all processes in the container, or to the foreground +// process group. 
+func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal %+v", args) + return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode) } diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index a5a6ba8af..9416e3a5c 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -35,6 +35,7 @@ func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, cons fdm := k.NewFDMap() defer fdm.DecRef() + mounter := fs.FileOwnerFromContext(ctx) // Maps sandbox FD to host FD. fdMap := map[int]int{ @@ -42,16 +43,44 @@ func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, cons 1: stdioFDs[1], 2: stdioFDs[2], } - mounter := fs.FileOwnerFromContext(ctx) - for sfd, hfd := range fdMap { - file, err := host.ImportFile(ctx, hfd, mounter, console /* isTTY */) - if err != nil { - return nil, fmt.Errorf("failed to import fd %d: %v", hfd, err) + var ttyFile *fs.File + for appFD, hostFD := range fdMap { + var appFile *fs.File + + if console && appFD < 3 { + // Import the file as a host TTY file. + if ttyFile == nil { + var err error + appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + + // Remember this in the TTY file, as we will + // use it for the other stdio FDs. + ttyFile = appFile + } else { + // Re-use the existing TTY file, as all three + // stdio FDs must point to the same fs.File in + // order to share TTY state, specifically the + // foreground process group id. + appFile = ttyFile + } + } else { + // Import the file as a regular host file. + var err error + appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() } - defer file.DecRef() - if err := fdm.NewFDAt(kdefs.FD(sfd), file, kernel.FDFlags{}, l); err != nil { - return nil, fmt.Errorf("failed to add imported fd %d to FDMap: %v", hfd, err) + + // Add the file to the FD map. 
+ if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil { + return nil, err } } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index fa169d090..c79b95bde 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,7 +19,6 @@ import ( "fmt" mrand "math/rand" "os" - "os/signal" "runtime" "sync" "sync/atomic" @@ -110,7 +109,7 @@ type Loader struct { // mu guards processes. mu sync.Mutex - // processes maps containers root process and invocation of exec. Root + // processes maps containers init process and invocation of exec. Root // processes are keyed with container ID and pid=0, while exec invocations // have the corresponding pid set. // @@ -291,28 +290,9 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("error creating control server: %v", err) } - // We don't care about child signals; some platforms can generate a - // tremendous number of useless ones (I'm looking at you, ptrace). - if err := sighandling.IgnoreChildStop(); err != nil { - return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) - } - // Ensure that signals received are forwarded to the emulated kernel. - ps := syscall.Signal(args.Conf.PanicSignal) - startSignalForwarding := sighandling.PrepareForwarding(k, ps) - if args.Conf.PanicSignal != -1 { - // Panics if the sentry receives 'Config.PanicSignal'. - panicChan := make(chan os.Signal, 1) - signal.Notify(panicChan, ps) - go func() { // S/R-SAFE: causes sentry panic. 
- <-panicChan - panic("Signal-induced panic") - }() - log.Infof("Panic signal set to %v(%d)", ps, args.Conf.PanicSignal) - } - procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { - return nil, fmt.Errorf("failed to create root process: %v", err) + return nil, fmt.Errorf("failed to create init process for root container: %v", err) } if err := initCompatLogs(args.UserLogFD); err != nil { @@ -320,19 +300,47 @@ func New(args Args) (*Loader, error) { } l := &Loader{ - k: k, - ctrl: ctrl, - conf: args.Conf, - console: args.Console, - watchdog: watchdog, - spec: args.Spec, - goferFDs: args.GoferFDs, - stdioFDs: args.StdioFDs, - startSignalForwarding: startSignalForwarding, - rootProcArgs: procArgs, - sandboxID: args.ID, - processes: make(map[execID]*execProcess), + k: k, + ctrl: ctrl, + conf: args.Conf, + console: args.Console, + watchdog: watchdog, + spec: args.Spec, + goferFDs: args.GoferFDs, + stdioFDs: args.StdioFDs, + rootProcArgs: procArgs, + sandboxID: args.ID, + processes: make(map[execID]*execProcess), } + + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). + if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + } + + // Handle signals by forwarding them to the root container process + // (except for panic signal, which should cause a panic). + l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) { + // Panic signal should cause a panic. + if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) { + panic("Signal-induced panic") + } + + // Otherwise forward to root container. + deliveryMode := DeliverToProcess + if args.Console { + // Since we are running with a console, we should + // forward the signal to the foreground process group + // so that job control signals like ^C can be handled + // properly. 
+ deliveryMode = DeliverToForegroundProcessGroup + } + if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil { + log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err) + } + }) + ctrl.manager.l = l return l, nil } @@ -467,9 +475,15 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } - l.mu.Lock() eid := execID{cid: l.sandboxID} - l.processes[eid] = &execProcess{tg: l.k.GlobalInit()} + ep := execProcess{tg: l.k.GlobalInit()} + if l.console { + ttyFile := l.rootProcArgs.FDMap.GetFile(0) + defer ttyFile.DecRef() + ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) + } + l.mu.Lock() + l.processes[eid] = &ep l.mu.Unlock() // Start signal forwarding only after an init process is created. @@ -572,7 +586,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // filesystem. func (l *Loader) destroyContainer(cid string) error { // First kill and wait for all processes in the container. - if err := l.signalContainer(cid, int32(linux.SIGKILL), true /*all*/); err != nil { + if err := l.signal(cid, 0, int32(linux.SIGKILL), DeliverToAllProcesses); err != nil { return fmt.Errorf("failed to SIGKILL all container processes: %v", err) } @@ -634,7 +648,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return tgid, nil } -// waitContainer waits for the root process of a container to exit. +// waitContainer waits for the init process of a container to exit. func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. @@ -740,11 +754,12 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { } } -// signalProcess sends a signal to the process with the given PID. If -// sendToFGProcess is true, then the signal will be sent to the foreground -// process group in the same session that PID belongs to. 
-func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess bool) error { - if pid <= 0 { +// signal sends a signal to one or more processes in a container. If PID is 0, +// then the container init process is used. Depending on the SignalDeliveryMode +// option, the signal may be sent directly to the indicated process, to all +// processes in the container, or to the foreground process group. +func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error { + if pid < 0 { return fmt.Errorf("failed to signal container %q PID %d: PID must be positive", cid, pid) } @@ -756,10 +771,16 @@ func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess boo ep, ok := l.processes[eid] l.mu.Unlock() - // The caller may be signaling a process not started directly via exec. - // In this case, find the process in the container's PID namespace and - // signal it. - if !ok { + switch mode { + case DeliverToProcess: + if ok { + // Send signal directly to the identified process. + return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) + } + + // The caller may be signaling a process not started directly via exec. + // In this case, find the process in the container's PID namespace and + // signal it. ep, ok := l.processes[execID{cid: cid}] if !ok { return fmt.Errorf("no container with ID: %q", cid) @@ -772,74 +793,60 @@ func (l *Loader) signalProcess(cid string, pid, signo int32, sendToFGProcess boo return fmt.Errorf("process %d is part of a different container: %q", pid, tg.Leader().ContainerID()) } return tg.SendSignal(&arch.SignalInfo{Signo: signo}) - } - - if !sendToFGProcess { - // Send signal directly to exec process. - return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) - } - // Lookup foreground process group from the TTY for the given process, - // and send the signal to it. 
- if ep.tty == nil { - return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, pid) - } - pg := ep.tty.ForegroundProcessGroup() - if pg == nil { - // No foreground process group has been set. Signal the - // original thread group. - log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, pid, pid) - return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) - } + case DeliverToForegroundProcessGroup: + if !ok { + return fmt.Errorf("failed to signal foreground process group for container %q PID %d: no such PID", cid, pid) + } - // Send the signal to all processes in the process group. - var lastErr error - for _, tg := range l.k.TaskSet().Root.ThreadGroups() { - if tg.ProcessGroup() != pg { - continue + // Lookup foreground process group from the TTY for the given process, + // and send the signal to it. + if ep.tty == nil { + return fmt.Errorf("failed to signal foreground process group in container %q PID %d: no TTY attached", cid, pid) } - if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { - lastErr = err + pg := ep.tty.ForegroundProcessGroup() + if pg == nil { + // No foreground process group has been set. Signal the + // original thread group. + log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, pid, pid) + return ep.tg.SendSignal(&arch.SignalInfo{Signo: signo}) + } + // Send the signal to all processes in the process group. 
+ var lastErr error + for _, tg := range l.k.TaskSet().Root.ThreadGroups() { + if tg.ProcessGroup() != pg { + continue + } + if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { + lastErr = err + } + } + return lastErr + case DeliverToAllProcesses: + if !ok { + return fmt.Errorf("failed to signal all processes in container %q PID %d: no such PID", cid, pid) } - } - return lastErr -} - -// signalContainer sends a signal to the root container process, or to all -// processes in the container if all is true. -func (l *Loader) signalContainer(cid string, signo int32, all bool) error { - si := arch.SignalInfo{Signo: signo} - - l.mu.Lock() - defer l.mu.Unlock() - - eid := execID{cid: cid} - ep, ok := l.processes[eid] - if !ok { - return fmt.Errorf("failed to signal container %q: no such container", cid) - } - - if !all { - return ep.tg.SendSignal(&si) - } - // Pause the kernel to prevent new processes from being created while - // the signal is delivered. This prevents process leaks when SIGKILL is - // sent to the entire container. - l.k.Pause() - if err := l.k.SendContainerSignal(cid, &si); err != nil { + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil { + l.k.Unpause() + return err + } l.k.Unpause() - return err - } - l.k.Unpause() - // If killing all processes, wait for them to exit. - if all && linux.Signal(signo) == linux.SIGKILL { - for _, t := range l.k.TaskSet().Root.Tasks() { - if t.ContainerID() == cid { - t.ThreadGroup().WaitExited() + // If SIGKILLing all processes, wait for them to exit. 
+ if linux.Signal(signo) == linux.SIGKILL { + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } } } + return nil + default: + panic(fmt.Sprintf("unknown signal signal delivery mode %v", mode)) } - return nil } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 60f1d3033..f4c6f1525 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -30,6 +30,7 @@ go_test( name = "container_test", size = "medium", srcs = [ + "console_test.go", "container_test.go", "fs_test.go", "multi_container_test.go", diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go new file mode 100644 index 000000000..82adcbb7d --- /dev/null +++ b/runsc/container/console_test.go @@ -0,0 +1,452 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "fmt" + "io" + "os" + "path/filepath" + "sync" + "syscall" + "testing" + "time" + + "github.com/kr/pty" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// createConsoleSocket creates a socket that will receive a console fd from the +// sandbox. If no error occurs, it returns the server socket and a cleanup +// function. 
+func createConsoleSocket(socketPath string) (*unet.ServerSocket, func() error, error) { + cwd, err := os.Getwd() + if err != nil { + return nil, nil, fmt.Errorf("error getting cwd: %v", err) + } + // We use a relative path to avoid overflowing the unix path length + // limit (108 chars). + socketRelPath, err := filepath.Rel(cwd, socketPath) + if err != nil { + return nil, nil, fmt.Errorf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) + } + if len(socketRelPath) > len(socketPath) { + socketRelPath = socketPath + } + srv, err := unet.BindAndListen(socketRelPath, false) + if err != nil { + return nil, nil, fmt.Errorf("error binding and listening to socket %q: %v", socketPath, err) + } + + cleanup := func() error { + if err := srv.Close(); err != nil { + return fmt.Errorf("error closing socket %q: %v", socketRelPath, err) + } + if err := os.Remove(socketPath); err != nil { + return fmt.Errorf("error removing socket %q: %v", socketRelPath, err) + } + return nil + } + + return srv, cleanup, nil +} + +// receiveConsolePTY accepts a connection on the server socket and reads fds. +// It fails if more than one FD is received, or if the FD is not a PTY. It +// returns the PTY master file. +func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) { + sock, err := srv.Accept() + if err != nil { + return nil, fmt.Errorf("error accepting socket connection: %v", err) + } + + // Allow 3 fds to be received. We only expect 1. + r := sock.Reader(true /* blocking */) + r.EnableFDs(1) + + // The socket is closed right after sending the FD, so EOF is + // an allowed error. + b := [][]byte{{}} + if _, err := r.ReadVec(b); err != nil && err != io.EOF { + return nil, fmt.Errorf("error reading from socket connection: %v", err) + } + + // We should have gotten a control message. 
+ fds, err := r.ExtractFDs() + if err != nil { + return nil, fmt.Errorf("error extracting fds from socket connection: %v", err) + } + if len(fds) != 1 { + return nil, fmt.Errorf("got %d fds from socket, wanted 1", len(fds)) + } + + // Verify that the fd is a terminal. + if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { + return nil, fmt.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) + } + + return os.NewFile(uintptr(fds[0]), "pty_master"), nil +} + +// Test that an pty FD is sent over the console socket if one is provided. +func TestConsoleSocket(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + spec := testutil.NewSpecWithArgs("true") + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + socketPath := filepath.Join(bundleDir, "socket") + srv, cleanup, err := createConsoleSocket(socketPath) + if err != nil { + t.Fatalf("error creating socket at %q: %v", socketPath, err) + } + defer cleanup() + + // Create the container and pass the socket name. + id := testutil.UniqueContainerID() + c, err := Create(id, spec, conf, bundleDir, socketPath, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + ptyMaster.Close() + } +} + +// Test that job control signals work on a console created with "exec -ti". 
+func TestJobControlSignalExec(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + conf := testutil.TestConfig() + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Create a pty master/slave. The slave will be passed to the exec + // process. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + t.Fatalf("error opening pty: %v", err) + } + defer ptyMaster.Close() + defer ptySlave.Close() + + // Exec bash and attach a terminal. + args := &control.ExecArgs{ + Filename: "/bin/bash", + // Don't let bash execute from profile or rc files, otherwise + // our PID counts get messed up. + Argv: []string{"/bin/bash", "--noprofile", "--norc"}, + // Pass the pty slave as FD 0, 1, and 2. + FilePayload: urpc.FilePayload{ + Files: []*os.File{ptySlave, ptySlave, ptySlave}, + }, + StdioIsPty: true, + } + + pid, err := c.Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } + if pid != 2 { + t.Fatalf("exec got pid %d, wanted %d", pid, 2) + } + + // Make sure all the processes are running. + expectedPL := []*control.Process{ + // Root container process. + {PID: 1, Cmd: "sleep"}, + // Bash from exec process. + {PID: 2, Cmd: "bash"}, + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Execute sleep. + ptyMaster.Write([]byte("sleep 100\n")) + + // Wait for it to start. Sleep's PPID is bash's PID. 
+ expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"}) + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Send a SIGTERM to the foreground process for the exec PID. Note that + // although we pass in the PID of "bash", it should actually terminate + // "sleep", since that is the foreground process. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGTERM, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Sleep process should be gone. + expectedPL = expectedPL[:len(expectedPL)-1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Sleep is dead, but it may take more time for bash to notice and + // change the foreground process back to itself. We know it is done + // when bash writes "Terminated" to the pty. + if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { + t.Fatalf("bash did not take over pty: %v", err) + } + + // Send a SIGKILL to the foreground process again. This time "bash" + // should be killed. We use SIGKILL instead of SIGTERM or SIGINT + // because bash ignores those. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGKILL, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + expectedPL = expectedPL[:1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Make sure the process indicates it was killed by a SIGKILL. + ws, err := c.WaitPID(pid, true) + if err != nil { + t.Errorf("waiting on container failed: %v", err) + } + if !ws.Signaled() { + t.Error("ws.Signaled() got false, want true") + } + if got, want := ws.Signal(), syscall.SIGKILL; got != want { + t.Errorf("ws.Signal() got %v, want %v", got, want) + } +} + +// Test that job control signals work on a console created with "run -ti". 
+func TestJobControlSignalRootContainer(t *testing.T) { + conf := testutil.TestConfig() + // Don't let bash execute from profile or rc files, otherwise our PID + // counts get messed up. + spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc") + spec.Process.Terminal = true + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + socketPath := filepath.Join(bundleDir, "socket") + srv, cleanup, err := createConsoleSocket(socketPath) + if err != nil { + t.Fatalf("error creating socket at %q: %v", socketPath, err) + } + defer cleanup() + + // Create the container and pass the socket name. + id := testutil.UniqueContainerID() + c, err := Create(id, spec, conf, bundleDir, socketPath, "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + + // Get the PTY master. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + defer ptyMaster.Close() + + // Bash output as well as sandbox output will be written to the PTY + // file. Writes after a certain point will block unless we drain the + // PTY, so we must continually copy from it. + // + // We log the output to stdout for debugabilitly, and also to a buffer, + // since we wait on particular output from bash below. We use a custom + // blockingBuffer which is thread-safe and also blocks on Read calls, + // which makes this a suitable Reader for WaitUntilRead. + ptyBuf := newBlockingBuffer() + tee := io.TeeReader(ptyMaster, ptyBuf) + go io.Copy(os.Stdout, tee) + + // Start the container. + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Start waiting for the container to exit in a goroutine. We do this + // very early, otherwise it might exit before we have a chance to call + // Wait. 
+ var ( + ws syscall.WaitStatus + wg sync.WaitGroup + ) + wg.Add(1) + go func() { + var err error + ws, err = c.Wait() + if err != nil { + t.Errorf("error waiting on container: %v", err) + } + wg.Done() + }() + + // Wait for bash to start. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "bash"}, + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Fatal(err) + } + + // Execute sleep via the terminal. + ptyMaster.Write([]byte("sleep 100\n")) + + // Wait for sleep to start. + expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"}) + if err := waitForProcessList(c, expectedPL); err != nil { + t.Fatal(err) + } + + // Reset the pty buffer, so there is less output for us to scan later. + ptyBuf.Reset() + + // Send a SIGTERM to the foreground process. We pass PID=0, indicating + // that the root process should be killed. However, by setting + // fgProcess=true, the signal should actually be sent to sleep. + if err := c.Sandbox.SignalProcess(c.ID, 0 /* PID */, syscall.SIGTERM, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Sleep process should be gone. + expectedPL = expectedPL[:len(expectedPL)-1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Sleep is dead, but it may take more time for bash to notice and + // change the foreground process back to itself. We know it is done + // when bash writes "Terminated" to the pty. + if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil { + t.Fatalf("bash did not take over pty: %v", err) + } + + // Send a SIGKILL to the foreground process again. This time "bash" + // should be killed. We use SIGKILL instead of SIGTERM or SIGINT + // because bash ignores those. + if err := c.Sandbox.SignalProcess(c.ID, 0 /* PID */, syscall.SIGKILL, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Wait for the sandbox to exit. 
It should exit with a SIGKILL status. + wg.Wait() + if !ws.Signaled() { + t.Error("ws.Signaled() got false, want true") + } + if got, want := ws.Signal(), syscall.SIGKILL; got != want { + t.Errorf("ws.Signal() got %v, want %v", got, want) + } +} + +// blockingBuffer is a thread-safe buffer that blocks when reading if the +// buffer is empty. It implements io.ReadWriter. +type blockingBuffer struct { + // A send to readCh indicates that a previously empty buffer now has + // data for reading. + readCh chan struct{} + + // mu protects buf. + mu sync.Mutex + buf bytes.Buffer +} + +func newBlockingBuffer() *blockingBuffer { + return &blockingBuffer{ + readCh: make(chan struct{}, 1), + } +} + +// Write implements Writer.Write. +func (bb *blockingBuffer) Write(p []byte) (int, error) { + bb.mu.Lock() + defer bb.mu.Unlock() + l := bb.buf.Len() + n, err := bb.buf.Write(p) + if l == 0 && n > 0 { + // New data! + bb.readCh <- struct{}{} + } + return n, err +} + +// Read implements Reader.Read. It will block until data is available. +func (bb *blockingBuffer) Read(p []byte) (int, error) { + for { + bb.mu.Lock() + n, err := bb.buf.Read(p) + if n > 0 || err != io.EOF { + if bb.buf.Len() == 0 { + // Reset the readCh. + select { + case <-bb.readCh: + default: + } + } + bb.mu.Unlock() + return n, err + } + bb.mu.Unlock() + + // Wait for new data. + <-bb.readCh + } +} + +// Reset resets the buffer. +func (bb *blockingBuffer) Reset() { + bb.mu.Lock() + defer bb.mu.Unlock() + bb.buf.Reset() + // Reset the readCh. 
+ select { + case <-bb.readCh: + default: + } +} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index d9cd38c0a..e2bb7d8ec 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -17,7 +17,6 @@ package container import ( "bytes" "fmt" - "io" "io/ioutil" "os" "path" @@ -31,15 +30,11 @@ import ( "time" "github.com/cenkalti/backoff" - "github.com/kr/pty" specs "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/unet" - "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -1151,89 +1146,6 @@ func TestCapabilities(t *testing.T) { } } -// Test that an tty FD is sent over the console socket if one is provided. -func TestConsoleSocket(t *testing.T) { - for _, conf := range configs(all...) { - t.Logf("Running test with conf: %+v", conf) - spec := testutil.NewSpecWithArgs("true") - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create a named socket and start listening. We use a relative path - // to avoid overflowing the unix path length limit (108 chars). 
- socketPath := filepath.Join(bundleDir, "socket") - cwd, err := os.Getwd() - if err != nil { - t.Fatalf("error getting cwd: %v", err) - } - socketRelPath, err := filepath.Rel(cwd, socketPath) - if err != nil { - t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) - } - if len(socketRelPath) > len(socketPath) { - socketRelPath = socketPath - } - srv, err := unet.BindAndListen(socketRelPath, false) - if err != nil { - t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) - } - defer os.Remove(socketPath) - - // Create the container and pass the socket name. - id := testutil.UniqueContainerID() - c, err := Create(id, spec, conf, bundleDir, socketRelPath, "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - c.Destroy() - - // Open the othe end of the socket. - sock, err := srv.Accept() - if err != nil { - t.Fatalf("error accepting socket connection: %v", err) - } - - // Allow 3 fds to be received. We only expect 1. - r := sock.Reader(true /* blocking */) - r.EnableFDs(1) - - // The socket is closed right after sending the FD, so EOF is - // an allowed error. - b := [][]byte{{}} - if _, err := r.ReadVec(b); err != nil && err != io.EOF { - t.Fatalf("error reading from socket connection: %v", err) - } - - // We should have gotten a control message. - fds, err := r.ExtractFDs() - if err != nil { - t.Fatalf("error extracting fds from socket connection: %v", err) - } - if len(fds) != 1 { - t.Fatalf("got %d fds from socket, wanted 1", len(fds)) - } - - // Verify that the fd is a terminal. - if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { - t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) - } - - // Shut it down. - if err := c.Destroy(); err != nil { - t.Fatalf("error destroying container: %v", err) - } - - // Close socket. 
- if err := srv.Close(); err != nil { - t.Fatalf("error destroying container: %v", err) - } - } -} - // TestRunNonRoot checks that sandbox can be configured when running as // non-privileged user. func TestRunNonRoot(t *testing.T) { @@ -1626,121 +1538,6 @@ func TestRootNotMount(t *testing.T) { } } -func TestJobControlSignalExec(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") - conf := testutil.TestConfig() - - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create and start the container. - c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - - // Create a pty master/slave. The slave will be passed to the exec - // process. - ptyMaster, ptySlave, err := pty.Open() - if err != nil { - t.Fatalf("error opening pty: %v", err) - } - defer ptyMaster.Close() - defer ptySlave.Close() - - // Exec bash and attach a terminal. - args := &control.ExecArgs{ - Filename: "/bin/bash", - // Don't let bash execute from profile or rc files, otherwise - // our PID counts get messed up. - Argv: []string{"/bin/bash", "--noprofile", "--norc"}, - // Pass the pty slave as FD 0, 1, and 2. - FilePayload: urpc.FilePayload{ - Files: []*os.File{ptySlave, ptySlave, ptySlave}, - }, - StdioIsPty: true, - } - - pid, err := c.Execute(args) - if err != nil { - t.Fatalf("error executing: %v", err) - } - if pid != 2 { - t.Fatalf("exec got pid %d, wanted %d", pid, 2) - } - - // Make sure all the processes are running. - expectedPL := []*control.Process{ - // Root container process. - {PID: 1, Cmd: "sleep"}, - // Bash from exec process. 
- {PID: 2, Cmd: "bash"}, - } - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } - - // Execute sleep. - ptyMaster.Write([]byte("sleep 100\n")) - - // Wait for it to start. Sleep's PPID is bash's PID. - expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"}) - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } - - // Send a SIGTERM to the foreground process for the exec PID. Note that - // although we pass in the PID of "bash", it should actually terminate - // "sleep", since that is the foreground process. - if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGTERM, true /* fgProcess */); err != nil { - t.Fatalf("error signaling container: %v", err) - } - - // Sleep process should be gone. - expectedPL = expectedPL[:len(expectedPL)-1] - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } - - // Sleep is dead, but it may take more time for bash to notice and - // change the foreground process back to itself. We know it is done - // when bash writes "Terminated" to the pty. - if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { - t.Fatalf("bash did not take over pty: %v", err) - } - - // Send a SIGKILL to the foreground process again. This time "bash" - // should be killed. We use SIGKILL instead of SIGTERM or SIGINT - // because bash ignores those. - if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGKILL, true /* fgProcess */); err != nil { - t.Fatalf("error signaling container: %v", err) - } - expectedPL = expectedPL[:1] - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } - - // Make sure the process indicates it was killed by a SIGKILL. 
- ws, err := c.WaitPID(pid, true) - if err != nil { - t.Errorf("waiting on container failed: %v", err) - } - if !ws.Signaled() { - t.Error("ws.Signaled() got false, want true") - } - if got, want := ws.Signal(), syscall.SIGKILL; got != want { - t.Errorf("ws.Signal() got %v, want %v", got, want) - } -} - func TestUserLog(t *testing.T) { app, err := testutil.FindFile("runsc/container/test_app") if err != nil { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 713b326a6..6dc8cf7f0 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -696,10 +696,15 @@ func (s *Sandbox) SignalContainer(cid string, sig syscall.Signal, all bool) erro } defer conn.Close() + mode := boot.DeliverToProcess + if all { + mode = boot.DeliverToAllProcesses + } + args := boot.SignalArgs{ CID: cid, Signo: int32(sig), - All: all, + Mode: mode, } if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { return fmt.Errorf("err signaling container %q: %v", cid, err) @@ -719,13 +724,18 @@ func (s *Sandbox) SignalProcess(cid string, pid int32, sig syscall.Signal, fgPro } defer conn.Close() - args := boot.SignalProcessArgs{ - CID: cid, - Signo: int32(sig), - PID: pid, - SendToForegroundProcess: fgProcess, + mode := boot.DeliverToProcess + if fgProcess { + mode = boot.DeliverToForegroundProcessGroup + } + + args := boot.SignalArgs{ + CID: cid, + Signo: int32(sig), + PID: pid, + Mode: mode, } - if err := conn.Call(boot.ContainerSignalProcess, &args, nil); err != nil { + if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { return fmt.Errorf("err signaling container %q PID %d: %v", cid, pid, err) } return nil diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index d08140ad3..3cac674d0 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -66,7 +66,7 @@ func TestExecJobControl(t *testing.T) { if err := testutil.Pull("alpine"); err != nil { t.Fatalf("docker pull failed: 
%v", err) } - d := testutil.MakeDocker("exec-test") + d := testutil.MakeDocker("exec-job-control-test") // Start the container. if err := d.Run("alpine", "sleep", "1000"); err != nil { diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index b7d07309d..536bb17e0 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -28,6 +28,7 @@ import ( "os" "strconv" "strings" + "syscall" "testing" "time" @@ -231,6 +232,53 @@ func TestNumCPU(t *testing.T) { } } +// TestJobControl tests that job control characters are handled properly. +func TestJobControl(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatalf("docker pull failed: %v", err) + } + d := testutil.MakeDocker("job-control-test") + + // Start the container with an attached PTY. + _, ptmx, err := d.RunWithPty("alpine", "sh") + if err != nil { + t.Fatalf("docker run failed: %v", err) + } + defer ptmx.Close() + defer d.CleanUp() + + // Call "sleep 100" in the shell. + if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Give shell a few seconds to start executing the sleep. + time.Sleep(2 * time.Second) + + // Send a ^C to the pty, which should kill sleep, but not the shell. + // \x03 is ASCII "end of text", which is the same as ^C. + if _, err := ptmx.Write([]byte{'\x03'}); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // The shell should still be alive at this point. Sleep should have + // exited with code 2+128=130. We'll exit with 10 plus that number, so + // that we can be sure that the shell did not get signalled. + if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Wait for the container to exit. 
+ got, err := d.Wait(5 * time.Second) + if err != nil { + t.Fatalf("error getting exit code: %v", err) + } + // Container should exit with code 10+130=140. + if want := syscall.WaitStatus(140); got != want { + t.Errorf("container exited with code %d want %d", got, want) + } +} + func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 8a51d3eed..4e48817cf 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -25,6 +25,7 @@ import ( "regexp" "strconv" "strings" + "syscall" "time" "github.com/kr/pty" @@ -198,6 +199,13 @@ func (d *Docker) Run(args ...string) error { return err } +// RunWithPty is like Run but with an attached pty. +func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) { + a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-it"} + a = append(a, args...) + return doWithPty(a...) +} + // RunFg calls 'docker run' with the arguments provided in the foreground. It // blocks until the container exits and returns the output. func (d *Docker) RunFg(args ...string) (string, error) { @@ -307,6 +315,37 @@ func (d *Docker) ID() (string, error) { return strings.TrimSpace(string(out)), nil } +// Wait waits for container to exit, up to the given timeout. Returns error if +// wait fails or timeout is hit. Returns the application return code otherwise. +// Note that the application may have failed even if err == nil, always check +// the exit code. 
+func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) { + timeoutChan := time.After(timeout) + waitChan := make(chan (syscall.WaitStatus)) + errChan := make(chan (error)) + + go func() { + out, err := do("wait", d.Name) + if err != nil { + errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err) + } + exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) + if err != nil { + errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err) + } + waitChan <- syscall.WaitStatus(uint32(exit)) + }() + + select { + case ws := <-waitChan: + return ws, nil + case err := <-errChan: + return syscall.WaitStatus(1), err + case <-timeoutChan: + return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name) + } +} + // WaitForOutput calls 'docker logs' to retrieve containers output and searches // for the given pattern. func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { -- cgit v1.2.3 From 8fa6f6fe769ede042b651e5b82bd93721e3aa339 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 17 Oct 2018 13:05:14 -0700 Subject: Reflow comment to 80 columns PiperOrigin-RevId: 217573168 Change-Id: Ic1914d0ef71bab020e3ee11cf9c4a50a702bd8dd --- pkg/sentry/strace/strace.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index a16f5490e..f7bfa3a1f 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -568,8 +568,10 @@ func (s SyscallMap) SyscallExit(context interface{}, t *kernel.Task, sysno, rval } } -// ConvertToSysnoMap converts the names to a map keyed on the syscall number and value set to true. -// The map is in a convenient format to call SyscallFlagsTable.Enable(). +// ConvertToSysnoMap converts the names to a map keyed on the syscall number +// and value set to true. +// +// The map is in a convenient format to pass to SyscallFlagsTable.Enable(). 
func (s SyscallMap) ConvertToSysnoMap(syscalls []string) (map[uintptr]bool, error) { if syscalls == nil { // Sentinel: no list. -- cgit v1.2.3 From 8c85f5e9ce1d7e25010ac295006555a46034bc39 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 17 Oct 2018 13:24:52 -0700 Subject: Fix typos in socket_test PiperOrigin-RevId: 217576188 Change-Id: I82e45c306c5c9161e207311c7dbb8a983820c1df --- pkg/sentry/fs/host/socket_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 1c6f9ddb1..483e99dd6 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -71,18 +71,18 @@ func TestSocketIsBlocking(t *testing.T) { t.Fatalf("newSocket(%v) failed => %v", pair[0], err) } defer sock.DecRef() - // Test that the socket now is non blocking. + // Test that the socket now is non-blocking. if fl, err = getFl(pair[0]); err != nil { t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) } if fl&syscall.O_NONBLOCK != syscall.O_NONBLOCK { - t.Errorf("Expected socket %v to have becoming non blocking", pair[0]) + t.Errorf("Expected socket %v to have become non-blocking", pair[0]) } if fl, err = getFl(pair[1]); err != nil { t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err) } if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { - t.Errorf("Did not expect socket %v to become non blocking", pair[1]) + t.Errorf("Did not expect socket %v to become non-blocking", pair[1]) } } -- cgit v1.2.3 From 6922eee6499212a009fdc254224f916bd1c46f29 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 17 Oct 2018 15:09:26 -0700 Subject: Merge queue into Unix transport This queue only has a single user, so there is no need for it to use an interface. Merging it into the same package as its sole user allows us to avoid a circular dependency. This simplifies the code and should slightly improve performance. 
PiperOrigin-RevId: 217595889 Change-Id: Iabbd5164240b935f79933618c61581bc8dcd2822 --- pkg/sentry/socket/unix/transport/BUILD | 2 +- pkg/sentry/socket/unix/transport/connectioned.go | 9 +- pkg/sentry/socket/unix/transport/connectionless.go | 3 +- pkg/sentry/socket/unix/transport/queue.go | 206 +++++++++++++++++++ pkg/sentry/socket/unix/transport/queue/BUILD | 15 -- pkg/sentry/socket/unix/transport/queue/queue.go | 227 --------------------- pkg/sentry/socket/unix/transport/unix.go | 28 ++- 7 files changed, 224 insertions(+), 266 deletions(-) create mode 100644 pkg/sentry/socket/unix/transport/queue.go delete mode 100644 pkg/sentry/socket/unix/transport/queue/BUILD delete mode 100644 pkg/sentry/socket/unix/transport/queue/queue.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 04ef0d438..75b5a2eb6 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -8,13 +8,13 @@ go_library( "connectioned.go", "connectioned_state.go", "connectionless.go", + "queue.go", "unix.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport", visibility = ["//:sandbox"], deps = [ "//pkg/ilist", - "//pkg/sentry/socket/unix/transport/queue", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/waiter", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index f09935765..566e3d57b 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -17,7 +17,6 @@ package transport import ( "sync" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport/queue" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -135,8 +134,8 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { stype: stype, } - q1 := queue.New(a.Queue, b.Queue, initialLimit) - q2 := queue.New(b.Queue, 
a.Queue, initialLimit) + q1 := newQueue(a.Queue, b.Queue, initialLimit) + q2 := newQueue(b.Queue, a.Queue, initialLimit) if stype == SockStream { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -283,8 +282,8 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur idGenerator: e.idGenerator, stype: e.stype, } - readQueue := queue.New(ce.WaiterQueue(), ne.Queue, initialLimit) - writeQueue := queue.New(ne.Queue, ce.WaiterQueue(), initialLimit) + readQueue := newQueue(ce.WaiterQueue(), ne.Queue, initialLimit) + writeQueue := newQueue(ne.Queue, ce.WaiterQueue(), initialLimit) ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index fb2728010..86cd05199 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -15,7 +15,6 @@ package transport import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport/queue" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -34,7 +33,7 @@ type connectionlessEndpoint struct { // NewConnectionless creates a new unbound dgram endpoint. func NewConnectionless() Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} - ep.receiver = &queueReceiver{readQueue: queue.New(&waiter.Queue{}, ep.Queue, initialLimit)} + ep.receiver = &queueReceiver{readQueue: newQueue(&waiter.Queue{}, ep.Queue, initialLimit)} return ep } diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go new file mode 100644 index 000000000..203e31333 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -0,0 +1,206 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package transport + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// queue is a buffer queue. +// +// +stateify savable +type queue struct { + ReaderQueue *waiter.Queue + WriterQueue *waiter.Queue + + mu sync.Mutex `state:"nosave"` + closed bool + used int64 + limit int64 + dataList ilist.List +} + +// newQueue allocates and initializes a new queue. +func newQueue(ReaderQueue *waiter.Queue, WriterQueue *waiter.Queue, limit int64) *queue { + return &queue{ReaderQueue: ReaderQueue, WriterQueue: WriterQueue, limit: limit} +} + +// Close closes q for reading and writing. It is immediately not writable and +// will become unreadable when no more data is pending. +// +// Both the read and write queues must be notified after closing: +// q.ReaderQueue.Notify(waiter.EventIn) +// q.WriterQueue.Notify(waiter.EventOut) +func (q *queue) Close() { + q.mu.Lock() + q.closed = true + q.mu.Unlock() +} + +// Reset empties the queue and Releases all of the Entries. +// +// Both the read and write queues must be notified after resetting: +// q.ReaderQueue.Notify(waiter.EventIn) +// q.WriterQueue.Notify(waiter.EventOut) +func (q *queue) Reset() { + q.mu.Lock() + for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { + cur.(*message).Release() + } + q.dataList.Reset() + q.used = 0 + q.mu.Unlock() +} + +// IsReadable determines if q is currently readable. 
+func (q *queue) IsReadable() bool { + q.mu.Lock() + defer q.mu.Unlock() + + return q.closed || q.dataList.Front() != nil +} + +// bufWritable returns true if there is space for writing. +// +// N.B. Linux only considers a unix socket "writable" if >75% of the buffer is +// free. +// +// See net/unix/af_unix.c:unix_writeable. +func (q *queue) bufWritable() bool { + return 4*q.used < q.limit +} + +// IsWritable determines if q is currently writable. +func (q *queue) IsWritable() bool { + q.mu.Lock() + defer q.mu.Unlock() + + return q.closed || q.bufWritable() +} + +// Enqueue adds an entry to the data queue if room is available. +// +// If truncate is true, Enqueue may truncate the message beforing enqueuing it. +// Otherwise, the entire message must fit. If n < e.Length(), err indicates why. +// +// If notify is true, ReaderQueue.Notify must be called: +// q.ReaderQueue.Notify(waiter.EventIn) +func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err *tcpip.Error) { + q.mu.Lock() + + if q.closed { + q.mu.Unlock() + return 0, false, tcpip.ErrClosedForSend + } + + free := q.limit - q.used + + l = e.Length() + + if l > free && truncate { + if free == 0 { + // Message can't fit right now. + q.mu.Unlock() + return 0, false, tcpip.ErrWouldBlock + } + + e.Truncate(free) + l = e.Length() + err = tcpip.ErrWouldBlock + } + + if l > q.limit { + // Message is too big to ever fit. + q.mu.Unlock() + return 0, false, tcpip.ErrMessageTooLong + } + + if l > free { + // Message can't fit right now. + q.mu.Unlock() + return 0, false, tcpip.ErrWouldBlock + } + + notify = q.dataList.Front() == nil + q.used += l + q.dataList.PushBack(e) + + q.mu.Unlock() + + return l, notify, err +} + +// Dequeue removes the first entry in the data queue, if one exists. 
+// +// If notify is true, WriterQueue.Notify must be called: +// q.WriterQueue.Notify(waiter.EventOut) +func (q *queue) Dequeue() (e *message, notify bool, err *tcpip.Error) { + q.mu.Lock() + + if q.dataList.Front() == nil { + err := tcpip.ErrWouldBlock + if q.closed { + err = tcpip.ErrClosedForReceive + } + q.mu.Unlock() + + return nil, false, err + } + + notify = !q.bufWritable() + + e = q.dataList.Front().(*message) + q.dataList.Remove(e) + q.used -= e.Length() + + notify = notify && q.bufWritable() + + q.mu.Unlock() + + return e, notify, nil +} + +// Peek returns the first entry in the data queue, if one exists. +func (q *queue) Peek() (*message, *tcpip.Error) { + q.mu.Lock() + defer q.mu.Unlock() + + if q.dataList.Front() == nil { + err := tcpip.ErrWouldBlock + if q.closed { + err = tcpip.ErrClosedForReceive + } + return nil, err + } + + return q.dataList.Front().(*message).Peek(), nil +} + +// QueuedSize returns the number of bytes currently in the queue, that is, the +// number of readable bytes. +func (q *queue) QueuedSize() int64 { + q.mu.Lock() + defer q.mu.Unlock() + return q.used +} + +// MaxQueueSize returns the maximum number of bytes storable in the queue. 
+func (q *queue) MaxQueueSize() int64 { + return q.limit +} diff --git a/pkg/sentry/socket/unix/transport/queue/BUILD b/pkg/sentry/socket/unix/transport/queue/BUILD deleted file mode 100644 index d914ecc23..000000000 --- a/pkg/sentry/socket/unix/transport/queue/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -package(licenses = ["notice"]) # Apache 2.0 - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "queue", - srcs = ["queue.go"], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport/queue", - visibility = ["//:sandbox"], - deps = [ - "//pkg/ilist", - "//pkg/tcpip", - "//pkg/waiter", - ], -) diff --git a/pkg/sentry/socket/unix/transport/queue/queue.go b/pkg/sentry/socket/unix/transport/queue/queue.go deleted file mode 100644 index b3d2ea68b..000000000 --- a/pkg/sentry/socket/unix/transport/queue/queue.go +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package queue provides the implementation of buffer queue -// and interface of queue entry with Length method. -package queue - -import ( - "sync" - - "gvisor.googlesource.com/gvisor/pkg/ilist" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// Entry implements Linker interface and has additional required methods. -type Entry interface { - ilist.Linker - - // Length returns the number of bytes stored in the entry. 
- Length() int64 - - // Release releases any resources held by the entry. - Release() - - // Peek returns a copy of the entry. It must be Released separately. - Peek() Entry - - // Truncate reduces the number of bytes stored in the entry to n bytes. - // - // Preconditions: n <= Length(). - Truncate(n int64) -} - -// Queue is a buffer queue. -// -// +stateify savable -type Queue struct { - ReaderQueue *waiter.Queue - WriterQueue *waiter.Queue - - mu sync.Mutex `state:"nosave"` - closed bool - used int64 - limit int64 - dataList ilist.List -} - -// New allocates and initializes a new queue. -func New(ReaderQueue *waiter.Queue, WriterQueue *waiter.Queue, limit int64) *Queue { - return &Queue{ReaderQueue: ReaderQueue, WriterQueue: WriterQueue, limit: limit} -} - -// Close closes q for reading and writing. It is immediately not writable and -// will become unreadable when no more data is pending. -// -// Both the read and write queues must be notified after closing: -// q.ReaderQueue.Notify(waiter.EventIn) -// q.WriterQueue.Notify(waiter.EventOut) -func (q *Queue) Close() { - q.mu.Lock() - q.closed = true - q.mu.Unlock() -} - -// Reset empties the queue and Releases all of the Entries. -// -// Both the read and write queues must be notified after resetting: -// q.ReaderQueue.Notify(waiter.EventIn) -// q.WriterQueue.Notify(waiter.EventOut) -func (q *Queue) Reset() { - q.mu.Lock() - for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { - cur.(Entry).Release() - } - q.dataList.Reset() - q.used = 0 - q.mu.Unlock() -} - -// IsReadable determines if q is currently readable. -func (q *Queue) IsReadable() bool { - q.mu.Lock() - defer q.mu.Unlock() - - return q.closed || q.dataList.Front() != nil -} - -// bufWritable returns true if there is space for writing. -// -// N.B. Linux only considers a unix socket "writable" if >75% of the buffer is -// free. -// -// See net/unix/af_unix.c:unix_writeable. 
-func (q *Queue) bufWritable() bool { - return 4*q.used < q.limit -} - -// IsWritable determines if q is currently writable. -func (q *Queue) IsWritable() bool { - q.mu.Lock() - defer q.mu.Unlock() - - return q.closed || q.bufWritable() -} - -// Enqueue adds an entry to the data queue if room is available. -// -// If truncate is true, Enqueue may truncate the message beforing enqueuing it. -// Otherwise, the entire message must fit. If n < e.Length(), err indicates why. -// -// If notify is true, ReaderQueue.Notify must be called: -// q.ReaderQueue.Notify(waiter.EventIn) -func (q *Queue) Enqueue(e Entry, truncate bool) (l int64, notify bool, err *tcpip.Error) { - q.mu.Lock() - - if q.closed { - q.mu.Unlock() - return 0, false, tcpip.ErrClosedForSend - } - - free := q.limit - q.used - - l = e.Length() - - if l > free && truncate { - if free == 0 { - // Message can't fit right now. - q.mu.Unlock() - return 0, false, tcpip.ErrWouldBlock - } - - e.Truncate(free) - l = e.Length() - err = tcpip.ErrWouldBlock - } - - if l > q.limit { - // Message is too big to ever fit. - q.mu.Unlock() - return 0, false, tcpip.ErrMessageTooLong - } - - if l > free { - // Message can't fit right now. - q.mu.Unlock() - return 0, false, tcpip.ErrWouldBlock - } - - notify = q.dataList.Front() == nil - q.used += l - q.dataList.PushBack(e) - - q.mu.Unlock() - - return l, notify, err -} - -// Dequeue removes the first entry in the data queue, if one exists. 
-// -// If notify is true, WriterQueue.Notify must be called: -// q.WriterQueue.Notify(waiter.EventOut) -func (q *Queue) Dequeue() (e Entry, notify bool, err *tcpip.Error) { - q.mu.Lock() - - if q.dataList.Front() == nil { - err := tcpip.ErrWouldBlock - if q.closed { - err = tcpip.ErrClosedForReceive - } - q.mu.Unlock() - - return nil, false, err - } - - notify = !q.bufWritable() - - e = q.dataList.Front().(Entry) - q.dataList.Remove(e) - q.used -= e.Length() - - notify = notify && q.bufWritable() - - q.mu.Unlock() - - return e, notify, nil -} - -// Peek returns the first entry in the data queue, if one exists. -func (q *Queue) Peek() (Entry, *tcpip.Error) { - q.mu.Lock() - defer q.mu.Unlock() - - if q.dataList.Front() == nil { - err := tcpip.ErrWouldBlock - if q.closed { - err = tcpip.ErrClosedForReceive - } - return nil, err - } - - return q.dataList.Front().(Entry).Peek(), nil -} - -// QueuedSize returns the number of bytes currently in the queue, that is, the -// number of readable bytes. -func (q *Queue) QueuedSize() int64 { - q.mu.Lock() - defer q.mu.Unlock() - return q.used -} - -// MaxQueueSize returns the maximum number of bytes storable in the queue. -func (q *Queue) MaxQueueSize() int64 { - return q.limit -} diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 577aa87d5..9a0de9a06 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -20,7 +20,6 @@ import ( "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/ilist" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport/queue" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -271,7 +270,7 @@ func (m *message) Release() { } // Peek returns a copy of the message. 
-func (m *message) Peek() queue.Entry { +func (m *message) Peek() *message { return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} } @@ -325,12 +324,12 @@ type Receiver interface { // // +stateify savable type queueReceiver struct { - readQueue *queue.Queue + readQueue *queue } // Recv implements Receiver.Recv. func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { - var m queue.Entry + var m *message var notify bool var err *tcpip.Error if peek { @@ -341,15 +340,14 @@ func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek if err != nil { return 0, 0, ControlMessages{}, tcpip.FullAddress{}, false, err } - msg := m.(*message) - src := []byte(msg.Data) + src := []byte(m.Data) var copied uintptr for i := 0; i < len(data) && len(src) > 0; i++ { n := copy(data[i], src) copied += uintptr(n) src = src[n:] } - return copied, uintptr(len(msg.Data)), msg.Control, msg.Address, notify, nil + return copied, uintptr(len(m.Data)), m.Control, m.Address, notify, nil } // RecvNotify implements Receiver.RecvNotify. 
@@ -456,10 +454,9 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint return 0, 0, ControlMessages{}, tcpip.FullAddress{}, false, err } notify = n - msg := m.(*message) - q.buffer = []byte(msg.Data) - q.control = msg.Control - q.addr = msg.Address + q.buffer = []byte(m.Data) + q.control = m.Control + q.addr = m.Address } var copied uintptr @@ -506,10 +503,9 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint break } notify = notify || n - msg := m.(*message) - q.buffer = []byte(msg.Data) - q.control = msg.Control - q.addr = msg.Address + q.buffer = []byte(m.Data) + q.control = m.Control + q.addr = m.Address if wantCreds { if (q.control.Credentials == nil) != (c.Credentials == nil) { @@ -619,7 +615,7 @@ type connectedEndpoint struct { Type() SockType } - writeQueue *queue.Queue + writeQueue *queue } // Passcred implements ConnectedEndpoint.Passcred. -- cgit v1.2.3 From b2a88ff4713325fca736f6a3bf200be02d2d72a7 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 17 Oct 2018 15:48:55 -0700 Subject: Check thread group CPU timers in the CPU clock ticker. This reduces the number of goroutines and runtime timers when ITIMER_VIRTUAL or ITIMER_PROF are enabled, or when RLIMIT_CPU is set. This also ensures that thread group CPU timers only advance if running tasks are observed at the time the CPU clock advances, mostly eliminating the possibility that a CPU timer expiration observes no running tasks and falls back to the group leader. 
PiperOrigin-RevId: 217603396 Change-Id: Ia24ce934d5574334857d9afb5ad8ca0b6a6e65f4 --- pkg/sentry/kernel/BUILD | 1 - pkg/sentry/kernel/kernel.go | 46 ++--- pkg/sentry/kernel/task_acct.go | 97 ++++++++- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_exit.go | 3 - pkg/sentry/kernel/task_run.go | 4 - pkg/sentry/kernel/task_sched.go | 344 ++++++++++++++++++++++++++++++-- pkg/sentry/kernel/task_signals.go | 66 ------ pkg/sentry/kernel/thread_group.go | 73 +++++-- pkg/sentry/kernel/threads.go | 6 +- pkg/sentry/kernel/time/time.go | 35 +++- pkg/sentry/kernel/timekeeper.go | 26 +++ pkg/sentry/kernel/timer.go | 290 --------------------------- pkg/sentry/syscalls/linux/sys_rlimit.go | 2 +- pkg/sentry/syscalls/linux/sys_timer.go | 61 ++---- 15 files changed, 551 insertions(+), 505 deletions(-) delete mode 100644 pkg/sentry/kernel/timer.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index acc61cb09..e2fb61ba6 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -111,7 +111,6 @@ go_library( "threads.go", "timekeeper.go", "timekeeper_state.go", - "timer.go", "uts_namespace.go", "vdso.go", "version.go", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 84afdb530..5d6856f3c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -20,7 +20,7 @@ // // Kernel.extMu // ThreadGroup.timerMu -// ktime.Timer.mu (for IntervalTimer) +// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer) // TaskSet.mu // SignalHandlers.mu // Task.mu @@ -617,7 +617,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, return nil, 0, fmt.Errorf("no kernel MountNamespace") } - tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) ctx := args.NewContext(k) // Grab the root directory. 
@@ -705,7 +705,7 @@ func (k *Kernel) Start() error { } k.started = true - k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k}) + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k)) k.cpuClockTicker.Swap(ktime.Setting{ Enabled: true, Period: linux.ClockTick, @@ -741,14 +741,13 @@ func (k *Kernel) pauseTimeLocked() { // mutex, while holding the Timer mutex.) for t := range k.tasks.Root.tids { if t == t.tg.leader { - t.tg.tm.pause() - } - // This means we'll iterate ThreadGroups and FDMaps shared by multiple - // tasks repeatedly, but ktime.Timer.Pause is idempotent so this is - // harmless. - for _, it := range t.tg.timers { - it.PauseTimer() + t.tg.itimerRealTimer.Pause() + for _, it := range t.tg.timers { + it.PauseTimer() + } } + // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. if fdm := t.fds; fdm != nil { for _, desc := range fdm.files { if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { @@ -774,10 +773,10 @@ func (k *Kernel) resumeTimeLocked() { k.timekeeper.ResumeUpdates() for t := range k.tasks.Root.tids { if t == t.tg.leader { - t.tg.tm.resume() - } - for _, it := range t.tg.timers { - it.ResumeTimer() + t.tg.itimerRealTimer.Resume() + for _, it := range t.tg.timers { + it.ResumeTimer() + } } if fdm := t.fds; fdm != nil { for _, desc := range fdm.files { @@ -1078,22 +1077,3 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return nil } } - -type kernelCPUClockListener struct { - k *Kernel -} - -// Notify implements ktime.TimerListener.Notify. -func (l kernelCPUClockListener) Notify(exp uint64) { - // Only increment cpuClock by 1 regardless of the number of expirations. 
- // This approximately compensates for cases where thread throttling or bad - // Go runtime scheduling prevents the cpuClockTicker goroutine, and - // presumably task goroutines as well, from executing for a long period of - // time. It's also necessary to prevent CPU clocks from seeing large - // discontinuous jumps. - atomic.AddUint64(&l.k.cpuClock, 1) -} - -// Destroy implements ktime.TimerListener.Destroy. -func (l kernelCPUClockListener) Destroy() { -} diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index ce12cdb64..d2052921e 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -21,8 +21,99 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) +// Getitimer implements getitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { + var tm ktime.Time + var s ktime.Setting + switch id { + case linux.ITIMER_REAL: + tm, s = t.tg.itimerRealTimer.Get() + case linux.ITIMER_VIRTUAL: + tm = t.tg.UserCPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerVirtSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + case linux.ITIMER_PROF: + tm = t.tg.CPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerProfSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + default: + return linux.ItimerVal{}, syserror.EINVAL + } + val, iv := ktime.SpecFromSetting(tm, s) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(val), + Interval: linux.DurationToTimeval(iv), + }, nil +} + +// Setitimer implements setitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. 
+func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) { + var tm ktime.Time + var olds ktime.Setting + switch id { + case linux.ITIMER_REAL: + news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock()) + if err != nil { + return linux.ItimerVal{}, err + } + tm, olds = t.tg.itimerRealTimer.Swap(news) + case linux.ITIMER_VIRTUAL: + c := t.tg.UserCPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerVirtSetting + t.tg.itimerVirtSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + case linux.ITIMER_PROF: + c := t.tg.CPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerProfSetting + t.tg.itimerProfSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + default: + return linux.ItimerVal{}, syserror.EINVAL + } + oldval, oldiv := ktime.SpecFromSetting(tm, olds) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(oldval), + Interval: linux.DurationToTimeval(oldiv), + }, nil +} + // IOUsage returns the io usage of the thread. func (t *Task) IOUsage() *usage.IO { return t.ioUsage @@ -56,12 +147,6 @@ func (t *Task) SetName(name string) { t.Debugf("Set thread name to %q", name) } -// SetCPUTimer is used by setrlimit(RLIMIT_CPU) to enforce the hard and soft -// limits on CPU time used by this process. 
-func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit) { - tg.Timer().applyCPULimits(*l) -} - // Limits implements context.Context.Limits. func (t *Task) Limits() *limits.LimitSet { return t.ThreadGroup().Limits() diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 7c469ec46..de3aef40d 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -241,7 +241,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { if opts.NewSignalHandlers { sh = sh.Fork() } - tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) } cfg := &TaskConfig{ diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index f5b45fb17..65969ca9b 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -675,9 +675,6 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { t.tg.ioUsage.Accumulate(t.ioUsage) t.tg.signalHandlers.mu.Lock() t.tg.tasks.Remove(t) - if t.tg.lastTimerSignalTask == t { - t.tg.lastTimerSignalTask = nil - } t.tg.tasksCount-- tc := t.tg.tasksCount t.tg.signalHandlers.mu.Unlock() diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 8dd0ef6ea..49ac933b7 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -70,10 +70,6 @@ func (t *Task) run(threadID uintptr) { // Platform.CooperativelySharesAddressSpace() == true, we give up the // AddressSpace before the task goroutine finishes executing. - // Ensure that thread group timers for execution time reflect that this - // task now exists. - t.tg.tm.kick() - // If this is a newly-started task, it should check for participation in // group stops. If this is a task resuming after restore, it was // interrupted by saving. 
In either case, the task is initially diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 49141ab74..19dcc963a 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -18,12 +18,15 @@ package kernel import ( "fmt" + "math/rand" "sync/atomic" "time" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -84,6 +87,33 @@ type TaskGoroutineSchedInfo struct { SysTicks uint64 } +// userTicksAt returns the extrapolated value of ts.UserTicks after +// Kernel.CPUClockNow() indicates a time of now. +// +// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is +// monotonic, this is satisfied if now is the result of a previous call to +// Kernel.CPUClockNow().) This requirement exists because otherwise a racing +// change to t.gosched can cause userTicksAt to adjust stats by too much, +// making the observed stats non-monotonic. +func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp { + // Update stats to reflect execution since the last update. + return ts.UserTicks + (now - ts.Timestamp) + } + return ts.UserTicks +} + +// sysTicksAt returns the extrapolated value of ts.SysTicks after +// Kernel.CPUClockNow() indicates a time of now. +// +// Preconditions: As for userTicksAt. +func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys { + return ts.SysTicks + (now - ts.Timestamp) + } + return ts.SysTicks +} + // Preconditions: The caller must be running on the task goroutine. 
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { now := t.k.CPUClockNow() @@ -127,26 +157,12 @@ func (t *Task) CPUStats() usage.CPUStats { return t.cpuStatsAt(t.k.CPUClockNow()) } -// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is -// monotonic, this is satisfied if now is the result of a previous call to -// Kernel.CPUClockNow().) This requirement exists because otherwise a racing -// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making -// the returned stats non-monotonic. +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { tsched := t.TaskGoroutineSchedInfo() - if tsched.Timestamp < now { - // Update stats to reflect execution since the last update to - // t.gosched. - switch tsched.State { - case TaskGoroutineRunningSys: - tsched.SysTicks += now - tsched.Timestamp - case TaskGoroutineRunningApp: - tsched.UserTicks += now - tsched.Timestamp - } - } return usage.CPUStats{ - UserTime: time.Duration(tsched.UserTicks * uint64(linux.ClockTick)), - SysTime: time.Duration(tsched.SysTicks * uint64(linux.ClockTick)), + UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)), + SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)), VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), } } @@ -162,9 +178,14 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats { // ThreadGroup has ever executed anyway. return usage.CPUStats{} } - now := tg.leader.k.CPUClockNow() + return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) +} + +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex +// must be locked. +func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { stats := tg.exitedCPUStats - // Account for active tasks. + // Account for live tasks. 
for t := tg.tasks.Front(); t != nil; t = t.Next() { stats.Accumulate(t.cpuStatsAt(now)) } @@ -182,6 +203,291 @@ func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { return tg.childCPUStats } +// taskClock is a ktime.Clock that measures the time that a task has spent +// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID. +// +// +stateify savable +type taskClock struct { + t *Task + + // If includeSys is true, the taskClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // taskClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. TimeUntil wouldn't change its estimation + // based on either of the clock events, so there's no event to be + // notified for. + ktime.NoClockEvents `state:"nosave"` + + // Implements ktime.Clock.WallTimeUntil. + // + // As an upper bound, a task's clock cannot advance faster than CPU + // time. It would have to execute at a rate of more than 1 task-second + // per 1 CPU-second, which isn't possible. + ktime.WallRateClock `state:"nosave"` +} + +// UserCPUClock returns a clock measuring the CPU time the task has spent +// executing application code. +func (t *Task) UserCPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: false} +} + +// CPUClock returns a clock measuring the CPU time the task has spent executing +// application and "kernel" code. +func (t *Task) CPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: true} +} + +// Now implements ktime.Clock.Now. +func (tc *taskClock) Now() ktime.Time { + stats := tc.t.CPUStats() + if tc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// tgClock is a ktime.Clock that measures the time a thread group has spent +// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID. 
+// +// +stateify savable +type tgClock struct { + tg *ThreadGroup + + // If includeSys is true, the tgClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // tgClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. + ktime.ClockEventsQueue `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tgc *tgClock) Now() ktime.Time { + stats := tgc.tg.CPUStats() + if tgc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// WallTimeUntil implements ktime.Clock.WallTimeUntil. +func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { + // Thread group CPU time should not exceed wall time * live tasks, since + // task goroutines exit after the transition to TaskExitZombie in + // runExitNotify. + tgc.tg.pidns.owner.mu.RLock() + n := tgc.tg.liveTasks + tgc.tg.pidns.owner.mu.RUnlock() + if n == 0 { + if t.Before(now) { + return 0 + } + // The timer tick raced with thread group exit, after which no more + // tasks can enter the thread group. So tgc.Now() will never advance + // again. Return a large delay; the timer should be stopped long before + // it comes again anyway. + return time.Hour + } + // This is a lower bound on the amount of time that can elapse before an + // associated timer expires, so returning this value tends to result in a + // sequence of closely-spaced ticks just before timer expiry. To avoid + // this, round up to the nearest ClockTick; CPU usage measurements are + // limited to this resolution anyway. + remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond + return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick +} + +// UserCPUClock returns a ktime.Clock that measures the time that a thread +// group has spent executing. 
+func (tg *ThreadGroup) UserCPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: false} +} + +// CPUClock returns a ktime.Clock that measures the time that a thread group +// has spent executing, including sentry time. +func (tg *ThreadGroup) CPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: true} +} + +type kernelCPUClockTicker struct { + k *Kernel + + // These are essentially kernelCPUClockTicker.Notify local variables that + // are cached between calls to reduce allocations. + rng *rand.Rand + tgs []*ThreadGroup +} + +func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker { + return &kernelCPUClockTicker{ + k: k, + rng: rand.New(rand.NewSource(rand.Int63())), + } +} + +// Notify implements ktime.TimerListener.Notify. +func (ticker *kernelCPUClockTicker) Notify(exp uint64) { + // Only increment cpuClock by 1 regardless of the number of expirations. + // This approximately compensates for cases where thread throttling or bad + // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and + // presumably task goroutines as well, from executing for a long period of + // time. It's also necessary to prevent CPU clocks from seeing large + // discontinuous jumps. + now := atomic.AddUint64(&ticker.k.cpuClock, 1) + + // Check thread group CPU timers. + tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs) + for _, tg := range tgs { + if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 { + continue + } + + ticker.k.tasks.mu.RLock() + if tg.leader == nil { + // No tasks have ever run in this thread group. + ticker.k.tasks.mu.RUnlock() + continue + } + // Accumulate thread group CPU stats, and randomly select running tasks + // using reservoir sampling to receive CPU timer signals. 
+ var virtReceiver *Task + nrVirtCandidates := 0 + var profReceiver *Task + nrProfCandidates := 0 + tgUserTime := tg.exitedCPUStats.UserTime + tgSysTime := tg.exitedCPUStats.SysTime + for t := tg.tasks.Front(); t != nil; t = t.Next() { + tsched := t.TaskGoroutineSchedInfo() + tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)) + tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)) + switch tsched.State { + case TaskGoroutineRunningApp: + // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU + // timers. + nrVirtCandidates++ + if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 { + virtReceiver = t + } + fallthrough + case TaskGoroutineRunningSys: + // Considered by ITIMER_PROF and RLIMIT_CPU timers. + nrProfCandidates++ + if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 { + profReceiver = t + } + } + } + tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds()) + tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds()) + + // All of the following are standard (not real-time) signals, which are + // automatically deduplicated, so we ignore the number of expirations. + tg.signalHandlers.mu.Lock() + // It should only be possible for these timers to advance if we found + // at least one running task. 
+ if virtReceiver != nil { + // ITIMER_VIRTUAL + newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow) + tg.itimerVirtSetting = newItimerVirtSetting + if exp != 0 { + virtReceiver.sendSignalLocked(sigPriv(linux.SIGVTALRM), true) + } + } + if profReceiver != nil { + // ITIMER_PROF + newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow) + tg.itimerProfSetting = newItimerProfSetting + if exp != 0 { + profReceiver.sendSignalLocked(sigPriv(linux.SIGPROF), true) + } + // RLIMIT_CPU soft limit + newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow) + tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting + if exp != 0 { + profReceiver.sendSignalLocked(sigPriv(linux.SIGXCPU), true) + } + // RLIMIT_CPU hard limit + rlimitCPUMax := tg.limits.Get(limits.CPU).Max + if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) { + profReceiver.sendSignalLocked(sigPriv(linux.SIGKILL), true) + } + } + tg.signalHandlers.mu.Unlock() + + ticker.k.tasks.mu.RUnlock() + } + + // Retain tgs between calls to Notify to reduce allocations. + for i := range tgs { + tgs[i] = nil + } + ticker.tgs = tgs[:0] +} + +// Destroy implements ktime.TimerListener.Destroy. +func (ticker *kernelCPUClockTicker) Destroy() { +} + +// randInt31n returns a random integer in [0, n). +// +// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported. +// See that function for details. +func randInt31n(rng *rand.Rand, n int32) int32 { + v := rng.Uint32() + prod := uint64(v) * uint64(n) + low := uint32(prod) + if low < uint32(n) { + thresh := uint32(-n) % uint32(n) + for low < thresh { + v = rng.Uint32() + prod = uint64(v) * uint64(n) + low = uint32(prod) + } + } + return int32(prod >> 32) +} + +// NotifyRlimitCPUUpdated is called by setrlimit. +// +// Preconditions: The caller must be running on the task goroutine. 
+func (t *Task) NotifyRlimitCPUUpdated() { + t.k.cpuClockTicker.Atomically(func() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + rlimitCPU := t.tg.limits.Get(limits.CPU) + t.tg.rlimitCPUSoftSetting = ktime.Setting{ + Enabled: rlimitCPU.Cur != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()), + Period: time.Second, + } + if rlimitCPU.Max != limits.Infinity { + // Check if tg is already over the hard limit. + tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow()) + tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds()) + if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) { + t.sendSignalLocked(sigPriv(linux.SIGKILL), true) + } + } + t.tg.updateCPUTimersEnabledLocked() + }) +} + +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) updateCPUTimersEnabledLocked() { + rlimitCPU := tg.limits.Get(limits.CPU) + if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity { + atomic.StoreUint32(&tg.cpuTimersEnabled, 1) + } else { + atomic.StoreUint32(&tg.cpuTimersEnabled, 0) + } +} + // StateStatus returns a string representation of the task's current state, // appropriate for /proc/[pid]/status. func (t *Task) StateStatus() string { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index afb010f60..e2925a708 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -359,72 +359,6 @@ func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error { return tg.leader.sendSignalLocked(info, true /* group */) } -// Preconditions: The TaskSet mutex must be locked. -func (t *Task) onCPULocked(includeSys bool) bool { - // Task is exiting. 
- if t.exitState != TaskExitNone { - return false - } - - switch t.TaskGoroutineSchedInfo().State { - case TaskGoroutineRunningSys: - return includeSys - case TaskGoroutineRunningApp: - return true - default: - return false - } -} - -// SendTimerSignal mimics the process timer signal delivery behavior in linux: -// signals are delivered to the thread that triggers the timer expiration (see -// kernel/time/posix-cpu-timers.c:check_process_timers(). This -// means -// 1) the thread is running on cpu at the time. -// 2) a thread runs more frequently will get more of those signals. -// -// We approximate this behavior by selecting a running task in a round-robin -// fashion. Statistically, a thread running more often should have a higher -// probability to be selected. -func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error { - tg.pidns.owner.mu.RLock() - defer tg.pidns.owner.mu.RUnlock() - tg.signalHandlers.mu.Lock() - defer tg.signalHandlers.mu.Unlock() - - // Find the next running threads. - var t *Task - if tg.lastTimerSignalTask == nil { - t = tg.tasks.Front() - } else { - t = tg.lastTimerSignalTask.Next() - } - - // Iterate from lastTimerSignalTask.Next() to the last task in the task list. - for t != nil { - if t.onCPULocked(includeSys) { - tg.lastTimerSignalTask = t - return t.sendSignalLocked(info, true /* group */) - } - t = t.Next() - } - - // t is nil when we reach here. If lastTimerSignalTask is not nil, iterate - // from Front to lastTimerSignalTask. - if tg.lastTimerSignalTask != nil { - for t := tg.tasks.Front(); t != tg.lastTimerSignalTask.Next(); t = t.Next() { - if t.onCPULocked(includeSys) { - tg.lastTimerSignalTask = t - return t.sendSignalLocked(info, true /* group */) - } - } - } - - // No running threads? Just try the leader. 
- tg.lastTimerSignalTask = tg.leader - return tg.leader.sendSignalLocked(info, true /* group */) -} - func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { return t.sendSignalTimerLocked(info, group, nil) } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 13dce08ce..dfff7b52d 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" ) @@ -59,12 +60,6 @@ type ThreadGroup struct { // pendingSignals is protected by the signal mutex. pendingSignals pendingSignals - // lastTimerSignalTask records the last task we deliver a process timer signal to. - // Please see SendTimerSignal for more details. - // - // lastTimerSignalTask is protected by the signal mutex. - lastTimerSignalTask *Task - // groupStopPhase indicates the state of a group stop in progress on the // thread group, if any. // @@ -152,14 +147,39 @@ type ThreadGroup struct { // restarted by Task.Start. liveGoroutines sync.WaitGroup `state:"nosave"` - // tm contains process timers. TimerManager fields are immutable. - tm TimerManager + timerMu sync.Mutex `state:"nosave"` + + // itimerRealTimer implements ITIMER_REAL for the thread group. + itimerRealTimer *ktime.Timer + + // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group. + // + // itimerVirtSetting is protected by the signal mutex. + itimerVirtSetting ktime.Setting + + // itimerProfSetting is the ITIMER_PROF setting for the thread group. + // + // itimerProfSetting is protected by the signal mutex. + itimerProfSetting ktime.Setting + + // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit + // notifications for the thread group. 
+ // + // rlimitCPUSoftSetting is protected by the signal mutex. + rlimitCPUSoftSetting ktime.Setting + + // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true, + // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true, + // or limits.Get(CPU) is finite. + // + // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled is + // accessed using atomic memory operations. + cpuTimersEnabled uint32 // timers is the thread group's POSIX interval timers. nextTimerID is the // TimerID at which allocation should begin searching for an unused ID. // // timers and nextTimerID are protected by timerMu. - timerMu sync.Mutex `state:"nosave"` timers map[linux.TimerID]*IntervalTimer nextTimerID linux.TimerID @@ -211,11 +231,11 @@ type ThreadGroup struct { rscr atomic.Value `state:".(*RSEQCriticalRegion)"` } -// NewThreadGroup returns a new, empty thread group in PID namespace ns. The +// newThreadGroup returns a new, empty thread group in PID namespace ns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. 
-func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { +func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { tg := &ThreadGroup{ threadGroupNode: threadGroupNode{ pidns: ns, @@ -225,7 +245,7 @@ func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linu ioUsage: &usage.IO{}, limits: limits, } - tg.tm = newTimerManager(tg, monotonicClock) + tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) tg.rscr.Store(&RSEQCriticalRegion{}) return tg @@ -249,11 +269,6 @@ func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { return tg.signalHandlers } -// Timer returns tg's timers. -func (tg *ThreadGroup) Timer() *TimerManager { - return &tg.tm -} - // Limits returns tg's limits. func (tg *ThreadGroup) Limits() *limits.LimitSet { return tg.limits @@ -261,11 +276,9 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { // release releases the thread group's resources. func (tg *ThreadGroup) release() { - // These must be done without holding the TaskSet or signal mutexes since - // timers send signals with Timer.mu locked. - - tg.tm.destroy() - + // Timers must be destroyed without holding the TaskSet or signal mutexes + // since timers send signals with Timer.mu locked. + tg.itimerRealTimer.Destroy() var its []*IntervalTimer tg.pidns.owner.mu.Lock() tg.signalHandlers.mu.Lock() @@ -292,3 +305,19 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { } } } + +// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations. +// +// +stateify savable +type itimerRealListener struct { + tg *ThreadGroup +} + +// Notify implements ktime.TimerListener.Notify. 
+func (l *itimerRealListener) Notify(exp uint64) { + l.tg.SendSignal(sigPriv(linux.SIGALRM)) +} + +// Destroy implements ktime.TimerListener.Destroy. +func (l *itimerRealListener) Destroy() { +} diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 3d5713106..4e3d19e97 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -243,9 +243,13 @@ func (ns *PIDNamespace) Tasks() []*Task { // ThreadGroups returns a snapshot of the thread groups in ns. func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { + return ns.ThreadGroupsAppend(nil) +} + +// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. +func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() - var tgs []*ThreadGroup for t := range ns.tids { if t == t.tg.leader { tgs = append(tgs, t.tg) diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 1f6fed007..52e0dfba1 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -307,6 +307,12 @@ type Setting struct { // SettingFromSpec converts a (value, interval) pair to a Setting based on a // reading from c. value is interpreted as a time relative to c.Now(). func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { + return SettingFromSpecAt(value, interval, c.Now()) +} + +// SettingFromSpecAt converts a (value, interval) pair to a Setting. value is +// interpreted as a time relative to now. 
+func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { if value < 0 { return Setting{}, syserror.EINVAL } @@ -315,7 +321,7 @@ func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Sett } return Setting{ Enabled: true, - Next: c.Now().Add(value), + Next: now.Add(value), Period: interval, }, nil } @@ -365,14 +371,14 @@ func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec { } } -// advancedTo returns an updated Setting and a number of expirations after -// the associated Clock indicates a time of now. +// At returns an updated Setting and a number of expirations after the +// associated Clock indicates a time of now. // -// Settings may be created by successive calls to advancedTo with decreasing +// Settings may be created by successive calls to At with decreasing // values of now (i.e. time may appear to go backward). Supporting this is // required to support non-monotonic clocks, as well as allowing // Timer.clock.Now() to be called without holding Timer.mu. 
-func (s Setting) advancedTo(now Time) (Setting, uint64) { +func (s Setting) At(now Time) (Setting, uint64) { if !s.Enabled { return s, 0 } @@ -519,7 +525,7 @@ func (t *Timer) Tick() { if t.paused { return } - s, exp := t.setting.advancedTo(now) + s, exp := t.setting.At(now) t.setting = s if exp > 0 { t.listener.Notify(exp) @@ -574,7 +580,7 @@ func (t *Timer) Get() (Time, Setting) { if t.paused { panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) } - s, exp := t.setting.advancedTo(now) + s, exp := t.setting.At(now) t.setting = s if exp > 0 { t.listener.Notify(exp) @@ -607,14 +613,14 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { if t.paused { panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) } - oldS, oldExp := t.setting.advancedTo(now) + oldS, oldExp := t.setting.At(now) if oldExp > 0 { t.listener.Notify(oldExp) } if f != nil { f() } - newS, newExp := s.advancedTo(now) + newS, newExp := s.At(now) t.setting = newS if newExp > 0 { t.listener.Notify(newExp) @@ -623,6 +629,17 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { return now, oldS } +// Atomically invokes f atomically with respect to expirations of t; that is, t +// cannot generate expirations while f is being called. +// +// Preconditions: f cannot call any Timer methods since it is called with the +// Timer mutex locked. +func (t *Timer) Atomically(f func()) { + t.mu.Lock() + defer t.mu.Unlock() + f() +} + // Preconditions: t.mu must be locked. 
func (t *Timer) resetKickerLocked(now Time) { if t.setting.Enabled { diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index df5dbe128..2167f3efe 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -15,6 +15,7 @@ package kernel import ( + "fmt" "sync" "time" @@ -277,3 +278,28 @@ func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { func (t *Timekeeper) BootTime() ktime.Time { return t.bootTime } + +// timekeeperClock is a ktime.Clock that reads time from a +// kernel.Timekeeper-managed clock. +// +// +stateify savable +type timekeeperClock struct { + tk *Timekeeper + c sentrytime.ClockID + + // Implements ktime.Clock.WallTimeUntil. + ktime.WallRateClock `state:"nosave"` + + // Implements waiter.Waitable. (We have no ability to detect + // discontinuities from external changes to CLOCK_REALTIME). + ktime.NoClockEvents `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tc *timekeeperClock) Now() ktime.Time { + now, err := tc.tk.GetTime(tc.c) + if err != nil { + panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) + } + return ktime.FromNanoseconds(now) +} diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go deleted file mode 100644 index 534d03d0f..000000000 --- a/pkg/sentry/kernel/timer.go +++ /dev/null @@ -1,290 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package kernel - -import ( - "fmt" - "time" - - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" -) - -// timekeeperClock is a ktime.Clock that reads time from a -// kernel.Timekeeper-managed clock. -// -// +stateify savable -type timekeeperClock struct { - tk *Timekeeper - c sentrytime.ClockID - - // Implements ktime.Clock.WallTimeUntil. - ktime.WallRateClock `state:"nosave"` - - // Implements waiter.Waitable. (We have no ability to detect - // discontinuities from external changes to CLOCK_REALTIME). - ktime.NoClockEvents `state:"nosave"` -} - -// Now implements ktime.Clock.Now. -func (tc *timekeeperClock) Now() ktime.Time { - now, err := tc.tk.GetTime(tc.c) - if err != nil { - panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) - } - return ktime.FromNanoseconds(now) -} - -// tgClock is a ktime.Clock that measures the time a thread group has spent -// executing. -// -// +stateify savable -type tgClock struct { - tg *ThreadGroup - - // If includeSys is true, the tgClock includes both time spent executing - // application code as well as time spent in the sentry. Otherwise, the - // tgClock includes only time spent executing application code. - includeSys bool - - // Implements waiter.Waitable. - ktime.ClockEventsQueue `state:"nosave"` -} - -// UserCPUClock returns a ktime.Clock that measures the time that a thread -// group has spent executing. -func (tg *ThreadGroup) UserCPUClock() ktime.Clock { - return tg.tm.virtClock -} - -// CPUClock returns a ktime.Clock that measures the time that a thread group -// has spent executing, including sentry time. -func (tg *ThreadGroup) CPUClock() ktime.Clock { - return tg.tm.profClock -} - -// Now implements ktime.Clock.Now. 
-func (tgc *tgClock) Now() ktime.Time { - stats := tgc.tg.CPUStats() - if tgc.includeSys { - return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) - } - return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) -} - -// WallTimeUntil implements ktime.Clock.WallTimeUntil. -func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { - // The assumption here is that the time spent in this process (not matter - // virtual or prof) should not exceed wall time * active tasks, since - // Task.exitThreadGroup stops accounting as it transitions to - // TaskExitInitiated. - tgc.tg.pidns.owner.mu.RLock() - n := tgc.tg.activeTasks - tgc.tg.pidns.owner.mu.RUnlock() - if n == 0 { - if t.Before(now) { - return 0 - } - // The timer tick raced with thread group exit, after which no more - // tasks can enter the thread group. So tgc.Now() will never advance - // again. Return a large delay; the timer should be stopped long before - // it comes again anyway. - return time.Hour - } - // This is a lower bound on the amount of time that can elapse before an - // associated timer expires, so returning this value tends to result in a - // sequence of closely-spaced ticks just before timer expiry. To avoid - // this, round up to the nearest ClockTick; CPU usage measurements are - // limited to this resolution anyway. - remaining := time.Duration(int64(t.Sub(now))/int64(n)) * time.Nanosecond - return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick -} - -// taskClock is a ktime.Clock that measures the time that a task has spent -// executing. -type taskClock struct { - t *Task - - // If includeSys is true, the taskClock includes both time spent executing - // application code as well as time spent in the sentry. Otherwise, the - // taskClock includes only time spent executing application code. - includeSys bool - - // Implements waiter.Waitable. 
TimeUntil wouldn't change its estimation - // based on either of the clock events, so there's no event to be - // notified for. - ktime.NoClockEvents `state:"nosave"` - - // Implements ktime.Clock.WallTimeUntil. - // - // As an upper bound, a task's clock cannot advance faster than CPU - // time. It would have to execute at a rate of more than 1 task-second - // per 1 CPU-second, which isn't possible. - ktime.WallRateClock `state:"nosave"` -} - -// UserCPUClock returns a clock measuring the CPU time the task has spent -// executing application code. -func (t *Task) UserCPUClock() ktime.Clock { - return &taskClock{t: t, includeSys: false} -} - -// CPUClock returns a clock measuring the CPU time the task has spent executing -// application and "kernel" code. -func (t *Task) CPUClock() ktime.Clock { - return &taskClock{t: t, includeSys: true} -} - -// Now implements ktime.Clock.Now. -func (tc *taskClock) Now() ktime.Time { - stats := tc.t.CPUStats() - if tc.includeSys { - return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) - } - return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) -} - -// signalNotifier is a ktime.Listener that sends signals to a ThreadGroup. -// -// +stateify savable -type signalNotifier struct { - tg *ThreadGroup - signal linux.Signal - realTimer bool - includeSys bool -} - -// Notify implements ktime.TimerListener.Notify. -func (s *signalNotifier) Notify(exp uint64) { - // Since all signals sent using a signalNotifier are standard (not - // real-time) signals, we can ignore the number of expirations and send - // only a single signal. - if s.realTimer { - // real timer signal sent to leader. See kernel/time/itimer.c:it_real_fn - s.tg.SendSignal(sigPriv(s.signal)) - } else { - s.tg.SendTimerSignal(sigPriv(s.signal), s.includeSys) - } -} - -// Destroy implements ktime.TimerListener.Destroy. -func (s *signalNotifier) Destroy() {} - -// TimerManager is a collection of supported process cpu timers. 
-// -// +stateify savable -type TimerManager struct { - // Clocks used to drive thread group execution time timers. - virtClock *tgClock - profClock *tgClock - - RealTimer *ktime.Timer - VirtualTimer *ktime.Timer - ProfTimer *ktime.Timer - SoftLimitTimer *ktime.Timer - HardLimitTimer *ktime.Timer -} - -// newTimerManager returns a new instance of TimerManager. -func newTimerManager(tg *ThreadGroup, monotonicClock ktime.Clock) TimerManager { - virtClock := &tgClock{tg: tg, includeSys: false} - profClock := &tgClock{tg: tg, includeSys: true} - tm := TimerManager{ - virtClock: virtClock, - profClock: profClock, - RealTimer: ktime.NewTimer(monotonicClock, &signalNotifier{ - tg: tg, - signal: linux.SIGALRM, - realTimer: true, - includeSys: false, - }), - VirtualTimer: ktime.NewTimer(virtClock, &signalNotifier{ - tg: tg, - signal: linux.SIGVTALRM, - realTimer: false, - includeSys: false, - }), - ProfTimer: ktime.NewTimer(profClock, &signalNotifier{ - tg: tg, - signal: linux.SIGPROF, - realTimer: false, - includeSys: true, - }), - SoftLimitTimer: ktime.NewTimer(profClock, &signalNotifier{ - tg: tg, - signal: linux.SIGXCPU, - realTimer: false, - includeSys: true, - }), - HardLimitTimer: ktime.NewTimer(profClock, &signalNotifier{ - tg: tg, - signal: linux.SIGKILL, - realTimer: false, - includeSys: true, - }), - } - tm.applyCPULimits(tg.Limits().Get(limits.CPU)) - return tm -} - -// Save saves this TimerManger. - -// destroy destroys all timers. 
-func (tm *TimerManager) destroy() { - tm.RealTimer.Destroy() - tm.VirtualTimer.Destroy() - tm.ProfTimer.Destroy() - tm.SoftLimitTimer.Destroy() - tm.HardLimitTimer.Destroy() -} - -func (tm *TimerManager) applyCPULimits(l limits.Limit) { - tm.SoftLimitTimer.Swap(ktime.Setting{ - Enabled: l.Cur != limits.Infinity, - Next: ktime.FromNanoseconds((time.Duration(l.Cur) * time.Second).Nanoseconds()), - Period: time.Second, - }) - tm.HardLimitTimer.Swap(ktime.Setting{ - Enabled: l.Max != limits.Infinity, - Next: ktime.FromNanoseconds((time.Duration(l.Max) * time.Second).Nanoseconds()), - }) -} - -// kick is called when the number of threads in the thread group associated -// with tm increases. -func (tm *TimerManager) kick() { - tm.virtClock.Notify(ktime.ClockEventRateIncrease) - tm.profClock.Notify(ktime.ClockEventRateIncrease) -} - -// pause is to pause the timers and stop timer signal delivery. -func (tm *TimerManager) pause() { - tm.RealTimer.Pause() - tm.VirtualTimer.Pause() - tm.ProfTimer.Pause() - tm.SoftLimitTimer.Pause() - tm.HardLimitTimer.Pause() -} - -// resume is to resume the timers and continue timer signal delivery. 
-func (tm *TimerManager) resume() { - tm.RealTimer.Resume() - tm.VirtualTimer.Resume() - tm.ProfTimer.Resume() - tm.SoftLimitTimer.Resume() - tm.HardLimitTimer.Resume() -} diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 481e79eaa..d806b58ab 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -111,7 +111,7 @@ func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) } if resource == limits.CPU { - t.ThreadGroup().SetCPUTimer(newLim) + t.NotifyRlimitCPUUpdated() } return oldLim, nil } diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index a12d12d9d..c41074d54 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -21,7 +21,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -70,34 +69,15 @@ func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) e } } -func findTimer(t *kernel.Task, which int32) (*ktime.Timer, error) { - switch which { - case linux.ITIMER_REAL: - return t.ThreadGroup().Timer().RealTimer, nil - case linux.ITIMER_VIRTUAL: - return t.ThreadGroup().Timer().VirtualTimer, nil - case linux.ITIMER_PROF: - return t.ThreadGroup().Timer().ProfTimer, nil - default: - return nil, syscall.EINVAL - } -} - // Getitimer implements linux syscall getitimer(2). 
func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := args[0].Int() val := args[1].Pointer() - timer, err := findTimer(t, timerID) + olditv, err := t.Getitimer(timerID) if err != nil { return 0, nil, err } - value, interval := ktime.SpecFromSetting(timer.Get()) - olditv := linux.ItimerVal{ - Value: linux.DurationToTimeval(value), - Interval: linux.DurationToTimeval(interval), - } - return 0, nil, copyItimerValOut(t, val, &olditv) } @@ -107,29 +87,14 @@ func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys newVal := args[1].Pointer() oldVal := args[2].Pointer() - timer, err := findTimer(t, timerID) + newitv, err := copyItimerValIn(t, newVal) if err != nil { return 0, nil, err } - - itv, err := copyItimerValIn(t, newVal) + olditv, err := t.Setitimer(timerID, newitv) if err != nil { return 0, nil, err } - // Just like linux, we cap the timer value and interval with the max - // number that int64 can represent which is roughly 292 years. 
- s, err := ktime.SettingFromSpec(itv.Value.ToDuration(), - itv.Interval.ToDuration(), timer.Clock()) - if err != nil { - return 0, nil, err - } - - valueNS, intervalNS := ktime.SpecFromSetting(timer.Swap(s)) - olditv := linux.ItimerVal{ - Value: linux.DurationToTimeval(valueNS), - Interval: linux.DurationToTimeval(intervalNS), - } - return 0, nil, copyItimerValOut(t, oldVal, &olditv) } @@ -137,21 +102,19 @@ func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { duration := time.Duration(args[0].Uint()) * time.Second - timer := t.ThreadGroup().Timer().RealTimer - s, err := ktime.SettingFromSpec(duration, 0, timer.Clock()) + olditv, err := t.Setitimer(linux.ITIMER_REAL, linux.ItimerVal{ + Value: linux.DurationToTimeval(duration), + }) if err != nil { return 0, nil, err } - - value, _ := ktime.SpecFromSetting(timer.Swap(s)) - sec := int64(value) / nsecPerSec - nsec := int64(value) % nsecPerSec - // We can't return 0 if we have an alarm pending ... - if (sec == 0 && nsec > 0) || nsec >= nsecPerSec/2 { - sec++ + olddur := olditv.Value.ToDuration() + secs := olddur.Round(time.Second).Nanoseconds() / nsecPerSec + if secs == 0 && olddur != 0 { + // We can't return 0 if an alarm was previously scheduled. + secs = 1 } - - return uintptr(sec), nil, nil + return uintptr(secs), nil, nil } // TimerCreate implements linux syscall timer_create(2). -- cgit v1.2.3 From f7419fec26d1fd0d12936cc44f2c3481bbade033 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 17 Oct 2018 16:30:11 -0700 Subject: Use generic ilist in Unix transport queue This should improve performance. 
PiperOrigin-RevId: 217610560 Change-Id: I370f196ea2396f1715a460b168ecbee197f94d6c --- pkg/sentry/socket/unix/transport/BUILD | 14 ++++++++++++++ pkg/sentry/socket/unix/transport/queue.go | 9 ++++----- pkg/sentry/socket/unix/transport/unix.go | 3 +-- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 75b5a2eb6..28038ce7f 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -1,6 +1,19 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "transport_message_list", + out = "transport_message_list.go", + package = "transport", + prefix = "message", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*message", + "Linker": "*message", + }, +) go_library( name = "transport", @@ -9,6 +22,7 @@ go_library( "connectioned_state.go", "connectionless.go", "queue.go", + "transport_message_list.go", "unix.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index 203e31333..c4d7d863c 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -17,7 +17,6 @@ package transport import ( "sync" - "gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -33,7 +32,7 @@ type queue struct { closed bool used int64 limit int64 - dataList ilist.List + dataList messageList } // newQueue allocates and initializes a new queue. 
@@ -61,7 +60,7 @@ func (q *queue) Close() { func (q *queue) Reset() { q.mu.Lock() for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { - cur.(*message).Release() + cur.Release() } q.dataList.Reset() q.used = 0 @@ -165,7 +164,7 @@ func (q *queue) Dequeue() (e *message, notify bool, err *tcpip.Error) { notify = !q.bufWritable() - e = q.dataList.Front().(*message) + e = q.dataList.Front() q.dataList.Remove(e) q.used -= e.Length() @@ -189,7 +188,7 @@ func (q *queue) Peek() (*message, *tcpip.Error) { return nil, err } - return q.dataList.Front().(*message).Peek(), nil + return q.dataList.Front().Peek(), nil } // QueuedSize returns the number of bytes currently in the queue, that is, the diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 9a0de9a06..2934101a2 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -19,7 +19,6 @@ import ( "sync" "sync/atomic" - "gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -243,7 +242,7 @@ type BoundEndpoint interface { // // +stateify savable type message struct { - ilist.Entry + messageEntry // Data is the Message payload. 
Data buffer.View -- cgit v1.2.3 From 8fce67af24945f82378b4c2731cca1788936d074 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 19 Oct 2018 16:34:09 -0700 Subject: Use correct company name in copyright header PiperOrigin-RevId: 217951017 Change-Id: Ie08bf6987f98467d07457bcf35b5f1ff6e43c035 --- kokoro/run_build.sh | 2 +- kokoro/run_tests.sh | 2 +- pkg/abi/abi.go | 2 +- pkg/abi/abi_linux.go | 2 +- pkg/abi/flag.go | 2 +- pkg/abi/linux/aio.go | 2 +- pkg/abi/linux/ashmem.go | 2 +- pkg/abi/linux/binder.go | 2 +- pkg/abi/linux/bpf.go | 2 +- pkg/abi/linux/capability.go | 2 +- pkg/abi/linux/dev.go | 2 +- pkg/abi/linux/elf.go | 2 +- pkg/abi/linux/errors.go | 2 +- pkg/abi/linux/eventfd.go | 2 +- pkg/abi/linux/exec.go | 2 +- pkg/abi/linux/fcntl.go | 2 +- pkg/abi/linux/file.go | 2 +- pkg/abi/linux/fs.go | 2 +- pkg/abi/linux/futex.go | 2 +- pkg/abi/linux/inotify.go | 2 +- pkg/abi/linux/ioctl.go | 2 +- pkg/abi/linux/ip.go | 2 +- pkg/abi/linux/ipc.go | 2 +- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/linux.go | 2 +- pkg/abi/linux/mm.go | 2 +- pkg/abi/linux/netdevice.go | 2 +- pkg/abi/linux/netlink.go | 2 +- pkg/abi/linux/netlink_route.go | 2 +- pkg/abi/linux/poll.go | 2 +- pkg/abi/linux/prctl.go | 2 +- pkg/abi/linux/ptrace.go | 2 +- pkg/abi/linux/rusage.go | 2 +- pkg/abi/linux/sched.go | 2 +- pkg/abi/linux/seccomp.go | 2 +- pkg/abi/linux/sem.go | 2 +- pkg/abi/linux/shm.go | 2 +- pkg/abi/linux/signal.go | 2 +- pkg/abi/linux/socket.go | 2 +- pkg/abi/linux/time.go | 2 +- pkg/abi/linux/timer.go | 2 +- pkg/abi/linux/tty.go | 2 +- pkg/abi/linux/uio.go | 2 +- pkg/abi/linux/utsname.go | 2 +- pkg/amutex/amutex.go | 2 +- pkg/amutex/amutex_test.go | 2 +- pkg/atomicbitops/atomic_bitops.go | 2 +- pkg/atomicbitops/atomic_bitops_amd64.s | 2 +- pkg/atomicbitops/atomic_bitops_common.go | 2 +- pkg/atomicbitops/atomic_bitops_test.go | 2 +- pkg/binary/binary.go | 2 +- pkg/binary/binary_test.go | 2 +- pkg/bits/bits.go | 2 +- pkg/bits/bits_template.go | 2 +- pkg/bits/uint64_arch_amd64.go | 2 +- 
pkg/bits/uint64_arch_amd64_asm.s | 2 +- pkg/bits/uint64_arch_generic.go | 2 +- pkg/bits/uint64_test.go | 2 +- pkg/bpf/bpf.go | 2 +- pkg/bpf/decoder.go | 2 +- pkg/bpf/decoder_test.go | 2 +- pkg/bpf/input_bytes.go | 2 +- pkg/bpf/interpreter.go | 2 +- pkg/bpf/interpreter_test.go | 2 +- pkg/bpf/program_builder.go | 2 +- pkg/bpf/program_builder_test.go | 2 +- pkg/compressio/compressio.go | 2 +- pkg/compressio/compressio_test.go | 2 +- pkg/control/client/client.go | 2 +- pkg/control/server/server.go | 2 +- pkg/cpuid/cpu_amd64.s | 2 +- pkg/cpuid/cpuid.go | 2 +- pkg/cpuid/cpuid_parse_test.go | 2 +- pkg/cpuid/cpuid_test.go | 2 +- pkg/dhcp/client.go | 2 +- pkg/dhcp/dhcp.go | 2 +- pkg/dhcp/dhcp_string.go | 2 +- pkg/dhcp/dhcp_test.go | 2 +- pkg/dhcp/server.go | 2 +- pkg/eventchannel/event.go | 2 +- pkg/eventchannel/event.proto | 2 +- pkg/fd/fd.go | 2 +- pkg/fd/fd_test.go | 2 +- pkg/gate/gate.go | 2 +- pkg/gate/gate_test.go | 2 +- pkg/ilist/list.go | 2 +- pkg/ilist/list_test.go | 2 +- pkg/linewriter/linewriter.go | 2 +- pkg/linewriter/linewriter_test.go | 2 +- pkg/log/glog.go | 2 +- pkg/log/glog_unsafe.go | 2 +- pkg/log/json.go | 2 +- pkg/log/json_test.go | 2 +- pkg/log/log.go | 2 +- pkg/log/log_test.go | 2 +- pkg/metric/metric.go | 2 +- pkg/metric/metric.proto | 2 +- pkg/metric/metric_test.go | 2 +- pkg/p9/buffer.go | 2 +- pkg/p9/client.go | 2 +- pkg/p9/client_file.go | 2 +- pkg/p9/client_test.go | 2 +- pkg/p9/file.go | 2 +- pkg/p9/handlers.go | 2 +- pkg/p9/local_server/local_server.go | 2 +- pkg/p9/messages.go | 2 +- pkg/p9/messages_test.go | 2 +- pkg/p9/p9.go | 2 +- pkg/p9/p9_test.go | 2 +- pkg/p9/p9test/client_test.go | 2 +- pkg/p9/p9test/mocks.go | 2 +- pkg/p9/pool.go | 2 +- pkg/p9/pool_test.go | 2 +- pkg/p9/server.go | 2 +- pkg/p9/transport.go | 2 +- pkg/p9/transport_test.go | 2 +- pkg/p9/version.go | 2 +- pkg/p9/version_test.go | 2 +- pkg/rand/rand.go | 2 +- pkg/rand/rand_linux.go | 2 +- pkg/refs/refcounter.go | 2 +- pkg/refs/refcounter_state.go | 2 +- 
pkg/refs/refcounter_test.go | 2 +- pkg/seccomp/seccomp.go | 2 +- pkg/seccomp/seccomp_rules.go | 2 +- pkg/seccomp/seccomp_test.go | 2 +- pkg/seccomp/seccomp_test_victim.go | 2 +- pkg/seccomp/seccomp_unsafe.go | 2 +- pkg/secio/full_reader.go | 2 +- pkg/secio/secio.go | 2 +- pkg/secio/secio_test.go | 2 +- pkg/segment/range.go | 2 +- pkg/segment/set.go | 2 +- pkg/segment/set_state.go | 2 +- pkg/segment/test/segment_test.go | 2 +- pkg/segment/test/set_functions.go | 2 +- pkg/sentry/arch/aligned.go | 2 +- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/arch_amd64.go | 2 +- pkg/sentry/arch/arch_amd64.s | 2 +- pkg/sentry/arch/arch_state_x86.go | 2 +- pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/arch/auxv.go | 2 +- pkg/sentry/arch/registers.proto | 2 +- pkg/sentry/arch/signal_act.go | 2 +- pkg/sentry/arch/signal_amd64.go | 2 +- pkg/sentry/arch/signal_info.go | 2 +- pkg/sentry/arch/signal_stack.go | 2 +- pkg/sentry/arch/stack.go | 2 +- pkg/sentry/arch/syscalls_amd64.go | 2 +- pkg/sentry/context/context.go | 2 +- pkg/sentry/context/contexttest/contexttest.go | 2 +- pkg/sentry/control/control.go | 2 +- pkg/sentry/control/proc.go | 2 +- pkg/sentry/control/proc_test.go | 2 +- pkg/sentry/control/state.go | 2 +- pkg/sentry/device/device.go | 2 +- pkg/sentry/device/device_test.go | 2 +- pkg/sentry/fs/anon/anon.go | 2 +- pkg/sentry/fs/anon/device.go | 2 +- pkg/sentry/fs/ashmem/area.go | 2 +- pkg/sentry/fs/ashmem/device.go | 2 +- pkg/sentry/fs/ashmem/pin_board.go | 2 +- pkg/sentry/fs/ashmem/pin_board_test.go | 2 +- pkg/sentry/fs/attr.go | 2 +- pkg/sentry/fs/binder/binder.go | 2 +- pkg/sentry/fs/context.go | 2 +- pkg/sentry/fs/copy_up.go | 2 +- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dentry.go | 2 +- pkg/sentry/fs/dev/dev.go | 2 +- pkg/sentry/fs/dev/device.go | 2 +- pkg/sentry/fs/dev/fs.go | 2 +- pkg/sentry/fs/dev/full.go | 2 +- pkg/sentry/fs/dev/null.go | 2 +- pkg/sentry/fs/dev/random.go | 2 +- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_cache.go | 2 +- 
pkg/sentry/fs/dirent_cache_test.go | 2 +- pkg/sentry/fs/dirent_refs_test.go | 2 +- pkg/sentry/fs/dirent_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 2 +- pkg/sentry/fs/fdpipe/pipe_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe_test.go | 2 +- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/file_operations.go | 2 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 2 +- pkg/sentry/fs/file_state.go | 2 +- pkg/sentry/fs/file_test.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/filetest/filetest.go | 2 +- pkg/sentry/fs/flags.go | 2 +- pkg/sentry/fs/fs.go | 2 +- pkg/sentry/fs/fsutil/dirty_set.go | 2 +- pkg/sentry/fs/fsutil/dirty_set_test.go | 2 +- pkg/sentry/fs/fsutil/file.go | 2 +- pkg/sentry/fs/fsutil/file_range_set.go | 2 +- pkg/sentry/fs/fsutil/frame_ref_set.go | 2 +- pkg/sentry/fs/fsutil/fsutil.go | 2 +- pkg/sentry/fs/fsutil/handle.go | 2 +- pkg/sentry/fs/fsutil/handle_test.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper_state.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 2 +- pkg/sentry/fs/fsutil/inode.go | 2 +- pkg/sentry/fs/fsutil/inode_cached.go | 2 +- pkg/sentry/fs/fsutil/inode_cached_test.go | 2 +- pkg/sentry/fs/gofer/attr.go | 2 +- pkg/sentry/fs/gofer/cache_policy.go | 2 +- pkg/sentry/fs/gofer/context_file.go | 2 +- pkg/sentry/fs/gofer/device.go | 2 +- pkg/sentry/fs/gofer/file.go | 2 +- pkg/sentry/fs/gofer/file_state.go | 2 +- pkg/sentry/fs/gofer/fs.go | 2 +- pkg/sentry/fs/gofer/gofer_test.go | 2 +- pkg/sentry/fs/gofer/handles.go | 2 +- pkg/sentry/fs/gofer/inode.go | 2 +- pkg/sentry/fs/gofer/inode_state.go | 2 +- pkg/sentry/fs/gofer/path.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/gofer/socket.go | 2 +- pkg/sentry/fs/gofer/util.go | 2 +- pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/descriptor.go | 2 +- 
pkg/sentry/fs/host/descriptor_state.go | 2 +- pkg/sentry/fs/host/descriptor_test.go | 2 +- pkg/sentry/fs/host/device.go | 2 +- pkg/sentry/fs/host/file.go | 2 +- pkg/sentry/fs/host/fs.go | 2 +- pkg/sentry/fs/host/fs_test.go | 2 +- pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/fs/host/inode_state.go | 2 +- pkg/sentry/fs/host/inode_test.go | 2 +- pkg/sentry/fs/host/ioctl_unsafe.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/socket_iovec.go | 2 +- pkg/sentry/fs/host/socket_state.go | 2 +- pkg/sentry/fs/host/socket_test.go | 2 +- pkg/sentry/fs/host/socket_unsafe.go | 2 +- pkg/sentry/fs/host/tty.go | 2 +- pkg/sentry/fs/host/util.go | 2 +- pkg/sentry/fs/host/util_unsafe.go | 2 +- pkg/sentry/fs/host/wait_test.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_inotify.go | 2 +- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 2 +- pkg/sentry/fs/inode_overlay_test.go | 2 +- pkg/sentry/fs/inotify.go | 2 +- pkg/sentry/fs/inotify_event.go | 2 +- pkg/sentry/fs/inotify_watch.go | 2 +- pkg/sentry/fs/lock/lock.go | 2 +- pkg/sentry/fs/lock/lock_range_test.go | 2 +- pkg/sentry/fs/lock/lock_set_functions.go | 2 +- pkg/sentry/fs/lock/lock_test.go | 2 +- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 2 +- pkg/sentry/fs/mount_overlay.go | 2 +- pkg/sentry/fs/mount_state.go | 2 +- pkg/sentry/fs/mount_test.go | 2 +- pkg/sentry/fs/mounts.go | 2 +- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/offset.go | 2 +- pkg/sentry/fs/overlay.go | 2 +- pkg/sentry/fs/path.go | 2 +- pkg/sentry/fs/path_test.go | 2 +- pkg/sentry/fs/proc/cpuinfo.go | 2 +- pkg/sentry/fs/proc/device/device.go | 2 +- pkg/sentry/fs/proc/exec_args.go | 2 +- pkg/sentry/fs/proc/fds.go | 2 +- pkg/sentry/fs/proc/file.go | 2 +- pkg/sentry/fs/proc/filesystems.go | 2 +- pkg/sentry/fs/proc/fs.go | 2 +- pkg/sentry/fs/proc/loadavg.go | 2 +- pkg/sentry/fs/proc/meminfo.go | 2 +- pkg/sentry/fs/proc/mounts.go | 2 +- pkg/sentry/fs/proc/net.go | 2 +- pkg/sentry/fs/proc/net_test.go | 
2 +- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/rpcinet_proc.go | 2 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 2 +- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 2 +- pkg/sentry/fs/proc/stat.go | 2 +- pkg/sentry/fs/proc/sys.go | 2 +- pkg/sentry/fs/proc/sys_net.go | 2 +- pkg/sentry/fs/proc/sys_net_test.go | 2 +- pkg/sentry/fs/proc/task.go | 2 +- pkg/sentry/fs/proc/uid_gid_map.go | 2 +- pkg/sentry/fs/proc/uptime.go | 2 +- pkg/sentry/fs/proc/version.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/ramfs/file.go | 2 +- pkg/sentry/fs/ramfs/ramfs.go | 2 +- pkg/sentry/fs/ramfs/socket.go | 2 +- pkg/sentry/fs/ramfs/symlink.go | 2 +- pkg/sentry/fs/ramfs/test/test.go | 2 +- pkg/sentry/fs/ramfs/tree.go | 2 +- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/restore.go | 2 +- pkg/sentry/fs/save.go | 2 +- pkg/sentry/fs/seek.go | 2 +- pkg/sentry/fs/sync.go | 2 +- pkg/sentry/fs/sys/device.go | 2 +- pkg/sentry/fs/sys/devices.go | 2 +- pkg/sentry/fs/sys/fs.go | 2 +- pkg/sentry/fs/sys/sys.go | 2 +- pkg/sentry/fs/timerfd/timerfd.go | 2 +- pkg/sentry/fs/tmpfs/device.go | 2 +- pkg/sentry/fs/tmpfs/file_regular.go | 2 +- pkg/sentry/fs/tmpfs/file_test.go | 2 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tmpfs/tmpfs.go | 2 +- pkg/sentry/fs/tty/dir.go | 2 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/inode.go | 2 +- pkg/sentry/fs/tty/line_discipline.go | 2 +- pkg/sentry/fs/tty/master.go | 2 +- pkg/sentry/fs/tty/queue.go | 2 +- pkg/sentry/fs/tty/slave.go | 2 +- pkg/sentry/fs/tty/terminal.go | 2 +- pkg/sentry/fs/tty/tty_test.go | 2 +- pkg/sentry/hostcpu/getcpu_amd64.s | 2 +- pkg/sentry/hostcpu/hostcpu.go | 2 +- pkg/sentry/hostcpu/hostcpu_test.go | 2 +- pkg/sentry/inet/context.go | 2 +- pkg/sentry/inet/inet.go | 2 +- pkg/sentry/inet/test_stack.go | 2 +- pkg/sentry/kernel/abstract_socket_namespace.go | 2 +- pkg/sentry/kernel/auth/auth.go | 2 +- pkg/sentry/kernel/auth/capability_set.go | 2 +- 
pkg/sentry/kernel/auth/context.go | 2 +- pkg/sentry/kernel/auth/credentials.go | 2 +- pkg/sentry/kernel/auth/id.go | 2 +- pkg/sentry/kernel/auth/id_map.go | 2 +- pkg/sentry/kernel/auth/id_map_functions.go | 2 +- pkg/sentry/kernel/auth/user_namespace.go | 2 +- pkg/sentry/kernel/context.go | 2 +- pkg/sentry/kernel/epoll/epoll.go | 2 +- pkg/sentry/kernel/epoll/epoll_state.go | 2 +- pkg/sentry/kernel/epoll/epoll_test.go | 2 +- pkg/sentry/kernel/eventfd/eventfd.go | 2 +- pkg/sentry/kernel/eventfd/eventfd_test.go | 2 +- pkg/sentry/kernel/fasync/fasync.go | 2 +- pkg/sentry/kernel/fd_map.go | 2 +- pkg/sentry/kernel/fd_map_test.go | 2 +- pkg/sentry/kernel/fs_context.go | 2 +- pkg/sentry/kernel/futex/futex.go | 2 +- pkg/sentry/kernel/futex/futex_test.go | 2 +- pkg/sentry/kernel/ipc_namespace.go | 2 +- pkg/sentry/kernel/kdefs/kdefs.go | 2 +- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/kernel/kernel_state.go | 2 +- pkg/sentry/kernel/memevent/memory_events.go | 2 +- pkg/sentry/kernel/memevent/memory_events.proto | 2 +- pkg/sentry/kernel/pending_signals.go | 2 +- pkg/sentry/kernel/pending_signals_state.go | 2 +- pkg/sentry/kernel/pipe/buffers.go | 2 +- pkg/sentry/kernel/pipe/device.go | 2 +- pkg/sentry/kernel/pipe/node.go | 2 +- pkg/sentry/kernel/pipe/node_test.go | 2 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_test.go | 2 +- pkg/sentry/kernel/pipe/reader.go | 2 +- pkg/sentry/kernel/pipe/reader_writer.go | 2 +- pkg/sentry/kernel/pipe/writer.go | 2 +- pkg/sentry/kernel/posixtimer.go | 2 +- pkg/sentry/kernel/ptrace.go | 2 +- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/sched/cpuset.go | 2 +- pkg/sentry/kernel/sched/cpuset_test.go | 2 +- pkg/sentry/kernel/sched/sched.go | 2 +- pkg/sentry/kernel/seccomp.go | 2 +- pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/semaphore/semaphore_test.go | 2 +- pkg/sentry/kernel/sessions.go | 2 +- pkg/sentry/kernel/shm/device.go | 2 +- pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/signal.go 
| 2 +- pkg/sentry/kernel/signal_handlers.go | 2 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/syscalls_state.go | 2 +- pkg/sentry/kernel/syslog.go | 2 +- pkg/sentry/kernel/table_test.go | 2 +- pkg/sentry/kernel/task.go | 2 +- pkg/sentry/kernel/task_acct.go | 2 +- pkg/sentry/kernel/task_block.go | 2 +- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/kernel/task_futex.go | 2 +- pkg/sentry/kernel/task_identity.go | 2 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_net.go | 2 +- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_sched.go | 2 +- pkg/sentry/kernel/task_signals.go | 2 +- pkg/sentry/kernel/task_start.go | 2 +- pkg/sentry/kernel/task_stop.go | 2 +- pkg/sentry/kernel/task_syscall.go | 2 +- pkg/sentry/kernel/task_test.go | 2 +- pkg/sentry/kernel/task_usermem.go | 2 +- pkg/sentry/kernel/thread_group.go | 2 +- pkg/sentry/kernel/threads.go | 2 +- pkg/sentry/kernel/time/context.go | 2 +- pkg/sentry/kernel/time/time.go | 2 +- pkg/sentry/kernel/timekeeper.go | 2 +- pkg/sentry/kernel/timekeeper_state.go | 2 +- pkg/sentry/kernel/timekeeper_test.go | 2 +- pkg/sentry/kernel/uts_namespace.go | 2 +- pkg/sentry/kernel/vdso.go | 2 +- pkg/sentry/kernel/version.go | 2 +- pkg/sentry/limits/context.go | 2 +- pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/limits_test.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/loader/elf.go | 2 +- pkg/sentry/loader/interpreter.go | 2 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/loader/vdso.go | 2 +- pkg/sentry/loader/vdso_state.go | 2 +- pkg/sentry/memmap/mapping_set.go | 2 +- pkg/sentry/memmap/mapping_set_test.go | 2 +- pkg/sentry/memmap/memmap.go | 2 +- pkg/sentry/memutil/memutil.go | 2 +- pkg/sentry/memutil/memutil_unsafe.go | 2 +- pkg/sentry/mm/address_space.go | 2 +- pkg/sentry/mm/aio_context.go | 2 +- pkg/sentry/mm/aio_context_state.go | 2 +- pkg/sentry/mm/debug.go 
| 2 +- pkg/sentry/mm/io.go | 2 +- pkg/sentry/mm/lifecycle.go | 2 +- pkg/sentry/mm/metadata.go | 2 +- pkg/sentry/mm/mm.go | 2 +- pkg/sentry/mm/mm_test.go | 2 +- pkg/sentry/mm/pma.go | 2 +- pkg/sentry/mm/proc_pid_maps.go | 2 +- pkg/sentry/mm/save_restore.go | 2 +- pkg/sentry/mm/shm.go | 2 +- pkg/sentry/mm/special_mappable.go | 2 +- pkg/sentry/mm/syscalls.go | 2 +- pkg/sentry/mm/vma.go | 2 +- pkg/sentry/platform/context.go | 2 +- pkg/sentry/platform/filemem/filemem.go | 2 +- pkg/sentry/platform/filemem/filemem_state.go | 2 +- pkg/sentry/platform/filemem/filemem_test.go | 2 +- pkg/sentry/platform/filemem/filemem_unsafe.go | 2 +- pkg/sentry/platform/interrupt/interrupt.go | 2 +- pkg/sentry/platform/interrupt/interrupt_test.go | 2 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/allocator.go | 2 +- pkg/sentry/platform/kvm/bluepill.go | 2 +- pkg/sentry/platform/kvm/bluepill_amd64.go | 2 +- pkg/sentry/platform/kvm/bluepill_amd64.s | 2 +- pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/bluepill_fault.go | 2 +- pkg/sentry/platform/kvm/bluepill_unsafe.go | 2 +- pkg/sentry/platform/kvm/context.go | 2 +- pkg/sentry/platform/kvm/host_map.go | 2 +- pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/kvm_const.go | 2 +- pkg/sentry/platform/kvm/kvm_test.go | 2 +- pkg/sentry/platform/kvm/machine.go | 2 +- pkg/sentry/platform/kvm/machine_amd64.go | 2 +- pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/machine_unsafe.go | 2 +- pkg/sentry/platform/kvm/physical_map.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil_amd64.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil_amd64.s | 2 +- pkg/sentry/platform/kvm/virtual_map.go | 2 +- pkg/sentry/platform/kvm/virtual_map_test.go | 2 +- pkg/sentry/platform/mmap_min_addr.go | 2 +- 
pkg/sentry/platform/platform.go | 2 +- pkg/sentry/platform/procid/procid.go | 2 +- pkg/sentry/platform/procid/procid_amd64.s | 2 +- pkg/sentry/platform/procid/procid_net_test.go | 2 +- pkg/sentry/platform/procid/procid_test.go | 2 +- pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/ptrace_unsafe.go | 2 +- pkg/sentry/platform/ptrace/stub_amd64.s | 2 +- pkg/sentry/platform/ptrace/stub_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- pkg/sentry/platform/ptrace/subprocess_amd64.go | 2 +- pkg/sentry/platform/ptrace/subprocess_linux.go | 2 +- pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess_unsafe.go | 2 +- pkg/sentry/platform/ring0/defs.go | 2 +- pkg/sentry/platform/ring0/defs_amd64.go | 2 +- pkg/sentry/platform/ring0/entry_amd64.go | 2 +- pkg/sentry/platform/ring0/entry_amd64.s | 2 +- pkg/sentry/platform/ring0/gen_offsets/main.go | 2 +- pkg/sentry/platform/ring0/kernel.go | 2 +- pkg/sentry/platform/ring0/kernel_amd64.go | 2 +- pkg/sentry/platform/ring0/kernel_unsafe.go | 2 +- pkg/sentry/platform/ring0/lib_amd64.go | 2 +- pkg/sentry/platform/ring0/lib_amd64.s | 2 +- pkg/sentry/platform/ring0/offsets_amd64.go | 2 +- pkg/sentry/platform/ring0/pagetables/allocator.go | 2 +- pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_test.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_x86.go | 2 +- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 2 +- pkg/sentry/platform/ring0/pagetables/walker_amd64.go | 2 +- pkg/sentry/platform/ring0/ring0.go | 2 +- pkg/sentry/platform/ring0/x86.go | 2 +- pkg/sentry/platform/safecopy/atomic_amd64.s | 2 +- pkg/sentry/platform/safecopy/memclr_amd64.s | 2 +- 
pkg/sentry/platform/safecopy/memcpy_amd64.s | 2 +- pkg/sentry/platform/safecopy/safecopy.go | 2 +- pkg/sentry/platform/safecopy/safecopy_test.go | 2 +- pkg/sentry/platform/safecopy/safecopy_unsafe.go | 2 +- pkg/sentry/platform/safecopy/sighandler_amd64.s | 2 +- pkg/sentry/safemem/block_unsafe.go | 2 +- pkg/sentry/safemem/io.go | 2 +- pkg/sentry/safemem/io_test.go | 2 +- pkg/sentry/safemem/safemem.go | 2 +- pkg/sentry/safemem/seq_test.go | 2 +- pkg/sentry/safemem/seq_unsafe.go | 2 +- pkg/sentry/sighandling/sighandling.go | 2 +- pkg/sentry/sighandling/sighandling_unsafe.go | 2 +- pkg/sentry/socket/control/control.go | 2 +- pkg/sentry/socket/epsocket/device.go | 2 +- pkg/sentry/socket/epsocket/epsocket.go | 2 +- pkg/sentry/socket/epsocket/provider.go | 2 +- pkg/sentry/socket/epsocket/save_restore.go | 2 +- pkg/sentry/socket/epsocket/stack.go | 2 +- pkg/sentry/socket/hostinet/device.go | 2 +- pkg/sentry/socket/hostinet/hostinet.go | 2 +- pkg/sentry/socket/hostinet/save_restore.go | 2 +- pkg/sentry/socket/hostinet/socket.go | 2 +- pkg/sentry/socket/hostinet/socket_unsafe.go | 2 +- pkg/sentry/socket/hostinet/stack.go | 2 +- pkg/sentry/socket/netlink/message.go | 2 +- pkg/sentry/socket/netlink/port/port.go | 2 +- pkg/sentry/socket/netlink/port/port_test.go | 2 +- pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/route/protocol.go | 2 +- pkg/sentry/socket/netlink/socket.go | 2 +- pkg/sentry/socket/rpcinet/conn/conn.go | 2 +- pkg/sentry/socket/rpcinet/device.go | 2 +- pkg/sentry/socket/rpcinet/notifier/notifier.go | 2 +- pkg/sentry/socket/rpcinet/rpcinet.go | 2 +- pkg/sentry/socket/rpcinet/socket.go | 2 +- pkg/sentry/socket/rpcinet/stack.go | 2 +- pkg/sentry/socket/rpcinet/stack_unsafe.go | 2 +- pkg/sentry/socket/socket.go | 2 +- pkg/sentry/socket/unix/device.go | 2 +- pkg/sentry/socket/unix/io.go | 2 +- pkg/sentry/socket/unix/transport/connectioned.go | 2 +- pkg/sentry/socket/unix/transport/connectioned_state.go | 2 +- 
pkg/sentry/socket/unix/transport/connectionless.go | 2 +- pkg/sentry/socket/unix/transport/queue.go | 2 +- pkg/sentry/socket/unix/transport/unix.go | 2 +- pkg/sentry/socket/unix/unix.go | 2 +- pkg/sentry/state/state.go | 2 +- pkg/sentry/state/state_metadata.go | 2 +- pkg/sentry/state/state_unsafe.go | 2 +- pkg/sentry/strace/clone.go | 2 +- pkg/sentry/strace/futex.go | 2 +- pkg/sentry/strace/linux64.go | 2 +- pkg/sentry/strace/open.go | 2 +- pkg/sentry/strace/ptrace.go | 2 +- pkg/sentry/strace/socket.go | 2 +- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/strace/strace.proto | 2 +- pkg/sentry/strace/syscalls.go | 2 +- pkg/sentry/syscalls/epoll.go | 2 +- pkg/sentry/syscalls/linux/error.go | 2 +- pkg/sentry/syscalls/linux/flags.go | 2 +- pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sigset.go | 2 +- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- pkg/sentry/syscalls/linux/sys_capability.go | 2 +- pkg/sentry/syscalls/linux/sys_epoll.go | 2 +- pkg/sentry/syscalls/linux/sys_eventfd.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 2 +- pkg/sentry/syscalls/linux/sys_futex.go | 2 +- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- pkg/sentry/syscalls/linux/sys_identity.go | 2 +- pkg/sentry/syscalls/linux/sys_inotify.go | 2 +- pkg/sentry/syscalls/linux/sys_lseek.go | 2 +- pkg/sentry/syscalls/linux/sys_mmap.go | 2 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- pkg/sentry/syscalls/linux/sys_pipe.go | 2 +- pkg/sentry/syscalls/linux/sys_poll.go | 2 +- pkg/sentry/syscalls/linux/sys_prctl.go | 2 +- pkg/sentry/syscalls/linux/sys_random.go | 2 +- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/sys_rlimit.go | 2 +- pkg/sentry/syscalls/linux/sys_rusage.go | 2 +- pkg/sentry/syscalls/linux/sys_sched.go | 2 +- pkg/sentry/syscalls/linux/sys_seccomp.go | 2 +- pkg/sentry/syscalls/linux/sys_sem.go | 2 +- pkg/sentry/syscalls/linux/sys_shm.go | 2 +- pkg/sentry/syscalls/linux/sys_signal.go | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 2 +- 
pkg/sentry/syscalls/linux/sys_stat.go | 2 +- pkg/sentry/syscalls/linux/sys_sync.go | 2 +- pkg/sentry/syscalls/linux/sys_sysinfo.go | 2 +- pkg/sentry/syscalls/linux/sys_syslog.go | 2 +- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- pkg/sentry/syscalls/linux/sys_time.go | 2 +- pkg/sentry/syscalls/linux/sys_timer.go | 2 +- pkg/sentry/syscalls/linux/sys_timerfd.go | 2 +- pkg/sentry/syscalls/linux/sys_tls.go | 2 +- pkg/sentry/syscalls/linux/sys_utsname.go | 2 +- pkg/sentry/syscalls/linux/sys_write.go | 2 +- pkg/sentry/syscalls/linux/timespec.go | 2 +- pkg/sentry/syscalls/polling.go | 2 +- pkg/sentry/syscalls/syscalls.go | 2 +- pkg/sentry/syscalls/unimplemented_syscall.proto | 2 +- pkg/sentry/time/calibrated_clock.go | 2 +- pkg/sentry/time/calibrated_clock_test.go | 2 +- pkg/sentry/time/clock_id.go | 2 +- pkg/sentry/time/clocks.go | 2 +- pkg/sentry/time/muldiv_amd64.s | 2 +- pkg/sentry/time/parameters.go | 2 +- pkg/sentry/time/parameters_test.go | 2 +- pkg/sentry/time/sampler.go | 2 +- pkg/sentry/time/sampler_test.go | 2 +- pkg/sentry/time/sampler_unsafe.go | 2 +- pkg/sentry/time/tsc_amd64.s | 2 +- pkg/sentry/uniqueid/context.go | 2 +- pkg/sentry/usage/cpu.go | 2 +- pkg/sentry/usage/io.go | 2 +- pkg/sentry/usage/memory.go | 2 +- pkg/sentry/usage/memory_unsafe.go | 2 +- pkg/sentry/usage/usage.go | 2 +- pkg/sentry/usermem/access_type.go | 2 +- pkg/sentry/usermem/addr.go | 2 +- pkg/sentry/usermem/addr_range_seq_test.go | 2 +- pkg/sentry/usermem/addr_range_seq_unsafe.go | 2 +- pkg/sentry/usermem/bytes_io.go | 2 +- pkg/sentry/usermem/bytes_io_unsafe.go | 2 +- pkg/sentry/usermem/usermem.go | 2 +- pkg/sentry/usermem/usermem_test.go | 2 +- pkg/sentry/usermem/usermem_x86.go | 2 +- pkg/sentry/watchdog/watchdog.go | 2 +- pkg/sleep/commit_amd64.s | 2 +- pkg/sleep/commit_asm.go | 2 +- pkg/sleep/commit_noasm.go | 2 +- pkg/sleep/empty.s | 2 +- pkg/sleep/sleep_test.go | 2 +- pkg/sleep/sleep_unsafe.go | 2 +- pkg/state/decode.go | 2 +- pkg/state/encode.go | 2 +- 
pkg/state/encode_unsafe.go | 2 +- pkg/state/map.go | 2 +- pkg/state/object.proto | 2 +- pkg/state/printer.go | 2 +- pkg/state/state.go | 2 +- pkg/state/state_test.go | 2 +- pkg/state/statefile/statefile.go | 2 +- pkg/state/statefile/statefile_test.go | 2 +- pkg/state/stats.go | 2 +- pkg/sync/atomicptr_unsafe.go | 2 +- pkg/sync/atomicptrtest/atomicptr_test.go | 2 +- pkg/sync/memmove_unsafe.go | 2 +- pkg/sync/norace_unsafe.go | 2 +- pkg/sync/race_unsafe.go | 2 +- pkg/sync/seqatomic_unsafe.go | 2 +- pkg/sync/seqatomictest/seqatomic_test.go | 2 +- pkg/sync/seqcount.go | 2 +- pkg/sync/seqcount_test.go | 2 +- pkg/sync/sync.go | 2 +- pkg/syserr/host_linux.go | 2 +- pkg/syserr/netstack.go | 2 +- pkg/syserr/syserr.go | 2 +- pkg/syserror/syserror.go | 2 +- pkg/syserror/syserror_test.go | 2 +- pkg/tcpip/adapters/gonet/gonet.go | 2 +- pkg/tcpip/adapters/gonet/gonet_test.go | 2 +- pkg/tcpip/buffer/prependable.go | 2 +- pkg/tcpip/buffer/view.go | 2 +- pkg/tcpip/buffer/view_test.go | 2 +- pkg/tcpip/checker/checker.go | 2 +- pkg/tcpip/header/arp.go | 2 +- pkg/tcpip/header/checksum.go | 2 +- pkg/tcpip/header/eth.go | 2 +- pkg/tcpip/header/gue.go | 2 +- pkg/tcpip/header/icmpv4.go | 2 +- pkg/tcpip/header/icmpv6.go | 2 +- pkg/tcpip/header/interfaces.go | 2 +- pkg/tcpip/header/ipv4.go | 2 +- pkg/tcpip/header/ipv6.go | 2 +- pkg/tcpip/header/ipv6_fragment.go | 2 +- pkg/tcpip/header/ipversion_test.go | 2 +- pkg/tcpip/header/tcp.go | 2 +- pkg/tcpip/header/tcp_test.go | 2 +- pkg/tcpip/header/udp.go | 2 +- pkg/tcpip/link/channel/channel.go | 2 +- pkg/tcpip/link/fdbased/endpoint.go | 2 +- pkg/tcpip/link/fdbased/endpoint_test.go | 2 +- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_amd64.s | 2 +- pkg/tcpip/link/rawfile/blockingpoll_unsafe.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go | 2 +- pkg/tcpip/link/rawfile/errors.go | 2 +- pkg/tcpip/link/rawfile/rawfile_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe.go | 2 +- 
pkg/tcpip/link/sharedmem/pipe/pipe_test.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/rx.go | 2 +- pkg/tcpip/link/sharedmem/pipe/tx.go | 2 +- pkg/tcpip/link/sharedmem/queue/queue_test.go | 2 +- pkg/tcpip/link/sharedmem/queue/rx.go | 2 +- pkg/tcpip/link/sharedmem/queue/tx.go | 2 +- pkg/tcpip/link/sharedmem/rx.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_test.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/tx.go | 2 +- pkg/tcpip/link/sniffer/pcap.go | 2 +- pkg/tcpip/link/sniffer/sniffer.go | 2 +- pkg/tcpip/link/tun/tun_unsafe.go | 2 +- pkg/tcpip/link/waitable/waitable.go | 2 +- pkg/tcpip/link/waitable/waitable_test.go | 2 +- pkg/tcpip/network/arp/arp.go | 2 +- pkg/tcpip/network/arp/arp_test.go | 2 +- pkg/tcpip/network/fragmentation/frag_heap.go | 2 +- pkg/tcpip/network/fragmentation/frag_heap_test.go | 2 +- pkg/tcpip/network/fragmentation/fragmentation.go | 2 +- pkg/tcpip/network/fragmentation/fragmentation_test.go | 2 +- pkg/tcpip/network/fragmentation/reassembler.go | 2 +- pkg/tcpip/network/fragmentation/reassembler_test.go | 2 +- pkg/tcpip/network/hash/hash.go | 2 +- pkg/tcpip/network/ip_test.go | 2 +- pkg/tcpip/network/ipv4/icmp.go | 2 +- pkg/tcpip/network/ipv4/ipv4.go | 2 +- pkg/tcpip/network/ipv4/ipv4_test.go | 2 +- pkg/tcpip/network/ipv6/icmp.go | 2 +- pkg/tcpip/network/ipv6/icmp_test.go | 2 +- pkg/tcpip/network/ipv6/ipv6.go | 2 +- pkg/tcpip/ports/ports.go | 2 +- pkg/tcpip/ports/ports_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/tcpip/seqnum/seqnum.go | 2 +- pkg/tcpip/stack/linkaddrcache.go | 2 +- pkg/tcpip/stack/linkaddrcache_test.go | 2 +- pkg/tcpip/stack/nic.go | 2 +- pkg/tcpip/stack/registration.go | 2 +- pkg/tcpip/stack/route.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/stack/stack_test.go | 2 +- 
pkg/tcpip/stack/transport_demuxer.go | 2 +- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/tcpip_test.go | 2 +- pkg/tcpip/time.s | 2 +- pkg/tcpip/time_unsafe.go | 2 +- pkg/tcpip/transport/ping/endpoint.go | 2 +- pkg/tcpip/transport/ping/endpoint_state.go | 2 +- pkg/tcpip/transport/ping/protocol.go | 2 +- pkg/tcpip/transport/tcp/accept.go | 2 +- pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/cubic.go | 2 +- pkg/tcpip/transport/tcp/dual_stack_test.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/tcp/rcv.go | 2 +- pkg/tcpip/transport/tcp/reno.go | 2 +- pkg/tcpip/transport/tcp/sack.go | 2 +- pkg/tcpip/transport/tcp/segment.go | 2 +- pkg/tcpip/transport/tcp/segment_heap.go | 2 +- pkg/tcpip/transport/tcp/segment_queue.go | 2 +- pkg/tcpip/transport/tcp/segment_state.go | 2 +- pkg/tcpip/transport/tcp/snd.go | 2 +- pkg/tcpip/transport/tcp/snd_state.go | 2 +- pkg/tcpip/transport/tcp/tcp_sack_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_timestamp_test.go | 2 +- pkg/tcpip/transport/tcp/testing/context/context.go | 2 +- pkg/tcpip/transport/tcp/timer.go | 2 +- pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go | 2 +- pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 2 +- pkg/tcpip/transport/udp/endpoint_state.go | 2 +- pkg/tcpip/transport/udp/protocol.go | 2 +- pkg/tcpip/transport/udp/udp_test.go | 2 +- pkg/tmutex/tmutex.go | 2 +- pkg/tmutex/tmutex_test.go | 2 +- pkg/unet/unet.go | 2 +- pkg/unet/unet_test.go | 2 +- pkg/unet/unet_unsafe.go | 2 +- pkg/urpc/urpc.go | 2 +- pkg/urpc/urpc_test.go | 2 +- pkg/waiter/fdnotifier/fdnotifier.go | 2 +- pkg/waiter/fdnotifier/poll_unsafe.go | 2 +- pkg/waiter/waiter.go | 2 +- pkg/waiter/waiter_test.go | 2 +- runsc/boot/compat.go | 2 +- 
runsc/boot/config.go | 2 +- runsc/boot/controller.go | 2 +- runsc/boot/debug.go | 2 +- runsc/boot/events.go | 2 +- runsc/boot/fds.go | 2 +- runsc/boot/filter/config.go | 2 +- runsc/boot/filter/extra_filters.go | 2 +- runsc/boot/filter/extra_filters_msan.go | 2 +- runsc/boot/filter/extra_filters_race.go | 2 +- runsc/boot/filter/filter.go | 2 +- runsc/boot/fs.go | 2 +- runsc/boot/limits.go | 2 +- runsc/boot/loader.go | 2 +- runsc/boot/loader_test.go | 2 +- runsc/boot/network.go | 2 +- runsc/boot/strace.go | 2 +- runsc/cgroup/cgroup.go | 2 +- runsc/cgroup/cgroup_test.go | 2 +- runsc/cmd/boot.go | 2 +- runsc/cmd/capability.go | 2 +- runsc/cmd/capability_test.go | 2 +- runsc/cmd/checkpoint.go | 2 +- runsc/cmd/cmd.go | 2 +- runsc/cmd/create.go | 2 +- runsc/cmd/debug.go | 2 +- runsc/cmd/delete.go | 2 +- runsc/cmd/delete_test.go | 2 +- runsc/cmd/events.go | 2 +- runsc/cmd/exec.go | 2 +- runsc/cmd/exec_test.go | 2 +- runsc/cmd/gofer.go | 2 +- runsc/cmd/kill.go | 2 +- runsc/cmd/list.go | 2 +- runsc/cmd/path.go | 2 +- runsc/cmd/pause.go | 2 +- runsc/cmd/ps.go | 2 +- runsc/cmd/restore.go | 2 +- runsc/cmd/resume.go | 2 +- runsc/cmd/run.go | 2 +- runsc/cmd/spec.go | 2 +- runsc/cmd/start.go | 2 +- runsc/cmd/state.go | 2 +- runsc/cmd/wait.go | 2 +- runsc/console/console.go | 2 +- runsc/container/console_test.go | 2 +- runsc/container/container.go | 2 +- runsc/container/container_test.go | 2 +- runsc/container/fs.go | 2 +- runsc/container/fs_test.go | 2 +- runsc/container/hook.go | 2 +- runsc/container/multi_container_test.go | 2 +- runsc/container/status.go | 2 +- runsc/container/test_app.go | 2 +- runsc/fsgofer/filter/config.go | 2 +- runsc/fsgofer/filter/extra_filters.go | 2 +- runsc/fsgofer/filter/extra_filters_msan.go | 2 +- runsc/fsgofer/filter/extra_filters_race.go | 2 +- runsc/fsgofer/filter/filter.go | 2 +- runsc/fsgofer/fsgofer.go | 2 +- runsc/fsgofer/fsgofer_test.go | 2 +- runsc/fsgofer/fsgofer_unsafe.go | 2 +- runsc/main.go | 2 +- runsc/sandbox/chroot.go | 2 +- 
runsc/sandbox/network.go | 2 +- runsc/sandbox/sandbox.go | 2 +- runsc/specutils/namespace.go | 2 +- runsc/specutils/specutils.go | 2 +- runsc/specutils/specutils_test.go | 2 +- runsc/test/image/image.go | 2 +- runsc/test/image/image_test.go | 2 +- runsc/test/image/mysql.sql | 2 +- runsc/test/image/ruby.rb | 2 +- runsc/test/image/ruby.sh | 2 +- runsc/test/install.sh | 2 +- runsc/test/integration/exec_test.go | 2 +- runsc/test/integration/integration.go | 2 +- runsc/test/integration/integration_test.go | 2 +- runsc/test/root/cgroup_test.go | 2 +- runsc/test/root/chroot_test.go | 2 +- runsc/test/root/root.go | 2 +- runsc/test/testutil/docker.go | 2 +- runsc/test/testutil/testutil.go | 2 +- runsc/test/testutil/testutil_race.go | 2 +- runsc/tools/dockercfg/dockercfg.go | 2 +- tools/go_generics/generics.go | 2 +- tools/go_generics/generics_tests/all_stmts/input.go | 2 +- tools/go_generics/generics_tests/all_stmts/output/output.go | 2 +- tools/go_generics/generics_tests/all_types/input.go | 2 +- tools/go_generics/generics_tests/all_types/lib/lib.go | 2 +- tools/go_generics/generics_tests/all_types/output/output.go | 2 +- tools/go_generics/generics_tests/consts/input.go | 2 +- tools/go_generics/generics_tests/consts/output/output.go | 2 +- tools/go_generics/generics_tests/imports/input.go | 2 +- tools/go_generics/generics_tests/imports/output/output.go | 2 +- tools/go_generics/generics_tests/remove_typedef/input.go | 2 +- tools/go_generics/generics_tests/remove_typedef/output/output.go | 2 +- tools/go_generics/generics_tests/simple/input.go | 2 +- tools/go_generics/generics_tests/simple/output/output.go | 2 +- tools/go_generics/globals/globals_visitor.go | 2 +- tools/go_generics/globals/scope.go | 2 +- tools/go_generics/go_generics_unittest.sh | 2 +- tools/go_generics/imports.go | 2 +- tools/go_generics/merge.go | 2 +- tools/go_generics/remove.go | 2 +- tools/go_generics/rules_tests/template.go | 2 +- tools/go_generics/rules_tests/template_test.go | 2 +- 
tools/go_stateify/main.go | 2 +- tools/workspace_status.sh | 2 +- vdso/barrier.h | 2 +- vdso/check_vdso.py | 2 +- vdso/compiler.h | 2 +- vdso/cycle_clock.h | 2 +- vdso/seqlock.h | 2 +- vdso/syscalls.h | 2 +- vdso/vdso.cc | 2 +- vdso/vdso_time.cc | 2 +- vdso/vdso_time.h | 2 +- 923 files changed, 923 insertions(+), 923 deletions(-) (limited to 'pkg/sentry') diff --git a/kokoro/run_build.sh b/kokoro/run_build.sh index f2b719f52..89e24b037 100755 --- a/kokoro/run_build.sh +++ b/kokoro/run_build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 3f8841cee..0a0d73d29 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/pkg/abi/abi.go b/pkg/abi/abi.go index a53c2747b..7770f0405 100644 --- a/pkg/abi/abi.go +++ b/pkg/abi/abi.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go index dd5d67b51..9d9f361a4 100644 --- a/pkg/abi/abi_linux.go +++ b/pkg/abi/abi_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/flag.go b/pkg/abi/flag.go index 0391ccf37..0698e410f 100644 --- a/pkg/abi/flag.go +++ b/pkg/abi/flag.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go index 9c39ca2ef..1b7ca714a 100644 --- a/pkg/abi/linux/aio.go +++ b/pkg/abi/linux/aio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ashmem.go b/pkg/abi/linux/ashmem.go index 7fbfd2e68..ced1e44d4 100644 --- a/pkg/abi/linux/ashmem.go +++ b/pkg/abi/linux/ashmem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/binder.go b/pkg/abi/linux/binder.go index b228898f9..522dc6f53 100644 --- a/pkg/abi/linux/binder.go +++ b/pkg/abi/linux/binder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go index 80e5b1af1..d9cd09948 100644 --- a/pkg/abi/linux/bpf.go +++ b/pkg/abi/linux/bpf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go index b470ce0a5..7d96f013e 100644 --- a/pkg/abi/linux/capability.go +++ b/pkg/abi/linux/capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index ea5b16b7b..5b1199aac 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/elf.go b/pkg/abi/linux/elf.go index 76c13b677..928067c04 100644 --- a/pkg/abi/linux/elf.go +++ b/pkg/abi/linux/elf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/errors.go b/pkg/abi/linux/errors.go index b5ddb2b2f..01e4095b8 100644 --- a/pkg/abi/linux/errors.go +++ b/pkg/abi/linux/errors.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/eventfd.go b/pkg/abi/linux/eventfd.go index bc0fb44d2..5614f5cf1 100644 --- a/pkg/abi/linux/eventfd.go +++ b/pkg/abi/linux/eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/exec.go b/pkg/abi/linux/exec.go index 4d81eca54..a07c29243 100644 --- a/pkg/abi/linux/exec.go +++ b/pkg/abi/linux/exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index 2a5ad6ed7..c8558933a 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 9bf229a57..72e5c6f83 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 32a0812b4..7817bfb52 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index f63f5200c..5dff01fba 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/inotify.go b/pkg/abi/linux/inotify.go index 072a2d146..79c5d3593 100644 --- a/pkg/abi/linux/inotify.go +++ b/pkg/abi/linux/inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index afd9ee82b..9afc3d1ef 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go index 6b68999ab..fcec16965 100644 --- a/pkg/abi/linux/ip.go +++ b/pkg/abi/linux/ip.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go index 81e9904dd..10681768b 100644 --- a/pkg/abi/linux/ipc.go +++ b/pkg/abi/linux/ipc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index e1f0932ec..b2e51b9bd 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/linux.go b/pkg/abi/linux/linux.go index de2af80dc..d365f693d 100644 --- a/pkg/abi/linux/linux.go +++ b/pkg/abi/linux/linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index b48e1d18a..3fcdf8235 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netdevice.go b/pkg/abi/linux/netdevice.go index 88654a1b3..e3b6b1e40 100644 --- a/pkg/abi/linux/netdevice.go +++ b/pkg/abi/linux/netdevice.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index e823ffa7e..10ceb5bf2 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index a5d778748..4200b6506 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/poll.go b/pkg/abi/linux/poll.go index f373cfca1..9f0b15d1c 100644 --- a/pkg/abi/linux/poll.go +++ b/pkg/abi/linux/poll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go index 074ec03f0..e152c4c27 100644 --- a/pkg/abi/linux/prctl.go +++ b/pkg/abi/linux/prctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ptrace.go b/pkg/abi/linux/ptrace.go index ba48d4d6d..7db4f5464 100644 --- a/pkg/abi/linux/ptrace.go +++ b/pkg/abi/linux/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/rusage.go b/pkg/abi/linux/rusage.go index a4a89abda..7fea4b589 100644 --- a/pkg/abi/linux/rusage.go +++ b/pkg/abi/linux/rusage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/sched.go b/pkg/abi/linux/sched.go index 05fda1604..ef96a3801 100644 --- a/pkg/abi/linux/sched.go +++ b/pkg/abi/linux/sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index a8de9d3d0..9963ceeba 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go index 3495f5cd0..d1a0bdb32 100644 --- a/pkg/abi/linux/sem.go +++ b/pkg/abi/linux/sem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go index f50b3c2e2..82a80e609 100644 --- a/pkg/abi/linux/shm.go +++ b/pkg/abi/linux/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index b2c7230c4..bf9bce6ed 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 19b5fa212..af0761a3b 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index 4569f4208..bbd21e726 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/timer.go b/pkg/abi/linux/timer.go index 6c4675c35..a6f420bdb 100644 --- a/pkg/abi/linux/timer.go +++ b/pkg/abi/linux/timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index f63dc52aa..e6f7c5b2a 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/uio.go b/pkg/abi/linux/uio.go index 93c972774..7e00d9959 100644 --- a/pkg/abi/linux/uio.go +++ b/pkg/abi/linux/uio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/utsname.go b/pkg/abi/linux/utsname.go index 7d33d20de..f80ed7d4a 100644 --- a/pkg/abi/linux/utsname.go +++ b/pkg/abi/linux/utsname.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index 1cb73359a..26b674435 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go index 876e47b19..104e0dab1 100644 --- a/pkg/amutex/amutex_test.go +++ b/pkg/amutex/amutex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops.go b/pkg/atomicbitops/atomic_bitops.go index 6635ea0d2..9a57f9599 100644 --- a/pkg/atomicbitops/atomic_bitops.go +++ b/pkg/atomicbitops/atomic_bitops.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_amd64.s b/pkg/atomicbitops/atomic_bitops_amd64.s index 542452bec..b37e3aad3 100644 --- a/pkg/atomicbitops/atomic_bitops_amd64.s +++ b/pkg/atomicbitops/atomic_bitops_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_common.go b/pkg/atomicbitops/atomic_bitops_common.go index 542ff4e83..b03242baa 100644 --- a/pkg/atomicbitops/atomic_bitops_common.go +++ b/pkg/atomicbitops/atomic_bitops_common.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go index ec0c07ee2..ee6207cb3 100644 --- a/pkg/atomicbitops/atomic_bitops_test.go +++ b/pkg/atomicbitops/atomic_bitops_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/binary/binary.go b/pkg/binary/binary.go index 3b18a86ee..02f7e9fb8 100644 --- a/pkg/binary/binary.go +++ b/pkg/binary/binary.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/binary/binary_test.go b/pkg/binary/binary_test.go index 921a0369a..d8d481f32 100644 --- a/pkg/binary/binary_test.go +++ b/pkg/binary/binary_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/bits.go b/pkg/bits/bits.go index 50ca4bff7..eb3c80f49 100644 --- a/pkg/bits/bits.go +++ b/pkg/bits/bits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go index 0a01f29c2..8c578cca2 100644 --- a/pkg/bits/bits_template.go +++ b/pkg/bits/bits_template.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_amd64.go b/pkg/bits/uint64_arch_amd64.go index 068597f68..1fef89394 100644 --- a/pkg/bits/uint64_arch_amd64.go +++ b/pkg/bits/uint64_arch_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_amd64_asm.s b/pkg/bits/uint64_arch_amd64_asm.s index 33885641a..8c7322f0f 100644 --- a/pkg/bits/uint64_arch_amd64_asm.s +++ b/pkg/bits/uint64_arch_amd64_asm.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_generic.go b/pkg/bits/uint64_arch_generic.go index 862033a4b..cfb47400b 100644 --- a/pkg/bits/uint64_arch_generic.go +++ b/pkg/bits/uint64_arch_generic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go index 906017e1a..d6dbaf602 100644 --- a/pkg/bits/uint64_test.go +++ b/pkg/bits/uint64_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/bpf.go b/pkg/bpf/bpf.go index 757744090..98d44d911 100644 --- a/pkg/bpf/bpf.go +++ b/pkg/bpf/bpf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/decoder.go b/pkg/bpf/decoder.go index ef41e9edc..ae6b8839a 100644 --- a/pkg/bpf/decoder.go +++ b/pkg/bpf/decoder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/decoder_test.go b/pkg/bpf/decoder_test.go index 18709b944..f093e1e41 100644 --- a/pkg/bpf/decoder_test.go +++ b/pkg/bpf/decoder_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/input_bytes.go b/pkg/bpf/input_bytes.go index 74af038eb..745c0749b 100644 --- a/pkg/bpf/input_bytes.go +++ b/pkg/bpf/input_bytes.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/interpreter.go b/pkg/bpf/interpreter.go index 111ada9d1..86c7add4d 100644 --- a/pkg/bpf/interpreter.go +++ b/pkg/bpf/interpreter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/interpreter_test.go b/pkg/bpf/interpreter_test.go index 9e5e33228..c46a43991 100644 --- a/pkg/bpf/interpreter_test.go +++ b/pkg/bpf/interpreter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/program_builder.go b/pkg/bpf/program_builder.go index bad56d7ac..b4ce228e1 100644 --- a/pkg/bpf/program_builder.go +++ b/pkg/bpf/program_builder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/program_builder_test.go b/pkg/bpf/program_builder_test.go index 7e4f06584..0e0b79d88 100644 --- a/pkg/bpf/program_builder_test.go +++ b/pkg/bpf/program_builder_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go index 667f17c5c..205536812 100644 --- a/pkg/compressio/compressio.go +++ b/pkg/compressio/compressio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/compressio/compressio_test.go b/pkg/compressio/compressio_test.go index 7cb5f8dc4..1bbabee79 100644 --- a/pkg/compressio/compressio_test.go +++ b/pkg/compressio/compressio_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/control/client/client.go b/pkg/control/client/client.go index f7c2e8776..0d0c9f148 100644 --- a/pkg/control/client/client.go +++ b/pkg/control/client/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go index d00061ce3..c46b5d70b 100644 --- a/pkg/control/server/server.go +++ b/pkg/control/server/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpu_amd64.s b/pkg/cpuid/cpu_amd64.s index 48a13c6fd..905c1d12e 100644 --- a/pkg/cpuid/cpu_amd64.s +++ b/pkg/cpuid/cpu_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index e91e34dc7..5b083a5fb 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid_parse_test.go b/pkg/cpuid/cpuid_parse_test.go index c4f52818c..81b06f48c 100644 --- a/pkg/cpuid/cpuid_parse_test.go +++ b/pkg/cpuid/cpuid_parse_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid_test.go b/pkg/cpuid/cpuid_test.go index 02f732f85..0decd8f08 100644 --- a/pkg/cpuid/cpuid_test.go +++ b/pkg/cpuid/cpuid_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 92c634a14..3330c4998 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp.go b/pkg/dhcp/dhcp.go index ceaba34c3..ad11e178a 100644 --- a/pkg/dhcp/dhcp.go +++ b/pkg/dhcp/dhcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp_string.go b/pkg/dhcp/dhcp_string.go index 7cabed29e..8533895bd 100644 --- a/pkg/dhcp/dhcp_string.go +++ b/pkg/dhcp/dhcp_string.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index d60e3752b..a21dce6bc 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index 26700bdbc..3e06ab4c7 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index bfd28256e..41a7b5ed3 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/eventchannel/event.proto b/pkg/eventchannel/event.proto index 455f03658..c1679c7e7 100644 --- a/pkg/eventchannel/event.proto +++ b/pkg/eventchannel/event.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index 32d24c41b..f6656ffa1 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fd/fd_test.go b/pkg/fd/fd_test.go index 94b3eb7cc..42bb3ef6c 100644 --- a/pkg/fd/fd_test.go +++ b/pkg/fd/fd_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/gate/gate.go b/pkg/gate/gate.go index 93808c9dd..48122bf5a 100644 --- a/pkg/gate/gate.go +++ b/pkg/gate/gate.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go index 06587339b..95620fa8e 100644 --- a/pkg/gate/gate_test.go +++ b/pkg/gate/gate_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index 4ae02eee9..51c9b6df3 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/ilist/list_test.go b/pkg/ilist/list_test.go index 2c56280f6..4bda570b6 100644 --- a/pkg/ilist/list_test.go +++ b/pkg/ilist/list_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/linewriter/linewriter.go b/pkg/linewriter/linewriter.go index 98f974410..5fbd4e779 100644 --- a/pkg/linewriter/linewriter.go +++ b/pkg/linewriter/linewriter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/linewriter/linewriter_test.go b/pkg/linewriter/linewriter_test.go index ce97cca05..9140ee6af 100644 --- a/pkg/linewriter/linewriter_test.go +++ b/pkg/linewriter/linewriter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/glog.go b/pkg/log/glog.go index 58b4052e6..fbb58501b 100644 --- a/pkg/log/glog.go +++ b/pkg/log/glog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/glog_unsafe.go b/pkg/log/glog_unsafe.go index c320190b8..bb06aa7d3 100644 --- a/pkg/log/glog_unsafe.go +++ b/pkg/log/glog_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json.go b/pkg/log/json.go index 3887f1cd5..96bd13d87 100644 --- a/pkg/log/json.go +++ b/pkg/log/json.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json_test.go b/pkg/log/json_test.go index 3b167dab0..b8c7a795e 100644 --- a/pkg/log/json_test.go +++ b/pkg/log/json_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/log.go b/pkg/log/log.go index c496e86e4..b8d456aae 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/log_test.go b/pkg/log/log_test.go index d93e989dc..a59d457dd 100644 --- a/pkg/log/log_test.go +++ b/pkg/log/log_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 763cd6bc2..02af75974 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/metric/metric.proto b/pkg/metric/metric.proto index 6108cb7c0..917fda1ac 100644 --- a/pkg/metric/metric.proto +++ b/pkg/metric/metric.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go index 7d156e4a5..40034a589 100644 --- a/pkg/metric/metric_test.go +++ b/pkg/metric/metric_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/buffer.go b/pkg/p9/buffer.go index fc65d2c5f..9575ddf12 100644 --- a/pkg/p9/buffer.go +++ b/pkg/p9/buffer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 5fa231bc5..3ebfab82a 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index a46efd27f..066639fda 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client_test.go b/pkg/p9/client_test.go index 06302a76a..f7145452d 100644 --- a/pkg/p9/client_test.go +++ b/pkg/p9/client_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/file.go b/pkg/p9/file.go index 9723fa24d..d2e89e373 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index ea41f97c7..959dff31d 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index cef3701a7..1e6aaa762 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index b3d76801b..972c37344 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index f353755f1..dfb41bb76 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index c6899c3ce..3b0993ecd 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9_test.go b/pkg/p9/p9_test.go index a50ac80a4..02498346c 100644 --- a/pkg/p9/p9_test.go +++ b/pkg/p9/p9_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 34ddccd8b..db562b9ba 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9test/mocks.go b/pkg/p9/p9test/mocks.go index 9d039ac63..9a8c14975 100644 --- a/pkg/p9/p9test/mocks.go +++ b/pkg/p9/p9test/mocks.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go index 9a508b898..34ed898e8 100644 --- a/pkg/p9/pool.go +++ b/pkg/p9/pool.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/pool_test.go b/pkg/p9/pool_test.go index 96be2c8bd..71052d8c4 100644 --- a/pkg/p9/pool_test.go +++ b/pkg/p9/pool_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 28a273ac6..5c7cb18c8 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index b5df29961..97396806c 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go index d6d4b6365..3352a5205 100644 --- a/pkg/p9/transport_test.go +++ b/pkg/p9/transport_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/version.go b/pkg/p9/version.go index 8783eaa7e..ceb6fabbf 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/version_test.go b/pkg/p9/version_test.go index 634ac3ca5..c053614c9 100644 --- a/pkg/p9/version_test.go +++ b/pkg/p9/version_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/rand/rand.go b/pkg/rand/rand.go index e81f0f5db..593a14380 100644 --- a/pkg/rand/rand.go +++ b/pkg/rand/rand.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go index a2be66b3b..7ebe8f3b0 100644 --- a/pkg/rand/rand_linux.go +++ b/pkg/rand/rand_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 638a93bab..8f08c74c7 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter_state.go b/pkg/refs/refcounter_state.go index 093eae785..136f06fbf 100644 --- a/pkg/refs/refcounter_state.go +++ b/pkg/refs/refcounter_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index cc11bcd71..abaa87453 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index a746dc9b3..1dfbf749e 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index 6b707f195..a9278c64b 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 0188ad4f3..226f30b7b 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index 4f2ae4dac..007038273 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index ae18534bf..dd009221a 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/secio/full_reader.go b/pkg/secio/full_reader.go index b2dbb8615..90b1772a7 100644 --- a/pkg/secio/full_reader.go +++ b/pkg/secio/full_reader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/secio.go b/pkg/secio/secio.go index fc625efb8..e5f74a497 100644 --- a/pkg/secio/secio.go +++ b/pkg/secio/secio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/secio_test.go b/pkg/secio/secio_test.go index 64b4cc17d..8304c4f74 100644 --- a/pkg/secio/secio_test.go +++ b/pkg/secio/secio_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/range.go b/pkg/segment/range.go index 34c067265..057bcd7ff 100644 --- a/pkg/segment/range.go +++ b/pkg/segment/range.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/set.go b/pkg/segment/set.go index cffec2a2c..a9a3b8875 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/segment/set_state.go b/pkg/segment/set_state.go index a763d1915..b86e1b75f 100644 --- a/pkg/segment/set_state.go +++ b/pkg/segment/set_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go index 7ea24b177..0825105db 100644 --- a/pkg/segment/test/segment_test.go +++ b/pkg/segment/test/segment_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go index 37c196ea1..05ba5fbb9 100644 --- a/pkg/segment/test/set_functions.go +++ b/pkg/segment/test/set_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/aligned.go b/pkg/sentry/arch/aligned.go index 193232e27..c88c034f6 100644 --- a/pkg/sentry/arch/aligned.go +++ b/pkg/sentry/arch/aligned.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 21cb84502..575b7ba66 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 5ba6c19ea..bb80a7bed 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s index 10d621b6d..fa9857df7 100644 --- a/pkg/sentry/arch/arch_amd64.s +++ b/pkg/sentry/arch/arch_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index e9c23a06b..604bd08a6 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index b35eec53c..59bf89d99 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 81cfb4a01..5df65a691 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto index 437ff44ca..f4c2f7043 100644 --- a/pkg/sentry/arch/registers.proto +++ b/pkg/sentry/arch/registers.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go index 36437b965..ad098c746 100644 --- a/pkg/sentry/arch/signal_act.go +++ b/pkg/sentry/arch/signal_act.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 9ca4c8ed1..f7f054b0b 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_info.go b/pkg/sentry/arch/signal_info.go index ec004ae75..fa0ecbec5 100644 --- a/pkg/sentry/arch/signal_info.go +++ b/pkg/sentry/arch/signal_info.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index ba43dd1d4..c02ae3b7c 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 6c1b9be82..716a3574d 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go index 41d8ba0d1..47c31d4b9 100644 --- a/pkg/sentry/arch/syscalls_amd64.go +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index 598c5b4ff..12bdcef85 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index b3c6a566b..d2f084ed7 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/control/control.go b/pkg/sentry/control/control.go index a6ee6e649..32d30b6ea 100644 --- a/pkg/sentry/control/control.go +++ b/pkg/sentry/control/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 0ba730c1e..b6ac2f312 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index 22c826236..5d52cd829 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index cee4db636..0a480c84a 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index 21fee8f8a..27e4eb258 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/device/device_test.go b/pkg/sentry/device/device_test.go index dfec45046..5d8805c2f 100644 --- a/pkg/sentry/device/device_test.go +++ b/pkg/sentry/device/device_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go index ddc2c0985..743cf511f 100644 --- a/pkg/sentry/fs/anon/anon.go +++ b/pkg/sentry/fs/anon/anon.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/anon/device.go b/pkg/sentry/fs/anon/device.go index 1c666729c..2d1249299 100644 --- a/pkg/sentry/fs/anon/device.go +++ b/pkg/sentry/fs/anon/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index bfd7f2762..5372875ac 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index d0986fa11..962da141b 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go index ecba395a0..7c997f533 100644 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ b/pkg/sentry/fs/ashmem/pin_board.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/pin_board_test.go b/pkg/sentry/fs/ashmem/pin_board_test.go index f4ea5de6d..736e628dc 100644 --- a/pkg/sentry/fs/ashmem/pin_board_test.go +++ b/pkg/sentry/fs/ashmem/pin_board_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 091f4ac63..59e060e3c 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 502a262dd..42b9e8b26 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index da46ad77f..1775d3486 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index 8c949b176..d65dc74bf 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index c3c9d963d..64f030f72 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index b347468ff..ef6d1a870 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 3f4f2a40a..05a5005ad 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/device.go b/pkg/sentry/fs/dev/device.go index 9d935e008..3cecdf6e2 100644 --- a/pkg/sentry/fs/dev/device.go +++ b/pkg/sentry/fs/dev/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 2ae49be4e..d96f4f423 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 492b8eb3a..eeda646ab 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 2977c8670..68090f353 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 47b76218f..33e4913e4 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 27fea0019..2c01485a8 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index c680e4828..502b0a09b 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go index 82b7f6bd5..5d0e9d91c 100644 --- a/pkg/sentry/fs/dirent_cache_test.go +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index f9dcba316..325404e27 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index 04ab197b9..5cf151dab 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 2e34604e6..bfafff5ec 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index 945cfaf08..92ab6ff0e 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 83f6c1986..69516e048 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index 99c40d8ed..4395666ad 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 6cd314f5b..d3f15be6b 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 36794d378..d6752ed1b 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index d223bb5c7..28e8e233d 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 41e646ee8..9b958b64b 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 830458ff9..11e4f7203 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_state.go b/pkg/sentry/fs/file_state.go index f848d1b79..1c3bae3e8 100644 --- a/pkg/sentry/fs/file_state.go +++ b/pkg/sentry/fs/file_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/file_test.go b/pkg/sentry/fs/file_test.go index 18aee7101..f3ed9a70b 100644 --- a/pkg/sentry/fs/file_test.go +++ b/pkg/sentry/fs/file_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index 5a1e7a270..ba8be85e4 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 1831aa82f..65ca196d9 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index 1aa271560..bf2a20b33 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 6ec9ff446..b5c72990e 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 8e31e48fd..5add16ac4 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go index f7693cb19..f5c9d9215 100644 --- a/pkg/sentry/fs/fsutil/dirty_set_test.go +++ b/pkg/sentry/fs/fsutil/dirty_set_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index d5881613b..46db2e51c 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index da6949ccb..dd7ab4b4a 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index 14dece315..b6e783614 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go index 6fe4ef13d..3d7f3732d 100644 --- a/pkg/sentry/fs/fsutil/fsutil.go +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go index e7efd3c0f..8920b72ee 100644 --- a/pkg/sentry/fs/fsutil/handle.go +++ b/pkg/sentry/fs/fsutil/handle.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/handle_test.go b/pkg/sentry/fs/fsutil/handle_test.go index d94c3eb0d..43e1a3bdf 100644 --- a/pkg/sentry/fs/fsutil/handle_test.go +++ b/pkg/sentry/fs/fsutil/handle_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 9c1e2f76f..9599665f0 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go index 57705decd..bbd15b30b 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_state.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go index 790f3a5a6..86df76822 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 3acc32752..d4db1c2de 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 6777c8bf7..b0af44ddd 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 996c91849..e388ec3d7 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 5e24767f9..98700d014 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 98f43c578..3d380f0e8 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index d4b6f6eb7..a0265c2aa 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/device.go b/pkg/sentry/fs/gofer/device.go index fac7306d4..52c5acf48 100644 --- a/pkg/sentry/fs/gofer/device.go +++ b/pkg/sentry/fs/gofer/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index c4a210656..6d961813d 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index 715af8f16..dd4f817bf 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 3ae93f059..ed30cb1f1 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index c8d7bd773..3190d1e18 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index a3e52aad6..f32e99ce0 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 7fc8f77b0..5811b8b12 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index ad11034f9..ad4d3df58 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 0bf7881da..a324dc990 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 4e2293398..7552216f3 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 8e6424492..f657135fc 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index d072da624..76ce58810 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go index d9ed8c81e..1a759370d 100644 --- a/pkg/sentry/fs/gofer/util.go +++ b/pkg/sentry/fs/gofer/util.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index d2e34a69d..0753640a2 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 148291ba6..7c9d2b299 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go index 7fb274451..530c0109f 100644 --- a/pkg/sentry/fs/host/descriptor_state.go +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go index f393a8b54..6bc1bd2ae 100644 --- a/pkg/sentry/fs/host/descriptor_test.go +++ b/pkg/sentry/fs/host/descriptor_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go index f2a0b6b15..b5adedf44 100644 --- a/pkg/sentry/fs/host/device.go +++ b/pkg/sentry/fs/host/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 22a5d9f12..975084c86 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index e46ae433c..fec890964 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index b08125ca8..e69559aac 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index e32497203..08754bd6b 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index 8bc99d94b..b7c1a9581 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index 0ff87c418..9f1561bd5 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index bc965a1c2..175dca613 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 0eb267c00..af53bf533 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index 1a9587b90..d4ce4a8c1 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go index 7fa500bfb..2932c1f16 100644 --- a/pkg/sentry/fs/host/socket_state.go +++ b/pkg/sentry/fs/host/socket_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 483e99dd6..e9a88b124 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index 5e4c5feed..f35e2492d 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index ad1323610..cf3639c46 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index 74c703eb7..40c450660 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go index 2ecb54319..d00da89d6 100644 --- a/pkg/sentry/fs/host/util_unsafe.go +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index c5f5c9c0d..9ca8c399f 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 409c81a97..95769ccf8 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index 683140afe..e213df924 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 3ee3de10e..77973ce79 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index cf698a4da..78923fb5b 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 23e5635a4..bba20da14 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 2aabdded8..f251df0d1 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index e9b5e0f56..9e3e9d816 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 3e1959e83..b83544c9f 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 439e645db..5ff800d2d 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_range_test.go b/pkg/sentry/fs/lock/lock_range_test.go index 06a37c701..b0ab882b9 100644 --- a/pkg/sentry/fs/lock/lock_range_test.go +++ b/pkg/sentry/fs/lock/lock_range_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index e16f485be..395592a4b 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index c60f5f7a2..67fa4b1dd 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 846b6e8bb..6bfcda6bb 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 8345876fc..24e28ddb2 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index dbc608c7e..fb91635bc 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_state.go b/pkg/sentry/fs/mount_state.go index f5ed1dd8d..6344d5160 100644 --- a/pkg/sentry/fs/mount_state.go +++ b/pkg/sentry/fs/mount_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 968b435ab..a1c9f4f79 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index c0a803b2d..7c5348cce 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index 8669f3a38..cc7c32c9b 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/offset.go b/pkg/sentry/fs/offset.go index 7cc8398e6..38aee765a 100644 --- a/pkg/sentry/fs/offset.go +++ b/pkg/sentry/fs/offset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 5a30af419..036c0f733 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/path.go b/pkg/sentry/fs/path.go index b74f6ed8c..91a9a8ffd 100644 --- a/pkg/sentry/fs/path.go +++ b/pkg/sentry/fs/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/path_test.go b/pkg/sentry/fs/path_test.go index 7ab070855..391b010a7 100644 --- a/pkg/sentry/fs/path_test.go +++ b/pkg/sentry/fs/path_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index 4dfec03a4..f8be06dc3 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/device/device.go b/pkg/sentry/fs/proc/device/device.go index 6194afe88..04b687bcf 100644 --- a/pkg/sentry/fs/proc/device/device.go +++ b/pkg/sentry/fs/proc/device/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index a69cbaa0e..b4896053f 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index dada8f982..5ebb33703 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go index 4b3448245..f659e590a 100644 --- a/pkg/sentry/fs/proc/file.go +++ b/pkg/sentry/fs/proc/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index 49b92fd8a..c050a00be 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 061824b8c..63f737ff4 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 6fac251d2..78f3a1dc0 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 53dfd59ef..b31258eed 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 81dcc153a..0b0e87528 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 8cd6fe9d3..45f2a1211 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/net_test.go b/pkg/sentry/fs/proc/net_test.go index a31a20494..94677cc1d 100644 --- a/pkg/sentry/fs/proc/net_test.go +++ b/pkg/sentry/fs/proc/net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 07029a7bb..33030bebf 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index 50d0271f9..d025069df 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 51cae5e37..0499ba65b 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index d90e3e736..f9a2ca38e 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index bf7650211..f2bbef375 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 384b4ffe1..54562508d 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index beb25be20..801eb6a1e 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index 7ba392346..0ce9d30f1 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 748ca4320..404faea0a 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index a7e4cf0a6..f70399686 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index f3a9b81df..80c7ce0b4 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index 00f6a2afd..b6d49d5e9 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 075e13b01..0a911b155 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/file.go b/pkg/sentry/fs/ramfs/file.go index 0b94d92a1..b7fc98ffc 100644 --- a/pkg/sentry/fs/ramfs/file.go +++ b/pkg/sentry/fs/ramfs/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go index 83cbcab23..d77688a34 100644 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 9ac00eb18..8c81478c8 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 1c54d9991..a21fac2c7 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/test/test.go b/pkg/sentry/fs/ramfs/test/test.go index fb669558f..11bff7729 100644 --- a/pkg/sentry/fs/ramfs/test/test.go +++ b/pkg/sentry/fs/ramfs/test/test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index 1fb335f74..29a70f698 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index 68e2929d5..d5567d9e1 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go index b4ac85a27..da2df7e1d 100644 --- a/pkg/sentry/fs/restore.go +++ b/pkg/sentry/fs/restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/save.go b/pkg/sentry/fs/save.go index bf2a85143..90988d385 100644 --- a/pkg/sentry/fs/save.go +++ b/pkg/sentry/fs/save.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/seek.go b/pkg/sentry/fs/seek.go index 1268726c2..72f3fb632 100644 --- a/pkg/sentry/fs/seek.go +++ b/pkg/sentry/fs/seek.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sync.go b/pkg/sentry/fs/sync.go index 9738a8f22..6dcc2fe8d 100644 --- a/pkg/sentry/fs/sync.go +++ b/pkg/sentry/fs/sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/sys/device.go b/pkg/sentry/fs/sys/device.go index 54e414d1b..38ecd0c18 100644 --- a/pkg/sentry/fs/sys/device.go +++ b/pkg/sentry/fs/sys/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index 2cf3a6f98..e64aa0edc 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 625525540..5ce33f87f 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index 7b9697668..7cc1942c7 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 767db95a0..7423e816c 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/tmpfs/device.go b/pkg/sentry/fs/tmpfs/device.go index e588b3440..aade93c26 100644 --- a/pkg/sentry/fs/tmpfs/device.go +++ b/pkg/sentry/fs/tmpfs/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 342688f81..1f9d69909 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index f064eb1ac..b5830d3df 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index ca620e65e..7c91e248b 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 1e4fe47d2..42a7d7b9c 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 38be6db46..91b782540 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 7c0c0b0c1..e32b05c1d 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index d9f8f02f3..0c412eb21 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go index c0fa2b407..d5d1caafc 100644 --- a/pkg/sentry/fs/tty/inode.go +++ b/pkg/sentry/fs/tty/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 31804571e..484366f85 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index ae7540eff..dad0cad79 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 01dc8d1ac..a09ca0119 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 4a0d4fdb9..9de3168bf 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 3cb135124..79f9d76d7 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index 32e1b1556..ad535838f 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/getcpu_amd64.s b/pkg/sentry/hostcpu/getcpu_amd64.s index 7f6247d81..409db1450 100644 --- a/pkg/sentry/hostcpu/getcpu_amd64.s +++ b/pkg/sentry/hostcpu/getcpu_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/hostcpu.go b/pkg/sentry/hostcpu/hostcpu.go index fa46499ad..3adc847bb 100644 --- a/pkg/sentry/hostcpu/hostcpu.go +++ b/pkg/sentry/hostcpu/hostcpu.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/hostcpu_test.go b/pkg/sentry/hostcpu/hostcpu_test.go index a82e1a271..38de0e1f6 100644 --- a/pkg/sentry/hostcpu/hostcpu_test.go +++ b/pkg/sentry/hostcpu/hostcpu_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/context.go b/pkg/sentry/inet/context.go index 370381f41..d05e96f15 100644 --- a/pkg/sentry/inet/context.go +++ b/pkg/sentry/inet/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 30ca4e0c0..8206377cc 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index bc10926ee..05c1a1792 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 45088c988..1ea2cee36 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go index c49a6b852..19f15fd36 100644 --- a/pkg/sentry/kernel/auth/auth.go +++ b/pkg/sentry/kernel/auth/auth.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go index 5b8164c49..88d6243aa 100644 --- a/pkg/sentry/kernel/auth/capability_set.go +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index 914589b28..f7e945599 100644 --- a/pkg/sentry/kernel/auth/context.go +++ b/pkg/sentry/kernel/auth/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index f18f7dac9..de33f1953 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go index 37522b018..e5bed44d7 100644 --- a/pkg/sentry/kernel/auth/id.go +++ b/pkg/sentry/kernel/auth/id.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index bd0090e0f..43f439825 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go index 889291d96..8f1a189ec 100644 --- a/pkg/sentry/kernel/auth/id_map_functions.go +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index d359f3f31..5bb9c44c0 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index 261ca6f7a..b629521eb 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index a8eb114c0..9c13ecfcc 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index dabb32f49..7f3e2004a 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index bc869fc13..d89c1b745 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index a4ada0e78..26dc59a85 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go index 71326b62f..14e8996d9 100644 --- a/pkg/sentry/kernel/eventfd/eventfd_test.go +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index f77339cae..aa4aac109 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index cad0b0a20..715f4714d 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go index 95123aef3..b49996137 100644 --- a/pkg/sentry/kernel/fd_map_test.go +++ b/pkg/sentry/kernel/fd_map_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index f3f05e8f5..3cf0db280 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 54b1982a0..ea69d433b 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 726c26990..ea506a29b 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 5eef49f59..9ceb9bd92 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go index bbb476544..8eafe810b 100644 --- a/pkg/sentry/kernel/kdefs/kdefs.go +++ b/pkg/sentry/kernel/kdefs/kdefs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 5d6856f3c..bad558d48 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go index bb2d5102d..a0a69b498 100644 --- a/pkg/sentry/kernel/kernel_state.go +++ b/pkg/sentry/kernel/kernel_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index f7a183a1d..f05ef1b64 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto index abc565054..43b8deb76 100644 --- a/pkg/sentry/kernel/memevent/memory_events.proto +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index bb5db0309..373e11772 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go index 6d90ed033..72be6702f 100644 --- a/pkg/sentry/kernel/pending_signals_state.go +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go index a82e45c3f..fa8045910 100644 --- a/pkg/sentry/kernel/pipe/buffers.go +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go index 8d383577a..eec5c5de8 100644 --- a/pkg/sentry/kernel/pipe/device.go +++ b/pkg/sentry/kernel/pipe/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 23d692da1..4b0e00b85 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index cc1ebf4f6..eda551594 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index ced2559a7..126054826 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index 49ef8c8ac..3b9895927 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 1fa5e9a32..f27379969 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 82607367b..63efc5bbe 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index d93324b53..6fea9769c 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index 0ab958529..40b5acca3 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 9fe28f435..20bac2b70 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 1f3de58e3..46b03c700 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go index 0a97603f0..69aee9127 100644 --- a/pkg/sentry/kernel/sched/cpuset.go +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go index 8a6e12958..a036ed513 100644 --- a/pkg/sentry/kernel/sched/cpuset_test.go +++ b/pkg/sentry/kernel/sched/cpuset_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go index f1de1da60..e59909baf 100644 --- a/pkg/sentry/kernel/sched/sched.go +++ b/pkg/sentry/kernel/sched/sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index d77c05e2f..37dd3e4c9 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index aa07946cf..232a276dc 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index f9eb382e9..5f886bf31 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index a9b4e7647..78a5b4063 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go index b0dacdbe0..bbc653ed8 100644 --- a/pkg/sentry/kernel/shm/device.go +++ b/pkg/sentry/kernel/shm/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 77973951e..8d0d14e45 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index e3a2a777a..b066df132 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 3649f5e4d..3f1ac9898 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 4c7811b6c..19b711e9c 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 826809a70..981455d46 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 6531bd5d2..2aecf3eea 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go index 71ca75555..3b29d3c6a 100644 --- a/pkg/sentry/kernel/table_test.go +++ b/pkg/sentry/kernel/table_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 4f0b7fe3f..e22ec768d 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index d2052921e..24230af89 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 6dc7b938e..e5027e551 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index de3aef40d..755fe0370 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index d2df7e9d1..45b8d2b04 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 1b760aba4..a9b74da8e 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 65969ca9b..44fbb487c 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 62ebbcb0d..5a11ca3df 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index b0921b2eb..8f90ed786 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 1769da210..f4c881c2d 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go index 4df2e53d3..fc7cefc1f 100644 --- a/pkg/sentry/kernel/task_net.go +++ b/pkg/sentry/kernel/task_net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 49ac933b7..596b9aa16 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 19dcc963a..3b3cdc24a 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index e2925a708..fe24f7542 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 6c8d7d316..c82a32c78 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index feaf6cae4..36846484c 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index f0373c375..0318adb35 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go index 82ef858a1..3f37f505d 100644 --- a/pkg/sentry/kernel/task_test.go +++ b/pkg/sentry/kernel/task_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 2b4954869..c8e973bd5 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index dfff7b52d..d7652f57c 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 4e3d19e97..bdb907905 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go index ac4dc01d8..3675ea20d 100644 --- a/pkg/sentry/kernel/time/context.go +++ b/pkg/sentry/kernel/time/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 52e0dfba1..ca0f4ba2e 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 2167f3efe..6bff80f13 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go index 2e7fed4d8..f3a3ed543 100644 --- a/pkg/sentry/kernel/timekeeper_state.go +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 34a5cec27..71674c21c 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index 7e0fe0d21..ed5f0c031 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 971e8bc59..0ec858a4a 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go index a9e84673f..72bb0f93c 100644 --- a/pkg/sentry/kernel/version.go +++ b/pkg/sentry/kernel/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go index 75e97bf92..bf413eb7d 100644 --- a/pkg/sentry/limits/context.go +++ b/pkg/sentry/limits/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index 02c8b60e3..ba0b7d4fd 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/limits_test.go b/pkg/sentry/limits/limits_test.go index dd6f80750..d41f62554 100644 --- a/pkg/sentry/limits/limits_test.go +++ b/pkg/sentry/limits/limits_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 8e6a24341..511db6733 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 849be5a3d..9b1e81dc9 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 54534952b..06a3c7156 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 62b39e52b..d1417c4f1 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index a06e27ac9..437cc5da1 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go index dc71e1c2d..b327f0e1e 100644 --- a/pkg/sentry/loader/vdso_state.go +++ b/pkg/sentry/loader/vdso_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index c9483905d..33cf16f91 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go index 10668d404..49ee34548 100644 --- a/pkg/sentry/memmap/mapping_set_test.go +++ b/pkg/sentry/memmap/mapping_set_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index cdc5f2b27..05349a77f 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memutil/memutil.go b/pkg/sentry/memutil/memutil.go index 4f245cf3c..286d50ca4 100644 --- a/pkg/sentry/memutil/memutil.go +++ b/pkg/sentry/memutil/memutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go index 32c27eb2f..8d9fc64fb 100644 --- a/pkg/sentry/memutil/memutil_unsafe.go +++ b/pkg/sentry/memutil/memutil_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 27554f163..7488f7c4a 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index b42156d45..87942af0e 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go index 1a5e56f8e..192a6f744 100644 --- a/pkg/sentry/mm/aio_context_state.go +++ b/pkg/sentry/mm/aio_context_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go index 56d0490f0..d341b9c07 100644 --- a/pkg/sentry/mm/debug.go +++ b/pkg/sentry/mm/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index 6741db594..6600ddd78 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a4b5cb443..b248b76e7 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 32d5e2ff6..5ef1ba0b1 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 3299ae164..aab697f9e 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index b47aa7263..f2db43196 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 9febb25ac..5690fe6b4 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/mm/proc_pid_maps.go b/pkg/sentry/mm/proc_pid_maps.go index 5840b257c..0bf1cdb51 100644 --- a/pkg/sentry/mm/proc_pid_maps.go +++ b/pkg/sentry/mm/proc_pid_maps.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go index 36fed8f1c..6e7080a84 100644 --- a/pkg/sentry/mm/save_restore.go +++ b/pkg/sentry/mm/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go index bab137a5a..3bc48c7e7 100644 --- a/pkg/sentry/mm/shm.go +++ b/pkg/sentry/mm/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 5d7bd33bd..e511472f4 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index b0622b0c3..a721cc456 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index b81e861f1..dafdbd0e4 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go index 0d200a5e2..cca21a23e 100644 --- a/pkg/sentry/platform/context.go +++ b/pkg/sentry/platform/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index f278c8d63..97da31e70 100644 --- a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/filemem/filemem_state.go b/pkg/sentry/platform/filemem/filemem_state.go index e28e021c9..964e2aaaa 100644 --- a/pkg/sentry/platform/filemem/filemem_state.go +++ b/pkg/sentry/platform/filemem/filemem_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/filemem/filemem_test.go b/pkg/sentry/platform/filemem/filemem_test.go index 4b165dc48..9becec25f 100644 --- a/pkg/sentry/platform/filemem/filemem_test.go +++ b/pkg/sentry/platform/filemem/filemem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/filemem/filemem_unsafe.go b/pkg/sentry/platform/filemem/filemem_unsafe.go index a23b9825a..776aed74d 100644 --- a/pkg/sentry/platform/filemem/filemem_unsafe.go +++ b/pkg/sentry/platform/filemem/filemem_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go index ca4f42087..9c83f41eb 100644 --- a/pkg/sentry/platform/interrupt/interrupt.go +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/interrupt/interrupt_test.go b/pkg/sentry/platform/interrupt/interrupt_test.go index 7c49eeea6..fb3284395 100644 --- a/pkg/sentry/platform/interrupt/interrupt_test.go +++ b/pkg/sentry/platform/interrupt/interrupt_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index c4293c517..72e897a9a 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go index f5cebd5b3..b25cad155 100644 --- a/pkg/sentry/platform/kvm/allocator.go +++ b/pkg/sentry/platform/kvm/allocator.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index ecc33d7dd..9f1c9510b 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index b364e3ef7..f013d1dc9 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 0881bd5f5..ec017f6c2 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 61ca61dcb..cd00a47f2 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 8650cd78f..e79a30ef2 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 216d4b4b6..747a95997 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index aac84febf..be902be88 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/host_map.go b/pkg/sentry/platform/kvm/host_map.go index fc16ad2de..ee6a1a42d 100644 --- a/pkg/sentry/platform/kvm/host_map.go +++ b/pkg/sentry/platform/kvm/host_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 0c4dff308..d4f50024d 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index 3d56ed895..70d0ac63b 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index 476e783a0..c0a0af92d 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index ca44c31b3..8c53c6f06 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 52448839f..45eeb96ff 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 9f60b6b31..fc7ad258f 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index bcd29a947..e0aec42b8 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 8b9041f13..50e513f3b 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 86323c891..4f5b01321 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 81a98656d..b908cae6a 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go index 8a614e25d..0d496561d 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index 39286a0af..fcba33813 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s index 3b5ad8817..f1da41a44 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go index 0d3fbe043..0343e9267 100644 --- a/pkg/sentry/platform/kvm/virtual_map.go +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go index 7875bd3e9..935e0eb93 100644 --- a/pkg/sentry/platform/kvm/virtual_map_test.go +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go index 6398e5e01..1bcc1f8e9 100644 --- a/pkg/sentry/platform/mmap_min_addr.go +++ b/pkg/sentry/platform/mmap_min_addr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 8a1620d93..f16588e6e 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/sentry/platform/procid/procid.go index 5f861908f..3f49ab093 100644 --- a/pkg/sentry/platform/procid/procid.go +++ b/pkg/sentry/platform/procid/procid.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s index 5b1ba1f24..fd88ce82e 100644 --- a/pkg/sentry/platform/procid/procid_amd64.s +++ b/pkg/sentry/platform/procid/procid_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_net_test.go b/pkg/sentry/platform/procid/procid_net_test.go index 2d1605a08..e8dcc479d 100644 --- a/pkg/sentry/platform/procid/procid_net_test.go +++ b/pkg/sentry/platform/procid/procid_net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/procid/procid_test.go b/pkg/sentry/platform/procid/procid_test.go index 5e44da36f..7a57c7cdc 100644 --- a/pkg/sentry/platform/procid/procid_test.go +++ b/pkg/sentry/platform/procid/procid_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 4f20716f7..00d92b092 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 46a8bda8e..7a3cb8f49 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s index 9bf87b6f6..63f98e40d 100644 --- a/pkg/sentry/platform/ptrace/stub_amd64.s +++ b/pkg/sentry/platform/ptrace/stub_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go index c868a2d68..48c16c4a1 100644 --- a/pkg/sentry/platform/ptrace/stub_unsafe.go +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 6d5ad6b71..6a9da5db8 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index c38dc1ff8..d23a1133e 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 53adadadd..7523487e7 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go index 697431472..0c9263060 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index fe41641d3..ca6c4ac97 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index f09d045eb..18137e55d 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 84819f132..67242b92b 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index a3e992e0d..4a9affe64 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index 08c15ad65..d48fbd2d1 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go index ffa7eaf77..11c49855f 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/main.go +++ b/pkg/sentry/platform/ring0/gen_offsets/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index 62e67005e..e70eafde2 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 0d2b0f7dc..ab562bca7 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go index cfb3ad853..faf4240e5 100644 --- a/pkg/sentry/platform/ring0/kernel_unsafe.go +++ b/pkg/sentry/platform/ring0/kernel_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index 989e3e383..2b95a0141 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s index 6f143ea5a..98a130525 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index ca5fd456b..753d31ef8 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go index 049fd0247..ee6e90a11 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go index aca778913..f48647b3a 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index ff5787f89..c7207ec18 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index 878463018..746f614e5 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go index a7f2ad9a4..2f82c4353 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index dca3f69ef..3e5dc7dc7 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index ca49d20f8..6bd8c3584 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index fa068e35e..0d9a51aa5 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go index afa4d473a..c4c71d23e 100644 --- a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go index 4991031c5..10c51e88d 100644 --- a/pkg/sentry/platform/ring0/ring0.go +++ b/pkg/sentry/platform/ring0/ring0.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index f489fcecb..7c88010d8 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s index 69947dec3..873ffa046 100644 --- a/pkg/sentry/platform/safecopy/atomic_amd64.s +++ b/pkg/sentry/platform/safecopy/atomic_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/memclr_amd64.s b/pkg/sentry/platform/safecopy/memclr_amd64.s index 7d1019f60..488b6e666 100644 --- a/pkg/sentry/platform/safecopy/memclr_amd64.s +++ b/pkg/sentry/platform/safecopy/memclr_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/memcpy_amd64.s b/pkg/sentry/platform/safecopy/memcpy_amd64.s index 96ef2eefc..0bf26fd7b 100644 --- a/pkg/sentry/platform/safecopy/memcpy_amd64.s +++ b/pkg/sentry/platform/safecopy/memcpy_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go index 90a2aad7b..c60f73103 100644 --- a/pkg/sentry/platform/safecopy/safecopy.go +++ b/pkg/sentry/platform/safecopy/safecopy.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/safecopy/safecopy_test.go b/pkg/sentry/platform/safecopy/safecopy_test.go index 67df36121..1a682d28a 100644 --- a/pkg/sentry/platform/safecopy/safecopy_test.go +++ b/pkg/sentry/platform/safecopy/safecopy_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go index 72f243f8d..df1c35b66 100644 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s index a65cb0c26..06614f1b4 100644 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go index 0b58f6497..e91ff66ae 100644 --- a/pkg/sentry/safemem/block_unsafe.go +++ b/pkg/sentry/safemem/block_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go index fd917648b..6cb52439f 100644 --- a/pkg/sentry/safemem/io.go +++ b/pkg/sentry/safemem/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/io_test.go b/pkg/sentry/safemem/io_test.go index edac4c1d7..2eda8c3bb 100644 --- a/pkg/sentry/safemem/io_test.go +++ b/pkg/sentry/safemem/io_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/safemem.go b/pkg/sentry/safemem/safemem.go index 2f8002004..090932d3e 100644 --- a/pkg/sentry/safemem/safemem.go +++ b/pkg/sentry/safemem/safemem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/seq_test.go b/pkg/sentry/safemem/seq_test.go index 3e83b3851..fddcaf714 100644 --- a/pkg/sentry/safemem/seq_test.go +++ b/pkg/sentry/safemem/seq_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/seq_unsafe.go b/pkg/sentry/safemem/seq_unsafe.go index e0d29a0b3..83a6b7183 100644 --- a/pkg/sentry/safemem/seq_unsafe.go +++ b/pkg/sentry/safemem/seq_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 29bcf55ab..6b5d5f993 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index a455b919f..5913d47a8 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index db97e95f2..d44f5e88a 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/device.go b/pkg/sentry/socket/epsocket/device.go index 17f2c9559..3cc138eb0 100644 --- a/pkg/sentry/socket/epsocket/device.go +++ b/pkg/sentry/socket/epsocket/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 47c575e7b..e90ef4835 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index dbc232d26..686554437 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/save_restore.go b/pkg/sentry/socket/epsocket/save_restore.go index 2613f90de..34d9a7cf0 100644 --- a/pkg/sentry/socket/epsocket/save_restore.go +++ b/pkg/sentry/socket/epsocket/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index e4ed52fc8..c0081c819 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/device.go b/pkg/sentry/socket/hostinet/device.go index a9a673316..c5133f3bb 100644 --- a/pkg/sentry/socket/hostinet/device.go +++ b/pkg/sentry/socket/hostinet/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/hostinet/hostinet.go b/pkg/sentry/socket/hostinet/hostinet.go index 67c6c8066..7858892ab 100644 --- a/pkg/sentry/socket/hostinet/hostinet.go +++ b/pkg/sentry/socket/hostinet/hostinet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/save_restore.go b/pkg/sentry/socket/hostinet/save_restore.go index 0821a794a..3827f082a 100644 --- a/pkg/sentry/socket/hostinet/save_restore.go +++ b/pkg/sentry/socket/hostinet/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index e82624b44..e4e950fbb 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index f8bb75636..59c8910ca 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index f64809d39..4ce73c1f1 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go index b902d7ec9..a95172cba 100644 --- a/pkg/sentry/socket/netlink/message.go +++ b/pkg/sentry/socket/netlink/message.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index 1c5d4c3a5..20b9a6e37 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/port/port_test.go b/pkg/sentry/socket/netlink/port/port_test.go index 34565e2f9..49b3b48ab 100644 --- a/pkg/sentry/socket/netlink/port/port_test.go +++ b/pkg/sentry/socket/netlink/port/port_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 5d0a04a07..06786bd50 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 70322b9ed..7e70b09b2 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 0c03997f2..4d4130a4c 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index f4c8489b1..9c749b888 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/device.go b/pkg/sentry/socket/rpcinet/device.go index f7b63436e..d2b9f9222 100644 --- a/pkg/sentry/socket/rpcinet/device.go +++ b/pkg/sentry/socket/rpcinet/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index f88a908ed..73c255c33 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/rpcinet.go b/pkg/sentry/socket/rpcinet/rpcinet.go index 10b0dedc2..6c98e6acb 100644 --- a/pkg/sentry/socket/rpcinet/rpcinet.go +++ b/pkg/sentry/socket/rpcinet/rpcinet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index c7e761d54..44fa5c620 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go index bcb89fb34..cb8344ec6 100644 --- a/pkg/sentry/socket/rpcinet/stack.go +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/stack_unsafe.go b/pkg/sentry/socket/rpcinet/stack_unsafe.go index 9a896c623..d04fb2069 100644 --- a/pkg/sentry/socket/rpcinet/stack_unsafe.go +++ b/pkg/sentry/socket/rpcinet/stack_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 31f8d42d7..a235c5249 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/device.go b/pkg/sentry/socket/unix/device.go index e8bcc7a9f..41820dbb3 100644 --- a/pkg/sentry/socket/unix/device.go +++ b/pkg/sentry/socket/unix/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 06333e14b..7d6434696 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 566e3d57b..4c913effc 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/unix/transport/connectioned_state.go b/pkg/sentry/socket/unix/transport/connectioned_state.go index 7e6c73dcc..608a6a97a 100644 --- a/pkg/sentry/socket/unix/transport/connectioned_state.go +++ b/pkg/sentry/socket/unix/transport/connectioned_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 86cd05199..cd4633106 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index c4d7d863c..5b4dfab68 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 2934101a2..157133b65 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 668363864..3543dd81f 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 43e88a713..70b33f190 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index afa21672a..7f047b808 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state_unsafe.go b/pkg/sentry/state/state_unsafe.go index 3ff7d24c8..f02e12b2a 100644 --- a/pkg/sentry/state/state_unsafe.go +++ b/pkg/sentry/state/state_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/clone.go b/pkg/sentry/strace/clone.go index b82ca1ad1..e18ce84dc 100644 --- a/pkg/sentry/strace/clone.go +++ b/pkg/sentry/strace/clone.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/strace/futex.go b/pkg/sentry/strace/futex.go index 3da108cb7..ceb3dc21d 100644 --- a/pkg/sentry/strace/futex.go +++ b/pkg/sentry/strace/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 1df148e7d..99714f12c 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/open.go b/pkg/sentry/strace/open.go index 839d5eda7..5a72a940c 100644 --- a/pkg/sentry/strace/open.go +++ b/pkg/sentry/strace/open.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/ptrace.go b/pkg/sentry/strace/ptrace.go index fcdb7e9f4..c572aafb4 100644 --- a/pkg/sentry/strace/ptrace.go +++ b/pkg/sentry/strace/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 26831edd6..375418dc1 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index f7bfa3a1f..4286f0df7 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto index 914e8c7b0..f1fc539d6 100644 --- a/pkg/sentry/strace/strace.proto +++ b/pkg/sentry/strace/strace.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 8be4fa318..9eeb18a03 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index 01dd6fa71..b90d191b7 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 013b385bc..9fd002955 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index f01483cd3..d1e0833fc 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 4465549ad..75e87f5ec 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index bfb541634..a033b7c70 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 54e4afa9e..355071131 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go index 89c81ac90..cf972dc28 100644 --- a/pkg/sentry/syscalls/linux/sys_capability.go +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index e69dfc77a..62272efcd 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 60fe5a133..903172890 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 64704bb88..a70f35be0 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index d35dcecbe..cf04428bc 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 29c0d7a39..4b441b31b 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go index 4fd0ed794..8d594aa83 100644 --- a/pkg/sentry/syscalls/linux/sys_identity.go +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 725204dff..26a505782 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 97b51ba7c..ad3bfd761 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 1a98328dc..f8d9c43fd 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index 57cedccc1..bf0df7302 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 2b544f145..3652c429e 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index b9bdefadb..bf0958435 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index a1242acd3..c7b39ede8 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index be31e6b17..452dff058 100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 0be2d195a..b2e5a5449 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index d806b58ab..2f16e1791 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go index 82e42b589..ab07c77f9 100644 --- a/pkg/sentry/syscalls/linux/sys_rusage.go +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index ff9e46077..e679a6694 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 4323a4df4..969acaa36 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index a8983705b..4ed52c4a7 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 48ff1d5f0..b13d48b98 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index ecdec5d3a..a539354c5 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 5fa5ddce6..0a7551742 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 619a14d7c..9c433c45d 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 902d210db..826c6869d 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 6560bac57..5eeb3ba58 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go index 792040c81..7193b7aed 100644 --- a/pkg/sentry/syscalls/linux/sys_syslog.go +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 550f63a43..820ca680e 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 8e6683444..063fbb106 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index c41074d54..6baf4599b 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index 92c6a3d60..f70d13682 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index b95d62320..27ddb3808 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index 899116374..689f2f838 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index caa7b01ea..08e263112 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index e865c6fc0..752ec326d 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/polling.go b/pkg/sentry/syscalls/polling.go index fd90184ef..2b33d6c19 100644 --- a/pkg/sentry/syscalls/polling.go +++ b/pkg/sentry/syscalls/polling.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 1176f858d..bae32d727 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/unimplemented_syscall.proto b/pkg/sentry/syscalls/unimplemented_syscall.proto index d6febf5b1..41579b016 100644 --- a/pkg/sentry/syscalls/unimplemented_syscall.proto +++ b/pkg/sentry/syscalls/unimplemented_syscall.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index cbb95e2d7..c8cf4eca4 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/time/calibrated_clock_test.go b/pkg/sentry/time/calibrated_clock_test.go index 8b6dd5592..a9237630e 100644 --- a/pkg/sentry/time/calibrated_clock_test.go +++ b/pkg/sentry/time/calibrated_clock_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/clock_id.go b/pkg/sentry/time/clock_id.go index 500102e58..1317a5dad 100644 --- a/pkg/sentry/time/clock_id.go +++ b/pkg/sentry/time/clock_id.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/clocks.go b/pkg/sentry/time/clocks.go index 9925b407d..e26386520 100644 --- a/pkg/sentry/time/clocks.go +++ b/pkg/sentry/time/clocks.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/muldiv_amd64.s b/pkg/sentry/time/muldiv_amd64.s index 291940b1d..bfcb8c724 100644 --- a/pkg/sentry/time/muldiv_amd64.s +++ b/pkg/sentry/time/muldiv_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go index 594b4874b..f3ad58454 100644 --- a/pkg/sentry/time/parameters.go +++ b/pkg/sentry/time/parameters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go index 7394fc5ee..4a0c4e880 100644 --- a/pkg/sentry/time/parameters_test.go +++ b/pkg/sentry/time/parameters_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler.go b/pkg/sentry/time/sampler.go index cf581b5fa..445690d49 100644 --- a/pkg/sentry/time/sampler.go +++ b/pkg/sentry/time/sampler.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler_test.go b/pkg/sentry/time/sampler_test.go index caf7e5c53..ec0e442b6 100644 --- a/pkg/sentry/time/sampler_test.go +++ b/pkg/sentry/time/sampler_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler_unsafe.go b/pkg/sentry/time/sampler_unsafe.go index 7ea19d387..0f8eb4fc8 100644 --- a/pkg/sentry/time/sampler_unsafe.go +++ b/pkg/sentry/time/sampler_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/time/tsc_amd64.s b/pkg/sentry/time/tsc_amd64.s index 4cc604392..e53d477f7 100644 --- a/pkg/sentry/time/tsc_amd64.s +++ b/pkg/sentry/time/tsc_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go index e48fabc2d..399d98c29 100644 --- a/pkg/sentry/uniqueid/context.go +++ b/pkg/sentry/uniqueid/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/cpu.go b/pkg/sentry/usage/cpu.go index ed7b04b9e..cbd7cfe19 100644 --- a/pkg/sentry/usage/cpu.go +++ b/pkg/sentry/usage/cpu.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/io.go b/pkg/sentry/usage/io.go index 49faa507d..8e27a0a88 100644 --- a/pkg/sentry/usage/io.go +++ b/pkg/sentry/usage/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 92a478d85..7e065cb76 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/usage/memory_unsafe.go b/pkg/sentry/usage/memory_unsafe.go index f990a7750..a3ae668a5 100644 --- a/pkg/sentry/usage/memory_unsafe.go +++ b/pkg/sentry/usage/memory_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/usage.go b/pkg/sentry/usage/usage.go index 3b3118659..ab327f8e2 100644 --- a/pkg/sentry/usage/usage.go +++ b/pkg/sentry/usage/usage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go index 75346d854..c71d05afe 100644 --- a/pkg/sentry/usermem/access_type.go +++ b/pkg/sentry/usermem/access_type.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go index fc94bee80..2a75aa60c 100644 --- a/pkg/sentry/usermem/addr.go +++ b/pkg/sentry/usermem/addr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr_range_seq_test.go b/pkg/sentry/usermem/addr_range_seq_test.go index cf9d785ed..bd6a1ec8a 100644 --- a/pkg/sentry/usermem/addr_range_seq_test.go +++ b/pkg/sentry/usermem/addr_range_seq_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr_range_seq_unsafe.go b/pkg/sentry/usermem/addr_range_seq_unsafe.go index 13b2998b3..f5fd446fa 100644 --- a/pkg/sentry/usermem/addr_range_seq_unsafe.go +++ b/pkg/sentry/usermem/addr_range_seq_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go index 01a746404..274f568d0 100644 --- a/pkg/sentry/usermem/bytes_io.go +++ b/pkg/sentry/usermem/bytes_io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go index efd71fcbc..8bdf3a508 100644 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ b/pkg/sentry/usermem/bytes_io_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 5d8a1c558..1d6c0b4d6 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go index 563560da8..1991a9641 100644 --- a/pkg/sentry/usermem/usermem_test.go +++ b/pkg/sentry/usermem/usermem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_x86.go b/pkg/sentry/usermem/usermem_x86.go index 2484b0d82..9ec90f9ff 100644 --- a/pkg/sentry/usermem/usermem_x86.go +++ b/pkg/sentry/usermem/usermem_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 5b620693d..75b11237f 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_amd64.s b/pkg/sleep/commit_amd64.s index d525e5b79..d08df7f37 100644 --- a/pkg/sleep/commit_amd64.s +++ b/pkg/sleep/commit_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_asm.go b/pkg/sleep/commit_asm.go index 39a55df7e..90eef4cbc 100644 --- a/pkg/sleep/commit_asm.go +++ b/pkg/sleep/commit_asm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sleep/commit_noasm.go b/pkg/sleep/commit_noasm.go index 584866cd8..967d22e24 100644 --- a/pkg/sleep/commit_noasm.go +++ b/pkg/sleep/commit_noasm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/empty.s b/pkg/sleep/empty.s index 8aca31bee..85d52cd9c 100644 --- a/pkg/sleep/empty.s +++ b/pkg/sleep/empty.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go index bc1738371..8feb9ffc2 100644 --- a/pkg/sleep/sleep_test.go +++ b/pkg/sleep/sleep_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go index b12cce681..45fb6f0ea 100644 --- a/pkg/sleep/sleep_unsafe.go +++ b/pkg/sleep/sleep_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/decode.go b/pkg/state/decode.go index 3ef59610b..54b5ad8b8 100644 --- a/pkg/state/decode.go +++ b/pkg/state/decode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/state/encode.go b/pkg/state/encode.go index fd052db12..577aaf051 100644 --- a/pkg/state/encode.go +++ b/pkg/state/encode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/encode_unsafe.go b/pkg/state/encode_unsafe.go index d96ba56d4..be94742a8 100644 --- a/pkg/state/encode_unsafe.go +++ b/pkg/state/encode_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/map.go b/pkg/state/map.go index c3d165501..0035d7250 100644 --- a/pkg/state/map.go +++ b/pkg/state/map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/object.proto b/pkg/state/object.proto index c78efed2a..d3b46ea97 100644 --- a/pkg/state/object.proto +++ b/pkg/state/object.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/printer.go b/pkg/state/printer.go index 2c8ce60a5..aee4b69fb 100644 --- a/pkg/state/printer.go +++ b/pkg/state/printer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/state.go b/pkg/state/state.go index 23a0b5922..4b141777e 100644 --- a/pkg/state/state.go +++ b/pkg/state/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/state_test.go b/pkg/state/state_test.go index 38ad9da9c..22bcad9e1 100644 --- a/pkg/state/state_test.go +++ b/pkg/state/state_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/statefile/statefile.go b/pkg/state/statefile/statefile.go index 9c86c1934..99158fd02 100644 --- a/pkg/state/statefile/statefile.go +++ b/pkg/state/statefile/statefile.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/statefile/statefile_test.go b/pkg/state/statefile/statefile_test.go index fa3fb9f2c..b4f400e01 100644 --- a/pkg/state/statefile/statefile_test.go +++ b/pkg/state/statefile/statefile_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/stats.go b/pkg/state/stats.go index ddcc49f78..17ca258fc 100644 --- a/pkg/state/stats.go +++ b/pkg/state/stats.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/atomicptr_unsafe.go b/pkg/sync/atomicptr_unsafe.go index f12e9cb67..d943b7ff4 100644 --- a/pkg/sync/atomicptr_unsafe.go +++ b/pkg/sync/atomicptr_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/atomicptrtest/atomicptr_test.go b/pkg/sync/atomicptrtest/atomicptr_test.go index b458382b1..3262785ce 100644 --- a/pkg/sync/atomicptrtest/atomicptr_test.go +++ b/pkg/sync/atomicptrtest/atomicptr_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go index 0c992d5a4..cd7a02dca 100644 --- a/pkg/sync/memmove_unsafe.go +++ b/pkg/sync/memmove_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go index 968665078..1593b9e5d 100644 --- a/pkg/sync/norace_unsafe.go +++ b/pkg/sync/norace_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go index d143a21c7..473eaddc6 100644 --- a/pkg/sync/race_unsafe.go +++ b/pkg/sync/race_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go index a18e1229a..bea31adc5 100644 --- a/pkg/sync/seqatomic_unsafe.go +++ b/pkg/sync/seqatomic_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/seqatomictest/seqatomic_test.go b/pkg/sync/seqatomictest/seqatomic_test.go index b785d2344..f5e1fbfff 100644 --- a/pkg/sync/seqatomictest/seqatomic_test.go +++ b/pkg/sync/seqatomictest/seqatomic_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go index 8e3304d69..732e856a4 100644 --- a/pkg/sync/seqcount.go +++ b/pkg/sync/seqcount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/seqcount_test.go b/pkg/sync/seqcount_test.go index fa4abed1d..b14a8878e 100644 --- a/pkg/sync/seqcount_test.go +++ b/pkg/sync/seqcount_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sync/sync.go b/pkg/sync/sync.go index 36d4c4dee..22c5348d7 100644 --- a/pkg/sync/sync.go +++ b/pkg/sync/sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/host_linux.go b/pkg/syserr/host_linux.go index 22009a799..74bbe9f5b 100644 --- a/pkg/syserr/host_linux.go +++ b/pkg/syserr/host_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index b9786b48f..20e756edb 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index dba6cb7de..6a66e23a2 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 5bc74e65e..4228707f4 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserror/syserror_test.go b/pkg/syserror/syserror_test.go index fb7d8d5ee..0f0da5781 100644 --- a/pkg/syserror/syserror_test.go +++ b/pkg/syserror/syserror_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index b64dce720..81428770b 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index 79b7c77ee..05a730a05 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go index c5dd2819f..d3a9a0f88 100644 --- a/pkg/tcpip/buffer/prependable.go +++ b/pkg/tcpip/buffer/prependable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index cea4e3657..24479ea40 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go index 02c264593..74a0a96fc 100644 --- a/pkg/tcpip/buffer/view_test.go +++ b/pkg/tcpip/buffer/view_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index 206531f20..5dfb3ca1d 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/arp.go b/pkg/tcpip/header/arp.go index ae373f112..22b259ccb 100644 --- a/pkg/tcpip/header/arp.go +++ b/pkg/tcpip/header/arp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go index e67c50f50..12f208fde 100644 --- a/pkg/tcpip/header/checksum.go +++ b/pkg/tcpip/header/checksum.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index 99c29b750..77365bc41 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/gue.go b/pkg/tcpip/header/gue.go index aac4593c5..2ad13955a 100644 --- a/pkg/tcpip/header/gue.go +++ b/pkg/tcpip/header/gue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index af1e94b7f..3ac89cdae 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go index 7d35caff7..e317975e8 100644 --- a/pkg/tcpip/header/icmpv6.go +++ b/pkg/tcpip/header/icmpv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/interfaces.go b/pkg/tcpip/header/interfaces.go index 042006983..ac327d8a5 100644 --- a/pkg/tcpip/header/interfaces.go +++ b/pkg/tcpip/header/interfaces.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index 29570cc34..1b882d3d8 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 66c778fe1..d985b745d 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv6_fragment.go b/pkg/tcpip/header/ipv6_fragment.go index 44b28b326..e36d5177b 100644 --- a/pkg/tcpip/header/ipv6_fragment.go +++ b/pkg/tcpip/header/ipv6_fragment.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipversion_test.go b/pkg/tcpip/header/ipversion_test.go index 3ae9b7e4a..8301ba5cf 100644 --- a/pkg/tcpip/header/ipversion_test.go +++ b/pkg/tcpip/header/ipversion_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 6689a6dc5..567a21167 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/tcp_test.go b/pkg/tcpip/header/tcp_test.go index 7854d3523..7cd98df3b 100644 --- a/pkg/tcpip/header/tcp_test.go +++ b/pkg/tcpip/header/tcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go index cf2602e50..31c8ef456 100644 --- a/pkg/tcpip/header/udp.go +++ b/pkg/tcpip/header/udp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index 113cbbf5e..da34032cc 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index ee99ada07..24af428dd 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 52e532ebb..19b007a9e 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index fc3f80c01..e6585be66 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index fc5231831..63b8c4451 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go index a0a9d4acd..6a3e956ad 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go index 1f143c0db..89a8a9954 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index de7593d9c..f42ff98db 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index cea3cd6a1..be4a4fa9c 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe.go b/pkg/tcpip/link/sharedmem/pipe/pipe.go index 1a0edbaba..e014324cc 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go index db0737c98..30742ccb1 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go index 480dc4a23..f491d74a2 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/rx.go b/pkg/tcpip/link/sharedmem/pipe/rx.go index ff778cecd..8d641c76f 100644 --- a/pkg/tcpip/link/sharedmem/pipe/rx.go +++ b/pkg/tcpip/link/sharedmem/pipe/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/sharedmem/pipe/tx.go b/pkg/tcpip/link/sharedmem/pipe/tx.go index 717f5a4b1..e75175d98 100644 --- a/pkg/tcpip/link/sharedmem/pipe/tx.go +++ b/pkg/tcpip/link/sharedmem/pipe/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/queue_test.go b/pkg/tcpip/link/sharedmem/queue/queue_test.go index 3d5909cef..391165bc3 100644 --- a/pkg/tcpip/link/sharedmem/queue/queue_test.go +++ b/pkg/tcpip/link/sharedmem/queue/queue_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/rx.go b/pkg/tcpip/link/sharedmem/queue/rx.go index c40d62c33..d3a5da08a 100644 --- a/pkg/tcpip/link/sharedmem/queue/rx.go +++ b/pkg/tcpip/link/sharedmem/queue/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/tx.go b/pkg/tcpip/link/sharedmem/queue/tx.go index 39b595e56..845108db1 100644 --- a/pkg/tcpip/link/sharedmem/queue/tx.go +++ b/pkg/tcpip/link/sharedmem/queue/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go index b8e39eca1..3eeab769e 100644 --- a/pkg/tcpip/link/sharedmem/rx.go +++ b/pkg/tcpip/link/sharedmem/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index ce6e86767..27d7eb3b9 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index ad987d382..4b8061b13 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go index f0be2dc73..b91adbaf7 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/tx.go b/pkg/tcpip/link/sharedmem/tx.go index 42a21cb43..37da34831 100644 --- a/pkg/tcpip/link/sharedmem/tx.go +++ b/pkg/tcpip/link/sharedmem/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/sniffer/pcap.go b/pkg/tcpip/link/sniffer/pcap.go index 04f3d494e..3d0d8d852 100644 --- a/pkg/tcpip/link/sniffer/pcap.go +++ b/pkg/tcpip/link/sniffer/pcap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index a30e57a32..1bd174bc3 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/tun/tun_unsafe.go b/pkg/tcpip/link/tun/tun_unsafe.go index 1dec41982..e4c589dda 100644 --- a/pkg/tcpip/link/tun/tun_unsafe.go +++ b/pkg/tcpip/link/tun/tun_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index ef8c88561..9ffb7b7e9 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index 0a15c40de..5ebe09664 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 9d0881e11..2e0024925 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 50628e4a2..5894f9114 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/frag_heap.go b/pkg/tcpip/network/fragmentation/frag_heap.go index 6c7faafe4..55615c8e6 100644 --- a/pkg/tcpip/network/fragmentation/frag_heap.go +++ b/pkg/tcpip/network/fragmentation/frag_heap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/frag_heap_test.go b/pkg/tcpip/network/fragmentation/frag_heap_test.go index a15540634..1b1b72e88 100644 --- a/pkg/tcpip/network/fragmentation/frag_heap_test.go +++ b/pkg/tcpip/network/fragmentation/frag_heap_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index 885e3cca2..a5dda0398 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go index fc62a15dd..5bf3463a9 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation_test.go +++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index b57fe82ec..c9ad2bef6 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go index 4c137828f..a2bc9707a 100644 --- a/pkg/tcpip/network/fragmentation/reassembler_test.go +++ b/pkg/tcpip/network/fragmentation/reassembler_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/network/hash/hash.go b/pkg/tcpip/network/hash/hash.go index eddf7ca4d..07960ddf0 100644 --- a/pkg/tcpip/network/hash/hash.go +++ b/pkg/tcpip/network/hash/hash.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index e3c7af1f9..5c1e88e56 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index ee8172ac8..f82dc098f 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index d4eeeb5d9..d7801ec19 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 2b7067a50..190d548eb 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 81aba0923..14107443b 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index fabbdc8c7..12c818b48 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 25bd998e5..4d0b6ee9c 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index 4e24efddb..41ef32921 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go index 4ab6a1fa2..72577dfcb 100644 --- a/pkg/tcpip/ports/ports_test.go +++ b/pkg/tcpip/ports/ports_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index c4707736e..67e8f0b9e 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index 910d1257f..ab40e9e0b 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/seqnum/seqnum.go b/pkg/tcpip/seqnum/seqnum.go index e507d02f7..f2b988839 100644 --- a/pkg/tcpip/seqnum/seqnum.go +++ b/pkg/tcpip/seqnum/seqnum.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 3a147a75f..cb7b7116b 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index e46267f12..651fa17ac 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index dba95369c..3da99ac67 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 0acec2984..b6266eb55 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 6c6400c33..2b4185014 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index d1ec6a660..d4da980a9 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index b6c095efb..f2c6c9a8d 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index a0b3399a8..74bf2c99e 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index a7470d606..c8522ad9e 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 98cc3b120..f09760180 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index bf11c2175..413aee6c6 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go index d283f71c7..361e359d4 100644 --- a/pkg/tcpip/tcpip_test.go +++ b/pkg/tcpip/tcpip_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/time.s b/pkg/tcpip/time.s index 8aca31bee..85d52cd9c 100644 --- a/pkg/tcpip/time.s +++ b/pkg/tcpip/time.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 2102e9633..231151bf3 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index 055daa918..b3f54cfe0 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/ping/endpoint_state.go b/pkg/tcpip/transport/ping/endpoint_state.go index a16087304..80721d227 100644 --- a/pkg/tcpip/transport/ping/endpoint_state.go +++ b/pkg/tcpip/transport/ping/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/ping/protocol.go b/pkg/tcpip/transport/ping/protocol.go index 549b1b2d3..1d504773b 100644 --- a/pkg/tcpip/transport/ping/protocol.go +++ b/pkg/tcpip/transport/ping/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index c22ed5ea7..5a88d25d0 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 27dbcace2..800d2409e 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/cubic.go b/pkg/tcpip/transport/tcp/cubic.go index 8cea416d2..003525d86 100644 --- a/pkg/tcpip/transport/tcp/cubic.go +++ b/pkg/tcpip/transport/tcp/cubic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index c88e98977..d3120c1d8 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 707d6be96..673a65c31 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index bed7ec6a6..e32c73aae 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index c80f3c7d6..2f90839e9 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index abdc825cd..753e1419e 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index 92ef9c6f7..05ff9e0d7 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/reno.go b/pkg/tcpip/transport/tcp/reno.go index feb593234..e4f8b7d5a 100644 --- a/pkg/tcpip/transport/tcp/reno.go +++ b/pkg/tcpip/transport/tcp/reno.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/sack.go b/pkg/tcpip/transport/tcp/sack.go index 05bac08cb..24e48fe7b 100644 --- a/pkg/tcpip/transport/tcp/sack.go +++ b/pkg/tcpip/transport/tcp/sack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 51a3d6aba..fc87a05fd 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go index e3a3405ef..98422fadf 100644 --- a/pkg/tcpip/transport/tcp/segment_heap.go +++ b/pkg/tcpip/transport/tcp/segment_heap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index 6a2d7bc0b..0c637d7ad 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go index 22f0bbf18..46b6d85a6 100644 --- a/pkg/tcpip/transport/tcp/segment_state.go +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 0bd421ff4..eefe93d48 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go index d536839af..86bbd643f 100644 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index a61d0ca64..06b0702c5 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 48852ea47..04e046257 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index ca16fc8fa..b08df0fec 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 5b25534f4..0695e8150 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/timer.go b/pkg/tcpip/transport/tcp/timer.go index 938c0bcef..38240d2d5 100644 --- a/pkg/tcpip/transport/tcp/timer.go +++ b/pkg/tcpip/transport/tcp/timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go index 5f8f1a64d..f7b2900de 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go index 514722ab7..aaeae9b18 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 840e95302..d777a80d0 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 70a37c7f2..db1e281ad 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 1334fec8a..b3fbed6e4 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index c3f592bd4..58a346cd9 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tmutex/tmutex.go b/pkg/tmutex/tmutex.go index bd5c681dd..df61d89f5 100644 --- a/pkg/tmutex/tmutex.go +++ b/pkg/tmutex/tmutex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go index a9dc9972f..a4537cb3b 100644 --- a/pkg/tmutex/tmutex_test.go +++ b/pkg/tmutex/tmutex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index f4800e0d9..deeea078d 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go index 6c546825f..ecc670925 100644 --- a/pkg/unet/unet_test.go +++ b/pkg/unet/unet_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet_unsafe.go b/pkg/unet/unet_unsafe.go index fa15cf744..1d69de542 100644 --- a/pkg/unet/unet_unsafe.go +++ b/pkg/unet/unet_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index 1ec06dd4c..753366be2 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/urpc/urpc_test.go b/pkg/urpc/urpc_test.go index d9cfc512e..f1b9a85ca 100644 --- a/pkg/urpc/urpc_test.go +++ b/pkg/urpc/urpc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/waiter/fdnotifier/fdnotifier.go b/pkg/waiter/fdnotifier/fdnotifier.go index 8bb93e39b..624b1a0c5 100644 --- a/pkg/waiter/fdnotifier/fdnotifier.go +++ b/pkg/waiter/fdnotifier/fdnotifier.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/fdnotifier/poll_unsafe.go b/pkg/waiter/fdnotifier/poll_unsafe.go index 26bca2b53..8459d4c74 100644 --- a/pkg/waiter/fdnotifier/poll_unsafe.go +++ b/pkg/waiter/fdnotifier/poll_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 832b6a5a9..93390b299 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/waiter_test.go b/pkg/waiter/waiter_test.go index c45f22889..60853f9c1 100644 --- a/pkg/waiter/waiter_test.go +++ b/pkg/waiter/waiter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 3250cdcdc..6766953b3 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 51d20d06d..9ebbde424 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index bee82f344..6dd7fadd9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go index 971962c91..d224d08b7 100644 --- a/runsc/boot/debug.go +++ b/runsc/boot/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/events.go b/runsc/boot/events.go index 595846b10..f954b8c0b 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 9416e3a5c..a3d21d963 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 92a73db9a..378396b9b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go index 82cf00dfb..67f3101fe 100644 --- a/runsc/boot/filter/extra_filters.go +++ b/runsc/boot/filter/extra_filters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go index 76f3f6865..fb95283ab 100644 --- a/runsc/boot/filter/extra_filters_msan.go +++ b/runsc/boot/filter/extra_filters_msan.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index ebd56c553..02a122c95 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index b656883ad..dc7294b1d 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index ea825e571..e52c89fe4 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 510497eba..8ecda6d0e 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c79b95bde..fa3de0133 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 41ff3681b..c342ee005 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 6a2678ac9..89f186139 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index 1e898672b..028bcc1f4 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 7a75a189a..d6058a8a2 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index cde915329..4a4713d4f 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 023b63dc0..7c14857ba 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go index 0b18c5481..e5da021e5 100644 --- a/runsc/cmd/capability.go +++ b/runsc/cmd/capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index 3329b308d..dd278b32d 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 023ab2455..d49d0169b 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index 2937ae1c4..a1c3491a3 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 275a96f57..b84185b43 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index cb7d81057..288cbe435 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 92b609c3c..ea1ca1278 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go index f6d164394..4a5b4774a 100644 --- a/runsc/cmd/delete_test.go +++ b/runsc/cmd/delete_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index df65ea31d..df03415ec 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 336edf3f6..9a395e6f1 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go index 623461e78..686c5e150 100644 --- a/runsc/cmd/exec_test.go +++ b/runsc/cmd/exec_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index fd4eee546..3842fdf64 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 7a98d10a2..1f1086250 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index 4d4a5cb0b..fd59b73e6 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go index c207b80da..baba937a8 100644 --- a/runsc/cmd/path.go +++ b/runsc/cmd/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index ac393b48e..5ff6f059c 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index 5d219bfdc..fd76cf975 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 6dc044672..cc99b3503 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index a12adf1a3..274b5d084 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 9a87cf240..b6a12f5d6 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index 6281fc49d..57ee37c86 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 97ea91fff..48bd4c401 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 265014e1b..f8ce8c3d8 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 956349140..121c54554 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/console/console.go b/runsc/console/console.go index 3df184742..9f4f9214d 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 8f019b54a..0b0dfb4cb 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/container.go b/runsc/container/container.go index f76bad1aa..cb4c9b5c1 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 662591b3b..243528d35 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/fs.go b/runsc/container/fs.go index 2ed42fd93..41022686b 100644 --- a/runsc/container/fs.go +++ b/runsc/container/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/fs_test.go b/runsc/container/fs_test.go index 84bde18fb..87cdb078e 100644 --- a/runsc/container/fs_test.go +++ b/runsc/container/fs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/hook.go b/runsc/container/hook.go index 3d93ca0be..6b9e5550a 100644 --- a/runsc/container/hook.go +++ b/runsc/container/hook.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 1781a4602..4548eb106 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/status.go b/runsc/container/status.go index bf177e78a..234ffb0dd 100644 --- a/runsc/container/status.go +++ b/runsc/container/status.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index cc3b087e1..b5071ada6 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 35698f21f..75a087848 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go index 82cf00dfb..67f3101fe 100644 --- a/runsc/fsgofer/filter/extra_filters.go +++ b/runsc/fsgofer/filter/extra_filters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go index 169a79ed8..7e142b790 100644 --- a/runsc/fsgofer/filter/extra_filters_msan.go +++ b/runsc/fsgofer/filter/extra_filters_msan.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go index 9e6512d8c..3cd29472a 100644 --- a/runsc/fsgofer/filter/extra_filters_race.go +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index 6f341f688..f50b6bc87 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 9c4864cf1..e03bb7752 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index a500a2976..48860f952 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go index e676809ac..99bc25ec1 100644 --- a/runsc/fsgofer/fsgofer_unsafe.go +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/main.go b/runsc/main.go index 62b1f01b3..4a92db7c0 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go index 35b19a0b1..354049871 100644 --- a/runsc/sandbox/chroot.go +++ b/runsc/sandbox/chroot.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 86a52c6ae..52fe8fc0f 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 923a52f7f..0fe85cfe1 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 00293d45b..73fab13e1 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index b29802fde..ab14ed1fc 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index 64e2172c8..b61f1ca62 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/image.go b/runsc/test/image/image.go index 069d08013..bcb6f876f 100644 --- a/runsc/test/image/image.go +++ b/runsc/test/image/image.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index d89d80a86..763152b47 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/mysql.sql b/runsc/test/image/mysql.sql index dd5bfaa4e..c1271e719 100644 --- a/runsc/test/image/mysql.sql +++ b/runsc/test/image/mysql.sql @@ -1,4 +1,4 @@ -# Copyright 2018 Google Inc. 
+# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/image/ruby.rb b/runsc/test/image/ruby.rb index ae5de3419..25d1ac129 100644 --- a/runsc/test/image/ruby.rb +++ b/runsc/test/image/ruby.rb @@ -1,4 +1,4 @@ -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/image/ruby.sh b/runsc/test/image/ruby.sh index 54be2c931..d3a9b5656 100644 --- a/runsc/test/image/ruby.sh +++ b/runsc/test/image/ruby.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/install.sh b/runsc/test/install.sh index c239588d4..32e1e884e 100755 --- a/runsc/test/install.sh +++ b/runsc/test/install.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index 3cac674d0..fac8337f4 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/integration.go b/runsc/test/integration/integration.go index 49c3c893a..e15321c87 100644 --- a/runsc/test/integration/integration.go +++ b/runsc/test/integration/integration.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 536bb17e0..526b3a7a1 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/cgroup_test.go b/runsc/test/root/cgroup_test.go index 5cb4b794f..fdb94ff64 100644 --- a/runsc/test/root/cgroup_test.go +++ b/runsc/test/root/cgroup_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 8831e6a78..0ffaaf87b 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/root.go b/runsc/test/root/root.go index 790f62c29..586ea0fe3 100644 --- a/runsc/test/root/root.go +++ b/runsc/test/root/root.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index 7d6a72e5f..3f74e0770 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4d7ac3bc9..1b5a02c0f 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/testutil_race.go b/runsc/test/testutil/testutil_race.go index 59cfdaa7b..9267af150 100644 --- a/runsc/test/testutil/testutil_race.go +++ b/runsc/test/testutil/testutil_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/tools/dockercfg/dockercfg.go b/runsc/tools/dockercfg/dockercfg.go index 0bd6cad93..110a581ff 100644 --- a/runsc/tools/dockercfg/dockercfg.go +++ b/runsc/tools/dockercfg/dockercfg.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go index cc61a7537..eaf5c4970 100644 --- a/tools/go_generics/generics.go +++ b/tools/go_generics/generics.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_stmts/input.go b/tools/go_generics/generics_tests/all_stmts/input.go index 870af3b6c..19184a3fe 100644 --- a/tools/go_generics/generics_tests/all_stmts/input.go +++ b/tools/go_generics/generics_tests/all_stmts/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_stmts/output/output.go b/tools/go_generics/generics_tests/all_stmts/output/output.go index e4e670bf1..51582346c 100644 --- a/tools/go_generics/generics_tests/all_stmts/output/output.go +++ b/tools/go_generics/generics_tests/all_stmts/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/input.go b/tools/go_generics/generics_tests/all_types/input.go index 3a8643e3d..ed6e97c29 100644 --- a/tools/go_generics/generics_tests/all_types/input.go +++ b/tools/go_generics/generics_tests/all_types/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/lib/lib.go b/tools/go_generics/generics_tests/all_types/lib/lib.go index d3911d12d..7e73e678e 100644 --- a/tools/go_generics/generics_tests/all_types/lib/lib.go +++ b/tools/go_generics/generics_tests/all_types/lib/lib.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/output/output.go b/tools/go_generics/generics_tests/all_types/output/output.go index b89840936..ec09a6be4 100644 --- a/tools/go_generics/generics_tests/all_types/output/output.go +++ b/tools/go_generics/generics_tests/all_types/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/consts/input.go b/tools/go_generics/generics_tests/consts/input.go index dabf76e1e..394bcc262 100644 --- a/tools/go_generics/generics_tests/consts/input.go +++ b/tools/go_generics/generics_tests/consts/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/consts/output/output.go b/tools/go_generics/generics_tests/consts/output/output.go index 72865607e..91a07fdc2 100644 --- a/tools/go_generics/generics_tests/consts/output/output.go +++ b/tools/go_generics/generics_tests/consts/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/imports/input.go b/tools/go_generics/generics_tests/imports/input.go index 66b43fee5..22e6641a6 100644 --- a/tools/go_generics/generics_tests/imports/input.go +++ b/tools/go_generics/generics_tests/imports/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/imports/output/output.go b/tools/go_generics/generics_tests/imports/output/output.go index 5f20d43ce..2555c0004 100644 --- a/tools/go_generics/generics_tests/imports/output/output.go +++ b/tools/go_generics/generics_tests/imports/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/remove_typedef/input.go b/tools/go_generics/generics_tests/remove_typedef/input.go index c02307d32..d9c9b8530 100644 --- a/tools/go_generics/generics_tests/remove_typedef/input.go +++ b/tools/go_generics/generics_tests/remove_typedef/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/remove_typedef/output/output.go b/tools/go_generics/generics_tests/remove_typedef/output/output.go index d20a89abd..f111a9426 100644 --- a/tools/go_generics/generics_tests/remove_typedef/output/output.go +++ b/tools/go_generics/generics_tests/remove_typedef/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/tools/go_generics/generics_tests/simple/input.go b/tools/go_generics/generics_tests/simple/input.go index 670161d6e..711687cf5 100644 --- a/tools/go_generics/generics_tests/simple/input.go +++ b/tools/go_generics/generics_tests/simple/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/simple/output/output.go b/tools/go_generics/generics_tests/simple/output/output.go index 75b5467cd..139c9bf9d 100644 --- a/tools/go_generics/generics_tests/simple/output/output.go +++ b/tools/go_generics/generics_tests/simple/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/globals/globals_visitor.go b/tools/go_generics/globals/globals_visitor.go index fc0de4381..daaa17b1d 100644 --- a/tools/go_generics/globals/globals_visitor.go +++ b/tools/go_generics/globals/globals_visitor.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/globals/scope.go b/tools/go_generics/globals/scope.go index 18743bdee..b75a91689 100644 --- a/tools/go_generics/globals/scope.go +++ b/tools/go_generics/globals/scope.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/tools/go_generics/go_generics_unittest.sh b/tools/go_generics/go_generics_unittest.sh index 699e1f631..e7553a071 100755 --- a/tools/go_generics/go_generics_unittest.sh +++ b/tools/go_generics/go_generics_unittest.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/go_generics/imports.go b/tools/go_generics/imports.go index 97267098b..57f7c3dce 100644 --- a/tools/go_generics/imports.go +++ b/tools/go_generics/imports.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/merge.go b/tools/go_generics/merge.go index ebe7cf4e4..2f83facf8 100644 --- a/tools/go_generics/merge.go +++ b/tools/go_generics/merge.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/remove.go b/tools/go_generics/remove.go index 2a66de762..139d03955 100644 --- a/tools/go_generics/remove.go +++ b/tools/go_generics/remove.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/rules_tests/template.go b/tools/go_generics/rules_tests/template.go index 73c024f0e..f3f31ae8e 100644 --- a/tools/go_generics/rules_tests/template.go +++ b/tools/go_generics/rules_tests/template.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. 
+// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/rules_tests/template_test.go b/tools/go_generics/rules_tests/template_test.go index 76c4cdb64..3a38c8629 100644 --- a/tools/go_generics/rules_tests/template_test.go +++ b/tools/go_generics/rules_tests/template_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 5646b879a..9e2c8e106 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh index d89db1f99..7d44dad37 100755 --- a/tools/workspace_status.sh +++ b/tools/workspace_status.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google Inc. +# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vdso/barrier.h b/vdso/barrier.h index db8185b2e..7866af414 100644 --- a/vdso/barrier.h +++ b/vdso/barrier.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/check_vdso.py b/vdso/check_vdso.py index 9a3142ab8..6f7d7e7ec 100644 --- a/vdso/check_vdso.py +++ b/vdso/check_vdso.py @@ -1,4 +1,4 @@ -# Copyright 2018 Google Inc. 
+# Copyright 2018 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vdso/compiler.h b/vdso/compiler.h index a661516c3..d65f148fb 100644 --- a/vdso/compiler.h +++ b/vdso/compiler.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/cycle_clock.h b/vdso/cycle_clock.h index 93c5f2c0d..dfb5b427d 100644 --- a/vdso/cycle_clock.h +++ b/vdso/cycle_clock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/seqlock.h b/vdso/seqlock.h index b527bdbca..ab2f3fda3 100644 --- a/vdso/seqlock.h +++ b/vdso/seqlock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/syscalls.h b/vdso/syscalls.h index fd79c4642..0be8a7f9b 100644 --- a/vdso/syscalls.h +++ b/vdso/syscalls.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso.cc b/vdso/vdso.cc index db3bdef01..f30dc26a2 100644 --- a/vdso/vdso.cc +++ b/vdso/vdso.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/vdso/vdso_time.cc b/vdso/vdso_time.cc index 5d5c8de65..a59771bff 100644 --- a/vdso/vdso_time.cc +++ b/vdso/vdso_time.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso_time.h b/vdso/vdso_time.h index 71d6e2f64..464dadff2 100644 --- a/vdso/vdso_time.h +++ b/vdso/vdso_time.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. +// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. -- cgit v1.2.3 From b2068cf5a5d43f3898cf389ab2d6151cf61908ac Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Sat, 20 Oct 2018 11:12:26 -0700 Subject: Add more unimplemented syscall events Added events for *ctl syscalls that may have multiple different commands. For runsc, each syscall event is only logged once. For *ctl syscalls, use the cmd as identifier, not only the syscall number. 
PiperOrigin-RevId: 218015941 Change-Id: Ie3c19131ae36124861e9b492a7dbe1765d9e5e59 --- pkg/abi/linux/ioctl.go | 75 +++++++++++++------ pkg/abi/linux/prctl.go | 99 +++++++++++++++++++++++-- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/tty.go | 30 ++++++++ pkg/sentry/fs/tty/BUILD | 1 + pkg/sentry/fs/tty/master.go | 45 ++++++++++- pkg/sentry/fs/tty/slave.go | 3 +- pkg/sentry/kernel/BUILD | 2 + pkg/sentry/kernel/kernel.go | 17 +++++ pkg/sentry/kernel/pipe/reader_writer.go | 3 +- pkg/sentry/kernel/task.go | 3 + pkg/sentry/socket/epsocket/BUILD | 1 + pkg/sentry/socket/epsocket/epsocket.go | 3 + pkg/sentry/socket/rpcinet/BUILD | 1 + pkg/sentry/socket/rpcinet/socket.go | 5 ++ pkg/sentry/syscalls/BUILD | 18 ----- pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_prctl.go | 39 +++++++++- pkg/sentry/syscalls/linux/sys_shm.go | 1 + pkg/sentry/syscalls/linux/sys_tls.go | 3 + pkg/sentry/syscalls/syscalls.go | 15 +--- pkg/sentry/syscalls/unimplemented_syscall.proto | 27 ------- pkg/sentry/unimpl/BUILD | 30 ++++++++ pkg/sentry/unimpl/events.go | 45 +++++++++++ pkg/sentry/unimpl/unimplemented_syscall.proto | 27 +++++++ runsc/boot/BUILD | 9 ++- runsc/boot/compat.go | 72 ++++++++++++++++-- runsc/boot/compat_amd64.go | 54 ++++++++++++++ runsc/boot/compat_test.go | 66 +++++++++++++++++ 29 files changed, 596 insertions(+), 101 deletions(-) delete mode 100644 pkg/sentry/syscalls/unimplemented_syscall.proto create mode 100644 pkg/sentry/unimpl/BUILD create mode 100644 pkg/sentry/unimpl/events.go create mode 100644 pkg/sentry/unimpl/unimplemented_syscall.proto create mode 100644 runsc/boot/compat_amd64.go create mode 100644 runsc/boot/compat_test.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 9afc3d1ef..191b26e4d 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -18,28 +18,59 @@ package linux // // These are ordered by request number (low byte). 
const ( - TCGETS = 0x00005401 - TCSETS = 0x00005402 - TCSETSW = 0x00005403 - TCSETSF = 0x00005404 - TIOCSCTTY = 0x0000540e - TIOCGPGRP = 0x0000540f - TIOCSPGRP = 0x00005410 - TIOCOUTQ = 0x00005411 - TIOCGWINSZ = 0x00005413 - TIOCSWINSZ = 0x00005414 - TIOCINQ = 0x0000541b - FIONREAD = TIOCINQ - FIONBIO = 0x00005421 - TIOCGPTN = 0x80045430 - TIOCSPTLCK = 0x40045431 - FIONCLEX = 0x00005450 - FIOCLEX = 0x00005451 - FIOASYNC = 0x00005452 - FIOSETOWN = 0x00008901 - SIOCSPGRP = 0x00008902 - FIOGETOWN = 0x00008903 - SIOCGPGRP = 0x00008904 + TCGETS = 0x00005401 + TCSETS = 0x00005402 + TCSETSW = 0x00005403 + TCSETSF = 0x00005404 + TCSBRK = 0x00005409 + TIOCEXCL = 0x0000540c + TIOCNXCL = 0x0000540d + TIOCSCTTY = 0x0000540e + TIOCGPGRP = 0x0000540f + TIOCSPGRP = 0x00005410 + TIOCOUTQ = 0x00005411 + TIOCSTI = 0x00005412 + TIOCGWINSZ = 0x00005413 + TIOCSWINSZ = 0x00005414 + TIOCMGET = 0x00005415 + TIOCMBIS = 0x00005416 + TIOCMBIC = 0x00005417 + TIOCMSET = 0x00005418 + TIOCINQ = 0x0000541b + FIONREAD = TIOCINQ + FIONBIO = 0x00005421 + TIOCSETD = 0x00005423 + TIOCNOTTY = 0x00005422 + TIOCGETD = 0x00005424 + TCSBRKP = 0x00005425 + TIOCSBRK = 0x00005427 + TIOCCBRK = 0x00005428 + TIOCGSID = 0x00005429 + TIOCGPTN = 0x80045430 + TIOCSPTLCK = 0x40045431 + TIOCGDEV = 0x80045432 + TIOCVHANGUP = 0x00005437 + TCFLSH = 0x0000540b + TIOCCONS = 0x0000541d + TIOCSSERIAL = 0x0000541f + TIOCGEXCL = 0x80045440 + TIOCGPTPEER = 0x80045441 + TIOCGICOUNT = 0x0000545d + FIONCLEX = 0x00005450 + FIOCLEX = 0x00005451 + FIOASYNC = 0x00005452 + FIOSETOWN = 0x00008901 + SIOCSPGRP = 0x00008902 + FIOGETOWN = 0x00008903 + SIOCGPGRP = 0x00008904 +) + +// ioctl(2) requests provided by uapi/linux/sockios.h +const ( + SIOCGIFMEM = 0x891f + SIOCGIFPFLAGS = 0x8935 + SIOCGMIIPHY = 0x8947 + SIOCGMIIREG = 0x8948 ) // ioctl(2) requests provided by uapi/linux/android/binder.h diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go index e152c4c27..db3206f36 100644 --- a/pkg/abi/linux/prctl.go +++ 
b/pkg/abi/linux/prctl.go @@ -22,26 +22,102 @@ const ( // PR_GET_PDEATHSIG will get the process' death signal. PR_GET_PDEATHSIG = 2 + // PR_GET_DUMPABLE will get the process's dumpable flag. + PR_GET_DUMPABLE = 3 + + // PR_SET_DUMPABLE will set the process's dumpable flag. + PR_SET_DUMPABLE = 4 + // PR_GET_KEEPCAPS will get the value of the keep capabilities flag. PR_GET_KEEPCAPS = 7 // PR_SET_KEEPCAPS will set the value of the keep capabilities flag. PR_SET_KEEPCAPS = 8 + // PR_GET_TIMING will get the process's timing method. + PR_GET_TIMING = 13 + + // PR_SET_TIMING will set the process's timing method. + PR_SET_TIMING = 14 + // PR_SET_NAME will set the process' name. PR_SET_NAME = 15 // PR_GET_NAME will get the process' name. PR_GET_NAME = 16 + // PR_GET_SECCOMP will get a process' seccomp mode. + PR_GET_SECCOMP = 21 + + // PR_SET_SECCOMP will set a process' seccomp mode. + PR_SET_SECCOMP = 22 + + // PR_CAPBSET_READ will get the capability bounding set. + PR_CAPBSET_READ = 23 + + // PR_CAPBSET_DROP will set the capability bounding set. + PR_CAPBSET_DROP = 24 + + // PR_GET_TSC will get the the value of the flag determining whether the + // timestamp counter can be read. + PR_GET_TSC = 25 + + // PR_SET_TSC will set the the value of the flag determining whether the + // timestamp counter can be read. + PR_SET_TSC = 26 + + // PR_SET_TIMERSLACK set the process's time slack. + PR_SET_TIMERSLACK = 29 + + // PR_GET_TIMERSLACK get the process's time slack. + PR_GET_TIMERSLACK = 30 + + // PR_TASK_PERF_EVENTS_DISABLE disable all performance counters attached to + // the calling process. + PR_TASK_PERF_EVENTS_DISABLE = 31 + + // PR_TASK_PERF_EVENTS_ENABLE enable all performance counters attached to + // the calling process. + PR_TASK_PERF_EVENTS_ENABLE = 32 + + // PR_MCE_KILL set the machine check memory corruption kill policy for the + // calling thread. 
+ PR_MCE_KILL = 33 + + // PR_MCE_KILL_GET get the machine check memory corruption kill policy for the + // calling thread. + PR_MCE_KILL_GET = 34 + // PR_SET_MM will modify certain kernel memory map descriptor fields of the // calling process. See prctl(2) for more information. PR_SET_MM = 35 + PR_SET_MM_START_CODE = 1 + PR_SET_MM_END_CODE = 2 + PR_SET_MM_START_DATA = 3 + PR_SET_MM_END_DATA = 4 + PR_SET_MM_START_STACK = 5 + PR_SET_MM_START_BRK = 6 + PR_SET_MM_BRK = 7 + PR_SET_MM_ARG_START = 8 + PR_SET_MM_ARG_END = 9 + PR_SET_MM_ENV_START = 10 + PR_SET_MM_ENV_END = 11 + PR_SET_MM_AUXV = 12 // PR_SET_MM_EXE_FILE will supersede the /proc/pid/exe symbolic link with a // new one pointing to a new executable file identified by the file descriptor // provided in arg3 argument. See prctl(2) for more information. PR_SET_MM_EXE_FILE = 13 + PR_SET_MM_MAP = 14 + PR_SET_MM_MAP_SIZE = 15 + + // PR_SET_CHILD_SUBREAPER set the "child subreaper" attribute of the calling + // process. + PR_SET_CHILD_SUBREAPER = 36 + + // PR_GET_CHILD_SUBREAPER get the "child subreaper" attribute of the calling + // process. + PR_GET_CHILD_SUBREAPER = 37 // PR_SET_NO_NEW_PRIVS will set the calling thread's no_new_privs bit. PR_SET_NO_NEW_PRIVS = 38 @@ -49,17 +125,24 @@ const ( // PR_GET_NO_NEW_PRIVS will get the calling thread's no_new_privs bit. PR_GET_NO_NEW_PRIVS = 39 - // PR_SET_SECCOMP will set a process' seccomp mode. - PR_SET_SECCOMP = 22 + // PR_GET_TID_ADDRESS retrieve the clear_child_tid address. + PR_GET_TID_ADDRESS = 40 - // PR_GET_SECCOMP will get a process' seccomp mode. - PR_GET_SECCOMP = 21 + // PR_SET_THP_DISABLE set the state of the "THP disable" flag for the calling + // thread. + PR_SET_THP_DISABLE = 41 - // PR_CAPBSET_READ will get the capability bounding set. - PR_CAPBSET_READ = 23 + // PR_GET_THP_DISABLE get the state of the "THP disable" flag for the calling + // thread. + PR_GET_THP_DISABLE = 42 - // PR_CAPBSET_DROP will set the capability bounding set. 
- PR_CAPBSET_DROP = 24 + // PR_MPX_ENABLE_MANAGEMENT enable kernel management of Memory Protection + // eXtensions (MPX) bounds tables. + PR_MPX_ENABLE_MANAGEMENT = 43 + + // PR_MPX_DISABLE_MANAGEMENTdisable kernel management of Memory Protection + // eXtensions (MPX) bounds tables. + PR_MPX_DISABLE_MANAGEMENT = 44 ) // From diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 4f264a024..d1eb9bd64 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index cf3639c46..f0bcdc908 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -179,6 +180,35 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. err := ioctlSetWinsize(fd, &winsize) return 0, err + // Unimplemented commands. 
+ case linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + fallthrough default: return 0, syserror.ENOTTY } diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index d4dd20e30..2b45069a6 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index dad0cad79..00bec4c2c 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -20,6 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -149,7 +150,7 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm // Ioctl implements fs.FileOperations.Ioctl. func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - switch args[1].Uint() { + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. 
return 0, mf.t.ld.outputQueueReadSize(ctx, io, args) @@ -177,6 +178,48 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args a case linux.TIOCSWINSZ: return 0, mf.t.ld.setWindowSize(ctx, io, args) default: + maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY } } + +// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. +func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { + switch cmd { + case linux.TCGETS, + linux.TCSETS, + linux.TCSETSW, + linux.TCSETSF, + linux.TIOCGPGRP, + linux.TIOCSPGRP, + linux.TIOCGWINSZ, + linux.TIOCSWINSZ, + linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + } +} diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 9de3168bf..a696fbb51 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -134,7 +134,7 @@ func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src userme // Ioctl implements fs.FileOperations.Ioctl. func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - switch args[1].Uint() { + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the input queue read buffer. return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args) @@ -161,6 +161,7 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args ar // control. 
return 0, nil default: + maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY } } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e2fb61ba6..389824b25 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -157,6 +157,8 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/time", + "//pkg/sentry/unimpl", + "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index bad558d48..17425e656 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -40,6 +40,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -58,6 +59,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" + uspb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/state" "gvisor.googlesource.com/gvisor/pkg/tcpip" @@ -595,6 +598,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k default: return nil } @@ -1033,6 +1038,16 @@ func (k *Kernel) SupervisorContext() context.Context { } } +// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. 
+func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { + t := TaskFromContext(ctx) + eventchannel.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} + type supervisorContext struct { context.NoopSleeper log.Logger @@ -1073,6 +1088,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k default: return nil } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 63efc5bbe..36be1efc3 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -19,6 +19,7 @@ import ( "math" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -77,7 +78,7 @@ func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Switch on ioctl request. 
switch int(args[1].Int()) { - case syscall.TIOCINQ: + case linux.FIONREAD: v := rw.queuedSize() if v > math.MaxInt32 { panic(fmt.Sprintf("Impossibly large pipe queued size: %d", v)) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e22ec768d..73ba8bee9 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -30,6 +30,7 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -594,6 +595,8 @@ func (t *Task) Value(key interface{}) interface{} { return t.k case uniqueid.CtxInotifyCookie: return t.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return t.k default: return nil } diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index dbabc931c..da4aaf510 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index e90ef4835..39a0b9941 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -45,6 +45,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -1184,6 +1185,8 @@ func Ioctl(ctx 
context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc }) return 0, err + case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: + unimpl.EmitUnimplementedEvent(ctx) } return 0, syserror.ENOTTY diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 3ea433360..38fa54283 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/socket/rpcinet/conn", "//pkg/sentry/socket/rpcinet/notifier", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 44fa5c620..788d853c9 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -32,6 +32,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -555,6 +556,10 @@ func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.S }) return 0, err + + case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: + unimpl.EmitUnimplementedEvent(ctx) + default: return 0, syserror.ENOTTY } diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 22a757095..2a9f0915e 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,7 +1,6 @@ package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") go_library( name = "syscalls", @@ -13,9 +12,7 @@ go_library( importpath = 
"gvisor.googlesource.com/gvisor/pkg/sentry/syscalls", visibility = ["//:sandbox"], deps = [ - ":unimplemented_syscall_go_proto", "//pkg/abi/linux", - "//pkg/eventchannel", "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/kernel", @@ -26,18 +23,3 @@ go_library( "//pkg/waiter", ], ) - -proto_library( - name = "unimplemented_syscall_proto", - srcs = ["unimplemented_syscall.proto"], - visibility = ["//visibility:public"], - deps = ["//pkg/sentry/arch:registers_proto"], -) - -go_proto_library( - name = "unimplemented_syscall_go_proto", - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto", - proto = ":unimplemented_syscall_proto", - visibility = ["//visibility:public"], - deps = ["//pkg/sentry/arch:registers_go_proto"], -) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 75e87f5ec..11bf81f88 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -369,7 +369,7 @@ var AMD64 = &kernel.SyscallTable{ 0xffffffffff600800: 309, // vsyscall getcpu(2) }, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { - syscalls.UnimplementedEvent(t) + t.Kernel().EmitUnimplementedEvent(t) return 0, syserror.ENOSYS }, } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index c7b39ede8..91e852049 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -104,6 +104,22 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Set the underlying executable. 
t.MemoryManager().SetExecutable(file.Dirent) + + case linux.PR_SET_MM_AUXV, + linux.PR_SET_MM_START_CODE, + linux.PR_SET_MM_END_CODE, + linux.PR_SET_MM_START_DATA, + linux.PR_SET_MM_END_DATA, + linux.PR_SET_MM_START_STACK, + linux.PR_SET_MM_START_BRK, + linux.PR_SET_MM_BRK, + linux.PR_SET_MM_ARG_START, + linux.PR_SET_MM_ARG_END, + linux.PR_SET_MM_ENV_START, + linux.PR_SET_MM_ENV_END: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough default: return 0, nil, syscall.EINVAL } @@ -151,8 +167,29 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return 0, nil, t.DropBoundingCapability(cp) + case linux.PR_GET_DUMPABLE, + linux.PR_SET_DUMPABLE, + linux.PR_GET_TIMING, + linux.PR_SET_TIMING, + linux.PR_GET_TSC, + linux.PR_SET_TSC, + linux.PR_TASK_PERF_EVENTS_DISABLE, + linux.PR_TASK_PERF_EVENTS_ENABLE, + linux.PR_GET_TIMERSLACK, + linux.PR_SET_TIMERSLACK, + linux.PR_MCE_KILL, + linux.PR_MCE_KILL_GET, + linux.PR_GET_TID_ADDRESS, + linux.PR_SET_CHILD_SUBREAPER, + linux.PR_GET_CHILD_SUBREAPER, + linux.PR_GET_THP_DISABLE, + linux.PR_SET_THP_DISABLE, + linux.PR_MPX_ENABLE_MANAGEMENT, + linux.PR_MPX_DISABLE_MANAGEMENT: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough default: - t.Warningf("Unsupported prctl %d", option) return 0, nil, syscall.EINVAL } diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index b13d48b98..5f887523a 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -147,6 +147,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // We currently do not support memmory locking anywhere. // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the // same here. 
+ t.Kernel().EmitUnimplementedEvent(t) return 0, nil, nil default: diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index 27ddb3808..40e84825b 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -45,6 +45,9 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys regs.Fs = 0 regs.Fs_base = fsbase + case linux.ARCH_GET_GS, linux.ARCH_SET_GS: + t.Kernel().EmitUnimplementedEvent(t) + fallthrough default: return 0, nil, syscall.EINVAL } diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index bae32d727..425ce900c 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -26,10 +26,8 @@ package syscalls import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/eventchannel" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - uspb "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -44,7 +42,7 @@ func Error(err error) kernel.SyscallFn { // syscall event via the event channel and returns the passed error. func ErrorWithEvent(err error) kernel.SyscallFn { return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - UnimplementedEvent(t) + t.Kernel().EmitUnimplementedEvent(t) return 0, nil, err } } @@ -57,16 +55,7 @@ func CapError(c linux.Capability) kernel.SyscallFn { if !t.HasCapability(c) { return 0, nil, syserror.EPERM } - UnimplementedEvent(t) + t.Kernel().EmitUnimplementedEvent(t) return 0, nil, syserror.ENOSYS } } - -// UnimplementedEvent emits an UnimplementedSyscall event via the event -// channel. 
-func UnimplementedEvent(t *kernel.Task) { - eventchannel.Emit(&uspb.UnimplementedSyscall{ - Tid: int32(t.ThreadID()), - Registers: t.Arch().StateData().Proto(), - }) -} diff --git a/pkg/sentry/syscalls/unimplemented_syscall.proto b/pkg/sentry/syscalls/unimplemented_syscall.proto deleted file mode 100644 index 41579b016..000000000 --- a/pkg/sentry/syscalls/unimplemented_syscall.proto +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto3"; - -package gvisor; - -import "pkg/sentry/arch/registers.proto"; - -message UnimplementedSyscall { - // Task ID. - int32 tid = 1; - - // Registers at the time of the call. 
- Registers registers = 2; -} diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD new file mode 100644 index 000000000..63da5e81f --- /dev/null +++ b/pkg/sentry/unimpl/BUILD @@ -0,0 +1,30 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") + +proto_library( + name = "unimplemented_syscall_proto", + srcs = ["unimplemented_syscall.proto"], + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_proto"], +) + +go_proto_library( + name = "unimplemented_syscall_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto", + proto = ":unimplemented_syscall_proto", + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_go_proto"], +) + +go_library( + name = "unimpl", + srcs = ["events.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl", + visibility = ["//:sandbox"], + deps = [ + "//pkg/log", + "//pkg/sentry/context", + ], +) diff --git a/pkg/sentry/unimpl/events.go b/pkg/sentry/unimpl/events.go new file mode 100644 index 000000000..f78f8c981 --- /dev/null +++ b/pkg/sentry/unimpl/events.go @@ -0,0 +1,45 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package unimpl contains interface to emit events about unimplemented +// features. 
+package unimpl + +import ( + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the events package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxEvents is a Context.Value key for a Events. + CtxEvents contextID = iota +) + +// Events interface defines method to emit unsupported events. +type Events interface { + EmitUnimplementedEvent(context.Context) +} + +// EmitUnimplementedEvent emits unsupported syscall event to the context. +func EmitUnimplementedEvent(ctx context.Context) { + e := ctx.Value(CtxEvents) + if e == nil { + log.Warningf("Context.Value(CtxEvents) not present, unimplemented syscall event not reported.") + return + } + e.(Events).EmitUnimplementedEvent(ctx) +} diff --git a/pkg/sentry/unimpl/unimplemented_syscall.proto b/pkg/sentry/unimpl/unimplemented_syscall.proto new file mode 100644 index 000000000..41579b016 --- /dev/null +++ b/pkg/sentry/unimpl/unimplemented_syscall.proto @@ -0,0 +1,27 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +import "pkg/sentry/arch/registers.proto"; + +message UnimplementedSyscall { + // Task ID. + int32 tid = 1; + + // Registers at the time of the call. 
+ Registers registers = 2; +} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index f8f848ebf..04cc0e854 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -6,6 +6,7 @@ go_library( name = "boot", srcs = [ "compat.go", + "compat_amd64.go", "config.go", "controller.go", "debug.go", @@ -59,9 +60,9 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/state", "//pkg/sentry/strace", - "//pkg/sentry/syscalls:unimplemented_syscall_go_proto", "//pkg/sentry/syscalls/linux", "//pkg/sentry/time", + "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", "//pkg/sentry/watchdog", "//pkg/syserror", @@ -87,12 +88,16 @@ go_library( go_test( name = "boot_test", size = "small", - srcs = ["loader_test.go"], + srcs = [ + "compat_test.go", + "loader_test.go", + ], embed = [":boot"], deps = [ "//pkg/control/server", "//pkg/log", "//pkg/p9", + "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/unet", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 6766953b3..d18c2f802 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -17,6 +17,8 @@ package boot import ( "fmt" "os" + "sync" + "syscall" "github.com/golang/protobuf/proto" "gvisor.googlesource.com/gvisor/pkg/abi" @@ -25,7 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/strace" - spb "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto" + spb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" ) func initCompatLogs(fd int) error { @@ -40,15 +42,27 @@ func initCompatLogs(fd int) error { type compatEmitter struct { sink *log.BasicLogger nameMap strace.SyscallMap + + // mu protects the fields below. + mu sync.Mutex + + // trackers map syscall number to the respective tracker instance. + // Protected by 'mu'. 
+ trackers map[uint64]syscallTracker } func newCompatEmitter(logFD int) (*compatEmitter, error) { - // Always logs to default logger. nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64) if !ok { return nil, fmt.Errorf("amd64 Linux syscall table not found") } - c := &compatEmitter{sink: log.Log(), nameMap: nameMap} + + c := &compatEmitter{ + // Always logs to default logger. + sink: log.Log(), + nameMap: nameMap, + trackers: make(map[uint64]syscallTracker), + } if logFD > 0 { f := os.NewFile(uintptr(logFD), "user log file") @@ -61,10 +75,33 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) { // Emit implements eventchannel.Emitter. func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { // Only interested in UnimplementedSyscall, skip the rest. - if us, ok := msg.(*spb.UnimplementedSyscall); ok { - regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 - sysnr := regs.OrigRax + us, ok := msg.(*spb.UnimplementedSyscall) + if !ok { + return false, nil + } + regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 + + c.mu.Lock() + defer c.mu.Unlock() + + sysnr := regs.OrigRax + tr := c.trackers[sysnr] + if tr == nil { + switch sysnr { + case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL: + tr = newCmdTracker(0) + + case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL: + tr = newCmdTracker(1) + + default: + tr = &onceTracker{} + } + c.trackers[sysnr] = tr + } + if tr.shouldReport(regs) { c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs) + tr.onReported(regs) } return false, nil } @@ -74,3 +111,26 @@ func (c *compatEmitter) Close() error { c.sink = nil return nil } + +// syscallTracker interface allows filters to apply differently depending on +// the syscall and arguments. +type syscallTracker interface { + // shouldReport returns true if the syscall should be reported. + shouldReport(regs *rpb.AMD64Registers) bool + + // onReported marks the syscall as reported. 
+ onReported(regs *rpb.AMD64Registers) +} + +// onceTracker reports only a single time, used for most syscalls. +type onceTracker struct { + reported bool +} + +func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool { + return !o.reported +} + +func (o *onceTracker) onReported(_ *rpb.AMD64Registers) { + o.reported = true +} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go new file mode 100644 index 000000000..2bb769a49 --- /dev/null +++ b/runsc/boot/compat_amd64.go @@ -0,0 +1,54 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" +) + +// cmdTracker reports only a single time for each different command argument in +// the syscall. It's used for generic syscalls like ioctl to report once per +// 'cmd' +type cmdTracker struct { + // argIdx is the syscall argument index where the command is located. + argIdx int + cmds map[uint32]struct{} +} + +func newCmdTracker(argIdx int) *cmdTracker { + return &cmdTracker{argIdx: argIdx, cmds: make(map[uint32]struct{})} +} + +// cmd returns the command based on the syscall argument index. 
+func (c *cmdTracker) cmd(regs *rpb.AMD64Registers) uint32 { + switch c.argIdx { + case 0: + return uint32(regs.Rdi) + case 1: + return uint32(regs.Rsi) + } + panic(fmt.Sprintf("unsupported syscall argument index %d", c.argIdx)) +} + +func (c *cmdTracker) shouldReport(regs *rpb.AMD64Registers) bool { + _, ok := c.cmds[c.cmd(regs)] + return !ok +} + +func (c *cmdTracker) onReported(regs *rpb.AMD64Registers) { + c.cmds[c.cmd(regs)] = struct{}{} +} diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go new file mode 100644 index 000000000..30b94798a --- /dev/null +++ b/runsc/boot/compat_test.go @@ -0,0 +1,66 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package boot + +import ( + "testing" + + rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" +) + +func TestOnceTracker(t *testing.T) { + o := onceTracker{} + if !o.shouldReport(nil) { + t.Error("first call to checkAndMark, got: false, want: true") + } + o.onReported(nil) + for i := 0; i < 2; i++ { + if o.shouldReport(nil) { + t.Error("after first call to checkAndMark, got: true, want: false") + } + } +} + +func TestCmdTracker(t *testing.T) { + for _, tc := range []struct { + name string + idx int + rdi1 uint64 + rdi2 uint64 + rsi1 uint64 + rsi2 uint64 + want bool + }{ + {name: "same rdi", idx: 0, rdi1: 123, rdi2: 123, want: false}, + {name: "same rsi", idx: 1, rsi1: 123, rsi2: 123, want: false}, + {name: "diff rdi", idx: 0, rdi1: 123, rdi2: 321, want: true}, + {name: "diff rsi", idx: 1, rsi1: 123, rsi2: 321, want: true}, + {name: "cmd is uint32", idx: 0, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, + } { + t.Run(tc.name, func(t *testing.T) { + c := newCmdTracker(tc.idx) + regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1} + if !c.shouldReport(regs) { + t.Error("first call to checkAndMark, got: false, want: true") + } + c.onReported(regs) + + regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2 + if got := c.shouldReport(regs); tc.want != got { + t.Errorf("after first call to checkAndMark, got: %t, want: %t", got, tc.want) + } + }) + } +} -- cgit v1.2.3 From d7c11c741752813e56b7d8726a575a520260c56a Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Sat, 20 Oct 2018 17:57:19 -0700 Subject: Refcount Unix transport queue This allows us to release messages in the queue when all users close. 
PiperOrigin-RevId: 218033550 Change-Id: I2f6e87650fced87a3977e3b74c64775c7b885c1b --- pkg/sentry/socket/unix/transport/BUILD | 1 + pkg/sentry/socket/unix/transport/connectioned.go | 8 +++++++- pkg/sentry/socket/unix/transport/connectionless.go | 6 +++++- pkg/sentry/socket/unix/transport/queue.go | 10 ++++++++++ pkg/sentry/socket/unix/transport/unix.go | 12 +++++++++--- 5 files changed, 32 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 28038ce7f..5bc01e3c8 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -29,6 +29,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/ilist", + "//pkg/refs", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/waiter", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 4c913effc..83b50459f 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -145,10 +145,12 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { b.receiver = &queueReceiver{q2} } + q2.IncRef() a.connected = &connectedEndpoint{ endpoint: b, writeQueue: q2, } + q1.IncRef() b.connected = &connectedEndpoint{ endpoint: a, writeQueue: q1, @@ -282,12 +284,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur idGenerator: e.idGenerator, stype: e.stype, } + readQueue := newQueue(ce.WaiterQueue(), ne.Queue, initialLimit) - writeQueue := newQueue(ne.Queue, ce.WaiterQueue(), initialLimit) ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } + + writeQueue := newQueue(ne.Queue, ce.WaiterQueue(), initialLimit) if e.stype == SockStream { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { @@ -297,10 +301,12 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce 
ConnectingEndpoint, retur select { case e.acceptedChan <- ne: // Commit state. + writeQueue.IncRef() connected := &connectedEndpoint{ endpoint: ne, writeQueue: writeQueue, } + readQueue.IncRef() if e.stype == SockStream { returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected) } else { diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index cd4633106..376e4abb2 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -82,9 +82,13 @@ func (e *connectionlessEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *tc if r == nil { return nil, tcpip.ErrConnectionRefused } + q := r.(*queueReceiver).readQueue + if !q.TryIncRef() { + return nil, tcpip.ErrConnectionRefused + } return &connectedEndpoint{ endpoint: e, - writeQueue: r.(*queueReceiver).readQueue, + writeQueue: q, }, nil } diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index 5b4dfab68..72aa409ab 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -17,6 +17,7 @@ package transport import ( "sync" + "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -25,6 +26,8 @@ import ( // // +stateify savable type queue struct { + refs.AtomicRefCount + ReaderQueue *waiter.Queue WriterQueue *waiter.Queue @@ -67,6 +70,13 @@ func (q *queue) Reset() { q.mu.Unlock() } +// DecRef implements RefCounter.DecRef with destructor q.Reset. +func (q *queue) DecRef() { + q.DecRefWithDestructor(q.Reset) + // We don't need to notify after resetting because no one cares about + // this queue after all references have been dropped. +} + // IsReadable determines if q is currently readable. 
func (q *queue) IsReadable() bool { q.mu.Lock() diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 157133b65..765cca27a 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -381,7 +381,9 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 { } // Release implements Receiver.Release. -func (*queueReceiver) Release() {} +func (q *queueReceiver) Release() { + q.readQueue.DecRef() +} // streamQueueReceiver implements Receiver for stream sockets. // @@ -694,7 +696,9 @@ func (e *connectedEndpoint) SendMaxQueueSize() int64 { } // Release implements ConnectedEndpoint.Release. -func (*connectedEndpoint) Release() {} +func (e *connectedEndpoint) Release() { + e.writeQueue.DecRef() +} // baseEndpoint is an embeddable unix endpoint base used in both the connected and connectionless // unix domain socket Endpoint implementations. @@ -945,4 +949,6 @@ func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { } // Release implements BoundEndpoint.Release. -func (*baseEndpoint) Release() {} +func (*baseEndpoint) Release() { + // Binding a baseEndpoint doesn't take a reference. +} -- cgit v1.2.3 From 75cd70ecc9abfd5daaefea04da5070a0e0d620dd Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 23 Oct 2018 00:19:11 -0700 Subject: Track paths and provide a rename hook. This change also adds extensive testing to the p9 package via mocks. The sanity checks and type checks are moved from the gofer into the core package, where they can be more easily validated. 
PiperOrigin-RevId: 218296768 Change-Id: I4fc3c326e7bf1e0e140a454cbacbcc6fd617ab55 --- WORKSPACE | 20 +- pkg/amutex/BUILD | 4 +- pkg/atomicbitops/BUILD | 4 +- pkg/binary/BUILD | 4 +- pkg/bits/BUILD | 3 +- pkg/compressio/BUILD | 4 +- pkg/control/client/BUILD | 4 +- pkg/control/server/BUILD | 4 +- pkg/dhcp/BUILD | 4 +- pkg/eventchannel/BUILD | 4 +- pkg/fd/BUILD | 4 +- pkg/gate/BUILD | 4 +- pkg/ilist/BUILD | 4 +- pkg/linewriter/BUILD | 4 +- pkg/log/BUILD | 4 +- pkg/metric/BUILD | 4 +- pkg/p9/BUILD | 2 + pkg/p9/buffer_test.go | 31 + pkg/p9/client.go | 6 + pkg/p9/client_file.go | 4 +- pkg/p9/file.go | 151 +- pkg/p9/handlers.go | 697 ++++++-- pkg/p9/local_server/BUILD | 4 +- pkg/p9/local_server/local_server.go | 5 + pkg/p9/messages_test.go | 37 + pkg/p9/p9.go | 24 + pkg/p9/p9test/BUILD | 76 +- pkg/p9/p9test/client_test.go | 2263 ++++++++++++++++++++++--- pkg/p9/p9test/mocks.go | 489 ------ pkg/p9/p9test/p9test.go | 329 ++++ pkg/p9/path_tree.go | 109 ++ pkg/p9/server.go | 228 ++- pkg/p9/transport.go | 10 +- pkg/rand/BUILD | 4 +- pkg/seccomp/BUILD | 4 +- pkg/secio/BUILD | 4 +- pkg/sentry/arch/BUILD | 3 +- pkg/sentry/context/BUILD | 4 +- pkg/sentry/control/BUILD | 4 +- pkg/sentry/device/BUILD | 4 +- pkg/sentry/fs/anon/BUILD | 4 +- pkg/sentry/fs/gofer/BUILD | 4 - pkg/sentry/fs/gofer/context_file.go | 7 - pkg/sentry/fs/gofer/gofer_test.go | 894 ++-------- pkg/sentry/fs/gofer/session.go | 9 +- pkg/sentry/fs/gofer/session_state.go | 4 +- pkg/sentry/fs/proc/device/BUILD | 4 +- pkg/sentry/hostcpu/BUILD | 4 +- pkg/sentry/kernel/kdefs/BUILD | 4 +- pkg/sentry/kernel/memevent/BUILD | 4 +- pkg/sentry/kernel/sched/BUILD | 4 +- pkg/sentry/loader/BUILD | 3 +- pkg/sentry/memutil/BUILD | 4 +- pkg/sentry/platform/interrupt/BUILD | 4 +- pkg/sentry/platform/kvm/BUILD | 3 +- pkg/sentry/platform/kvm/testutil/BUILD | 4 +- pkg/sentry/platform/procid/BUILD | 4 +- pkg/sentry/platform/ptrace/BUILD | 4 +- pkg/sentry/platform/ring0/BUILD | 3 +- pkg/sentry/platform/ring0/gen_offsets/BUILD | 3 +- 
pkg/sentry/platform/ring0/pagetables/BUILD | 3 +- pkg/sentry/platform/safecopy/BUILD | 4 +- pkg/sentry/safemem/BUILD | 4 +- pkg/sentry/sighandling/BUILD | 4 +- pkg/sentry/socket/rpcinet/BUILD | 4 +- pkg/sentry/socket/rpcinet/conn/BUILD | 4 +- pkg/sentry/socket/rpcinet/notifier/BUILD | 4 +- pkg/sentry/state/BUILD | 4 +- pkg/sentry/strace/BUILD | 4 +- pkg/sentry/syscalls/BUILD | 4 +- pkg/sentry/time/BUILD | 3 +- pkg/sentry/unimpl/BUILD | 4 +- pkg/sentry/uniqueid/BUILD | 4 +- pkg/sentry/watchdog/BUILD | 4 +- pkg/sleep/BUILD | 4 +- pkg/state/BUILD | 5 +- pkg/state/statefile/BUILD | 4 +- pkg/sync/atomicptrtest/BUILD | 3 +- pkg/sync/seqatomictest/BUILD | 3 +- pkg/syserr/BUILD | 4 +- pkg/syserror/BUILD | 4 +- pkg/tcpip/adapters/gonet/BUILD | 4 +- pkg/tcpip/checker/BUILD | 4 +- pkg/tcpip/link/channel/BUILD | 4 +- pkg/tcpip/link/fdbased/BUILD | 4 +- pkg/tcpip/link/loopback/BUILD | 4 +- pkg/tcpip/link/rawfile/BUILD | 4 +- pkg/tcpip/link/sharedmem/BUILD | 4 +- pkg/tcpip/link/sharedmem/pipe/BUILD | 4 +- pkg/tcpip/link/sharedmem/queue/BUILD | 4 +- pkg/tcpip/link/sniffer/BUILD | 4 +- pkg/tcpip/link/tun/BUILD | 4 +- pkg/tcpip/link/waitable/BUILD | 4 +- pkg/tcpip/network/BUILD | 4 +- pkg/tcpip/network/arp/BUILD | 4 +- pkg/tcpip/network/hash/BUILD | 4 +- pkg/tcpip/network/ipv4/BUILD | 4 +- pkg/tcpip/network/ipv6/BUILD | 4 +- pkg/tcpip/ports/BUILD | 4 +- pkg/tcpip/sample/tun_tcp_connect/BUILD | 4 +- pkg/tcpip/sample/tun_tcp_echo/BUILD | 4 +- pkg/tcpip/transport/tcp/testing/context/BUILD | 4 +- pkg/tcpip/transport/tcpconntrack/BUILD | 4 +- pkg/tmutex/BUILD | 4 +- pkg/unet/BUILD | 4 +- pkg/urpc/BUILD | 4 +- pkg/waiter/fdnotifier/BUILD | 4 +- runsc/boot/BUILD | 4 +- runsc/boot/filter/BUILD | 4 +- runsc/cgroup/BUILD | 4 +- runsc/cmd/BUILD | 4 +- runsc/console/BUILD | 4 +- runsc/container/BUILD | 4 +- runsc/fsgofer/BUILD | 4 +- runsc/fsgofer/filter/BUILD | 4 +- runsc/fsgofer/fsgofer.go | 98 +- runsc/fsgofer/fsgofer_test.go | 78 +- runsc/sandbox/BUILD | 4 +- runsc/specutils/BUILD | 4 +- 
runsc/test/image/BUILD | 4 +- runsc/test/integration/BUILD | 4 +- runsc/test/root/BUILD | 4 +- runsc/test/testutil/BUILD | 4 +- runsc/tools/dockercfg/BUILD | 4 +- tools/go_generics/BUILD | 4 +- tools/go_generics/globals/BUILD | 4 +- tools/go_generics/rules_tests/BUILD | 3 +- tools/go_stateify/BUILD | 4 +- 128 files changed, 3834 insertions(+), 2147 deletions(-) create mode 100644 pkg/p9/buffer_test.go delete mode 100644 pkg/p9/p9test/mocks.go create mode 100644 pkg/p9/p9test/p9test.go create mode 100644 pkg/p9/path_tree.go (limited to 'pkg/sentry') diff --git a/WORKSPACE b/WORKSPACE index 48e0d3436..841a23e06 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -15,7 +15,7 @@ go_register_toolchains(go_version="1.11.1") load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository") gazelle_dependencies() -# Add dependencies on external repositories. +# External repositories, in sorted order. go_repository( name = "com_github_cenkalti_backoff", importpath = "github.com/cenkalti/backoff", @@ -28,6 +28,12 @@ go_repository( commit = "886344bea0798d02ff3fae16a922be5f6b26cee0" ) +go_repository( + name = "com_github_golang_mock", + importpath = "github.com/golang/mock", + commit = "600781dde9cca80734169b9e969d9054ccc57937", +) + go_repository( name = "com_github_google_go-cmp", importpath = "github.com/google/go-cmp", @@ -58,6 +64,12 @@ go_repository( commit = "b2d941ef6a780da2d9982c1fb28d77ad97f54fc7", ) +go_repository( + name = "com_github_syndtr_gocapability", + importpath = "github.com/syndtr/gocapability", + commit = "d98352740cb2c55f81556b63d4a1ec64c5a319c2", +) + go_repository( name = "com_github_vishvananda_netlink", importpath = "github.com/vishvananda/netlink", @@ -81,9 +93,3 @@ go_repository( importpath = "golang.org/x/sys", commit = "0dd5e194bbf5eb84a39666eb4c98a4d007e4203a", ) - -go_repository( - name = "com_github_syndtr_gocapability", - importpath = "github.com/syndtr/gocapability", - commit = "d98352740cb2c55f81556b63d4a1ec64c5a319c2", -) diff --git 
a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 84e6b79a5..815ee3a69 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "amutex", srcs = ["amutex.go"], diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index a8dd17825..235188531 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "atomicbitops", srcs = [ diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD index 586d05634..571151f72 100644 --- a/pkg/binary/BUILD +++ b/pkg/binary/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "binary", srcs = ["binary.go"], diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD index 8c943b615..46794bdb8 100644 --- a/pkg/bits/BUILD +++ b/pkg/bits/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_library( diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index d70f982c1..72952d735 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "compressio", srcs = ["compressio.go"], diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD index d58cd1b71..32853875d 100644 --- a/pkg/control/client/BUILD +++ b/pkg/control/client/BUILD @@ 
-1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "client", srcs = [ diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index c3f74a532..ba2b1be9f 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "server", srcs = ["server.go"], diff --git a/pkg/dhcp/BUILD b/pkg/dhcp/BUILD index 711a72c99..c97dfc14b 100644 --- a/pkg/dhcp/BUILD +++ b/pkg/dhcp/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "dhcp", srcs = [ diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 9d531ce12..18348ef54 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "eventchannel", srcs = [ diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD index 435b6fa34..06cfd445e 100644 --- a/pkg/fd/BUILD +++ b/pkg/fd/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "fd", srcs = ["fd.go"], diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index 872eff531..9a87a3a31 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "gate", srcs = [ diff --git 
a/pkg/ilist/BUILD b/pkg/ilist/BUILD index 1bd71b800..a67aa2cff 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ilist", srcs = [ diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index 6c3795432..3f28ba867 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "linewriter", srcs = ["linewriter.go"], diff --git a/pkg/log/BUILD b/pkg/log/BUILD index fc9281079..bf85b4494 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "log", srcs = [ diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index c0cd40c7b..d96e5563b 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "metric", srcs = ["metric.go"], diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index 1cf5c6458..2c224e65b 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -15,6 +15,7 @@ go_library( "handlers.go", "messages.go", "p9.go", + "path_tree.go", "pool.go", "server.go", "transport.go", @@ -32,6 +33,7 @@ go_test( name = "p9_test", size = "small", srcs = [ + "buffer_test.go", "client_test.go", "messages_test.go", "p9_test.go", diff --git a/pkg/p9/buffer_test.go b/pkg/p9/buffer_test.go new file mode 100644 index 
000000000..97eceefa7 --- /dev/null +++ b/pkg/p9/buffer_test.go @@ -0,0 +1,31 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package p9 + +import ( + "testing" +) + +func TestBufferOverrun(t *testing.T) { + buf := &buffer{ + // This header indicates that a large string should follow, but + // it is only two bytes. Reading a string should cause an + // overrun. + data: []byte{0x0, 0x16}, + } + if s := buf.ReadString(); s != "" { + t.Errorf("overrun read got %s, want empty", s) + } +} diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 3ebfab82a..67887874a 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -116,6 +116,7 @@ func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client msize: largestFixedSize, } } + // Compute a payload size and round to 512 (normal block size) // if it's larger than a single block. payloadSize := messageSize - largestFixedSize @@ -299,3 +300,8 @@ func (c *Client) sendRecv(t message, r message) error { func (c *Client) Version() uint32 { return c.version } + +// Close closes the underlying socket. +func (c *Client) Close() error { + return c.socket.Close() +} diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 066639fda..992d1daf7 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -172,6 +172,9 @@ func (c *clientFile) SetAttr(valid SetAttrMask, attr SetAttr) error { } // Remove implements File.Remove. +// +// N.B. 
This method is no longer part of the file interface and should be +// considered deprecated. func (c *clientFile) Remove() error { // Avoid double close. if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) { @@ -181,7 +184,6 @@ func (c *clientFile) Remove() error { // Send the remove message. if err := c.client.sendRecv(&Tremove{FID: c.fid}, &Rremove{}); err != nil { - log.Warningf("Tremove failed, losing FID %v: %v", c.fid, err) return err } diff --git a/pkg/p9/file.go b/pkg/p9/file.go index d2e89e373..55ceb52e1 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -31,35 +31,63 @@ type Attacher interface { // File is a set of operations corresponding to a single node. // -// Functions below MUST return syscall.Errno values. -// TODO: Enforce that with the type. +// Note that on the server side, the server logic places constraints on +// concurrent operations to make things easier. This may reduce the need for +// complex, error-prone locking and logic in the backend. These are documented +// for each method. // -// These must be implemented in all circumstances. +// There are three different types of guarantees provided: +// +// none: There is no concurrency guarantee. The method may be invoked +// concurrently with any other method on any other file. +// +// read: The method is guaranteed to be exclusive of any write or global +// operation that is mutating the state of the directory tree starting at this +// node. For example, this means creating new files, symlinks, directories or +// renaming a directory entry (or renaming in to this target), but the method +// may be called concurrently with other read methods. +// +// write: The method is guaranteed to be exclusive of any read, write or global +// operation that is mutating the state of the directory tree starting at this +// node, as described in read above. There may however, be other write +// operations executing concurrently on other components in the directory tree. 
+// +// global: The method is guaranteed to be exclusive of any read, write or +// global operation. type File interface { // Walk walks to the path components given in names. // // Walk returns QIDs in the same order that the names were passed in. // // An empty list of arguments should return a copy of the current file. + // + // On the server, Walk has a read concurrency guarantee. Walk(names []string) ([]QID, File, error) + // WalkGetAttr walks to the next file and returns its maximal set of + // attributes. + // + // Server-side p9.Files may return syscall.ENOSYS to indicate that Walk + // and GetAttr should be used separately to satisfy this request. + // + // On the server, WalkGetAttr has a read concurrency guarantee. + WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) + // StatFS returns information about the file system associated with // this file. + // + // On the server, StatFS has no concurrency guarantee. StatFS() (FSStat, error) // GetAttr returns attributes of this node. + // + // On the server, GetAttr has a read concurrency guarantee. GetAttr(req AttrMask) (QID, AttrMask, Attr, error) // SetAttr sets attributes on this node. - SetAttr(valid SetAttrMask, attr SetAttr) error - - // Remove removes the file. // - // This is deprecated in favor of UnlinkAt below. - Remove() error - - // Rename renames the file. - Rename(directory File, name string) error + // On the server, SetAttr has a write concurrency guarantee. + SetAttr(valid SetAttrMask, attr SetAttr) error // Close is called when all references are dropped on the server side, // and Close should be called by the client to drop all references. @@ -67,65 +95,93 @@ type File interface { // For server-side implementations of Close, the error is ignored. // // Close must be called even when Open has not been called. + // + // On the server, Close has no concurrency guarantee. Close() error - // Open is called prior to using read/write. 
+ // Open must be called prior to using Read, Write or Readdir. Once Open + // is called, some operations, such as Walk, will no longer work. // - // The *fd.FD may be nil. If an *fd.FD is provided, ownership now - // belongs to the caller and the FD must be non-blocking. + // On the client, Open should be called only once. The fd return is + // optional, and may be nil. // - // If Open returns a non-nil *fd.FD, it should do so for all possible - // OpenFlags. If Open returns a nil *fd.FD, it should similarly return - // a nil *fd.FD for all possible OpenFlags. + // On the server, Open has a read concurrency guarantee. If an *fd.FD + // is provided, ownership now belongs to the caller. Open is guaranteed + // to be called only once. // - // This can be assumed to be one-shot only. + // N.B. The server must resolve any lazy paths when open is called. + // After this point, read and write may be called on files with no + // deletion check, so resolving in the data path is not viable. Open(mode OpenFlags) (*fd.FD, QID, uint32, error) - // Read reads from this file. + // Read reads from this file. Open must be called first. // // This may return io.EOF in addition to syscall.Errno values. // - // Preconditions: Open has been called and returned success. + // On the server, ReadAt has a read concurrency guarantee. See Open for + // additional requirements regarding lazy path resolution. ReadAt(p []byte, offset uint64) (int, error) - // Write writes to this file. + // Write writes to this file. Open must be called first. // // This may return io.EOF in addition to syscall.Errno values. // - // Preconditions: Open has been called and returned success. + // On the server, WriteAt has a read concurrency guarantee. See Open + // for additional requirements regarding lazy path resolution. WriteAt(p []byte, offset uint64) (int, error) - // FSync syncs this node. + // FSync syncs this node. Open must be called first. 
// - // Preconditions: Open has been called and returned success. + // On the server, FSync has a read concurrency guarantee. FSync() error // Create creates a new regular file and opens it according to the - // flags given. + // flags given. This file is already Open. + // + // N.B. On the client, the returned file is a reference to the current + // file, which now represents the created file. This is not the case on + // the server. These semantics are very subtle and can easily lead to + // bugs, but are a consequence of the 9P create operation. // // See p9.File.Open for a description of *fd.FD. + // + // On the server, Create has a write concurrency guarantee. Create(name string, flags OpenFlags, permissions FileMode, uid UID, gid GID) (*fd.FD, File, QID, uint32, error) // Mkdir creates a subdirectory. + // + // On the server, Mkdir has a write concurrency guarantee. Mkdir(name string, permissions FileMode, uid UID, gid GID) (QID, error) // Symlink makes a new symbolic link. - Symlink(oldname string, newname string, uid UID, gid GID) (QID, error) + // + // On the server, Symlink has a write concurrency guarantee. + Symlink(oldName string, newName string, uid UID, gid GID) (QID, error) // Link makes a new hard link. - Link(target File, newname string) error + // + // On the server, Link has a write concurrency guarantee. + Link(target File, newName string) error // Mknod makes a new device node. + // + // On the server, Mknod has a write concurrency guarantee. Mknod(name string, permissions FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) + // Rename renames the file. + // + // Rename will never be called on the server, and RenameAt will always + // be used instead. + Rename(newDir File, newName string) error + // RenameAt renames a given file to a new name in a potentially new // directory. // - // oldname must be a name relative to this file, which must be a - // directory. newname is a name relative to newdir. 
+ // oldName must be a name relative to this file, which must be a + // directory. newName is a name relative to newDir. // - // This is deprecated in favor of Rename. - RenameAt(oldname string, newdir File, newname string) error + // On the server, RenameAt has a global concurrency guarantee. + RenameAt(oldName string, newDir File, newName string) error // UnlinkAt the given named file. // @@ -133,16 +189,20 @@ type File interface { // // Flags are implementation-specific (e.g. O_DIRECTORY), but are // generally Linux unlinkat(2) flags. + // + // On the server, UnlinkAt has a write concurrency guarantee. UnlinkAt(name string, flags uint32) error // Readdir reads directory entries. // // This may return io.EOF in addition to syscall.Errno values. // - // Preconditions: Open has been called and returned success. + // On the server, Readdir has a read concurrency guarantee. Readdir(offset uint64, count uint32) ([]Dirent, error) // Readlink reads the link target. + // + // On the server, Readlink has a read concurrency guarantee. Readlink() (string, error) // Flush is called prior to Close. @@ -150,16 +210,11 @@ type File interface { // Whereas Close drops all references to the file, Flush cleans up the // file state. Behavior is implementation-specific. // - // Flush is not related to flush(9p). Flush is an extension to 9P2000.L, + // Flush is not related to flush(9p). Flush is an extension to 9P2000.L, // see version.go. - Flush() error - - // WalkGetAttr walks to the next file and returns its maximal set of - // attributes. // - // Server-side p9.Files may return syscall.ENOSYS to indicate that Walk - // and GetAttr should be used separately to satisfy this request. - WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) + // On the server, Flush has a read concurrency guarantee. + Flush() error // Connect establishes a new host-socket backed connection with a // socket. 
A File does not need to be opened before it can be connected @@ -170,8 +225,22 @@ type File interface { // // The returned FD must be non-blocking. // - // flags indicates the requested type of socket. + // Flags indicates the requested type of socket. + // + // On the server, Connect has a read concurrency guarantee. Connect(flags ConnectFlags) (*fd.FD, error) + + // Renamed is called when this node is renamed. + // + // This may not fail. The file will hold a reference to its parent + // within the p9 package, and is therefore safe to use for the lifetime + // of this File (until Close is called). + // + // This method should not be called by clients, who should use the + // relevant Rename methods. (Although the method will be a no-op.) + // + // On the server, Renamed has a global concurrency guarantee. + Renamed(newDir File, newName string) } // DefaultWalkGetAttr implements File.WalkGetAttr to return ENOSYS for server-side Files. diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 959dff31d..0d7a6138f 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -15,6 +15,7 @@ package p9 import ( + "fmt" "io" "os" "path" @@ -22,22 +23,43 @@ import ( "sync/atomic" "syscall" + "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/log" ) -// newErr returns a new error message from an error. -func newErr(err error) *Rlerror { +const maximumNameLength = 255 + +// ExtractErrno extracts a syscall.Errno from a error, best effort. +func ExtractErrno(err error) syscall.Errno { + switch err { + case os.ErrNotExist: + return syscall.ENOENT + case os.ErrExist: + return syscall.EEXIST + case os.ErrPermission: + return syscall.EACCES + case os.ErrInvalid: + return syscall.EINVAL + } + + // Attempt to unwrap. 
switch e := err.(type) { case syscall.Errno: - return &Rlerror{Error: uint32(e)} + return e case *os.PathError: - return newErr(e.Err) + return ExtractErrno(e.Err) case *os.SyscallError: - return newErr(e.Err) - default: - log.Warningf("unknown error: %v", err) - return &Rlerror{Error: uint32(syscall.EIO)} + return ExtractErrno(e.Err) } + + // Default case. + log.Warningf("unknown error: %v", err) + return syscall.EIO +} + +// newErr returns a new error message from an error. +func newErr(err error) *Rlerror { + return &Rlerror{Error: uint32(ExtractErrno(err))} } // handler is implemented for server-handled messages. @@ -85,13 +107,15 @@ func (t *Tflush) handle(cs *connState) message { return &Rflush{} } -// isSafeName returns true iff the name does not contain directory characters. -// -// We permit walks only on safe names and store the sequence of paths used for -// any given walk in each FID. (This is immutable.) We use this to mark -// relevant FIDs as moved when a successful rename occurs. -func isSafeName(name string) bool { - return name != "" && !strings.Contains(name, "/") && name != "." && name != ".." +// checkSafeName validates the name and returns nil or returns an error. +func checkSafeName(name string) error { + if name == "" || strings.Contains(name, "/") || name == "." || name == ".." { + return syscall.EINVAL + } + if len(name) > maximumNameLength { + return syscall.ENAMETOOLONG + } + return nil } // handle implements handler.handle. @@ -110,22 +134,54 @@ func (t *Tremove) handle(cs *connState) message { } defer ref.DecRef() + // Frustratingly, because we can't be guaranteed that a rename is not + // occurring simultaneously with this removal, we need to acquire the + // global rename lock for this kind of remove operation to ensure that + // ref.parent does not change out from underneath us. + // + // This is why Tremove is a bad idea, and clients should generally use + // Tunlinkat. All p9 clients will use Tunlinkat. 
+ err := ref.safelyGlobal(func() error { + // Is this a root? Can't remove that. + if ref.isRoot() { + return syscall.EINVAL + } + + // N.B. this remove operation is permitted, even if the file is open. + // See also rename below for reasoning. + + // Is this file already deleted? + if ref.isDeleted() { + return syscall.EINVAL + } + + // Retrieve the file's proper name. + name := ref.parent.pathNode.nameFor(ref) + + // Attempt the removal. + if err := ref.parent.file.UnlinkAt(name, 0); err != nil { + return err + } + + // Mark all relevant fids as deleted. We don't need to lock any + // individual nodes because we already hold the global lock. + ref.parent.markChildDeleted(name) + return nil + }) + // "The remove request asks the file server both to remove the file // represented by fid and to clunk the fid, even if the remove fails." // // "It is correct to consider remove to be a clunk with the side effect // of removing the file if permissions allow." // https://swtch.com/plan9port/man/man9/remove.html - err := ref.file.Remove() - - // Clunk the FID regardless of Remove error. if !cs.DeleteFID(t.FID) { return newErr(syscall.EBADF) } - if err != nil { return newErr(err) } + return &Rremove{} } @@ -168,9 +224,12 @@ func (t *Tattach) handle(cs *connState) message { // Build a transient reference. root := &fidRef{ + server: cs.server, + parent: nil, file: sf, refs: 1, - walkable: attr.Mode.IsDir(), + mode: attr.Mode.FileType(), + pathNode: &cs.server.pathTree, } defer root.DecRef() @@ -183,20 +242,24 @@ func (t *Tattach) handle(cs *connState) message { // We want the same traversal checks to apply on attach, so always // attach at the root and use the regular walk paths. names := strings.Split(t.Auth.AttachName, "/") - _, target, _, attr, err := doWalk(cs, root, names) + _, newRef, _, attr, err := doWalk(cs, root, names) if err != nil { return newErr(err) } + defer newRef.DecRef() // Insert the FID. 
- cs.InsertFID(t.FID, &fidRef{ - file: target, - walkable: attr.Mode.IsDir(), - }) - + cs.InsertFID(t.FID, newRef) return &Rattach{} } +// CanOpen returns whether this file open can be opened, read and written to. +// +// This includes everything except symlinks and sockets. +func CanOpen(mode FileMode) bool { + return mode.IsRegular() || mode.IsDir() || mode.IsNamedPipe() || mode.IsBlockDevice() || mode.IsCharacterDevice() +} + // handle implements handler.handle. func (t *Tlopen) handle(cs *connState) message { // Lookup the FID. @@ -210,13 +273,35 @@ func (t *Tlopen) handle(cs *connState) message { defer ref.openedMu.Unlock() // Has it been opened already? - if ref.opened { + if ref.opened || !CanOpen(ref.mode) { return newErr(syscall.EINVAL) } - // Do the open. - osFile, qid, ioUnit, err := ref.file.Open(t.Flags) - if err != nil { + // Are flags valid? + if t.Flags&^OpenFlagsModeMask != 0 { + return newErr(syscall.EINVAL) + } + + // Is this an attempt to open a directory as writable? Don't accept. + if ref.mode.IsDir() && t.Flags != ReadOnly { + return newErr(syscall.EINVAL) + } + + var ( + qid QID + ioUnit uint32 + osFile *fd.FD + ) + if err := ref.safelyRead(func() (err error) { + // Has it been deleted already? + if ref.isDeleted() { + return syscall.EINVAL + } + + // Do the open. + osFile, qid, ioUnit, err = ref.file.Open(t.Flags) + return err + }); err != nil { return newErr(err) } @@ -229,8 +314,8 @@ func (t *Tlopen) handle(cs *connState) message { func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { // Don't allow complex names. - if !isSafeName(t.Name) { - return nil, syscall.EINVAL + if err := checkSafeName(t.Name); err != nil { + return nil, err } // Lookup the FID. @@ -240,20 +325,48 @@ func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { } defer ref.DecRef() - // Do the create. 
- osFile, nsf, qid, ioUnit, err := ref.file.Create(t.Name, t.OpenFlags, t.Permissions, uid, t.GID) - if err != nil { + var ( + osFile *fd.FD + nsf File + qid QID + ioUnit uint32 + newRef *fidRef + ) + if err := ref.safelyWrite(func() (err error) { + // Don't allow creation from non-directories or deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the create. + osFile, nsf, qid, ioUnit, err = ref.file.Create(t.Name, t.OpenFlags, t.Permissions, uid, t.GID) + if err != nil { + return err + } + + newRef = &fidRef{ + server: cs.server, + parent: ref, + file: nsf, + opened: true, + openFlags: t.OpenFlags, + mode: ModeRegular, + pathNode: ref.pathNode.pathNodeFor(t.Name), + } + ref.pathNode.addChild(newRef, t.Name) + ref.IncRef() // Acquire parent reference. + return nil + }); err != nil { return nil, err } // Replace the FID reference. - // - // The new file will be opened already. - cs.InsertFID(t.FID, &fidRef{ - file: nsf, - opened: true, - openFlags: t.OpenFlags, - }) + cs.InsertFID(t.FID, newRef) return &Rlcreate{Rlopen: Rlopen{QID: qid, IoUnit: ioUnit, File: osFile}}, nil } @@ -278,8 +391,8 @@ func (t *Tsymlink) handle(cs *connState) message { func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { // Don't allow complex names. - if !isSafeName(t.Name) { - return nil, syscall.EINVAL + if err := checkSafeName(t.Name); err != nil { + return nil, err } // Lookup the FID. @@ -289,9 +402,22 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { } defer ref.DecRef() - // Do the symlink. - qid, err := ref.file.Symlink(t.Target, t.Name, uid, t.GID) - if err != nil { + var qid QID + if err := ref.safelyWrite(func() (err error) { + // Don't allow symlinks from non-directories or deleted directories. 
+ if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the symlink. + qid, err = ref.file.Symlink(t.Target, t.Name, uid, t.GID) + return err + }); err != nil { return nil, err } @@ -301,8 +427,8 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { // handle implements handler.handle. func (t *Tlink) handle(cs *connState) message { // Don't allow complex names. - if !isSafeName(t.Name) { - return newErr(syscall.EINVAL) + if err := checkSafeName(t.Name); err != nil { + return newErr(err) } // Lookup the FID. @@ -319,8 +445,20 @@ func (t *Tlink) handle(cs *connState) message { } defer refTarget.DecRef() - // Do the link. - if err := ref.file.Link(refTarget.file, t.Name); err != nil { + if err := ref.safelyWrite(func() (err error) { + // Don't allow create links from non-directories or deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the link. + return ref.file.Link(refTarget.file, t.Name) + }); err != nil { return newErr(err) } @@ -330,8 +468,11 @@ func (t *Tlink) handle(cs *connState) message { // handle implements handler.handle. func (t *Trenameat) handle(cs *connState) message { // Don't allow complex names. - if !isSafeName(t.OldName) || !isSafeName(t.NewName) { - return newErr(syscall.EINVAL) + if err := checkSafeName(t.OldName); err != nil { + return newErr(err) + } + if err := checkSafeName(t.NewName); err != nil { + return newErr(err) } // Lookup the FID. @@ -348,8 +489,32 @@ func (t *Trenameat) handle(cs *connState) message { } defer refTarget.DecRef() - // Do the rename. - if err := ref.file.RenameAt(t.OldName, refTarget.file, t.NewName); err != nil { + // Perform the rename holding the global lock. 
+ if err := ref.safelyGlobal(func() (err error) { + // Don't allow renaming across deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() || refTarget.isDeleted() || !refTarget.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Is this the same file? If yes, short-circuit and return success. + if ref.pathNode == refTarget.pathNode && t.OldName == t.NewName { + return nil + } + + // Attempt the actual rename. + if err := ref.file.RenameAt(t.OldName, refTarget.file, t.NewName); err != nil { + return err + } + + // Update the path tree. + ref.renameChildTo(t.OldName, refTarget, t.NewName) + return nil + }); err != nil { return newErr(err) } @@ -359,8 +524,8 @@ func (t *Trenameat) handle(cs *connState) message { // handle implements handler.handle. func (t *Tunlinkat) handle(cs *connState) message { // Don't allow complex names. - if !isSafeName(t.Name) { - return newErr(syscall.EINVAL) + if err := checkSafeName(t.Name); err != nil { + return newErr(err) } // Lookup the FID. @@ -370,8 +535,40 @@ func (t *Tunlinkat) handle(cs *connState) message { } defer ref.DecRef() - // Do the unlink. - if err := ref.file.UnlinkAt(t.Name, t.Flags); err != nil { + if err := ref.safelyWrite(func() (err error) { + // Don't allow deletion from non-directories or deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Before we do the unlink itself, we need to ensure that there + // are no operations in flight on associated path node. The + // child's path node lock must be held to ensure that the + // unlink at marking the child deleted below is atomic with + // respect to any other read or write operations. 
+ // + // This is one case where we have a lock ordering issue, but + // since we always acquire deeper in the hierarchy, we know + // that we are free of lock cycles. + childPathNode := ref.pathNode.pathNodeFor(t.Name) + childPathNode.mu.Lock() + defer childPathNode.mu.Unlock() + + // Do the unlink. + err = ref.file.UnlinkAt(t.Name, t.Flags) + if err != nil { + return err + } + + // Mark the path as deleted. + ref.markChildDeleted(t.Name) + return nil + }); err != nil { return newErr(err) } @@ -381,8 +578,8 @@ func (t *Tunlinkat) handle(cs *connState) message { // handle implements handler.handle. func (t *Trename) handle(cs *connState) message { // Don't allow complex names. - if !isSafeName(t.Name) { - return newErr(syscall.EINVAL) + if err := checkSafeName(t.Name); err != nil { + return newErr(err) } // Lookup the FID. @@ -399,8 +596,43 @@ func (t *Trename) handle(cs *connState) message { } defer refTarget.DecRef() - // Call the rename method. - if err := ref.file.Rename(refTarget.file, t.Name); err != nil { + if err := ref.safelyGlobal(func() (err error) { + // Don't allow a root rename. + if ref.isRoot() { + return syscall.EINVAL + } + + // Don't allow renaming deleting entries, or target non-directories. + if ref.isDeleted() || refTarget.isDeleted() || !refTarget.mode.IsDir() { + return syscall.EINVAL + } + + // If the parent is deleted, but we not, something is seriously wrong. + // It's fail to die at this point with an assertion failure. + if ref.parent.isDeleted() { + panic(fmt.Sprintf("parent %+v deleted, child %+v is not", ref.parent, ref)) + } + + // N.B. The rename operation is allowed to proceed on open files. It + // does impact the state of its parent, but this is merely a sanity + // check in any case, and the operation is safe. There may be other + // files corresponding to the same path that are renamed anyways. + + // Check for the exact same file and short-circuit. 
+ oldName := ref.parent.pathNode.nameFor(ref) + if ref.parent.pathNode == refTarget.pathNode && oldName == t.Name { + return nil + } + + // Call the rename method on the parent. + if err := ref.parent.file.RenameAt(oldName, refTarget.file, t.Name); err != nil { + return err + } + + // Update the path tree. + ref.parent.renameChildTo(oldName, refTarget, t.Name) + return nil + }); err != nil { return newErr(err) } @@ -416,9 +648,19 @@ func (t *Treadlink) handle(cs *connState) message { } defer ref.DecRef() - // Do the read. - target, err := ref.file.Readlink() - if err != nil { + var target string + if err := ref.safelyRead(func() (err error) { + // Don't allow readlink on deleted files. There is no need to + // check if this file is opened because symlinks cannot be + // opened. + if ref.isDeleted() || !ref.mode.IsSymlink() { + return syscall.EINVAL + } + + // Do the read. + target, err = ref.file.Readlink() + return err + }); err != nil { return newErr(err) } @@ -434,26 +676,30 @@ func (t *Tread) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - openFlags, opened := ref.OpenFlags() - if !opened { - return newErr(syscall.EINVAL) - } - - // Can it be read? Check permissions. - if openFlags&OpenFlagsModeMask == WriteOnly { - return newErr(syscall.EPERM) - } - // Constrain the size of the read buffer. if int(t.Count) > int(maximumLength) { return newErr(syscall.ENOBUFS) } - // Do the read. - data := make([]byte, t.Count) - n, err := ref.file.ReadAt(data, t.Offset) - if err != nil && err != io.EOF { + var ( + data = make([]byte, t.Count) + n int + ) + if err := ref.safelyRead(func() (err error) { + // Has it been opened already? + openFlags, opened := ref.OpenFlags() + if !opened { + return syscall.EINVAL + } + + // Can it be read? Check permissions. 
+ if openFlags&OpenFlagsModeMask == WriteOnly { + return syscall.EPERM + } + + n, err = ref.file.ReadAt(data, t.Offset) + return err + }); err != nil && err != io.EOF { return newErr(err) } @@ -469,20 +715,22 @@ func (t *Twrite) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - openFlags, opened := ref.OpenFlags() - if !opened { - return newErr(syscall.EINVAL) - } + var n int + if err := ref.safelyRead(func() (err error) { + // Has it been opened already? + openFlags, opened := ref.OpenFlags() + if !opened { + return syscall.EINVAL + } - // Can it be write? Check permissions. - if openFlags&OpenFlagsModeMask == ReadOnly { - return newErr(syscall.EPERM) - } + // Can it be write? Check permissions. + if openFlags&OpenFlagsModeMask == ReadOnly { + return syscall.EPERM + } - // Do the write. - n, err := ref.file.WriteAt(t.Data, t.Offset) - if err != nil { + n, err = ref.file.WriteAt(t.Data, t.Offset) + return err + }); err != nil { return newErr(err) } @@ -500,8 +748,8 @@ func (t *Tmknod) handle(cs *connState) message { func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) { // Don't allow complex names. - if !isSafeName(t.Name) { - return nil, syscall.EINVAL + if err := checkSafeName(t.Name); err != nil { + return nil, err } // Lookup the FID. @@ -511,9 +759,22 @@ func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) { } defer ref.DecRef() - // Do the mknod. - qid, err := ref.file.Mknod(t.Name, t.Permissions, t.Major, t.Minor, uid, t.GID) - if err != nil { + var qid QID + if err := ref.safelyWrite(func() (err error) { + // Don't allow mknod on deleted files. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the mknod. 
+ qid, err = ref.file.Mknod(t.Name, t.Permissions, t.Major, t.Minor, uid, t.GID) + return err + }); err != nil { return nil, err } @@ -531,8 +792,8 @@ func (t *Tmkdir) handle(cs *connState) message { func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) { // Don't allow complex names. - if !isSafeName(t.Name) { - return nil, syscall.EINVAL + if err := checkSafeName(t.Name); err != nil { + return nil, err } // Lookup the FID. @@ -542,9 +803,22 @@ func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) { } defer ref.DecRef() - // Do the mkdir. - qid, err := ref.file.Mkdir(t.Name, t.Permissions, uid, t.GID) - if err != nil { + var qid QID + if err := ref.safelyWrite(func() (err error) { + // Don't allow mkdir on deleted files. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Not allowed on open directories. + if _, opened := ref.OpenFlags(); opened { + return syscall.EINVAL + } + + // Do the mkdir. + qid, err = ref.file.Mkdir(t.Name, t.Permissions, uid, t.GID) + return err + }); err != nil { return nil, err } @@ -560,9 +834,20 @@ func (t *Tgetattr) handle(cs *connState) message { } defer ref.DecRef() - // Get attributes. - qid, valid, attr, err := ref.file.GetAttr(t.AttrMask) - if err != nil { + // We allow getattr on deleted files. Depending on the backing + // implementation, it's possible that races exist that might allow + // fetching attributes of other files. But we need to generally allow + // refreshing attributes and this is a minor leak, if at all. + + var ( + qid QID + valid AttrMask + attr Attr + ) + if err := ref.safelyRead(func() (err error) { + qid, valid, attr, err = ref.file.GetAttr(t.AttrMask) + return err + }); err != nil { return newErr(err) } @@ -578,8 +863,18 @@ func (t *Tsetattr) handle(cs *connState) message { } defer ref.DecRef() - // Set attributes. 
- if err := ref.file.SetAttr(t.Valid, t.SetAttr); err != nil { + if err := ref.safelyWrite(func() error { + // We don't allow setattr on files that have been deleted. + // This might be technically incorrect, as it's possible that + // there were multiple links and you can still change the + // corresponding inode information. + if ref.isDeleted() { + return syscall.EINVAL + } + + // Set the attributes. + return ref.file.SetAttr(t.Valid, t.SetAttr) + }); err != nil { return newErr(err) } @@ -621,14 +916,25 @@ func (t *Treaddir) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - if _, opened := ref.OpenFlags(); !opened { - return newErr(syscall.EINVAL) - } + var entries []Dirent + if err := ref.safelyRead(func() (err error) { + // Don't allow reading deleted directories. + if ref.isDeleted() || !ref.mode.IsDir() { + return syscall.EINVAL + } + + // Has it been opened already? + if _, opened := ref.OpenFlags(); !opened { + return syscall.EINVAL + } - // Read the entries. - entries, err := ref.file.Readdir(t.Offset, t.Count) - if err != nil && err != io.EOF { + // Read the entries. + entries, err = ref.file.Readdir(t.Offset, t.Count) + if err != nil && err != io.EOF { + return err + } + return nil + }); err != nil { return newErr(err) } @@ -644,13 +950,15 @@ func (t *Tfsync) handle(cs *connState) message { } defer ref.DecRef() - // Has it been opened already? - if _, opened := ref.OpenFlags(); !opened { - return newErr(syscall.EINVAL) - } + if err := ref.safelyRead(func() (err error) { + // Has it been opened already? + if _, opened := ref.OpenFlags(); !opened { + return syscall.EINVAL + } - err := ref.file.FSync() - if err != nil { + // Perform the sync. + return ref.file.FSync() + }); err != nil { return newErr(err) } @@ -671,6 +979,11 @@ func (t *Tstatfs) handle(cs *connState) message { return newErr(err) } + // Constrain the name length. 
+ if st.NameLength > maximumNameLength { + st.NameLength = maximumNameLength + } + return &Rstatfs{st} } @@ -682,7 +995,7 @@ func (t *Tflushf) handle(cs *connState) message { } defer ref.DecRef() - if err := ref.file.Flush(); err != nil { + if err := ref.safelyRead(ref.file.Flush); err != nil { return newErr(err) } @@ -726,12 +1039,14 @@ func walkOne(qids []QID, from File, names []string) ([]QID, File, AttrMask, Attr // doWalk walks from a given fidRef. // -// This enforces that all intermediate nodes are walkable (directories). -func doWalk(cs *connState, ref *fidRef, names []string) (qids []QID, sf File, valid AttrMask, attr Attr, err error) { +// This enforces that all intermediate nodes are walkable (directories). The +// fidRef returned (newRef) has a reference associated with it that is now +// owned by the caller and must be handled appropriately. +func doWalk(cs *connState, ref *fidRef, names []string) (qids []QID, newRef *fidRef, valid AttrMask, attr Attr, err error) { // Check the names. for _, name := range names { - if !isSafeName(name) { - err = syscall.EINVAL + err = checkSafeName(name) + if err != nil { return } } @@ -745,44 +1060,88 @@ func doWalk(cs *connState, ref *fidRef, names []string) (qids []QID, sf File, va // Is this an empty list? Handle specially. We don't actually need to // validate anything since this is always permitted. if len(names) == 0 { - return walkOne(nil, ref.file, nil) - } - - // Is it walkable? - if !ref.walkable { - err = syscall.EINVAL - return + var sf File // Temporary. + if err := ref.maybeParent().safelyRead(func() (err error) { + // Clone the single element. + qids, sf, valid, attr, err = walkOne(nil, ref.file, nil) + if err != nil { + return err + } + + newRef = &fidRef{ + server: cs.server, + parent: ref.parent, + file: sf, + mode: ref.mode, + pathNode: ref.pathNode, + + // For the clone case, the cloned fid must + // preserve the deleted property of the + // original FID. 
+ deleted: ref.deleted, + } + if !ref.isRoot() { + if !newRef.isDeleted() { + // Add only if a non-root node; the same node. + ref.parent.pathNode.addChild(newRef, ref.parent.pathNode.nameFor(ref)) + } + ref.parent.IncRef() // Acquire parent reference. + } + // doWalk returns a reference. + newRef.IncRef() + return nil + }); err != nil { + return nil, nil, AttrMask{}, Attr{}, err + } + return qids, newRef, valid, attr, nil } - from := ref.file // Start at the passed ref. - // Do the walk, one element at a time. + walkRef := ref + walkRef.IncRef() for i := 0; i < len(names); i++ { - qids, sf, valid, attr, err = walkOne(qids, from, names[i:i+1]) - - // Close the intermediate file. Note that we don't close the - // first file because in that case we are walking from the - // existing reference. - if i > 0 { - from.Close() - } - from = sf // Use the new file. - - // Was there an error walking? - if err != nil { - return nil, nil, AttrMask{}, Attr{}, err - } - // We won't allow beyond past symlinks; stop here if this isn't // a proper directory and we have additional paths to walk. - if !valid.Mode || (!attr.Mode.IsDir() && i < len(names)-1) { - from.Close() // Not using the file object. + if !walkRef.mode.IsDir() { + walkRef.DecRef() // Drop walk reference; no lock required. return nil, nil, AttrMask{}, Attr{}, syscall.EINVAL } + + var sf File // Temporary. + if err := walkRef.safelyRead(func() (err error) { + qids, sf, valid, attr, err = walkOne(qids, walkRef.file, names[i:i+1]) + if err != nil { + return err + } + + // Note that we don't need to acquire a lock on any of + // these individual instances. That's because they are + // not actually addressable via a FID. They are + // anonymous. They exist in the tree for tracking + // purposes. 
+ newRef := &fidRef{ + server: cs.server, + parent: walkRef, + file: sf, + mode: attr.Mode.FileType(), + pathNode: walkRef.pathNode.pathNodeFor(names[i]), + } + walkRef.pathNode.addChild(newRef, names[i]) + // We allow our walk reference to become the new parent + // reference here and so we don't IncRef. Instead, just + // set walkRef to the newRef above and acquire a new + // walk reference. + walkRef = newRef + walkRef.IncRef() + return nil + }); err != nil { + walkRef.DecRef() // Drop the old walkRef. + return nil, nil, AttrMask{}, Attr{}, err + } } // Success. - return qids, sf, valid, attr, nil + return qids, walkRef, valid, attr, nil } // handle implements handler.handle. @@ -795,17 +1154,14 @@ func (t *Twalk) handle(cs *connState) message { defer ref.DecRef() // Do the walk. - qids, sf, _, attr, err := doWalk(cs, ref, t.Names) + qids, newRef, _, _, err := doWalk(cs, ref, t.Names) if err != nil { return newErr(err) } + defer newRef.DecRef() // Install the new FID. - cs.InsertFID(t.NewFID, &fidRef{ - file: sf, - walkable: attr.Mode.IsDir(), - }) - + cs.InsertFID(t.NewFID, newRef) return &Rwalk{QIDs: qids} } @@ -819,17 +1175,14 @@ func (t *Twalkgetattr) handle(cs *connState) message { defer ref.DecRef() // Do the walk. - qids, sf, valid, attr, err := doWalk(cs, ref, t.Names) + qids, newRef, valid, attr, err := doWalk(cs, ref, t.Names) if err != nil { return newErr(err) } + defer newRef.DecRef() // Install the new FID. - cs.InsertFID(t.NewFID, &fidRef{ - file: sf, - walkable: attr.Mode.IsDir(), - }) - + cs.InsertFID(t.NewFID, newRef) return &Rwalkgetattr{QIDs: qids, Valid: valid, Attr: attr} } @@ -878,9 +1231,17 @@ func (t *Tlconnect) handle(cs *connState) message { } defer ref.DecRef() - // Do the connect. - osFile, err := ref.file.Connect(t.Flags) - if err != nil { + var osFile *fd.FD + if err := ref.safelyRead(func() (err error) { + // Don't allow connecting to deleted files. 
+ if ref.isDeleted() || !ref.mode.IsSocket() { + return syscall.EINVAL + } + + // Do the connect. + osFile, err = ref.file.Connect(t.Flags) + return err + }); err != nil { return newErr(err) } diff --git a/pkg/p9/local_server/BUILD b/pkg/p9/local_server/BUILD index 8229e6308..b17ebb79d 100644 --- a/pkg/p9/local_server/BUILD +++ b/pkg/p9/local_server/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "local_server", srcs = ["local_server.go"], diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index 1e6aaa762..69b90c6cd 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -318,6 +318,11 @@ func (l *local) Connect(p9.ConnectFlags) (*fd.FD, error) { return nil, syscall.ECONNREFUSED } +// Renamed implements p9.File.Renamed. +func (l *local) Renamed(parent p9.File, newName string) { + l.path = path.Join(parent.(*local).path, newName) +} + func main() { log.SetLevel(log.Debug) diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index dfb41bb76..c0d65d82c 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -15,6 +15,7 @@ package p9 import ( + "fmt" "reflect" "testing" ) @@ -186,6 +187,13 @@ func TestEncodeDecode(t *testing.T) { &Rxattrwalk{ Size: 1, }, + &Txattrcreate{ + FID: 1, + Name: "a", + AttrSize: 2, + Flags: 3, + }, + &Rxattrcreate{}, &Treaddir{ Directory: 1, Offset: 2, @@ -389,3 +397,32 @@ func TestEncodeDecode(t *testing.T) { } } } + +func TestMessageStrings(t *testing.T) { + for typ, fn := range messageRegistry { + name := fmt.Sprintf("%+v", typ) + t.Run(name, func(t *testing.T) { + defer func() { // Ensure no panic. 
+ if r := recover(); r != nil { + t.Errorf("printing %s failed: %v", name, r) + } + }() + m := fn() + _ = fmt.Sprintf("%v", m) + err := ErrInvalidMsgType{typ} + _ = err.Error() + }) + } +} + +func TestRegisterDuplicate(t *testing.T) { + defer func() { + if r := recover(); r == nil { + // We expect a panic. + t.FailNow() + } + }() + + // Register a duplicate. + register(&Rlerror{}) +} diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 3b0993ecd..be644e7bf 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -984,6 +984,30 @@ func (s *SetAttr) Encode(b *buffer) { b.Write64(s.MTimeNanoSeconds) } +// Apply applies this to the given Attr. +func (a *Attr) Apply(mask SetAttrMask, attr SetAttr) { + if mask.Permissions { + a.Mode = a.Mode&^PermissionsMask | (attr.Permissions & PermissionsMask) + } + if mask.UID { + a.UID = attr.UID + } + if mask.GID { + a.GID = attr.GID + } + if mask.Size { + a.Size = attr.Size + } + if mask.ATime { + a.ATimeSeconds = attr.ATimeSeconds + a.ATimeNanoSeconds = attr.ATimeNanoSeconds + } + if mask.MTime { + a.MTimeSeconds = attr.MTimeSeconds + a.MTimeNanoSeconds = attr.MTimeNanoSeconds + } +} + // Dirent is used for readdir. type Dirent struct { // QID is the entry QID. diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index d6f428e11..7c4b875ce 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -1,16 +1,60 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +alias( + name = "mockgen", + actual = "@com_github_golang_mock//mockgen:mockgen", +) -go_test( - name = "p9test_test", - size = "small", - srcs = ["client_test.go"], - embed = [":p9test"], +MOCK_SRC_PACKAGE = "gvisor.googlesource.com/gvisor/pkg/p9" + +# mockgen_reflect is a source file that contains mock generation code that +# imports the p9 package and generates a specification via reflection. 
The +# usual generation path must be split into two distinct parts because the full +# source tree is not available to all build targets. Only declared depencies +# are available (and even then, not the Go source files). +genrule( + name = "mockgen_reflect", + testonly = 1, + outs = ["mockgen_reflect.go"], + cmd = ( + "$(location :mockgen) " + + "-package p9test " + + "-prog_only " + MOCK_SRC_PACKAGE + " " + + "Attacher,File > $@" + ), + tools = [":mockgen"], +) + +# mockgen_exec is the binary that includes the above reflection generator. +# Running this binary will emit an encoded version of the p9 Attacher and File +# structures. This is consumed by the mocks genrule, below. +go_binary( + name = "mockgen_exec", + testonly = 1, + srcs = ["mockgen_reflect.go"], deps = [ - "//pkg/fd", "//pkg/p9", - "//pkg/unet", + "@com_github_golang_mock//mockgen/model:go_default_library", + ], +) + +# mocks consumes the encoded output above, and generates the full source for a +# set of mocks. These are included directly in the p9test library. 
+genrule( + name = "mocks", + testonly = 1, + outs = ["mocks.go"], + cmd = ( + "$(location :mockgen) " + + "-package p9test " + + "-exec_only $(location :mockgen_exec) " + MOCK_SRC_PACKAGE + " File > $@" + ), + tools = [ + ":mockgen", + ":mockgen_exec", ], ) @@ -18,11 +62,27 @@ go_library( name = "p9test", srcs = [ "mocks.go", + "p9test.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/p9/p9test", visibility = ["//:sandbox"], + deps = [ + "//pkg/fd", + "//pkg/log", + "//pkg/p9", + "//pkg/unet", + "@com_github_golang_mock//gomock:go_default_library", + ], +) + +go_test( + name = "client_test", + size = "small", + srcs = ["client_test.go"], + embed = [":p9test"], deps = [ "//pkg/fd", "//pkg/p9", + "@com_github_golang_mock//gomock:go_default_library", ], ) diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index db562b9ba..242d81b95 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -15,360 +15,2059 @@ package p9test import ( - "io/ioutil" + "bytes" + "fmt" + "io" + "math/rand" "os" "reflect" + "strings" + "sync" "syscall" "testing" + "time" + "github.com/golang/mock/gomock" "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" ) -func TestDonateFD(t *testing.T) { - // Temporary file. - osFile, err := ioutil.TempFile("", "p9") +func TestPanic(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + // Create a new root. + d := h.NewDirectory(nil)(nil) + defer d.Close() // Needed manually. + h.Attacher.EXPECT().Attach().Return(d, nil).Do(func() { + // Panic here, and ensure that we get back EFAULT. + panic("handler") + }) + + // Attach to the client. + if _, err := c.Attach("/"); err != syscall.EFAULT { + t.Fatalf("got attach err %v, want EFAULT", err) + } +} + +func TestAttachNoLeak(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + // Create a new root. 
+ d := h.NewDirectory(nil)(nil) + h.Attacher.EXPECT().Attach().Return(d, nil).Times(1) + + // Attach to the client. + f, err := c.Attach("/") + if err != nil { + t.Fatalf("got attach err %v, want nil", err) + } + + // Don't close the file. This should be closed automatically when the + // client disconnects. The mock asserts that everything is closed + // exactly once. This statement just removes the unused variable error. + _ = f +} + +func TestBadAttach(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + // Return an error on attach. + h.Attacher.EXPECT().Attach().Return(nil, syscall.EINVAL).Times(1) + + // Attach to the client. + if _, err := c.Attach("/"); err != syscall.EINVAL { + t.Fatalf("got attach err %v, want syscall.EINVAL", err) + } +} + +func TestWalkAttach(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + // Create a new root. + d := h.NewDirectory(map[string]Generator{ + "a": h.NewDirectory(map[string]Generator{ + "b": h.NewFile(), + }), + })(nil) + h.Attacher.EXPECT().Attach().Return(d, nil).Times(1) + + // Attach to the client as a non-root, and ensure that the walk above + // occurs as expected. We should get back b, and all references should + // be dropped when the file is closed. + f, err := c.Attach("/a/b") + if err != nil { + t.Fatalf("got attach err %v, want nil", err) + } + defer f.Close() + + // Check that's a regular file. + if _, _, attr, err := f.GetAttr(p9.AttrMaskAll()); err != nil { + t.Errorf("got err %v, want nil", err) + } else if !attr.Mode.IsRegular() { + t.Errorf("got mode %v, want regular file", err) + } +} + +// newTypeMap returns a new type map dictionary. 
+func newTypeMap(h *Harness) map[string]Generator { + return map[string]Generator{ + "directory": h.NewDirectory(map[string]Generator{}), + "file": h.NewFile(), + "symlink": h.NewSymlink(), + "block-device": h.NewBlockDevice(), + "character-device": h.NewCharacterDevice(), + "named-pipe": h.NewNamedPipe(), + "socket": h.NewSocket(), + } +} + +// newRoot returns a new root filesystem. +// +// This is set up in a deterministic way for testing most operations. +// +// The represented file system looks like: +// - file +// - symlink +// - directory +// ... +// + one +// - file +// - symlink +// - directory +// ... +// + two +// - file +// - symlink +// - directory +// ... +// + three +// - file +// - symlink +// - directory +// ... +func newRoot(h *Harness, c *p9.Client) (*Mock, p9.File) { + root := newTypeMap(h) + one := newTypeMap(h) + two := newTypeMap(h) + three := newTypeMap(h) + one["two"] = h.NewDirectory(two) // Will be nested in one. + root["one"] = h.NewDirectory(one) // Top level. + root["three"] = h.NewDirectory(three) // Alternate top-level. + + // Create a new root. + rootBackend := h.NewDirectory(root)(nil) + h.Attacher.EXPECT().Attach().Return(rootBackend, nil) + + // Attach to the client. + r, err := c.Attach("/") + if err != nil { + h.t.Fatalf("got attach err %v, want nil", err) + } + + return rootBackend, r +} + +func allInvalidNames(from string) []string { + return []string{ + from + "/other", + from + "/..", + from + "/.", + from + "/", + "other/" + from, + "/" + from, + "./" + from, + "../" + from, + ".", + "..", + "/", + "", + } +} + +func TestWalkInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Run relevant tests. + for name := range newTypeMap(h) { + // These are all the various ways that one might attempt to + // construct compound paths. They should all be rejected, as + // any compound that contains a / is not allowed, as well as + // the singular paths of '.' 
and '..'. + if _, _, err := root.Walk([]string{".", name}); err != syscall.EINVAL { + t.Errorf("Walk through . %s wanted EINVAL, got %v", name, err) + } + if _, _, err := root.Walk([]string{"..", name}); err != syscall.EINVAL { + t.Errorf("Walk through . %s wanted EINVAL, got %v", name, err) + } + if _, _, err := root.Walk([]string{name, "."}); err != syscall.EINVAL { + t.Errorf("Walk through %s . wanted EINVAL, got %v", name, err) + } + if _, _, err := root.Walk([]string{name, ".."}); err != syscall.EINVAL { + t.Errorf("Walk through %s .. wanted EINVAL, got %v", name, err) + } + for _, invalidName := range allInvalidNames(name) { + if _, _, err := root.Walk([]string{invalidName}); err != syscall.EINVAL { + t.Errorf("Walk through %s wanted EINVAL, got %v", invalidName, err) + } + } + wantErr := syscall.EINVAL + if name == "directory" { + // We can attempt a walk through a directory. However, + // we should never see a file named "other", so we + // expect this to return ENOENT. + wantErr = syscall.ENOENT + } + if _, _, err := root.Walk([]string{name, "other"}); err != wantErr { + t.Errorf("Walk through %s/other wanted %v, got %v", name, wantErr, err) + } + + // Do a successful walk. + _, f, err := root.Walk([]string{name}) + if err != nil { + t.Errorf("Walk to %s wanted nil, got %v", name, err) + } + defer f.Close() + local := h.Pop(f) + + // Check that the file matches. + _, localMask, localAttr, localErr := local.GetAttr(p9.AttrMaskAll()) + if _, mask, attr, err := f.GetAttr(p9.AttrMaskAll()); mask != localMask || attr != localAttr || err != localErr { + t.Errorf("GetAttr got (%v, %v, %v), wanted (%v, %v, %v)", + mask, attr, err, localMask, localAttr, localErr) + } + + // Ensure we can't walk backwards. + if _, _, err := f.Walk([]string{"."}); err != syscall.EINVAL { + t.Errorf("Walk through %s/. wanted EINVAL, got %v", name, err) + } + if _, _, err := f.Walk([]string{".."}); err != syscall.EINVAL { + t.Errorf("Walk through %s/.. 
wanted EINVAL, got %v", name, err) + } + } +} + +// fileGenerator is a function to generate files via walk or create. +// +// Examples are: +// - walkHelper +// - walkAndOpenHelper +// - createHelper +type fileGenerator func(*Harness, string, p9.File) (*Mock, *Mock, p9.File) + +// walkHelper walks to the given file. +// +// The backends of the parent and walked file are returned, as well as the +// walked client file. +func walkHelper(h *Harness, name string, dir p9.File) (parentBackend *Mock, walkedBackend *Mock, walked p9.File) { + _, parent, err := dir.Walk(nil) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer parent.Close() + parentBackend = h.Pop(parent) + + _, walked, err = parent.Walk([]string{name}) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + walkedBackend = h.Pop(walked) + + return parentBackend, walkedBackend, walked +} + +// walkAndOpenHelper additionally opens the walked file, if possible. +func walkAndOpenHelper(h *Harness, name string, dir p9.File) (*Mock, *Mock, p9.File) { + parentBackend, walkedBackend, walked := walkHelper(h, name, dir) + if p9.CanOpen(walkedBackend.Attr.Mode) { + // Open for all file types that we can. We stick to a read-only + // open here because directories may not be opened otherwise. + walkedBackend.EXPECT().Open(p9.ReadOnly).Times(1) + if _, _, _, err := walked.Open(p9.ReadOnly); err != nil { + h.t.Errorf("got open err %v, want nil", err) + } + } else { + // ... or assert an error for others. + if _, _, _, err := walked.Open(p9.ReadOnly); err != syscall.EINVAL { + h.t.Errorf("got open err %v, want EINVAL", err) + } + } + return parentBackend, walkedBackend, walked +} + +// createHelper creates the given file and returns the parent directory, +// created file and client file, which must be closed when done. +func createHelper(h *Harness, name string, dir p9.File) (*Mock, *Mock, p9.File) { + // Clone the directory first, since Create replaces the existing file. 
+ // We change the type after calling create. + _, dirThenFile, err := dir.Walk(nil) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + + // Create a new server-side file. On the server-side, the a new file is + // returned from a create call. The client will reuse the same file, + // but we still expect the normal chain of closes. This complicates + // things a bit because the "parent" will always chain to the cloned + // dir above. + dirBackend := h.Pop(dirThenFile) // New backend directory. + newFile := h.NewFile()(dirBackend) // New file with backend parent. + dirBackend.EXPECT().Create(name, gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, newFile, newFile.QID, uint32(0), nil) + + // Create via the client. + _, dirThenFile, _, _, err = dirThenFile.Create(name, p9.ReadOnly, 0, 0, 0) if err != nil { - t.Fatalf("could not create temporary file: %v", err) + h.t.Fatalf("got create err %v, want nil", err) + } + + // Ensure subsequent walks succeed. + dirBackend.AddChild(name, h.NewFile()) + return dirBackend, newFile, dirThenFile +} + +// deprecatedRemover allows us to access the deprecated Remove operation within +// the p9.File client object. +type deprecatedRemover interface { + Remove() error +} + +// checkDeleted asserts that relevant methods fail for an unlinked file. +// +// This function will close the file at the end. +func checkDeleted(h *Harness, file p9.File) { + defer file.Close() // See doc. + + if _, _, _, err := file.Open(p9.ReadOnly); err != syscall.EINVAL { + h.t.Errorf("open while deleted, got %v, want EINVAL", err) + } + if _, _, _, _, err := file.Create("created", p9.ReadOnly, 0, 0, 0); err != syscall.EINVAL { + h.t.Errorf("create while deleted, got %v, want EINVAL", err) + } + if _, err := file.Symlink("old", "new", 0, 0); err != syscall.EINVAL { + h.t.Errorf("symlink while deleted, got %v, want EINVAL", err) + } + // N.B. 
This link is technically invalid, but if a call to link is + // actually made in the backend then the mock will panic. + if err := file.Link(file, "new"); err != syscall.EINVAL { + h.t.Errorf("link while deleted, got %v, want EINVAL", err) + } + if err := file.RenameAt("src", file, "dst"); err != syscall.EINVAL { + h.t.Errorf("renameAt while deleted, got %v, want EINVAL", err) + } + if err := file.UnlinkAt("file", 0); err != syscall.EINVAL { + h.t.Errorf("unlinkAt while deleted, got %v, want EINVAL", err) + } + if err := file.Rename(file, "dst"); err != syscall.EINVAL { + h.t.Errorf("rename while deleted, got %v, want EINVAL", err) + } + if _, err := file.Readlink(); err != syscall.EINVAL { + h.t.Errorf("readlink while deleted, got %v, want EINVAL", err) + } + if _, err := file.Mkdir("dir", p9.ModeDirectory, 0, 0); err != syscall.EINVAL { + h.t.Errorf("mkdir while deleted, got %v, want EINVAL", err) + } + if _, err := file.Mknod("dir", p9.ModeDirectory, 0, 0, 0, 0); err != syscall.EINVAL { + h.t.Errorf("mknod while deleted, got %v, want EINVAL", err) + } + if _, err := file.Readdir(0, 1); err != syscall.EINVAL { + h.t.Errorf("readdir while deleted, got %v, want EINVAL", err) + } + if _, err := file.Connect(p9.ConnectFlags(0)); err != syscall.EINVAL { + h.t.Errorf("connect while deleted, got %v, want EINVAL", err) + } + + // The remove method is technically deprecated, but we want to ensure + // that it still checks for deleted appropriately. We must first clone + // the file because remove is equivalent to close. + _, newFile, err := file.Walk(nil) + if err == syscall.EBUSY { + // We can't walk from here because this reference is open + // aleady. Okay, we will also have unopened cases through + // TestUnlink, just skip the remove operation for now. 
+ return + } else if err != nil { + h.t.Fatalf("clone failed, got %v, want nil", err) + } + if err := newFile.(deprecatedRemover).Remove(); err != syscall.EINVAL { + h.t.Errorf("remove while deleted, got %v, want EINVAL", err) + } +} + +// deleter is a function to remove a file. +type deleter func(parent p9.File, name string) error + +// unlinkAt is a deleter. +func unlinkAt(parent p9.File, name string) error { + // Call unlink. Note that a filesystem may normally impose additional + // constaints on unlinkat success, such as ensuring that a directory is + // empty, requiring AT_REMOVEDIR in flags to remove a directory, etc. + // None of that is required internally (entire trees can be marked + // deleted when this operation succeeds), so the mock will succeed. + return parent.UnlinkAt(name, 0) +} + +// remove is a deleter. +func remove(parent p9.File, name string) error { + // See notes above re: remove. + _, newFile, err := parent.Walk([]string{name}) + if err != nil { + // Should not be expected. + return err + } + + // Do the actual remove. + if err := newFile.(deprecatedRemover).Remove(); err != nil { + return err + } + + // Ensure that the remove closed the file. + if err := newFile.(deprecatedRemover).Remove(); err != syscall.EBADF { + return syscall.EBADF // Propagate this code. + } + + return nil +} + +// unlinkHelper unlinks the noted path, and ensures that all relevant +// operations on that path, acquired from multiple paths, start failing. +func unlinkHelper(h *Harness, root p9.File, targetNames []string, targetGen fileGenerator, deleteFn deleter) { + // name is the file to be unlinked. + name := targetNames[len(targetNames)-1] + + // Walk to the directory containing the target. + _, parent, err := root.Walk(targetNames[:len(targetNames)-1]) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer parent.Close() + parentBackend := h.Pop(parent) + + // Walk to or generate the target file. 
+ _, _, target := targetGen(h, name, parent) + defer checkDeleted(h, target) + + // Walk to a second reference. + _, second, err := parent.Walk([]string{name}) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer checkDeleted(h, second) + + // Walk to a third reference, from the start. + _, third, err := root.Walk(targetNames) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer checkDeleted(h, third) + + // This will be translated in the backend to an unlinkat. + parentBackend.EXPECT().UnlinkAt(name, uint32(0)).Return(nil) + + // Actually perform the deletion. + if err := deleteFn(parent, name); err != nil { + h.t.Fatalf("got delete err %v, want nil", err) + } +} + +func unlinkTest(t *testing.T, targetNames []string, targetGen fileGenerator) { + t.Run(fmt.Sprintf("unlinkAt(%s)", strings.Join(targetNames, "/")), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + unlinkHelper(h, root, targetNames, targetGen, unlinkAt) + }) + t.Run(fmt.Sprintf("remove(%s)", strings.Join(targetNames, "/")), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + unlinkHelper(h, root, targetNames, targetGen, remove) + }) +} + +func TestUnlink(t *testing.T) { + // Unlink all files. + for name := range newTypeMap(nil) { + unlinkTest(t, []string{name}, walkHelper) + unlinkTest(t, []string{name}, walkAndOpenHelper) + unlinkTest(t, []string{"one", name}, walkHelper) + unlinkTest(t, []string{"one", name}, walkAndOpenHelper) + unlinkTest(t, []string{"one", "two", name}, walkHelper) + unlinkTest(t, []string{"one", "two", name}, walkAndOpenHelper) + } + + // Unlink a directory. + unlinkTest(t, []string{"one"}, walkHelper) + unlinkTest(t, []string{"one"}, walkAndOpenHelper) + unlinkTest(t, []string{"one", "two"}, walkHelper) + unlinkTest(t, []string{"one", "two"}, walkAndOpenHelper) + + // Unlink created files. 
+ unlinkTest(t, []string{"created"}, createHelper) + unlinkTest(t, []string{"one", "created"}, createHelper) + unlinkTest(t, []string{"one", "two", "created"}, createHelper) +} + +func TestUnlinkAtInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if err := root.UnlinkAt(invalidName, 0); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +// expectRenamed asserts an ordered sequence of rename calls, based on all the +// elements in elements being the source, and the first element therein +// changing to dstName, parented at dstParent. +func expectRenamed(file *Mock, elements []string, dstParent *Mock, dstName string) *gomock.Call { + if len(elements) > 0 { + // Recurse to the parent, if necessary. + call := expectRenamed(file.parent, elements[:len(elements)-1], dstParent, dstName) + + // Recursive case: this element is unchanged, but should have + // it's hook called after the parent. + return file.EXPECT().Renamed(file.parent, elements[len(elements)-1]).Do(func(p p9.File, _ string) { + file.parent = p.(*Mock) + }).After(call) + } + + // Base case: this is the changed element. + return file.EXPECT().Renamed(dstParent, dstName).Do(func(p p9.File, name string) { + file.parent = p.(*Mock) + }) +} + +// renamer is a rename function. +type renamer func(h *Harness, srcParent, dstParent p9.File, origName, newName string, selfRename bool) error + +// renameAt is a renamer. +func renameAt(_ *Harness, srcParent, dstParent p9.File, srcName, dstName string, selfRename bool) error { + return srcParent.RenameAt(srcName, dstParent, dstName) +} + +// rename is a renamer. 
+func rename(h *Harness, srcParent, dstParent p9.File, srcName, dstName string, selfRename bool) error { + _, f, err := srcParent.Walk([]string{srcName}) + if err != nil { + return err + } + defer f.Close() + if !selfRename { + backend := h.Pop(f) + backend.EXPECT().Renamed(gomock.Any(), dstName).Do(func(p p9.File, name string) { + backend.parent = p.(*Mock) // Required for close ordering. + }) + } + return f.Rename(dstParent, dstName) +} + +// renameHelper executes a rename, and asserts that all relevant elements +// receive expected notifications. If overwriting a file, this includes +// ensuring that the target has been appropriately marked as unlinked. +func renameHelper(h *Harness, root p9.File, srcNames []string, dstNames []string, target fileGenerator, renameFn renamer) { + // Walk to the directory containing the target. + srcQID, targetParent, err := root.Walk(srcNames[:len(srcNames)-1]) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer targetParent.Close() + targetParentBackend := h.Pop(targetParent) + + // Walk to or generate the target file. + _, targetBackend, src := target(h, srcNames[len(srcNames)-1], targetParent) + defer src.Close() + + // Walk to a second reference. + _, second, err := targetParent.Walk([]string{srcNames[len(srcNames)-1]}) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer second.Close() + secondBackend := h.Pop(second) + + // Walk to a third reference, from the start. + _, third, err := root.Walk(srcNames) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer third.Close() + thirdBackend := h.Pop(third) + + // Find the common suffix to identify the rename parent. 
+ var ( + renameDestPath []string + renameSrcPath []string + selfRename bool + ) + for i := 1; i <= len(srcNames) && i <= len(dstNames); i++ { + if srcNames[len(srcNames)-i] != dstNames[len(dstNames)-i] { + // Take the full prefix of dstNames up until this + // point, including the first mismatched name. The + // first mismatch must be the renamed entry. + renameDestPath = dstNames[:len(dstNames)-i+1] + renameSrcPath = srcNames[:len(srcNames)-i+1] + + // Does the renameDestPath fully contain the + // renameSrcPath here? If yes, then this is a mismatch. + // We can't rename the src to some subpath of itself. + if len(renameDestPath) > len(renameSrcPath) && + reflect.DeepEqual(renameDestPath[:len(renameSrcPath)], renameSrcPath) { + renameDestPath = nil + renameSrcPath = nil + continue + } + break + } + } + if len(renameSrcPath) == 0 || len(renameDestPath) == 0 { + // This must be a rename to self, or a tricky look-alike. This + // happens iff we fail to find a suitable divergence in the two + // paths. It's a true self move if the path length is the same. + renameDestPath = dstNames + renameSrcPath = srcNames + selfRename = len(srcNames) == len(dstNames) + } + + // Walk to the source parent. + _, srcParent, err := root.Walk(renameSrcPath[:len(renameSrcPath)-1]) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer srcParent.Close() + srcParentBackend := h.Pop(srcParent) + + // Walk to the destination parent. + _, dstParent, err := root.Walk(renameDestPath[:len(renameDestPath)-1]) + if err != nil { + h.t.Fatalf("got walk err %v, want nil", err) + } + defer dstParent.Close() + dstParentBackend := h.Pop(dstParent) + + // expectedErr is the result of the rename operation. + var expectedErr error + + // Walk to the target file, if one exists. 
+ dstQID, dst, err := root.Walk(renameDestPath) + if err == nil { + if !selfRename && srcQID[0].Type == dstQID[0].Type { + // If there is a destination file, and is it of the + // same type as the source file, then we expect the + // rename to succeed. We expect the destination file to + // be deleted, so we run a deletion test on it in this + // case. + defer checkDeleted(h, dst) + } else { + if !selfRename { + // If the type is different than the + // destination, then we expect the rename to + // fail. We expect ensure that this is + // returned. + expectedErr = syscall.EINVAL + } else { + // This is the file being renamed to itself. + // This is technically allowed and a no-op, but + // all the triggers will fire. + } + dst.Close() + } + } + dstName := renameDestPath[len(renameDestPath)-1] // Renamed element. + srcName := renameSrcPath[len(renameSrcPath)-1] // Renamed element. + if expectedErr == nil && !selfRename { + // Expect all to be renamed appropriately. Note that if this is + // a final file being renamed, then we expect the file to be + // called with the new parent. If not, then we expect the + // rename hook to be called, but the parent will remain + // unchanged. + elements := srcNames[len(renameSrcPath):] + expectRenamed(targetBackend, elements, dstParentBackend, dstName) + expectRenamed(secondBackend, elements, dstParentBackend, dstName) + expectRenamed(thirdBackend, elements, dstParentBackend, dstName) + + // The target parent has also been opened, and may be moved + // directly or indirectly. + if len(elements) > 1 { + expectRenamed(targetParentBackend, elements[:len(elements)-1], dstParentBackend, dstName) + } + } + + // Expect the rename if it's not the same file. Note that like unlink, + // renames are always translated to the at variant in the backend. + if !selfRename { + srcParentBackend.EXPECT().RenameAt(srcName, dstParentBackend, dstName).Return(expectedErr) + } + + // Perform the actual rename; everything has been lined up. 
+ if err := renameFn(h, srcParent, dstParent, srcName, dstName, selfRename); err != expectedErr { + h.t.Fatalf("got rename err %v, want %v", err, expectedErr) + } +} + +func renameTest(t *testing.T, srcNames []string, dstNames []string, target fileGenerator) { + t.Run(fmt.Sprintf("renameAt(%s->%s)", strings.Join(srcNames, "/"), strings.Join(dstNames, "/")), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + renameHelper(h, root, srcNames, dstNames, target, renameAt) + }) + t.Run(fmt.Sprintf("rename(%s->%s)", strings.Join(srcNames, "/"), strings.Join(dstNames, "/")), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + renameHelper(h, root, srcNames, dstNames, target, rename) + }) +} + +func TestRename(t *testing.T) { + // In-directory rename, simple case. + for name := range newTypeMap(nil) { + // Within the root. + renameTest(t, []string{name}, []string{"renamed"}, walkHelper) + renameTest(t, []string{name}, []string{"renamed"}, walkAndOpenHelper) + + // Within a subdirectory. + renameTest(t, []string{"one", name}, []string{"one", "renamed"}, walkHelper) + renameTest(t, []string{"one", name}, []string{"one", "renamed"}, walkAndOpenHelper) + } + + // ... with created files. + renameTest(t, []string{"created"}, []string{"renamed"}, createHelper) + renameTest(t, []string{"one", "created"}, []string{"one", "renamed"}, createHelper) + + // Across directories. + for name := range newTypeMap(nil) { + // Down one level. + renameTest(t, []string{"one", name}, []string{"one", "two", "renamed"}, walkHelper) + renameTest(t, []string{"one", name}, []string{"one", "two", "renamed"}, walkAndOpenHelper) + + // Up one level. + renameTest(t, []string{"one", "two", name}, []string{"one", "renamed"}, walkHelper) + renameTest(t, []string{"one", "two", name}, []string{"one", "renamed"}, walkAndOpenHelper) + + // Across at the same level. 
+ renameTest(t, []string{"one", name}, []string{"three", "renamed"}, walkHelper) + renameTest(t, []string{"one", name}, []string{"three", "renamed"}, walkAndOpenHelper) + } + + // ... with created files. + renameTest(t, []string{"one", "created"}, []string{"one", "two", "renamed"}, createHelper) + renameTest(t, []string{"one", "two", "created"}, []string{"one", "renamed"}, createHelper) + renameTest(t, []string{"one", "created"}, []string{"three", "renamed"}, createHelper) + + // Renaming parents. + for name := range newTypeMap(nil) { + // Rename a parent. + renameTest(t, []string{"one", name}, []string{"renamed", name}, walkHelper) + renameTest(t, []string{"one", name}, []string{"renamed", name}, walkAndOpenHelper) + + // Rename a super parent. + renameTest(t, []string{"one", "two", name}, []string{"renamed", name}, walkHelper) + renameTest(t, []string{"one", "two", name}, []string{"renamed", name}, walkAndOpenHelper) + } + + // ... with created files. + renameTest(t, []string{"one", "created"}, []string{"renamed", "created"}, createHelper) + renameTest(t, []string{"one", "two", "created"}, []string{"renamed", "created"}, createHelper) + + // Over existing files, including itself. + for name := range newTypeMap(nil) { + for other := range newTypeMap(nil) { + // Overwrite the noted file (may be itself). + renameTest(t, []string{"one", name}, []string{"one", other}, walkHelper) + renameTest(t, []string{"one", name}, []string{"one", other}, walkAndOpenHelper) + + // Overwrite other files in another directory. + renameTest(t, []string{"one", name}, []string{"one", "two", other}, walkHelper) + renameTest(t, []string{"one", name}, []string{"one", "two", other}, walkAndOpenHelper) + } + + // Overwrite by moving the parent. + renameTest(t, []string{"three", name}, []string{"one", name}, walkHelper) + renameTest(t, []string{"three", name}, []string{"one", name}, walkAndOpenHelper) + + // Create over the types. 
+ renameTest(t, []string{"one", "created"}, []string{"one", name}, createHelper) + renameTest(t, []string{"one", "created"}, []string{"one", "two", name}, createHelper) + renameTest(t, []string{"three", "created"}, []string{"one", name}, createHelper) + } +} + +func TestRenameInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if err := root.Rename(root, invalidName); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestRenameAtInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if err := root.RenameAt(invalidName, root, "okay"); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + if err := root.RenameAt("okay", root, invalidName); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestReadlink(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, f, err := root.Walk([]string{name}) + if err != nil { + t.Fatalf("walk failed: got %v, wanted nil", err) + } + defer f.Close() + backend := h.Pop(f) + + const symlinkTarget = "symlink-target" + + if backend.Attr.Mode.IsSymlink() { + // This should only go through on symlinks. + backend.EXPECT().Readlink().Return(symlinkTarget, nil) + } + + // Attempt a Readlink operation. 
+ target, err := f.Readlink() + if err != nil && err != syscall.EINVAL { + t.Errorf("readlink got %v, wanted EINVAL", err) + } else if err == nil && target != symlinkTarget { + t.Errorf("readlink got %v, wanted %v", target, symlinkTarget) + } + }) + } +} + +// fdTest is a wrapper around operations that may send file descriptors. This +// asserts that the file descriptors are working as intended. +func fdTest(t *testing.T, sendFn func(*fd.FD) *fd.FD) { + // Create a pipe that we can read from. + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("unable to create pipe: %v", err) + } + defer r.Close() + defer w.Close() + + // Attempt to send the write end. + wFD, err := fd.NewFromFile(w) + if err != nil { + t.Fatalf("unable to convert file: %v", err) + } + defer wFD.Close() // This is a copy. + + // Send wFD and receive newFD. + newFD := sendFn(wFD) + defer newFD.Close() + + // Attempt to write. + const message = "hello" + if _, err := newFD.Write([]byte(message)); err != nil { + t.Fatalf("write got %v, wanted nil", err) + } + + // Should see the message on our end. + buffer := []byte(message) + if _, err := io.ReadFull(r, buffer); err != nil { + t.Fatalf("read got %v, wanted nil", err) + } + if string(buffer) != message { + t.Errorf("got message %v, wanted %v", string(buffer), message) + } +} + +func TestConnect(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Catch all the non-socket cases. + if !backend.Attr.Mode.IsSocket() { + // This has been set up to fail if Connect is called. + if _, err := f.Connect(p9.ConnectFlags(0)); err != syscall.EINVAL { + t.Errorf("connect got %v, wanted EINVAL", err) + } + return + } + + // Ensure the fd exchange works. 
+ fdTest(t, func(send *fd.FD) *fd.FD { + backend.EXPECT().Connect(p9.ConnectFlags(0)).Return(send, nil) + recv, err := backend.Connect(p9.ConnectFlags(0)) + if err != nil { + t.Fatalf("connect got %v, wanted nil", err) + } + return recv + }) + }) + } +} + +func TestReaddir(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Catch all the non-directory cases. + if !backend.Attr.Mode.IsDir() { + // This has also been set up to fail if Readdir is called. + if _, err := f.Readdir(0, 1); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + return + } + + // Ensure that readdir works for directories. + if _, err := f.Readdir(0, 1); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + if _, _, _, err := f.Open(p9.ReadWrite); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + if _, _, _, err := f.Open(p9.WriteOnly); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + backend.EXPECT().Open(p9.ReadOnly).Times(1) + if _, _, _, err := f.Open(p9.ReadOnly); err != nil { + t.Errorf("readdir got %v, wanted nil", err) + } + backend.EXPECT().Readdir(uint64(0), uint32(1)).Times(1) + if _, err := f.Readdir(0, 1); err != nil { + t.Errorf("readdir got %v, wanted nil", err) + } + }) + } +} + +func TestOpen(t *testing.T) { + type openTest struct { + name string + mode p9.OpenFlags + err error + match func(p9.FileMode) bool + } + + cases := []openTest{ + { + name: "invalid", + mode: ^p9.OpenFlagsModeMask, + err: syscall.EINVAL, + match: func(p9.FileMode) bool { return true }, + }, + { + name: "not-openable-read-only", + mode: p9.ReadOnly, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) }, + }, + { + name: 
"not-openable-write-only", + mode: p9.WriteOnly, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) }, + }, + { + name: "not-openable-read-write", + mode: p9.ReadWrite, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return !p9.CanOpen(mode) }, + }, + { + name: "directory-read-only", + mode: p9.ReadOnly, + err: nil, + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + }, + { + name: "directory-read-write", + mode: p9.ReadWrite, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + }, + { + name: "directory-write-only", + mode: p9.WriteOnly, + err: syscall.EINVAL, + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + }, + { + name: "read-only", + mode: p9.ReadOnly, + err: nil, + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) }, + }, + { + name: "write-only", + mode: p9.WriteOnly, + err: nil, + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() }, + }, + { + name: "read-write", + mode: p9.ReadWrite, + err: nil, + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) && !mode.IsDir() }, + }, + } + + // Open(mode OpenFlags) (*fd.FD, QID, uint32, error) + // - only works on Regular, NamedPipe, BLockDevice, CharacterDevice + // - returning a file works as expected + for name := range newTypeMap(nil) { + for _, tc := range cases { + t.Run(fmt.Sprintf("%s-%s", tc.name, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Does this match the case? + if !tc.match(backend.Attr.Mode) { + t.SkipNow() + } + + // Ensure open-required operations fail. 
+ if _, err := f.ReadAt([]byte("hello"), 0); err != syscall.EINVAL { + t.Errorf("readAt got %v, wanted EINVAL", err) + } + if _, err := f.WriteAt(make([]byte, 6), 0); err != syscall.EINVAL { + t.Errorf("writeAt got %v, wanted EINVAL", err) + } + if err := f.FSync(); err != syscall.EINVAL { + t.Errorf("fsync got %v, wanted EINVAL", err) + } + if _, err := f.Readdir(0, 1); err != syscall.EINVAL { + t.Errorf("readdir got %v, wanted EINVAL", err) + } + + // Attempt the given open. + if tc.err != nil { + // We expect an error, just test and return. + if _, _, _, err := f.Open(tc.mode); err != tc.err { + t.Fatalf("open with mode %v got %v, want %v", tc.mode, err, tc.err) + } + return + } + + // Run an FD test, since we expect success. + fdTest(t, func(send *fd.FD) *fd.FD { + backend.EXPECT().Open(tc.mode).Return(send, p9.QID{}, uint32(0), nil).Times(1) + recv, _, _, err := f.Open(tc.mode) + if err != tc.err { + t.Fatalf("open with mode %v got %v, want %v", tc.mode, err, tc.err) + } + return recv + }) + + // If the open was successful, attempt another one. + if _, _, _, err := f.Open(tc.mode); err != syscall.EINVAL { + t.Errorf("second open with mode %v got %v, want EINVAL", tc.mode, err) + } + + // Ensure that all illegal operations fail. + if _, _, err := f.Walk(nil); err != syscall.EINVAL && err != syscall.EBUSY { + t.Errorf("walk got %v, wanted EINVAL or EBUSY", err) + } + if _, _, _, _, err := f.WalkGetAttr(nil); err != syscall.EINVAL && err != syscall.EBUSY { + t.Errorf("walkgetattr got %v, wanted EINVAL or EBUSY", err) + } + }) + } + } +} + +func TestClose(t *testing.T) { + type closeTest struct { + name string + closeFn func(backend *Mock, f p9.File) + } + + cases := []closeTest{ + { + name: "close", + closeFn: func(_ *Mock, f p9.File) { + f.Close() + }, + }, + { + name: "remove", + closeFn: func(backend *Mock, f p9.File) { + // Allow the rename call in the parent, automatically translated. 
+ backend.parent.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Times(1) + f.(deprecatedRemover).Remove() + }, + }, } - os.Remove(osFile.Name()) - hfi, err := osFile.Stat() - if err != nil { - osFile.Close() - t.Fatalf("stat failed: %v", err) + for name := range newTypeMap(nil) { + for _, tc := range cases { + t.Run(fmt.Sprintf("%s(%s)", tc.name, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + + // Close via the prescribed method. + tc.closeFn(backend, f) + + // Everything should fail with EBADF. + if _, _, err := f.Walk(nil); err != syscall.EBADF { + t.Errorf("walk got %v, wanted EBADF", err) + } + if _, err := f.StatFS(); err != syscall.EBADF { + t.Errorf("statfs got %v, wanted EBADF", err) + } + if _, _, _, err := f.GetAttr(p9.AttrMaskAll()); err != syscall.EBADF { + t.Errorf("getattr got %v, wanted EBADF", err) + } + if err := f.SetAttr(p9.SetAttrMask{}, p9.SetAttr{}); err != syscall.EBADF { + t.Errorf("setattrk got %v, wanted EBADF", err) + } + if err := f.Rename(root, "new-name"); err != syscall.EBADF { + t.Errorf("rename got %v, wanted EBADF", err) + } + if err := f.Close(); err != syscall.EBADF { + t.Errorf("close got %v, wanted EBADF", err) + } + if _, _, _, err := f.Open(p9.ReadOnly); err != syscall.EBADF { + t.Errorf("open got %v, wanted EBADF", err) + } + if _, err := f.ReadAt([]byte("hello"), 0); err != syscall.EBADF { + t.Errorf("readAt got %v, wanted EBADF", err) + } + if _, err := f.WriteAt(make([]byte, 6), 0); err != syscall.EBADF { + t.Errorf("writeAt got %v, wanted EBADF", err) + } + if err := f.FSync(); err != syscall.EBADF { + t.Errorf("fsync got %v, wanted EBADF", err) + } + if _, _, _, _, err := f.Create("new-file", p9.ReadWrite, 0, 0, 0); err != syscall.EBADF { + t.Errorf("create got %v, wanted EBADF", err) + } + if _, err := f.Mkdir("new-directory", 0, 0, 0); err != syscall.EBADF { + 
t.Errorf("mkdir got %v, wanted EBADF", err) + } + if _, err := f.Symlink("old-name", "new-name", 0, 0); err != syscall.EBADF { + t.Errorf("symlink got %v, wanted EBADF", err) + } + if err := f.Link(root, "new-name"); err != syscall.EBADF { + t.Errorf("link got %v, wanted EBADF", err) + } + if _, err := f.Mknod("new-block-device", 0, 0, 0, 0, 0); err != syscall.EBADF { + t.Errorf("mknod got %v, wanted EBADF", err) + } + if err := f.RenameAt("old-name", root, "new-name"); err != syscall.EBADF { + t.Errorf("renameAt got %v, wanted EBADF", err) + } + if err := f.UnlinkAt("name", 0); err != syscall.EBADF { + t.Errorf("unlinkAt got %v, wanted EBADF", err) + } + if _, err := f.Readdir(0, 1); err != syscall.EBADF { + t.Errorf("readdir got %v, wanted EBADF", err) + } + if _, err := f.Readlink(); err != syscall.EBADF { + t.Errorf("readlink got %v, wanted EBADF", err) + } + if err := f.Flush(); err != syscall.EBADF { + t.Errorf("flush got %v, wanted EBADF", err) + } + if _, _, _, _, err := f.WalkGetAttr(nil); err != syscall.EBADF { + t.Errorf("walkgetattr got %v, wanted EBADF", err) + } + if _, err := f.Connect(p9.ConnectFlags(0)); err != syscall.EBADF { + t.Errorf("connect got %v, wanted EBADF", err) + } + }) + } + } +} + +// onlyWorksOnOpenThings is a helper test method for operations that should +// only work on files that have been explicitly opened. +func onlyWorksOnOpenThings(h *Harness, t *testing.T, name string, root p9.File, mode p9.OpenFlags, expectedErr error, fn func(backend *Mock, f p9.File, shouldSucceed bool) error) { + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Does it work before opening? + if err := fn(backend, f, false); err != syscall.EINVAL { + t.Errorf("operation got %v, wanted EINVAL", err) } - osFileStat := hfi.Sys().(*syscall.Stat_t) - f, err := fd.NewFromFile(osFile) - // osFile should always be closed. 
- osFile.Close() - if err != nil { - t.Fatalf("unable to create file: %v", err) + // Is this openable? + if !p9.CanOpen(backend.Attr.Mode) { + return // Nothing to do. + } + + // If this is a directory, we can't handle writing. + if backend.Attr.Mode.IsDir() && (mode == p9.ReadWrite || mode == p9.WriteOnly) { + return // Skip. + } + + // Open the file. + backend.EXPECT().Open(mode) + if _, _, _, err := f.Open(mode); err != nil { + t.Fatalf("open got %v, wanted nil", err) + } + + // Attempt the operation. + if err := fn(backend, f, expectedErr == nil); err != expectedErr { + t.Fatalf("operation got %v, wanted %v", err, expectedErr) + } +} + +func TestRead(t *testing.T) { + type readTest struct { + name string + mode p9.OpenFlags + err error + } + + cases := []readTest{ + { + name: "read-only", + mode: p9.ReadOnly, + err: nil, + }, + { + name: "read-write", + mode: p9.ReadWrite, + err: nil, + }, + { + name: "write-only", + mode: p9.WriteOnly, + err: syscall.EPERM, + }, + } + + for name := range newTypeMap(nil) { + for _, tc := range cases { + t.Run(fmt.Sprintf("%s-%s", tc.name, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + const message = "hello" + + onlyWorksOnOpenThings(h, t, name, root, tc.mode, tc.err, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if !shouldSucceed { + _, err := f.ReadAt([]byte(message), 0) + return err + } + + // Prepare for the call to readAt in the backend. + backend.EXPECT().ReadAt(gomock.Any(), uint64(0)).Do(func(p []byte, offset uint64) { + copy(p, message) + }).Return(len(message), nil) + + // Make the client call. + p := make([]byte, 2*len(message)) // Double size. + n, err := f.ReadAt(p, 0) + + // Sanity check result. 
+ if err != nil { + return err + } + if n != len(message) { + t.Fatalf("message length incorrect, got %d, want %d", n, len(message)) + } + if !bytes.Equal(p[:n], []byte(message)) { + t.Fatalf("message incorrect, got %v, want %v", p, []byte(message)) + } + return nil // Success. + }) + }) + } + } +} + +func TestWrite(t *testing.T) { + type writeTest struct { + name string + mode p9.OpenFlags + err error } - // Craft attacher to attach to the mocked file which will return our - // temporary file. - fileMock := &FileMock{ - OpenMock: OpenMock{File: f}, - GetAttrMock: GetAttrMock{ - // The mode must be valid always. - Valid: p9.AttrMask{Mode: true}, + cases := []writeTest{ + { + name: "read-only", + mode: p9.ReadOnly, + err: syscall.EPERM, + }, + { + name: "read-write", + mode: p9.ReadWrite, + err: nil, + }, + { + name: "write-only", + mode: p9.WriteOnly, + err: nil, }, } - attacher := &AttachMock{ - File: fileMock, + + for name := range newTypeMap(nil) { + for _, tc := range cases { + t.Run(fmt.Sprintf("%s-%s", tc.name, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + const message = "hello" + + onlyWorksOnOpenThings(h, t, name, root, tc.mode, tc.err, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if !shouldSucceed { + _, err := f.WriteAt([]byte(message), 0) + return err + } + + // Prepare for the call to readAt in the backend. + var output []byte // Saved by Do below. + backend.EXPECT().WriteAt(gomock.Any(), uint64(0)).Do(func(p []byte, offset uint64) { + output = p + }).Return(len(message), nil) + + // Make the client call. + n, err := f.WriteAt([]byte(message), 0) + + // Sanity check result. + if err != nil { + return err + } + if n != len(message) { + t.Fatalf("message length incorrect, got %d, want %d", n, len(message)) + } + if !bytes.Equal(output, []byte(message)) { + t.Fatalf("message incorrect, got %v, want %v", output, []byte(message)) + } + return nil // Success. 
+ }) + }) + } } +} - // Make socket pair. - serverSocket, clientSocket, err := unet.SocketPair(false) - if err != nil { - t.Fatalf("socketpair got err %v wanted nil", err) +func TestFSync(t *testing.T) { + for name := range newTypeMap(nil) { + for _, mode := range []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} { + t.Run(fmt.Sprintf("%s-%s", mode, name), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnOpenThings(h, t, name, root, mode, nil, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().FSync().Times(1) + } + return f.FSync() + }) + }) + } } - defer clientSocket.Close() - server := p9.NewServer(attacher) - go server.Handle(serverSocket) - client, err := p9.NewClient(clientSocket, 1024*1024 /* 1M message size */, p9.HighestVersionString()) - if err != nil { - t.Fatalf("new client got %v, expected nil", err) +} + +func TestFlush(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + backend.EXPECT().Flush() + f.Flush() + }) } +} - // Attach to the mocked file. - cFile, err := client.Attach("") - if err != nil { - t.Fatalf("attach failed: %v", err) +// onlyWorksOnDirectories is a helper test method for operations that should +// only work on unopened directories, such as create, mkdir and symlink. +func onlyWorksOnDirectories(h *Harness, t *testing.T, name string, root p9.File, fn func(backend *Mock, f p9.File, shouldSucceed bool) error) { + // Walk to the file normally. + _, backend, f := walkHelper(h, name, root) + defer f.Close() + + // Only directories support mknod. 
+ if !backend.Attr.Mode.IsDir() { + if err := fn(backend, f, false); err != syscall.EINVAL { + t.Errorf("operation got %v, wanted EINVAL", err) + } + return // Nothing else to do. } - // Try to open the mocked file. - clientHostFile, _, _, err := cFile.Open(0) - if err != nil { - t.Fatalf("open failed: %v", err) + // Should succeed. + if err := fn(backend, f, true); err != nil { + t.Fatalf("operation got %v, wanted nil", err) } - var clientStat syscall.Stat_t - if err := syscall.Fstat(clientHostFile.FD(), &clientStat); err != nil { - t.Fatalf("stat failed: %v", err) + + // Open the directory. + backend.EXPECT().Open(p9.ReadOnly).Times(1) + if _, _, _, err := f.Open(p9.ReadOnly); err != nil { + t.Fatalf("open got %v, wanted nil", err) } - // Compare inode nums to make sure it's the same file. - if clientStat.Ino != osFileStat.Ino { - t.Errorf("fd donation failed") + // Should not work again. + if err := fn(backend, f, false); err != syscall.EINVAL { + t.Fatalf("operation got %v, wanted EINVAL", err) } } -// TestClient is a megatest. -// -// This allows us to probe various edge cases, while changing the state of the -// underlying server in expected ways. The test slowly builds server state and -// is documented inline. -// -// We wind up with the following, after probing edge cases: -// -// FID 1: ServerFile (sf). -// FID 2: Directory (d). -// FID 3: File (f). -// FID 4: Symlink (s). -// -// Although you should use the FID method on the individual files. 
-func TestClient(t *testing.T) { +func TestCreate(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if !shouldSucceed { + _, _, _, _, err := f.Create("new-file", p9.ReadWrite, 0, 1, 2) + return err + } + + // If the create is going to succeed, then we + // need to create a new backend file, and we + // clone to ensure that we don't close the + // original. + _, newF, err := f.Walk(nil) + if err != nil { + t.Fatalf("clone got %v, wanted nil", err) + } + defer newF.Close() + newBackend := h.Pop(newF) + + // Run a regular FD test to validate that path. + fdTest(t, func(send *fd.FD) *fd.FD { + // Return the send FD on success. + newFile := h.NewFile()(backend) // New file with the parent backend. + newBackend.EXPECT().Create("new-file", p9.ReadWrite, p9.FileMode(0), p9.UID(1), p9.GID(2)).Return(send, newFile, p9.QID{}, uint32(0), nil) + + // Receive the fd back. + recv, _, _, _, err := newF.Create("new-file", p9.ReadWrite, 0, 1, 2) + if err != nil { + t.Fatalf("create got %v, wanted nil", err) + } + return recv + }) + + // The above will fail via normal test flow, so + // we can assume that it passed. 
+ return nil + }) + }) + } +} + +func TestCreateInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if _, _, _, _, err := root.Create(invalidName, p9.ReadWrite, 0, 0, 0); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestMkdir(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().Mkdir("new-directory", p9.FileMode(0), p9.UID(1), p9.GID(2)) + } + _, err := f.Mkdir("new-directory", 0, 1, 2) + return err + }) + }) + } +} + +func TestMkdirInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if _, err := root.Mkdir(invalidName, 0, 0, 0); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestSymlink(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().Symlink("old-name", "new-name", p9.UID(1), p9.GID(2)) + } + _, err := f.Symlink("old-name", "new-name", 1, 2) + return err + }) + }) + } +} + +func TestSyminkInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range 
allInvalidNames(name) { + // We need only test for invalid names in the new name, + // the target can be an arbitrary string and we don't + // need to sanity check it. + if _, err := root.Symlink("old-name", invalidName, 0, 0); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestLink(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().Link(gomock.Any(), "new-link") + } + return f.Link(f, "new-link") + }) + }) + } +} + +func TestLinkInvalid(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + for name := range newTypeMap(nil) { + for _, invalidName := range allInvalidNames(name) { + if err := root.Link(root, invalidName); err != syscall.EINVAL { + t.Errorf("got %v for name %q, want EINVAL", err, invalidName) + } + } + } +} + +func TestMknod(t *testing.T) { + for name := range newTypeMap(nil) { + t.Run(name, func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + onlyWorksOnDirectories(h, t, name, root, func(backend *Mock, f p9.File, shouldSucceed bool) error { + if shouldSucceed { + backend.EXPECT().Mknod("new-block-device", p9.FileMode(0), uint32(1), uint32(2), p9.UID(3), p9.GID(4)).Times(1) + } + _, err := f.Mknod("new-block-device", 0, 1, 2, 3, 4) + return err + }) + }) + } +} + +// concurrentFn is a specification of a concurrent operation. This is used to +// drive the concurrency tests below. 
+type concurrentFn struct { + name string + match func(p9.FileMode) bool + op func(h *Harness, backend *Mock, f p9.File, callback func()) +} + +func concurrentTest(t *testing.T, name string, fn1, fn2 concurrentFn, sameDir, expectedOkay bool) { var ( - // Sentinel error. - sentinelErr = syscall.Errno(4383) - - // Backend mocks. - a = &AttachMock{} - sf = &FileMock{} - d = &FileMock{} - f = &FileMock{} - s = &FileMock{} - - // Client Files for the above. - sfFile p9.File + names1 []string + names2 []string ) + if sameDir { + // Use the same file one directory up. + names1, names2 = []string{"one", name}, []string{"one", name} + } else { + // For different directories, just use siblings. + names1, names2 = []string{"one", name}, []string{"three", name} + } - testSteps := []struct { - name string - fn func(*p9.Client) error - want error - }{ - { - name: "bad-attach", - want: sentinelErr, - fn: func(c *p9.Client) error { - a.File = nil - a.Err = sentinelErr - _, err := c.Attach("") - return err + t.Run(fmt.Sprintf("%s(%v)+%s(%v)", fn1.name, names1, fn2.name, names2), func(t *testing.T) { + h, c := NewHarness(t) + defer h.Finish() + + _, root := newRoot(h, c) + defer root.Close() + + // Walk to both files as given. + _, f1, err := root.Walk(names1) + if err != nil { + t.Fatalf("error walking, got %v, want nil", err) + } + defer f1.Close() + b1 := h.Pop(f1) + _, f2, err := root.Walk(names2) + if err != nil { + t.Fatalf("error walking, got %v, want nil", err) + } + defer f2.Close() + b2 := h.Pop(f2) + + // Are these a good match for the current test case? + if !fn1.match(b1.Attr.Mode) { + t.SkipNow() + } + if !fn2.match(b2.Attr.Mode) { + t.SkipNow() + } + + // Construct our "concurrency creator". + in1 := make(chan struct{}, 1) + in2 := make(chan struct{}, 1) + var top sync.WaitGroup + var fns sync.WaitGroup + defer top.Wait() + top.Add(2) // Accounting for below. + defer fns.Done() + fns.Add(1) // See line above; released before top.Wait. 
+ go func() { + defer top.Done() + fn1.op(h, b1, f1, func() { + in1 <- struct{}{} + fns.Wait() + }) + }() + go func() { + defer top.Done() + fn2.op(h, b2, f2, func() { + in2 <- struct{}{} + fns.Wait() + }) + }() + + // Compute a reasonable timeout. If we expect the operation to hang, + // give it 10 milliseconds before we assert that it's fine. After all, + // there will be a lot of these tests. If we don't expect it to hang, + // give it a full minute, since the machine could be slow. + timeout := 10 * time.Millisecond + if expectedOkay { + timeout = 1 * time.Minute + } + + // Read the first channel. + var second chan struct{} + select { + case <-in1: + second = in2 + case <-in2: + second = in1 + } + + // Catch concurrency. + select { + case <-second: + // We finished successful. Is this good? Depends on the + // expected result. + if !expectedOkay { + t.Errorf("%q and %q proceeded concurrently!", fn1.name, fn2.name) + } + case <-time.After(timeout): + // Great, things did not proceed concurrently. Is that what we + // expected? + if expectedOkay { + t.Errorf("%q and %q hung concurrently!", fn1.name, fn2.name) + } + } + }) +} + +func randomFileName() string { + return fmt.Sprintf("%x", rand.Int63()) +} + +func TestConcurrency(t *testing.T) { + readExclusive := []concurrentFn{ + { + // N.B. We can't explicitly check WalkGetAttr behavior, + // but we rely on the fact that the internal code paths + // are the same. + name: "walk", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + // See the documentation of WalkCallback. + // Because walk is actually implemented by the + // mock, we need a special place for this + // callback. + // + // Note that a clone actually locks the parent + // node. So we walk from this node to test + // concurrent operations appropriately. + backend.WalkCallback = func() error { + callback() + return nil + } + f.Walk([]string{randomFileName()}) // Won't exist. 
}, }, { - name: "attach", - fn: func(c *p9.Client) error { - a.Called = false - a.File = sf - a.Err = nil - // The attached root must have a valid mode. - sf.GetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory} - sf.GetAttrMock.Valid = p9.AttrMask{Mode: true} - var err error - sfFile, err = c.Attach("") - if !a.Called { - t.Errorf("Attach never Called?") - } - return err + name: "fsync", + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Open(gomock.Any()) + backend.EXPECT().FSync().Do(func() { + callback() + }) + f.Open(p9.ReadOnly) // Required. + f.FSync() }, }, { - name: "bad-walk", - want: sentinelErr, - fn: func(c *p9.Client) error { - // Walk only called when WalkGetAttr not available. - sf.WalkGetAttrMock.Err = syscall.ENOSYS - sf.WalkMock.File = d - sf.WalkMock.Err = sentinelErr - _, _, err := sfFile.Walk([]string{"foo", "bar"}) - return err + name: "readdir", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Open(gomock.Any()) + backend.EXPECT().Readdir(gomock.Any(), gomock.Any()).Do(func(uint64, uint32) { + callback() + }) + f.Open(p9.ReadOnly) // Required. + f.Readdir(0, 1) }, }, { - name: "walk-to-dir", - fn: func(c *p9.Client) error { - // Walk only called when WalkGetAttr not available. - sf.WalkGetAttrMock.Err = syscall.ENOSYS - sf.WalkMock.Called = false - sf.WalkMock.Names = nil - sf.WalkMock.File = d - sf.WalkMock.Err = nil - sf.WalkMock.QIDs = []p9.QID{{Type: 1}} - // All intermediate values must be directories. - d.WalkGetAttrMock.Err = syscall.ENOSYS - d.WalkMock.Called = false - d.WalkMock.Names = nil - d.WalkMock.File = d // Walk to self. 
- d.WalkMock.Err = nil - d.WalkMock.QIDs = []p9.QID{{Type: 1}} - d.GetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory} - d.GetAttrMock.Valid = p9.AttrMask{Mode: true} - var qids []p9.QID - var err error - qids, _, err = sfFile.Walk([]string{"foo", "bar"}) - if !sf.WalkMock.Called { - t.Errorf("Walk never Called?") - } - if !d.GetAttrMock.Called { - t.Errorf("GetAttr never Called?") - } - if !reflect.DeepEqual(sf.WalkMock.Names, []string{"foo"}) { - t.Errorf("got names %v wanted []{foo}", sf.WalkMock.Names) - } - if !reflect.DeepEqual(d.WalkMock.Names, []string{"bar"}) { - t.Errorf("got names %v wanted []{bar}", d.WalkMock.Names) - } - if len(qids) != 2 || qids[len(qids)-1].Type != 1 { - t.Errorf("got qids %v wanted []{..., {Type: 1}}", qids) - } - return err + name: "readlink", + match: func(mode p9.FileMode) bool { return mode.IsSymlink() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Readlink().Do(func() { + callback() + }) + f.Readlink() }, }, { - name: "walkgetattr-to-dir", - fn: func(c *p9.Client) error { - sf.WalkGetAttrMock.Called = false - sf.WalkGetAttrMock.Names = nil - sf.WalkGetAttrMock.File = d - sf.WalkGetAttrMock.Err = nil - sf.WalkGetAttrMock.QIDs = []p9.QID{{Type: 1}} - sf.WalkGetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory, UID: 1} - sf.WalkGetAttrMock.Valid = p9.AttrMask{Mode: true} - // See above. - d.WalkGetAttrMock.Called = false - d.WalkGetAttrMock.Names = nil - d.WalkGetAttrMock.File = d // Walk to self. 
- d.WalkGetAttrMock.Err = nil - d.WalkGetAttrMock.QIDs = []p9.QID{{Type: 1}} - d.WalkGetAttrMock.Attr = p9.Attr{Mode: p9.ModeDirectory, UID: 1} - d.WalkGetAttrMock.Valid = p9.AttrMask{Mode: true} - var qids []p9.QID - var err error - var mask p9.AttrMask - var attr p9.Attr - qids, _, mask, attr, err = sfFile.WalkGetAttr([]string{"foo", "bar"}) - if !sf.WalkGetAttrMock.Called { - t.Errorf("Walk never Called?") - } - if !reflect.DeepEqual(sf.WalkGetAttrMock.Names, []string{"foo"}) { - t.Errorf("got names %v wanted []{foo}", sf.WalkGetAttrMock.Names) - } - if !reflect.DeepEqual(d.WalkGetAttrMock.Names, []string{"bar"}) { - t.Errorf("got names %v wanted []{bar}", d.WalkGetAttrMock.Names) - } - if len(qids) != 2 || qids[len(qids)-1].Type != 1 { - t.Errorf("got qids %v wanted []{..., {Type: 1}}", qids) - } - if !reflect.DeepEqual(attr, sf.WalkGetAttrMock.Attr) { - t.Errorf("got attrs %s wanted %s", attr, sf.WalkGetAttrMock.Attr) - } - if !reflect.DeepEqual(mask, sf.WalkGetAttrMock.Valid) { - t.Errorf("got mask %s wanted %s", mask, sf.WalkGetAttrMock.Valid) - } - return err + name: "connect", + match: func(mode p9.FileMode) bool { return mode.IsSocket() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Connect(gomock.Any()).Do(func(p9.ConnectFlags) { + callback() + }) + f.Connect(0) }, }, { - name: "walk-to-file", - fn: func(c *p9.Client) error { - // Basic sanity check is done in walk-to-dir. - // - // Here we just create basic file FIDs to use. - sf.WalkMock.File = f - sf.WalkMock.Err = nil - var err error - _, _, err = sfFile.Walk(nil) - return err + name: "open", + match: func(mode p9.FileMode) bool { return p9.CanOpen(mode) }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Open(gomock.Any()).Do(func(p9.OpenFlags) { + callback() + }) + f.Open(p9.ReadOnly) }, }, { - name: "walk-to-symlink", - fn: func(c *p9.Client) error { - // See note in walk-to-file. 
- sf.WalkMock.File = s - sf.WalkMock.Err = nil - var err error - _, _, err = sfFile.Walk(nil) - return err + name: "flush", + match: func(mode p9.FileMode) bool { return true }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Flush().Do(func() { + callback() + }) + f.Flush() + }, + }, + } + writeExclusive := []concurrentFn{ + { + // N.B. We can't really check getattr. But this is an + // extremely low-risk function, it seems likely that + // this check is paranoid anyways. + name: "setattr", + match: func(mode p9.FileMode) bool { return true }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().SetAttr(gomock.Any(), gomock.Any()).Do(func(p9.SetAttrMask, p9.SetAttr) { + callback() + }) + f.SetAttr(p9.SetAttrMask{}, p9.SetAttr{}) }, }, { - name: "bad-statfs", - want: sentinelErr, - fn: func(c *p9.Client) error { - sf.StatFSMock.Err = sentinelErr - _, err := sfFile.StatFS() - return err + name: "unlinkAt", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Do(func(string, uint32) { + callback() + }) + f.UnlinkAt(randomFileName(), 0) }, }, { - name: "statfs", - fn: func(c *p9.Client) error { - sf.StatFSMock.Called = false - sf.StatFSMock.Stat = p9.FSStat{Type: 1} - sf.StatFSMock.Err = nil - stat, err := sfFile.StatFS() - if !sf.StatFSMock.Called { - t.Errorf("StatfS never Called?") - } - if stat.Type != 1 { - t.Errorf("got stat %v wanted {Type: 1}", stat) - } - return err + name: "mknod", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Mknod(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, p9.FileMode, uint32, uint32, p9.UID, p9.GID) { + callback() + }) + f.Mknod(randomFileName(), 0, 0, 0, 0, 0) + }, + }, + { + 
name: "link", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Link(gomock.Any(), gomock.Any()).Do(func(p9.File, string) { + callback() + }) + f.Link(f, randomFileName()) + }, + }, + { + name: "symlink", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Symlink(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, string, p9.UID, p9.GID) { + callback() + }) + f.Symlink(randomFileName(), randomFileName(), 0, 0) + }, + }, + { + name: "mkdir", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().Mkdir(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, p9.FileMode, p9.UID, p9.GID) { + callback() + }) + f.Mkdir(randomFileName(), 0, 0, 0) + }, + }, + { + name: "create", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + // Return an error for the creation operation, as this is the simplest. + backend.EXPECT().Create(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil, p9.QID{}, uint32(0), syscall.EINVAL).Do(func(string, p9.OpenFlags, p9.FileMode, p9.UID, p9.GID) { + callback() + }) + f.Create(randomFileName(), p9.ReadOnly, 0, 0, 0) }, }, } + globalExclusive := []concurrentFn{ + { + name: "remove", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + // Remove operates on a locked parent. So we + // add a child, walk to it and call remove. + // Note that because this operation can operate + // concurrently with itself, we need to + // generate a random file name. 
+ randomFile := randomFileName() + backend.AddChild(randomFile, h.NewFile()) + defer backend.RemoveChild(randomFile) + _, file, err := f.Walk([]string{randomFile}) + if err != nil { + h.t.Fatalf("walk got %v, want nil", err) + } - // First, create a new server and connection. - serverSocket, clientSocket, err := unet.SocketPair(false) - if err != nil { - t.Fatalf("socketpair got err %v wanted nil", err) - } - defer clientSocket.Close() - server := p9.NewServer(a) - go server.Handle(serverSocket) - client, err := p9.NewClient(clientSocket, 1024*1024 /* 1M message size */, p9.HighestVersionString()) - if err != nil { - t.Fatalf("new client got err %v, wanted nil", err) - } + // Remove is automatically translated to the parent. + backend.EXPECT().UnlinkAt(gomock.Any(), gomock.Any()).Do(func(string, uint32) { + callback() + }) - // Now, run through each of the test steps. - for _, step := range testSteps { - err := step.fn(client) - if err != step.want { - // Don't fail, just note this one step failed. - t.Errorf("step %q got %v wanted %v", step.name, err, step.want) - } - } -} + // Remove is also a close. + file.(deprecatedRemover).Remove() + }, + }, + { + name: "rename", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + // Similarly to remove, because we need to + // operate on a child, we allow a walk. + randomFile := randomFileName() + backend.AddChild(randomFile, h.NewFile()) + defer backend.RemoveChild(randomFile) + _, file, err := f.Walk([]string{randomFile}) + if err != nil { + h.t.Fatalf("walk got %v, want nil", err) + } + defer file.Close() + fileBackend := h.Pop(file) -func BenchmarkClient(b *testing.B) { - // Backend mock. - a := &AttachMock{ - File: &FileMock{ - ReadAtMock: ReadAtMock{N: 1}, + // Rename is automatically translated to the parent. 
+ backend.EXPECT().RenameAt(gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, p9.File, string) { + callback() + }) + + // Attempt the rename. + fileBackend.EXPECT().Renamed(gomock.Any(), gomock.Any()) + file.Rename(f, randomFileName()) + }, }, - } + { + name: "renameAt", + match: func(mode p9.FileMode) bool { return mode.IsDir() }, + op: func(h *Harness, backend *Mock, f p9.File, callback func()) { + backend.EXPECT().RenameAt(gomock.Any(), gomock.Any(), gomock.Any()).Do(func(string, p9.File, string) { + callback() + }) - // First, create a new server and connection. - serverSocket, clientSocket, err := unet.SocketPair(false) - if err != nil { - b.Fatalf("socketpair got err %v wanted nil", err) - } - defer clientSocket.Close() - server := p9.NewServer(a) - go server.Handle(serverSocket) - client, err := p9.NewClient(clientSocket, 1024*1024 /* 1M message size */, p9.HighestVersionString()) - if err != nil { - b.Fatalf("new client got %v, expected nil", err) + // Attempt the rename. There are no active fids + // with this name, so we don't need to expect + // Renamed hooks on anything. + f.RenameAt(randomFileName(), f, randomFileName()) + }, + }, } - // Attach to the server. - f, err := client.Attach("") - if err != nil { - b.Fatalf("error during attach, got %v wanted nil", err) + for _, fn1 := range readExclusive { + for _, fn2 := range readExclusive { + for name := range newTypeMap(nil) { + // Everything should be able to proceed in parallel. + concurrentTest(t, name, fn1, fn2, true, true) + concurrentTest(t, name, fn1, fn2, false, true) + } + } } - // Open the file. - if _, _, _, err := f.Open(p9.ReadOnly); err != nil { - b.Fatalf("error during open, got %v wanted nil", err) + for _, fn1 := range append(readExclusive, writeExclusive...) { + for _, fn2 := range writeExclusive { + for name := range newTypeMap(nil) { + // Only cross-directory functions should proceed in parallel. 
+ concurrentTest(t, name, fn1, fn2, true, false) + concurrentTest(t, name, fn1, fn2, false, true) + } + } } - // Reset the clock. - b.ResetTimer() - - // Do N reads. - var buf [1]byte - for i := 0; i < b.N; i++ { - _, err := f.ReadAt(buf[:], 0) - if err != nil { - b.Fatalf("error during read %d, got %v wanted nil", i, err) + for _, fn1 := range append(append(readExclusive, writeExclusive...), globalExclusive...) { + for _, fn2 := range globalExclusive { + for name := range newTypeMap(nil) { + // Nothing should be able to run in parallel. + concurrentTest(t, name, fn1, fn2, true, false) + concurrentTest(t, name, fn1, fn2, false, false) + } } } } diff --git a/pkg/p9/p9test/mocks.go b/pkg/p9/p9test/mocks.go deleted file mode 100644 index 9a8c14975..000000000 --- a/pkg/p9/p9test/mocks.go +++ /dev/null @@ -1,489 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package p9test - -import ( - "gvisor.googlesource.com/gvisor/pkg/fd" - "gvisor.googlesource.com/gvisor/pkg/p9" -) - -// StatFSMock mocks p9.File.StatFS. -type StatFSMock struct { - Called bool - - // Return. - Stat p9.FSStat - Err error -} - -// StatFS implements p9.File.StatFS. -func (f *StatFSMock) StatFS() (p9.FSStat, error) { - f.Called = true - return f.Stat, f.Err -} - -// GetAttrMock mocks p9.File.GetAttr. -type GetAttrMock struct { - Called bool - - // Args. - Req p9.AttrMask - - // Return. 
- QID p9.QID - Valid p9.AttrMask - Attr p9.Attr - Err error -} - -// GetAttr implements p9.File.GetAttr. -func (g *GetAttrMock) GetAttr(req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - g.Called, g.Req = true, req - return g.QID, g.Valid, g.Attr, g.Err -} - -// WalkGetAttrMock mocks p9.File.WalkGetAttr. -type WalkGetAttrMock struct { - Called bool - - // Args. - Names []string - - // Return. - QIDs []p9.QID - File p9.File - Valid p9.AttrMask - Attr p9.Attr - Err error -} - -// WalkGetAttr implements p9.File.WalkGetAttr. -func (w *WalkGetAttrMock) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) { - w.Called = true - w.Names = append(w.Names, names...) - return w.QIDs, w.File, w.Valid, w.Attr, w.Err -} - -// SetAttrMock mocks p9.File.SetAttr. -type SetAttrMock struct { - Called bool - - // Args. - Valid p9.SetAttrMask - Attr p9.SetAttr - - // Return. - Err error -} - -// SetAttr implements p9.File.SetAttr. -func (s *SetAttrMock) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { - s.Called, s.Valid, s.Attr = true, valid, attr - return s.Err -} - -// RemoveMock mocks p9.File.Remove. -type RemoveMock struct { - Called bool - - // Return. - Err error -} - -// Remove implements p9.File.Remove. -func (r *RemoveMock) Remove() error { - r.Called = true - return r.Err -} - -// OpenMock mocks p9.File.Open. -type OpenMock struct { - Called bool - - // Args. - Flags p9.OpenFlags - - // Return. - File *fd.FD - QID p9.QID - IOUnit uint32 - Err error -} - -// Open implements p9.File.Open. -func (o *OpenMock) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { - o.Called, o.Flags = true, flags - return o.File, o.QID, o.IOUnit, o.Err -} - -// ReadAtMock mocks p9.File.ReadAt. -type ReadAtMock struct { - Called bool - - // Args. - P []byte - Offset uint64 - - // Return. - N int - Err error -} - -// ReadAt implements p9.File.ReadAt. 
-func (r *ReadAtMock) ReadAt(p []byte, offset uint64) (int, error) { - r.Called, r.P, r.Offset = true, p, offset - return r.N, r.Err -} - -// WriteAtMock mocks p9.File.WriteAt. -type WriteAtMock struct { - Called bool - - // Args. - P []byte - Offset uint64 - - // Return. - N int - Err error -} - -// WriteAt implements p9.File.WriteAt. -func (w *WriteAtMock) WriteAt(p []byte, offset uint64) (int, error) { - w.Called, w.P, w.Offset = true, p, offset - return w.N, w.Err -} - -// FSyncMock mocks p9.File.FSync. -type FSyncMock struct { - Called bool - - // Return. - Err error -} - -// FSync implements p9.File.FSync. -func (f *FSyncMock) FSync() error { - f.Called = true - return f.Err -} - -// MkdirMock mocks p9.File.Mkdir. -type MkdirMock struct { - Called bool - - // Args. - Name string - Permissions p9.FileMode - UID p9.UID - GID p9.GID - - // Return. - QID p9.QID - Err error -} - -// Mkdir implements p9.File.Mkdir. -func (s *MkdirMock) Mkdir(name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { - s.Called, s.Name, s.Permissions, s.UID, s.GID = true, name, permissions, uid, gid - return s.QID, s.Err -} - -// SymlinkMock mocks p9.File.Symlink. -type SymlinkMock struct { - Called bool - - // Args. - Oldname string - Newname string - UID p9.UID - GID p9.GID - - // Return. - QID p9.QID - Err error -} - -// Symlink implements p9.File.Symlink. -func (s *SymlinkMock) Symlink(oldname string, newname string, uid p9.UID, gid p9.GID) (p9.QID, error) { - s.Called, s.Oldname, s.Newname, s.UID, s.GID = true, oldname, newname, uid, gid - return s.QID, s.Err -} - -// MknodMock mocks p9.File.Mknod. -type MknodMock struct { - Called bool - - // Args. - Name string - Permissions p9.FileMode - Major uint32 - Minor uint32 - UID p9.UID - GID p9.GID - - // Return. - QID p9.QID - Err error -} - -// Mknod implements p9.File.Mknod. 
-func (m *MknodMock) Mknod(name string, permissions p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { - m.Called, m.Name, m.Permissions, m.Major, m.Minor, m.UID, m.GID = true, name, permissions, major, minor, uid, gid - return m.QID, m.Err -} - -// UnlinkAtMock mocks p9.File.UnlinkAt. -type UnlinkAtMock struct { - Called bool - - // Args. - Name string - Flags uint32 - - // Return. - Err error -} - -// UnlinkAt implements p9.File.UnlinkAt. -func (u *UnlinkAtMock) UnlinkAt(name string, flags uint32) error { - u.Called, u.Name, u.Flags = true, name, flags - return u.Err -} - -// ReaddirMock mocks p9.File.Readdir. -type ReaddirMock struct { - Called bool - - // Args. - Offset uint64 - Count uint32 - - // Return. - Dirents []p9.Dirent - Err error -} - -// Readdir implements p9.File.Readdir. -func (r *ReaddirMock) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { - r.Called, r.Offset, r.Count = true, offset, count - return r.Dirents, r.Err -} - -// ReadlinkMock mocks p9.File.Readlink. -type ReadlinkMock struct { - Called bool - - // Return. - Target string - Err error -} - -// Readlink implements p9.File.Readlink. -func (r *ReadlinkMock) Readlink() (string, error) { - r.Called = true - return r.Target, r.Err -} - -// AttachMock mocks p9.Attacher.Attach. -type AttachMock struct { - Called bool - - // Return. - File p9.File - Err error -} - -// Attach implements p9.Attacher.Attach. -func (a *AttachMock) Attach() (p9.File, error) { - a.Called = true - return a.File, a.Err -} - -// WalkMock mocks p9.File.Walk. -type WalkMock struct { - Called bool - - // Args. - Names []string - - // Return. - QIDs []p9.QID - File p9.File - Err error -} - -// Walk implements p9.File.Walk. -func (w *WalkMock) Walk(names []string) ([]p9.QID, p9.File, error) { - w.Called = true - w.Names = append(w.Names, names...) - return w.QIDs, w.File, w.Err -} - -// RenameMock mocks p9.File.Rename. -type RenameMock struct { - Called bool - - // Args. 
- Directory p9.File - Name string - - // Return. - Err error -} - -// Rename implements p9.File.Rename. -func (r *RenameMock) Rename(directory p9.File, name string) error { - r.Called, r.Directory, r.Name = true, directory, name - return r.Err -} - -// CloseMock mocks p9.File.Close. -type CloseMock struct { - Called bool - - // Return. - Err error -} - -// Close implements p9.File.Close. -func (d *CloseMock) Close() error { - d.Called = true - return d.Err -} - -// CreateMock mocks p9.File.Create. -type CreateMock struct { - Called bool - - // Args. - Name string - Flags p9.OpenFlags - Permissions p9.FileMode - UID p9.UID - GID p9.GID - - // Return. - HostFile *fd.FD - File p9.File - QID p9.QID - IOUnit uint32 - Err error -} - -// Create implements p9.File.Create. -func (c *CreateMock) Create(name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { - c.Called, c.Name, c.Flags, c.Permissions, c.UID, c.GID = true, name, flags, permissions, uid, gid - return c.HostFile, c.File, c.QID, c.IOUnit, c.Err -} - -// LinkMock mocks p9.File.Link. -type LinkMock struct { - Called bool - - // Args. - Target p9.File - Newname string - - // Return. - Err error -} - -// Link implements p9.File.Link. -func (l *LinkMock) Link(target p9.File, newname string) error { - l.Called, l.Target, l.Newname = true, target, newname - return l.Err -} - -// RenameAtMock mocks p9.File.RenameAt. -type RenameAtMock struct { - Called bool - - // Args. - Oldname string - Newdir p9.File - Newname string - - // Return. - Err error -} - -// RenameAt implements p9.File.RenameAt. -func (r *RenameAtMock) RenameAt(oldname string, newdir p9.File, newname string) error { - r.Called, r.Oldname, r.Newdir, r.Newname = true, oldname, newdir, newname - return r.Err -} - -// FlushMock mocks p9.File.Flush. -type FlushMock struct { - Called bool - - // Return. - Err error -} - -// Flush implements p9.File.Flush. 
-func (f *FlushMock) Flush() error { - return f.Err -} - -// ConnectMock mocks p9.File.Connect. -type ConnectMock struct { - Called bool - - // Args. - Flags p9.ConnectFlags - - // Return. - File *fd.FD - Err error -} - -// Connect implements p9.File.Connect. -func (o *ConnectMock) Connect(flags p9.ConnectFlags) (*fd.FD, error) { - o.Called, o.Flags = true, flags - return o.File, o.Err -} - -// FileMock mocks p9.File. -type FileMock struct { - WalkMock - WalkGetAttrMock - StatFSMock - GetAttrMock - SetAttrMock - RemoveMock - RenameMock - CloseMock - OpenMock - ReadAtMock - WriteAtMock - FSyncMock - CreateMock - MkdirMock - SymlinkMock - LinkMock - MknodMock - RenameAtMock - UnlinkAtMock - ReaddirMock - ReadlinkMock - FlushMock - ConnectMock -} - -var ( - _ p9.File = &FileMock{} -) diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go new file mode 100644 index 000000000..417b55950 --- /dev/null +++ b/pkg/p9/p9test/p9test.go @@ -0,0 +1,329 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package p9test provides standard mocks for p9. +package p9test + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + "testing" + + "github.com/golang/mock/gomock" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +// Harness is an attacher mock. 
+type Harness struct { + t *testing.T + mockCtrl *gomock.Controller + Attacher *MockAttacher + wg sync.WaitGroup + clientSocket *unet.Socket + mu sync.Mutex + created []*Mock +} + +// globalPath is a QID.Path Generator. +var globalPath uint64 + +// MakePath returns a globally unique path. +func MakePath() uint64 { + return atomic.AddUint64(&globalPath, 1) +} + +// Generator is a function that generates a new file. +type Generator func(parent *Mock) *Mock + +// Mock is a common mock element. +type Mock struct { + p9.DefaultWalkGetAttr + *MockFile + parent *Mock + closed bool + harness *Harness + QID p9.QID + Attr p9.Attr + children map[string]Generator + + // WalkCallback is a special function that will be called from within + // the walk context. This is needed for the concurrent tests within + // this package. + WalkCallback func() error +} + +// globalMu protects the children maps in all mocks. Note that this is not a +// particularly elegant solution, but because the test has walks from the root +// through to final nodes, we must share maps below, and it's easiest to simply +// protect against concurrent access globally. +var globalMu sync.RWMutex + +// AddChild adds a new child to the Mock. +func (m *Mock) AddChild(name string, generator Generator) { + globalMu.Lock() + defer globalMu.Unlock() + m.children[name] = generator +} + +// RemoveChild removes the child with the given name. +func (m *Mock) RemoveChild(name string) { + globalMu.Lock() + defer globalMu.Unlock() + delete(m.children, name) +} + +// Matches implements gomock.Matcher.Matches. +func (m *Mock) Matches(x interface{}) bool { + if om, ok := x.(*Mock); ok { + return m.QID.Path == om.QID.Path + } + return false +} + +// String implements gomock.Matcher.String. +func (m *Mock) String() string { + return fmt.Sprintf("Mock{Mode: 0x%x, QID.Path: %d}", m.Attr.Mode, m.QID.Path) +} + +// GetAttr returns the current attributes. 
+func (m *Mock) GetAttr(mask p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + return m.QID, p9.AttrMaskAll(), m.Attr, nil +} + +// Walk supports clone and walking in directories. +func (m *Mock) Walk(names []string) ([]p9.QID, p9.File, error) { + if m.WalkCallback != nil { + if err := m.WalkCallback(); err != nil { + return nil, nil, err + } + } + if len(names) == 0 { + // Clone the file appropriately. + nm := m.harness.NewMock(m.parent, m.QID.Path, m.Attr) + nm.children = m.children // Inherit children. + return []p9.QID{nm.QID}, nm, nil + } else if len(names) != 1 { + m.harness.t.Fail() // Should not happen. + return nil, nil, syscall.EINVAL + } + + if m.Attr.Mode.IsDir() { + globalMu.RLock() + defer globalMu.RUnlock() + if fn, ok := m.children[names[0]]; ok { + // Generate the child. + nm := fn(m) + return []p9.QID{nm.QID}, nm, nil + } + // No child found. + return nil, nil, syscall.ENOENT + } + + // Call the underlying mock. + return m.MockFile.Walk(names) +} + +// WalkGetAttr calls the default implementation; this is a client-side optimization. +func (m *Mock) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) { + return m.DefaultWalkGetAttr.WalkGetAttr(names) +} + +// Pop pops off the most recently created Mock and assert that this mock +// represents the same file passed in. If nil is passed in, no check is +// performed. +// +// Precondition: there must be at least one Mock or this will panic. +func (h *Harness) Pop(clientFile p9.File) *Mock { + h.mu.Lock() + defer h.mu.Unlock() + + if clientFile == nil { + // If no clientFile is provided, then we always return the last + // created file. The caller can safely use this as long as + // there is no concurrency. + m := h.created[len(h.created)-1] + h.created = h.created[:len(h.created)-1] + return m + } + + qid, _, _, err := clientFile.GetAttr(p9.AttrMaskAll()) + if err != nil { + // We do not expect this to happen. 
+ panic(fmt.Sprintf("err during Pop: %v", err)) + } + + // Find the relevant file in our created list. We must scan the last + // from back to front to ensure that we favor the most recently + // generated file. + for i := len(h.created) - 1; i >= 0; i-- { + m := h.created[i] + if qid.Path == m.QID.Path { + // Copy and truncate. + copy(h.created[i:], h.created[i+1:]) + h.created = h.created[:len(h.created)-1] + return m + } + } + + // Unable to find relevant file. + panic(fmt.Sprintf("unable to locate file with QID %+v", qid.Path)) +} + +// NewMock returns a new base file. +func (h *Harness) NewMock(parent *Mock, path uint64, attr p9.Attr) *Mock { + m := &Mock{ + MockFile: NewMockFile(h.mockCtrl), + parent: parent, + harness: h, + QID: p9.QID{ + Type: p9.QIDType((attr.Mode & p9.FileModeMask) >> 12), + Path: path, + }, + Attr: attr, + } + + // Always ensure Close is after the parent's close. Note that this + // can't be done via a straight-forward After call, because the parent + // might change after initial creation. We ensure that this is true at + // close time. + m.EXPECT().Close().Return(nil).Times(1).Do(func() { + if m.parent != nil && m.parent.closed { + h.t.FailNow() + } + // Note that this should not be racy, as this operation should + // be protected by the Times(1) above first. + m.closed = true + }) + + // Remember what was created. + h.mu.Lock() + defer h.mu.Unlock() + h.created = append(h.created, m) + + return m +} + +// NewFile returns a new file mock. +// +// Note that ReadAt and WriteAt must be mocked separately. +func (h *Harness) NewFile() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeRegular}) + } +} + +// NewDirectory returns a new mock directory. +// +// Note that Mkdir, Link, Mknod, RenameAt, UnlinkAt and Readdir must be mocked +// separately. Walk is provided and children may be manipulated via AddChild +// and RemoveChild. 
After calling Walk remotely, one can use Pop to find the +// corresponding backend mock on the server side. +func (h *Harness) NewDirectory(contents map[string]Generator) Generator { + return func(parent *Mock) *Mock { + m := h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeDirectory}) + m.children = contents // Save contents. + return m + } +} + +// NewSymlink returns a new mock directory. +// +// Note that Readlink must be mocked separately. +func (h *Harness) NewSymlink() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeSymlink}) + } +} + +// NewBlockDevice returns a new mock block device. +func (h *Harness) NewBlockDevice() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeBlockDevice}) + } +} + +// NewCharacterDevice returns a new mock character device. +func (h *Harness) NewCharacterDevice() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeCharacterDevice}) + } +} + +// NewNamedPipe returns a new mock named pipe. +func (h *Harness) NewNamedPipe() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeNamedPipe}) + } +} + +// NewSocket returns a new mock socket. +func (h *Harness) NewSocket() Generator { + return func(parent *Mock) *Mock { + return h.NewMock(parent, MakePath(), p9.Attr{Mode: p9.ModeSocket}) + } +} + +// Finish completes all checks and shuts down the server. +func (h *Harness) Finish() { + h.clientSocket.Close() + h.wg.Wait() + h.mockCtrl.Finish() +} + +// NewHarness creates and returns a new test server. +// +// It should always be used as: +// +// h, c := NewHarness(t) +// defer h.Finish() +// +func NewHarness(t *testing.T) (*Harness, *p9.Client) { + // Create the mock. 
+ mockCtrl := gomock.NewController(t) + h := &Harness{ + t: t, + mockCtrl: mockCtrl, + Attacher: NewMockAttacher(mockCtrl), + } + + // Make socket pair. + serverSocket, clientSocket, err := unet.SocketPair(false) + if err != nil { + t.Fatalf("socketpair got err %v wanted nil", err) + } + + // Start the server, synchronized on exit. + server := p9.NewServer(h.Attacher) + h.wg.Add(1) + go func() { + defer h.wg.Done() + server.Handle(serverSocket) + }() + + // Create the client. + client, err := p9.NewClient(clientSocket, 1024, p9.HighestVersionString()) + if err != nil { + serverSocket.Close() + clientSocket.Close() + t.Fatalf("new client got %v, expected nil", err) + return nil, nil // Never hit. + } + + // Capture the client socket. + h.clientSocket = clientSocket + return h, client +} diff --git a/pkg/p9/path_tree.go b/pkg/p9/path_tree.go new file mode 100644 index 000000000..97f90bcd5 --- /dev/null +++ b/pkg/p9/path_tree.go @@ -0,0 +1,109 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package p9 + +import ( + "fmt" + "sync" +) + +// pathNode is a single node in a path traversal. +// +// These are shared by all fidRefs that point to the same path. +// +// These are not synchronized because we allow certain operations (file walk) +// to proceed without having to acquire a write lock. 
The lock in this +// structure exists to synchronize high-level, semantic operations, such as the +// simultaneous creation and deletion of a file. +// +// (+) below is the path component string. +type pathNode struct { + mu sync.RWMutex // See above. + fidRefs sync.Map // => map[*fidRef]string(+) + children sync.Map // => map[string(+)]*pathNode + count int64 +} + +// pathNodeFor returns the path node for the given name, or a new one. +// +// Precondition: mu must be held in a readable fashion. +func (p *pathNode) pathNodeFor(name string) *pathNode { + // Load the existing path node. + if pn, ok := p.children.Load(name); ok { + return pn.(*pathNode) + } + + // Create a new pathNode for shared use. + pn, _ := p.children.LoadOrStore(name, new(pathNode)) + return pn.(*pathNode) +} + +// nameFor returns the name for the given fidRef. +// +// Precondition: mu must be held in a readable fashion. +func (p *pathNode) nameFor(ref *fidRef) string { + if s, ok := p.fidRefs.Load(ref); ok { + return s.(string) + } + + // This should not happen, don't proceed. + panic(fmt.Sprintf("expected name for %+v, none found", ref)) +} + +// addChild adds a child to the given pathNode. +// +// This applies only to an individual fidRef. +// +// Precondition: mu must be held in a writable fashion. +func (p *pathNode) addChild(ref *fidRef, name string) { + if s, ok := p.fidRefs.Load(ref); ok { + // This should not happen, don't proceed. + panic(fmt.Sprintf("unexpected fidRef %+v with path %q, wanted %q", ref, s, name)) + } + + p.fidRefs.Store(ref, name) +} + +// removeChild removes the given child. +// +// This applies only to an individual fidRef. +// +// Precondition: mu must be held in a writable fashion. +func (p *pathNode) removeChild(ref *fidRef) { + p.fidRefs.Delete(ref) +} + +// removeWithName removes all references with the given name. +// +// The original pathNode is returned by this function, and removed from this +// pathNode. 
Any operations on the removed tree must use this value. +// +// The provided function is executed after removal. +// +// Precondition: mu must be held in a writable fashion. +func (p *pathNode) removeWithName(name string, fn func(ref *fidRef)) *pathNode { + p.fidRefs.Range(func(key, value interface{}) bool { + if value.(string) == name { + p.fidRefs.Delete(key) + fn(key.(*fidRef)) + } + return true + }) + + // Return the original path node. + origPathNode := p.pathNodeFor(name) + p.children.Delete(name) + return origPathNode +} diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 5c7cb18c8..3ef151595 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -15,6 +15,8 @@ package p9 import ( + "io" + "runtime/debug" "sync" "sync/atomic" "syscall" @@ -27,6 +29,19 @@ import ( type Server struct { // attacher provides the attach function. attacher Attacher + + // pathTree is the full set of paths opened on this server. + // + // These may be across different connections, but rename operations + // must be serialized globally for safely. There is a single pathTree + // for the entire server, and not per connection. + pathTree pathNode + + // renameMu is a global lock protecting rename operations. With this + // lock, we can be certain that any given rename operation can safely + // acquire two path nodes in any order, as all other concurrent + // operations acquire at most a single node. + renameMu sync.RWMutex } // NewServer returns a new server. @@ -81,6 +96,9 @@ type connState struct { // fidRef wraps a node and tracks references. type fidRef struct { + // server is the associated server. + server *Server + // file is the associated File. file File @@ -97,13 +115,39 @@ type fidRef struct { // This is updated in handlers.go. opened bool - // walkable indicates this fidRef may be walked. - walkable bool + // mode is the fidRef's mode from the walk. Only the type bits are + // valid, the permissions may change. 
This is used to sanity check + // operations on this element, and prevent walks across + // non-directories. + mode FileMode // openFlags is the mode used in the open. // // This is updated in handlers.go. openFlags OpenFlags + + // pathNode is the current pathNode for this FID. + pathNode *pathNode + + // parent is the parent fidRef. We hold on to a parent reference to + // ensure that hooks, such as Renamed, can be executed safely by the + // server code. + // + // Note that parent cannot be changed without holding both the global + // rename lock and a writable lock on the associated pathNode for this + // fidRef. Holding either of these locks is sufficient to examine + // parent safely. + // + // The parent will be nil for root fidRefs, and non-nil otherwise. The + // method maybeParent can be used to return a cyclical reference, and + // isRoot should be used to check for root over looking at parent + // directly. + parent *fidRef + + // deleted indicates that the backing file has been deleted. We stop + // many operations at the API level if they are incompatible with a + // file that has already been unlinked. + deleted uint32 } // OpenFlags returns the flags the file was opened with and true iff the fid was opened previously. @@ -113,13 +157,146 @@ func (f *fidRef) OpenFlags() (OpenFlags, bool) { return f.openFlags, f.opened } +// IncRef increases the references on a fid. +func (f *fidRef) IncRef() { + atomic.AddInt64(&f.refs, 1) +} + // DecRef should be called when you're finished with a fid. func (f *fidRef) DecRef() { if atomic.AddInt64(&f.refs, -1) == 0 { f.file.Close() + + // Drop the parent reference. + // + // Since this fidRef is guaranteed to be non-discoverable when + // the references reach zero, we don't need to worry about + // clearing the parent. + if f.parent != nil { + // If we've been previously deleted, this removing this + // ref is a no-op. That's expected. 
+ f.parent.pathNode.removeChild(f) + f.parent.DecRef() + } } } +// isDeleted returns true if this fidRef has been deleted. +func (f *fidRef) isDeleted() bool { + return atomic.LoadUint32(&f.deleted) != 0 +} + +// isRoot indicates whether this is a root fid. +func (f *fidRef) isRoot() bool { + return f.parent == nil +} + +// maybeParent returns a cyclic reference for roots, and the parent otherwise. +func (f *fidRef) maybeParent() *fidRef { + if f.parent != nil { + return f.parent + } + return f // Root has itself. +} + +// notifyDelete marks all fidRefs as deleted. +// +// Precondition: the write lock must be held on the given pathNode. +func notifyDelete(pn *pathNode) { + // Call on all local references. + pn.fidRefs.Range(func(key, _ interface{}) bool { + ref := key.(*fidRef) + atomic.StoreUint32(&ref.deleted, 1) + return true + }) + + // Call on all subtrees. + pn.children.Range(func(_, value interface{}) bool { + notifyDelete(value.(*pathNode)) + return true + }) +} + +// markChildDeleted marks all children below the given name as deleted. +// +// Precondition: this must be called via safelyWrite or safelyGlobal. +func (f *fidRef) markChildDeleted(name string) { + origPathNode := f.pathNode.removeWithName(name, func(ref *fidRef) { + atomic.StoreUint32(&ref.deleted, 1) + }) + + // Mark everything below as deleted. + notifyDelete(origPathNode) +} + +// notifyNameChange calls the relevant Renamed method on all nodes in the path, +// recursively. Note that this applies only for subtrees, as these +// notifications do not apply to the actual file whose name has changed. +// +// Precondition: the write lock must be held on the given pathNode. +func notifyNameChange(pn *pathNode) { + // Call on all local references. + pn.fidRefs.Range(func(key, value interface{}) bool { + ref := key.(*fidRef) + name := value.(string) + ref.file.Renamed(ref.parent.file, name) + return true + }) + + // Call on all subtrees. 
+ pn.children.Range(func(_, value interface{}) bool { + notifyNameChange(value.(*pathNode)) + return true + }) +} + +// renameChildTo renames the given child to the target. +// +// Precondition: this must be called via safelyGlobal. +func (f *fidRef) renameChildTo(oldName string, target *fidRef, newName string) { + target.markChildDeleted(newName) + origPathNode := f.pathNode.removeWithName(oldName, func(ref *fidRef) { + ref.parent.DecRef() // Drop original reference. + ref.parent = target // Change parent. + ref.parent.IncRef() // Acquire new one. + target.pathNode.addChild(ref, newName) + ref.file.Renamed(target.file, newName) + }) + + // Replace the previous (now deleted) path node. + f.pathNode.children.Store(newName, origPathNode) + + // Call Renamed on everything above. + notifyNameChange(origPathNode) +} + +// safelyRead executes the given operation with the local path node locked. +// This implies that paths will not change during the operation. +func (f *fidRef) safelyRead(fn func() error) (err error) { + f.server.renameMu.RLock() + defer f.server.renameMu.RUnlock() + f.pathNode.mu.RLock() + defer f.pathNode.mu.RUnlock() + return fn() +} + +// safelyWrite executes the given operation with the local path node locked in +// a writable fashion. This implies some paths may change. +func (f *fidRef) safelyWrite(fn func() error) (err error) { + f.server.renameMu.RLock() + defer f.server.renameMu.RUnlock() + f.pathNode.mu.Lock() + defer f.pathNode.mu.Unlock() + return fn() +} + +// safelyGlobal executes the given operation with the global path lock held. +func (f *fidRef) safelyGlobal(fn func() error) (err error) { + f.server.renameMu.Lock() + defer f.server.renameMu.Unlock() + return fn() +} + // LookupFID finds the given FID. // // You should call fid.DecRef when you are finished using the fid. 
@@ -128,7 +305,7 @@ func (cs *connState) LookupFID(fid FID) (*fidRef, bool) { defer cs.fidMu.Unlock() fidRef, ok := cs.fids[fid] if ok { - atomic.AddInt64(&fidRef.refs, 1) + fidRef.IncRef() return fidRef, true } return nil, false @@ -145,7 +322,7 @@ func (cs *connState) InsertFID(fid FID, newRef *fidRef) { if ok { defer origRef.DecRef() } - atomic.AddInt64(&newRef.refs, 1) + newRef.IncRef() cs.fids[fid] = newRef } @@ -229,10 +406,9 @@ func (cs *connState) handleRequest() { cs.recvDone <- nil // Deal with other errors. - if err != nil { + if err != nil && err != io.EOF { // If it's not a connection error, but some other protocol error, // we can send a response immediately. - log.Debugf("err [%05d] %v", tag, err) cs.sendMu.Lock() err := send(cs.conn, tag, newErr(err)) cs.sendMu.Unlock() @@ -243,12 +419,38 @@ func (cs *connState) handleRequest() { // Try to start the tag. if !cs.StartTag(tag) { // Nothing we can do at this point; client is bogus. + log.Debugf("no valid tag [%05d]", tag) cs.sendDone <- ErrNoValidMessage return } // Handle the message. - var r message + var r message // r is the response. + defer func() { + if r == nil { + // Don't allow a panic to propagate. + recover() + + // Include a useful log message. + log.Warningf("panic in handler: %s", debug.Stack()) + + // Wrap in an EFAULT error; we don't really have a + // better way to describe this kind of error. It will + // usually manifest as a result of the test framework. + r = newErr(syscall.EFAULT) + } + + // Clear the tag before sending. That's because as soon as this + // hits the wire, the client can legally send another message + // with the same tag. + cs.ClearTag(tag) + + // Send back the result. + cs.sendMu.Lock() + err = send(cs.conn, tag, r) + cs.sendMu.Unlock() + cs.sendDone <- err + }() if handler, ok := m.(handler); ok { // Call the message handler. r = handler.handle(cs) @@ -256,18 +458,6 @@ func (cs *connState) handleRequest() { // Produce an ENOSYS error. 
r = newErr(syscall.ENOSYS) } - - // Clear the tag before sending. That's because as soon - // as this hits the wire, the client can legally send - // another message with the same tag. - cs.ClearTag(tag) - - // Send back the result. - cs.sendMu.Lock() - err = send(cs.conn, tag, r) - cs.sendMu.Unlock() - cs.sendDone <- err - return } func (cs *connState) handleRequests() { diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index 97396806c..bafb377de 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -167,7 +167,7 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message, r.EnableFDs(1) n, err := r.ReadVec([][]byte{hdr[:]}) - if err != nil { + if err != nil && (n == 0 || err != io.EOF) { r.CloseFDs() return NoTag, nil, ErrSocket{err} } @@ -189,10 +189,8 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message, // Continuing reading for a short header. for n < int(headerLength) { cur, err := r.ReadVec([][]byte{hdr[n:]}) - if err != nil { + if err != nil && (cur == 0 || err != io.EOF) { return NoTag, nil, ErrSocket{err} - } else if cur == 0 { - return NoTag, nil, ErrSocket{io.EOF} } n += cur } @@ -296,10 +294,8 @@ func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message, r := s.Reader(true) for n := 0; n < int(remaining); { cur, err := r.ReadVec(vecs) - if err != nil { + if err != nil && (cur == 0 || err != io.EOF) { return NoTag, nil, ErrSocket{err} - } else if cur == 0 { - return NoTag, nil, ErrSocket{io.EOF} } n += cur diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index 97b9ba3ff..0c9efc709 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "rand", srcs = [ diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index 1975d17a6..657f923ed 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -1,8 +1,8 @@ 
-package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "victim", testonly = 1, diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD index 0ed38c64a..29f751725 100644 --- a/pkg/secio/BUILD +++ b/pkg/secio/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "secio", srcs = [ diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 314b3e962..9bf04360a 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,6 +1,7 @@ +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") + package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") load("//tools/go_stateify:defs.bzl", "go_library") go_library( diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD index 2a7a6df23..02d24defd 100644 --- a/pkg/sentry/context/BUILD +++ b/pkg/sentry/context/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "context", srcs = ["context.go"], diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index fbdde0721..c3b682d6f 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "control", srcs = [ diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 69c99b0b3..bebdb2939 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - 
load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "device", srcs = ["device.go"], diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index ff4ab850a..4bd912e95 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "anon", srcs = [ diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index cef01829a..c9e531e40 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -56,14 +56,10 @@ go_test( srcs = ["gofer_test.go"], embed = [":gofer"], deps = [ - "//pkg/log", "//pkg/p9", "//pkg/p9/p9test", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/kernel/time", - "//pkg/sentry/usermem", - "//pkg/unet", ], ) diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index a0265c2aa..455953237 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -58,13 +58,6 @@ func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9 return c.file.SetAttr(valid, attr) } -func (c *contextFile) remove(ctx context.Context) error { - ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Remove() -} - func (c *contextFile) rename(ctx context.Context, directory contextFile, name string) error { ctx.UninterruptibleSleepStart(false) defer ctx.UninterruptibleSleepFinish(false) diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 3190d1e18..b450778ca 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -16,110 +16,102 @@ package gofer import ( "fmt" - "io" "syscall" "testing" "time" - "gvisor.googlesource.com/gvisor/pkg/log" 
"gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/p9/p9test" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/unet" ) -// goodMockFile returns a file that can be Walk'ed to and created. -func goodMockFile(mode p9.FileMode, size uint64) *p9test.FileMock { - return &p9test.FileMock{ - GetAttrMock: p9test.GetAttrMock{ - Attr: p9.Attr{Mode: mode, Size: size, RDev: 0}, - Valid: p9.AttrMaskAll(), - }, - } -} - -func newClosedSocket() (*unet.Socket, error) { - fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) - if err != nil { - return nil, err - } - - s, err := unet.NewSocket(fd) - if err != nil { - syscall.Close(fd) - return nil, err - } - - return s, s.Close() -} - -// root returns a p9 file mock and an fs.InodeOperations created from that file. Any -// functions performed on fs.InodeOperations will use the p9 file mock. -func root(ctx context.Context, cp cachePolicy, mode p9.FileMode, size uint64) (*p9test.FileMock, *fs.Inode, error) { - sock, err := newClosedSocket() - if err != nil { - return nil, nil, err - } - - // Construct a dummy session that we can destruct. - s := &session{ - conn: sock, - mounter: fs.RootOwner, - cachePolicy: cp, - client: &p9.Client{}, - } - - rootFile := goodMockFile(mode, size) - sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{file: rootFile}, p9.QID{}, rootFile.GetAttrMock.Valid, rootFile.GetAttrMock.Attr, false /* socket */) - m := fs.NewMountSource(s, &filesystem{}, fs.MountSourceFlags{}) - return rootFile, fs.NewInode(rootInodeOperations, m, sattr), nil +// rootTest runs a test with a p9 mock and an fs.InodeOperations created from +// the attached root directory. 
The root file will be closed and client +// disconnected, but additional files must be closed manually. +func rootTest(t *testing.T, name string, cp cachePolicy, fn func(context.Context, *p9test.Harness, *p9test.Mock, *fs.Inode)) { + t.Run(name, func(t *testing.T) { + h, c := p9test.NewHarness(t) + defer h.Finish() + + // Create a new root. Note that we pass an empty, but non-nil + // map here. This allows tests to extend the root children + // dynamically. + root := h.NewDirectory(map[string]p9test.Generator{})(nil) + + // Return this as the root. + h.Attacher.EXPECT().Attach().Return(root, nil).Times(1) + + // ... and open via the client. + rootFile, err := c.Attach("/") + if err != nil { + t.Fatalf("unable to attach: %v", err) + } + defer rootFile.Close() + + // Wrap an a session. + s := &session{ + mounter: fs.RootOwner, + cachePolicy: cp, + client: c, + } + + // ... and an INode, with only the mode being explicitly valid for now. + ctx := contexttest.Context(t) + sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{ + file: rootFile, + }, root.QID, p9.AttrMaskAll(), root.Attr, false /* socket */) + m := fs.NewMountSource(s, &filesystem{}, fs.MountSourceFlags{}) + rootInode := fs.NewInode(rootInodeOperations, m, sattr) + + // Ensure that the cache is fully invalidated, so that any + // close actions actually take place before the full harness is + // torn down. + defer m.FlushDirentRefs() + + // Execute the test. + fn(ctx, h, root, rootInode) + }) } func TestLookup(t *testing.T) { - // Test parameters. type lookupTest struct { // Name of the test. name string - // Function input parameters. - fileName string - // Expected return value. 
want error } tests := []lookupTest{ { - name: "mock Walk passes (function succeeds)", - fileName: "ppp", - want: nil, + name: "mock Walk passes (function succeeds)", + want: nil, }, { - name: "mock Walk fails (function fails)", - fileName: "ppp", - want: syscall.ENOENT, + name: "mock Walk fails (function fails)", + want: syscall.ENOENT, }, } - ctx := contexttest.Context(t) + const file = "file" // The walked target file. + for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) + rootTest(t, test.name, cacheNone, func(ctx context.Context, h *p9test.Harness, rootFile *p9test.Mock, rootInode *fs.Inode) { + // Setup the appropriate result. + rootFile.WalkCallback = func() error { + return test.want + } + if test.want == nil { + // Set the contents of the root. We expect a + // normal file generator for ppp above. This is + // overriden by setting WalkErr in the mock. + rootFile.AddChild(file, h.NewFile()) } - - rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} - rootFile.WalkGetAttrMock.Err = test.want - rootFile.WalkGetAttrMock.File = goodMockFile(p9.PermissionsMask, 0) // Call function. - dirent, err := rootInode.Lookup(ctx, test.fileName) + dirent, err := rootInode.Lookup(ctx, file) // Unwrap the InodeOperations. var newInodeOperations fs.InodeOperations @@ -138,19 +130,12 @@ func TestLookup(t *testing.T) { if err == nil && newInodeOperations == nil { t.Errorf("Lookup got non-nil err and non-nil node, wanted at least one non-nil") } - - // Check mock parameters. 
- if !rootFile.WalkGetAttrMock.Called { - t.Errorf("GetAttr not called; error: %v", err) - } else if rootFile.WalkGetAttrMock.Names[0] != test.fileName { - t.Errorf("file name not set") - } }) } } func TestRevalidation(t *testing.T) { - tests := []struct { + type revalidationTest struct { cachePolicy cachePolicy // Whether dirent should be reloaded before any modifications. @@ -167,7 +152,9 @@ func TestRevalidation(t *testing.T) { // Whether dirent should be reloaded after the remote has // removed the file. postRemovalWantReload bool - }{ + } + + tests := []revalidationTest{ { // Policy cacheNone causes Revalidate to always return // true. @@ -208,67 +195,83 @@ func TestRevalidation(t *testing.T) { }, } - ctx := contexttest.Context(t) + const file = "file" // The file walked below. + for _, test := range tests { name := fmt.Sprintf("cachepolicy=%s", test.cachePolicy) - t.Run(name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, test.cachePolicy, p9.ModeDirectory|p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - + rootTest(t, name, test.cachePolicy, func(ctx context.Context, h *p9test.Harness, rootFile *p9test.Mock, rootInode *fs.Inode) { + // Wrap in a dirent object. rootDir := fs.NewDirent(rootInode, "root") - // Create a mock file that we will walk to from the root. - const ( - name = "foo" - mode = p9.PermissionsMask - ) - file := goodMockFile(mode, 0) - file.GetAttrMock.Valid = p9.AttrMaskAll() - - // Tell the root mock how to walk to this file. - rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} - rootFile.WalkGetAttrMock.File = file - rootFile.WalkGetAttrMock.Attr = file.GetAttrMock.Attr - rootFile.WalkGetAttrMock.Valid = file.GetAttrMock.Valid + // Create a mock file a child of the root. We save when + // this is generated, so that when the time changed, we + // can update the original entry. 
+ var origMocks []*p9test.Mock + rootFile.AddChild(file, func(parent *p9test.Mock) *p9test.Mock { + // Regular a regular file that has a consistent + // path number. This might be used by + // validation so we don't change it. + m := h.NewMock(parent, 0, p9.Attr{ + Mode: p9.ModeRegular, + }) + origMocks = append(origMocks, m) + return m + }) // Do the walk. - dirent, err := rootDir.Walk(ctx, rootDir, name) + dirent, err := rootDir.Walk(ctx, rootDir, file) if err != nil { - t.Fatalf("Lookup(%q) failed: %v", name, err) + t.Fatalf("Lookup failed: %v", err) } - // Walk again. Depending on the cache policy, we may get a new - // dirent. - newDirent, err := rootDir.Walk(ctx, rootDir, name) + // We must release the dirent, of the test will fail + // with a reference leak. This is tracked by p9test. + defer dirent.DecRef() + + // Walk again. Depending on the cache policy, we may + // get a new dirent. + newDirent, err := rootDir.Walk(ctx, rootDir, file) if err != nil { - t.Fatalf("Lookup(%q) failed: %v", name, err) + t.Fatalf("Lookup failed: %v", err) } if test.preModificationWantReload && dirent == newDirent { - t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) + t.Errorf("Lookup with cachePolicy=%s got old dirent %+v, wanted a new dirent", test.cachePolicy, dirent) } if !test.preModificationWantReload && dirent != newDirent { - t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) + t.Errorf("Lookup with cachePolicy=%s got new dirent %+v, wanted old dirent %+v", test.cachePolicy, newDirent, dirent) } + newDirent.DecRef() // See above. - // Modify the underlying mocked file's modification time. + // Modify the underlying mocked file's modification + // time for the next walk that occurs. 
nowSeconds := time.Now().Unix() - rootFile.WalkGetAttrMock.Attr.MTimeSeconds = uint64(nowSeconds) - file.GetAttrMock.Attr.MTimeSeconds = uint64(nowSeconds) + rootFile.AddChild(file, func(parent *p9test.Mock) *p9test.Mock { + // Ensure that the path is the same as above, + // but we change only the modification time of + // the file. + return h.NewMock(parent, 0, p9.Attr{ + Mode: p9.ModeRegular, + MTimeSeconds: uint64(nowSeconds), + }) + }) + + // We also modify the original time, so that GetAttr + // behaves as expected for the caching case. + for _, m := range origMocks { + m.Attr.MTimeSeconds = uint64(nowSeconds) + } - // Walk again. Depending on the cache policy, we may get a new - // dirent. - newDirent, err = rootDir.Walk(ctx, rootDir, name) + // Walk again. Depending on the cache policy, we may + // get a new dirent. + newDirent, err = rootDir.Walk(ctx, rootDir, file) if err != nil { - t.Fatalf("Lookup(%q) failed: %v", name, err) + t.Fatalf("Lookup failed: %v", err) } if test.postModificationWantReload && dirent == newDirent { - t.Errorf("Lookup(%q) with cachePolicy=%s got old dirent %v, wanted a new dirent", name, test.cachePolicy, dirent) + t.Errorf("Lookup with cachePolicy=%s got old dirent, wanted a new dirent", test.cachePolicy) } if !test.postModificationWantReload && dirent != newDirent { - t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v, wanted old dirent %v", name, test.cachePolicy, newDirent, dirent) + t.Errorf("Lookup with cachePolicy=%s got new dirent, wanted old dirent", test.cachePolicy) } uattrs, err := newDirent.Inode.UnstableAttr(ctx) if err != nil { @@ -276,660 +279,25 @@ func TestRevalidation(t *testing.T) { } gotModTimeSeconds := uattrs.ModificationTime.Seconds() if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds { - t.Fatalf("Lookup(%q) with cachePolicy=%s got new modification time %v, wanted %v", name, test.cachePolicy, gotModTimeSeconds, nowSeconds) + t.Fatalf("Lookup with cachePolicy=%s got new 
modification time %v, wanted %v", test.cachePolicy, gotModTimeSeconds, nowSeconds) } + newDirent.DecRef() // See above. - // Make WalkGetAttr return ENOENT. This simulates - // removing the file from the remote fs. - rootFile.WalkGetAttrMock = p9test.WalkGetAttrMock{ - Err: syscall.ENOENT, - } + // Remove the file from the remote fs, subsequent walks + // should now fail to find anything. + rootFile.RemoveChild(file) // Walk again. Depending on the cache policy, we may // get ENOENT. - newDirent, err = rootDir.Walk(ctx, rootDir, name) + newDirent, err = rootDir.Walk(ctx, rootDir, file) if test.postRemovalWantReload && err == nil { - t.Errorf("Lookup(%q) with cachePolicy=%s got nil error, wanted ENOENT", name, test.cachePolicy) + t.Errorf("Lookup with cachePolicy=%s got nil error, wanted ENOENT", test.cachePolicy) } if !test.postRemovalWantReload && (err != nil || dirent != newDirent) { - t.Errorf("Lookup(%q) with cachePolicy=%s got new dirent %v and error %v, wanted old dirent %v and nil error", name, test.cachePolicy, newDirent, err, dirent) - } - }) - } -} - -func TestSetTimestamps(t *testing.T) { - // Test parameters. - type setTimestampsTest struct { - // Name of the test. - name string - - // Function input parameters. 
- ts fs.TimeSpec - } - - ctx := contexttest.Context(t) - now := ktime.NowFromContext(ctx) - tests := []setTimestampsTest{ - { - name: "mock SetAttr passes (function succeeds)", - ts: fs.TimeSpec{ - ATime: now, - MTime: now, - }, - }, - { - name: "mock SetAttr passes, times are 0 (function succeeds)", - ts: fs.TimeSpec{}, - }, - { - name: "mock SetAttr passes, times are 0 and not system time (function succeeds)", - ts: fs.TimeSpec{ - ATimeSetSystemTime: false, - MTimeSetSystemTime: false, - }, - }, - { - name: "mock SetAttr passes, times are set to system time (function succeeds)", - ts: fs.TimeSpec{ - ATimeSetSystemTime: true, - MTimeSetSystemTime: true, - }, - }, - { - name: "mock SetAttr passes, times are omitted (function succeeds)", - ts: fs.TimeSpec{ - ATimeOmit: true, - MTimeOmit: true, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - // Call function. - err = rootInode.SetTimestamps(ctx, nil /* Dirent */, test.ts) - - // Check return values. - if err != nil { - t.Errorf("SetTimestamps failed: got error %v, want nil", err) - } - - // Check mock parameters. - if !(test.ts.ATimeOmit && test.ts.MTimeOmit) && !rootFile.SetAttrMock.Called { - t.Errorf("TestSetTimestamps failed: SetAttr not called") - return - } - - // Check what was passed to the mock function. 
- attr := rootFile.SetAttrMock.Attr - atimeGiven := ktime.FromUnix(int64(attr.ATimeSeconds), int64(attr.ATimeNanoSeconds)) - if test.ts.ATimeOmit { - if rootFile.SetAttrMock.Valid.ATime { - t.Errorf("ATime got set true in mask, wanted false") - } - } else { - if got, want := rootFile.SetAttrMock.Valid.ATimeNotSystemTime, !test.ts.ATimeSetSystemTime; got != want { - t.Errorf("got ATimeNotSystemTime %v, want %v", got, want) - } - if !test.ts.ATimeSetSystemTime && !test.ts.ATime.Equal(atimeGiven) { - t.Errorf("ATime got %v, want %v", atimeGiven, test.ts.ATime) - } - } - - mtimeGiven := ktime.FromUnix(int64(attr.MTimeSeconds), int64(attr.MTimeNanoSeconds)) - if test.ts.MTimeOmit { - if rootFile.SetAttrMock.Valid.MTime { - t.Errorf("MTime got set true in mask, wanted false") - } - } else { - if got, want := rootFile.SetAttrMock.Valid.MTimeNotSystemTime, !test.ts.MTimeSetSystemTime; got != want { - t.Errorf("got MTimeNotSystemTime %v, want %v", got, want) - } - if !test.ts.MTimeSetSystemTime && !test.ts.MTime.Equal(mtimeGiven) { - t.Errorf("MTime got %v, want %v", mtimeGiven, test.ts.MTime) - } - } - }) - } -} - -func TestSetPermissions(t *testing.T) { - // Test parameters. - type setPermissionsTest struct { - // Name of the test. - name string - - // SetPermissions input parameters. - perms fs.FilePermissions - - // Error that SetAttr mock should return. - setAttrErr error - - // Expected return value. - want bool - } - - tests := []setPermissionsTest{ - { - name: "SetAttr mock succeeds (function succeeds)", - perms: fs.FilePermissions{User: fs.PermMask{Read: true, Write: true, Execute: true}}, - want: true, - setAttrErr: nil, - }, - { - name: "SetAttr mock fails (function fails)", - perms: fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}, - want: false, - setAttrErr: syscall.ENOENT, - }, - } - - ctx := contexttest.Context(t) - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. 
- rootFile, rootInode, err := root(ctx, cacheNone, 0, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - rootFile.SetAttrMock.Err = test.setAttrErr - - ok := rootInode.SetPermissions(ctx, nil /* Dirent */, test.perms) - - // Check return value. - if ok != test.want { - t.Errorf("SetPermissions got %v, want %v", ok, test.want) - } - - // Check mock parameters. - pattr := rootFile.SetAttrMock.Attr - if !rootFile.SetAttrMock.Called { - t.Errorf("SetAttr not called") - return - } - if !rootFile.SetAttrMock.Valid.Permissions { - t.Errorf("SetAttr did not get right request (got false, expected SetAttrMask.Permissions true)") - } - if got := fs.FilePermsFromP9(pattr.Permissions); got != test.perms { - t.Errorf("SetAttr did not get right permissions -- got %v, want %v", got, test.perms) - } - }) - } -} - -func TestClose(t *testing.T) { - ctx := contexttest.Context(t) - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - // Call function. - rootInode.InodeOperations.Release(ctx) - - // Check mock parameters. - if !rootFile.CloseMock.Called { - t.Errorf("TestClose failed: Close not called") - } -} - -func TestRename(t *testing.T) { - // Test parameters. - type renameTest struct { - // Name of the test. - name string - - // Input parameters. - newParent *fs.Inode - newName string - - // Rename mock parameters. - renameErr error - renameCalled bool - - // Error want to return given the parameters. (Same as what - // we expect and tell rename to return.) 
- want error - } - ctx := contexttest.Context(t) - rootFile, rootInode, err := root(ctx, cacheNone, p9.PermissionsMask, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - tests := []renameTest{ - { - name: "mock Rename succeeds (function succeeds)", - newParent: rootInode, - newName: "foo2", - want: nil, - renameErr: nil, - renameCalled: true, - }, - { - name: "mock Rename fails (function fails)", - newParent: rootInode, - newName: "foo2", - want: syscall.ENOENT, - renameErr: syscall.ENOENT, - renameCalled: true, - }, - { - name: "newParent is not inodeOperations but should be (function fails)", - newParent: fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}), - newName: "foo2", - want: syscall.EXDEV, - renameErr: nil, - renameCalled: false, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - mockFile := goodMockFile(p9.PermissionsMask, 0) - rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} - rootFile.WalkGetAttrMock.File = mockFile - - dirent, err := rootInode.Lookup(ctx, "foo") - if err != nil { - t.Fatalf("root.Walk failed: %v", err) - } - mockFile.RenameMock.Err = test.renameErr - mockFile.RenameMock.Called = false - - // Use a dummy oldParent to acquire write access to that directory. - oldParent := &inodeOperations{ - readdirCache: fs.NewSortedDentryMap(nil), - } - oldInode := fs.NewInode(oldParent, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}) - - // Call function. - err = dirent.Inode.InodeOperations.Rename(ctx, oldInode, "", test.newParent, test.newName) - - // Check return value. - if err != test.want { - t.Errorf("Rename got %v, want %v", err, test.want) - } - - // Check mock parameters. - if got, want := mockFile.RenameMock.Called, test.renameCalled; got != want { - t.Errorf("renameCalled got %v want %v", got, want) - } - }) - } -} - -// This file is read from in TestPreadv. -type readAtFileFake struct { - p9test.FileMock - - // Parameters for faking ReadAt. 
- FileLength int - Err error - ChunkSize int - Called bool - LengthRead int -} - -func (r *readAtFileFake) ReadAt(p []byte, offset uint64) (int, error) { - r.Called = true - log.Warningf("ReadAt fake: length read so far = %d, len(p) = %d, offset = %d", r.LengthRead, len(p), offset) - if int(offset) != r.LengthRead { - return 0, fmt.Errorf("offset got %d; expected %d", offset, r.LengthRead) - } - - if r.Err != nil { - return 0, r.Err - } - - if r.LengthRead >= r.FileLength { - return 0, io.EOF - } - - // Read at most ChunkSize and read at most what's left in the file. - toBeRead := len(p) - if r.LengthRead+toBeRead >= r.FileLength { - toBeRead = r.FileLength - int(offset) - } - if toBeRead > r.ChunkSize { - toBeRead = r.ChunkSize - } - - r.LengthRead += toBeRead - if r.LengthRead == r.FileLength { - return toBeRead, io.EOF - } - return toBeRead, nil -} - -func TestPreadv(t *testing.T) { - // Test parameters. - type preadvTest struct { - // Name of the test. - name string - - // Mock parameters - mode p9.FileMode - - // Buffer to read into. - buffer [512]byte - sliceSize int - - // How much readAt returns at a time. - chunkSize int - - // Whether or not we expect ReadAt to be called. - readAtCalled bool - readAtErr error - - // Expected return values. 
- want error - } - - tests := []preadvTest{ - { - name: "fake ReadAt succeeds, 512 bytes requested, 512 byte chunks (function succeeds)", - want: nil, - readAtErr: nil, - mode: p9.PermissionsMask, - readAtCalled: true, - sliceSize: 512, - chunkSize: 512, - }, - { - name: "fake ReadAt succeeds, 512 bytes requested, 200 byte chunks (function succeeds)", - want: nil, - readAtErr: nil, - mode: p9.PermissionsMask, - readAtCalled: true, - sliceSize: 512, - chunkSize: 200, - }, - { - name: "fake ReadAt succeeds, 0 bytes requested (function succeeds)", - want: nil, - readAtErr: nil, - mode: p9.PermissionsMask, - readAtCalled: false, - sliceSize: 0, - chunkSize: 100, - }, - { - name: "fake ReadAt returns 0 bytes and EOF (function fails)", - want: io.EOF, - readAtErr: io.EOF, - mode: p9.PermissionsMask, - readAtCalled: true, - sliceSize: 512, - chunkSize: 512, - }, - } - - ctx := contexttest.Context(t) - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, test.mode, 1024) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - // Set up the read buffer. - dst := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) - - // This file will be read from. - openFile := &readAtFileFake{ - Err: test.readAtErr, - FileLength: test.sliceSize, - ChunkSize: test.chunkSize, - } - rootFile.WalkGetAttrMock.File = openFile - rootFile.WalkGetAttrMock.Attr.Mode = test.mode - rootFile.WalkGetAttrMock.Valid.Mode = true - - f := NewFile( - ctx, - fs.NewDirent(rootInode, ""), - "", - fs.FileFlags{Read: true}, - rootInode.InodeOperations.(*inodeOperations), - &handles{File: contextFile{file: openFile}}, - ) - - // Call function. - _, err = f.Preadv(ctx, dst, 0) - - // Check return value. - if err != test.want { - t.Errorf("Preadv got %v, want %v", err, test.want) - } - - // Check mock parameters. 
- if test.readAtCalled != openFile.Called { - t.Errorf("ReadAt called: %v, but expected opposite", openFile.Called) - } - }) - } -} - -func TestReadlink(t *testing.T) { - // Test parameters. - type readlinkTest struct { - // Name of the test. - name string - - // Mock parameters - mode p9.FileMode - - // Whether or not we expect ReadAt to be called and what error - // it shall return. - readlinkCalled bool - readlinkErr error - - // Expected return values. - want error - } - - tests := []readlinkTest{ - { - name: "file is not symlink (function fails)", - want: syscall.ENOLINK, - mode: p9.PermissionsMask, - readlinkCalled: false, - readlinkErr: nil, - }, - { - name: "mock Readlink succeeds (function succeeds)", - want: nil, - mode: p9.PermissionsMask | p9.ModeSymlink, - readlinkCalled: true, - readlinkErr: nil, - }, - { - name: "mock Readlink fails (function fails)", - want: syscall.ENOENT, - mode: p9.PermissionsMask | p9.ModeSymlink, - readlinkCalled: true, - readlinkErr: syscall.ENOENT, - }, - } - - ctx := contexttest.Context(t) - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - rootFile, rootInode, err := root(ctx, cacheNone, test.mode, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - openFile := goodMockFile(test.mode, 0) - rootFile.WalkMock.File = openFile - rootFile.ReadlinkMock.Err = test.readlinkErr - - // Call function. - _, err = rootInode.Readlink(ctx) - - // Check return value. - if err != test.want { - t.Errorf("Readlink got %v, want %v", err, test.want) - } - - // Check mock parameters. - if test.readlinkCalled && !rootFile.ReadlinkMock.Called { - t.Errorf("Readlink not called") - } - }) - } -} - -// This file is write from in TestPwritev. -type writeAtFileFake struct { - p9test.FileMock - - // Parameters for faking WriteAt. 
- Err error - ChunkSize int - Called bool - LengthWritten int -} - -func (r *writeAtFileFake) WriteAt(p []byte, offset uint64) (int, error) { - r.Called = true - log.Warningf("WriteAt fake: length written so far = %d, len(p) = %d, offset = %d", r.LengthWritten, len(p), offset) - if int(offset) != r.LengthWritten { - return 0, fmt.Errorf("offset got %d; want %d", offset, r.LengthWritten) - } - - if r.Err != nil { - return 0, r.Err - } - - // Write at most ChunkSize. - toBeWritten := len(p) - if toBeWritten > r.ChunkSize { - toBeWritten = r.ChunkSize - } - r.LengthWritten += toBeWritten - return toBeWritten, nil -} - -func TestPwritev(t *testing.T) { - // Test parameters. - type pwritevTest struct { - // Name of the test. - name string - - // Mock parameters - mode p9.FileMode - - allowWrite bool - - // Buffer to write into. - buffer [512]byte - sliceSize int - chunkSize int - - // Whether or not we expect writeAt to be called. - writeAtCalled bool - writeAtErr error - - // Expected return values. 
- want error - } - - tests := []pwritevTest{ - { - name: "fake writeAt succeeds, one chunk (function succeeds)", - want: nil, - writeAtErr: nil, - mode: p9.PermissionsMask, - allowWrite: true, - writeAtCalled: true, - sliceSize: 512, - chunkSize: 512, - }, - { - name: "fake writeAt fails, short write (function fails)", - want: io.ErrShortWrite, - writeAtErr: nil, - mode: p9.PermissionsMask, - allowWrite: true, - writeAtCalled: true, - sliceSize: 512, - chunkSize: 200, - }, - { - name: "fake writeAt succeeds, len 0 (function succeeds)", - want: nil, - writeAtErr: nil, - mode: p9.PermissionsMask, - allowWrite: true, - writeAtCalled: false, - sliceSize: 0, - chunkSize: 0, - }, - { - name: "writeAt can still write despite file permissions read only (function succeeds)", - want: nil, - writeAtErr: nil, - mode: p9.PermissionsMask, - allowWrite: false, - writeAtCalled: true, - sliceSize: 512, - chunkSize: 512, - }, - } - - ctx := contexttest.Context(t) - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - // Set up mock. - _, rootInode, err := root(ctx, cacheNone, test.mode, 0) - if err != nil { - t.Fatalf("error creating root: %v", err) - } - - src := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) - - // This is the file that will be used for writing. - openFile := &writeAtFileFake{ - Err: test.writeAtErr, - ChunkSize: test.chunkSize, - } - - f := NewFile( - ctx, - fs.NewDirent(rootInode, ""), - "", - fs.FileFlags{Write: true}, - rootInode.InodeOperations.(*inodeOperations), - &handles{File: contextFile{file: openFile}}, - ) - - // Call function. - _, err = f.Pwritev(ctx, src, 0) - - // Check return value. - if err != test.want { - t.Errorf("Pwritev got %v, want %v", err, test.want) - } - - // Check mock parameters. 
- if test.writeAtCalled != openFile.Called { - t.Errorf("WriteAt called: %v, but expected opposite", openFile.Called) - return + t.Errorf("Lookup with cachePolicy=%s got new dirent and error %v, wanted old dirent and nil error", test.cachePolicy, err) } - if openFile.Called && test.writeAtErr != nil && openFile.LengthWritten != test.sliceSize { - t.Errorf("wrote %d bytes, expected %d bytes written", openFile.LengthWritten, test.sliceSize) + if err == nil { + newDirent.DecRef() // See above. } }) } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 7552216f3..f76a83cd9 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -91,10 +91,6 @@ func (e *endpointMaps) get(key device.MultiDeviceKey) transport.BoundEndpoint { type session struct { refs.AtomicRefCount - // conn is a unet.Socket that wraps the readFD/writeFD mount option, - // see fs/gofer/fs.go. - conn *unet.Socket `state:"nosave"` - // msize is the value of the msize mount option, see fs/gofer/fs.go. msize uint32 `state:"wait"` @@ -142,7 +138,7 @@ type session struct { // Destroy tears down the session. func (s *session) Destroy() { - s.conn.Close() + s.client.Close() } // Revalidate implements MountSource.Revalidate. @@ -235,7 +231,6 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF // Construct the session. s := &session{ connID: dev, - conn: conn, msize: o.msize, version: o.version, cachePolicy: o.policy, @@ -252,7 +247,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF m := fs.NewMountSource(s, filesystem, superBlockFlags) // Send the Tversion request. - s.client, err = p9.NewClient(s.conn, s.msize, s.version) + s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { // Drop our reference on the session, it needs to be torn down. 
s.DecRef() diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index f657135fc..d9fd7a221 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -84,13 +84,13 @@ func (s *session) afterLoad() { } // Manually restore the connection. - s.conn, err = unet.NewSocket(opts.fd) + conn, err := unet.NewSocket(opts.fd) if err != nil { panic(fmt.Sprintf("failed to create Socket for FD %d: %v", opts.fd, err)) } // Manually restore the client. - s.client, err = p9.NewClient(s.conn, s.msize, s.version) + s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { panic(fmt.Sprintf("failed to connect client to server: %v", err)) } diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD index 34582f275..ff7dacf07 100644 --- a/pkg/sentry/fs/proc/device/BUILD +++ b/pkg/sentry/fs/proc/device/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "device", srcs = ["device.go"], diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD index f362d15c8..33197cf14 100644 --- a/pkg/sentry/hostcpu/BUILD +++ b/pkg/sentry/hostcpu/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "hostcpu", srcs = [ diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD index fe6fa2260..3f8fa206c 100644 --- a/pkg/sentry/kernel/kdefs/BUILD +++ b/pkg/sentry/kernel/kdefs/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "kdefs", srcs = ["kdefs.go"], diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index 66899910c..e903badd3 100644 --- 
a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "memevent", srcs = ["memory_events.go"], diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index 125792f39..52e226a39 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sched", srcs = [ diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 0beb4561b..83cad186a 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,6 +1,7 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") + package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") load("//tools/go_stateify:defs.bzl", "go_library") go_embed_data( diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD index 341b30b98..88738d65d 100644 --- a/pkg/sentry/memutil/BUILD +++ b/pkg/sentry/memutil/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "memutil", srcs = [ diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index 35121321a..dbafa3204 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "interrupt", srcs = [ diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD 
index 4ef9e20d7..1b71e629f 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index e779e3893..1dffe94a4 100644 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "testutil", testonly = 1, diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/sentry/platform/procid/BUILD index ba68d48f4..d3398d1e8 100644 --- a/pkg/sentry/platform/procid/BUILD +++ b/pkg/sentry/platform/procid/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "procid", srcs = [ diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index debae058b..2eb354ad4 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ptrace", srcs = [ diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 2485eb2eb..c35d49f2d 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", 
"go_template", "go_template_instance") go_template( diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 3bce56985..b76d7974e 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -1,6 +1,7 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_binary") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 7a86e2234..de1b920af 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_template( diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 7dcf6e561..614d9e21e 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "safecopy", srcs = [ diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD index e96509ce1..87a9bff12 100644 --- a/pkg/sentry/safemem/BUILD +++ b/pkg/sentry/safemem/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "safemem", srcs = [ diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index 751176747..41313d334 100644 --- a/pkg/sentry/sighandling/BUILD +++ 
b/pkg/sentry/sighandling/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sighandling", srcs = [ diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 38fa54283..06e121946 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "rpcinet", srcs = [ diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD index c51ca14b1..a16977f29 100644 --- a/pkg/sentry/socket/rpcinet/conn/BUILD +++ b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # BSD - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # BSD + go_library( name = "conn", srcs = ["conn.go"], diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD index 2ae902b3f..2bab01774 100644 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # BSD - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # BSD + go_library( name = "notifier", srcs = ["notifier.go"], diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index a57a8298e..f1f6fdb7d 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "state", srcs = [ diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 674554081..52c7f325c 100644 --- a/pkg/sentry/strace/BUILD +++ 
b/pkg/sentry/strace/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "strace", srcs = [ diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 2a9f0915e..35192ff49 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "syscalls", srcs = [ diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 9452787fb..5dadb8a2d 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index 63da5e81f..42e24ace5 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +package(licenses = ["notice"]) # Apache 2.0 + proto_library( name = "unimplemented_syscall_proto", srcs = ["unimplemented_syscall.proto"], diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index 68b82af47..0929497c3 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "uniqueid", srcs = ["context.go"], diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD index 
13bc33eb1..b2c687b20 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "watchdog", srcs = ["watchdog.go"], diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD index 05e4ca540..338fd9336 100644 --- a/pkg/sleep/BUILD +++ b/pkg/sleep/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sleep", srcs = [ diff --git a/pkg/state/BUILD b/pkg/state/BUILD index 6a5b2d4ff..dd0f250fa 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -1,7 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") + +package(licenses = ["notice"]) # Apache 2.0 + load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD index 6be78dc9b..66c8f3807 100644 --- a/pkg/state/statefile/BUILD +++ b/pkg/state/statefile/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "statefile", srcs = ["statefile.go"], diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD index 4fa959df0..9cb7f66fe 100644 --- a/pkg/sync/atomicptrtest/BUILD +++ b/pkg/sync/atomicptrtest/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/sync/seqatomictest/BUILD 
b/pkg/sync/seqatomictest/BUILD index 07b4f85ab..54f8e59b1 100644 --- a/pkg/sync/seqatomictest/BUILD +++ b/pkg/sync/seqatomictest/BUILD @@ -1,6 +1,7 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") go_template_instance( diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index 5dd2e90bb..30ae20772 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "syserr", srcs = [ diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD index e050c2043..d4c6da97a 100644 --- a/pkg/syserror/BUILD +++ b/pkg/syserror/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "syserror", srcs = ["syserror.go"], diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index bf618831a..723ad668f 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "gonet", srcs = ["gonet.go"], diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD index e8a524918..a1de808b9 100644 --- a/pkg/tcpip/checker/BUILD +++ b/pkg/tcpip/checker/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "checker", testonly = 1, diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index 9a6f49c45..25f6c1457 100644 --- 
a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "channel", srcs = ["channel.go"], diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 6e75e9f47..94391433c 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "fdbased", srcs = ["endpoint.go"], diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD index cc4247ffd..a46ba7f11 100644 --- a/pkg/tcpip/link/loopback/BUILD +++ b/pkg/tcpip/link/loopback/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "loopback", srcs = ["loopback.go"], diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD index 10b35a37e..829ea7c42 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "rawfile", srcs = [ diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index 5390257c5..d7f1e66ef 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sharedmem", srcs = [ diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index ff798ae6f..12e813509 100644 --- 
a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "pipe", srcs = [ diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD index c4a7879c4..661037bb2 100644 --- a/pkg/tcpip/link/sharedmem/queue/BUILD +++ b/pkg/tcpip/link/sharedmem/queue/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "queue", srcs = [ diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index 7155aea66..52e237c25 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sniffer", srcs = [ diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index a8bb03661..5ec01cec9 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "tun", srcs = ["tun_unsafe.go"], diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index 7582df32e..ba495c437 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "waitable", srcs = [ diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index 25a3c98b6..a2a07f533 100644 --- a/pkg/tcpip/network/BUILD +++ 
b/pkg/tcpip/network/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_test( name = "ip_test", size = "small", diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index 44f2b66e5..f6fb7daf7 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "arp", srcs = ["arp.go"], diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD index 1c22c52fc..401dce646 100644 --- a/pkg/tcpip/network/hash/BUILD +++ b/pkg/tcpip/network/hash/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "hash", srcs = ["hash.go"], diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 90d65d531..e72317e9f 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ipv4", srcs = [ diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index 2f19a659e..808c37df3 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ipv6", srcs = [ diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index 3c3374275..c69fc0744 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - 
load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "ports", srcs = ["ports.go"], diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD index 21d32245d..32baf2115 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/BUILD +++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "tun_tcp_connect", srcs = ["main.go"], diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD index d7402aaa2..760445843 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/BUILD +++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "tun_tcp_echo", srcs = ["main.go"], diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD index 7a95594ef..814e5c1ea 100644 --- a/pkg/tcpip/transport/tcp/testing/context/BUILD +++ b/pkg/tcpip/transport/tcp/testing/context/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "context", testonly = 1, diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD index 46da3e6f1..ac1a94d4d 100644 --- a/pkg/tcpip/transport/tcpconntrack/BUILD +++ b/pkg/tcpip/transport/tcpconntrack/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "tcpconntrack", srcs = ["tcp_conntrack.go"], diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index d18338fff..c20df7005 
100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "tmutex", srcs = ["tmutex.go"], diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index acdfd7cb6..f90e43c89 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "unet", srcs = [ diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD index d32c57d1a..21008cf6c 100644 --- a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "urpc", srcs = ["urpc.go"], diff --git a/pkg/waiter/fdnotifier/BUILD b/pkg/waiter/fdnotifier/BUILD index 4e582755d..af6baa303 100644 --- a/pkg/waiter/fdnotifier/BUILD +++ b/pkg/waiter/fdnotifier/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("//tools/go_stateify:defs.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "fdnotifier", srcs = [ diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 04cc0e854..07afce807 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "boot", srcs = [ diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 48f2c8024..004222242 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = 
"filter", srcs = [ diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index 10a8e5feb..bf2f373a9 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "cgroup", srcs = ["cgroup.go"], diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 7040eb4ec..394bb0e1f 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "cmd", srcs = [ diff --git a/runsc/console/BUILD b/runsc/console/BUILD index fa1a7d430..ff4ccff69 100644 --- a/runsc/console/BUILD +++ b/runsc/console/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "console", srcs = ["console.go"], diff --git a/runsc/container/BUILD b/runsc/container/BUILD index f4c6f1525..bdd93aaba 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "container", srcs = [ diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 24e172f48..f28e4fa77 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "fsgofer", srcs = [ diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index 40f4f2205..c7848d10c 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -1,7 +1,7 @@ 
-package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "filter", srcs = [ diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index e03bb7752..fd913831a 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -26,7 +26,6 @@ import ( "math" "os" "path" - "strings" "sync" "syscall" @@ -181,18 +180,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { } } -func isNameValid(name string) bool { - if name == "" || name == "." || name == ".." { - log.Warningf("Invalid name: %s", name) - return false - } - if strings.IndexByte(name, '/') >= 0 { - log.Warningf("Invalid name: %s", name) - return false - } - return true -} - // localFile implements p9.File wrapping a local file. The underlying file // is opened during Walk() and stored in 'controlFile' to be used with other // operations. The mode in which the file is opened varies depending on the @@ -228,11 +215,7 @@ type localFile struct { // attachPoint is the attachPoint that serves this localFile. attachPoint *attachPoint - // mu protects 'hostPath' when file is renamed. - mu sync.Mutex - - // TODO: hostPath is not safe to use as path needs to be walked - // everytime (and can change underneath us). Remove all usages. + // hostPath will be safely updated by the Renamed hook. hostPath string // controlFile is opened when localFile is created and it's never nil. @@ -246,6 +229,7 @@ type localFile struct { // if localFile isn't opened. mode p9.OpenFlags + // ft is the fileType for this file. ft fileType // readDirMu protects against concurrent Readdir calls. 
@@ -296,10 +280,7 @@ func openAnyFile(parent *localFile, name string) (*os.File, string, error) { return nil, "", extractErrno(err) } - parent.mu.Lock() - defer parent.mu.Unlock() newPath := path.Join(parent.hostPath, name) - return os.NewFile(uintptr(fd), newPath), newPath, nil } @@ -382,13 +363,10 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name()) var err error - l.mu.Lock() newFile, err = os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0) if err != nil { - l.mu.Unlock() return nil, p9.QID{}, 0, extractErrno(err) } - l.mu.Unlock() } stat, err := stat(int(newFile.Fd())) @@ -418,9 +396,6 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid } return nil, nil, p9.QID{}, 0, syscall.EBADF } - if !isNameValid(name) { - return nil, nil, p9.QID{}, 0, syscall.EINVAL - } // Use a single file for both 'controlFile' and 'openedFile'. Mode must include read for control // and whichever else was requested by caller. 
Note that resulting file might have a wider mode @@ -452,9 +427,6 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid return nil, nil, p9.QID{}, 0, extractErrno(err) } - l.mu.Lock() - defer l.mu.Unlock() - cPath := path.Join(l.hostPath, name) f := os.NewFile(uintptr(fd), cPath) c := &localFile{ @@ -477,10 +449,6 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) return p9.QID{}, syscall.EBADF } - if !isNameValid(name) { - return p9.QID{}, syscall.EINVAL - } - if err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions())); err != nil { return p9.QID{}, extractErrno(err) } @@ -517,9 +485,6 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { return nil, nil, extractErrno(err) } - l.mu.Lock() - defer l.mu.Unlock() - c := &localFile{ attachPoint: l.attachPoint, hostPath: l.hostPath, @@ -532,10 +497,6 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { var qids []p9.QID last := l for _, name := range names { - if !isNameValid(name) { - return nil, nil, syscall.EINVAL - } - f, path, err := openAnyFile(last, name) if err != nil { return nil, nil, extractErrno(err) @@ -761,15 +722,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { return err } -// Remove implements p9.File. -// -// This is deprecated in favor of UnlinkAt. -func (*localFile) Remove() error { - return syscall.ENOSYS +// Rename implements p9.File; this should never be called. +func (l *localFile) Rename(p9.File, string) error { + panic("rename called directly") } -// Rename implements p9.File. -func (l *localFile) Rename(directory p9.File, name string) error { +// RenameAt implements p9.File.RenameAt. +// +// TODO: change to renameat(2). 
+func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error { conf := l.attachPoint.conf if conf.ROMount { if conf.PanicOnWrite { @@ -777,34 +738,16 @@ func (l *localFile) Rename(directory p9.File, name string) error { } return syscall.EBADF } - if !isNameValid(name) { - return syscall.EINVAL - } - - l.mu.Lock() - defer l.mu.Unlock() - // TODO: change to renameat(2) - parent := directory.(*localFile) - newPath := path.Join(parent.hostPath, name) - if err := syscall.Rename(l.hostPath, newPath); err != nil { + newParent := directory.(*localFile) + oldPath := path.Join(l.hostPath, oldName) + newPath := path.Join(newParent.hostPath, newName) + if err := syscall.Rename(oldPath, newPath); err != nil { return extractErrno(err) } - - // Update path on success. - // TODO: this doesn't cover cases where any of the - // parents have been renamed. - l.hostPath = newPath return nil } -// RenameAt implements p9.File.RenameAt. -// -// Code still uses [deprecated] Rename(). -func (*localFile) RenameAt(_ string, _ p9.File, _ string) error { - return syscall.ENOSYS -} - // ReadAt implements p9.File. func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { @@ -848,9 +791,6 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. 
} return p9.QID{}, syscall.EBADF } - if !isNameValid(newName) { - return p9.QID{}, syscall.EINVAL - } if err := unix.Symlinkat(target, l.controlFD(), newName); err != nil { return p9.QID{}, extractErrno(err) @@ -882,9 +822,6 @@ func (l *localFile) Link(target p9.File, newName string) error { } return syscall.EBADF } - if !isNameValid(newName) { - return syscall.EINVAL - } targetFile := target.(*localFile) if err := unix.Linkat(targetFile.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH); err != nil { @@ -909,9 +846,7 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error { } return syscall.EBADF } - if !isNameValid(name) { - return syscall.EINVAL - } + if err := unix.Unlinkat(l.controlFD(), name, int(flags)); err != nil { return extractErrno(err) } @@ -1000,6 +935,11 @@ func (l *localFile) Close() error { return err } +// Renamed implements p9.Renamed. +func (l *localFile) Renamed(newDir p9.File, newName string) { + l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) +} + // extractErrno tries to determine the errno. 
func extractErrno(err error) syscall.Errno { if err == nil { diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 48860f952..34033245b 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -415,22 +415,22 @@ func TestLink(t *testing.T) { func TestROMountChecks(t *testing.T) { runCustom(t, allTypes, roConfs, func(t *testing.T, s state) { - if _, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if _, err := s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + if _, err := s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if err := s.file.Rename(s.file, ".."); err != syscall.EBADF { + if err := s.file.RenameAt("some_file", s.file, "other_file"); err != syscall.EBADF { t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if _, err := s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + if _, err := s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if err := s.file.UnlinkAt("..", 0); err != syscall.EBADF { + if err := s.file.UnlinkAt("some_file", 0); err != syscall.EBADF { t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err) } - if err := s.file.Link(s.file, ".."); err != syscall.EBADF { + if err := s.file.Link(s.file, "some_link"); err != syscall.EBADF { t.Errorf("%v: Link() 
should have failed, got: %v, expected: syscall.EBADF", s, err) } @@ -445,12 +445,12 @@ func TestROMountChecks(t *testing.T) { func TestROMountPanics(t *testing.T) { conf := Config{ROMount: true, PanicOnWrite: true} runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) { - assertPanic(t, func() { s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - assertPanic(t, func() { s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - assertPanic(t, func() { s.file.Rename(s.file, "..") }) - assertPanic(t, func() { s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - assertPanic(t, func() { s.file.UnlinkAt("..", 0) }) - assertPanic(t, func() { s.file.Link(s.file, "..") }) + assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") }) + assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) }) + assertPanic(t, func() { s.file.Link(s.file, "some_link") }) valid := p9.SetAttrMask{Size: true} attr := p9.SetAttr{Size: 0} @@ -458,60 +458,6 @@ func TestROMountPanics(t *testing.T) { }) } -func TestInvalidName(t *testing.T) { - runCustom(t, []fileType{regular}, rwConfs, func(t *testing.T, s state) { - if _, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { - t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if _, _, err := s.file.Walk([]string{".."}); err != syscall.EINVAL { - t.Errorf("%v: Walk() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if _, err := s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), 
p9.GID(os.Getgid())); err != syscall.EINVAL { - t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if err := s.file.Rename(s.file, ".."); err != syscall.EINVAL { - t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if _, err := s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { - t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if err := s.file.UnlinkAt("..", 0); err != syscall.EINVAL { - t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - if err := s.file.Link(s.file, ".."); err != syscall.EINVAL { - t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EINVAL", s, err) - } - }) -} - -func TestIsNameValid(t *testing.T) { - valid := []string{ - "name", - "123", - "!@#$%^&*()", - ".name", - "..name", - "...", - } - for _, s := range valid { - if got := isNameValid(s); !got { - t.Errorf("isNameValid(%s) failed, got: %v, expected: true", s, got) - } - } - invalid := []string{ - ".", - "..", - "name/name", - "/name", - "name/", - } - for _, s := range invalid { - if got := isNameValid(s); got { - t.Errorf("isNameValid(%s) failed, got: %v, expected: false", s, got) - } - } -} - func TestWalkNotFound(t *testing.T) { runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT { diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index eb9c4cd76..d6043bcf7 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "sandbox", srcs = [ diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index e73b2293f..a1e5da3f5 100644 --- a/runsc/specutils/BUILD +++ 
b/runsc/specutils/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "specutils", srcs = [ diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index c41161d50..22b3ebd2a 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_test( name = "image_test", size = "large", diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 726ebf49e..e7204dc66 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_test( name = "integration_test", size = "large", diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD index c69249b52..c2567ef23 100644 --- a/runsc/test/root/BUILD +++ b/runsc/test/root/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "root", srcs = ["root.go"], diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index da2535bfa..128bd80fb 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "testutil", srcs = [ diff --git a/runsc/tools/dockercfg/BUILD b/runsc/tools/dockercfg/BUILD index 5abb0c90a..a80b3abab 100644 --- a/runsc/tools/dockercfg/BUILD +++ b/runsc/tools/dockercfg/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - 
load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "dockercfg", srcs = ["dockercfg.go"], diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD index 1afc58625..22c2e62c3 100644 --- a/tools/go_generics/BUILD +++ b/tools/go_generics/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "go_generics", srcs = [ diff --git a/tools/go_generics/globals/BUILD b/tools/go_generics/globals/BUILD index a238becab..c26ac56d2 100644 --- a/tools/go_generics/globals/BUILD +++ b/tools/go_generics/globals/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_library") +package(licenses = ["notice"]) # Apache 2.0 + go_library( name = "globals", srcs = [ diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD index 2d9a6fa9d..23b2d656d 100644 --- a/tools/go_generics/rules_tests/BUILD +++ b/tools/go_generics/rules_tests/BUILD @@ -1,6 +1,7 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_test") + package(licenses = ["notice"]) # Apache 2.0 -load("@io_bazel_rules_go//go:def.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") go_template_instance( diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD index edbeb4e2d..68d37f5d7 100644 --- a/tools/go_stateify/BUILD +++ b/tools/go_stateify/BUILD @@ -1,7 +1,7 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_binary") +package(licenses = ["notice"]) # Apache 2.0 + go_binary( name = "stateify", srcs = ["main.go"], -- cgit v1.2.3 From 46603b569c3ab20f45cf1b651d1fd3d2dda33243 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Tue, 23 Oct 2018 14:17:47 -0700 Subject: Fix panic on creation of zero-len shm segments. 
Attempting to create a zero-len shm segment causes a panic since we try to allocate a zero-len filemem region. The existing code had a guard to disallow this, but the check didn't encode the fact that requesting a private segment implies a segment creation regardless of whether IPC_CREAT is explicitly specified. PiperOrigin-RevId: 218405743 Change-Id: I30aef1232b2125ebba50333a73352c2f907977da --- pkg/sentry/kernel/shm/shm.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 8d0d14e45..2feffe612 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -101,9 +101,12 @@ func (r *Registry) findByKey(key int32) *Shm { // FindOrCreate looks up or creates a segment in the registry. It's functionally // analogous to open(2). func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { - if create && (size < linux.SHMMIN || size > linux.SHMMAX) { + if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or // greater than SHMMAX." - man shmget(2) + // + // Note that 'private' always implies the creation of a new segment + // whether IPC_CREAT is specified or not. return nil, syserror.EINVAL } -- cgit v1.2.3 From 4a1a2dead9b382b4315eddbd06ddb1c83f1ccf5e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 24 Oct 2018 10:41:34 -0700 Subject: Run ptrace stubs in their own session and process group. Pseudoterminal job control signals are meant to be received and handled by the sandbox process, but if the ptrace stubs are running in the same process group, they will receive the signals as well and inject then into the sentry kernel. This can result in duplicate signals being delivered (often to the wrong process), or a sentry panic if the ptrace stub is inactive. 
This CL makes the ptrace stub run in a new session. PiperOrigin-RevId: 218536851 Change-Id: Ie593c5687439bbfbf690ada3b2197ea71ed60a0e --- pkg/sentry/platform/ptrace/subprocess_linux.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 7523487e7..73ddc559b 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -222,14 +222,21 @@ func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) { return t, nil } + // Move the stub to a new session (and thus a new process group). This + // prevents the stub from getting PTY job control signals intended only + // for the sentry process. We must call this before restoring signal + // mask. + if _, _, errno := syscall.RawSyscall(syscall.SYS_SETSID, 0, 0, 0); errno != 0 { + syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) + } + // afterForkInChild resets all signals to their default dispositions // and restores the signal mask to its pre-fork state. afterForkInChild() // Explicitly unmask all signals to ensure that the tracer can see // them. - errno = unmaskAllSignals() - if errno != 0 { + if errno := unmaskAllSignals(); errno != 0 { syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) } -- cgit v1.2.3 From 425dccdd7ed035a671aaf8da1982f7b029365d66 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 24 Oct 2018 11:04:11 -0700 Subject: Convert Unix transport to syserr Previously this code used the tcpip error space. Since it is no longer part of netstack, it can use the sentry's error space (except for a few cases where there is still some shared code. This reduces the number of error space conversions required for hot Unix socket operations. 
PiperOrigin-RevId: 218541611 Change-Id: I3d13047006a8245b5dfda73364d37b8a453784bb --- pkg/sentry/fs/gofer/BUILD | 2 +- pkg/sentry/fs/gofer/socket.go | 32 +++++++------- pkg/sentry/fs/host/BUILD | 1 - pkg/sentry/fs/host/socket.go | 44 ++++++++----------- pkg/sentry/fs/host/socket_test.go | 8 ++-- pkg/sentry/socket/netlink/socket.go | 22 +++++----- pkg/sentry/socket/unix/io.go | 5 +-- pkg/sentry/socket/unix/transport/BUILD | 1 + pkg/sentry/socket/unix/transport/connectioned.go | 43 +++++++++--------- pkg/sentry/socket/unix/transport/connectionless.go | 31 ++++++------- pkg/sentry/socket/unix/transport/queue.go | 26 +++++------ pkg/sentry/socket/unix/transport/unix.go | 51 +++++++++++----------- pkg/sentry/socket/unix/unix.go | 32 +++++++------- 13 files changed, 146 insertions(+), 152 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index c9e531e40..35ffadd13 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -43,8 +43,8 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", + "//pkg/syserr", "//pkg/syserror", - "//pkg/tcpip", "//pkg/unet", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 76ce58810..ce6d3d5c3 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -20,7 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -74,10 +74,10 @@ func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) { } // BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. 
-func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *tcpip.Error { +func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { cf, ok := unixSockToP9(ce.Type()) if !ok { - return tcpip.ErrConnectionRefused + return syserr.ErrConnectionRefused } // No lock ordering required as only the ConnectingEndpoint has a mutex. @@ -86,24 +86,24 @@ func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnC // Check connecting state. if ce.Connected() { ce.Unlock() - return tcpip.ErrAlreadyConnected + return syserr.ErrAlreadyConnected } if ce.Listening() { ce.Unlock() - return tcpip.ErrInvalidEndpointState + return syserr.ErrInvalidEndpointState } hostFile, err := e.file.Connect(cf) if err != nil { ce.Unlock() - return tcpip.ErrConnectionRefused + return syserr.ErrConnectionRefused } - c, terr := host.NewConnectedEndpoint(hostFile, ce.WaiterQueue(), e.path) - if terr != nil { + c, serr := host.NewConnectedEndpoint(hostFile, ce.WaiterQueue(), e.path) + if serr != nil { ce.Unlock() - log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, cf, terr) - return terr + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, cf, serr) + return serr } returnConnect(c, c) @@ -115,16 +115,16 @@ func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnC // UnidirectionalConnect implements // transport.BoundEndpoint.UnidirectionalConnect. 
-func (e *endpoint) UnidirectionalConnect() (transport.ConnectedEndpoint, *tcpip.Error) { +func (e *endpoint) UnidirectionalConnect() (transport.ConnectedEndpoint, *syserr.Error) { hostFile, err := e.file.Connect(p9.DgramSocket) if err != nil { - return nil, tcpip.ErrConnectionRefused + return nil, syserr.ErrConnectionRefused } - c, terr := host.NewConnectedEndpoint(hostFile, &waiter.Queue{}, e.path) - if terr != nil { - log.Warningf("Gofer returned invalid host socket for UnidirectionalConnect; file %+v: %v", e.file, terr) - return nil, terr + c, serr := host.NewConnectedEndpoint(hostFile, &waiter.Queue{}, e.path) + if serr != nil { + log.Warningf("Gofer returned invalid host socket for UnidirectionalConnect; file %+v: %v", e.file, serr) + return nil, serr } c.Init() diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index d1eb9bd64..89d7b2fe7 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -49,7 +49,6 @@ go_library( "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", - "//pkg/tcpip/link/rawfile", "//pkg/unet", "//pkg/waiter", "//pkg/waiter/fdnotifier", diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index af53bf533..506be3056 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -30,7 +30,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/pkg/waiter" "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" @@ -83,33 +82,33 @@ type ConnectedEndpoint struct { // init performs initialization required for creating new ConnectedEndpoints and // for restoring them. 
-func (c *ConnectedEndpoint) init() *tcpip.Error { +func (c *ConnectedEndpoint) init() *syserr.Error { family, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_DOMAIN) if err != nil { - return translateError(err) + return syserr.FromError(err) } if family != syscall.AF_UNIX { // We only allow Unix sockets. - return tcpip.ErrInvalidEndpointState + return syserr.ErrInvalidEndpointState } stype, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_TYPE) if err != nil { - return translateError(err) + return syserr.FromError(err) } if err := syscall.SetNonblock(c.file.FD(), true); err != nil { - return translateError(err) + return syserr.FromError(err) } sndbuf, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) if err != nil { - return translateError(err) + return syserr.FromError(err) } if sndbuf > maxSendBufferSize { log.Warningf("Socket send buffer too large: %d", sndbuf) - return tcpip.ErrInvalidEndpointState + return syserr.ErrInvalidEndpointState } c.stype = transport.SockType(stype) @@ -124,7 +123,7 @@ func (c *ConnectedEndpoint) init() *tcpip.Error { // The caller is responsible for calling Init(). Additionaly, Release needs to // be called twice because ConnectedEndpoint is both a transport.Receiver and // transport.ConnectedEndpoint. -func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *tcpip.Error) { +func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (*ConnectedEndpoint, *syserr.Error) { e := ConnectedEndpoint{ path: path, queue: queue, @@ -160,7 +159,7 @@ func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.F e, err := NewConnectedEndpoint(f2, &q, "" /* path */) if err != nil { f2.Release() - return nil, syserr.TranslateNetstackError(err).ToError() + return nil, err.ToError() } // Take ownship of the FD. 
@@ -194,7 +193,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) } else { f.Release() } - return nil, syserr.TranslateNetstackError(err).ToError() + return nil, err.ToError() } e.srfd = srfd @@ -206,15 +205,15 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { +func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() if c.writeClosed { - return 0, false, tcpip.ErrClosedForSend + return 0, false, syserr.ErrClosedForSend } if !controlMessages.Empty() { - return 0, false, tcpip.ErrInvalidEndpointState + return 0, false, syserr.ErrInvalidEndpointState } // Since stream sockets don't preserve message boundaries, we can write @@ -236,7 +235,7 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.Contro // There is no need for the callee to call SendNotify because fdWriteVec // uses the host's sendmsg(2) and the host kernel's queue. - return n, false, translateError(err) + return n, false, syserr.FromError(err) } // SendNotify implements transport.ConnectedEndpoint.SendNotify. @@ -283,11 +282,11 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. 
-func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { +func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() if c.readClosed { - return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive + return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive } var cm unet.ControlMessage @@ -305,7 +304,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p err = nil } if err != nil { - return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, translateError(err) + return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, syserr.FromError(err) } // There is no need for the callee to call RecvNotify because fdReadVec uses @@ -323,7 +322,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p fds, err := cm.ExtractFDs() if err != nil { - return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, translateError(err) + return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, syserr.FromError(err) } if len(fds) == 0 { @@ -389,10 +388,3 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { func (c *ConnectedEndpoint) Release() { c.ref.DecRefWithDestructor(c.close) } - -func translateError(err error) *tcpip.Error { - if err == nil { - return nil - } - return rawfile.TranslateErrno(err.(syscall.Errno)) -} diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index e9a88b124..17bf397ef 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -199,15 +199,15 @@ func TestListen(t *testing.T) { func TestSend(t *testing.T) { e := ConnectedEndpoint{writeClosed: true} - 
if _, _, err := e.Send(nil, transport.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend { - t.Errorf("Got %#v.Send() = %v, want = %v", e, err, tcpip.ErrClosedForSend) + if _, _, err := e.Send(nil, transport.ControlMessages{}, tcpip.FullAddress{}); err != syserr.ErrClosedForSend { + t.Errorf("Got %#v.Send() = %v, want = %v", e, err, syserr.ErrClosedForSend) } } func TestRecv(t *testing.T) { e := ConnectedEndpoint{readClosed: true} - if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != tcpip.ErrClosedForReceive { - t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, tcpip.ErrClosedForReceive) + if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != syserr.ErrClosedForReceive { + t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, syserr.ErrClosedForReceive) } } diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 4d4130a4c..f901cfa0b 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -109,16 +109,16 @@ func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) { // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. - if terr := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); terr != nil { + if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { ep.Close() - return nil, syserr.TranslateNetstackError(terr) + return nil, err } // Create a connection from which the kernel can write messages. - connection, terr := ep.(transport.BoundEndpoint).UnidirectionalConnect() - if terr != nil { + connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect() + if err != nil { ep.Close() - return nil, syserr.TranslateNetstackError(terr) + return nil, err } return &Socket{ @@ -424,11 +424,11 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error if len(bufs) > 0 { // RecvMsg never receives the address, so we don't need to send // one. 
- _, notify, terr := s.connection.Send(bufs, transport.ControlMessages{}, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(bufs, transport.ControlMessages{}, tcpip.FullAddress{}) // If the buffer is full, we simply drop messages, just like // Linux. - if terr != nil && terr != tcpip.ErrWouldBlock { - return syserr.TranslateNetstackError(terr) + if err != nil && err != syserr.ErrWouldBlock { + return err } if notify { s.connection.SendNotify() @@ -448,9 +448,9 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error PortID: uint32(ms.PortID), }) - _, notify, terr := s.connection.Send([][]byte{m.Finalize()}, transport.ControlMessages{}, tcpip.FullAddress{}) - if terr != nil && terr != tcpip.ErrWouldBlock { - return syserr.TranslateNetstackError(terr) + _, notify, err := s.connection.Send([][]byte{m.Finalize()}, transport.ControlMessages{}, tcpip.FullAddress{}) + if err != nil && err != syserr.ErrWouldBlock { + return err } if notify { s.connection.SendNotify() diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 7d6434696..7d80e4393 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -17,7 +17,6 @@ package unix import ( "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" ) @@ -40,7 +39,7 @@ func (w *EndpointWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) return safemem.FromVecWriterFunc{func(bufs [][]byte) (int64, error) { n, err := w.Endpoint.SendMsg(bufs, w.Control, w.To) if err != nil { - return int64(n), syserr.TranslateNetstackError(err).ToError() + return int64(n), err.ToError() } return int64(n), nil }}.WriteFromBlocks(srcs) @@ -82,7 +81,7 @@ func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { r.Control = c r.MsgSize = ms if err != nil { - return int64(n), 
syserr.TranslateNetstackError(err).ToError() + return int64(n), err.ToError() } return int64(n), nil }}.ReadToBlocks(dsts) diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 5bc01e3c8..5a90837bc 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -30,6 +30,7 @@ go_library( deps = [ "//pkg/ilist", "//pkg/refs", + "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/waiter", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 83b50459f..7cfbbfe8a 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -17,6 +17,7 @@ package transport import ( "sync" + "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -236,14 +237,14 @@ func (e *connectionedEndpoint) Close() { } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. -func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error { +func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { if ce.Type() != e.stype { - return tcpip.ErrConnectionRefused + return syserr.ErrConnectionRefused } // Check if ce is e to avoid a deadlock. if ce, ok := ce.(*connectionedEndpoint); ok && ce == e { - return tcpip.ErrInvalidEndpointState + return syserr.ErrInvalidEndpointState } // Do a dance to safely acquire locks on both endpoints. 
@@ -259,19 +260,19 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur if ce.Connected() { e.Unlock() ce.Unlock() - return tcpip.ErrAlreadyConnected + return syserr.ErrAlreadyConnected } if ce.Listening() { e.Unlock() ce.Unlock() - return tcpip.ErrInvalidEndpointState + return syserr.ErrInvalidEndpointState } // Check bound state. if !e.Listening() { e.Unlock() ce.Unlock() - return tcpip.ErrConnectionRefused + return syserr.ErrConnectionRefused } // Create a newly bound connectionedEndpoint. @@ -327,18 +328,18 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur ne.Close() e.Unlock() ce.Unlock() - return tcpip.ErrConnectionRefused + return syserr.ErrConnectionRefused } } // UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. -func (e *connectionedEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) { - return nil, tcpip.ErrConnectionRefused +func (e *connectionedEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *syserr.Error) { + return nil, syserr.ErrConnectionRefused } // Connect attempts to directly connect to another Endpoint. // Implements Endpoint.Connect. -func (e *connectionedEndpoint) Connect(server BoundEndpoint) *tcpip.Error { +func (e *connectionedEndpoint) Connect(server BoundEndpoint) *syserr.Error { returnConnect := func(r Receiver, ce ConnectedEndpoint) { e.receiver = r e.connected = ce @@ -348,14 +349,14 @@ func (e *connectionedEndpoint) Connect(server BoundEndpoint) *tcpip.Error { } // Listen starts listening on the connection. -func (e *connectionedEndpoint) Listen(backlog int) *tcpip.Error { +func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error { e.Lock() defer e.Unlock() if e.Listening() { // Adjust the size of the channel iff we can fix existing // pending connections into the new one. 
if len(e.acceptedChan) > backlog { - return tcpip.ErrInvalidEndpointState + return syserr.ErrInvalidEndpointState } origChan := e.acceptedChan e.acceptedChan = make(chan *connectionedEndpoint, backlog) @@ -366,7 +367,7 @@ func (e *connectionedEndpoint) Listen(backlog int) *tcpip.Error { return nil } if !e.isBound() { - return tcpip.ErrInvalidEndpointState + return syserr.ErrInvalidEndpointState } // Normal case. @@ -375,12 +376,12 @@ func (e *connectionedEndpoint) Listen(backlog int) *tcpip.Error { } // Accept accepts a new connection. -func (e *connectionedEndpoint) Accept() (Endpoint, *tcpip.Error) { +func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) { e.Lock() defer e.Unlock() if !e.Listening() { - return nil, tcpip.ErrInvalidEndpointState + return nil, syserr.ErrInvalidEndpointState } select { @@ -389,7 +390,7 @@ func (e *connectionedEndpoint) Accept() (Endpoint, *tcpip.Error) { default: // Nothing left. - return nil, tcpip.ErrWouldBlock + return nil, syserr.ErrWouldBlock } } @@ -401,15 +402,15 @@ func (e *connectionedEndpoint) Accept() (Endpoint, *tcpip.Error) { // // Bind will fail only if the socket is connected, bound or the passed address // is invalid (the empty string). -func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { +func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error { e.Lock() defer e.Unlock() if e.isBound() || e.Listening() { - return tcpip.ErrAlreadyBound + return syserr.ErrAlreadyBound } if addr.Addr == "" { // The empty string is not permitted. - return tcpip.ErrBadLocalAddress + return syserr.ErrBadLocalAddress } if commit != nil { if err := commit(); err != nil { @@ -424,11 +425,11 @@ func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. 
-func (e *connectionedEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) { +func (e *connectionedEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) { // Stream sockets do not support specifying the endpoint. Seqpacket // sockets ignore the passed endpoint. if e.stype == SockStream && to != nil { - return 0, tcpip.ErrNotSupported + return 0, syserr.ErrNotSupported } return e.baseEndpoint.SendMsg(data, c, to) } diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 376e4abb2..f432a9717 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -15,6 +15,7 @@ package transport import ( + "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -70,21 +71,21 @@ func (e *connectionlessEndpoint) Close() { } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. -func (e *connectionlessEndpoint) BidirectionalConnect(ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error { - return tcpip.ErrConnectionRefused +func (e *connectionlessEndpoint) BidirectionalConnect(ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { + return syserr.ErrConnectionRefused } // UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. 
-func (e *connectionlessEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) { +func (e *connectionlessEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *syserr.Error) { e.Lock() r := e.receiver e.Unlock() if r == nil { - return nil, tcpip.ErrConnectionRefused + return nil, syserr.ErrConnectionRefused } q := r.(*queueReceiver).readQueue if !q.TryIncRef() { - return nil, tcpip.ErrConnectionRefused + return nil, syserr.ErrConnectionRefused } return &connectedEndpoint{ endpoint: e, @@ -94,14 +95,14 @@ func (e *connectionlessEndpoint) UnidirectionalConnect() (ConnectedEndpoint, *tc // SendMsg writes data and a control message to the specified endpoint. // This method does not block if the data cannot be written. -func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) { +func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) { if to == nil { return e.baseEndpoint.SendMsg(data, c, nil) } connected, err := to.UnidirectionalConnect() if err != nil { - return 0, tcpip.ErrInvalidEndpointState + return 0, syserr.ErrInvalidEndpointState } defer connected.Release() @@ -122,7 +123,7 @@ func (e *connectionlessEndpoint) Type() SockType { } // Connect attempts to connect directly to server. -func (e *connectionlessEndpoint) Connect(server BoundEndpoint) *tcpip.Error { +func (e *connectionlessEndpoint) Connect(server BoundEndpoint) *syserr.Error { connected, err := server.UnidirectionalConnect() if err != nil { return err @@ -136,13 +137,13 @@ func (e *connectionlessEndpoint) Connect(server BoundEndpoint) *tcpip.Error { } // Listen starts listening on the connection. -func (e *connectionlessEndpoint) Listen(int) *tcpip.Error { - return tcpip.ErrNotSupported +func (e *connectionlessEndpoint) Listen(int) *syserr.Error { + return syserr.ErrNotSupported } // Accept accepts a new connection. 
-func (e *connectionlessEndpoint) Accept() (Endpoint, *tcpip.Error) { - return nil, tcpip.ErrNotSupported +func (e *connectionlessEndpoint) Accept() (Endpoint, *syserr.Error) { + return nil, syserr.ErrNotSupported } // Bind binds the connection. @@ -153,15 +154,15 @@ func (e *connectionlessEndpoint) Accept() (Endpoint, *tcpip.Error) { // // Bind will fail only if the socket is connected, bound or the passed address // is invalid (the empty string). -func (e *connectionlessEndpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { +func (e *connectionlessEndpoint) Bind(addr tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error { e.Lock() defer e.Unlock() if e.isBound() { - return tcpip.ErrAlreadyBound + return syserr.ErrAlreadyBound } if addr.Addr == "" { // The empty string is not permitted. - return tcpip.ErrBadLocalAddress + return syserr.ErrBadLocalAddress } if commit != nil { if err := commit(); err != nil { diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index 72aa409ab..05d1bdeef 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -18,7 +18,7 @@ import ( "sync" "gvisor.googlesource.com/gvisor/pkg/refs" - "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -110,12 +110,12 @@ func (q *queue) IsWritable() bool { // // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.EventIn) -func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err *tcpip.Error) { +func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err *syserr.Error) { q.mu.Lock() if q.closed { q.mu.Unlock() - return 0, false, tcpip.ErrClosedForSend + return 0, false, syserr.ErrClosedForSend } free := q.limit - q.used @@ -126,24 +126,24 @@ func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err 
*t if free == 0 { // Message can't fit right now. q.mu.Unlock() - return 0, false, tcpip.ErrWouldBlock + return 0, false, syserr.ErrWouldBlock } e.Truncate(free) l = e.Length() - err = tcpip.ErrWouldBlock + err = syserr.ErrWouldBlock } if l > q.limit { // Message is too big to ever fit. q.mu.Unlock() - return 0, false, tcpip.ErrMessageTooLong + return 0, false, syserr.ErrMessageTooLong } if l > free { // Message can't fit right now. q.mu.Unlock() - return 0, false, tcpip.ErrWouldBlock + return 0, false, syserr.ErrWouldBlock } notify = q.dataList.Front() == nil @@ -159,13 +159,13 @@ func (q *queue) Enqueue(e *message, truncate bool) (l int64, notify bool, err *t // // If notify is true, WriterQueue.Notify must be called: // q.WriterQueue.Notify(waiter.EventOut) -func (q *queue) Dequeue() (e *message, notify bool, err *tcpip.Error) { +func (q *queue) Dequeue() (e *message, notify bool, err *syserr.Error) { q.mu.Lock() if q.dataList.Front() == nil { - err := tcpip.ErrWouldBlock + err := syserr.ErrWouldBlock if q.closed { - err = tcpip.ErrClosedForReceive + err = syserr.ErrClosedForReceive } q.mu.Unlock() @@ -186,14 +186,14 @@ func (q *queue) Dequeue() (e *message, notify bool, err *tcpip.Error) { } // Peek returns the first entry in the data queue, if one exists. 
-func (q *queue) Peek() (*message, *tcpip.Error) { +func (q *queue) Peek() (*message, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() if q.dataList.Front() == nil { - err := tcpip.ErrWouldBlock + err := syserr.ErrWouldBlock if q.closed { - err = tcpip.ErrClosedForReceive + err = syserr.ErrClosedForReceive } return nil, err } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 765cca27a..e98096d7b 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -19,6 +19,7 @@ import ( "sync" "sync/atomic" + "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -129,13 +130,13 @@ type Endpoint interface { // // msgLen is the length of the read message consumed for datagram Endpoints. // msgLen is always the same as recvLen for stream Endpoints. - RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen uintptr, cm ControlMessages, err *tcpip.Error) + RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen uintptr, cm ControlMessages, err *syserr.Error) // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. // // SendMsg does not take ownership of any of its arguments on error. - SendMsg([][]byte, ControlMessages, BoundEndpoint) (uintptr, *tcpip.Error) + SendMsg([][]byte, ControlMessages, BoundEndpoint) (uintptr, *syserr.Error) // Connect connects this endpoint directly to another. // @@ -143,22 +144,22 @@ type Endpoint interface { // endpoint passed in as a parameter. // // The error codes are the same as Connect. 
- Connect(server BoundEndpoint) *tcpip.Error + Connect(server BoundEndpoint) *syserr.Error // Shutdown closes the read and/or write end of the endpoint connection // to its peer. - Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error + Shutdown(flags tcpip.ShutdownFlags) *syserr.Error // Listen puts the endpoint in "listen" mode, which allows it to accept // new connections. - Listen(backlog int) *tcpip.Error + Listen(backlog int) *syserr.Error // Accept returns a new endpoint if a peer has established a connection // to an endpoint previously set to listen mode. This method does not // block if no new connections are available. // // The returned Queue is the wait queue for the newly created endpoint. - Accept() (Endpoint, *tcpip.Error) + Accept() (Endpoint, *syserr.Error) // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. @@ -166,7 +167,7 @@ type Endpoint interface { // An optional commit function will be executed atomically with respect // to binding the endpoint. If this returns an error, the bind will not // occur and the error will be propagated back to the caller. - Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error + Bind(address tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error // Type return the socket type, typically either SockStream, SockDgram // or SockSeqpacket. @@ -218,9 +219,9 @@ type BoundEndpoint interface { // be unconnected and not listening and the BoundEndpoint whose // BidirectionalConnect method is being called must be listening. // - // This method will return tcpip.ErrConnectionRefused on endpoints with a + // This method will return syserr.ErrConnectionRefused on endpoints with a // type that isn't SockStream or SockSeqpacket. 
- BidirectionalConnect(ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *tcpip.Error + BidirectionalConnect(ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error // UnidirectionalConnect establishes a write-only connection to a unix // endpoint. @@ -228,9 +229,9 @@ type BoundEndpoint interface { // An endpoint which calls UnidirectionalConnect and supports it itself must // not hold its own lock when calling UnidirectionalConnect. // - // This method will return tcpip.ErrConnectionRefused on a non-SockDgram + // This method will return syserr.ErrConnectionRefused on a non-SockDgram // endpoint. - UnidirectionalConnect() (ConnectedEndpoint, *tcpip.Error) + UnidirectionalConnect() (ConnectedEndpoint, *syserr.Error) // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a @@ -287,7 +288,7 @@ type Receiver interface { // See Endpoint.RecvMsg for documentation on shared arguments. // // notify indicates if RecvNotify should be called. - Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (recvLen, msgLen uintptr, cm ControlMessages, source tcpip.FullAddress, notify bool, err *tcpip.Error) + Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (recvLen, msgLen uintptr, cm ControlMessages, source tcpip.FullAddress, notify bool, err *syserr.Error) // RecvNotify notifies the Receiver of a successful Recv. This must not be // called while holding any endpoint locks. @@ -327,10 +328,10 @@ type queueReceiver struct { } // Recv implements Receiver.Recv. 
-func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { +func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *syserr.Error) { var m *message var notify bool - var err *tcpip.Error + var err *syserr.Error if peek { m, err = q.readQueue.Peek() } else { @@ -439,7 +440,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { } // Recv implements Receiver.Recv. -func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { +func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() @@ -560,9 +561,9 @@ type ConnectedEndpoint interface { // // notify indicates if SendNotify should be called. // - // tcpip.ErrWouldBlock can be returned along with a partial write if + // syserr.ErrWouldBlock can be returned along with a partial write if // the caller should block to send the rest of the data. - Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n uintptr, notify bool, err *tcpip.Error) + Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n uintptr, notify bool, err *syserr.Error) // SendNotify notifies the ConnectedEndpoint of a successful Send. This // must not be called while holding any endpoint locks. @@ -630,7 +631,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) } // Send implements ConnectedEndpoint.Send. 
-func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { +func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) { var l int64 for _, d := range data { l += int64(len(d)) @@ -774,12 +775,12 @@ func (e *baseEndpoint) Connected() bool { } // RecvMsg reads data and a control message from the endpoint. -func (e *baseEndpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, ControlMessages, *tcpip.Error) { +func (e *baseEndpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, ControlMessages, *syserr.Error) { e.Lock() if e.receiver == nil { e.Unlock() - return 0, 0, ControlMessages{}, tcpip.ErrNotConnected + return 0, 0, ControlMessages{}, syserr.ErrNotConnected } recvLen, msgLen, cms, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) @@ -800,15 +801,15 @@ func (e *baseEndpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, pee // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. -func (e *baseEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *tcpip.Error) { +func (e *baseEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) { e.Lock() if !e.Connected() { e.Unlock() - return 0, tcpip.ErrNotConnected + return 0, syserr.ErrNotConnected } if to != nil { e.Unlock() - return 0, tcpip.ErrAlreadyConnected + return 0, syserr.ErrAlreadyConnected } n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) @@ -901,11 +902,11 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { // Shutdown closes the read and/or write end of the endpoint connection to its // peer. 
-func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { +func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { e.Lock() if !e.Connected() { e.Unlock() - return tcpip.ErrNotConnected + return syserr.ErrNotConnected } if flags&tcpip.ShutdownRead != 0 { diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 3543dd81f..334169372 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -147,7 +147,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( // Listen implements the linux syscall listen(2) for sockets backed by // a transport.Endpoint. func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { - return syserr.TranslateNetstackError(s.ep.Listen(backlog)) + return s.ep.Listen(backlog) } // blockingAccept implements a blocking version of accept(2), that is, if no @@ -161,8 +161,8 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, * // Try to accept the connection; if it fails, then wait until we get a // notification. for { - if ep, err := s.ep.Accept(); err != tcpip.ErrWouldBlock { - return ep, syserr.TranslateNetstackError(err) + if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + return ep, err } if err := t.Block(ch); err != nil { @@ -177,8 +177,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, // Issue the accept request to get the new endpoint. 
ep, err := s.ep.Accept() if err != nil { - if err != tcpip.ErrWouldBlock || !blocking { - return 0, nil, 0, syserr.TranslateNetstackError(err) + if err != syserr.ErrWouldBlock || !blocking { + return 0, nil, 0, err } var err *syserr.Error @@ -232,15 +232,15 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *tcpip.Error { + return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error { // Is it abstract? if p[0] == 0 { if t.IsNetworkNamespaced() { - return tcpip.ErrInvalidEndpointState + return syserr.ErrInvalidEndpointState } if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { - // tcpip.ErrPortInUse corresponds to EADDRINUSE. - return tcpip.ErrPortInUse + // syserr.ErrPortInUse corresponds to EADDRINUSE. + return syserr.ErrPortInUse } } else { // The parent and name. @@ -269,7 +269,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, fs.DefaultTraversalLimit) if err != nil { // No path available. - return tcpip.ErrNoSuchFile + return syserr.ErrNoSuchFile } defer d.DecRef() name = p[lastSlash+1:] @@ -278,13 +278,13 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Create the socket. childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) if err != nil { - return tcpip.ErrPortInUse + return syserr.ErrPortInUse } childDir.DecRef() } return nil - })) + }) } // extractEndpoint retrieves the transport.BoundEndpoint associated with a Unix @@ -341,7 +341,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo defer ep.Release() // Connect the server endpoint. 
- return syserr.TranslateNetstackError(s.ep.Connect(ep)) + return s.ep.Connect(ep) } // Writev implements fs.FileOperations.Write. @@ -350,8 +350,8 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO ctrl := control.New(t, s.ep, nil) if src.NumBytes() == 0 { - nInt, tcpipError := s.ep.SendMsg([][]byte{}, ctrl, nil) - return int64(nInt), syserr.TranslateNetstackError(tcpipError).ToError() + nInt, err := s.ep.SendMsg([][]byte{}, ctrl, nil) + return int64(nInt), err.ToError() } return src.CopyInTo(ctx, &EndpointWriter{ @@ -448,7 +448,7 @@ func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { } // Issue shutdown request. - return syserr.TranslateNetstackError(s.ep.Shutdown(f)) + return s.ep.Shutdown(f) } // Read implements fs.FileOperations.Read. -- cgit v1.2.3 From e7191f058f550cc3a203a854a1d81f7746c96e53 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 24 Oct 2018 15:51:46 -0700 Subject: Use TRAP to simplify vsyscall emulation. PiperOrigin-RevId: 218592058 Change-Id: I373a2d813aa6cc362500dd5a894c0b214a1959d7 --- pkg/sentry/platform/ptrace/subprocess.go | 24 ++++++++++----------- pkg/sentry/platform/ptrace/subprocess_linux.go | 29 ++++++++++++++++++++------ 2 files changed, 35 insertions(+), 18 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 6a9da5db8..2cd49d1ec 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -357,15 +357,13 @@ func (t *thread) destroy() { // init initializes trace options. func (t *thread) init() { - // Set our TRACESYSGOOD option to differeniate real SIGTRAP. Also, we - // require the SECCOMP option to ensure that seccomp violations - // generate a ptrace event. + // Set our TRACESYSGOOD option to differeniate real SIGTRAP. 
_, _, errno := syscall.RawSyscall6( syscall.SYS_PTRACE, syscall.PTRACE_SETOPTIONS, uintptr(t.tid), 0, - syscall.PTRACE_O_TRACESYSGOOD|_PTRACE_O_TRACESECCOMP, + syscall.PTRACE_O_TRACESYSGOOD, 0, 0) if errno != 0 { panic(fmt.Sprintf("ptrace set options failed: %v", errno)) @@ -522,12 +520,6 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { // Ensure registers are sane. updateSyscallRegs(regs) return true - } else if sig == (seccompEvent | syscall.SIGTRAP) { - // Seccomp is enabled, and caught the system call. This - // is an emulated vsyscall call, since those are caught - // only by seccomp and explicitly set to trace. - updateSyscallRegs(regs) - return true } else if sig == syscall.SIGSTOP { // SIGSTOP was delivered to another thread in the same thread // group, which initiated another group stop. Just ignore it. @@ -544,9 +536,17 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { // either delivered from the kernel or from this process. We // don't respect other signals. if c.signalInfo.Code > 0 { - return false // kernel. + // The signal was generated by the kernel. We inspect + // the signal information, and may patch it in order to + // faciliate vsyscall emulation. See patchSignalInfo. + patchSignalInfo(regs, &c.signalInfo) + return false } else if c.signalInfo.Code <= 0 && c.signalInfo.Pid() == int32(os.Getpid()) { - return false // this process. + // The signal was generated by this process. That means + // that it was an interrupt or something else that we + // should bail for. Note that we ignore signals + // generated by other processes. 
+ return false } } } diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 73ddc559b..885ba4b2e 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -27,11 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" ) -const ( - syscallEvent syscall.Signal = 0x80 - seccompEvent syscall.Signal = 0x700 // 0x7 (PTRACE_SECCOMP_EVENT) << 8 - _PTRACE_O_TRACESECCOMP = 0x80 // 1 << 0x7 (PTRACE_SECCOMP_EVENT) -) +const syscallEvent syscall.Signal = 0x80 // probeSeccomp returns true iff seccomp is run after ptrace notifications, // which is generally the case for kernel version >= 4.8. This check is dynamic @@ -81,6 +77,27 @@ func probeSeccomp() bool { } } +// patchSignalInfo patches the signal info to account for hitting the seccomp +// filters from vsyscall emulation, specified below. We allow for SIGSYS as a +// synchronous trap, but patch the structure to appear like a SIGSEGV with the +// Rip as the faulting address. +// +// Note that this should only be called after verifying that the signalInfo has +// been generated by the kernel. +func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) { + if linux.Signal(signalInfo.Signo) == linux.SIGSYS { + signalInfo.Signo = int32(linux.SIGSEGV) + + // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered + // with the si_call_addr field pointing to the current RIP. This field + // aligns with the si_addr field for a SIGSEGV, so we don't need to touch + // anything there. We do need to unwind emulation however, so we set the + // instruction pointer to the faulting value, and "unpop" the stack. + regs.Rip = signalInfo.Addr() + regs.Rsp -= 8 + } +} + // createStub creates a fresh stub processes. // // Precondition: the runtime OS thread must be locked. 
@@ -131,7 +148,7 @@ func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) { syscall.SYS_TIME: {}, 309: {}, // SYS_GETCPU. }, - Action: uint32(linux.SECCOMP_RET_TRACE), + Action: uint32(linux.SECCOMP_RET_TRAP), Vsyscall: true, }, } -- cgit v1.2.3 From 0091db9cbddb6c9fb4c96fbde980780c98006eda Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 30 Oct 2018 22:45:51 -0700 Subject: kvm: use private futexes. Use private futexes for performance and to align with other runtime uses. PiperOrigin-RevId: 219422634 Change-Id: Ief2af5e8302847ea6dc246e8d1ee4d64684ca9dd --- pkg/sentry/platform/kvm/machine_unsafe.go | 4 ++-- runsc/boot/filter/config.go | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 4f5b01321..38c1f102f 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -88,7 +88,7 @@ func (c *vCPU) notify() { _, _, errno := syscall.RawSyscall6( syscall.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), - linux.FUTEX_WAKE, + linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, ^uintptr(0), // Number of waiters. 
0, 0, 0) if errno != 0 { @@ -106,7 +106,7 @@ func (c *vCPU) waitUntilNot(state uint32) { _, _, errno := syscall.Syscall6( syscall.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), - linux.FUTEX_WAIT, + linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, uintptr(state), 0, 0, 0) if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN { diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 378396b9b..83c1fbcce 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -438,7 +438,6 @@ func ptraceFilters() seccomp.SyscallRules { func kvmFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ARCH_PRCTL: {}, - syscall.SYS_FUTEX: {}, syscall.SYS_IOCTL: {}, syscall.SYS_MMAP: {}, syscall.SYS_RT_SIGSUSPEND: {}, -- cgit v1.2.3 From e9dbd5ab67bc31e59910930e6c1b551c0fd05ee6 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 31 Oct 2018 10:07:06 -0700 Subject: kvm: avoid siginfo allocations. PiperOrigin-RevId: 219492587 Change-Id: I47f6fc0b74a4907ab0aff03d5f26453bdb983bb5 --- pkg/sentry/platform/kvm/context.go | 7 ++- pkg/sentry/platform/kvm/kvm_test.go | 88 +++++++++++++++++++------------- pkg/sentry/platform/kvm/machine_amd64.go | 65 ++++++++++++----------- 3 files changed, 89 insertions(+), 71 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index be902be88..c75a4b415 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -29,6 +29,9 @@ type context struct { // machine is the parent machine, and is immutable. machine *machine + // info is the arch.SignalInfo cached for this context. + info arch.SignalInfo + // interrupt is the interrupt context. interrupt interrupt.Forwarder } @@ -65,7 +68,7 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a } // Take the blue pill. 
- si, at, err := cpu.SwitchToUser(switchOpts) + at, err := cpu.SwitchToUser(switchOpts, &c.info) // Clear the address space. cpu.active.set(nil) @@ -75,7 +78,7 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a // All done. c.interrupt.Disable() - return si, at, err + return &c.info, at, err } // Interrupt interrupts the running context. diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 45eeb96ff..fff463a6e 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -156,12 +156,13 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func func TestApplicationSyscall(t *testing.T) { applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { return true // Retry. } else if err != nil { t.Errorf("application syscall with full restore failed: %v", err) @@ -169,11 +170,12 @@ func TestApplicationSyscall(t *testing.T) { return false }) applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { return true // Retry. 
} else if err != nil { t.Errorf("application syscall with partial restore failed: %v", err) @@ -185,27 +187,29 @@ func TestApplicationSyscall(t *testing.T) { func TestApplicationFault(t *testing.T) { applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. - if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { return true // Retry. - } else if err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + } else if err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { t.Errorf("application fault with full restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) } return false }) applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. - if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { return true // Retry. 
- } else if err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + } else if err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { t.Errorf("application fault with partial restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) } return false @@ -216,11 +220,12 @@ func TestRegistersSyscall(t *testing.T) { applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { continue // Retry. } else if err != nil { t.Errorf("application register check with partial restore got unexpected error: %v", err) @@ -238,12 +243,13 @@ func TestRegistersFault(t *testing.T) { applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { - if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { continue // Retry. 
} else if err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { t.Errorf("application register check with full restore got unexpected error: %v", err) @@ -261,12 +267,13 @@ func TestSegments(t *testing.T) { applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTestSegments(regs) for { - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { continue // Retry. } else if err != nil { t.Errorf("application segment check with full restore got unexpected error: %v", err) @@ -286,11 +293,12 @@ func TestBounce(t *testing.T) { time.Sleep(time.Millisecond) c.BounceToKernel() }() - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err != platform.ErrContextInterrupt { + }, &si); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } return false @@ -300,12 +308,13 @@ func TestBounce(t *testing.T) { time.Sleep(time.Millisecond) c.BounceToKernel() }() - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); err != platform.ErrContextInterrupt { + }, &si); err != platform.ErrContextInterrupt { t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } return false @@ -331,11 +340,12 @@ func TestBounceStress(t *testing.T) { c.BounceToKernel() }() randomSleep() - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si 
arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err != platform.ErrContextInterrupt { + }, &si); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } c.unlock() @@ -351,11 +361,12 @@ func TestInvalidate(t *testing.T) { applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, &data) // Read legitimate value. for { - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { continue // Retry. } else if err != nil { t.Errorf("application partial restore: got %v, wanted nil", err) @@ -365,12 +376,13 @@ func TestInvalidate(t *testing.T) { // Unmap the page containing data & invalidate. pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize) for { - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, Flush: true, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { continue // Retry. 
} else if err != platform.ErrContextSignal { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) @@ -388,27 +400,29 @@ func IsFault(err error, si *arch.SignalInfo) bool { func TestEmptyAddressSpace(t *testing.T) { applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { return true // Retry. - } else if !IsFault(err, si) { + } else if !IsFault(err, &si) { t.Errorf("first fault with partial restore failed got %v", err) t.Logf("registers: %#v", ®s) } return false }) applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if si, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, FullRestore: true, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { return true // Retry. - } else if !IsFault(err, si) { + } else if !IsFault(err, &si) { t.Errorf("first fault with full restore failed got %v", err) t.Logf("registers: %#v", ®s) } @@ -459,11 +473,12 @@ func BenchmarkApplicationSyscall(b *testing.B) { a int // Count for ErrContextInterrupt. 
) applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { a++ return true // Ignore. } else if err != nil { @@ -495,11 +510,12 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { a int ) applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - if _, _, err := c.SwitchToUser(ring0.SwitchOpts{ + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: dummyFPState, PageTables: pt, - }); err == platform.ErrContextInterrupt { + }, &si); err == platform.ErrContextInterrupt { a++ return true // Ignore. } else if err != nil { diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index e0aec42b8..c03792a1b 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -156,19 +156,19 @@ func (c *vCPU) initArchState() error { // nonCanonical generates a canonical address return. // //go:nosplit -func nonCanonical(addr uint64, signal int32) (*arch.SignalInfo, usermem.AccessType, error) { - info := &arch.SignalInfo{ +func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + *info = arch.SignalInfo{ Signo: signal, Code: arch.SignalInfoKernel, } info.SetAddr(addr) // Include address. - return info, usermem.NoAccess, platform.ErrContextSignal + return usermem.NoAccess, platform.ErrContextSignal } // fault generates an appropriate fault return. 
// //go:nosplit -func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { bluepill(c) // Probably no-op, but may not be. faultAddr := ring0.ReadCR2() code, user := c.ErrorCode() @@ -176,11 +176,10 @@ func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) // The last fault serviced by this CPU was not a user // fault, so we can't reliably trust the faultAddr or // the code provided here. We need to re-execute. - return nil, usermem.NoAccess, platform.ErrContextInterrupt - } - info := &arch.SignalInfo{ - Signo: signal, + return usermem.NoAccess, platform.ErrContextInterrupt } + // Reset the pointed SignalInfo. + *info = arch.SignalInfo{Signo: signal} info.SetAddr(uint64(faultAddr)) accessType := usermem.AccessType{ Read: code&(1<<1) == 0, @@ -192,20 +191,20 @@ func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) } else { info.Code = 2 // SEGV_ACCERR. } - return info, accessType, platform.ErrContextSignal + return accessType, platform.ErrContextSignal } // SwitchToUser unpacks architectural-details. -func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { // Check for canonical addresses. 
if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) { - return nonCanonical(regs.Rip, int32(syscall.SIGSEGV)) + return nonCanonical(regs.Rip, int32(syscall.SIGSEGV), info) } else if !ring0.IsCanonical(regs.Rsp) { - return nonCanonical(regs.Rsp, int32(syscall.SIGBUS)) + return nonCanonical(regs.Rsp, int32(syscall.SIGBUS), info) } else if !ring0.IsCanonical(regs.Fs_base) { - return nonCanonical(regs.Fs_base, int32(syscall.SIGBUS)) + return nonCanonical(regs.Fs_base, int32(syscall.SIGBUS), info) } else if !ring0.IsCanonical(regs.Gs_base) { - return nonCanonical(regs.Gs_base, int32(syscall.SIGBUS)) + return nonCanonical(regs.Gs_base, int32(syscall.SIGBUS), info) } // Assign PCIDs. @@ -231,25 +230,25 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user switch vector { case ring0.Syscall, ring0.SyscallInt80: // Fast path: system call executed. - return nil, usermem.NoAccess, nil + return usermem.NoAccess, nil case ring0.PageFault: - return c.fault(int32(syscall.SIGSEGV)) + return c.fault(int32(syscall.SIGSEGV), info) case ring0.Debug, ring0.Breakpoint: - info := &arch.SignalInfo{ + *info = arch.SignalInfo{ Signo: int32(syscall.SIGTRAP), Code: 1, // TRAP_BRKPT (breakpoint). } info.SetAddr(switchOpts.Registers.Rip) // Include address. - return info, usermem.AccessType{}, platform.ErrContextSignal + return usermem.AccessType{}, platform.ErrContextSignal case ring0.GeneralProtectionFault, ring0.SegmentNotPresent, ring0.BoundRangeExceeded, ring0.InvalidTSS, ring0.StackSegmentFault: - info := &arch.SignalInfo{ + *info = arch.SignalInfo{ Signo: int32(syscall.SIGSEGV), Code: arch.SignalInfoKernel, } @@ -258,52 +257,52 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user // When CPUID faulting is enabled, we will generate a #GP(0) when // userspace executes a CPUID instruction. This is handled above, // because we need to be able to map and read user memory. 
- return info, usermem.AccessType{}, platform.ErrContextSignalCPUID + return usermem.AccessType{}, platform.ErrContextSignalCPUID } - return info, usermem.AccessType{}, platform.ErrContextSignal + return usermem.AccessType{}, platform.ErrContextSignal case ring0.InvalidOpcode: - info := &arch.SignalInfo{ + *info = arch.SignalInfo{ Signo: int32(syscall.SIGILL), Code: 1, // ILL_ILLOPC (illegal opcode). } info.SetAddr(switchOpts.Registers.Rip) // Include address. - return info, usermem.AccessType{}, platform.ErrContextSignal + return usermem.AccessType{}, platform.ErrContextSignal case ring0.DivideByZero: - info := &arch.SignalInfo{ + *info = arch.SignalInfo{ Signo: int32(syscall.SIGFPE), Code: 1, // FPE_INTDIV (divide by zero). } info.SetAddr(switchOpts.Registers.Rip) // Include address. - return info, usermem.AccessType{}, platform.ErrContextSignal + return usermem.AccessType{}, platform.ErrContextSignal case ring0.Overflow: - info := &arch.SignalInfo{ + *info = arch.SignalInfo{ Signo: int32(syscall.SIGFPE), Code: 1, // FPE_INTOVF (integer overflow). } info.SetAddr(switchOpts.Registers.Rip) // Include address. - return info, usermem.AccessType{}, platform.ErrContextSignal + return usermem.AccessType{}, platform.ErrContextSignal case ring0.X87FloatingPointException, ring0.SIMDFloatingPointException: - info := &arch.SignalInfo{ + *info = arch.SignalInfo{ Signo: int32(syscall.SIGFPE), Code: 7, // FPE_FLTINV (invalid operation). } info.SetAddr(switchOpts.Registers.Rip) // Include address. - return info, usermem.AccessType{}, platform.ErrContextSignal + return usermem.AccessType{}, platform.ErrContextSignal case ring0.Vector(bounce): // ring0.VirtualizationException - return nil, usermem.NoAccess, platform.ErrContextInterrupt + return usermem.NoAccess, platform.ErrContextInterrupt case ring0.AlignmentCheck: - info := &arch.SignalInfo{ + *info = arch.SignalInfo{ Signo: int32(syscall.SIGBUS), Code: 2, // BUS_ADRERR (physical address does not exist). 
} - return info, usermem.NoAccess, platform.ErrContextSignal + return usermem.NoAccess, platform.ErrContextSignal case ring0.NMI: // An NMI is generated only when a fault is not servicable by @@ -311,7 +310,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts) (*arch.SignalInfo, user // really not. This could happen, e.g. if some file is // truncated (and would generate a SIGBUS) and we map it // directly into the instance. - return c.fault(int32(syscall.SIGBUS)) + return c.fault(int32(syscall.SIGBUS), info) case ring0.DeviceNotAvailable, ring0.DoubleFault, -- cgit v1.2.3 From c4bbb54168a9014048d2144110e70daf5a5b8211 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 31 Oct 2018 15:49:10 -0700 Subject: kvm: add detailed traces on vCPU errors. This improves debuggability greatly. PiperOrigin-RevId: 219551560 Change-Id: I2ecaffdd1c17b0d9f25911538ea6f693e2bc699f --- pkg/sentry/platform/kvm/bluepill.go | 48 ++++++++++++++++++++++-- pkg/sentry/platform/kvm/bluepill_amd64.s | 6 +++ pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 28 ++++++++++++++ pkg/sentry/platform/kvm/bluepill_unsafe.go | 29 +++++++++----- pkg/sentry/platform/kvm/kvm_const.go | 1 + pkg/sentry/platform/kvm/machine.go | 3 ++ pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 24 ------------ pkg/sentry/platform/kvm/machine_unsafe.go | 40 ++++++++++++++++++++ 8 files changed, 142 insertions(+), 37 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 9f1c9510b..d98ec8377 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -19,6 +19,7 @@ import ( "reflect" "syscall" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" ) @@ -28,14 +29,55 @@ func bluepill(*vCPU) // sighandler is the signal entry point. func sighandler() -// savedHandler is a pointer to the previous handler. 
+// dieTrampoline is the assembly trampoline. This calls dieHandler. // -// This is called by bluepillHandler. -var savedHandler uintptr +// This uses an architecture-specific calling convention, documented in +// dieArchSetup and the assembly implementation for dieTrampoline. +func dieTrampoline() + +var ( + // savedHandler is a pointer to the previous handler. + // + // This is called by bluepillHandler. + savedHandler uintptr + + // dieTrampolineAddr is the address of dieTrampoline. + dieTrampolineAddr uintptr +) + +// dieHandler is called by dieTrampoline. +// +//go:nosplit +func dieHandler(c *vCPU) { + throw(c.dieMessage) +} + +// die is called to set the vCPU up to panic. +// +// This loads vCPU state, and sets up a call for the trampoline. +// +//go:nosplit +func (c *vCPU) die(context *arch.SignalContext64, msg string) { + // Save the death message, which will be thrown. + c.dieMessage = msg + + // Reload all registers to have an accurate stack trace when we return + // to host mode. This means that the stack should be unwound correctly. + var guestRegs userRegs + if errno := c.getUserRegisters(&guestRegs); errno != 0 { + throw(msg) + } + + // Setup the trampoline. + dieArchSetup(c, context, &guestRegs) +} func init() { // Install the handler. if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err)) } + + // Extract the address for the trampoline. + dieTrampolineAddr = reflect.ValueOf(dieTrampoline).Pointer() } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index ec017f6c2..65b01f358 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -85,3 +85,9 @@ fallback: XORQ CX, CX MOVQ ·savedHandler(SB), AX JMP AX + +// dieTrampoline: see bluepill.go, bluepill_amd64_unsafe.go for documentation. 
+TEXT ·dieTrampoline(SB),NOSPLIT,$0 + PUSHQ BX // First argument (vCPU). + PUSHQ AX // Fake the old RIP as caller. + JMP ·dieHandler(SB) diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index cd00a47f2..21de2488e 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -20,9 +20,37 @@ import ( "unsafe" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" ) // bluepillArchContext returns the arch-specific context. +// +//go:nosplit func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { return &((*arch.UContext64)(context).MContext) } + +// dieArchSetup initialies the state for dieTrampoline. +// +// The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP +// to be in AX. The trampoline then simulates a call to dieHandler from the +// provided RIP. +// +//go:nosplit +func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { + // If the vCPU is in user mode, we set the stack to the stored stack + // value in the vCPU itself. We don't want to unwind the user stack. 
+ if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet { + regs := c.CPU.Registers() + context.Rax = regs.Rax + context.Rsp = regs.Rsp + context.Rbp = regs.Rbp + } else { + context.Rax = guestRegs.RIP + context.Rsp = guestRegs.RSP + context.Rbp = guestRegs.RBP + context.Eflags = guestRegs.RFLAGS + } + context.Rbx = uint64(uintptr(unsafe.Pointer(c))) + context.Rip = uint64(dieTrampolineAddr) +} diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 747a95997..77cf7e800 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -113,9 +113,11 @@ func bluepillHandler(context unsafe.Pointer) { switch c.runData.exitReason { case _KVM_EXIT_EXCEPTION: - throw("exception") + c.die(bluepillArchContext(context), "exception") + return case _KVM_EXIT_IO: - throw("I/O") + c.die(bluepillArchContext(context), "I/O") + return case _KVM_EXIT_INTERNAL_ERROR: // An internal error is typically thrown when emulation // fails. This can occur via the MMIO path below (and @@ -123,9 +125,11 @@ func bluepillHandler(context unsafe.Pointer) { // are not mapped). We would actually prefer that no // emulation occur, and don't mind at all if it fails. case _KVM_EXIT_HYPERCALL: - throw("hypercall") + c.die(bluepillArchContext(context), "hypercall") + return case _KVM_EXIT_DEBUG: - throw("debug") + c.die(bluepillArchContext(context), "debug") + return case _KVM_EXIT_HLT: // Copy out registers. bluepillArchExit(c, bluepillArchContext(context)) @@ -145,9 +149,11 @@ func bluepillHandler(context unsafe.Pointer) { atomic.AddUint32(&c.faults, 1) // For MMIO, the physical address is the first data item. 
- virtual, ok := handleBluepillFault(c.machine, uintptr(c.runData.data[0])) + physical := uintptr(c.runData.data[0]) + virtual, ok := handleBluepillFault(c.machine, physical) if !ok { - throw("physical address not valid") + c.die(bluepillArchContext(context), "invalid physical address") + return } // We now need to fill in the data appropriately. KVM @@ -158,7 +164,7 @@ func bluepillHandler(context unsafe.Pointer) { // not create invalid page table mappings. data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1])) length := (uintptr)((uint32)(c.runData.data[2])) - write := (uint8)((c.runData.data[2] >> 32 & 0xff)) != 0 + write := (uint8)(((c.runData.data[2] >> 32) & 0xff)) != 0 for i := uintptr(0); i < length; i++ { b := bytePtr(uintptr(virtual) + i) if write { @@ -182,11 +188,14 @@ func bluepillHandler(context unsafe.Pointer) { // Clear previous injection request. c.runData.requestInterruptWindow = 0 case _KVM_EXIT_SHUTDOWN: - throw("shutdown") + c.die(bluepillArchContext(context), "shutdown") + return case _KVM_EXIT_FAIL_ENTRY: - throw("entry failed") + c.die(bluepillArchContext(context), "entry failed") + return default: - throw("unknown failure") + c.die(bluepillArchContext(context), "unknown") + return } } } diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 8c53c6f06..cac8d9937 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -31,6 +31,7 @@ const ( _KVM_SET_USER_MEMORY_REGION = 0x4020ae46 _KVM_SET_REGS = 0x4090ae82 _KVM_SET_SREGS = 0x4138ae84 + _KVM_GET_REGS = 0x8090ae81 _KVM_GET_SUPPORTED_CPUID = 0xc008ae05 _KVM_SET_CPUID2 = 0x4008ae90 _KVM_SET_SIGNAL_MASK = 0x4004ae8b diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index fc7ad258f..4ba3a185a 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -120,6 +120,9 @@ type vCPU struct { // vCPUArchState is the architecture-specific state. 
vCPUArchState + + // dieMessage is thrown from die. + dieMessage string } // newVCPU creates a returns a new vCPU. diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 50e513f3b..8ebd4ab71 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -73,30 +73,6 @@ func (c *vCPU) loadSegments(tid uint64) { atomic.StoreUint64(&c.tid, tid) } -// setUserRegisters sets user registers in the vCPU. -func (c *vCPU) setUserRegisters(uregs *userRegs) error { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_REGS, - uintptr(unsafe.Pointer(uregs))); errno != 0 { - return fmt.Errorf("error setting user registers: %v", errno) - } - return nil -} - -// setSystemRegisters sets system registers. -func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SREGS, - uintptr(unsafe.Pointer(sregs))); errno != 0 { - return fmt.Errorf("error setting system registers: %v", errno) - } - return nil -} - // setCPUID sets the CPUID to be used by the guest. func (c *vCPU) setCPUID() error { if _, _, errno := syscall.RawSyscall( diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 38c1f102f..22ae60b63 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -57,6 +57,46 @@ func unmapRunData(r *runData) error { return nil } +// setUserRegisters sets user registers in the vCPU. +func (c *vCPU) setUserRegisters(uregs *userRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) + } + return nil +} + +// getUserRegisters reloads user registers in the vCPU. 
+// +// This is safe to call from a nosplit context. +// +//go:nosplit +func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return errno + } + return 0 +} + +// setSystemRegisters sets system registers. +func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return fmt.Errorf("error setting system registers: %v", errno) + } + return nil +} + // atomicAddressSpace is an atomic address space pointer. type atomicAddressSpace struct { pointer unsafe.Pointer -- cgit v1.2.3 From fb613020c7db323c705adf6ae0f954bee4ab5fec Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 31 Oct 2018 15:58:21 -0700 Subject: kvm: simplify floating point logic. This reduces the number of floating point save/restore cycles required (since we don't need to restore immediately following the switch, this always happens in a known context) and allows the kernel hooks to capture state. This lets us remove calls like "Current()". 
PiperOrigin-RevId: 219552844 Change-Id: I7676fa2f6c18b9919718458aa888b832a7db8cab --- pkg/sentry/platform/kvm/bluepill_amd64.go | 46 ++++++++++++-------------- pkg/sentry/platform/kvm/bluepill_unsafe.go | 7 ++++ pkg/sentry/platform/kvm/machine.go | 4 +-- pkg/sentry/platform/kvm/machine_amd64.go | 10 ++++++ pkg/sentry/platform/ring0/defs.go | 52 +++++++++++++++++------------- pkg/sentry/platform/ring0/entry_amd64.s | 41 +++++++---------------- pkg/sentry/platform/ring0/kernel.go | 34 ++++++++++++------- pkg/sentry/platform/ring0/kernel_amd64.go | 2 +- pkg/sentry/platform/ring0/offsets_amd64.go | 2 -- 9 files changed, 103 insertions(+), 95 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index f013d1dc9..6520682d7 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -47,8 +47,8 @@ func redpill() { // bluepillArchEnter is called during bluepillEnter. // //go:nosplit -func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { - c = vCPUPtr(uintptr(context.Rax)) +func bluepillArchEnter(context *arch.SignalContext64) *vCPU { + c := vCPUPtr(uintptr(context.Rax)) regs := c.CPU.Registers() regs.R8 = context.R8 regs.R9 = context.R9 @@ -73,50 +73,41 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { regs.Cs = uint64(ring0.Kcode) regs.Ds = uint64(ring0.Udata) regs.Es = uint64(ring0.Udata) - regs.Fs = uint64(ring0.Udata) regs.Ss = uint64(ring0.Kdata) - - // ring0 uses GS exclusively, so we use GS_base to store the location - // of the floating point address. - // - // The address will be restored directly after running the VCPU, and - // will be saved again prior to halting. We rely on the fact that the - // SaveFloatingPointer/LoadFloatingPoint functions use the most - // efficient mechanism available (including compression) so the state - // size is guaranteed to be less than what's pointed to here. 
- regs.Gs_base = uint64(context.Fpstate) - return + return c } -// bluepillSyscall handles kernel syscalls. +// KernelSyscall handles kernel syscalls. // //go:nosplit -func bluepillSyscall() { - regs := ring0.Current().Registers() +func (c *vCPU) KernelSyscall() { + regs := c.Registers() if regs.Rax != ^uint64(0) { regs.Rip -= 2 // Rewind. } - ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) + // We only trigger a bluepill entry in the bluepill function, and can + // therefore be guaranteed that there is no floating point state to be + // loaded on resuming from halt. We only worry about saving on exit. + ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) ring0.Halt() ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. - ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) } -// bluepillException handles kernel exceptions. +// KernelException handles kernel exceptions. // //go:nosplit -func bluepillException(vector ring0.Vector) { - regs := ring0.Current().Registers() +func (c *vCPU) KernelException(vector ring0.Vector) { + regs := c.Registers() if vector == ring0.Vector(bounce) { // These should not interrupt kernel execution; point the Rip // to zero to ensure that we get a reasonable panic when we - // attempt to return. + // attempt to return and a full stack trace. regs.Rip = 0 } - ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) + // See above. + ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) ring0.Halt() ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. - ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) } // bluepillArchExit is called during bluepillEnter. @@ -142,4 +133,9 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { context.Rsp = regs.Rsp context.Rip = regs.Rip context.Eflags = regs.Eflags + + // Set the context pointer to the saved floating point state. This is + // where the guest data has been serialized, the kernel will restore + // from this new pointer value. 
+ context.Fpstate = uint64(uintptrValue((*byte)(c.floatingPointState))) } diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 77cf7e800..2605f8c93 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -37,6 +37,13 @@ func bytePtr(addr uintptr) *byte { return (*byte)(unsafe.Pointer(addr)) } +// uintptrValue returns a uintptr for the given address. +// +//go:nosplit +func uintptrValue(addr *byte) uintptr { + return (uintptr)(unsafe.Pointer(addr)) +} + // bluepillHandler is called from the signal stub. // // The world may be stopped while this is executing, and it executes on the diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 4ba3a185a..deead1b5f 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -142,9 +142,7 @@ func (m *machine) newVCPU() *vCPU { fd: int(fd), machine: m, } - c.CPU.Init(&m.kernel) - c.CPU.KernelSyscall = bluepillSyscall - c.CPU.KernelException = bluepillException + c.CPU.Init(&m.kernel, c) m.vCPUsByID[c.id] = c // Ensure the signal mask is correct. diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index c03792a1b..5ad805b8b 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -63,6 +63,10 @@ type vCPUArchState struct { // // This starts above fixedKernelPCID. PCIDs *pagetables.PCIDs + + // floatingPointState is the floating point state buffer used in guest + // to host transitions. See usage in bluepill_amd64.go. + floatingPointState *arch.FloatingPointData } const ( @@ -149,6 +153,12 @@ func (c *vCPU) initArchState() error { return err } + // Allocate some floating point state save area for the local vCPU. + // This will be saved prior to leaving the guest, and we restore from + // this always. 
We cannot use the pointer in the context alone because + // we don't know how large the area there is in reality. + c.floatingPointState = arch.NewFloatingPointData() + // Set the time offset to the host native time. return c.setSystemTime() } diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 18137e55d..98d0a6de0 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -38,6 +38,33 @@ type Kernel struct { KernelArchState } +// Hooks are hooks for kernel functions. +type Hooks interface { + // KernelSyscall is called for kernel system calls. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelSyscall() + + // KernelException handles an exception during kernel execution. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelException(Vector) +} + // CPU is the per-CPU struct. type CPU struct { // self is a self reference. @@ -58,29 +85,8 @@ type CPU struct { // calls and exceptions via the Registers function. registers syscall.PtraceRegs - // KernelException handles an exception during kernel execution. - // - // Return from this call will restore registers and return to the kernel: the - // registers must be modified directly. - // - // If this function is not provided, a kernel exception results in halt. 
- // - // This must be go:nosplit, as this will be on the interrupt stack. - // Closures are permitted, as the pointer to the closure frame is not - // passed on the stack. - KernelException func(Vector) - - // KernelSyscall is called for kernel system calls. - // - // Return from this call will restore registers and return to the kernel: the - // registers must be modified directly. - // - // If this function is not provided, a kernel exception results in halt. - // - // This must be go:nosplit, as this will be on the interrupt stack. - // Closures are permitted, as the pointer to the closure frame is not - // passed on the stack. - KernelSyscall func() + // hooks are kernel hooks. + hooks Hooks } // Registers returns a modifiable-copy of the kernel registers. diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index d48fbd2d1..afb040a6f 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -90,12 +90,6 @@ TEXT ·Halt(SB),NOSPLIT,$0 HLT RET -// See kernel.go. -TEXT ·Current(SB),NOSPLIT,$0-8 - MOVQ CPU_SELF(GS), AX - MOVQ AX, ret+0(FP) - RET - // See entry_amd64.go. TEXT ·swapgs(SB),NOSPLIT,$0 SWAP_GS() @@ -205,19 +199,12 @@ kernel: MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. - // Load the function stored in KernelSyscall. - // - // Note that this function needs to be executed on the stack in case - // the runtime decides to make use of the redzone (grumble). This also - // protects against any functions that might not be go:nosplit, since - // this will cause a failure immediately. + // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_KERNEL_SYSCALL(GS), DX // Function data. - MOVQ 0(DX), AX // Function pointer. - PUSHQ BP // Push the frame pointer. - MOVQ SP, BP // Set frame pointer value. - CALL *AX // Call the function. - POPQ BP // Restore the frame pointer. + MOVQ CPU_SELF(GS), AX // Load vCPU. 
+ PUSHQ AX // First argument (vCPU). + CALL ·kernelSyscall(SB) // Call the trampoline. + POPQ AX // Pop vCPU. JMP ·resume(SB) // exception is a generic exception handler. @@ -287,18 +274,14 @@ kernel: MOVQ 0(SP), BX // BX contains the vector. ADDQ $48, SP // Drop the exception frame. - // Load the function stored in KernelException. - // - // See note above re: the kernel stack. + // Call the exception trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_KERNEL_EXCEPTION(GS), DX // Function data. - MOVQ 0(DX), AX // Function pointer. - PUSHQ BP // Push the frame pointer. - MOVQ SP, BP // Set frame pointer value. - PUSHQ BX // First argument (vector). - CALL *AX // Call the function. - POPQ BX // Discard the argument. - POPQ BP // Restore the frame pointer. + MOVQ CPU_SELF(GS), AX // Load vCPU. + PUSHQ BX // Second argument (vector). + PUSHQ AX // First argument (vCPU). + CALL ·kernelException(SB) // Call the trampoline. + POPQ BX // Pop vector. + POPQ AX // Pop vCPU. JMP ·resume(SB) #define EXCEPTION_WITH_ERROR(value, symbol) \ diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index e70eafde2..19ac6eb7c 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -26,31 +26,41 @@ func (k *Kernel) Init(opts KernelOpts) { // Halt halts execution. func Halt() -// Current returns the current CPU. +// defaultHooks implements hooks. +type defaultHooks struct{} + +// KernelSyscall implements Hooks.KernelSyscall. // -// Its use is only legal in the KernelSyscall and KernelException contexts, -// which must all be guarded go:nosplit. -func Current() *CPU +//go:nosplit +func (defaultHooks) KernelSyscall() { Halt() } + +// KernelException implements Hooks.KernelException. +// +//go:nosplit +func (defaultHooks) KernelException(Vector) { Halt() } -// defaultSyscall is the default syscall hook. +// kernelSyscall is a trampoline. 
// //go:nosplit -func defaultSyscall() { Halt() } +func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() } -// defaultException is the default exception hook. +// kernelException is a trampoline. // //go:nosplit -func defaultException(Vector) { Halt() } +func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) } // Init initializes a new CPU. // // Init allows embedding in other objects. -func (c *CPU) Init(k *Kernel) { +func (c *CPU) Init(k *Kernel, hooks Hooks) { c.self = c // Set self reference. c.kernel = k // Set kernel reference. c.init() // Perform architectural init. - // Defaults. - c.KernelSyscall = defaultSyscall - c.KernelException = defaultException + // Require hooks. + if hooks != nil { + c.hooks = hooks + } else { + c.hooks = defaultHooks{} + } } diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index ab562bca7..9e8c56a54 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -204,7 +204,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { func start(c *CPU) { // Save per-cpu & FS segment. WriteGS(kernelAddr(c)) - WriteFS(uintptr(c.Registers().Fs_base)) + WriteFS(uintptr(c.registers.Fs_base)) // Initialize floating point. 
// diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index 753d31ef8..806e07ec0 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -34,8 +34,6 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_KERNEL_EXCEPTION 0x%02x\n", reflect.ValueOf(&c.KernelException).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_KERNEL_SYSCALL 0x%02x\n", reflect.ValueOf(&c.KernelSyscall).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) -- cgit v1.2.3 From b23cd33682a9a8bd727fa45b8424eb55d91c3086 Mon Sep 17 00:00:00 2001 From: Juan Date: Thu, 1 Nov 2018 11:57:09 -0700 Subject: modify modeRegexp to adapt the default spec of containerd https://github.com/containerd/containerd/blob/master/oci/spec.go#L206, the mode=755 didn't match the pattern modeRegexp = regexp.MustCompile("0[0-7][0-7][0-7]"). Closes #112 Signed-off-by: Juan Change-Id: I469e0a68160a1278e34c9e1dbe4b7784c6f97e5a PiperOrigin-RevId: 219672525 --- pkg/sentry/fs/tmpfs/fs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 7c91e248b..453ed5bd9 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -44,7 +44,7 @@ const ( ) // modeRegexp is the expected format of the mode option. -var modeRegexp = regexp.MustCompile("0[0-7][0-7][0-7]") +var modeRegexp = regexp.MustCompile("^0?[0-7][0-7][0-7]$") // Filesystem is a tmpfs. 
// -- cgit v1.2.3 From 0e277a39c8b6f905e289b75e8ad0594e6b3562ca Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 1 Nov 2018 15:53:25 -0700 Subject: Prevent premature destruction of shm segments. Shm segments can be marked for lazy destruction via shmctl(IPC_RMID), which destroys a segment once it is no longer attached to any processes. We were unconditionally decrementing the segment refcount on shmctl(IPC_RMID) which allowed a user to force a segment to be destroyed by repeatedly calling shmctl(IPC_RMID), with outstanding memory maps to the segment. This is problematic because the memory released by a segment destroyed this way can be reused by a different process while remaining accessible by the process with outstanding maps to the segment. PiperOrigin-RevId: 219713660 Change-Id: I443ab838322b4fb418ed87b2722c3413ead21845 --- pkg/sentry/kernel/shm/shm.go | 13 +++++++++++-- pkg/sentry/syscalls/linux/sys_shm.go | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 2feffe612..f760f5f76 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -575,10 +575,19 @@ func (s *Shm) destroy() { func (s *Shm) MarkDestroyed() { s.mu.Lock() defer s.mu.Unlock() + // Prevent the segment from being found in the registry. s.key = linux.IPC_PRIVATE - s.pendingDestruction = true - s.DecRef() + + // Only drop the segment's self-reference once, when destruction is + // requested. Otherwise, repeated calls shmctl(IPC_RMID) would force a + // segment to be destroyed prematurely, potentially with active maps to the + // segment's address range. Remaining references are dropped when the + // segment is detached or unmaped. 
+ if !s.pendingDestruction { + s.pendingDestruction = true + s.DecRef() + } } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 5f887523a..8753c2e58 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -144,7 +144,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil case linux.SHM_LOCK, linux.SHM_UNLOCK: - // We currently do not support memmory locking anywhere. + // We currently do not support memory locking anywhere. // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the // same here. t.Kernel().EmitUnimplementedEvent(t) -- cgit v1.2.3 From 9d69d85bc13d4f0956a39951b5cd6777f938cffd Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Thu, 1 Nov 2018 17:39:20 -0700 Subject: Make error messages a bit more user friendly. Updated error messages so that it doesn't print full Go struct representations when running a new container in a sandbox. For example, this occurs frequently when commands are not found when doing a 'kubectl exec'. PiperOrigin-RevId: 219729141 Change-Id: Ic3a7bc84cd7b2167f495d48a1da241d621d3ca09 --- pkg/sentry/control/proc.go | 11 +++++++++++ runsc/boot/loader.go | 4 ++-- runsc/sandbox/sandbox.go | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index b6ac2f312..923399fb2 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -19,6 +19,7 @@ import ( "encoding/json" "fmt" "sort" + "strings" "text/tabwriter" "time" @@ -88,6 +89,16 @@ type ExecArgs struct { ContainerID string } +// String prints the arguments as a string. 
+func (args ExecArgs) String() string { + a := make([]string, len(args.Argv)) + copy(a, args.Argv) + if args.Filename != "" { + a[0] = args.Filename + } + return strings.Join(a, " ") +} + // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { newTG, _, _, err := proc.execAsync(args) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index abb347835..380fa3fbf 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -618,7 +618,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { ep, ok := l.processes[rootKey] l.mu.Unlock() if !ok { - return 0, fmt.Errorf("cannot exec in container %q: no such container", args.ContainerID) + return 0, fmt.Errorf("no such container: %q", args.ContainerID) } ep.tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() @@ -631,7 +631,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { proc := control.Proc{Kernel: l.k} tg, tgid, ttyFile, err := control.ExecAsync(&proc, args) if err != nil { - return 0, fmt.Errorf("error executing: %+v: %v", args, err) + return 0, err } // Insert the process into processes so that we can wait on it diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index df235c5e9..9421bd63e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -229,7 +229,7 @@ func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { // Send a message to the sandbox control server to start the container. var pid int32 if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil { - return 0, fmt.Errorf("error executing in sandbox: %v", err) + return 0, fmt.Errorf("error executing command %q in sandbox: %v", args, err) } return pid, nil } -- cgit v1.2.3 From 5a0be6fa203273d1e4ab06a206eaffeca5724533 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 8 Nov 2018 11:08:41 -0800 Subject: Create stubs for syscalls upto Linux 4.4. 
Create syscall stubs for missing syscalls upto Linux 4.4 and advertise a kernel version of 4.4. PiperOrigin-RevId: 220667680 Change-Id: Idbdccde538faabf16debc22f492dd053a8af0ba7 --- pkg/sentry/fs/inode.go | 5 +---- pkg/sentry/fs/proc/README.md | 2 +- pkg/sentry/fs/proc/fds.go | 5 ----- pkg/sentry/fs/proc/task.go | 2 +- pkg/sentry/kernel/auth/user_namespace.go | 5 +---- pkg/sentry/kernel/version.go | 2 +- pkg/sentry/syscalls/linux/linux64.go | 21 +++++++++++++++++---- pkg/sentry/syscalls/linux/sys_file.go | 3 --- pkg/sentry/syscalls/linux/sys_thread.go | 4 ++-- 9 files changed, 24 insertions(+), 25 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 95769ccf8..38b140bd2 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -439,10 +439,7 @@ func (i *Inode) CheckOwnership(ctx context.Context) bool { // CheckCapability checks whether `ctx` has capability `cp` with respect to // operations on this Inode. // -// Compare Linux's kernel/capability.c:capable_wrt_inode_uidgid(). Note that -// this function didn't exist in Linux 3.11.10, but was added by upstream -// 23adbe12ef7d "fs,userns: Change inode_capable to capable_wrt_inode_uidgid" -// to fix local privilege escalation CVE-2014-4014. +// Compare Linux's kernel/capability.c:capable_wrt_inode_uidgid(). 
func (i *Inode) CheckCapability(ctx context.Context, cp linux.Capability) bool { uattr, err := i.UnstableAttr(ctx) if err != nil { diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index e1ed88512..686d40f0c 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -223,7 +223,7 @@ Number of seconds idle | Always zero ```bash $ cat /proc/version -Linux version 3.11.10 #1 SMP Fri Nov 29 10:47:50 PST 2013 +Linux version 4.4 #1 SMP Sun Jan 10 15:06:54 PST 2016 ``` ## Process-specific data diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 5ebb33703..5acbce75e 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -173,11 +173,6 @@ func (f *fdDir) Check(ctx context.Context, inode *fs.Inode, req fs.PermMask) boo if t := kernel.TaskFromContext(ctx); t != nil { // Allow access if the task trying to access it is in the // thread group corresponding to this directory. - // - // N.B. Technically, in Linux 3.11, this compares what would be - // the equivalent of task pointers. However, this was fixed - // later in 54708d2858e7 ("proc: actually make - // proc_fd_permission() thread-friendly"). if f.t.ThreadGroup() == t.ThreadGroup() { return true } diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 404faea0a..9f13ff91c 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -76,7 +76,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "gid_map": newGIDMap(t, msrc), // TODO: This is incorrect for /proc/[pid]/task/[tid]/io, i.e. 
if // showSubtasks is false: - // http://lxr.free-electrons.com/source/fs/proc/base.c?v=3.11#L2980 + // https://elixir.bootlin.com/linux/v4.4/source/fs/proc/base.c#L3154 "io": newIO(t, msrc), "maps": newMaps(t, msrc), "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 5bb9c44c0..30957bb9a 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -49,10 +49,7 @@ type UserNamespace struct { gidMapFromParent idMapSet gidMapToParent idMapSet - // TODO: Consider supporting disabling setgroups(2), which "was - // added in Linux 3.19, but was backported to many earlier stable kernel - // series, because it addresses a security issue" - user_namespaces(7). (It - // was not backported to 3.11.10, which we are currently imitating.) + // TODO: Support disabling setgroups(2). } // NewRootUserNamespace returns a UserNamespace that is appropriate for a diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go index 72bb0f93c..8d2f14209 100644 --- a/pkg/sentry/kernel/version.go +++ b/pkg/sentry/kernel/version.go @@ -19,7 +19,7 @@ type Version struct { // Operating system name (e.g. "Linux"). Sysname string - // Operating system release (e.g. "3.11.10-amd64"). + // Operating system release (e.g. "4.4-amd64"). Release string // Operating system version. On Linux this takes the shape diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 11bf81f88..13084c0ef 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -32,15 +32,19 @@ import ( const _AUDIT_ARCH_X86_64 = 0xc000003e // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall -// numbers from Linux 3.11. The entries commented out are those syscalls we +// numbers from Linux 4.4. 
The entries commented out are those syscalls we // don't currently support. var AMD64 = &kernel.SyscallTable{ OS: abi.Linux, Arch: arch.AMD64, Version: kernel.Version{ + // Version 4.4 is chosen as a stable, longterm version of Linux, which + // guides the interface provided by this syscall table. The build + // version is that for a clean build with default kernel config, at 5 + // minutes after v4.4 was tagged. Sysname: "Linux", - Release: "3.11.10", - Version: "#1 SMP Fri Nov 29 10:47:50 PST 2013", + Release: "4.4", + Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016", }, AuditNumber: _AUDIT_ARCH_X86_64, Table: map[uintptr]kernel.SyscallFn{ @@ -358,9 +362,18 @@ var AMD64 = &kernel.SyscallTable{ // 311: ProcessVmWritev, TODO may require cap_sys_ptrace 312: syscalls.CapError(linux.CAP_SYS_PTRACE), // Kcmp, requires cap_sys_ptrace 313: syscalls.CapError(linux.CAP_SYS_MODULE), // FinitModule, requires cap_sys_module - // "Backports." + // 314: SchedSetattr, TODO, we have no scheduler + // 315: SchedGetattr, TODO, we have no scheduler + // 316: Renameat2, TODO 317: Seccomp, 318: GetRandom, + // 319: MemfdCreate, TODO + 320: syscalls.CapError(linux.CAP_SYS_BOOT), // KexecFileLoad, infeasible to support + 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // Bpf, requires cap_sys_admin for all commands + // 322: Execveat, TODO + // 323: Userfaultfd, TODO + // 324: Membarrier, TODO + 325: syscalls.Error(nil), // Mlock2, TODO }, Emulate: map[usermem.Addr]uintptr{ diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index a70f35be0..89d21dd98 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1140,9 +1140,6 @@ func mayLinkAt(t *kernel.Task, target *fs.Inode) error { // always enabled, and thus imposes the following restrictions on hard // links. 
- // Technically Linux is more restrictive in 3.11.10 (requires CAP_FOWNER in - // root user namespace); this is from the later f2ca379642d7 "namei: permit - // linking with CAP_FOWNER in userns". if target.CheckOwnership(t) { // fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER) // can hardlink all they like." diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 820ca680e..9eed613a1 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -159,8 +159,8 @@ func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr } // Clone implements linux syscall clone(2). -// sys_clone has so many flavors. We implement the default one in the -// current linux 3.11 x86_64: +// sys_clone has so many flavors. We implement the default one in linux 3.11 +// x86_64: // sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val) func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := int(args[0].Int()) -- cgit v1.2.3 From 2ef122da35899591737adca296b499246b877532 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 8 Nov 2018 17:38:50 -0800 Subject: Implement sync_file_range() sync_file_range - sync a file segment with disk In Linux, sync_file_range() accepts three flags: SYNC_FILE_RANGE_WAIT_BEFORE Wait upon write-out of all pages in the specified range that have already been submitted to the device driver for write-out before performing any write. SYNC_FILE_RANGE_WRITE Initiate write-out of all dirty pages in the specified range which are not presently submitted write-out. Note that even this may block if you attempt to write more than request queue size. SYNC_FILE_RANGE_WAIT_AFTER Wait upon write-out of all pages in the range after performing any write. In this implementation: SYNC_FILE_RANGE_WAIT_BEFORE without SYNC_FILE_RANGE_WAIT_AFTER isn't supported right now. SYNC_FILE_RANGE_WRITE is skipped. 
It should initiate write-out of all dirty pages, but it doesn't wait, so it should be safe to do nothing while nobody uses SYNC_FILE_RANGE_WAIT_BEFORE. SYNC_FILE_RANGE_WAIT_AFTER is equal to fdatasync(). In Linux, sync_file_range() doesn't writes out the file's meta-data, but fdatasync() does if a file size is changed. PiperOrigin-RevId: 220730840 Change-Id: Iae5dfb23c2c916967d67cf1a1ad32f25eb3f6286 --- pkg/abi/linux/fs.go | 7 ++++ pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_sync.go | 63 +++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 7817bfb52..0b1c9f3db 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -73,3 +73,10 @@ type Statfs struct { // Spare is unused. Spare [4]uint64 } + +// Sync_file_range flags, from include/uapi/linux/fs.h +const ( + SYNC_FILE_RANGE_WAIT_BEFORE = 1 + SYNC_FILE_RANGE_WRITE = 2 + SYNC_FILE_RANGE_WAIT_AFTER = 4 +) diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 13084c0ef..9912ab2b5 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -325,7 +325,7 @@ var AMD64 = &kernel.SyscallTable{ 274: syscalls.Error(syscall.ENOSYS), // GetRobustList, obsolete // 275: Splice, TODO // 276: Tee, TODO - // 277: SyncFileRange, TODO + 277: SyncFileRange, // 278: Vmsplice, TODO 279: syscalls.CapError(linux.CAP_SYS_NICE), // MovePages, requires cap_sys_nice (mostly) 280: Utimensat, diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 826c6869d..68488330f 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -15,6 +15,7 @@ package linux import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" 
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -73,3 +74,65 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) } + +// SyncFileRange implements linux syscall sync_file_rage(2) +func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + var err error + + offset := args[1].Int64() + nbytes := args[2].Int64() + uflags := args[3].Uint() + + if offset < 0 || offset+nbytes < offset { + return 0, nil, syserror.EINVAL + } + + if uflags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE| + linux.SYNC_FILE_RANGE_WRITE| + linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { + return 0, nil, syserror.EINVAL + } + + if nbytes == 0 { + nbytes = fs.FileMaxOffset + } + + fd := kdefs.FD(args[0].Int()) + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the + // specified range that have already been submitted to the device + // driver for write-out before performing any write. + if uflags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && + uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + // SYNC_FILE_RANGE_WRITE initiates write-out of all dirty pages in the + // specified range which are not presently submitted write-out. + // + // It looks impossible to implement this functionality without a + // massive rework of the vfs subsystem. file.Fsync() take a file lock + // for the entire operation, so even if it is running in a go routing, + // it blocks other file operations instead of flushing data in the + // background. + // + // It should be safe to skipped this flag while nobody uses + // SYNC_FILE_RANGE_WAIT_BEFORE. 
+ + // SYNC_FILE_RANGE_WAIT_AFTER waits upon write-out of all pages in the + // range after performing any write. + // + // In Linux, sync_file_range() doesn't writes out the file's + // meta-data, but fdatasync() does if a file size is changed. + if uflags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { + err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData) + } + + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} -- cgit v1.2.3 From 25d07fbbed0de1e7173f0becb577f5481f98bed8 Mon Sep 17 00:00:00 2001 From: Googler Date: Mon, 12 Nov 2018 17:43:43 -0800 Subject: Internal change. PiperOrigin-RevId: 221189534 Change-Id: Id20d318bed97d5226b454c9351df396d11251e1f --- pkg/sentry/fs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index 7680187f4..a88a0cd3a 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -72,7 +72,7 @@ Specifically this state is: - A `kernel.FDMap` containing pointers to open files. Anything else managed by the VFS that can be easily loaded into memory from a -filesystem is synced back to those filesystems and is no saved. Examples are +filesystem is synced back to those filesystems and is not saved. Examples are pages in page caches used for optimizations (i.e. readahead and writeback), and directory entries used to accelerate path lookups. -- cgit v1.2.3 From 7f60294a7367ee62cc5e0bd21648a68184c4ca5e Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Tue, 13 Nov 2018 18:01:26 -0800 Subject: Implement TCP_NODELAY and TCP_CORK Previously, TCP_NODELAY was always enabled and we would lie about it being configurable. TCP_NODELAY is now disabled by default (to match Linux) in the socket layer so that non-gVisor users don't automatically start using this questionable optimization. 
PiperOrigin-RevId: 221368472 Change-Id: Ib0240f66d94455081f4e0ca94f09d9338b2c1356 --- pkg/sentry/socket/epsocket/epsocket.go | 53 ++++++++++--- pkg/sentry/socket/epsocket/provider.go | 2 +- pkg/tcpip/tcpip.go | 8 +- pkg/tcpip/transport/tcp/endpoint.go | 47 ++++++++--- pkg/tcpip/transport/tcp/snd.go | 40 +++++++++- pkg/tcpip/transport/tcp/tcp_test.go | 137 +++++++++++++++++++++++++-------- 6 files changed, 230 insertions(+), 57 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 39a0b9941..d14bbad01 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -157,7 +157,13 @@ type SocketOperations struct { } // New creates a new endpoint socket. -func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) *fs.File { +func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { + if skType == transport.SockStream { + if err := endpoint.SetSockOpt(tcpip.DelayOption(1)); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + } + dirent := socket.NewDirent(t, epsocketDevice) defer dirent.DecRef() return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true}, &SocketOperations{ @@ -165,7 +171,7 @@ func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Qu family: family, Endpoint: endpoint, skType: skType, - }) + }), nil } var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{})) @@ -426,10 +432,10 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *wait // tcpip.Endpoint. func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. 
- ep, wq, err := s.Endpoint.Accept() - if err != nil { - if err != tcpip.ErrWouldBlock || !blocking { - return 0, nil, 0, syserr.TranslateNetstackError(err) + ep, wq, terr := s.Endpoint.Accept() + if terr != nil { + if terr != tcpip.ErrWouldBlock || !blocking { + return 0, nil, 0, syserr.TranslateNetstackError(terr) } var err *syserr.Error @@ -439,7 +445,10 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - ns := New(t, s.family, s.skType, wq, ep) + ns, err := New(t, s.family, s.skType, wq, ep) + if err != nil { + return 0, nil, 0, err + } defer ns.DecRef() if flags&linux.SOCK_NONBLOCK != 0 { @@ -632,7 +641,22 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, return nil, syserr.ErrInvalidArgument } - var v tcpip.NoDelayOption + var v tcpip.DelayOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if v == 0 { + return int32(1), nil + } + return int32(0), nil + + case syscall.TCP_CORK: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.CorkOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -748,7 +772,18 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.NoDelayOption(v))) + var o tcpip.DelayOption + if v == 0 { + o = 1 + } + return syserr.TranslateNetstackError(ep.SetSockOpt(o)) + case syscall.TCP_CORK: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v))) } case syscall.SOL_IPV6: switch name { diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 686554437..0184d8e3e 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ 
b/pkg/sentry/socket/epsocket/provider.go @@ -88,7 +88,7 @@ func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int return nil, syserr.TranslateNetstackError(e) } - return New(t, p.family, stype, wq, ep), nil + return New(t, p.family, stype, wq, ep) } // Pair just returns nil sockets (not supported). diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 413aee6c6..8e2fe70ee 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -420,10 +420,14 @@ type ReceiveQueueSizeOption int // socket is to be restricted to sending and receiving IPv6 packets only. type V6OnlyOption int -// NoDelayOption is used by SetSockOpt/GetSockOpt to specify if data should be +// DelayOption is used by SetSockOpt/GetSockOpt to specify if data should be // sent out immediately by the transport protocol. For TCP, it determines if the // Nagle algorithm is on or off. -type NoDelayOption int +type DelayOption int + +// CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be +// held until segments are full by the TCP transport protocol. +type CorkOption int // ReuseAddressOption is used by SetSockOpt/GetSockOpt to specify whether Bind() // should allow reuse of local address. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 0b395b5b0..96a546aa7 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -162,10 +162,19 @@ type endpoint struct { // sack holds TCP SACK related information for this endpoint. sack SACKInfo + // delay enables Nagle's algorithm. + // + // delay is a boolean (0 is false) and must be accessed atomically. + delay uint32 + + // cork holds back segments until full. + // + // cork is a boolean (0 is false) and must be accessed atomically. + cork uint32 + // The options below aren't implemented, but we remember the user // settings because applications expect to be able to set/query these // options. 
- noDelay bool reuseAddr bool // segmentQueue is used to hand received segments to the protocol @@ -276,7 +285,6 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite rcvBufSize: DefaultBufferSize, sndBufSize: DefaultBufferSize, sndMTU: int(math.MaxInt32), - noDelay: false, reuseAddr: true, keepalive: keepalive{ // Linux defaults. @@ -643,10 +651,24 @@ func (e *endpoint) zeroReceiveWindow(scale uint8) bool { // SetSockOpt sets a socket option. func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { switch v := opt.(type) { - case tcpip.NoDelayOption: - e.mu.Lock() - e.noDelay = v != 0 - e.mu.Unlock() + case tcpip.DelayOption: + if v == 0 { + atomic.StoreUint32(&e.delay, 0) + } else { + atomic.StoreUint32(&e.delay, 1) + } + return nil + + case tcpip.CorkOption: + if v == 0 { + atomic.StoreUint32(&e.cork, 0) + } else { + atomic.StoreUint32(&e.cork, 1) + } + + // Handle the corked data. + e.sndWaker.Assert() + return nil case tcpip.ReuseAddressOption: @@ -812,13 +834,16 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = tcpip.ReceiveQueueSizeOption(v) return nil - case *tcpip.NoDelayOption: - e.mu.RLock() - v := e.noDelay - e.mu.RUnlock() + case *tcpip.DelayOption: + *o = 0 + if v := atomic.LoadUint32(&e.delay); v != 0 { + *o = 1 + } + return nil + case *tcpip.CorkOption: *o = 0 - if v { + if v := atomic.LoadUint32(&e.cork); v != 0 { *o = 1 } return nil diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 4482d8d07..f6dc7520b 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -17,6 +17,7 @@ package tcp import ( "math" "sync" + "sync/atomic" "time" "gvisor.googlesource.com/gvisor/pkg/sleep" @@ -409,8 +410,6 @@ func (s *sender) sendData() { // We abuse the flags field to determine if we have already // assigned a sequence number to this segment. 
if seg.flags == 0 { - seg.sequenceNumber = s.sndNxt - seg.flags = flagAck | flagPsh // Merge segments if allowed. if seg.data.Size() != 0 { available := int(seg.sequenceNumber.Size(end)) @@ -418,8 +417,20 @@ func (s *sender) sendData() { available = limit } + // nextTooBig indicates that the next segment was too + // large to entirely fit in the current segment. It would + // be possible to split the next segment and merge the + // portion that fits, but unexpectedly splitting segments + // can have user visible side-effects which can break + // applications. For example, RFC 7766 section 8 says + // that the length and data of a DNS response should be + // sent in the same TCP segment to avoid triggering bugs + // in poorly written DNS implementations. + var nextTooBig bool + for next != nil && next.data.Size() != 0 { if seg.data.Size()+next.data.Size() > available { + nextTooBig = true break } @@ -429,7 +440,32 @@ func (s *sender) sendData() { s.writeList.Remove(next) next = next.Next() } + + if !nextTooBig && seg.data.Size() < available { + // Segment is not full. + if s.outstanding > 0 && atomic.LoadUint32(&s.ep.delay) != 0 { + // Nagle's algorithm. From Wikipedia: + // Nagle's algorithm works by combining a number of + // small outgoing messages and sending them all at + // once. Specifically, as long as there is a sent + // packet for which the sender has received no + // acknowledgment, the sender should keep buffering + // its output until it has a full packet's worth of + // output, thus allowing output to be sent all at + // once. + break + } + if atomic.LoadUint32(&s.ep.cork) != 0 { + // Hold back the segment until full. + break + } + } } + + // Assign flags. We don't do it above so that we can merge + // additional data if Nagle holds the segment. 
+ seg.sequenceNumber = s.sndNxt + seg.flags = flagAck | flagPsh } var segEnd seqnum.Value diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 75868c4a2..8155e4ed8 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -1255,20 +1255,92 @@ func TestZeroScaledWindowReceive(t *testing.T) { } func TestSegmentMerging(t *testing.T) { + tests := []struct { + name string + stop func(tcpip.Endpoint) + resume func(tcpip.Endpoint) + }{ + { + "stop work", + func(ep tcpip.Endpoint) { + ep.(interface{ StopWork() }).StopWork() + }, + func(ep tcpip.Endpoint) { + ep.(interface{ ResumeWork() }).ResumeWork() + }, + }, + { + "cork", + func(ep tcpip.Endpoint) { + ep.SetSockOpt(tcpip.CorkOption(1)) + }, + func(ep tcpip.Endpoint) { + ep.SetSockOpt(tcpip.CorkOption(0)) + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.CreateConnected(789, 30000, nil) + + // Prevent the endpoint from processing packets. + test.stop(c.EP) + + var allData []byte + for i, data := range [][]byte{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} { + allData = append(allData, data...) + view := buffer.NewViewFromBytes(data) + if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write #%d failed: %v", i+1, err) + } + } + + // Let the endpoint process the segments that we just sent. + test.resume(c.EP) + + // Check that data is received. + b := c.GetPacket() + checker.IPv4(t, b, + checker.PayloadLen(len(allData)+header.TCPMinimumSize), + checker.TCP( + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(c.IRS)+1), + checker.AckNum(790), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + + if got := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(got, allData) { + t.Fatalf("got data = %v, want = %v", got, allData) + } + + // Acknowledge the data. 
+ c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: 790, + AckNum: c.IRS.Add(1 + seqnum.Size(len(allData))), + RcvWnd: 30000, + }) + }) + } +} + +func TestDelay(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() c.CreateConnected(789, 30000, nil) - // Prevent the endpoint from processing packets. - worker := c.EP.(interface { - StopWork() - ResumeWork() - }) - worker.StopWork() + c.EP.SetSockOpt(tcpip.DelayOption(1)) var allData []byte - for i, data := range [][]byte{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} { + for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} { allData = append(allData, data...) view := buffer.NewViewFromBytes(data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { @@ -1276,34 +1348,35 @@ func TestSegmentMerging(t *testing.T) { } } - // Let the endpoint process the segments that we just sent. - worker.ResumeWork() + seq := c.IRS.Add(1) + for _, want := range [][]byte{allData[:1], allData[1:]} { + // Check that data is received. + b := c.GetPacket() + checker.IPv4(t, b, + checker.PayloadLen(len(want)+header.TCPMinimumSize), + checker.TCP( + checker.DstPort(context.TestPort), + checker.SeqNum(uint32(seq)), + checker.AckNum(790), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) - // Check that data is received. 
- b := c.GetPacket() - checker.IPv4(t, b, - checker.PayloadLen(len(allData)+header.TCPMinimumSize), - checker.TCP( - checker.DstPort(context.TestPort), - checker.SeqNum(uint32(c.IRS)+1), - checker.AckNum(790), - checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), - ), - ) + if got := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(got, want) { + t.Fatalf("got data = %v, want = %v", got, want) + } - if got := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(got, allData) { - t.Fatalf("got data = %v, want = %v", got, allData) + seq = seq.Add(seqnum.Size(len(want))) + // Acknowledge the data. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: 790, + AckNum: seq, + RcvWnd: 30000, + }) } - - // Acknowledge the data. - c.SendPacket(nil, &context.Headers{ - SrcPort: context.TestPort, - DstPort: c.Port, - Flags: header.TCPFlagAck, - SeqNum: 790, - AckNum: c.IRS.Add(1 + seqnum.Size(len(allData))), - RcvWnd: 30000, - }) } func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) { -- cgit v1.2.3 From 6ef08c2bc2be1cc93bdf42bba5b96a0968a94552 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 15 Nov 2018 13:48:04 -0800 Subject: Allow setting sticky bit in tmpfs permissions. PiperOrigin-RevId: 221683127 Change-Id: Ide6a9f41d75aa19d0e2051a05a1e4a114a4fb93c --- pkg/sentry/fs/tmpfs/fs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 453ed5bd9..2e57f2b42 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -44,7 +44,7 @@ const ( ) // modeRegexp is the expected format of the mode option. -var modeRegexp = regexp.MustCompile("^0?[0-7][0-7][0-7]$") +var modeRegexp = regexp.MustCompile("^[0-1]?[0-7][0-7][0-7]$") // Filesystem is a tmpfs. 
// -- cgit v1.2.3 From f7aa9371247a3e7d8c490ac0fd4c4f3ff6de2017 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 15 Nov 2018 15:13:52 -0800 Subject: Advertise vsyscall support via /proc//maps. Also update test utilities for probing vsyscall support and add a metric to see if vsyscalls are actually used in sandboxes. PiperOrigin-RevId: 221698834 Change-Id: I57870ecc33ea8c864bd7437833f21aa1e8117477 --- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/task_syscall.go | 5 +++++ pkg/sentry/mm/proc_pid_maps.go | 16 ++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 389824b25..10d7b97c2 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -131,6 +131,7 @@ go_library( "//pkg/cpuid", "//pkg/eventchannel", "//pkg/log", + "//pkg/metric", "//pkg/refs", "//pkg/secio", "//pkg/sentry/arch", diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 0318adb35..2a39ebc68 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -60,6 +61,8 @@ const ( ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) ) +var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application") + // Error implements error.Error. func (e SyscallRestartErrno) Error() string { // Descriptions are borrowed from strace. @@ -325,6 +328,8 @@ func (*runSyscallExit) execute(t *Task) taskRunState { // indicated by an execution fault at address addr. doVsyscall returns the // task's next run state. 
func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { + vsyscallCount.Increment() + // Grab the caller up front, to make sure there's a sensible stack. caller := t.Arch().Native(uintptr(0)) if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { diff --git a/pkg/sentry/mm/proc_pid_maps.go b/pkg/sentry/mm/proc_pid_maps.go index 0bf1cdb51..247ee45ef 100644 --- a/pkg/sentry/mm/proc_pid_maps.go +++ b/pkg/sentry/mm/proc_pid_maps.go @@ -53,6 +53,22 @@ func (mm *MemoryManager) ReadSeqFileData(ctx context.Context, handle seqfile.Seq Handle: &vmaAddr, }) } + + // We always emulate vsyscall, so advertise it here. Everything about a + // vsyscall region is static, so just hard code the maps entry since we + // don't have a real vma backing it. The vsyscall region is at the end of + // the virtual address space so nothing should be mapped after it (if + // something is really mapped in the tiny ~10 MiB segment afterwards, we'll + // get the sorting on the maps file wrong at worst; but that's not possible + // on any current platform). + // + // Artifically adjust the seqfile handle so we only output vsyscall entry once. + if vsyscallEnd := usermem.Addr(0xffffffffff601000); start != vsyscallEnd { + data = append(data, seqfile.SeqData{ + Buf: []byte("ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"), + Handle: &vsyscallEnd, + }) + } return data, 1 } -- cgit v1.2.3 From bb9a2bb62ed37f9b29c7ab4418b8b90417d1b2a2 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 16 Nov 2018 12:16:37 -0800 Subject: Update futex to use usermem abstractions. This eliminates the indirection that existed in task_futex. 
PiperOrigin-RevId: 221832498 Change-Id: Ifb4c926d493913aa6694e193deae91616a29f042 --- pkg/sentry/kernel/futex/BUILD | 3 + pkg/sentry/kernel/futex/futex.go | 155 ++++++++++++++++++++++++--------- pkg/sentry/kernel/futex/futex_test.go | 29 +++--- pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/kernel/task_futex.go | 127 +++------------------------ pkg/sentry/mm/syscalls.go | 5 +- pkg/sentry/syscalls/linux/sys_futex.go | 28 +++--- 7 files changed, 160 insertions(+), 189 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index e13fcb5ff..afd35985f 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -36,7 +36,9 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/sentry/memmap", + "//pkg/sentry/usermem", "//pkg/syserror", ], ) @@ -46,4 +48,5 @@ go_test( size = "small", srcs = ["futex_test.go"], embed = [":futex"], + deps = ["//pkg/sentry/usermem"], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index ea69d433b..b3e628fd4 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -20,7 +20,9 @@ package futex import ( "sync" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -81,8 +83,8 @@ func (k *Key) clone() Key { } // Preconditions: k.Kind == KindPrivate or KindSharedPrivate. -func (k *Key) addr() uintptr { - return uintptr(k.Offset) +func (k *Key) addr() usermem.Addr { + return usermem.Addr(k.Offset) } // matches returns true if a wakeup on k2 should wake a waiter waiting on k. @@ -91,23 +93,13 @@ func (k *Key) matches(k2 *Key) bool { return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset } -// Checker abstracts memory accesses. 
This is useful because the "addresses" -// used in this package may not be real addresses (they could be indices of an -// array, for example), or they could be mapped via some special mechanism. -// -// TODO: Replace this with usermem.IO. -type Checker interface { - // Check should validate that given address contains the given value. - // If it does not contain the value, syserror.EAGAIN must be returned. - // Any other error may be returned, which will be propagated. - Check(addr uintptr, val uint32) error - - // Op should atomically perform the operation encoded in op on the data - // pointed to by addr, then apply the comparison encoded in op to the - // original value at addr, returning the result. - // Note that op is an opaque operation whose behaviour is defined - // outside of the futex manager. - Op(addr uintptr, op uint32) (bool, error) +// Target abstracts memory accesses and keys. +type Target interface { + // SwapUint32 gives access to usermem.SwapUint32. + SwapUint32(addr usermem.Addr, new uint32) (uint32, error) + + // CompareAndSwap gives access to usermem.CompareAndSwapUint32. + CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) // GetSharedKey returns a Key with kind KindSharedPrivate or // KindSharedMappable corresponding to the memory mapped at address addr. @@ -115,7 +107,84 @@ type Checker interface { // If GetSharedKey returns a Key with a non-nil MappingIdentity, a // reference is held on the MappingIdentity, which must be dropped by the // caller when the Key is no longer in use. - GetSharedKey(addr uintptr) (Key, error) + GetSharedKey(addr usermem.Addr) (Key, error) +} + +// check performs a basic equality check on the given address. +func check(t Target, addr usermem.Addr, val uint32) error { + prev, err := t.CompareAndSwapUint32(addr, val, val) + if err != nil { + return err + } + if prev != val { + return syserror.EAGAIN + } + return nil +} + +// atomicOp performs a complex operation on the given address. 
+func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { + opType := (opIn >> 28) & 0xf + cmp := (opIn >> 24) & 0xf + opArg := (opIn >> 12) & 0xfff + cmpArg := opIn & 0xfff + + if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 { + opArg = 1 << opArg + opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag. + } + + var ( + oldVal uint32 + err error + ) + if opType == linux.FUTEX_OP_SET { + oldVal, err = t.SwapUint32(addr, opArg) + } else { + for { + oldVal, err = t.CompareAndSwapUint32(addr, 0, 0) + if err != nil { + break + } + var newVal uint32 + switch opType { + case linux.FUTEX_OP_ADD: + newVal = oldVal + opArg + case linux.FUTEX_OP_OR: + newVal = oldVal | opArg + case linux.FUTEX_OP_ANDN: + newVal = oldVal &^ opArg + case linux.FUTEX_OP_XOR: + newVal = oldVal ^ opArg + default: + return false, syserror.ENOSYS + } + prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) + if err != nil { + break + } + if prev == oldVal { + break // Success. + } + } + } + + switch cmp { + case linux.FUTEX_OP_CMP_EQ: + return oldVal == cmpArg, nil + case linux.FUTEX_OP_CMP_NE: + return oldVal != cmpArg, nil + case linux.FUTEX_OP_CMP_LT: + return oldVal < cmpArg, nil + case linux.FUTEX_OP_CMP_LE: + return oldVal <= cmpArg, nil + case linux.FUTEX_OP_CMP_GT: + return oldVal > cmpArg, nil + case linux.FUTEX_OP_CMP_GE: + return oldVal >= cmpArg, nil + default: + return false, syserror.ENOSYS + } } // Waiter is the struct which gets enqueued into buckets for wake up routines @@ -243,7 +312,7 @@ const ( ) // getKey returns a Key representing address addr in c. -func getKey(c Checker, addr uintptr, private bool) (Key, error) { +func getKey(t Target, addr usermem.Addr, private bool) (Key, error) { // Ensure the address is aligned. // It must be a DWORD boundary. 
if addr&0x3 != 0 { @@ -252,11 +321,11 @@ func getKey(c Checker, addr uintptr, private bool) (Key, error) { if private { return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil } - return c.GetSharedKey(addr) + return t.GetSharedKey(addr) } // bucketIndexForAddr returns the index into Manager.buckets for addr. -func bucketIndexForAddr(addr uintptr) uintptr { +func bucketIndexForAddr(addr usermem.Addr) uintptr { // - The bottom 2 bits of addr must be 0, per getKey. // // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 @@ -277,8 +346,8 @@ func bucketIndexForAddr(addr uintptr) uintptr { // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + // (addr >> 42)" without any additional grouping, the compiler puts all 4 // additions in the critical path. - h1 := (addr >> 2) + (addr >> 12) + (addr >> 22) - h2 := (addr >> 32) + (addr >> 42) + h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22) + h2 := uintptr(addr>>32) + uintptr(addr>>42) return (h1 + h2) % bucketCount } @@ -363,9 +432,9 @@ func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { // Wake wakes up to n waiters matching the bitmask on the given addr. // The number of waiters woken is returned. -func (m *Manager) Wake(c Checker, addr uintptr, private bool, bitmask uint32, n int) (int, error) { +func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) { // This function is very hot; avoid defer. 
- k, err := getKey(c, addr, private) + k, err := getKey(t, addr, private) if err != nil { return 0, err } @@ -378,13 +447,13 @@ func (m *Manager) Wake(c Checker, addr uintptr, private bool, bitmask uint32, n return r, nil } -func (m *Manager) doRequeue(c Checker, addr, naddr uintptr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { - k1, err := getKey(c, addr, private) +func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { + k1, err := getKey(t, addr, private) if err != nil { return 0, err } defer k1.release() - k2, err := getKey(c, naddr, private) + k2, err := getKey(t, naddr, private) if err != nil { return 0, err } @@ -397,7 +466,7 @@ func (m *Manager) doRequeue(c Checker, addr, naddr uintptr, private bool, checkv } if checkval { - if err := c.Check(addr, val); err != nil { + if err := check(t, addr, val); err != nil { return 0, err } } @@ -413,28 +482,28 @@ func (m *Manager) doRequeue(c Checker, addr, naddr uintptr, private bool, checkv // Requeue wakes up to nwake waiters on the given addr, and unconditionally // requeues up to nreq waiters on naddr. -func (m *Manager) Requeue(c Checker, addr, naddr uintptr, private bool, nwake int, nreq int) (int, error) { - return m.doRequeue(c, addr, naddr, private, false, 0, nwake, nreq) +func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) { + return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq) } -// RequeueCmp atomically checks that the addr contains val (via the Checker), +// RequeueCmp atomically checks that the addr contains val (via the Target), // wakes up to nwake waiters on addr and then unconditionally requeues nreq // waiters on naddr. 
-func (m *Manager) RequeueCmp(c Checker, addr, naddr uintptr, private bool, val uint32, nwake int, nreq int) (int, error) { - return m.doRequeue(c, addr, naddr, private, true, val, nwake, nreq) +func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { + return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq) } // WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 // waiters unconditionally from addr1, and, based on the original value at addr2 // and a comparison encoded in op, wakes up to nwake2 waiters from addr2. // It returns the total number of waiters woken. -func (m *Manager) WakeOp(c Checker, addr1, addr2 uintptr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { - k1, err := getKey(c, addr1, private) +func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { + k1, err := getKey(t, addr1, private) if err != nil { return 0, err } defer k1.release() - k2, err := getKey(c, addr2, private) + k2, err := getKey(t, addr2, private) if err != nil { return 0, err } @@ -447,7 +516,7 @@ func (m *Manager) WakeOp(c Checker, addr1, addr2 uintptr, private bool, nwake1 i } done := 0 - cond, err := c.Op(addr2, op) + cond, err := atomicOp(t, addr2, op) if err != nil { return 0, err } @@ -468,8 +537,8 @@ func (m *Manager) WakeOp(c Checker, addr1, addr2 uintptr, private bool, nwake1 i // enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the // Waiter must be subsequently removed by calling WaitComplete, whether or not // a wakeup is received on w.C. 
-func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, private bool, val uint32, bitmask uint32) error { - k, err := getKey(c, addr, private) +func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error { + k, err := getKey(t, addr, private) if err != nil { return err } @@ -487,7 +556,7 @@ func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, private bool, // This function is very hot; avoid defer. // Perform our atomic check. - if err := c.Check(addr, val); err != nil { + if err := check(t, addr, val); err != nil { b.mu.Unlock() w.key.release() return err diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index ea506a29b..a7ab9f229 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -22,9 +22,11 @@ import ( "syscall" "testing" "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// testData implements the Checker interface, and allows us to +// testData implements the Target interface, and allows us to // treat the address passed for futex operations as an index in // a byte slice for testing simplicity. 
type testData []byte @@ -35,18 +37,19 @@ func newTestData(size uint) testData { return make([]byte, size) } -func (t testData) Check(addr uintptr, val uint32) error { - if val != atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))) { - return syscall.EAGAIN - } - return nil +func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { + val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new) + return val, nil } -func (t testData) Op(addr uintptr, val uint32) (bool, error) { - return val == 0, nil +func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { + if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) { + return old, nil + } + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil } -func (t testData) GetSharedKey(addr uintptr) (Key, error) { +func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { return Key{ Kind: KindSharedMappable, Offset: uint64(addr), @@ -60,9 +63,9 @@ func futexKind(private bool) string { return "shared" } -func newPreparedTestWaiter(t *testing.T, m *Manager, c Checker, addr uintptr, private bool, val uint32, bitmask uint32) *Waiter { +func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) *Waiter { w := NewWaiter() - if err := m.WaitPrepare(w, c, addr, private, val, bitmask); err != nil { + if err := m.WaitPrepare(w, ta, addr, private, val, bitmask); err != nil { t.Fatalf("WaitPrepare failed: %v", err) } return w @@ -450,12 +453,12 @@ const ( // Beyond being used as a Locker, this is a simple mechanism for // changing the underlying values for simpler tests. 
type testMutex struct { - a uintptr + a usermem.Addr d testData m *Manager } -func newTestMutex(addr uintptr, d testData, m *Manager) *testMutex { +func newTestMutex(addr usermem.Addr, d testData, m *Manager) *testMutex { return &testMutex{a: addr, d: d, m: m} } diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 44fbb487c..791cc9831 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -247,7 +247,7 @@ func (*runExitMain) execute(t *Task) taskRunState { t.tg.signalHandlers.mu.Unlock() if !signaled { if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { - t.Futex().Wake(t.FutexChecker(), uintptr(t.cleartid), false, ^uint32(0), 1) + t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1) } // If the CopyOut fails, there's nothing we can do. } diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 5a11ca3df..921f7bdbc 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -15,10 +15,8 @@ package kernel import ( - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" ) // Futex returns t's futex manager. @@ -29,120 +27,21 @@ func (t *Task) Futex() *futex.Manager { return t.tc.fu } -// FutexChecker returns a futex.Checker that interprets addresses in t's -// address space. -// -// Preconditions: All uses of the returned futex.Checker must be on the task -// goroutine. -func (t *Task) FutexChecker() futex.Checker { - return futexChecker{t} -} - -type futexChecker struct { - t *Task +// SwapUint32 implements futex.Target.SwapUint32. +func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { + return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{ + AddressSpaceActive: true, + }) } -// Check implements futex.Checker.Check. 
-func (f futexChecker) Check(addr uintptr, val uint32) error { - // FIXME - in := f.t.CopyScratchBuffer(4) - _, err := f.t.CopyInBytes(usermem.Addr(addr), in) - if err != nil { - return err - } - nval := usermem.ByteOrder.Uint32(in) - if val != nval { - return syserror.EAGAIN - } - return nil -} - -func (f futexChecker) atomicOp(addr uintptr, op func(uint32) uint32) (uint32, error) { - // FIXME - in := f.t.CopyScratchBuffer(4) - _, err := f.t.CopyInBytes(usermem.Addr(addr), in) - if err != nil { - return 0, err - } - o := usermem.ByteOrder.Uint32(in) - mm := f.t.MemoryManager() - for { - n := op(o) - r, err := mm.CompareAndSwapUint32(f.t, usermem.Addr(addr), o, n, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, err - } - - if r == o { - return o, nil - } - o = r - } -} - -// Op implements futex.Checker.Op, interpreting opIn consistently with Linux. -func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) { - op := (opIn >> 28) & 0xf - cmp := (opIn >> 24) & 0xf - opArg := (opIn >> 12) & 0xfff - cmpArg := opIn & 0xfff - - if op&linux.FUTEX_OP_OPARG_SHIFT != 0 { - opArg = 1 << opArg - op &^= linux.FUTEX_OP_OPARG_SHIFT // clear flag - } - - var oldVal uint32 - var err error - switch op { - case linux.FUTEX_OP_SET: - oldVal, err = f.t.MemoryManager().SwapUint32(f.t, usermem.Addr(addr), opArg, usermem.IOOpts{ - AddressSpaceActive: true, - }) - case linux.FUTEX_OP_ADD: - oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { - return a + opArg - }) - case linux.FUTEX_OP_OR: - oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { - return a | opArg - }) - case linux.FUTEX_OP_ANDN: - oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { - return a &^ opArg - }) - case linux.FUTEX_OP_XOR: - oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { - return a ^ opArg - }) - default: - return false, syserror.ENOSYS - } - if err != nil { - return false, err - } - - switch cmp { - case linux.FUTEX_OP_CMP_EQ: - return oldVal == cmpArg, nil 
- case linux.FUTEX_OP_CMP_NE: - return oldVal != cmpArg, nil - case linux.FUTEX_OP_CMP_LT: - return oldVal < cmpArg, nil - case linux.FUTEX_OP_CMP_LE: - return oldVal <= cmpArg, nil - case linux.FUTEX_OP_CMP_GT: - return oldVal > cmpArg, nil - case linux.FUTEX_OP_CMP_GE: - return oldVal >= cmpArg, nil - default: - return false, syserror.ENOSYS - } +// CompareAndSwapUint32 implemets futex.Target.CompareAndSwapUint32. +func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { + return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ + AddressSpaceActive: true, + }) } -// GetSharedKey implements futex.Checker.GetSharedKey. -func (f futexChecker) GetSharedKey(addr uintptr) (futex.Key, error) { - return f.t.MemoryManager().GetSharedFutexKey(f.t, usermem.Addr(addr)) +// GetSharedKey implements futex.Target.GetSharedKey. +func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { + return t.MemoryManager().GetSharedFutexKey(t, addr) } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index a721cc456..9519c7390 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -794,10 +794,9 @@ func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uin return nil } -// GetSharedFutexKey is used by kernel.futexChecker.GetSharedKey to implement -// futex.Checker.GetSharedKey. +// GetSharedFutexKey is used by kernel.Task.GetSharedKey. func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr usermem.Addr) (futex.Key, error) { - ar, ok := addr.ToRange(4) // sizeof(int32) + ar, ok := addr.ToRange(4) // sizeof(int32). 
if !ok { return futex.Key{}, syserror.EFAULT } diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index cf04428bc..7a1d396ec 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -32,8 +33,7 @@ type futexWaitRestartBlock struct { duration time.Duration // addr stored as uint64 since uintptr is not save-able. - addr uint64 - + addr uint64 private bool val uint32 mask uint32 @@ -41,7 +41,7 @@ type futexWaitRestartBlock struct { // Restart implements kernel.SyscallRestartBlock.Restart. func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { - return futexWaitDuration(t, f.duration, false, uintptr(f.addr), f.private, f.val, f.mask) + return futexWaitDuration(t, f.duration, false, usermem.Addr(f.addr), f.private, f.val, f.mask) } // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is @@ -51,9 +51,9 @@ func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { // // If blocking is interrupted, the syscall is restarted with the original // arguments. 
-func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr uintptr, private bool, val, mask uint32) (uintptr, error) { +func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr usermem.Addr, private bool, val, mask uint32) (uintptr, error) { w := t.FutexWaiter() - err := t.Futex().WaitPrepare(w, t.FutexChecker(), addr, private, val, mask) + err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) if err != nil { return 0, err } @@ -87,9 +87,9 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo // syscall. If forever is true, the syscall is restarted with the original // arguments. If forever is false, duration is a relative timeout and the // syscall is restarted with the remaining timeout. -func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr uintptr, private bool, val, mask uint32) (uintptr, error) { +func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr usermem.Addr, private bool, val, mask uint32) (uintptr, error) { w := t.FutexWaiter() - err := t.Futex().WaitPrepare(w, t.FutexChecker(), addr, private, val, mask) + err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) if err != nil { return 0, err } @@ -128,16 +128,14 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add // It provides a method for a program to wait for a value at a given address to // change, and a method to wake up anyone waiting on a particular address. 
func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - uaddr := args[0].Pointer() + addr := args[0].Pointer() futexOp := args[1].Int() val := int(args[2].Int()) nreq := int(args[3].Int()) timeout := args[3].Pointer() - uaddr2 := args[4].Pointer() + naddr := args[4].Pointer() val3 := args[5].Int() - addr := uintptr(uaddr) - naddr := uintptr(uaddr2) cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0 clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME @@ -188,23 +186,23 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if mask == 0 { return 0, nil, syserror.EINVAL } - n, err := t.Futex().Wake(t.FutexChecker(), addr, private, mask, val) + n, err := t.Futex().Wake(t, addr, private, mask, val) return uintptr(n), nil, err case linux.FUTEX_REQUEUE: - n, err := t.Futex().Requeue(t.FutexChecker(), addr, naddr, private, val, nreq) + n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq) return uintptr(n), nil, err case linux.FUTEX_CMP_REQUEUE: // 'val3' contains the value to be checked at 'addr' and // 'val' is the number of waiters that should be woken up. 
nval := uint32(val3) - n, err := t.Futex().RequeueCmp(t.FutexChecker(), addr, naddr, private, nval, val, nreq) + n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq) return uintptr(n), nil, err case linux.FUTEX_WAKE_OP: op := uint32(val3) - n, err := t.Futex().WakeOp(t.FutexChecker(), addr, naddr, private, val, nreq, op) + n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) return uintptr(n), nil, err case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: -- cgit v1.2.3 From 8c84f9a3c1a82e633e3f87801921d86985d25a46 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 16 Nov 2018 12:39:14 -0800 Subject: Parse the tmpfs mode before validating. This gets rid of the problematic modeRegex. PiperOrigin-RevId: 221835959 Change-Id: I566b8d8a43579a4c30c0a08a620a964bbcd826dd --- pkg/sentry/fs/tmpfs/fs.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 2e57f2b42..3ac0c4dd4 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -16,7 +16,6 @@ package tmpfs import ( "fmt" - "regexp" "strconv" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -39,13 +38,13 @@ const ( // TODO: support a tmpfs size limit. // size = "size" - // default permissions are read/write/execute. + // Permissions that exceed modeMask will be rejected. + modeMask = 01777 + + // Default permissions are read/write/execute. defaultMode = 0777 ) -// modeRegexp is the expected format of the mode option. -var modeRegexp = regexp.MustCompile("^[0-1]?[0-7][0-7][0-7]$") - // Filesystem is a tmpfs. // // +stateify savable @@ -91,15 +90,13 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou // Parse the root directory permissions. 
perms := fs.FilePermsFromMode(defaultMode) if m, ok := options[modeKey]; ok { - if !modeRegexp.MatchString(m) { - return nil, fmt.Errorf("unsupported mode value: 'mode=%s'", m) - } - // It's basically impossible that we error out at this point, - // maybe we should panic. i, err := strconv.ParseUint(m, 8, 32) if err != nil { return nil, fmt.Errorf("mode value not parsable 'mode=%s': %v", m, err) } + if i&^modeMask != 0 { + return nil, fmt.Errorf("invalid mode %q: must be less than %o", m, modeMask) + } perms = fs.FilePermsFromMode(linux.FileMode(i)) delete(options, modeKey) } -- cgit v1.2.3 From fadffa2ff831034ff63146abf408ff71462b9f43 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 19 Nov 2018 15:25:00 -0800 Subject: Add unsupported syscall events for get/setsockopt PiperOrigin-RevId: 222148953 Change-Id: I21500a9f08939c45314a6414e0824490a973e5aa --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/ip.go | 107 ++++++ pkg/abi/linux/netlink.go | 14 + pkg/abi/linux/socket.go | 87 ++++- pkg/abi/linux/tcp.go | 54 +++ pkg/sentry/socket/epsocket/epsocket.go | 675 ++++++++++++++++++++++----------- pkg/sentry/socket/netlink/socket.go | 33 ++ pkg/sentry/socket/socket.go | 91 +++++ runsc/boot/compat.go | 10 +- runsc/boot/compat_amd64.go | 55 ++- runsc/boot/compat_test.go | 39 +- 11 files changed, 902 insertions(+), 264 deletions(-) create mode 100644 pkg/abi/linux/tcp.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index f8f82c0da..1f6e43605 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -43,6 +43,7 @@ go_library( "shm.go", "signal.go", "socket.go", + "tcp.go", "time.go", "timer.go", "tty.go", diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go index fcec16965..77ac1062c 100644 --- a/pkg/abi/linux/ip.go +++ b/pkg/abi/linux/ip.go @@ -42,3 +42,110 @@ const ( IPPROTO_MPLS = 137 IPPROTO_RAW = 255 ) + +// Socket options from uapi/linux/in.h +const ( + IP_TOS = 1 + IP_TTL = 2 + IP_HDRINCL = 3 + IP_OPTIONS = 4 + 
IP_ROUTER_ALERT = 5 + IP_RECVOPTS = 6 + IP_RETOPTS = 7 + IP_PKTINFO = 8 + IP_PKTOPTIONS = 9 + IP_MTU_DISCOVER = 10 + IP_RECVERR = 11 + IP_RECVTTL = 12 + IP_RECVTOS = 13 + IP_MTU = 14 + IP_FREEBIND = 15 + IP_IPSEC_POLICY = 16 + IP_XFRM_POLICY = 17 + IP_PASSSEC = 18 + IP_TRANSPARENT = 19 + IP_ORIGDSTADDR = 20 + IP_RECVORIGDSTADDR = IP_ORIGDSTADDR + IP_MINTTL = 21 + IP_NODEFRAG = 22 + IP_CHECKSUM = 23 + IP_BIND_ADDRESS_NO_PORT = 24 + IP_RECVFRAGSIZE = 25 + IP_MULTICAST_IF = 32 + IP_MULTICAST_TTL = 33 + IP_MULTICAST_LOOP = 34 + IP_ADD_MEMBERSHIP = 35 + IP_DROP_MEMBERSHIP = 36 + IP_UNBLOCK_SOURCE = 37 + IP_BLOCK_SOURCE = 38 + IP_ADD_SOURCE_MEMBERSHIP = 39 + IP_DROP_SOURCE_MEMBERSHIP = 40 + IP_MSFILTER = 41 + MCAST_JOIN_GROUP = 42 + MCAST_BLOCK_SOURCE = 43 + MCAST_UNBLOCK_SOURCE = 44 + MCAST_LEAVE_GROUP = 45 + MCAST_JOIN_SOURCE_GROUP = 46 + MCAST_LEAVE_SOURCE_GROUP = 47 + MCAST_MSFILTER = 48 + IP_MULTICAST_ALL = 49 + IP_UNICAST_IF = 50 +) + +// Socket options from uapi/linux/in6.h +const ( + IPV6_ADDRFORM = 1 + IPV6_2292PKTINFO = 2 + IPV6_2292HOPOPTS = 3 + IPV6_2292DSTOPTS = 4 + IPV6_2292RTHDR = 5 + IPV6_2292PKTOPTIONS = 6 + IPV6_CHECKSUM = 7 + IPV6_2292HOPLIMIT = 8 + IPV6_NEXTHOP = 9 + IPV6_FLOWINFO = 11 + IPV6_UNICAST_HOPS = 16 + IPV6_MULTICAST_IF = 17 + IPV6_MULTICAST_HOPS = 18 + IPV6_MULTICAST_LOOP = 19 + IPV6_ADD_MEMBERSHIP = 20 + IPV6_DROP_MEMBERSHIP = 21 + IPV6_ROUTER_ALERT = 22 + IPV6_MTU_DISCOVER = 23 + IPV6_MTU = 24 + IPV6_RECVERR = 25 + IPV6_V6ONLY = 26 + IPV6_JOIN_ANYCAST = 27 + IPV6_LEAVE_ANYCAST = 28 + IPV6_MULTICAST_ALL = 29 + IPV6_FLOWLABEL_MGR = 32 + IPV6_FLOWINFO_SEND = 33 + IPV6_IPSEC_POLICY = 34 + IPV6_XFRM_POLICY = 35 + IPV6_HDRINCL = 36 + IPV6_RECVPKTINFO = 49 + IPV6_PKTINFO = 50 + IPV6_RECVHOPLIMIT = 51 + IPV6_HOPLIMIT = 52 + IPV6_RECVHOPOPTS = 53 + IPV6_HOPOPTS = 54 + IPV6_RTHDRDSTOPTS = 55 + IPV6_RECVRTHDR = 56 + IPV6_RTHDR = 57 + IPV6_RECVDSTOPTS = 58 + IPV6_DSTOPTS = 59 + IPV6_RECVPATHMTU = 60 + IPV6_PATHMTU = 61 + IPV6_DONTFRAG = 62 + 
IPV6_RECVTCLASS = 66 + IPV6_TCLASS = 67 + IPV6_AUTOFLOWLABEL = 70 + IPV6_ADDR_PREFERENCES = 72 + IPV6_MINHOPCOUNT = 73 + IPV6_ORIGDSTADDR = 74 + IPV6_RECVORIGDSTADDR = IPV6_ORIGDSTADDR + IPV6_TRANSPARENT = 75 + IPV6_UNICAST_IF = 76 + IPV6_RECVFRAGSIZE = 77 + IPV6_FREEBIND = 78 +) diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index 10ceb5bf2..25c5e17fd 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -108,3 +108,17 @@ const NetlinkAttrHeaderSize = 4 // NLA_ALIGNTO is the alignment of netlink attributes, from // uapi/linux/netlink.h. const NLA_ALIGNTO = 4 + +// Socket options, from uapi/linux/netlink.h. +const ( + NETLINK_ADD_MEMBERSHIP = 1 + NETLINK_DROP_MEMBERSHIP = 2 + NETLINK_PKTINFO = 3 + NETLINK_BROADCAST_ERROR = 4 + NETLINK_NO_ENOBUFS = 5 + NETLINK_LISTEN_ALL_NSID = 8 + NETLINK_LIST_MEMBERSHIPS = 9 + NETLINK_CAP_ACK = 10 + NETLINK_EXT_ACK = 11 + NETLINK_DUMP_STRICT_CHK = 12 +) diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index af0761a3b..929814752 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -89,8 +89,18 @@ const ( MSG_CMSG_CLOEXEC = 0x40000000 ) -// SOL_SOCKET is from socket.h -const SOL_SOCKET = 1 +// Set/get socket option levels, from socket.h. +const ( + SOL_IP = 0 + SOL_SOCKET = 1 + SOL_TCP = 6 + SOL_UDP = 17 + SOL_IPV6 = 41 + SOL_ICMPV6 = 58 + SOL_RAW = 255 + SOL_PACKET = 263 + SOL_NETLINK = 270 +) // Socket types, from linux/net.h. const ( @@ -122,22 +132,63 @@ const ( // Socket options from socket.h. 
const ( - SO_ERROR = 4 - SO_KEEPALIVE = 9 - SO_LINGER = 13 - SO_MARK = 36 - SO_PASSCRED = 16 - SO_PEERCRED = 17 - SO_PEERNAME = 28 - SO_PROTOCOL = 38 - SO_RCVBUF = 8 - SO_RCVTIMEO = 20 - SO_REUSEADDR = 2 - SO_SNDBUF = 7 - SO_SNDTIMEO = 21 - SO_TIMESTAMP = 29 - SO_TIMESTAMPNS = 35 - SO_TYPE = 3 + SO_DEBUG = 1 + SO_REUSEADDR = 2 + SO_TYPE = 3 + SO_ERROR = 4 + SO_DONTROUTE = 5 + SO_BROADCAST = 6 + SO_SNDBUF = 7 + SO_RCVBUF = 8 + SO_KEEPALIVE = 9 + SO_OOBINLINE = 10 + SO_NO_CHECK = 11 + SO_PRIORITY = 12 + SO_LINGER = 13 + SO_BSDCOMPAT = 14 + SO_REUSEPORT = 15 + SO_PASSCRED = 16 + SO_PEERCRED = 17 + SO_RCVLOWAT = 18 + SO_SNDLOWAT = 19 + SO_RCVTIMEO = 20 + SO_SNDTIMEO = 21 + SO_BINDTODEVICE = 25 + SO_ATTACH_FILTER = 26 + SO_DETACH_FILTER = 27 + SO_GET_FILTER = SO_ATTACH_FILTER + SO_PEERNAME = 28 + SO_TIMESTAMP = 29 + SO_ACCEPTCONN = 30 + SO_PEERSEC = 31 + SO_SNDBUFFORCE = 32 + SO_RCVBUFFORCE = 33 + SO_PASSSEC = 34 + SO_TIMESTAMPNS = 35 + SO_MARK = 36 + SO_TIMESTAMPING = 37 + SO_PROTOCOL = 38 + SO_DOMAIN = 39 + SO_RXQ_OVFL = 40 + SO_WIFI_STATUS = 41 + SO_PEEK_OFF = 42 + SO_NOFCS = 43 + SO_LOCK_FILTER = 44 + SO_SELECT_ERR_QUEUE = 45 + SO_BUSY_POLL = 46 + SO_MAX_PACING_RATE = 47 + SO_BPF_EXTENSIONS = 48 + SO_INCOMING_CPU = 49 + SO_ATTACH_BPF = 50 + SO_ATTACH_REUSEPORT_CBPF = 51 + SO_ATTACH_REUSEPORT_EBPF = 52 + SO_CNX_ADVICE = 53 + SO_MEMINFO = 55 + SO_INCOMING_NAPI_ID = 56 + SO_COOKIE = 57 + SO_PEERGROUPS = 59 + SO_ZEROCOPY = 60 + SO_TXTIME = 61 ) // SockAddrMax is the maximum size of a struct sockaddr, from diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go new file mode 100644 index 000000000..7586ada42 --- /dev/null +++ b/pkg/abi/linux/tcp.go @@ -0,0 +1,54 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Socket options from uapi/linux/tcp.h. +const ( + TCP_NODELAY = 1 + TCP_MAXSEG = 2 + TCP_CORK = 3 + TCP_KEEPIDLE = 4 + TCP_KEEPINTVL = 5 + TCP_KEEPCNT = 6 + TCP_SYNCNT = 7 + TCP_LINGER2 = 8 + TCP_DEFER_ACCEPT = 9 + TCP_WINDOW_CLAMP = 10 + TCP_INFO = 11 + TCP_QUICKACK = 12 + TCP_CONGESTION = 13 + TCP_MD5SIG = 14 + TCP_THIN_LINEAR_TIMEOUTS = 16 + TCP_THIN_DUPACK = 17 + TCP_USER_TIMEOUT = 18 + TCP_REPAIR = 19 + TCP_REPAIR_QUEUE = 20 + TCP_QUEUE_SEQ = 21 + TCP_REPAIR_OPTIONS = 22 + TCP_FASTOPEN = 23 + TCP_TIMESTAMP = 24 + TCP_NOTSENT_LOWAT = 25 + TCP_CC_INFO = 26 + TCP_SAVE_SYN = 27 + TCP_SAVED_SYN = 28 + TCP_REPAIR_WINDOW = 29 + TCP_FASTOPEN_CONNECT = 30 + TCP_ULP = 31 + TCP_MD5SIG_EXT = 32 + TCP_FASTOPEN_KEY = 33 + TCP_FASTOPEN_NO_COOKIE = 34 + TCP_ZEROCOPY_RECEIVE = 35 + TCP_INQ = 36 +) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index d14bbad01..c5ce289b5 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -515,189 +515,233 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: - switch name { - case linux.SO_TYPE: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } - return int32(skType), nil + return getSockOptSocket(t, s, ep, family, skType, name, outLen) - case linux.SO_ERROR: - if 
outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + case linux.SOL_TCP: + return getSockOptTCP(t, ep, name, outLen) - // Get the last error and convert it. - err := ep.GetSockOpt(tcpip.ErrorOption{}) - if err == nil { - return int32(0), nil - } - return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil + case linux.SOL_IPV6: + return getSockOptIPv6(t, ep, name, outLen) - case linux.SO_PEERCRED: - if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { - return nil, syserr.ErrInvalidArgument - } + case linux.SOL_IP, + linux.SOL_UDP, + linux.SOL_ICMPV6, + linux.SOL_RAW, + linux.SOL_PACKET: - tcred := t.Credentials() - return syscall.Ucred{ - Pid: int32(t.ThreadGroup().ID()), - Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), - Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), - }, nil + t.Kernel().EmitUnimplementedEvent(t) + } - case linux.SO_PASSCRED: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + return nil, syserr.ErrProtocolNotAvailable +} - var v tcpip.PasscredOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } +// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. +func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.SO_TYPE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + return int32(skType), nil - return int32(v), nil + case linux.SO_ERROR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - case linux.SO_SNDBUF: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + // Get the last error and convert it. 
+ err := ep.GetSockOpt(tcpip.ErrorOption{}) + if err == nil { + return int32(0), nil + } + return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil - var size tcpip.SendBufferSizeOption - if err := ep.GetSockOpt(&size); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + case linux.SO_PEERCRED: + if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { + return nil, syserr.ErrInvalidArgument + } - if size > math.MaxInt32 { - size = math.MaxInt32 - } + tcred := t.Credentials() + return syscall.Ucred{ + Pid: int32(t.ThreadGroup().ID()), + Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), + Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), + }, nil - return int32(size), nil + case linux.SO_PASSCRED: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - case linux.SO_RCVBUF: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + var v tcpip.PasscredOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - var size tcpip.ReceiveBufferSizeOption - if err := ep.GetSockOpt(&size); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + return int32(v), nil - if size > math.MaxInt32 { - size = math.MaxInt32 - } + case linux.SO_SNDBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - return int32(size), nil + var size tcpip.SendBufferSizeOption + if err := ep.GetSockOpt(&size); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - case linux.SO_REUSEADDR: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + if size > math.MaxInt32 { + size = math.MaxInt32 + } - var v tcpip.ReuseAddressOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + return int32(size), nil - return int32(v), nil + case linux.SO_RCVBUF: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - case 
linux.SO_KEEPALIVE: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } - return int32(0), nil + var size tcpip.ReceiveBufferSizeOption + if err := ep.GetSockOpt(&size); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - case linux.SO_LINGER: - if outLen < syscall.SizeofLinger { - return nil, syserr.ErrInvalidArgument - } - return syscall.Linger{}, nil + if size > math.MaxInt32 { + size = math.MaxInt32 + } - case linux.SO_RCVTIMEO: - if outLen < linux.SizeOfTimeval { - return nil, syserr.ErrInvalidArgument - } + return int32(size), nil - return linux.NsecToTimeval(s.RecvTimeout()), nil + case linux.SO_REUSEADDR: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - case linux.SO_TIMESTAMP: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + var v tcpip.ReuseAddressOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - var v tcpip.TimestampOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + return int32(v), nil - return int32(v), nil + case linux.SO_KEEPALIVE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument } + return int32(0), nil - case syscall.SOL_TCP: - switch name { - case syscall.TCP_NODELAY: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + case linux.SO_LINGER: + if outLen < syscall.SizeofLinger { + return nil, syserr.ErrInvalidArgument + } + return syscall.Linger{}, nil - var v tcpip.DelayOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + case linux.SO_RCVTIMEO: + if outLen < linux.SizeOfTimeval { + return nil, syserr.ErrInvalidArgument + } - if v == 0 { - return int32(1), nil - } - return int32(0), nil + return linux.NsecToTimeval(s.RecvTimeout()), nil - case syscall.TCP_CORK: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + case linux.SO_TIMESTAMP: + if 
outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - var v tcpip.CorkOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + var v tcpip.TimestampOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - return int32(v), nil + return int32(v), nil - case syscall.TCP_INFO: - var v tcpip.TCPInfoOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + default: + socket.GetSockOptEmitUnimplementedEvent(t, name) + } + return nil, syserr.ErrProtocolNotAvailable +} + +// getSockOptTCP implements GetSockOpt when level is SOL_TCP. +func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.TCP_NODELAY: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } - // TODO: Translate fields once they are added to - // tcpip.TCPInfoOption. - info := linux.TCPInfo{} + var v tcpip.DelayOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - // Linux truncates the output binary to outLen. 
- ib := binary.Marshal(nil, usermem.ByteOrder, &info) - if len(ib) > outLen { - ib = ib[:outLen] - } + if v == 0 { + return int32(1), nil + } + return int32(0), nil - return ib, nil + case linux.TCP_CORK: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument } - case syscall.SOL_IPV6: - switch name { - case syscall.IPV6_V6ONLY: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } + var v tcpip.CorkOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } - var v tcpip.V6OnlyOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } + return int32(v), nil - return int32(v), nil + case linux.TCP_INFO: + var v tcpip.TCPInfoOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + // TODO: Translate fields once they are added to + // tcpip.TCPInfoOption. + info := linux.TCPInfo{} + + // Linux truncates the output binary to outLen. + ib := binary.Marshal(nil, usermem.ByteOrder, &info) + if len(ib) > outLen { + ib = ib[:outLen] } + + return ib, nil + + case linux.TCP_CC_INFO, + linux.TCP_NOTSENT_LOWAT, + linux.TCP_ZEROCOPY_RECEIVE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUmplementedEventTCP(t, name) } + return nil, syserr.ErrProtocolNotAvailable +} + +// getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. 
+func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.IPV6_V6ONLY: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + var v tcpip.V6OnlyOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.IPV6_PATHMTU: + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUmplementedEventIPv6(t, name) + } return nil, syserr.ErrProtocolNotAvailable } @@ -712,109 +756,304 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: - switch name { - case linux.SO_SNDBUF: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + return setSockOptSocket(t, s, ep, name, optVal) - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.SendBufferSizeOption(v))) + case linux.SOL_TCP: + return setSockOptTCP(t, ep, name, optVal) - case linux.SO_RCVBUF: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + case linux.SOL_IPV6: + return setSockOptIPv6(t, ep, name, optVal) - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(v))) + case linux.SOL_IP: + return setSockOptIP(t, ep, name, optVal) - case linux.SO_REUSEADDR: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + case linux.SOL_UDP, + linux.SOL_ICMPV6, + linux.SOL_RAW, + linux.SOL_PACKET: - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v))) + t.Kernel().EmitUnimplementedEvent(t) + } - case linux.SO_PASSCRED: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + // Default to the old behavior; hand off to 
network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v))) +// setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. +func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.SO_SNDBUF: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } - case linux.SO_RCVTIMEO: - if len(optVal) < linux.SizeOfTimeval { - return syserr.ErrInvalidArgument - } + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.SendBufferSizeOption(v))) - var v linux.Timeval - binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) - s.SetRecvTimeout(v.ToNsecCapped()) - return nil + case linux.SO_RCVBUF: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } - case linux.SO_TIMESTAMP: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReceiveBufferSizeOption(v))) - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TimestampOption(v))) + case linux.SO_REUSEADDR: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument } - case syscall.SOL_TCP: - switch name { - case syscall.TCP_NODELAY: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v))) - v := usermem.ByteOrder.Uint32(optVal) - var o tcpip.DelayOption - if v == 0 { - o = 1 - } - return syserr.TranslateNetstackError(ep.SetSockOpt(o)) - case syscall.TCP_CORK: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } + case linux.SO_PASSCRED: + if len(optVal) < sizeOfInt32 { + return 
syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v))) - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v))) + case linux.SO_RCVTIMEO: + if len(optVal) < linux.SizeOfTimeval { + return syserr.ErrInvalidArgument } - case syscall.SOL_IPV6: - switch name { - case syscall.IPV6_V6ONLY: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v))) - } - case syscall.SOL_IP: - const ( - _IP_MULTICAST_IF = 32 - _IP_ADD_MEMBERSHIP = 35 - _MCAST_JOIN_GROUP = 42 - ) - switch name { - case _IP_ADD_MEMBERSHIP, _MCAST_JOIN_GROUP, _IP_MULTICAST_IF: - // FIXME: Disallow IP-level multicast group options by - // default. These will need to be supported by appropriately plumbing - // the level through to the network stack (if at all). However, we - // still allow setting TTL, and multicast-enable/disable type options. + var v linux.Timeval + binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + s.SetRecvTimeout(v.ToNsecCapped()) + return nil + + case linux.SO_TIMESTAMP: + if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TimestampOption(v))) + + default: + socket.SetSockOptEmitUnimplementedEvent(t, name) } // Default to the old behavior; hand off to network stack. return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) } +// setSockOptTCP implements SetSockOpt when level is SOL_TCP. 
+func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.TCP_NODELAY: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + var o tcpip.DelayOption + if v == 0 { + o = 1 + } + return syserr.TranslateNetstackError(ep.SetSockOpt(o)) + + case linux.TCP_CORK: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v))) + + case linux.TCP_REPAIR_OPTIONS: + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUmplementedEventTCP(t, name) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. +func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.IPV6_V6ONLY: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v))) + + case linux.IPV6_ADD_MEMBERSHIP, + linux.IPV6_DROP_MEMBERSHIP, + linux.IPV6_IPSEC_POLICY, + linux.IPV6_JOIN_ANYCAST, + linux.IPV6_LEAVE_ANYCAST, + linux.IPV6_PKTINFO, + linux.IPV6_ROUTER_ALERT, + linux.IPV6_XFRM_POLICY, + linux.MCAST_BLOCK_SOURCE, + linux.MCAST_JOIN_GROUP, + linux.MCAST_JOIN_SOURCE_GROUP, + linux.MCAST_LEAVE_GROUP, + linux.MCAST_LEAVE_SOURCE_GROUP, + linux.MCAST_UNBLOCK_SOURCE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUmplementedEventIPv6(t, name) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// setSockOptIP implements SetSockOpt when level is SOL_IP. 
+func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { + switch name { + case linux.IP_ADD_MEMBERSHIP, linux.MCAST_JOIN_GROUP, linux.IP_MULTICAST_IF: + // FIXME: Disallow IP-level multicast group options by + // default. These will need to be supported by appropriately plumbing + // the level through to the network stack (if at all). However, we + // still allow setting TTL, and multicast-enable/disable type options. + t.Kernel().EmitUnimplementedEvent(t) + return syserr.ErrInvalidArgument + + case linux.IP_ADD_SOURCE_MEMBERSHIP, + linux.IP_BIND_ADDRESS_NO_PORT, + linux.IP_BLOCK_SOURCE, + linux.IP_CHECKSUM, + linux.IP_DROP_MEMBERSHIP, + linux.IP_DROP_SOURCE_MEMBERSHIP, + linux.IP_FREEBIND, + linux.IP_HDRINCL, + linux.IP_IPSEC_POLICY, + linux.IP_MINTTL, + linux.IP_MSFILTER, + linux.IP_MTU_DISCOVER, + linux.IP_MULTICAST_ALL, + linux.IP_MULTICAST_LOOP, + linux.IP_MULTICAST_TTL, + linux.IP_NODEFRAG, + linux.IP_OPTIONS, + linux.IP_PASSSEC, + linux.IP_PKTINFO, + linux.IP_RECVERR, + linux.IP_RECVFRAGSIZE, + linux.IP_RECVOPTS, + linux.IP_RECVORIGDSTADDR, + linux.IP_RECVTOS, + linux.IP_RECVTTL, + linux.IP_RETOPTS, + linux.IP_TOS, + linux.IP_TRANSPARENT, + linux.IP_TTL, + linux.IP_UNBLOCK_SOURCE, + linux.IP_UNICAST_IF, + linux.IP_XFRM_POLICY, + linux.MCAST_BLOCK_SOURCE, + linux.MCAST_JOIN_SOURCE_GROUP, + linux.MCAST_LEAVE_GROUP, + linux.MCAST_LEAVE_SOURCE_GROUP, + linux.MCAST_MSFILTER, + linux.MCAST_UNBLOCK_SOURCE: + + t.Kernel().EmitUnimplementedEvent(t) + } + + // Default to the old behavior; hand off to network stack. + return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) +} + +// emitUmplementedEventTCP emits unimplemented event if name is valid. This +// function contains names that are common between Get and SetSockOpt when +// level is SOL_TCP. 
+func emitUmplementedEventTCP(t *kernel.Task, name int) { + switch name { + case linux.TCP_CONGESTION, + linux.TCP_CORK, + linux.TCP_DEFER_ACCEPT, + linux.TCP_FASTOPEN, + linux.TCP_FASTOPEN_CONNECT, + linux.TCP_FASTOPEN_KEY, + linux.TCP_FASTOPEN_NO_COOKIE, + linux.TCP_INQ, + linux.TCP_KEEPCNT, + linux.TCP_KEEPIDLE, + linux.TCP_KEEPINTVL, + linux.TCP_LINGER2, + linux.TCP_MAXSEG, + linux.TCP_QUEUE_SEQ, + linux.TCP_QUICKACK, + linux.TCP_REPAIR, + linux.TCP_REPAIR_QUEUE, + linux.TCP_REPAIR_WINDOW, + linux.TCP_SAVED_SYN, + linux.TCP_SAVE_SYN, + linux.TCP_SYNCNT, + linux.TCP_THIN_DUPACK, + linux.TCP_THIN_LINEAR_TIMEOUTS, + linux.TCP_TIMESTAMP, + linux.TCP_ULP, + linux.TCP_USER_TIMEOUT, + linux.TCP_WINDOW_CLAMP: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + +// emitUmplementedEventIPv6 emits unimplemented event if name is valid. It +// contains names that are common between Get and SetSockOpt when level is +// SOL_IPV6. +func emitUmplementedEventIPv6(t *kernel.Task, name int) { + switch name { + case linux.IPV6_2292DSTOPTS, + linux.IPV6_2292HOPLIMIT, + linux.IPV6_2292HOPOPTS, + linux.IPV6_2292PKTINFO, + linux.IPV6_2292PKTOPTIONS, + linux.IPV6_2292RTHDR, + linux.IPV6_ADDR_PREFERENCES, + linux.IPV6_AUTOFLOWLABEL, + linux.IPV6_DONTFRAG, + linux.IPV6_DSTOPTS, + linux.IPV6_FLOWINFO, + linux.IPV6_FLOWINFO_SEND, + linux.IPV6_FLOWLABEL_MGR, + linux.IPV6_FREEBIND, + linux.IPV6_HOPOPTS, + linux.IPV6_MINHOPCOUNT, + linux.IPV6_MTU, + linux.IPV6_MTU_DISCOVER, + linux.IPV6_MULTICAST_ALL, + linux.IPV6_MULTICAST_HOPS, + linux.IPV6_MULTICAST_IF, + linux.IPV6_MULTICAST_LOOP, + linux.IPV6_RECVDSTOPTS, + linux.IPV6_RECVERR, + linux.IPV6_RECVFRAGSIZE, + linux.IPV6_RECVHOPLIMIT, + linux.IPV6_RECVHOPOPTS, + linux.IPV6_RECVORIGDSTADDR, + linux.IPV6_RECVPATHMTU, + linux.IPV6_RECVPKTINFO, + linux.IPV6_RECVRTHDR, + linux.IPV6_RECVTCLASS, + linux.IPV6_RTHDR, + linux.IPV6_RTHDRDSTOPTS, + linux.IPV6_TCLASS, + linux.IPV6_TRANSPARENT, + linux.IPV6_UNICAST_HOPS, + linux.IPV6_UNICAST_IF, + 
linux.MCAST_MSFILTER, + linux.IPV6_ADDRFORM: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + // isLinkLocal determines if the given IPv6 address is link-local. This is the // case when it has the fe80::/10 prefix. This check is used to determine when // the NICID is relevant for a given IPv6 address. diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index f901cfa0b..b1f6620de 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -299,6 +299,21 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (in } // We don't have limit on receiving size. return math.MaxInt32, nil + + default: + socket.GetSockOptEmitUnimplementedEvent(t, name) + } + case linux.SOL_NETLINK: + switch name { + case linux.NETLINK_BROADCAST_ERROR, + linux.NETLINK_CAP_ACK, + linux.NETLINK_DUMP_STRICT_CHK, + linux.NETLINK_EXT_ACK, + linux.NETLINK_LIST_MEMBERSHIPS, + linux.NETLINK_NO_ENOBUFS, + linux.NETLINK_PKTINFO: + + t.Kernel().EmitUnimplementedEvent(t) } } // TODO: other sockopts are not supported. @@ -329,7 +344,25 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy // We don't have limit on receiving size. So just accept anything as // valid for compatibility. return nil + default: + socket.SetSockOptEmitUnimplementedEvent(t, name) } + + case linux.SOL_NETLINK: + switch name { + case linux.NETLINK_ADD_MEMBERSHIP, + linux.NETLINK_BROADCAST_ERROR, + linux.NETLINK_CAP_ACK, + linux.NETLINK_DROP_MEMBERSHIP, + linux.NETLINK_DUMP_STRICT_CHK, + linux.NETLINK_EXT_ACK, + linux.NETLINK_LISTEN_ALL_NSID, + linux.NETLINK_NO_ENOBUFS, + linux.NETLINK_PKTINFO: + + t.Kernel().EmitUnimplementedEvent(t) + } + } // TODO: other sockopts are not supported. 
return syserr.ErrProtocolNotAvailable diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index a235c5249..b1dcbf7b0 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -213,3 +213,94 @@ func (rt *ReceiveTimeout) SetRecvTimeout(nanoseconds int64) { func (rt *ReceiveTimeout) RecvTimeout() int64 { return atomic.LoadInt64(&rt.ns) } + +// GetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. +// It contains names that are valid for GetSockOpt when level is SOL_SOCKET. +func GetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_ACCEPTCONN, + linux.SO_BPF_EXTENSIONS, + linux.SO_COOKIE, + linux.SO_DOMAIN, + linux.SO_ERROR, + linux.SO_GET_FILTER, + linux.SO_INCOMING_NAPI_ID, + linux.SO_MEMINFO, + linux.SO_PEERCRED, + linux.SO_PEERGROUPS, + linux.SO_PEERNAME, + linux.SO_PEERSEC, + linux.SO_PROTOCOL, + linux.SO_SNDLOWAT, + linux.SO_TYPE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUnimplementedEvent(t, name) + } +} + +// SetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. +// It contains names that are valid for SetSockOpt when level is SOL_SOCKET. +func SetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_ATTACH_BPF, + linux.SO_ATTACH_FILTER, + linux.SO_ATTACH_REUSEPORT_CBPF, + linux.SO_ATTACH_REUSEPORT_EBPF, + linux.SO_CNX_ADVICE, + linux.SO_DETACH_FILTER, + linux.SO_RCVBUFFORCE, + linux.SO_SNDBUFFORCE: + + t.Kernel().EmitUnimplementedEvent(t) + + default: + emitUnimplementedEvent(t, name) + } +} + +// emitUnimplementedEvent emits unimplemented event if name is valid. It +// contains names that are common between Get and SetSocketOpt when level is +// SOL_SOCKET. 
+func emitUnimplementedEvent(t *kernel.Task, name int) { + switch name { + case linux.SO_BINDTODEVICE, + linux.SO_BROADCAST, + linux.SO_BSDCOMPAT, + linux.SO_BUSY_POLL, + linux.SO_DEBUG, + linux.SO_DONTROUTE, + linux.SO_INCOMING_CPU, + linux.SO_KEEPALIVE, + linux.SO_LINGER, + linux.SO_LOCK_FILTER, + linux.SO_MARK, + linux.SO_MAX_PACING_RATE, + linux.SO_NOFCS, + linux.SO_NO_CHECK, + linux.SO_OOBINLINE, + linux.SO_PASSCRED, + linux.SO_PASSSEC, + linux.SO_PEEK_OFF, + linux.SO_PRIORITY, + linux.SO_RCVBUF, + linux.SO_RCVLOWAT, + linux.SO_RCVTIMEO, + linux.SO_REUSEADDR, + linux.SO_REUSEPORT, + linux.SO_RXQ_OVFL, + linux.SO_SELECT_ERR_QUEUE, + linux.SO_SNDBUF, + linux.SO_SNDTIMEO, + linux.SO_TIMESTAMP, + linux.SO_TIMESTAMPING, + linux.SO_TIMESTAMPNS, + linux.SO_TXTIME, + linux.SO_WIFI_STATUS, + linux.SO_ZEROCOPY: + + t.Kernel().EmitUnimplementedEvent(t) + } +} diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 4c49e90e3..c2a77ebf5 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -89,10 +89,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { if tr == nil { switch sysnr { case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL: - tr = newCmdTracker(0) + // args: cmd, ... + tr = newArgsTracker(0) case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL: - tr = newCmdTracker(1) + // args: fd, cmd, ... + tr = newArgsTracker(1) + + case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT: + // args: fd, level, name, ... + tr = newArgsTracker(1, 2) default: tr = &onceTracker{} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 2bb769a49..0c9472f18 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -20,35 +20,58 @@ import ( rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" ) -// cmdTracker reports only a single time for each different command argument in -// the syscall. 
It's used for generic syscalls like ioctl to report once per -// 'cmd' -type cmdTracker struct { - // argIdx is the syscall argument index where the command is located. - argIdx int - cmds map[uint32]struct{} +// reportLimit is the max number of events that should be reported per tracker. +const reportLimit = 100 + +// argsTracker reports only once for each different combination of arguments. +// It's used for generic syscalls like ioctl to report once per 'cmd'. +type argsTracker struct { + // argsIdx is the syscall arguments to use as unique ID. + argsIdx []int + reported map[string]struct{} + count int } -func newCmdTracker(argIdx int) *cmdTracker { - return &cmdTracker{argIdx: argIdx, cmds: make(map[uint32]struct{})} +func newArgsTracker(argIdx ...int) *argsTracker { + return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} } // cmd returns the command based on the syscall argument index. -func (c *cmdTracker) cmd(regs *rpb.AMD64Registers) uint32 { - switch c.argIdx { +func (a *argsTracker) key(regs *rpb.AMD64Registers) string { + var rv string + for _, idx := range a.argsIdx { + rv += fmt.Sprintf("%d|", argVal(idx, regs)) + } + return rv +} + +func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 { + switch argIdx { case 0: return uint32(regs.Rdi) case 1: return uint32(regs.Rsi) + case 2: + return uint32(regs.Rdx) + case 3: + return uint32(regs.R10) + case 4: + return uint32(regs.R8) + case 5: + return uint32(regs.R9) } - panic(fmt.Sprintf("unsupported syscall argument index %d", c.argIdx)) + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } -func (c *cmdTracker) shouldReport(regs *rpb.AMD64Registers) bool { - _, ok := c.cmds[c.cmd(regs)] +func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool { + if a.count >= reportLimit { + return false + } + _, ok := a.reported[a.key(regs)] return !ok } -func (c *cmdTracker) onReported(regs *rpb.AMD64Registers) { - c.cmds[c.cmd(regs)] = struct{}{} +func (a *argsTracker) 
onReported(regs *rpb.AMD64Registers) { + a.count++ + a.reported[a.key(regs)] = struct{}{} } diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go index 30b94798a..f1940dd72 100644 --- a/runsc/boot/compat_test.go +++ b/runsc/boot/compat_test.go @@ -33,34 +33,53 @@ func TestOnceTracker(t *testing.T) { } } -func TestCmdTracker(t *testing.T) { +func TestArgsTracker(t *testing.T) { for _, tc := range []struct { name string - idx int + idx []int rdi1 uint64 rdi2 uint64 rsi1 uint64 rsi2 uint64 want bool }{ - {name: "same rdi", idx: 0, rdi1: 123, rdi2: 123, want: false}, - {name: "same rsi", idx: 1, rsi1: 123, rsi2: 123, want: false}, - {name: "diff rdi", idx: 0, rdi1: 123, rdi2: 321, want: true}, - {name: "diff rsi", idx: 1, rsi1: 123, rsi2: 321, want: true}, - {name: "cmd is uint32", idx: 0, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, + {name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false}, + {name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false}, + {name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true}, + {name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true}, + {name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, + {name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false}, + {name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true}, } { t.Run(tc.name, func(t *testing.T) { - c := newCmdTracker(tc.idx) + c := newArgsTracker(tc.idx...) 
regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1} if !c.shouldReport(regs) { - t.Error("first call to checkAndMark, got: false, want: true") + t.Error("first call to shouldReport, got: false, want: true") } c.onReported(regs) regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2 if got := c.shouldReport(regs); tc.want != got { - t.Errorf("after first call to checkAndMark, got: %t, want: %t", got, tc.want) + t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want) } }) } } + +func TestArgsTrackerLimit(t *testing.T) { + c := newArgsTracker(0, 1) + for i := 0; i < reportLimit; i++ { + regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)} + if !c.shouldReport(regs) { + t.Error("shouldReport before limit was reached, got: false, want: true") + } + c.onReported(regs) + } + + // Should hit the count limit now. + regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456} + if c.shouldReport(regs) { + t.Error("shouldReport after limit was reached, got: true, want: false") + } +} -- cgit v1.2.3 From 03c1eb78b583ca3247f299889146675311727325 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 19 Nov 2018 18:02:50 -0800 Subject: Reference upstream licenses Include copyright notices and the referenced LICENSE file. 
PiperOrigin-RevId: 222171321 Change-Id: I0cc0b167ca51b536d1087bf1c4742fdf1430bc2a --- pkg/sentry/platform/safecopy/BUILD | 2 +- pkg/sentry/platform/safecopy/LICENSE | 27 +++++++++++++++++++++++++++ pkg/sentry/platform/safecopy/memclr_amd64.s | 16 +++------------- pkg/sentry/platform/safecopy/memcpy_amd64.s | 28 ++++++++++++++++++---------- pkg/sync/LICENSE | 27 +++++++++++++++++++++++++++ 5 files changed, 76 insertions(+), 24 deletions(-) create mode 100644 pkg/sentry/platform/safecopy/LICENSE create mode 100644 pkg/sync/LICENSE (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 614d9e21e..ee58a805e 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) # Apache 2.0, portions BSD, MIT go_library( name = "safecopy", diff --git a/pkg/sentry/platform/safecopy/LICENSE b/pkg/sentry/platform/safecopy/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/pkg/sentry/platform/safecopy/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/sentry/platform/safecopy/memclr_amd64.s b/pkg/sentry/platform/safecopy/memclr_amd64.s index 488b6e666..64cf32f05 100644 --- a/pkg/sentry/platform/safecopy/memclr_amd64.s +++ b/pkg/sentry/platform/safecopy/memclr_amd64.s @@ -1,16 +1,6 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
#include "textflag.h" diff --git a/pkg/sentry/platform/safecopy/memcpy_amd64.s b/pkg/sentry/platform/safecopy/memcpy_amd64.s index 0bf26fd7b..129691d68 100644 --- a/pkg/sentry/platform/safecopy/memcpy_amd64.s +++ b/pkg/sentry/platform/safecopy/memcpy_amd64.s @@ -1,16 +1,24 @@ -// Copyright 2018 Google LLC +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: // -// http://www.apache.org/licenses/LICENSE-2.0 +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. // -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. #include "textflag.h" diff --git a/pkg/sync/LICENSE b/pkg/sync/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/pkg/sync/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-- cgit v1.2.3 From 8b314b0bf402da58f90ccaac852a880d375f0885 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 20 Nov 2018 15:07:12 -0800 Subject: Fix recursive read lock taken on TaskSet SyncSyscallFiltersToThreadGroup and Task.TheadID() both acquired TaskSet RWLock in R mode and could deadlock if a writer comes in between. PiperOrigin-RevId: 222313551 Change-Id: I4221057d8d46fec544cbfa55765c9a284fe7ebfa --- pkg/sentry/kernel/seccomp.go | 48 +++++++++++--------------------- pkg/sentry/kernel/task.go | 2 +- pkg/sentry/syscalls/linux/sys_seccomp.go | 7 +---- 3 files changed, 19 insertions(+), 38 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 37dd3e4c9..433b900c7 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -179,20 +179,19 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i // AppendSyscallFilter adds BPF program p as a system call filter. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) AppendSyscallFilter(p bpf.Program) error { +func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { + // While syscallFilters are an atomic.Value we must take the mutex to prevent + // our read-copy-update from happening while another task is syncing syscall + // filters to us, this keeps the filters in a consistent state. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // Cap the combined length of all syscall filters (plus a penalty of 4 - // instructions per filter beyond the first) to - // maxSyscallFilterInstructions. (This restriction is inherited from - // Linux.) + // instructions per filter beyond the first) to maxSyscallFilterInstructions. + // This restriction is inherited from Linux. 
totalLength := p.Length() var newFilters []bpf.Program - // While syscallFilters are an atomic.Value we must take the mutex to - // prevent our read-copy-update from happening while another task - // is syncing syscall filters to us, this keeps the filters in a - // consistent state. - t.mu.Lock() - defer t.mu.Unlock() if sf := t.syscallFilters.Load(); sf != nil { oldFilters := sf.([]bpf.Program) for _, f := range oldFilters { @@ -207,31 +206,18 @@ func (t *Task) AppendSyscallFilter(p bpf.Program) error { newFilters = append(newFilters, p) t.syscallFilters.Store(newFilters) - return nil -} -// SyncSyscallFiltersToThreadGroup will copy this task's filters to all other -// threads in our thread group. -func (t *Task) SyncSyscallFiltersToThreadGroup() error { - f := t.syscallFilters.Load() - - t.tg.pidns.owner.mu.RLock() - defer t.tg.pidns.owner.mu.RUnlock() - - // Note: No new privs is always assumed to be set. - for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { - if ot.ThreadID() != t.ThreadID() { - // We must take the other task's mutex to prevent it from - // appending to its own syscall filters while we're syncing. - ot.mu.Lock() - var copiedFilters []bpf.Program - if f != nil { - copiedFilters = append(copiedFilters, f.([]bpf.Program)...) + if syncAll { + // Note: No new privs is always assumed to be set. + for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { + if ot != t { + var copiedFilters []bpf.Program + copiedFilters = append(copiedFilters, newFilters...) + ot.syscallFilters.Store(copiedFilters) } - ot.syscallFilters.Store(copiedFilters) - ot.mu.Unlock() } } + return nil } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 73ba8bee9..2982bc5d1 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -392,7 +392,7 @@ type Task struct { // syscallFilters is all seccomp-bpf syscall filters applicable to the // task, in the order in which they were installed. The type of the atomic - // is []bpf.Program. 
Writing needs to be protected by mu. + // is []bpf.Program. Writing needs to be protected by the signal mutex. // // syscallFilters is owned by the task goroutine. syscallFilters atomic.Value `state:".([]bpf.Program)"` diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 969acaa36..f08fdf5cb 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -68,12 +68,7 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { return syscall.EINVAL } - err = t.AppendSyscallFilter(compiledFilter) - if err == nil && tsync { - // Now we must copy this seccomp program to all other threads. - err = t.SyncSyscallFiltersToThreadGroup() - } - return err + return t.AppendSyscallFilter(compiledFilter, tsync) } // Seccomp implements linux syscall seccomp(2). -- cgit v1.2.3 From 5236b78242677612ac71b19cee85b3bf4cca4008 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 20 Nov 2018 17:23:14 -0800 Subject: Dumps stacks if watchdog thread is stuck PiperOrigin-RevId: 222332703 Change-Id: Id5c3cf79591c5d2949895b4e323e63c48c679820 --- pkg/sentry/watchdog/watchdog.go | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 75b11237f..c49b537a5 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -190,7 +190,24 @@ func (w *Watchdog) loop() { // runTurn runs a single pass over all tasks and reports anything it finds. func (w *Watchdog) runTurn() { - tasks := w.k.TaskSet().Root.Tasks() + // Someone needs to watch the watchdog. The call below can get stuck if there + // is a deadlock affecting root's PID namespace mutex. Run it in a goroutine + // and report if it takes too long to return. 
+ var tasks []*kernel.Task + done := make(chan struct{}) + go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R. + tasks = w.k.TaskSet().Root.Tasks() + close(done) + }() + + select { + case <-done: + case <-time.After(w.taskTimeout): + // Report if the watchdog is not making progress. + // No one is watching the watchdog watcher though. + w.reportStuckWatchdog() + <-done + } newOffenders := make(map[*kernel.Task]*offender) newTaskFound := false @@ -245,7 +262,16 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime))) } buf.WriteString("Search for '(*Task).run(0x..., 0x)' in the stack dump to find the offending goroutine") + w.onStuckTask(newTaskFound, &buf) +} + +func (w *Watchdog) reportStuckWatchdog() { + var buf bytes.Buffer + buf.WriteString("Watchdog goroutine is stuck:\n") + w.onStuckTask(true, &buf) +} + +func (w *Watchdog) onStuckTask(newTaskFound bool, buf *bytes.Buffer) { switch w.timeoutAction { case LogWarning: // Dump stack only if a new task is detected or if it sometime has passed since -- cgit v1.2.3 From eaac94d91c28b745c51c33dd352ed9bfdd671b8c Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 20 Nov 2018 22:55:41 -0800 Subject: Use RET_KILL_PROCESS if available in kernel RET_KILL_THREAD doesn't work well for Go because it will kill only the offending thread and leave the process hanging. RET_TRAP can be masked out and it's not guaranteed to kill the process. RET_KILL_PROCESS is available since 4.14. For older kernel, continue to use RET_TRAP as this is the best option (likely to kill process, easy to debug).
PiperOrigin-RevId: 222357867 Change-Id: Icc1d7d731274b16c2125b7a1ba4f7883fbdb2cbd --- pkg/abi/linux/seccomp.go | 12 +++--- pkg/seccomp/seccomp.go | 52 ++++++++++++++++++++++---- pkg/seccomp/seccomp_test.go | 20 +++++++--- pkg/seccomp/seccomp_test_victim.go | 2 +- pkg/seccomp/seccomp_unsafe.go | 30 ++++++++++++--- pkg/sentry/kernel/seccomp.go | 4 +- pkg/sentry/platform/ptrace/subprocess_linux.go | 2 +- runsc/boot/filter/filter.go | 3 +- runsc/fsgofer/filter/filter.go | 3 +- 9 files changed, 95 insertions(+), 33 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index 9963ceeba..5ec01cc4a 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -19,17 +19,19 @@ const ( SECCOMP_MODE_NONE = 0 SECCOMP_MODE_FILTER = 2 - SECCOMP_RET_KILL = 0x00000000 - SECCOMP_RET_TRAP = 0x00030000 - SECCOMP_RET_ERRNO = 0x00050000 - SECCOMP_RET_TRACE = 0x7ff00000 - SECCOMP_RET_ALLOW = 0x7fff0000 + SECCOMP_RET_KILL_PROCESS = 0x80000000 + SECCOMP_RET_KILL_THREAD = 0x00000000 + SECCOMP_RET_TRAP = 0x00030000 + SECCOMP_RET_ERRNO = 0x00050000 + SECCOMP_RET_TRACE = 0x7ff00000 + SECCOMP_RET_ALLOW = 0x7fff0000 SECCOMP_RET_ACTION = 0x7fff0000 SECCOMP_RET_DATA = 0x0000ffff SECCOMP_SET_MODE_FILTER = 1 SECCOMP_FILTER_FLAG_TSYNC = 1 + SECCOMP_GET_ACTION_AVAIL = 2 ) const ( diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 1dfbf749e..9d714d02d 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -33,17 +33,42 @@ const ( defaultLabel = "default_action" ) +func actionName(a uint32) string { + switch a { + case linux.SECCOMP_RET_KILL_PROCESS: + return "kill process" + case linux.SECCOMP_RET_TRAP: + return "trap" + } + panic(fmt.Sprintf("invalid action: %d", a)) +} + // Install generates BPF code based on the set of syscalls provided. It only -// allows syscalls that conform to the specification and generates SIGSYS -// trap unless kill is set. +// allows syscalls that conform to the specification. 
Syscalls that violate the +// specification will trigger RET_KILL_PROCESS, except for the cases below. +// +// RET_TRAP is used in violations, instead of RET_KILL_PROCESS, in the +// following cases: +// 1. Kernel doesn't support RET_KILL_PROCESS: RET_KILL_THREAD only kills the +// offending thread and often keeps the sentry hanging. +// 2. Debug: RET_TRAP generates a panic followed by a stack trace which is +// much easier to debug than RET_KILL_PROCESS which can't be caught. // -// This is a convenience wrapper around BuildProgram and SetFilter. -func Install(rules SyscallRules, kill bool) error { - log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill) - defaultAction := uint32(linux.SECCOMP_RET_TRAP) - if kill { - defaultAction = uint32(linux.SECCOMP_RET_KILL) +// Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored, +// making it possible for the process to continue running after a violation. +// However, it will leave a SECCOMP audit event trail behind. In any case, the +// syscall is still blocked from executing. +func Install(rules SyscallRules) error { + defaultAction, err := defaultAction() + if err != nil { + return err } + + // Uncomment to get stack trace when there is a violation. + // defaultAction = uint32(linux.SECCOMP_RET_TRAP) + + log.Infof("Installing seccomp filters for %d syscalls (action=%s)", len(rules), actionName(defaultAction)) + instrs, err := BuildProgram([]RuleSet{ RuleSet{ Rules: rules, @@ -70,6 +95,17 @@ func Install(rules SyscallRules, kill bool) error { return nil } +func defaultAction() (uint32, error) { + available, err := isKillProcessAvailable() + if err != nil { + return 0, err + } + if available { + return uint32(linux.SECCOMP_RET_KILL_PROCESS), nil + } + return uint32(linux.SECCOMP_RET_TRAP), nil +} + // RuleSet is a set of rules and associated action. 
type RuleSet struct { Rules SyscallRules diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 226f30b7b..f2b903e42 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -121,7 +121,7 @@ func TestBasic(t *testing.T) { Action: linux.SECCOMP_RET_TRAP, }, }, - defaultAction: linux.SECCOMP_RET_KILL, + defaultAction: linux.SECCOMP_RET_KILL_THREAD, specs: []spec{ { desc: "Multiple rulesets allowed (1a)", @@ -141,7 +141,7 @@ func TestBasic(t *testing.T) { { desc: "Multiple rulesets allowed (2)", data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64}, - want: linux.SECCOMP_RET_KILL, + want: linux.SECCOMP_RET_KILL_THREAD, }, }, }, @@ -431,15 +431,23 @@ func TestRealDeal(t *testing.T) { t.Errorf("victim was not killed as expected, output: %s", out) continue } + // Depending on kernel version, either RET_TRAP or RET_KILL_PROCESS is + // used. RET_TRAP dumps reason for exit in output, while RET_KILL_PROCESS + // returns SIGSYS as exit status. + if !strings.Contains(string(out), test.want) && + !strings.Contains(err.Error(), test.want) { + t.Errorf("Victim error is wrong, got: %v, err: %v, want: %v", string(out), err, test.want) + continue + } } else { if err != nil { t.Errorf("victim failed to execute, err: %v", err) continue } - } - if !strings.Contains(string(out), test.want) { - t.Errorf("Victim output is wrong, got: %v, want: %v", err, test.want) - continue + if !strings.Contains(string(out), test.want) { + t.Errorf("Victim output is wrong, got: %v, want: %v", string(out), test.want) + continue + } } } } diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index 007038273..dd5ed0041 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -106,7 +106,7 @@ func main() { } } - if err := seccomp.Install(syscalls, false); err != nil { + if err := seccomp.Install(syscalls); err != nil { fmt.Printf("Failed to install seccomp: %v", err) os.Exit(1) } diff --git 
a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index dd009221a..a31c6471d 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -36,22 +36,40 @@ type sockFprog struct { // //go:nosplit func SetFilter(instrs []linux.BPFInstruction) syscall.Errno { - // SYS_SECCOMP is not available in syscall package. - const SYS_SECCOMP = 317 - // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details. if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 { return errno } - // TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available. sockProg := sockFprog{ Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } - if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); errno != 0 { - return errno + return seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) +} + +func isKillProcessAvailable() (bool, error) { + action := uint32(linux.SECCOMP_RET_KILL_PROCESS) + if errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 { + // EINVAL: SECCOMP_GET_ACTION_AVAIL not in this kernel yet. + // EOPNOTSUPP: SECCOMP_RET_KILL_PROCESS not supported. + if errno == syscall.EINVAL || errno == syscall.EOPNOTSUPP { + return false, nil + } + return false, errno } + return true, nil +} +// seccomp calls seccomp(2). This is safe to call from an afterFork context. +// +//go:nosplit +func seccomp(op, flags uint32, ptr unsafe.Pointer) syscall.Errno { + // SYS_SECCOMP is not available in syscall package. 
+ const SYS_SECCOMP = 317 + + if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)); errno != 0 { + return errno + } return 0 } diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 433b900c7..d6dc45bbd 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -117,7 +117,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u // "Results in the system call being executed." return seccompResultAllow - case linux.SECCOMP_RET_KILL: + case linux.SECCOMP_RET_KILL_THREAD: // "Results in the task exiting immediately without executing the // system call. The exit status of the task will be SIGSYS, not // SIGKILL." @@ -155,7 +155,7 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) if err != nil { t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) - thisRet = linux.SECCOMP_RET_KILL + thisRet = linux.SECCOMP_RET_KILL_THREAD } // "If multiple filters exist, the return value for the evaluation of a // given system call will always use the highest precedent value." - diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 885ba4b2e..25b8e8cb7 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -115,7 +115,7 @@ func createStub() (*thread, error) { var defaultAction uint32 if probeSeccomp() { log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") - defaultAction = uint32(linux.SECCOMP_RET_KILL) + defaultAction = uint32(linux.SECCOMP_RET_KILL_THREAD) } else { // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. 
log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index dc7294b1d..d69a6a2cc 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -57,8 +57,7 @@ func Install(opt Options) error { return fmt.Errorf("unknown platform type %T", p) } - // TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported. - return seccomp.Install(s, false) + return seccomp.Install(s) } // Report writes a warning message to the log. diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index f50b6bc87..c120d57a6 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -29,6 +29,5 @@ func Install() error { // when not enabled. s.Merge(instrumentationFilters()) - // TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported. - return seccomp.Install(s, false) + return seccomp.Install(s) } -- cgit v1.2.3 From b3b60ea29adf9415c9c7b98ba331dacd92f231b7 Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Mon, 26 Nov 2018 09:49:53 -0800 Subject: Implementation of preadv2 for Linux 4.4 support Implement RWF_HIPRI (4.6) silently passes the read call. Implement -1 offset calls readv. PiperOrigin-RevId: 222840324 Change-Id: If9ddc1e8d086e1a632bdf5e00bae08205f95b6b0 --- pkg/abi/linux/file.go | 7 ++++ pkg/sentry/syscalls/linux/linux64.go | 3 ++ pkg/sentry/syscalls/linux/sys_read.go | 62 +++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 72e5c6f83..8d48e1753 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -150,6 +150,13 @@ const ( PermissionsMask = 0777 ) +// Values for preadv2/pwritev2. +const ( + RWF_HIPRI = 0x0001 + RWF_DSYNC = 0X0002 + RWF_SYNC = 0x0004 +) + // Stat represents struct stat. 
type Stat struct { Dev uint64 diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 9912ab2b5..2aab948da 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -373,7 +373,10 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO + // Syscalls after 325 are backports from 4.6. 325: syscalls.Error(nil), // Mlock2, TODO + 327: Preadv2, + // 328: Pwritev2, // Pwritev2, TODO }, Emulate: map[usermem.Addr]uintptr{ diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index b2e5a5449..cbb9eb9f8 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -187,6 +187,68 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) } +// Preadv2 implements linux syscall preadv2(2). +func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := int(args[4].Int()) + + validFlags := linux.RWF_HIPRI + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Is reading at an offset supported? + if offset > -1 && !file.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check flags field. + if flags != 0 { + if flags&^validFlags != 0 { + return 0, nil, syserror.EINVAL + } + // RWF_HIPRI must be called on a file with O_DIRECT flag set. 
+ if flags&linux.RWF_HIPRI != 0 && !file.Flags().Direct { + return 0, nil, syserror.EINVAL + } + } + + // Read the iovecs that specify the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + // If preadv2 is called with an offset of -1, readv is called. + if offset == -1 { + n, err := readv(t, file, dst) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) + } + + n, err := preadv(t, file, dst, offset) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) +} + func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { n, err := f.Readv(t, dst) if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { -- cgit v1.2.3 From 9e0f13237793897c805f75af163006049b37e784 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Tue, 27 Nov 2018 12:45:04 -0800 Subject: Add procid support for arm64 platform Change-Id: I7c3db8dfdf95a125d7384c1d67c3300dbb99a47e PiperOrigin-RevId: 223039923 --- pkg/sentry/platform/procid/BUILD | 1 + pkg/sentry/platform/procid/procid_arm64.s | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 pkg/sentry/platform/procid/procid_arm64.s (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/sentry/platform/procid/BUILD index d3398d1e8..20c8bc02c 100644 --- a/pkg/sentry/platform/procid/BUILD +++ b/pkg/sentry/platform/procid/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "procid.go", "procid_amd64.s", + "procid_arm64.s", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid", visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/sentry/platform/procid/procid_arm64.s new file mode 100644 index 000000000..be65d0db0 --- /dev/null +++ 
b/pkg/sentry/platform/procid/procid_arm64.s @@ -0,0 +1,29 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 +// +build go1.8 +// +build !go1.12 + +#include "textflag.h" + +TEXT ·Current(SB),NOSPLIT,$0-8 + // The offset specified here is the m_procid offset for Go1.8+. + // Changes to this offset should be caught by the tests, and major + // version changes require an explicit tag change above. + MOVD g, R0 // g + MOVD 48(R0), R0 // g_m (may change in future versions) + MOVD 72(R0), R0 // m_procid (may change in future versions) + MOVD R0, ret+0(FP) + RET -- cgit v1.2.3 From 5bd02b224fd0eb81fc028644137a24d0bbf7dab5 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 27 Nov 2018 17:47:16 -0800 Subject: Save shutdown flags first. With rpcinet if shutdown flags are not saved before making the rpc a race is possible where blocked threads are woken up before the flags have been persisted. This would mean that threads can block indefinitely in a recvmsg after a shutdown(SHUT_RD) has happened. 
PiperOrigin-RevId: 223089783 Change-Id: If595e7add12aece54bcdf668ab64c570910d061a --- pkg/sentry/socket/rpcinet/socket.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 788d853c9..7328661ab 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -354,6 +354,13 @@ func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { // Shutdown implements socket.Socket.Shutdown. func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { + // We save the shutdown state because of strange differences on linux + // related to recvs on blocking vs. non-blocking sockets after a SHUT_RD. + // We need to emulate that behavior on the blocking side. + // TODO: There is a possible race that can exist with loopback, + // where data could possibly be lost. + s.setShutdownFlags(how) + stack := t.NetworkContext().(*Stack) id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Shutdown{&pb.ShutdownRequest{Fd: s.fd, How: int64(how)}}}, false /* ignoreResult */) <-c @@ -362,10 +369,6 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { return syserr.FromHost(syscall.Errno(e)) } - // We save the shutdown state because of strange differences on linux - // related to recvs on blocking vs. non-blocking sockets after a SHUT_RD. - // We need to emulate that behavior on the blocking side. - s.setShutdownFlags(how) return nil } -- cgit v1.2.3 From 573622fdcaa5c016d3e047353c729ca73d211c0e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 27 Nov 2018 18:16:18 -0800 Subject: Fix data race in fs.Async. Replaces the WaitGroup with a RWMutex. Calls to Async hold the mutex for reading, while AsyncBarrier takes the lock for writing. This ensures that all executing Async work finishes before AsyncBarrier returns. 
Also pushes the Async() call from Inode.Release into gofer/InodeOperations.Release(). This removes a recursive Async call which should not have been allowed in the first place. The gofer Release call is the slow one (since it may make RPCs to the gofer), so putting the Async call there makes sense. PiperOrigin-RevId: 223093067 Change-Id: I116da7b20fce5ebab8d99c2ab0f27db7c89d890e --- pkg/sentry/fs/fs.go | 24 ++++++++++++++---------- pkg/sentry/fs/gofer/inode.go | 8 +++++++- pkg/sentry/fs/inode.go | 13 +++---------- 3 files changed, 24 insertions(+), 21 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index b5c72990e..0ba4b7269 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -44,7 +44,7 @@ // DirentCache.mu // Locks in InodeOperations implementations or overlayEntry // Inode.Watches.mu (see `Inotify` for other lock ordering) -// MountSource.mu +// MountSource.mu // // If multiple Dirent or MountSource locks must be taken, locks in the parent must be // taken before locks in their children. @@ -60,10 +60,11 @@ import ( ) var ( - // work is a sync.WaitGroup that can be used to queue asynchronous - // operations via Do. Callers can use Barrier to ensure no operations - // are outstanding. - work sync.WaitGroup + // workMu is used to synchronize pending asynchronous work. Async work + // runs with the lock held for reading. AsyncBarrier will take the lock + // for writing, thus ensuring that all Async work completes before + // AsyncBarrier returns. + workMu sync.RWMutex // asyncError is used to store up to one asynchronous execution error. asyncError = make(chan error, 1) @@ -71,14 +72,17 @@ var ( // AsyncBarrier waits for all outstanding asynchronous work to complete. func AsyncBarrier() { - work.Wait() + workMu.Lock() + workMu.Unlock() } // Async executes a function asynchronously. +// +// Async must not be called recursively. 
func Async(f func()) { - work.Add(1) - go func() { // S/R-SAFE: Barrier must be called. - defer work.Done() // Ensure Done in case of panic. + workMu.RLock() + go func() { // S/R-SAFE: AsyncBarrier must be called. + defer workMu.RUnlock() // Ensure RUnlock in case of panic. f() }() } @@ -89,7 +93,7 @@ func Async(f func()) { func AsyncErrorBarrier() error { wait := make(chan struct{}, 1) go func() { // S/R-SAFE: Does not touch persistent state. - work.Wait() + AsyncBarrier() wait <- struct{}{} }() select { diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 5811b8b12..7c6e5b025 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -333,8 +333,14 @@ func (i *inodeOperations) session() *session { // Release implements fs.InodeOperations.Release. func (i *inodeOperations) Release(ctx context.Context) { - i.fileState.Release(ctx) i.cachingInodeOps.Release() + + // Releasing the fileState may make RPCs to the gofer. There is + // no need to wait for those to return, so we can do this + // asynchronously. + fs.Async(func() { + i.fileState.Release(ctx) + }) } // Mappable implements fs.InodeOperations.Mappable. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 38b140bd2..fa3beb111 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -110,20 +110,13 @@ func (i *Inode) destroy() { // wouldn't be in the destructor. i.Watches.targetDestroyed() - // Overlay resources should be released synchronously, since they may - // trigger more Inode.destroy calls which must themselves be handled - // synchronously, like the WriteOut call above. if i.overlay != nil { i.overlay.release() - i.MountSource.DecRef() - return + } else { + i.InodeOperations.Release(ctx) } - // Regular (non-overlay) resources may be released asynchronously. - Async(func() { - i.InodeOperations.Release(ctx) - i.MountSource.DecRef() - }) + i.MountSource.DecRef() } // Mappable calls i.InodeOperations.Mappable. 
-- cgit v1.2.3 From 5560615c531bc2a0108a4db1e9877f0397a69f8f Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 3 Dec 2018 17:02:28 -0800 Subject: Return an int32 for netlink SO_RCVBUF Untyped integer constants default to type int and the binary package will panic if one tries to encode an int. PiperOrigin-RevId: 223890001 Change-Id: Iccc3afd6d74bad24c35d764508e450fd317b76ec --- pkg/sentry/socket/netlink/socket.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index b1f6620de..c4798839e 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -298,7 +298,7 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (in return nil, syserr.ErrInvalidArgument } // We don't have limit on receiving size. - return math.MaxInt32, nil + return int32(math.MaxInt32), nil default: socket.GetSockOptEmitUnimplementedEvent(t, name) -- cgit v1.2.3 From 54dd0d0dc5ee452890628c537e6ebd1ac8c9d699 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 4 Dec 2018 12:23:08 -0800 Subject: Fix data race caused by unlocked call of Dirent.descendantOf. PiperOrigin-RevId: 224025363 Change-Id: I98864403c779832e9e1436f7d3c3f6fb2fba9904 --- pkg/sentry/fs/dirent.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 2c01485a8..4c0d1b7ef 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -840,6 +840,10 @@ func (d *Dirent) getDotAttrs(root *Dirent) (DentAttr, DentAttr) { InodeID: sattr.InodeID, } + // Hold d.mu while we call d.descendantOf. + d.mu.Lock() + defer d.mu.Unlock() + // Get '..'. if !d.IsRoot() && d.descendantOf(root) { // Dirent is a descendant of the root. Get its parent's attrs. 
-- cgit v1.2.3 From 806e346491503e0292bcee8bf15d74bbf42e2a10 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Tue, 4 Dec 2018 13:45:11 -0800 Subject: Fix mempolicy_test on bazel. Bazel runs multiple test cases on the same thread. Some of the test cases rely on the test thread starting with the default memory policy, while other tests modify the test thread's memory policy. This obviously breaks when the test framework doesn't run each test case on a new thread. Also fixing an incompatibility where set_mempolicy(2) was prevented from specifying an empty nodemask, which is allowed for some modes. PiperOrigin-RevId: 224038957 Change-Id: Ibf780766f2706ebc9b129dbc8cf1b85c2a275074 --- pkg/sentry/syscalls/linux/sys_mmap.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index f8d9c43fd..145f7846c 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -308,28 +308,31 @@ func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. nodemask := args[1].Pointer() maxnode := args[2].Uint() - if maxnode < 1 { + if nodemask != 0 && maxnode < 1 { return 0, nil, syserror.EINVAL } if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS { - // Can't specify multiple modes simultaneously. Must also contain a - // valid mode, which we check below. + // Can't specify multiple modes simultaneously. return 0, nil, syserror.EINVAL } mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS if mode < 0 || mode >= linux.MPOL_MAX { + // Must specify a valid mode. return 0, nil, syserror.EINVAL } var nodemaskVal uint32 - if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil { - return 0, nil, syserror.EFAULT + // Nodemask may be empty for some policy modes. 
+ if nodemask != 0 && maxnode > 0 { + if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } } - // When setting MPOL_INTERLEAVE, nodemask must not be empty. - if mode == linux.MPOL_INTERLEAVE && nodemaskVal == 0 { + if (mode == linux.MPOL_INTERLEAVE || mode == linux.MPOL_BIND) && nodemaskVal == 0 { + // Mode requires a non-empty nodemask, but got an empty nodemask. return 0, nil, syserror.EINVAL } -- cgit v1.2.3 From 5a6a1eb420620c3d41a9db4ddf7ac7b163310f09 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 4 Dec 2018 14:27:46 -0800 Subject: Enforce name length restriction on paths. NAME_LENGTH must be enforced per component. PiperOrigin-RevId: 224046749 Change-Id: Iba8105b00d951f2509dc768af58e4110dafbe1c9 --- pkg/sentry/fs/dirent.go | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4c0d1b7ef..4c0482036 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -458,6 +458,12 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl if !IsDir(d.Inode.StableAttr) { return nil, syscall.ENOTDIR } + + // The component must be less than NAME_MAX. + if len(name) > linux.NAME_MAX { + return nil, syscall.ENAMETOOLONG + } + if name == "" || name == "." { d.IncRef() return d, nil -- cgit v1.2.3 From adafc08d7cee594ea94abefbedf67ea315922550 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Tue, 4 Dec 2018 14:29:56 -0800 Subject: sentry: save / restore netstack procfs configuration. 
PiperOrigin-RevId: 224047120 Change-Id: Ia6cb17fa978595cd73857b6178c4bdba401e185e --- pkg/sentry/fs/proc/BUILD | 1 + pkg/sentry/fs/proc/net.go | 4 ++++ pkg/sentry/fs/proc/sys_net.go | 40 +++++++++++++++++++++++-------------- pkg/sentry/fs/proc/sys_net_state.go | 33 ++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 15 deletions(-) create mode 100644 pkg/sentry/fs/proc/sys_net_state.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 2d9f07f2f..aff3c3c01 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -20,6 +20,7 @@ go_library( "stat.go", "sys.go", "sys_net.go", + "sys_net_state.go", "task.go", "uid_gid_map.go", "uptime.go", diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 45f2a1211..3ff60aa5b 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -54,6 +54,8 @@ func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { } // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. +// +// +stateify savable type ifinet6 struct { s inet.Stack } @@ -108,6 +110,8 @@ func (n *ifinet6) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } // netDev implements seqfile.SeqSource for /proc/net/dev. 
+// +// +stateify savable type netDev struct { s inet.Stack } diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 801eb6a1e..b50d43d70 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -36,7 +36,7 @@ const ( // +stateify savable type tcpMem struct { ramfs.Entry - s inet.Stack + s inet.Stack `state:"wait"` size inet.TCPBufferSize dir tcpMemDir } @@ -81,30 +81,33 @@ func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, buf := []int32{int32(m.size.Min), int32(m.size.Default), int32(m.size.Max)} n, cperr := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) - size := inet.TCPBufferSize{ + m.size = inet.TCPBufferSize{ Min: int(buf[0]), Default: int(buf[1]), Max: int(buf[2]), } - var err error + if err := m.writeSize(); err != nil { + return n, err + } + return n, cperr +} + +func (m *tcpMem) writeSize() error { switch m.dir { case tcpRMem: - err = m.s.SetTCPReceiveBufferSize(size) + return m.s.SetTCPReceiveBufferSize(m.size) case tcpWMem: - err = m.s.SetTCPSendBufferSize(size) + return m.s.SetTCPSendBufferSize(m.size) default: panic(fmt.Sprintf("unknown tcpMem.dir: %v", m.dir)) } - if err != nil { - return n, err - } - return n, cperr } // +stateify savable type tcpSack struct { ramfs.Entry - s inet.Stack + s inet.Stack `state:"wait"` + enabled *bool } func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { @@ -124,13 +127,16 @@ func (s *tcpSack) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, return 0, io.EOF } - sack, err := s.s.TCPSACKEnabled() - if err != nil { - return 0, err + if s.enabled == nil { + sack, err := s.s.TCPSACKEnabled() + if err != nil { + return 0, err + } + s.enabled = &sack } val := "0\n" - if sack { + if *s.enabled { // Technically, this is not quite compatible with Linux. Linux // stores these as an integer, so if you write "2" into // tcp_sack, you should get 2 back. Tough luck. 
@@ -157,7 +163,11 @@ func (s *tcpSack) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, if err != nil { return n, err } - return n, s.s.SetTCPSACKEnabled(v != 0) + if s.enabled == nil { + s.enabled = new(bool) + } + *s.enabled = v != 0 + return n, s.s.SetTCPSACKEnabled(*s.enabled) } func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go new file mode 100644 index 000000000..7f46776c0 --- /dev/null +++ b/pkg/sentry/fs/proc/sys_net_state.go @@ -0,0 +1,33 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import "fmt" + +// afterLoad is invoked by stateify. +func (m *tcpMem) afterLoad() { + if err := m.writeSize(); err != nil { + panic(fmt.Sprintf("failed to write previous TCP send / receive buffer sizes [%v]: %v", m.size, err)) + } +} + +// afterLoad is invoked by stateify. +func (s *tcpSack) afterLoad() { + if s.enabled != nil { + if err := s.s.SetTCPSACKEnabled(*s.enabled); err != nil { + panic(fmt.Sprintf("failed to set previous TCP sack configuration [%v]: %v", *s.enabled, err)) + } + } +} -- cgit v1.2.3 From 82719be42e636f86780d21b01e10ecb2c9a25e53 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 4 Dec 2018 14:31:08 -0800 Subject: Max link traversals should be for an entire path. 
The number of symbolic links that are allowed to be followed are for a full path and not just a chain of symbolic links. PiperOrigin-RevId: 224047321 Change-Id: I5e3c4caf66a93c17eeddcc7f046d1e8bb9434a40 --- pkg/sentry/fs/copy_up_test.go | 3 ++- pkg/sentry/fs/host/fs.go | 3 ++- pkg/sentry/fs/host/fs_test.go | 3 ++- pkg/sentry/fs/inode_overlay_test.go | 3 ++- pkg/sentry/fs/mount_test.go | 10 +++++++--- pkg/sentry/fs/mounts.go | 22 ++++++++++++---------- pkg/sentry/fs/mounts_test.go | 6 ++++-- pkg/sentry/fs/ramfs/tree_test.go | 3 ++- pkg/sentry/kernel/kernel.go | 6 ++++-- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/loader/elf.go | 2 +- pkg/sentry/loader/loader.go | 10 +++++----- pkg/sentry/socket/unix/unix.go | 6 ++++-- pkg/sentry/syscalls/linux/sys_file.go | 11 +++++++---- pkg/sentry/syscalls/linux/sys_thread.go | 3 ++- runsc/boot/fs.go | 12 ++++++++---- runsc/boot/loader_test.go | 3 ++- 17 files changed, 67 insertions(+), 41 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 64f030f72..fcba14ed4 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -166,7 +166,8 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { // Walk to all of the files in the overlay, open them readable. for _, f := range files { - d, err := mns.FindInode(ctx, mns.Root(), mns.Root(), f.name, 0) + maxTraversals := uint(0) + d, err := mns.FindInode(ctx, mns.Root(), mns.Root(), f.name, &maxTraversals) if err != nil { t.Fatalf("failed to find %q: %v", f.name, err) } diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index fec890964..54cbb94f9 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -170,7 +170,8 @@ func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) current := paths[i][:j] // Lookup the given component in the tree. 
- d, err := m.FindLink(ctx, root, nil, current, maxTraversals) + remainingTraversals := uint(maxTraversals) + d, err := m.FindLink(ctx, root, nil, current, &remainingTraversals) if err != nil { log.Warningf("populate failed for %q: %v", current, err) continue diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index e69559aac..44db61ecd 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -150,7 +150,8 @@ func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base stri root := m.Root() defer root.DecRef() - d, err := m.FindLink(ctx, root, nil, base, 1) + maxTraversals := uint(1) + d, err := m.FindLink(ctx, root, nil, base, &maxTraversals) if err != nil { t.Logf("FindLink failed for %q", base) return paths, err diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index bba20da14..acdb2b4f8 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -324,7 +324,8 @@ func TestCacheFlush(t *testing.T) { for _, fileName := range []string{upperFileName, lowerFileName} { // Walk to the file. 
- dirent, err := mns.FindInode(ctx, root, nil, fileName, 0) + maxTraversals := uint(0) + dirent, err := mns.FindInode(ctx, root, nil, fileName, &maxTraversals) if err != nil { t.Fatalf("FindInode(%q) failed: %v", fileName, err) } diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index a1c9f4f79..269d6b9da 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -115,8 +115,10 @@ func TestMountSourceParentChildRelationship(t *testing.T) { "/waldo", } + var maxTraversals uint for _, p := range paths { - d, err := mm.FindLink(ctx, rootDirent, nil, p, 0) + maxTraversals = 0 + d, err := mm.FindLink(ctx, rootDirent, nil, p, &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", p, err) } @@ -164,7 +166,8 @@ func TestMountSourceParentChildRelationship(t *testing.T) { } // "foo" mount should have two children: /foo/bar, and /foo/qux. - d, err := mm.FindLink(ctx, rootDirent, nil, "/foo", 0) + maxTraversals = 0 + d, err := mm.FindLink(ctx, rootDirent, nil, "/foo", &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) } @@ -185,7 +188,8 @@ func TestMountSourceParentChildRelationship(t *testing.T) { } // "waldo" mount should have no submounts or children. - waldo, err := mm.FindLink(ctx, rootDirent, nil, "/waldo", 0) + maxTraversals = 0 + waldo, err := mm.FindLink(ctx, rootDirent, nil, "/waldo", &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 7c5348cce..f6f7be0aa 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -350,7 +350,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly // // Precondition: root must be non-nil. // Precondition: the path must be non-empty. 
-func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { +func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, remainingTraversals *uint) (*Dirent, error) { if root == nil { panic("MountNamespace.FindLink: root must not be nil") } @@ -419,7 +419,7 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // // See resolve for reference semantics; on err next // will have one dropped. - current, err = mns.resolve(ctx, root, next, maxTraversals) + current, err = mns.resolve(ctx, root, next, remainingTraversals) if err != nil { return nil, err } @@ -439,15 +439,15 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // FindInode is identical to FindLink except the return value is resolved. // //go:nosplit -func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { - d, err := mns.FindLink(ctx, root, wd, path, maxTraversals) +func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, remainingTraversals *uint) (*Dirent, error) { + d, err := mns.FindLink(ctx, root, wd, path, remainingTraversals) if err != nil { return nil, err } // See resolve for reference semantics; on err d will have the // reference dropped. - return mns.resolve(ctx, root, d, maxTraversals) + return mns.resolve(ctx, root, d, remainingTraversals) } // resolve resolves the given link. @@ -458,14 +458,14 @@ func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path // If not successful, a reference is _also_ dropped on the node and an error // returned. This is for convenience in using resolve directly as a return // value. 
-func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxTraversals uint) (*Dirent, error) { +func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, remainingTraversals *uint) (*Dirent, error) { // Resolve the path. target, err := node.Inode.Getlink(ctx) switch err { case nil: // Make sure we didn't exhaust the traversal budget. - if maxTraversals == 0 { + if *remainingTraversals == 0 { target.DecRef() return nil, syscall.ELOOP } @@ -481,7 +481,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxT defer node.DecRef() // See above. // First, check if we should traverse. - if maxTraversals == 0 { + if *remainingTraversals == 0 { return nil, syscall.ELOOP } @@ -492,7 +492,8 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxT } // Find the node; we resolve relative to the current symlink's parent. - d, err := mns.FindInode(ctx, root, node.parent, targetPath, maxTraversals-1) + *remainingTraversals-- + d, err := mns.FindInode(ctx, root, node.parent, targetPath, remainingTraversals) if err != nil { return nil, err } @@ -544,7 +545,8 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s defer root.DecRef() for _, p := range paths { binPath := path.Join(p, name) - d, err := mns.FindInode(ctx, root, nil, binPath, linux.MaxSymlinkTraversals) + traversals := uint(linux.MaxSymlinkTraversals) + d, err := mns.FindInode(ctx, root, nil, binPath, &traversals) if err == syserror.ENOENT || err == syserror.EACCES { // Didn't find it here. 
continue diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index cc7c32c9b..2f7a1710f 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -77,7 +77,8 @@ func TestFindLink(t *testing.T) { {"bar", foo, "/foo/bar"}, } { wdPath, _ := tc.wd.FullName(root) - if d, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, 0); err != nil { + maxTraversals := uint(0) + if d, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, &maxTraversals); err != nil { t.Errorf("FindLink(%q, wd=%q) failed: %v", tc.findPath, wdPath, err) } else if got, _ := d.FullName(root); got != tc.wantPath { t.Errorf("FindLink(%q, wd=%q) got dirent %q, want %q", tc.findPath, wdPath, got, tc.wantPath) @@ -95,7 +96,8 @@ func TestFindLink(t *testing.T) { {"foo", foo}, } { wdPath, _ := tc.wd.FullName(root) - if _, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, 0); err == nil { + maxTraversals := uint(0) + if _, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, &maxTraversals); err == nil { t.Errorf("FindLink(%q, wd=%q) did not return error", tc.findPath, wdPath) } } diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index d5567d9e1..54df2143c 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -70,7 +70,8 @@ func TestMakeDirectoryTree(t *testing.T) { defer mm.DecRef() for _, p := range test.subdirs { - if _, err := mm.FindInode(ctx, root, nil, p, 0); err != nil { + maxTraversals := uint(0) + if _, err := mm.FindInode(ctx, root, nil, p, &maxTraversals); err != nil { t.Errorf("%s: failed to find node %s: %v", test.name, p, err) break } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 17425e656..cb61e27f1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -634,10 +634,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, args.Root = nil // Grab the working directory. 
+ remainingTraversals := uint(args.MaxSymlinkTraversals) wd := root // Default. if args.WorkingDirectory != "" { var err error - wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals) + wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } @@ -656,7 +657,8 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, } // Create a fresh task context. - tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + remainingTraversals = uint(args.MaxSymlinkTraversals) + tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) if err != nil { return nil, 0, err } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 45b8d2b04..aaff309f0 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -142,7 +142,7 @@ func (t *Task) Stack() *arch.Stack { // * argv: Binary argv // * envv: Binary envv // * fs: Binary FeatureSet -func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) { +func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) { // Prepare a new user address space to load into. 
m := mm.NewMemoryManager(k) defer m.DecUsers(ctx) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 9b1e81dc9..385ad0102 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -610,7 +610,7 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, in // // Preconditions: // * f is an ELF file -func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { +func loadELF(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { bin, ac, err := loadInitialELF(ctx, m, fs, f) if err != nil { ctx.Infof("Error loading binary: %v", err) diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index d1417c4f1..69a090844 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -55,7 +55,7 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // installed in the Task FDMap. The caller takes ownership of both. // // name must be a readable, executable, regular file. 
-func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, name string) (*fs.Dirent, *fs.File, error) { +func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, name string) (*fs.Dirent, *fs.File, error) { if name == "" { ctx.Infof("cannot open empty name") return nil, nil, syserror.ENOENT @@ -136,9 +136,9 @@ const ( // * arch.Context matching the binary arch // * fs.Dirent of the binary file // * Possibly updated argv -func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { +func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { - d, f, err := openPath(ctx, mounts, root, wd, maxTraversals, filename) + d, f, err := openPath(ctx, mounts, root, wd, remainingTraversals, filename) if err != nil { ctx.Infof("Error opening %s: %v", filename, err) return loadedELF{}, nil, nil, nil, err @@ -163,7 +163,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac switch { case bytes.Equal(hdr[:], []byte(elfMagic)): - loaded, ac, err := loadELF(ctx, m, mounts, root, wd, maxTraversals, fs, f) + loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, fs, f) if err != nil { ctx.Infof("Error loading ELF: %v", err) return loadedELF{}, nil, nil, nil, err @@ -196,7 +196,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac // Preconditions: // * The Task MemoryManager is empty. // * Load is called on the Task goroutine. 
-func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) { +func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) { // Load the binary itself. loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv) if err != nil { diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 334169372..4379486cf 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -266,7 +266,8 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { subPath = "/" } var err error - d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, fs.DefaultTraversalLimit) + remainingTraversals := uint(fs.DefaultTraversalLimit) + d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, &remainingTraversals) if err != nil { // No path available. return syserr.ErrNoSuchFile @@ -314,7 +315,8 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, // Find the node in the filesystem. 
root := t.FSContext().RootDirectory() cwd := t.FSContext().WorkingDirectory() - d, e := t.MountNamespace().FindInode(t, root, cwd, path, fs.DefaultTraversalLimit) + remainingTraversals := uint(fs.DefaultTraversalLimit) + d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals) cwd.DecRef() root.DecRef() if e != nil { diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 89d21dd98..37c90f6fd 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -92,10 +92,11 @@ func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func root := t.FSContext().RootDirectory() // Lookup the node. + remainingTraversals := uint(linux.MaxSymlinkTraversals) if resolve { - d, err = t.MountNamespace().FindInode(t, root, rel, path, linux.MaxSymlinkTraversals) + d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals) } else { - d, err = t.MountNamespace().FindLink(t, root, rel, path, linux.MaxSymlinkTraversals) + d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals) } root.DecRef() if wd != nil { @@ -312,7 +313,8 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod fileFlags.LargeFile = true // Does this file exist already? - targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) + remainingTraversals := uint(linux.MaxSymlinkTraversals) + targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) var newFile *fs.File switch err { case nil: @@ -997,7 +999,8 @@ func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileM } // Does this directory exist already? 
- f, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) + remainingTraversals := uint(linux.MaxSymlinkTraversals) + f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals) switch err { case nil: // The directory existed. diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 9eed613a1..c12693ee2 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -103,7 +103,8 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal defer wd.DecRef() // Load the new TaskContext. - tc, err := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, linux.MaxSymlinkTraversals, filename, argv, envv, t.Arch().FeatureSet()) + maxTraversals := uint(linux.MaxSymlinkTraversals) + tc, err := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, argv, envv, t.Arch().FeatureSet()) if err != nil { return 0, nil, err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 1e355fe4e..1e75b0efc 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -338,7 +338,8 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, ro } } - dirent, err := mns.FindInode(ctx, root, root, m.Destination, 0 /* maxTraversals */) + maxTraversals := uint(0) + dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) if err != nil { return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) } @@ -582,7 +583,8 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf defer globalRoot.DecRef() // Create mount point for the container's rootfs. 
- contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, 0 /* TraversalLimit */) + maxTraversals := uint(0) + contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals) if err != nil { return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err) } @@ -656,7 +658,8 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error mnsRoot := mns.Root() defer mnsRoot.DecRef() containerRoot := path.Join(ChildContainersDir, cid) - containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, 0 /* maxTraversals */) + maxTraversals := uint(0) + containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals) if err == syserror.ENOENT { // Container must have been destroyed already. That's fine. return nil @@ -691,7 +694,8 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error // Get a reference to the parent directory and remove the root // container directory. 
- containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, 0 /* maxTraversals */) + maxTraversals = 0 + containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals) if err != nil { return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index d5cee5608..0ed3002e0 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -406,7 +406,8 @@ func TestCreateMountNamespace(t *testing.T) { root := mm.Root() defer root.DecRef() for _, p := range tc.expectedPaths { - if d, err := mm.FindInode(ctx, root, root, p, 0); err != nil { + maxTraversals := uint(0) + if d, err := mm.FindInode(ctx, root, root, p, &maxTraversals); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { d.DecRef() -- cgit v1.2.3 From 2cab0e82ad8c1e38392b8c35aaa65d1121a9e2b2 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 4 Dec 2018 14:33:34 -0800 Subject: Linkat(2) should sanity check flags. PiperOrigin-RevId: 224047765 Change-Id: I6f3c75b33c32bf8f8910ea3fab35406d7d672d87 --- pkg/sentry/syscalls/linux/sys_file.go | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 37c90f6fd..8673bca0d 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1251,6 +1251,12 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be // dereferenced if it is a symbolic link. flags := args[4].Int() + + // Sanity check flags. 
+ if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 { + return 0, nil, syserror.EINVAL + } + resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH -- cgit v1.2.3 From ffcbda0c8bd772c9019977775daf1d86891c3f28 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 4 Dec 2018 18:14:17 -0800 Subject: Partial writes should loop in rpcinet. FileOperations.Write should return ErrWouldBlock to allow the upper layer to loop and sendmsg should continue writing where it left off on a partial write. PiperOrigin-RevId: 224081631 Change-Id: Ic61f6943ea6b7abbd82e4279decea215347eac48 --- pkg/sentry/socket/rpcinet/socket.go | 39 +++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 7328661ab..90844f10f 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -212,6 +212,11 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } n, err := rpcWrite(t, &pb.SyscallRequest_Write{&pb.WriteRequest{Fd: s.fd, Data: v}}) + if n > 0 && n < uint32(src.NumBytes()) { + // The FileOperations.Write interface expects us to return ErrWouldBlock in + // the event of a partial write. + return int64(n), syserror.ErrWouldBlock + } return int64(n), err.ToError() } @@ -735,19 +740,24 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] // TODO: this needs to change to map directly to a SendMsg syscall // in the RPC. 
- req := &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ + totalWritten := 0 + n, err := rpcSendMsg(t, &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ Fd: uint32(s.fd), Data: v, Address: to, More: flags&linux.MSG_MORE != 0, EndOfRecord: flags&linux.MSG_EOR != 0, - }} + }}) - n, err := rpcSendMsg(t, req) if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 { return int(n), err } + if n > 0 { + totalWritten += int(n) + v.TrimFront(int(n)) + } + // We'll have to block. Register for notification and keep trying to // send all the data. e, ch := waiter.NewChannelEntry(nil) @@ -755,13 +765,30 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] defer s.EventUnregister(&e) for { - n, err := rpcSendMsg(t, req) + n, err := rpcSendMsg(t, &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ + Fd: uint32(s.fd), + Data: v, + Address: to, + More: flags&linux.MSG_MORE != 0, + EndOfRecord: flags&linux.MSG_EOR != 0, + }}) + + if n > 0 { + totalWritten += int(n) + v.TrimFront(int(n)) + + if err == nil && totalWritten < int(src.NumBytes()) { + continue + } + } + if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain { - return int(n), err + // We eat the error in this situation. + return int(totalWritten), nil } if err := t.Block(ch); err != nil { - return 0, syserr.FromError(err) + return int(totalWritten), syserr.FromError(err) } } } -- cgit v1.2.3 From 076f107643fafab30a0d45dd5af49b8bd4b574b9 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 4 Dec 2018 18:52:56 -0800 Subject: Remove initRegs arg from clone It is always the same as t.initRegs. 
PiperOrigin-RevId: 224085550 Change-Id: I5cc4ddc3b481d4748c3c43f6f4bb50da1dbac694 --- pkg/sentry/platform/ptrace/ptrace_unsafe.go | 6 +++--- pkg/sentry/platform/ptrace/subprocess.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 7a3cb8f49..223b23199 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -130,8 +130,8 @@ func (t *thread) getSignalInfo(si *arch.SignalInfo) error { // call attach on it. // // Precondition: the OS thread must be locked and own t. -func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) { - r, ok := usermem.Addr(initRegs.Rsp).RoundUp() +func (t *thread) clone() (*thread, error) { + r, ok := usermem.Addr(t.initRegs.Rsp).RoundUp() if !ok { return nil, syscall.EINVAL } @@ -153,7 +153,7 @@ func (t *thread) clone(initRegs *syscall.PtraceRegs) (*thread, error) { arch.SyscallArgument{}, // We use these registers initially, but really they // could be anything. We're going to stop immediately. - arch.SyscallArgument{Value: uintptr(unsafe.Pointer(initRegs))}) + arch.SyscallArgument{Value: uintptr(unsafe.Pointer(&t.initRegs))}) if err != nil { return nil, err } diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 2cd49d1ec..5e56a1514 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -160,7 +160,7 @@ func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // Wait for requests to create threads. for r := range requests { - t, err := firstThread.clone(&firstThread.initRegs) + t, err := firstThread.clone() if err != nil { // Should not happen: not recoverable. 
panic(fmt.Sprintf("error initializing first thread: %v", err)) -- cgit v1.2.3 From 06131fe749e3715534f9d551528d89048ae1398b Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Wed, 5 Dec 2018 10:52:44 -0800 Subject: Check for CAP_SYS_RESOURCE in prctl(PR_SET_MM, ...) If sys_prctl is called with PR_SET_MM without CAP_SYS_RESOURCE, the syscall should return failure with errno set to EPERM. See: http://man7.org/linux/man-pages/man2/prctl.2.html PiperOrigin-RevId: 224182874 Change-Id: I630d1dd44af8b444dd16e8e58a0764a0cf1ad9a3 --- pkg/sentry/syscalls/linux/sys_prctl.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 91e852049..4938f27bd 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -87,6 +87,10 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } case linux.PR_SET_MM: + if !t.HasCapability(linux.CAP_SYS_RESOURCE) { + return 0, nil, syscall.EPERM + } + switch args[1].Int() { case linux.PR_SET_MM_EXE_FILE: fd := kdefs.FD(args[2].Int()) -- cgit v1.2.3 From 592f5bdc675ae2933919b649b45551c6781c7876 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 5 Dec 2018 12:45:35 -0800 Subject: Add context to mount errors This makes it more obvious why a mount failed. PiperOrigin-RevId: 224203880 Change-Id: I7961774a7b6fdbb5493a791f8b3815c49b8f7631 --- pkg/sentry/fs/overlay.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 036c0f733..8ace4ee64 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -95,10 +95,10 @@ func isXattrOverlay(name string) bool { // - lower must not have dynamic file/directory content. 
func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) { if !IsDir(upper.StableAttr) { - return nil, fmt.Errorf("upper Inode is not a directory") + return nil, fmt.Errorf("upper Inode is a %v, not a directory", upper.StableAttr.Type) } if !IsDir(lower.StableAttr) { - return nil, fmt.Errorf("lower Inode is not a directory") + return nil, fmt.Errorf("lower Inode is a %v, not a directory", lower.StableAttr.Type) } if upper.overlay != nil { return nil, fmt.Errorf("cannot nest overlay in upper file of another overlay") -- cgit v1.2.3 From 23438b36327524ba3e71b6416d71863fb4dfa166 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 5 Dec 2018 14:26:24 -0800 Subject: Update MM.usageAS when mremap copies or moves a mapping. PiperOrigin-RevId: 224221509 Change-Id: I7aaea74629227d682786d3e435737364921249bf --- pkg/sentry/mm/syscalls.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 9519c7390..1a46c2105 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -463,6 +463,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.id.IncRef() } mm.vmas.Add(newAR, vma) + mm.usageAS += uint64(newAR.Length()) return newAR.Start, nil } @@ -479,14 +480,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // oldAR, so calling RemoveMapping could cause us to miss an invalidation // overlapping oldAR. // - // Call vseg.Value() (rather than vseg.ValuePtr()) first to make a copy of - // the vma. + // Call vseg.Value() (rather than vseg.ValuePtr()) to make a copy of the + // vma. vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - - // Insert the new vma, transferring the reference on vma.id. mm.vmas.Add(newAR, vma) + mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) // Move pmas. 
This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required -- cgit v1.2.3 From 9f64e64a6ee1fe44a05ed57893785fa9064125e1 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 5 Dec 2018 14:31:07 -0800 Subject: Enforce directory accessibility before delete Walk By Walking before checking that the directory is writable and executable, MayDelete may return the Walk error (e.g., ENOENT) which would normally be masked by a permission error (EACCES). PiperOrigin-RevId: 224222453 Change-Id: I108a7f730e6bdaa7f277eaddb776267c00805475 --- pkg/sentry/fs/dirent.go | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4c0482036..c4918a11b 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1461,6 +1461,10 @@ func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error { // // Compare Linux kernel fs/namei.c:may_delete. func MayDelete(ctx context.Context, root, dir *Dirent, name string) error { + if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { + return err + } + victim, err := dir.Walk(ctx, root, name) if err != nil { return err @@ -1470,11 +1474,11 @@ func MayDelete(ctx context.Context, root, dir *Dirent, name string) error { return mayDelete(ctx, dir, victim) } +// mayDelete determines whether `victim`, a child of `dir`, can be deleted or +// renamed by `ctx`. +// +// Preconditions: `dir` is writable and executable by `ctx`. 
func mayDelete(ctx context.Context, dir, victim *Dirent) error { - if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { - return err - } - if err := checkSticky(ctx, dir, victim); err != nil { return err } @@ -1512,6 +1516,15 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string return syscall.ENOENT } + // Do we have general permission to remove from oldParent and + // create/replace in newParent? + if err := oldParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { + return err + } + if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { + return err + } + // renamed is the dirent that will be renamed to something else. renamed, err := oldParent.walk(ctx, root, oldName, false /* may unlock */) if err != nil { @@ -1549,10 +1562,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string return err } - // Make sure we can create a new child in the new parent. - if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { - return err - } + // newName doesn't exist; simply create it below. } else { // Check constraints on the dirent being replaced. @@ -1560,7 +1570,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // across the Rename, so must call DecRef manually (no defer). // Check that we can delete replaced. - if err := mayDelete(ctx, oldParent, renamed); err != nil { + if err := mayDelete(ctx, newParent, replaced); err != nil { replaced.DecRef() return err } -- cgit v1.2.3 From 4d8c7ae869a4e9bf60c7ea9aff79a0bee551fbc9 Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Thu, 6 Dec 2018 09:25:57 -0800 Subject: Fixing O_TRUNC behavior to match Linux. 
PiperOrigin-RevId: 224351139 Change-Id: I9453bd75e5a8d38db406bb47fdc01038ac60922e --- pkg/sentry/syscalls/linux/flags.go | 3 +++ pkg/sentry/syscalls/linux/sys_file.go | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index d1e0833fc..d2aec963a 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -22,6 +22,9 @@ import ( // flagsToPermissions returns a Permissions object from Linux flags. // This includes truncate permission if O_TRUNC is set in the mask. func flagsToPermissions(mask uint) (p fs.PermMask) { + if mask&linux.O_TRUNC != 0 { + p.Write = true + } switch mask & linux.O_ACCMODE { case linux.O_WRONLY: p.Write = true diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 8673bca0d..7ad0c9517 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -170,7 +170,7 @@ func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd u if dirPath { return syserror.ENOTDIR } - if fileFlags.Write && flags&linux.O_TRUNC != 0 { + if flags&linux.O_TRUNC != 0 { if err := d.Inode.Truncate(t, d, 0); err != nil { return err } -- cgit v1.2.3 From 685eaf119ffa6c44c4dcaec0e083bbdc0271231a Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 6 Dec 2018 11:14:57 -0800 Subject: Add counters for memory events. Also ensure an event is emitted at startup. 
PiperOrigin-RevId: 224372065 Change-Id: I5f642b6d6b13c6468ee8f794effe285fcbbf29cf --- pkg/sentry/kernel/memevent/BUILD | 1 + pkg/sentry/kernel/memevent/memory_events.go | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index e903badd3..dfd8dd062 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -12,6 +12,7 @@ go_library( ":memory_events_go_proto", "//pkg/eventchannel", "//pkg/log", + "//pkg/metric", "//pkg/sentry/kernel", "//pkg/sentry/usage", ], diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index f05ef1b64..1a8e86827 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -22,11 +22,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/eventchannel" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" pb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" ) +var totalTicks = metric.MustCreateNewUint64Metric("/memory_events/ticks", false /*sync*/, "Total number of memory event periods that have elapsed since startup.") +var totalEvents = metric.MustCreateNewUint64Metric("/memory_events/events", false /*sync*/, "Total number of memory events emitted.") + // MemoryEvents describes the configuration for the global memory event emitter. type MemoryEvents struct { k *kernel.Kernel @@ -71,6 +75,10 @@ func (m *MemoryEvents) Start() { func (m *MemoryEvents) run() { m.done.Add(1) + // Emit the first event immediately on startup. 
+ totalTicks.Increment() + m.emit() + ticker := time.NewTicker(m.period) defer ticker.Stop() @@ -80,6 +88,7 @@ func (m *MemoryEvents) run() { m.done.Done() return case <-ticker.C: + totalTicks.Increment() m.emit() } } @@ -94,6 +103,7 @@ func (m *MemoryEvents) emit() { snapshot, _ := usage.MemoryAccounting.Copy() total := totalPlatform + snapshot.Mapped + totalEvents.Increment() eventchannel.Emit(&pb.MemoryUsageEvent{ Mapped: snapshot.Mapped, Total: total, -- cgit v1.2.3 From 000fa84a3bb1aebeda235c56545c942d7c29003d Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 6 Dec 2018 11:40:39 -0800 Subject: Fix tcpip.Endpoint.Write contract regarding short writes * Clarify tcpip.Endpoint.Write contract regarding short writes. * Enforce tcpip.Endpoint.Write contract regarding short writes. * Update relevant users of tcpip.Endpoint.Write. PiperOrigin-RevId: 224377586 Change-Id: I24299ecce902eb11317ee13dae3b8d8a7c5b097d --- pkg/sentry/socket/epsocket/epsocket.go | 34 ++++++++++++++++++++++++++-------- pkg/sentry/socket/socket.go | 2 ++ pkg/tcpip/tcpip.go | 5 ++++- pkg/tcpip/transport/ping/endpoint.go | 6 +++++- pkg/tcpip/transport/tcp/endpoint.go | 6 +----- 5 files changed, 38 insertions(+), 15 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index c5ce289b5..8c5db6af8 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -323,20 +323,27 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO f := &ioSequencePayload{ctx: ctx, src: src} n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) if err == tcpip.ErrWouldBlock { - return int64(n), syserror.ErrWouldBlock + return 0, syserror.ErrWouldBlock } if resCh != nil { t := ctx.(*kernel.Task) if err := t.Block(resCh); err != nil { - return int64(n), syserr.FromError(err).ToError() + return 0, syserr.FromError(err).ToError() } n, _, err = s.Endpoint.Write(f, 
tcpip.WriteOptions{}) - return int64(n), syserr.TranslateNetstackError(err).ToError() } - return int64(n), syserr.TranslateNetstackError(err).ToError() + if err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + + if int64(n) < src.NumBytes() { + return int64(n), syserror.ErrWouldBlock + } + + return int64(n), nil } // Readiness returns a mask of ready events for socket s. @@ -1343,11 +1350,16 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] n, resCh, err := s.Endpoint.Write(tcpip.SlicePayload(v), opts) if resCh != nil { if err := t.Block(resCh); err != nil { - return int(n), syserr.FromError(err) + return 0, syserr.FromError(err) } n, _, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts) } - if err != tcpip.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + dontWait := flags&linux.MSG_DONTWAIT != 0 + if err == nil && (n >= uintptr(len(v)) || dontWait) { + // Complete write. + return int(n), nil + } + if err != nil && (err != tcpip.ErrWouldBlock || dontWait) { return int(n), syserr.TranslateNetstackError(err) } @@ -1363,11 +1375,17 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] n, _, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts) v.TrimFront(int(n)) total += n - if err != tcpip.ErrWouldBlock { - return int(total), syserr.TranslateNetstackError(err) + + if err != nil && err != tcpip.ErrWouldBlock && total == 0 { + return 0, syserr.TranslateNetstackError(err) + } + + if err == nil && len(v) == 0 || err != nil && err != tcpip.ErrWouldBlock { + return int(total), nil } if err := t.Block(ch); err != nil { + // handleIOError will consume errors from t.Block if needed. return int(total), syserr.FromError(err) } } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index b1dcbf7b0..f31729819 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -90,6 +90,8 @@ type Socket interface { // SendMsg implements the sendmsg(2) linux syscall. 
SendMsg does not take // ownership of the ControlMessage on error. + // + // If n > 0, err will either be nil or an error from t.Block. SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages ControlMessages) (n int, err *syserr.Error) // SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 8e2fe70ee..dc6339173 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -312,7 +312,10 @@ type Endpoint interface { // the caller should not use data[:n] after Write returns. // // Note that unlike io.Writer.Write, it is not an error for Write to - // perform a partial write. + // perform a partial write (if n > 0, no error may be returned). Only + // stream (TCP) Endpoints may return partial writes, and even then only + // in the case where writing additional data would block. Other Endpoints + // will either write the entire message or return an error. // // For UDP and Ping sockets if address resolution is required, // ErrNoLinkAddress and a notification channel is returned for the caller to diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index b3f54cfe0..10d4d138e 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -299,7 +299,11 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c err = sendPing6(route, e.id.LocalPort, v) } - return uintptr(len(v)), nil, err + if err != nil { + return 0, nil, err + } + + return uintptr(len(v)), nil, nil } // Peek only returns data from a single datagram, so do nothing here. 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 1649dbc97..6034ba90b 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -554,10 +554,6 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c return 0, nil, perr } - var err *tcpip.Error - if p.Size() > avail { - err = tcpip.ErrWouldBlock - } l := len(v) s := newSegmentFromView(&e.route, e.id, v) @@ -576,7 +572,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c // Let the protocol goroutine do the work. e.sndWaker.Assert() } - return uintptr(l), nil, err + return uintptr(l), nil, nil } // Peek reads data without consuming it from the endpoint. -- cgit v1.2.3 From 666db00c262c7d6d6359fbaba28e344d015a7823 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 6 Dec 2018 11:42:23 -0800 Subject: Convert ValueSet to a map Unlike FlagSet, order doesn't matter here, so it can simply be a map. PiperOrigin-RevId: 224377910 Change-Id: I15810c698a7f02d8614bf09b59583ab73cba0514 --- pkg/abi/flag.go | 25 +-- pkg/abi/linux/file.go | 35 +--- pkg/sentry/strace/futex.go | 65 ++---- pkg/sentry/strace/open.go | 15 +- pkg/sentry/strace/ptrace.go | 190 ++++-------------- pkg/sentry/strace/socket.go | 470 +++++++++----------------------------------- pkg/sentry/strace/strace.go | 15 +- 7 files changed, 168 insertions(+), 647 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/flag.go b/pkg/abi/flag.go index 0698e410f..049c1b0dd 100644 --- a/pkg/abi/flag.go +++ b/pkg/abi/flag.go @@ -46,30 +46,25 @@ func (s FlagSet) Parse(val uint64) string { return strings.Join(flags, "|") } -// ValueSet is a slice of syscall values and their name. Parse will replace -// values that exactly match an entry with its name. -type ValueSet []struct { - Value uint64 - Name string -} +// ValueSet is a map of syscall values to their name. Parse will use the name +// or the value if unknown. 
+type ValueSet map[uint64]string // Parse returns the name of the value associated with `val`. Unknown values // are converted to hex. -func (e ValueSet) Parse(val uint64) string { - for _, f := range e { - if val == f.Value { - return f.Name - } +func (s ValueSet) Parse(val uint64) string { + if v, ok := s[val]; ok { + return v } return fmt.Sprintf("%#x", val) } // ParseName returns the flag value associated with 'name'. Returns false // if no value is found. -func (e ValueSet) ParseName(name string) (uint64, bool) { - for _, f := range e { - if name == f.Name { - return f.Value, true +func (s ValueSet) ParseName(name string) (uint64, bool) { + for k, v := range s { + if v == name { + return k, true } } return math.MaxUint64, false diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 8d48e1753..ac49ae9a6 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -223,32 +223,11 @@ var modeExtraBits = abi.FlagSet{ } var fileType = abi.ValueSet{ - { - Value: ModeSocket, - Name: "S_IFSOCK", - }, - { - Value: ModeSymlink, - Name: "S_IFLINK", - }, - { - Value: ModeRegular, - Name: "S_IFREG", - }, - { - Value: ModeBlockDevice, - Name: "S_IFBLK", - }, - { - Value: ModeDirectory, - Name: "S_IFDIR", - }, - { - Value: ModeCharacterDevice, - Name: "S_IFCHR", - }, - { - Value: ModeNamedPipe, - Name: "S_IFIFO", - }, + ModeSocket: "S_IFSOCK", + ModeSymlink: "S_IFLINK", + ModeRegular: "S_IFREG", + ModeBlockDevice: "S_IFBLK", + ModeDirectory: "S_IFDIR", + ModeCharacterDevice: "S_IFCHR", + ModeNamedPipe: "S_IFIFO", } diff --git a/pkg/sentry/strace/futex.go b/pkg/sentry/strace/futex.go index ceb3dc21d..f4aa7fcad 100644 --- a/pkg/sentry/strace/futex.go +++ b/pkg/sentry/strace/futex.go @@ -21,58 +21,19 @@ import ( // FutexCmd are the possible futex(2) commands. 
var FutexCmd = abi.ValueSet{ - { - Value: linux.FUTEX_WAIT, - Name: "FUTEX_WAIT", - }, - { - Value: linux.FUTEX_WAKE, - Name: "FUTEX_WAKE", - }, - { - Value: linux.FUTEX_FD, - Name: "FUTEX_FD", - }, - { - Value: linux.FUTEX_REQUEUE, - Name: "FUTEX_REQUEUE", - }, - { - Value: linux.FUTEX_CMP_REQUEUE, - Name: "FUTEX_CMP_REQUEUE", - }, - { - Value: linux.FUTEX_WAKE_OP, - Name: "FUTEX_WAKE_OP", - }, - { - Value: linux.FUTEX_LOCK_PI, - Name: "FUTEX_LOCK_PI", - }, - { - Value: linux.FUTEX_UNLOCK_PI, - Name: "FUTEX_UNLOCK_PI", - }, - { - Value: linux.FUTEX_TRYLOCK_PI, - Name: "FUTEX_TRYLOCK_PI", - }, - { - Value: linux.FUTEX_WAIT_BITSET, - Name: "FUTEX_WAIT_BITSET", - }, - { - Value: linux.FUTEX_WAKE_BITSET, - Name: "FUTEX_WAKE_BITSET", - }, - { - Value: linux.FUTEX_WAIT_REQUEUE_PI, - Name: "FUTEX_WAIT_REQUEUE_PI", - }, - { - Value: linux.FUTEX_CMP_REQUEUE_PI, - Name: "FUTEX_CMP_REQUEUE_PI", - }, + linux.FUTEX_WAIT: "FUTEX_WAIT", + linux.FUTEX_WAKE: "FUTEX_WAKE", + linux.FUTEX_FD: "FUTEX_FD", + linux.FUTEX_REQUEUE: "FUTEX_REQUEUE", + linux.FUTEX_CMP_REQUEUE: "FUTEX_CMP_REQUEUE", + linux.FUTEX_WAKE_OP: "FUTEX_WAKE_OP", + linux.FUTEX_LOCK_PI: "FUTEX_LOCK_PI", + linux.FUTEX_UNLOCK_PI: "FUTEX_UNLOCK_PI", + linux.FUTEX_TRYLOCK_PI: "FUTEX_TRYLOCK_PI", + linux.FUTEX_WAIT_BITSET: "FUTEX_WAIT_BITSET", + linux.FUTEX_WAKE_BITSET: "FUTEX_WAKE_BITSET", + linux.FUTEX_WAIT_REQUEUE_PI: "FUTEX_WAIT_REQUEUE_PI", + linux.FUTEX_CMP_REQUEUE_PI: "FUTEX_CMP_REQUEUE_PI", } func futex(op uint64) string { diff --git a/pkg/sentry/strace/open.go b/pkg/sentry/strace/open.go index 5a72a940c..3bf348d7a 100644 --- a/pkg/sentry/strace/open.go +++ b/pkg/sentry/strace/open.go @@ -22,18 +22,9 @@ import ( // OpenMode represents the mode to open(2) a file. 
var OpenMode = abi.ValueSet{ - { - Value: syscall.O_RDWR, - Name: "O_RDWR", - }, - { - Value: syscall.O_WRONLY, - Name: "O_WRONLY", - }, - { - Value: syscall.O_RDONLY, - Name: "O_RDONLY", - }, + syscall.O_RDWR: "O_RDWR", + syscall.O_WRONLY: "O_WRONLY", + syscall.O_RDONLY: "O_RDONLY", } // OpenFlagSet is the set of open(2) flags. diff --git a/pkg/sentry/strace/ptrace.go b/pkg/sentry/strace/ptrace.go index c572aafb4..8c4b79227 100644 --- a/pkg/sentry/strace/ptrace.go +++ b/pkg/sentry/strace/ptrace.go @@ -21,156 +21,42 @@ import ( // PtraceRequestSet are the possible ptrace(2) requests. var PtraceRequestSet = abi.ValueSet{ - { - Value: linux.PTRACE_TRACEME, - Name: "PTRACE_TRACEME", - }, - { - Value: linux.PTRACE_PEEKTEXT, - Name: "PTRACE_PEEKTEXT", - }, - { - Value: linux.PTRACE_PEEKDATA, - Name: "PTRACE_PEEKDATA", - }, - { - Value: linux.PTRACE_PEEKUSR, - Name: "PTRACE_PEEKUSR", - }, - { - Value: linux.PTRACE_POKETEXT, - Name: "PTRACE_POKETEXT", - }, - { - Value: linux.PTRACE_POKEDATA, - Name: "PTRACE_POKEDATA", - }, - { - Value: linux.PTRACE_POKEUSR, - Name: "PTRACE_POKEUSR", - }, - { - Value: linux.PTRACE_CONT, - Name: "PTRACE_CONT", - }, - { - Value: linux.PTRACE_KILL, - Name: "PTRACE_KILL", - }, - { - Value: linux.PTRACE_SINGLESTEP, - Name: "PTRACE_SINGLESTEP", - }, - { - Value: linux.PTRACE_ATTACH, - Name: "PTRACE_ATTACH", - }, - { - Value: linux.PTRACE_DETACH, - Name: "PTRACE_DETACH", - }, - { - Value: linux.PTRACE_SYSCALL, - Name: "PTRACE_SYSCALL", - }, - { - Value: linux.PTRACE_SETOPTIONS, - Name: "PTRACE_SETOPTIONS", - }, - { - Value: linux.PTRACE_GETEVENTMSG, - Name: "PTRACE_GETEVENTMSG", - }, - { - Value: linux.PTRACE_GETSIGINFO, - Name: "PTRACE_GETSIGINFO", - }, - { - Value: linux.PTRACE_SETSIGINFO, - Name: "PTRACE_SETSIGINFO", - }, - { - Value: linux.PTRACE_GETREGSET, - Name: "PTRACE_GETREGSET", - }, - { - Value: linux.PTRACE_SETREGSET, - Name: "PTRACE_SETREGSET", - }, - { - Value: linux.PTRACE_SEIZE, - Name: "PTRACE_SEIZE", - }, - { - Value: 
linux.PTRACE_INTERRUPT, - Name: "PTRACE_INTERRUPT", - }, - { - Value: linux.PTRACE_LISTEN, - Name: "PTRACE_LISTEN", - }, - { - Value: linux.PTRACE_PEEKSIGINFO, - Name: "PTRACE_PEEKSIGINFO", - }, - { - Value: linux.PTRACE_GETSIGMASK, - Name: "PTRACE_GETSIGMASK", - }, - { - Value: linux.PTRACE_SETSIGMASK, - Name: "PTRACE_SETSIGMASK", - }, - { - Value: linux.PTRACE_GETREGS, - Name: "PTRACE_GETREGS", - }, - { - Value: linux.PTRACE_SETREGS, - Name: "PTRACE_SETREGS", - }, - { - Value: linux.PTRACE_GETFPREGS, - Name: "PTRACE_GETFPREGS", - }, - { - Value: linux.PTRACE_SETFPREGS, - Name: "PTRACE_SETFPREGS", - }, - { - Value: linux.PTRACE_GETFPXREGS, - Name: "PTRACE_GETFPXREGS", - }, - { - Value: linux.PTRACE_SETFPXREGS, - Name: "PTRACE_SETFPXREGS", - }, - { - Value: linux.PTRACE_OLDSETOPTIONS, - Name: "PTRACE_OLDSETOPTIONS", - }, - { - Value: linux.PTRACE_GET_THREAD_AREA, - Name: "PTRACE_GET_THREAD_AREA", - }, - { - Value: linux.PTRACE_SET_THREAD_AREA, - Name: "PTRACE_SET_THREAD_AREA", - }, - { - Value: linux.PTRACE_ARCH_PRCTL, - Name: "PTRACE_ARCH_PRCTL", - }, - { - Value: linux.PTRACE_SYSEMU, - Name: "PTRACE_SYSEMU", - }, - { - Value: linux.PTRACE_SYSEMU_SINGLESTEP, - Name: "PTRACE_SYSEMU_SINGLESTEP", - }, - { - Value: linux.PTRACE_SINGLEBLOCK, - Name: "PTRACE_SINGLEBLOCK", - }, + linux.PTRACE_TRACEME: "PTRACE_TRACEME", + linux.PTRACE_PEEKTEXT: "PTRACE_PEEKTEXT", + linux.PTRACE_PEEKDATA: "PTRACE_PEEKDATA", + linux.PTRACE_PEEKUSR: "PTRACE_PEEKUSR", + linux.PTRACE_POKETEXT: "PTRACE_POKETEXT", + linux.PTRACE_POKEDATA: "PTRACE_POKEDATA", + linux.PTRACE_POKEUSR: "PTRACE_POKEUSR", + linux.PTRACE_CONT: "PTRACE_CONT", + linux.PTRACE_KILL: "PTRACE_KILL", + linux.PTRACE_SINGLESTEP: "PTRACE_SINGLESTEP", + linux.PTRACE_ATTACH: "PTRACE_ATTACH", + linux.PTRACE_DETACH: "PTRACE_DETACH", + linux.PTRACE_SYSCALL: "PTRACE_SYSCALL", + linux.PTRACE_SETOPTIONS: "PTRACE_SETOPTIONS", + linux.PTRACE_GETEVENTMSG: "PTRACE_GETEVENTMSG", + linux.PTRACE_GETSIGINFO: "PTRACE_GETSIGINFO", + 
linux.PTRACE_SETSIGINFO: "PTRACE_SETSIGINFO", + linux.PTRACE_GETREGSET: "PTRACE_GETREGSET", + linux.PTRACE_SETREGSET: "PTRACE_SETREGSET", + linux.PTRACE_SEIZE: "PTRACE_SEIZE", + linux.PTRACE_INTERRUPT: "PTRACE_INTERRUPT", + linux.PTRACE_LISTEN: "PTRACE_LISTEN", + linux.PTRACE_PEEKSIGINFO: "PTRACE_PEEKSIGINFO", + linux.PTRACE_GETSIGMASK: "PTRACE_GETSIGMASK", + linux.PTRACE_SETSIGMASK: "PTRACE_SETSIGMASK", + linux.PTRACE_GETREGS: "PTRACE_GETREGS", + linux.PTRACE_SETREGS: "PTRACE_SETREGS", + linux.PTRACE_GETFPREGS: "PTRACE_GETFPREGS", + linux.PTRACE_SETFPREGS: "PTRACE_SETFPREGS", + linux.PTRACE_GETFPXREGS: "PTRACE_GETFPXREGS", + linux.PTRACE_SETFPXREGS: "PTRACE_SETFPXREGS", + linux.PTRACE_OLDSETOPTIONS: "PTRACE_OLDSETOPTIONS", + linux.PTRACE_GET_THREAD_AREA: "PTRACE_GET_THREAD_AREA", + linux.PTRACE_SET_THREAD_AREA: "PTRACE_SET_THREAD_AREA", + linux.PTRACE_ARCH_PRCTL: "PTRACE_ARCH_PRCTL", + linux.PTRACE_SYSEMU: "PTRACE_SYSEMU", + linux.PTRACE_SYSEMU_SINGLESTEP: "PTRACE_SYSEMU_SINGLESTEP", + linux.PTRACE_SINGLEBLOCK: "PTRACE_SINGLEBLOCK", } diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 375418dc1..4c1a9d469 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -31,202 +31,58 @@ import ( // SocketFamily are the possible socket(2) families. 
var SocketFamily = abi.ValueSet{ - { - Value: linux.AF_UNSPEC, - Name: "AF_UNSPEC", - }, - { - Value: linux.AF_UNIX, - Name: "AF_UNIX", - }, - { - Value: linux.AF_INET, - Name: "AF_INET", - }, - { - Value: linux.AF_AX25, - Name: "AF_AX25", - }, - { - Value: linux.AF_IPX, - Name: "AF_IPX", - }, - { - Value: linux.AF_APPLETALK, - Name: "AF_APPLETALK", - }, - { - Value: linux.AF_NETROM, - Name: "AF_NETROM", - }, - { - Value: linux.AF_BRIDGE, - Name: "AF_BRIDGE", - }, - { - Value: linux.AF_ATMPVC, - Name: "AF_ATMPVC", - }, - { - Value: linux.AF_X25, - Name: "AF_X25", - }, - { - Value: linux.AF_INET6, - Name: "AF_INET6", - }, - { - Value: linux.AF_ROSE, - Name: "AF_ROSE", - }, - { - Value: linux.AF_DECnet, - Name: "AF_DECnet", - }, - { - Value: linux.AF_NETBEUI, - Name: "AF_NETBEUI", - }, - { - Value: linux.AF_SECURITY, - Name: "AF_SECURITY", - }, - { - Value: linux.AF_KEY, - Name: "AF_KEY", - }, - { - Value: linux.AF_NETLINK, - Name: "AF_NETLINK", - }, - { - Value: linux.AF_PACKET, - Name: "AF_PACKET", - }, - { - Value: linux.AF_ASH, - Name: "AF_ASH", - }, - { - Value: linux.AF_ECONET, - Name: "AF_ECONET", - }, - { - Value: linux.AF_ATMSVC, - Name: "AF_ATMSVC", - }, - { - Value: linux.AF_RDS, - Name: "AF_RDS", - }, - { - Value: linux.AF_SNA, - Name: "AF_SNA", - }, - { - Value: linux.AF_IRDA, - Name: "AF_IRDA", - }, - { - Value: linux.AF_PPPOX, - Name: "AF_PPPOX", - }, - { - Value: linux.AF_WANPIPE, - Name: "AF_WANPIPE", - }, - { - Value: linux.AF_LLC, - Name: "AF_LLC", - }, - { - Value: linux.AF_IB, - Name: "AF_IB", - }, - { - Value: linux.AF_MPLS, - Name: "AF_MPLS", - }, - { - Value: linux.AF_CAN, - Name: "AF_CAN", - }, - { - Value: linux.AF_TIPC, - Name: "AF_TIPC", - }, - { - Value: linux.AF_BLUETOOTH, - Name: "AF_BLUETOOTH", - }, - { - Value: linux.AF_IUCV, - Name: "AF_IUCV", - }, - { - Value: linux.AF_RXRPC, - Name: "AF_RXRPC", - }, - { - Value: linux.AF_ISDN, - Name: "AF_ISDN", - }, - { - Value: linux.AF_PHONET, - Name: "AF_PHONET", - }, - { - Value: 
linux.AF_IEEE802154, - Name: "AF_IEEE802154", - }, - { - Value: linux.AF_CAIF, - Name: "AF_CAIF", - }, - { - Value: linux.AF_ALG, - Name: "AF_ALG", - }, - { - Value: linux.AF_NFC, - Name: "AF_NFC", - }, - { - Value: linux.AF_VSOCK, - Name: "AF_VSOCK", - }, + linux.AF_UNSPEC: "AF_UNSPEC", + linux.AF_UNIX: "AF_UNIX", + linux.AF_INET: "AF_INET", + linux.AF_AX25: "AF_AX25", + linux.AF_IPX: "AF_IPX", + linux.AF_APPLETALK: "AF_APPLETALK", + linux.AF_NETROM: "AF_NETROM", + linux.AF_BRIDGE: "AF_BRIDGE", + linux.AF_ATMPVC: "AF_ATMPVC", + linux.AF_X25: "AF_X25", + linux.AF_INET6: "AF_INET6", + linux.AF_ROSE: "AF_ROSE", + linux.AF_DECnet: "AF_DECnet", + linux.AF_NETBEUI: "AF_NETBEUI", + linux.AF_SECURITY: "AF_SECURITY", + linux.AF_KEY: "AF_KEY", + linux.AF_NETLINK: "AF_NETLINK", + linux.AF_PACKET: "AF_PACKET", + linux.AF_ASH: "AF_ASH", + linux.AF_ECONET: "AF_ECONET", + linux.AF_ATMSVC: "AF_ATMSVC", + linux.AF_RDS: "AF_RDS", + linux.AF_SNA: "AF_SNA", + linux.AF_IRDA: "AF_IRDA", + linux.AF_PPPOX: "AF_PPPOX", + linux.AF_WANPIPE: "AF_WANPIPE", + linux.AF_LLC: "AF_LLC", + linux.AF_IB: "AF_IB", + linux.AF_MPLS: "AF_MPLS", + linux.AF_CAN: "AF_CAN", + linux.AF_TIPC: "AF_TIPC", + linux.AF_BLUETOOTH: "AF_BLUETOOTH", + linux.AF_IUCV: "AF_IUCV", + linux.AF_RXRPC: "AF_RXRPC", + linux.AF_ISDN: "AF_ISDN", + linux.AF_PHONET: "AF_PHONET", + linux.AF_IEEE802154: "AF_IEEE802154", + linux.AF_CAIF: "AF_CAIF", + linux.AF_ALG: "AF_ALG", + linux.AF_NFC: "AF_NFC", + linux.AF_VSOCK: "AF_VSOCK", } // SocketType are the possible socket(2) types. 
var SocketType = abi.ValueSet{ - { - Value: linux.SOCK_STREAM, - Name: "SOCK_STREAM", - }, - { - Value: linux.SOCK_DGRAM, - Name: "SOCK_DGRAM", - }, - { - Value: linux.SOCK_RAW, - Name: "SOCK_RAW", - }, - { - Value: linux.SOCK_RDM, - Name: "SOCK_RDM", - }, - { - Value: linux.SOCK_SEQPACKET, - Name: "SOCK_SEQPACKET", - }, - { - Value: linux.SOCK_DCCP, - Name: "SOCK_DCCP", - }, - { - Value: linux.SOCK_PACKET, - Name: "SOCK_PACKET", - }, + linux.SOCK_STREAM: "SOCK_STREAM", + linux.SOCK_DGRAM: "SOCK_DGRAM", + linux.SOCK_RAW: "SOCK_RAW", + linux.SOCK_RDM: "SOCK_RDM", + linux.SOCK_SEQPACKET: "SOCK_SEQPACKET", + linux.SOCK_DCCP: "SOCK_DCCP", + linux.SOCK_PACKET: "SOCK_PACKET", } // SocketFlagSet are the possible socket(2) flags. @@ -243,106 +99,31 @@ var SocketFlagSet = abi.FlagSet{ // ipProtocol are the possible socket(2) types for INET and INET6 sockets. var ipProtocol = abi.ValueSet{ - { - Value: linux.IPPROTO_IP, - Name: "IPPROTO_IP", - }, - { - Value: linux.IPPROTO_ICMP, - Name: "IPPROTO_ICMP", - }, - { - Value: linux.IPPROTO_IGMP, - Name: "IPPROTO_IGMP", - }, - { - Value: linux.IPPROTO_IPIP, - Name: "IPPROTO_IPIP", - }, - { - Value: linux.IPPROTO_TCP, - Name: "IPPROTO_TCP", - }, - { - Value: linux.IPPROTO_EGP, - Name: "IPPROTO_EGP", - }, - { - Value: linux.IPPROTO_PUP, - Name: "IPPROTO_PUP", - }, - { - Value: linux.IPPROTO_UDP, - Name: "IPPROTO_UDP", - }, - { - Value: linux.IPPROTO_IDP, - Name: "IPPROTO_IDP", - }, - { - Value: linux.IPPROTO_TP, - Name: "IPPROTO_TP", - }, - { - Value: linux.IPPROTO_DCCP, - Name: "IPPROTO_DCCP", - }, - { - Value: linux.IPPROTO_IPV6, - Name: "IPPROTO_IPV6", - }, - { - Value: linux.IPPROTO_RSVP, - Name: "IPPROTO_RSVP", - }, - { - Value: linux.IPPROTO_GRE, - Name: "IPPROTO_GRE", - }, - { - Value: linux.IPPROTO_ESP, - Name: "IPPROTO_ESP", - }, - { - Value: linux.IPPROTO_AH, - Name: "IPPROTO_AH", - }, - { - Value: linux.IPPROTO_MTP, - Name: "IPPROTO_MTP", - }, - { - Value: linux.IPPROTO_BEETPH, - Name: "IPPROTO_BEETPH", - }, - { - Value: 
linux.IPPROTO_ENCAP, - Name: "IPPROTO_ENCAP", - }, - { - Value: linux.IPPROTO_PIM, - Name: "IPPROTO_PIM", - }, - { - Value: linux.IPPROTO_COMP, - Name: "IPPROTO_COMP", - }, - { - Value: linux.IPPROTO_SCTP, - Name: "IPPROTO_SCTP", - }, - { - Value: linux.IPPROTO_UDPLITE, - Name: "IPPROTO_UDPLITE", - }, - { - Value: linux.IPPROTO_MPLS, - Name: "IPPROTO_MPLS", - }, - { - Value: linux.IPPROTO_RAW, - Name: "IPPROTO_RAW", - }, + linux.IPPROTO_IP: "IPPROTO_IP", + linux.IPPROTO_ICMP: "IPPROTO_ICMP", + linux.IPPROTO_IGMP: "IPPROTO_IGMP", + linux.IPPROTO_IPIP: "IPPROTO_IPIP", + linux.IPPROTO_TCP: "IPPROTO_TCP", + linux.IPPROTO_EGP: "IPPROTO_EGP", + linux.IPPROTO_PUP: "IPPROTO_PUP", + linux.IPPROTO_UDP: "IPPROTO_UDP", + linux.IPPROTO_IDP: "IPPROTO_IDP", + linux.IPPROTO_TP: "IPPROTO_TP", + linux.IPPROTO_DCCP: "IPPROTO_DCCP", + linux.IPPROTO_IPV6: "IPPROTO_IPV6", + linux.IPPROTO_RSVP: "IPPROTO_RSVP", + linux.IPPROTO_GRE: "IPPROTO_GRE", + linux.IPPROTO_ESP: "IPPROTO_ESP", + linux.IPPROTO_AH: "IPPROTO_AH", + linux.IPPROTO_MTP: "IPPROTO_MTP", + linux.IPPROTO_BEETPH: "IPPROTO_BEETPH", + linux.IPPROTO_ENCAP: "IPPROTO_ENCAP", + linux.IPPROTO_PIM: "IPPROTO_PIM", + linux.IPPROTO_COMP: "IPPROTO_COMP", + linux.IPPROTO_SCTP: "IPPROTO_SCTP", + linux.IPPROTO_UDPLITE: "IPPROTO_UDPLITE", + linux.IPPROTO_MPLS: "IPPROTO_MPLS", + linux.IPPROTO_RAW: "IPPROTO_RAW", } // SocketProtocol are the possible socket(2) protocols for each protocol family. 
@@ -350,90 +131,27 @@ var SocketProtocol = map[int32]abi.ValueSet{ linux.AF_INET: ipProtocol, linux.AF_INET6: ipProtocol, linux.AF_NETLINK: { - { - Value: linux.NETLINK_ROUTE, - Name: "NETLINK_ROUTE", - }, - { - Value: linux.NETLINK_UNUSED, - Name: "NETLINK_UNUSED", - }, - { - Value: linux.NETLINK_USERSOCK, - Name: "NETLINK_USERSOCK", - }, - { - Value: linux.NETLINK_FIREWALL, - Name: "NETLINK_FIREWALL", - }, - { - Value: linux.NETLINK_SOCK_DIAG, - Name: "NETLINK_SOCK_DIAG", - }, - { - Value: linux.NETLINK_NFLOG, - Name: "NETLINK_NFLOG", - }, - { - Value: linux.NETLINK_XFRM, - Name: "NETLINK_XFRM", - }, - { - Value: linux.NETLINK_SELINUX, - Name: "NETLINK_SELINUX", - }, - { - Value: linux.NETLINK_ISCSI, - Name: "NETLINK_ISCSI", - }, - { - Value: linux.NETLINK_AUDIT, - Name: "NETLINK_AUDIT", - }, - { - Value: linux.NETLINK_FIB_LOOKUP, - Name: "NETLINK_FIB_LOOKUP", - }, - { - Value: linux.NETLINK_CONNECTOR, - Name: "NETLINK_CONNECTOR", - }, - { - Value: linux.NETLINK_NETFILTER, - Name: "NETLINK_NETFILTER", - }, - { - Value: linux.NETLINK_IP6_FW, - Name: "NETLINK_IP6_FW", - }, - { - Value: linux.NETLINK_DNRTMSG, - Name: "NETLINK_DNRTMSG", - }, - { - Value: linux.NETLINK_KOBJECT_UEVENT, - Name: "NETLINK_KOBJECT_UEVENT", - }, - { - Value: linux.NETLINK_GENERIC, - Name: "NETLINK_GENERIC", - }, - { - Value: linux.NETLINK_SCSITRANSPORT, - Name: "NETLINK_SCSITRANSPORT", - }, - { - Value: linux.NETLINK_ECRYPTFS, - Name: "NETLINK_ECRYPTFS", - }, - { - Value: linux.NETLINK_RDMA, - Name: "NETLINK_RDMA", - }, - { - Value: linux.NETLINK_CRYPTO, - Name: "NETLINK_CRYPTO", - }, + linux.NETLINK_ROUTE: "NETLINK_ROUTE", + linux.NETLINK_UNUSED: "NETLINK_UNUSED", + linux.NETLINK_USERSOCK: "NETLINK_USERSOCK", + linux.NETLINK_FIREWALL: "NETLINK_FIREWALL", + linux.NETLINK_SOCK_DIAG: "NETLINK_SOCK_DIAG", + linux.NETLINK_NFLOG: "NETLINK_NFLOG", + linux.NETLINK_XFRM: "NETLINK_XFRM", + linux.NETLINK_SELINUX: "NETLINK_SELINUX", + linux.NETLINK_ISCSI: "NETLINK_ISCSI", + linux.NETLINK_AUDIT: 
"NETLINK_AUDIT", + linux.NETLINK_FIB_LOOKUP: "NETLINK_FIB_LOOKUP", + linux.NETLINK_CONNECTOR: "NETLINK_CONNECTOR", + linux.NETLINK_NETFILTER: "NETLINK_NETFILTER", + linux.NETLINK_IP6_FW: "NETLINK_IP6_FW", + linux.NETLINK_DNRTMSG: "NETLINK_DNRTMSG", + linux.NETLINK_KOBJECT_UEVENT: "NETLINK_KOBJECT_UEVENT", + linux.NETLINK_GENERIC: "NETLINK_GENERIC", + linux.NETLINK_SCSITRANSPORT: "NETLINK_SCSITRANSPORT", + linux.NETLINK_ECRYPTFS: "NETLINK_ECRYPTFS", + linux.NETLINK_RDMA: "NETLINK_RDMA", + linux.NETLINK_CRYPTO: "NETLINK_CRYPTO", }, } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 4286f0df7..e40e0b57c 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -50,18 +50,9 @@ var EventMaximumSize uint // ItimerTypes are the possible itimer types. var ItimerTypes = abi.ValueSet{ - { - Value: linux.ITIMER_REAL, - Name: "ITIMER_REAL", - }, - { - Value: linux.ITIMER_VIRTUAL, - Name: "ITIMER_VIRTUAL", - }, - { - Value: linux.ITIMER_PROF, - Name: "ITIMER_PROF", - }, + linux.ITIMER_REAL: "ITIMER_REAL", + linux.ITIMER_VIRTUAL: "ITIMER_VIRTUAL", + linux.ITIMER_PROF: "ITIMER_PROF", } func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, maxBytes uint64) string { -- cgit v1.2.3 From 51900fe3a42ae8523c2b123343347a3215b93dc3 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 6 Dec 2018 15:46:20 -0800 Subject: Format signals, signal masks in strace Sample: I1205 16:51:49.869701 2492 x:0] [ 1] ioctl_test E rt_sigaction(SIGIO, 0x7e0e5b5e8500, 0x7e0e5b5e85a0) I1205 16:51:49.869766 2492 x:0] [ 1] ioctl_test X rt_sigaction(SIGIO, 0x7e0e5b5e8500, 0x7e0e5b5e85a0) = 0x0 (44.336?s) I1205 16:51:49.869831 2492 x:0] [ 1] ioctl_test E rt_sigprocmask(SIG_UNBLOCK, 0x7e0e5b5e8878 [SIGIO], 0x7e0e5b5e87c0, 0x8) I1205 16:51:49.869866 2492 x:0] [ 1] ioctl_test X rt_sigprocmask(SIG_UNBLOCK, 0x7e0e5b5e8878 [SIGIO], 0x7e0e5b5e87c0 [SIGIO 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 
64], 0x8) = 0x0 (2.575?s) PiperOrigin-RevId: 224422404 Change-Id: I3ed3f2ec6b1a639baa9cacd37ce7ee325c3703e4 --- pkg/abi/flag.go | 9 +++++ pkg/sentry/strace/BUILD | 1 + pkg/sentry/strace/linux64.go | 20 +++++----- pkg/sentry/strace/signal.go | 86 +++++++++++++++++++++++++++++++++++++++++++ pkg/sentry/strace/strace.go | 8 ++++ pkg/sentry/strace/syscalls.go | 12 ++++++ 6 files changed, 126 insertions(+), 10 deletions(-) create mode 100644 pkg/sentry/strace/signal.go (limited to 'pkg/sentry') diff --git a/pkg/abi/flag.go b/pkg/abi/flag.go index 049c1b0dd..ec87c9cee 100644 --- a/pkg/abi/flag.go +++ b/pkg/abi/flag.go @@ -59,6 +59,15 @@ func (s ValueSet) Parse(val uint64) string { return fmt.Sprintf("%#x", val) } +// ParseDecimal returns the name of the value associated with `val`. Unknown +// values are converted to decimal. +func (s ValueSet) ParseDecimal(val uint64) string { + if v, ok := s[val]; ok { + return v + } + return fmt.Sprintf("%d", val) +} + // ParseName returns the flag value associated with 'name'. Returns false // if no value is found. 
func (s ValueSet) ParseName(name string) (uint64, bool) { diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 52c7f325c..8517db1ac 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -11,6 +11,7 @@ go_library( "linux64.go", "open.go", "ptrace.go", + "signal.go", "socket.go", "strace.go", "syscalls.go", diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 99714f12c..a2ca1a456 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -30,8 +30,8 @@ var linuxAMD64 = SyscallMap{ 10: makeSyscallInfo("mprotect", Hex, Hex, Hex), 11: makeSyscallInfo("munmap", Hex, Hex), 12: makeSyscallInfo("brk", Hex), - 13: makeSyscallInfo("rt_sigaction", Hex, Hex, Hex), - 14: makeSyscallInfo("rt_sigprocmask", Hex, Hex, Hex, Hex), + 13: makeSyscallInfo("rt_sigaction", Signal, Hex, Hex), + 14: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex), 15: makeSyscallInfo("rt_sigreturn"), 16: makeSyscallInfo("ioctl", Hex, Hex, Hex), 17: makeSyscallInfo("pread64", Hex, ReadBuffer, Hex, Hex), @@ -79,7 +79,7 @@ var linuxAMD64 = SyscallMap{ 59: makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector), 60: makeSyscallInfo("exit", Hex), 61: makeSyscallInfo("wait4", Hex, Hex, Hex, Rusage), - 62: makeSyscallInfo("kill", Hex, Hex), + 62: makeSyscallInfo("kill", Hex, Signal), 63: makeSyscallInfo("uname", Uname), 64: makeSyscallInfo("semget", Hex, Hex, Hex), 65: makeSyscallInfo("semop", Hex, Hex, Hex), @@ -145,8 +145,8 @@ var linuxAMD64 = SyscallMap{ 125: makeSyscallInfo("capget", Hex, Hex), 126: makeSyscallInfo("capset", Hex, Hex), 127: makeSyscallInfo("rt_sigpending", Hex), - 128: makeSyscallInfo("rt_sigtimedwait", Hex, Hex, Timespec, Hex), - 129: makeSyscallInfo("rt_sigqueueinfo", Hex, Hex, Hex), + 128: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex), + 129: makeSyscallInfo("rt_sigqueueinfo", Hex, Signal, Hex), 130: makeSyscallInfo("rt_sigsuspend", Hex), 131: 
makeSyscallInfo("sigaltstack", Hex, Hex), 132: makeSyscallInfo("utime", Path, Utimbuf), @@ -217,7 +217,7 @@ var linuxAMD64 = SyscallMap{ 197: makeSyscallInfo("removexattr", Path, Path), 198: makeSyscallInfo("lremovexattr", Path, Path), 199: makeSyscallInfo("fremovexattr", Hex, Path), - 200: makeSyscallInfo("tkill", Hex, Hex), + 200: makeSyscallInfo("tkill", Hex, Signal), 201: makeSyscallInfo("time", Hex), 202: makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex), 203: makeSyscallInfo("sched_setaffinity", Hex, Hex, Hex), @@ -251,7 +251,7 @@ var linuxAMD64 = SyscallMap{ 231: makeSyscallInfo("exit_group", Hex), 232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex), 233: makeSyscallInfo("epoll_ctl", Hex, Hex, Hex, Hex), - 234: makeSyscallInfo("tgkill", Hex, Hex, Hex), + 234: makeSyscallInfo("tgkill", Hex, Hex, Signal), 235: makeSyscallInfo("utimes", Path, Timeval), // 236: vserver (not implemented in the Linux kernel) 237: makeSyscallInfo("mbind", Hex, Hex, Hex, Hex, Hex, Hex), @@ -288,7 +288,7 @@ var linuxAMD64 = SyscallMap{ 268: makeSyscallInfo("fchmodat", Hex, Path, Mode), 269: makeSyscallInfo("faccessat", Hex, Path, Oct, Hex), 270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex), - 271: makeSyscallInfo("ppoll", Hex, Hex, Timespec, Hex, Hex), + 271: makeSyscallInfo("ppoll", Hex, Hex, Timespec, SigSet, Hex), 272: makeSyscallInfo("unshare", Hex), 273: makeSyscallInfo("set_robust_list", Hex, Hex), 274: makeSyscallInfo("get_robust_list", Hex, Hex, Hex), @@ -298,7 +298,7 @@ var linuxAMD64 = SyscallMap{ 278: makeSyscallInfo("vmsplice", Hex, Hex, Hex, Hex), 279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex), 280: makeSyscallInfo("utimensat", Hex, Path, UTimeTimespec, Hex), - 281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, Hex, Hex), + 281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex), 282: makeSyscallInfo("signalfd", Hex, Hex, Hex), 283: makeSyscallInfo("timerfd_create", Hex, Hex), 284: 
makeSyscallInfo("eventfd", Hex), @@ -314,7 +314,7 @@ var linuxAMD64 = SyscallMap{ 294: makeSyscallInfo("inotify_init1", Hex), 295: makeSyscallInfo("preadv", Hex, ReadIOVec, Hex, Hex), 296: makeSyscallInfo("pwritev", Hex, WriteIOVec, Hex, Hex), - 297: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Hex, Hex), + 297: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Signal, Hex), 298: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex), 299: makeSyscallInfo("recvmmsg", Hex, Hex, Hex, Hex, Hex), 300: makeSyscallInfo("fanotify_init", Hex, Hex), diff --git a/pkg/sentry/strace/signal.go b/pkg/sentry/strace/signal.go new file mode 100644 index 000000000..00ed02a3c --- /dev/null +++ b/pkg/sentry/strace/signal.go @@ -0,0 +1,86 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "fmt" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// signalNames contains the names of all named signals. 
+var signalNames = abi.ValueSet{ + uint64(linux.SIGABRT): "SIGABRT", + uint64(linux.SIGALRM): "SIGALRM", + uint64(linux.SIGBUS): "SIGBUS", + uint64(linux.SIGCHLD): "SIGCHLD", + uint64(linux.SIGCONT): "SIGCONT", + uint64(linux.SIGFPE): "SIGFPE", + uint64(linux.SIGHUP): "SIGHUP", + uint64(linux.SIGILL): "SIGILL", + uint64(linux.SIGINT): "SIGINT", + uint64(linux.SIGIO): "SIGIO", + uint64(linux.SIGKILL): "SIGKILL", + uint64(linux.SIGPIPE): "SIGPIPE", + uint64(linux.SIGPROF): "SIGPROF", + uint64(linux.SIGPWR): "SIGPWR", + uint64(linux.SIGQUIT): "SIGQUIT", + uint64(linux.SIGSEGV): "SIGSEGV", + uint64(linux.SIGSTKFLT): "SIGSTKFLT", + uint64(linux.SIGSTOP): "SIGSTOP", + uint64(linux.SIGSYS): "SIGSYS", + uint64(linux.SIGTERM): "SIGTERM", + uint64(linux.SIGTRAP): "SIGTRAP", + uint64(linux.SIGTSTP): "SIGTSTP", + uint64(linux.SIGTTIN): "SIGTTIN", + uint64(linux.SIGTTOU): "SIGTTOU", + uint64(linux.SIGURG): "SIGURG", + uint64(linux.SIGUSR1): "SIGUSR1", + uint64(linux.SIGUSR2): "SIGUSR2", + uint64(linux.SIGVTALRM): "SIGVTALRM", + uint64(linux.SIGWINCH): "SIGWINCH", + uint64(linux.SIGXCPU): "SIGXCPU", + uint64(linux.SIGXFSZ): "SIGXFSZ", +} + +var signalMaskActions = abi.ValueSet{ + linux.SIG_BLOCK: "SIG_BLOCK", + linux.SIG_UNBLOCK: "SIG_UNBLOCK", + linux.SIG_SETMASK: "SIG_SETMASK", +} + +func sigSet(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + var b [linux.SignalSetSize]byte + if _, err := t.CopyInBytes(addr, b[:]); err != nil { + return fmt.Sprintf("%#x (error copying sigset: %v)", addr, err) + } + + set := linux.SignalSet(usermem.ByteOrder.Uint64(b[:])) + + var signals []string + linux.ForEachSignal(set, func(sig linux.Signal) { + signals = append(signals, signalNames.ParseDecimal(uint64(sig))) + }) + + return fmt.Sprintf("%#x [%v]", addr, strings.Join(signals, " ")) +} diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index e40e0b57c..6df84e690 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go 
@@ -333,6 +333,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, PtraceRequestSet.Parse(args[arg].Uint64())) case ItimerType: output = append(output, ItimerTypes.Parse(uint64(args[arg].Int()))) + case Signal: + output = append(output, signalNames.ParseDecimal(args[arg].Uint64())) + case SignalMaskAction: + output = append(output, signalMaskActions.Parse(uint64(args[arg].Int()))) + case SigSet: + output = append(output, sigSet(t, args[arg].Pointer())) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: @@ -391,6 +397,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint output[arg] = timeval(t, args[arg].Pointer()) case Rusage: output[arg] = rusage(t, args[arg].Pointer()) + case PostSigSet: + output[arg] = sigSet(t, args[arg].Pointer()) } } } diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 9eeb18a03..22aecc009 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -170,6 +170,18 @@ const ( // ItimerType is an itimer type (ITIMER_REAL, etc). ItimerType + + // Signal is a signal number. + Signal + + // SignalMaskAction is a signal mask action passed to rt_sigprocmask(2). + SignalMaskAction + + // SigSet is a signal set. + SigSet + + // PostSigSet is a signal set, formatted after syscall execution. 
+ PostSigSet ) // defaultFormat is the syscall argument format to use if the actual format is -- cgit v1.2.3 From 673949048e84aed6cf7d6ccc4e93bfc0c3855c61 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 7 Dec 2018 11:52:31 -0800 Subject: Add period to comment PiperOrigin-RevId: 224553291 Change-Id: I35d0772c215b71f4319c23f22df5c61c908f8590 --- pkg/sentry/fs/proc/proc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 33030bebf..b658cd328 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -251,7 +251,7 @@ func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset return offset, err } -// newMountsSymlink returns a symlink to "self/mounts" +// newMountsSymlink returns a symlink to "self/mounts". func newMountsSymlink(ctx context.Context, msrc *fs.MountSource) *fs.Inode { s := &ramfs.Symlink{} s.InitSymlink(ctx, fs.RootOwner, "self/mounts") -- cgit v1.2.3 From 42e2e5cae9b035a62bdbf492ad4a1e9d016c5830 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 7 Dec 2018 16:27:35 -0800 Subject: Format sigaction in strace Sample: I1206 14:24:56.768520 3700 x:0] [ 1] ioctl_test E rt_sigaction(SIGSEGV, 0x7ee6edb0c590 {Handler: 0x559c6d915cf0, Flags: SA_SIGINFO|SA_RESTORER|SA_ONSTACK|SA_NODEFER, Restorer: 0x2a9901a259a0, Mask: []}, 0x7ee6edb0c630) I1206 14:24:56.768530 3700 x:0] [ 1] ioctl_test X rt_sigaction(SIGSEGV, 0x7ee6edb0c590 {Handler: 0x559c6d915cf0, Flags: SA_SIGINFO|SA_RESTORER|SA_ONSTACK|SA_NODEFER, Restorer: 0x2a9901a259a0, Mask: []}, 0x7ee6edb0c630 {Handler: SIG_DFL, Flags: 0x0, Restorer: 0x0, Mask: []}) = 0x0 (2.701?s) PiperOrigin-RevId: 224596606 Change-Id: I3512493aed99d3d75600249263da46686b1dc0e7 --- pkg/abi/flag.go | 5 ++++ pkg/abi/linux/signal.go | 19 +++++++------ pkg/sentry/strace/linux64.go | 2 +- pkg/sentry/strace/signal.go | 64 ++++++++++++++++++++++++++++++++++++++++++- 
pkg/sentry/strace/strace.go | 4 +++ pkg/sentry/strace/syscalls.go | 6 ++++ 6 files changed, 89 insertions(+), 11 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/flag.go b/pkg/abi/flag.go index ec87c9cee..b48757da8 100644 --- a/pkg/abi/flag.go +++ b/pkg/abi/flag.go @@ -43,6 +43,11 @@ func (s FlagSet) Parse(val uint64) string { flags = append(flags, "0x"+strconv.FormatUint(val, 16)) } + if len(flags) == 0 { + // Prefer 0 to an empty string. + return "0x0" + } + return strings.Join(flags, "|") } diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index bf9bce6ed..395f9f31e 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -165,15 +165,16 @@ const ( // Signal action flags for rt_sigaction(2), from uapi/asm-generic/signal.h const ( - SA_NOCLDSTOP = 0x00000001 - SA_NOCLDWAIT = 0x00000002 - SA_SIGINFO = 0x00000004 - SA_ONSTACK = 0x08000000 - SA_RESTART = 0x10000000 - SA_NODEFER = 0x40000000 - SA_RESTARTHAND = 0x80000000 - SA_NOMASK = SA_NODEFER - SA_ONESHOT = SA_RESTARTHAND + SA_NOCLDSTOP = 0x00000001 + SA_NOCLDWAIT = 0x00000002 + SA_SIGINFO = 0x00000004 + SA_RESTORER = 0x04000000 + SA_ONSTACK = 0x08000000 + SA_RESTART = 0x10000000 + SA_NODEFER = 0x40000000 + SA_RESETHAND = 0x80000000 + SA_NOMASK = SA_NODEFER + SA_ONESHOT = SA_RESETHAND ) // Signal info types. 
diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index a2ca1a456..9457e24b5 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -30,7 +30,7 @@ var linuxAMD64 = SyscallMap{ 10: makeSyscallInfo("mprotect", Hex, Hex, Hex), 11: makeSyscallInfo("munmap", Hex, Hex), 12: makeSyscallInfo("brk", Hex), - 13: makeSyscallInfo("rt_sigaction", Signal, Hex, Hex), + 13: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction), 14: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex), 15: makeSyscallInfo("rt_sigreturn"), 16: makeSyscallInfo("ioctl", Hex, Hex, Hex), diff --git a/pkg/sentry/strace/signal.go b/pkg/sentry/strace/signal.go index 00ed02a3c..524be0e15 100644 --- a/pkg/sentry/strace/signal.go +++ b/pkg/sentry/strace/signal.go @@ -65,6 +65,41 @@ var signalMaskActions = abi.ValueSet{ linux.SIG_SETMASK: "SIG_SETMASK", } +var sigActionFlags = abi.FlagSet{ + { + Flag: linux.SA_NOCLDSTOP, + Name: "SA_NOCLDSTOP", + }, + { + Flag: linux.SA_NOCLDWAIT, + Name: "SA_NOCLDWAIT", + }, + { + Flag: linux.SA_SIGINFO, + Name: "SA_SIGINFO", + }, + { + Flag: linux.SA_RESTORER, + Name: "SA_RESTORER", + }, + { + Flag: linux.SA_ONSTACK, + Name: "SA_ONSTACK", + }, + { + Flag: linux.SA_RESTART, + Name: "SA_RESTART", + }, + { + Flag: linux.SA_NODEFER, + Name: "SA_NODEFER", + }, + { + Flag: linux.SA_RESETHAND, + Name: "SA_RESETHAND", + }, +} + func sigSet(t *kernel.Task, addr usermem.Addr) string { if addr == 0 { return "null" @@ -77,10 +112,37 @@ func sigSet(t *kernel.Task, addr usermem.Addr) string { set := linux.SignalSet(usermem.ByteOrder.Uint64(b[:])) + return fmt.Sprintf("%#x %s", addr, formatSigSet(set)) +} + +func formatSigSet(set linux.SignalSet) string { var signals []string linux.ForEachSignal(set, func(sig linux.Signal) { signals = append(signals, signalNames.ParseDecimal(uint64(sig))) }) - return fmt.Sprintf("%#x [%v]", addr, strings.Join(signals, " ")) + return fmt.Sprintf("[%v]", strings.Join(signals, 
" ")) +} + +func sigAction(t *kernel.Task, addr usermem.Addr) string { + if addr == 0 { + return "null" + } + + sa, err := t.CopyInSignalAct(addr) + if err != nil { + return fmt.Sprintf("%#x (error copying sigaction: %v)", addr, err) + } + + var handler string + switch sa.Handler { + case linux.SIG_IGN: + handler = "SIG_IGN" + case linux.SIG_DFL: + handler = "SIG_DFL" + default: + handler = fmt.Sprintf("%#x", sa.Handler) + } + + return fmt.Sprintf("%#x {Handler: %s, Flags: %s, Restorer: %#x, Mask: %s}", addr, handler, sigActionFlags.Parse(sa.Flags), sa.Restorer, formatSigSet(sa.Mask)) } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 6df84e690..da27a2ae8 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -339,6 +339,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, signalMaskActions.Parse(uint64(args[arg].Int()))) case SigSet: output = append(output, sigSet(t, args[arg].Pointer())) + case SigAction: + output = append(output, sigAction(t, args[arg].Pointer())) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: @@ -399,6 +401,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint output[arg] = rusage(t, args[arg].Pointer()) case PostSigSet: output[arg] = sigSet(t, args[arg].Pointer()) + case PostSigAction: + output[arg] = sigAction(t, args[arg].Pointer()) } } } diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 22aecc009..1ae982354 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -182,6 +182,12 @@ const ( // PostSigSet is a signal set, formatted after syscall execution. PostSigSet + + // SigAction is a struct sigaction. + SigAction + + // PostSigAction is a struct sigaction, formatted after syscall execution. 
+ PostSigAction ) // defaultFormat is the syscall argument format to use if the actual format is -- cgit v1.2.3 From 9984138abee51d6145469f9298bfeb8a98589709 Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Fri, 7 Dec 2018 17:03:06 -0800 Subject: sentry: turn "dynamically-created" procfs files into static creation. PiperOrigin-RevId: 224600982 Change-Id: I547253528e24fb0bb318fc9d2632cb80504acb34 --- pkg/sentry/fs/proc/proc.go | 33 +++++++++++++-------------------- runsc/boot/controller.go | 4 ++++ 2 files changed, 17 insertions(+), 20 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index b658cd328..70e549c31 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -85,8 +85,6 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { p := &proc{k: k, pidns: pidns} p.InitDir(ctx, map[string]*fs.Inode{ - // Note that these are just the static members. There are - // dynamic members populated in Readdir and Lookup below. "filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), "loadavg": seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc), "meminfo": seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc), @@ -96,12 +94,23 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { }, fs.RootOwner, fs.FilePermsFromMode(0555)) p.AddChild(ctx, "cpuinfo", p.newCPUInfo(ctx, msrc)) + // If we're using rpcinet we will let it manage /proc/net. + if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { + p.AddChild(ctx, "net", newRPCInetProcNet(ctx, msrc)) + } else { + p.AddChild(ctx, "net", p.newNetDir(ctx, msrc)) + } + p.AddChild(ctx, "self", p.newSelf(ctx, msrc)) + p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) + p.AddChild(ctx, "thread-self", p.newThreadSelf(ctx, msrc)) p.AddChild(ctx, "uptime", p.newUptime(ctx, msrc)) return newFile(p, msrc, fs.SpecialDirectory, nil), nil } // self is a magical link. 
+// +// +stateify savable type self struct { ramfs.Symlink @@ -146,6 +155,8 @@ func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { } // threadSelf is more magical than "self" link. +// +// +stateify savable type threadSelf struct { ramfs.Symlink @@ -169,29 +180,11 @@ func (s *threadSelf) Readlink(ctx context.Context, inode *fs.Inode) (string, err // Lookup loads an Inode at name into a Dirent. func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { - // Is it one of the static ones? dirent, walkErr := p.Dir.Lookup(ctx, dir, name) if walkErr == nil { return dirent, nil } - // Is it a dynamic element? - nfs := map[string]func() *fs.Inode{ - "net": func() *fs.Inode { - // If we're using rpcinet we will let it manage /proc/net. - if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - return newRPCInetProcNet(ctx, dir.MountSource) - } - return p.newNetDir(ctx, dir.MountSource) - }, - "self": func() *fs.Inode { return p.newSelf(ctx, dir.MountSource) }, - "sys": func() *fs.Inode { return p.newSysDir(ctx, dir.MountSource) }, - "thread-self": func() *fs.Inode { return p.newThreadSelf(ctx, dir.MountSource) }, - } - if nf, ok := nfs[name]; ok { - return fs.NewDirent(nf(), name), nil - } - // Try to lookup a corresponding task. 
tid, err := strconv.ParseUint(name, 10, 64) if err != nil { diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 7a1f42119..05d4f3a5b 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -30,6 +30,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/state" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" "gvisor.googlesource.com/gvisor/pkg/urpc" ) @@ -356,6 +357,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { if err != nil { return fmt.Errorf("failed to create network: %v", err) } + if eps, ok := networkStack.(*epsocket.Stack); ok { + stack.StackFromEnv = eps.Stack // FIXME + } info, err := o.FilePayload.Files[0].Stat() if err != nil { return err -- cgit v1.2.3 From 25b8424d754bd659a0f976f82f7c8846dc2a194f Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Sun, 9 Dec 2018 00:49:37 -0800 Subject: Stub out TCP_QUICKACK PiperOrigin-RevId: 224696233 Change-Id: I45c425d9e32adee5dcce29ca7439a06567b26014 --- pkg/sentry/socket/epsocket/epsocket.go | 20 ++++++++++++++++++++ pkg/tcpip/tcpip.go | 3 +++ pkg/tcpip/transport/tcp/endpoint.go | 22 ++++++++++++++++++++++ 3 files changed, 45 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 8c5db6af8..e1cda78c4 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -698,6 +698,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(v), nil + case linux.TCP_QUICKACK: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.QuickAckOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + case linux.TCP_INFO: var v tcpip.TCPInfoOption if err := ep.GetSockOpt(&v); err != nil { @@ -870,6 
+882,14 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v))) + case linux.TCP_QUICKACK: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v))) + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index dc6339173..f6dd29e77 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -436,6 +436,9 @@ type CorkOption int // should allow reuse of local address. type ReuseAddressOption int +// QuickAckOption is stubbed out in SetSockOpt/GetSockOpt. +type QuickAckOption int + // PasscredOption is used by SetSockOpt/GetSockOpt to specify whether // SCM_CREDENTIALS socket control messages are enabled. // diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 6034ba90b..37d4c8f9e 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -177,6 +177,12 @@ type endpoint struct { // options. reuseAddr bool + // slowAck holds the negated state of quick ack. It is stubbed out and + // does nothing. + // + // slowAck is a boolean (0 is false) and must be accessed atomically. + slowAck uint32 + // segmentQueue is used to hand received segments to the protocol // goroutine. Segments are queued as long as the queue is not full, // and dropped when it is. @@ -677,6 +683,15 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.mu.Unlock() return nil + case tcpip.QuickAckOption: + if v == 0 { + atomic.StoreUint32(&e.slowAck, 1) + } else { + atomic.StoreUint32(&e.slowAck, 0) + } + + return nil + case tcpip.ReceiveBufferSizeOption: // Make sure the receive buffer size is within the min and max // allowed. 
@@ -859,6 +874,13 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } return nil + case *tcpip.QuickAckOption: + *o = 1 + if v := atomic.LoadUint32(&e.slowAck); v != 0 { + *o = 0 + } + return nil + case *tcpip.V6OnlyOption: // We only recognize this option on v6 endpoints. if e.netProto != header.IPv6ProtocolNumber { -- cgit v1.2.3 From 99d595869332f817de8f570fae184658c513a43c Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 10 Dec 2018 12:36:27 -0800 Subject: Validate FS_BASE in Task.Clone arch_prctl already verified that the new FS_BASE was canonical, but Task.Clone did not. Centralize these checks in the arch packages. Failure to validate could cause an error in PTRACE_SET_REGS when we try to switch to the app. PiperOrigin-RevId: 224862398 Change-Id: Iefe63b3f9aa6c4810326b8936e501be3ec407f14 --- pkg/sentry/arch/arch.go | 6 ++++++ pkg/sentry/arch/arch_amd64.go | 16 ++++++++++++++++ pkg/sentry/arch/arch_x86.go | 10 ++++++++-- pkg/sentry/kernel/task_clone.go | 4 +++- pkg/sentry/platform/ptrace/subprocess.go | 4 ++-- pkg/sentry/syscalls/linux/sys_tls.go | 9 +++------ 6 files changed, 38 insertions(+), 11 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 575b7ba66..4cd7a9af5 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -115,6 +115,12 @@ type Context interface { // SetStack sets the current stack pointer. SetStack(value uintptr) + // TLS returns the current TLS pointer. + TLS() uintptr + + // SetTLS sets the current TLS pointer. Returns false if value is invalid. + SetTLS(value uintptr) bool + // SetRSEQInterruptedIP sets the register that contains the old IP when a // restartable sequence is interrupted. 
SetRSEQInterruptedIP(value uintptr) diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index bb80a7bed..2507774f7 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -158,6 +158,22 @@ func (c *context64) SetStack(value uintptr) { c.Regs.Rsp = uint64(value) } +// TLS returns the current TLS pointer. +func (c *context64) TLS() uintptr { + return uintptr(c.Regs.Fs_base) +} + +// SetTLS sets the current TLS pointer. Returns false if value is invalid. +func (c *context64) SetTLS(value uintptr) bool { + if !isValidSegmentBase(uint64(value)) { + return false + } + + c.Regs.Fs = 0 + c.Regs.Fs_base = uint64(value) + return true +} + // SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP. func (c *context64) SetRSEQInterruptedIP(value uintptr) { c.Regs.R10 = uint64(value) diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 59bf89d99..e50a76083 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -353,10 +353,10 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) { if !isUserSegmentSelector(regs.Ss) { return 0, syscall.EIO } - if regs.Fs_base >= uint64(maxAddr64) { + if !isValidSegmentBase(regs.Fs_base) { return 0, syscall.EIO } - if regs.Gs_base >= uint64(maxAddr64) { + if !isValidSegmentBase(regs.Gs_base) { return 0, syscall.EIO } // CS and SS are validated, but changes to them are otherwise silently @@ -389,6 +389,12 @@ func isUserSegmentSelector(reg uint64) bool { return reg&3 == 3 } +// isValidSegmentBase returns true if the given segment base specifies a +// canonical user address. +func isValidSegmentBase(reg uint64) bool { + return reg < uint64(maxAddr64) +} + // ptraceFPRegsSize is the size in bytes of Linux's user_i387_struct, the type // manipulated by PTRACE_GETFPREGS and PTRACE_SETFPREGS on x86. Equivalently, // ptraceFPRegsSize is the size in bytes of the x86 FXSAVE area. 
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 755fe0370..b66fa34a9 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -210,7 +210,9 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { tc.Arch.SetStack(uintptr(opts.Stack)) } if opts.SetTLS { - tc.Arch.StateData().Regs.Fs_base = uint64(opts.TLS) + if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + return 0, nil, syserror.EPERM + } } var fsc *FSContext diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 5e56a1514..a9d083f5a 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -480,10 +480,10 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { // Set registers. if err := t.setRegs(regs); err != nil { - panic(fmt.Sprintf("ptrace set regs failed: %v", err)) + panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err)) } if err := t.setFPRegs(fpState, uint64(fpLen), useXsave); err != nil { - panic(fmt.Sprintf("ptrace set fpregs failed: %v", err)) + panic(fmt.Sprintf("ptrace set fpregs (%+v) failed: %v", fpState, err)) } for { diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index 40e84825b..8ea78093b 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) // ArchPrctl implements linux syscall arch_prctl(2). 
@@ -31,19 +30,17 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys switch args[0].Int() { case linux.ARCH_GET_FS: addr := args[1].Pointer() - _, err := t.CopyOut(addr, &t.Arch().StateData().Regs.Fs_base) + fsbase := t.Arch().TLS() + _, err := t.CopyOut(addr, uint64(fsbase)) if err != nil { return 0, nil, err } case linux.ARCH_SET_FS: fsbase := args[1].Uint64() - if _, ok := t.MemoryManager().CheckIORange(usermem.Addr(fsbase), 0); !ok { + if !t.Arch().SetTLS(uintptr(fsbase)) { return 0, nil, syscall.EPERM } - regs := &t.Arch().StateData().Regs - regs.Fs = 0 - regs.Fs_base = fsbase case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t) -- cgit v1.2.3 From fc297702511edef4760c4f7a1d89cc6f02347d50 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Mon, 10 Dec 2018 12:47:20 -0800 Subject: Add type safety to shm ids and keys. PiperOrigin-RevId: 224864380 Change-Id: I49542279ad56bf15ba462d3de1ef2b157b31830a --- pkg/sentry/kernel/shm/shm.go | 24 +++++++++++++++--------- pkg/sentry/syscalls/linux/sys_shm.go | 8 ++++---- 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index f760f5f76..4343dee13 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -51,6 +51,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" ) +// Key represents a shm segment key. Analogous to a file name. +type Key int32 + +// ID represents the opaque handle for a shm segment. Analogous to an fd. +type ID int32 + // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. @@ -63,33 +69,33 @@ type Registry struct { mu sync.Mutex `state:"nosave"` // shms maps segment ids to segments. Protected by mu. 
- shms map[int32]*Shm + shms map[ID]*Shm // Sum of the sizes of all existing segments rounded up to page size, in // units of page size. Protected by mu. totalPages uint64 // lastIDUsed is protected by mu. - lastIDUsed int32 + lastIDUsed ID } // NewRegistry creates a new shm registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ userNS: userNS, - shms: make(map[int32]*Shm), + shms: make(map[ID]*Shm), } } // FindByID looks up a segment given an ID. -func (r *Registry) FindByID(id int32) *Shm { +func (r *Registry) FindByID(id ID) *Shm { r.mu.Lock() defer r.mu.Unlock() return r.shms[id] } // Precondition: Caller must hold r.mu. -func (r *Registry) findByKey(key int32) *Shm { +func (r *Registry) findByKey(key Key) *Shm { for _, v := range r.shms { if v.key == key { return v @@ -100,7 +106,7 @@ func (r *Registry) findByKey(key int32) *Shm { // FindOrCreate looks up or creates a segment in the registry. It's functionally // analogous to open(2). -func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { +func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or // greater than SHMMAX." - man shmget(2) @@ -178,7 +184,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64 } // newShm creates a new segment in the registry. 
-func (r *Registry) newShm(ctx context.Context, pid, key int32, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { +func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { p := platform.FromContext(ctx) if p == nil { panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) @@ -289,7 +295,7 @@ type Shm struct { registry *Registry // ID is the kernel identifier for this segment. Immutable. - ID int32 + ID ID // creator is the user that created the segment. Immutable. creator fs.FileOwner @@ -309,7 +315,7 @@ type Shm struct { fr platform.FileRange // key is the public identifier for this segment. - key int32 + key Key // mu protects all fields below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 8753c2e58..a0d3a73c5 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -24,7 +24,7 @@ import ( // Shmget implements shmget(2). func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - key := args[0].Int() + key := shm.Key(args[0].Int()) size := uint64(args[1].SizeT()) flag := args[2].Int() @@ -43,7 +43,7 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // findSegment retrives a shm segment by the given id. -func findSegment(t *kernel.Task, id int32) (*shm.Shm, error) { +func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) { r := t.IPCNamespace().ShmRegistry() segment := r.FindByID(id) if segment == nil { @@ -55,7 +55,7 @@ func findSegment(t *kernel.Task, id int32) (*shm.Shm, error) { // Shmat implements shmat(2). 
func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := args[0].Int() + id := shm.ID(args[0].Int()) addr := args[1].Pointer() flag := args[2].Int() @@ -86,7 +86,7 @@ func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Shmctl implements shmctl(2). func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - id := args[0].Int() + id := shm.ID(args[0].Int()) cmd := args[1].Int() buf := args[2].Pointer() -- cgit v1.2.3 From 5d87d8865f8771c00b84717d40f27f8f93dda7ca Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 10 Dec 2018 17:55:45 -0800 Subject: Implement MSG_WAITALL MSG_WAITALL requests that recv family calls do not perform short reads. It only has an effect for SOCK_STREAM sockets, other types ignore it. PiperOrigin-RevId: 224918540 Change-Id: Id97fbf972f1f7cbd4e08eec0138f8cbdf1c94fe7 --- pkg/sentry/fs/host/socket.go | 4 +- pkg/sentry/socket/epsocket/epsocket.go | 30 +++++++++++-- pkg/sentry/socket/unix/unix.go | 51 +++++++++++++++++------ pkg/sentry/syscalls/linux/sys_socket.go | 6 +-- test/syscalls/linux/socket_generic.cc | 11 ++++- test/syscalls/linux/socket_non_stream_blocking.cc | 2 - test/syscalls/linux/socket_stream_blocking.cc | 2 - 7 files changed, 79 insertions(+), 27 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 506be3056..b9e2aa705 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -169,7 +169,7 @@ func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.F ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - return unixsocket.NewWithDirent(ctx, d, ep, flags), nil + return unixsocket.NewWithDirent(ctx, d, ep, e.stype != transport.SockStream, flags), nil } // newSocket allocates a new unix socket with host endpoint. 
@@ -201,7 +201,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - return unixsocket.New(ctx, ep), nil + return unixsocket.New(ctx, ep, e.stype != transport.SockStream), nil } // Send implements transport.ConnectedEndpoint.Send. diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index e1cda78c4..b49ef21ad 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1300,6 +1300,8 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 + dontWait := flags&linux.MSG_DONTWAIT != 0 + waitAll := flags&linux.MSG_WAITALL != 0 if senderRequested && !s.isPacketBased() { // Stream sockets ignore the sender address. senderRequested = false @@ -1311,10 +1313,19 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - if err != syserr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + if err != nil && (err != syserr.ErrWouldBlock || dontWait) { + // Read failed and we should not retry. + return 0, nil, 0, socket.ControlMessages{}, err + } + + if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) { + // We got all the data we need. return } + // Don't overwrite any data we received. + dst = dst.DropFirst(n) + // We'll have to block. Register for notifications and keep trying to // send all the data. 
e, ch := waiter.NewChannelEntry(nil) @@ -1322,10 +1333,23 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags defer s.EventUnregister(&e) for { - n, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) - if err != syserr.ErrWouldBlock { + var rn int + rn, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + n += rn + if err != nil && err != syserr.ErrWouldBlock { + // Always stop on errors other than would block as we generally + // won't be able to get any more data. Eat the error if we got + // any data. + if n > 0 { + err = nil + } + return + } + if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) { + // We got all the data we need. return } + dst = dst.DropFirst(rn) if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 4379486cf..11cad411d 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -53,19 +53,21 @@ type SocketOperations struct { fsutil.NoopFlush `state:"nosave"` fsutil.NoMMap `state:"nosave"` ep transport.Endpoint + isPacket bool } // New creates a new unix socket. -func New(ctx context.Context, endpoint transport.Endpoint) *fs.File { +func New(ctx context.Context, endpoint transport.Endpoint, isPacket bool) *fs.File { dirent := socket.NewDirent(ctx, unixSocketDevice) defer dirent.DecRef() - return NewWithDirent(ctx, dirent, endpoint, fs.FileFlags{Read: true, Write: true}) + return NewWithDirent(ctx, dirent, endpoint, isPacket, fs.FileFlags{Read: true, Write: true}) } // NewWithDirent creates a new unix socket using an existing dirent. 
-func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, flags fs.FileFlags) *fs.File { +func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, isPacket bool, flags fs.FileFlags) *fs.File { return fs.NewFile(ctx, d, flags, &SocketOperations{ - ep: ep, + ep: ep, + isPacket: isPacket, }) } @@ -188,7 +190,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - ns := New(t, ep) + ns := New(t, ep, s.isPacket) defer ns.DecRef() if flags&linux.SOCK_NONBLOCK != 0 { @@ -471,6 +473,8 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 + dontWait := flags&linux.MSG_DONTWAIT != 0 + waitAll := flags&linux.MSG_WAITALL != 0 // Calculate the number of FDs for which we have space and if we are // requesting credentials. 
@@ -497,7 +501,8 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if senderRequested { r.From = &tcpip.FullAddress{} } - if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + var total int64 + if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait { var from interface{} var fromLen uint32 if r.From != nil { @@ -506,7 +511,13 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) + if err != nil || dontWait || !waitAll || s.isPacket || n >= dst.NumBytes() { + return int(n), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) + } + + // Don't overwrite any data we received. + dst = dst.DropFirst64(n) + total += n } // We'll have to block. Register for notification and keep trying to @@ -525,7 +536,13 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) + total += n + if err != nil || !waitAll || s.isPacket || n >= dst.NumBytes() { + return int(total), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) + } + + // Don't overwrite any data we received. + dst = dst.DropFirst64(n) } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { @@ -549,16 +566,21 @@ func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) // Create the endpoint and socket. 
var ep transport.Endpoint + var isPacket bool switch stype { case linux.SOCK_DGRAM: + isPacket = true ep = transport.NewConnectionless() - case linux.SOCK_STREAM, linux.SOCK_SEQPACKET: + case linux.SOCK_SEQPACKET: + isPacket = true + fallthrough + case linux.SOCK_STREAM: ep = transport.NewConnectioned(stype, t.Kernel()) default: return nil, syserr.ErrInvalidArgument } - return New(t, ep), nil + return New(t, ep, isPacket), nil } // Pair creates a new pair of AF_UNIX connected sockets. @@ -568,16 +590,19 @@ func (*provider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (* return nil, nil, syserr.ErrInvalidArgument } + var isPacket bool switch stype { - case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: + case linux.SOCK_STREAM: + case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: + isPacket = true default: return nil, nil, syserr.ErrInvalidArgument } // Create the endpoints and sockets. ep1, ep2 := transport.NewPair(stype, t.Kernel()) - s1 := New(t, ep1) - s2 := New(t, ep2) + s1 := New(t, ep1, isPacket) + s2 := New(t, ep2, isPacket) return s1, s2, nil } diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 0a7551742..1165d4566 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -602,7 +602,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE|linux.MSG_WAITALL) != 0 { return 0, nil, syscall.EINVAL } @@ -635,7 +635,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Reject flags that we don't handle yet. 
- if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE|linux.MSG_WAITALL) != 0 { return 0, nil, syscall.EINVAL } @@ -791,7 +791,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CONFIRM) != 0 { + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CONFIRM|linux.MSG_WAITALL) != 0 { return 0, syscall.EINVAL } diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc index fbc3bebed..fdc346d4d 100644 --- a/test/syscalls/linux/socket_generic.cc +++ b/test/syscalls/linux/socket_generic.cc @@ -383,8 +383,6 @@ TEST_P(AllSocketPairTest, RecvmsgTimeoutOneSecondSucceeds) { } TEST_P(AllSocketPairTest, RecvWaitAll) { - SKIP_IF(IsRunningOnGvisor()); // FIXME: Support MSG_WAITALL. 
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[100]; @@ -399,5 +397,14 @@ TEST_P(AllSocketPairTest, RecvWaitAll) { SyscallSucceedsWithValue(sizeof(sent_data))); } +TEST_P(AllSocketPairTest, RecvWaitAllDontWait) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char data[100] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), data, sizeof(data), + MSG_WAITALL | MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc index d64b181c9..9e92628c3 100644 --- a/test/syscalls/linux/socket_non_stream_blocking.cc +++ b/test/syscalls/linux/socket_non_stream_blocking.cc @@ -31,8 +31,6 @@ namespace gvisor { namespace testing { TEST_P(BlockingNonStreamSocketPairTest, RecvLessThanBufferWaitAll) { - SKIP_IF(IsRunningOnGvisor()); // FIXME: Support MSG_WAITALL. - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[100]; diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc index dd209c67c..3fbbe54d8 100644 --- a/test/syscalls/linux/socket_stream_blocking.cc +++ b/test/syscalls/linux/socket_stream_blocking.cc @@ -99,8 +99,6 @@ TEST_P(BlockingStreamSocketPairTest, RecvLessThanBuffer) { } TEST_P(BlockingStreamSocketPairTest, RecvLessThanBufferWaitAll) { - SKIP_IF(IsRunningOnGvisor()); // FIXME: Support MSG_WAITALL. - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[100]; -- cgit v1.2.3 From 52fe3b87a415006a4ef96548e33a7153b14ac28d Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Mon, 10 Dec 2018 21:34:08 -0800 Subject: Add safecopy support for arm64 platform. 
Signed-off-by: Haibo Xu Change-Id: I565214581eeb44045169da7f44d45a489082ac3a PiperOrigin-RevId: 224938170 --- pkg/sentry/platform/safecopy/BUILD | 4 + pkg/sentry/platform/safecopy/atomic_arm64.s | 98 ++++++++++++++++++ pkg/sentry/platform/safecopy/memclr_arm64.s | 74 +++++++++++++ pkg/sentry/platform/safecopy/memcpy_arm64.s | 78 ++++++++++++++ pkg/sentry/platform/safecopy/sighandler_arm64.s | 132 ++++++++++++++++++++++++ 5 files changed, 386 insertions(+) create mode 100644 pkg/sentry/platform/safecopy/atomic_arm64.s create mode 100644 pkg/sentry/platform/safecopy/memclr_arm64.s create mode 100644 pkg/sentry/platform/safecopy/memcpy_arm64.s create mode 100644 pkg/sentry/platform/safecopy/sighandler_arm64.s (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index ee58a805e..cb8347dd8 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -6,11 +6,15 @@ go_library( name = "safecopy", srcs = [ "atomic_amd64.s", + "atomic_arm64.s", "memclr_amd64.s", + "memclr_arm64.s", "memcpy_amd64.s", + "memcpy_arm64.s", "safecopy.go", "safecopy_unsafe.go", "sighandler_amd64.s", + "sighandler_arm64.s", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy", visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/platform/safecopy/atomic_arm64.s b/pkg/sentry/platform/safecopy/atomic_arm64.s new file mode 100644 index 000000000..554a5c1e1 --- /dev/null +++ b/pkg/sentry/platform/safecopy/atomic_arm64.s @@ -0,0 +1,98 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleSwapUint32Fault returns the value stored in R1. Control is transferred +// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in R1. 
+// +// It must have the same frame configuration as swapUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVW R1, sig+20(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Xchg. +// +//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) +TEXT ·swapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint32Fault will store a different value in this address. + MOVW $0, sig+20(FP) +again: + MOVD addr+0(FP), R0 + MOVW new+8(FP), R1 + LDAXRW (R0), R2 + STLXRW R1, (R0), R3 + CBNZ R3, again + MOVW R2, old+16(FP) + RET + +// handleSwapUint64Fault returns the value stored in R1. Control is transferred +// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal +// number stored in R1. +// +// It must have the same frame configuration as swapUint64 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 + MOVW R1, sig+24(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Xchg64. +// +//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) +TEXT ·swapUint64(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint64Fault will store a different value in this address. + MOVW $0, sig+24(FP) +again: + MOVD addr+0(FP), R0 + MOVD new+8(FP), R1 + LDAXR (R0), R2 + STLXR R1, (R0), R3 + CBNZ R3, again + MOVD R2, old+16(FP) + RET + +// handleCompareAndSwapUint32Fault returns the value stored in R1. 
Control is +// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS, +// with the signal number stored in R1. +// +// It must have the same frame configuration as compareAndSwapUint32 so that it +// can undo any potential call frame set up by the assembler. +TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVW R1, sig+20(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Cas. +// +//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) +TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, this is + // the value the caller will see; if a signal is received, + // handleCompareAndSwapUint32Fault will store a different value in this + // address. + MOVW $0, sig+20(FP) + + MOVD addr+0(FP), R0 + MOVW old+8(FP), R1 + MOVW new+12(FP), R2 +again: + LDAXRW (R0), R3 + CMPW R1, R3 + BNE done + STLXRW R2, (R0), R4 + CBNZ R4, again +done: + MOVW R3, prev+16(FP) + RET diff --git a/pkg/sentry/platform/safecopy/memclr_arm64.s b/pkg/sentry/platform/safecopy/memclr_arm64.s new file mode 100644 index 000000000..7361b9067 --- /dev/null +++ b/pkg/sentry/platform/safecopy/memclr_arm64.s @@ -0,0 +1,74 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleMemclrFault returns (the value stored in R0, the value stored in R1). +// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, +// with the faulting address stored in R0 and the signal number stored in R1. +// +// It must have the same frame configuration as memclr so that it can undo any +// potential call frame set up by the assembler. 
+TEXT handleMemclrFault(SB), NOSPLIT, $0-28 + MOVD R0, addr+16(FP) + MOVW R1, sig+24(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from runtime.memclrNoHeapPointers. +// +// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memclr(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemclrFault will store a different value in this address. + MOVW $0, sig+24(FP) + MOVD ptr+0(FP), R0 + MOVD n+8(FP), R1 + + // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 + BLT tail_zero + // Get buffer offset into 16 byte aligned address for better performance + ANDS $15, R0, ZR + BNE unaligned_to_16 +aligned_to_16: + LSR $4, R1, R2 +zero_by_16: + STP.P (ZR, ZR), 16(R0) // Store pair with post index. + SUBS $1, R2, R2 + BNE zero_by_16 + ANDS $15, R1, R1 + BEQ end + + // Zero buffer with size=R1 < 16 +tail_zero: + TBZ $3, R1, tail_zero_4 + MOVD.P ZR, 8(R0) +tail_zero_4: + TBZ $2, R1, tail_zero_2 + MOVW.P ZR, 4(R0) +tail_zero_2: + TBZ $1, R1, tail_zero_1 + MOVH.P ZR, 2(R0) +tail_zero_1: + TBZ $0, R1, end + MOVB ZR, (R0) +end: + RET + +unaligned_to_16: + MOVD R0, R2 +head_loop: + MOVBU.P ZR, 1(R0) + ANDS $15, R0, ZR + BNE head_loop + // Adjust length for what remains + SUB R2, R0, R3 + SUB R3, R1 + // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 + BLT tail_zero + B aligned_to_16 diff --git a/pkg/sentry/platform/safecopy/memcpy_arm64.s b/pkg/sentry/platform/safecopy/memcpy_arm64.s new file mode 100644 index 000000000..e7e541565 --- /dev/null +++ b/pkg/sentry/platform/safecopy/memcpy_arm64.s @@ -0,0 +1,78 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "textflag.h" + +// handleMemcpyFault returns (the value stored in R0, the value stored in R1). +// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, +// with the faulting address stored in R0 and the signal number stored in R1. +// +// It must have the same frame configuration as memcpy so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 + MOVD R0, addr+24(FP) + MOVW R1, sig+32(FP) + RET + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. +// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +// The code is derived from the Go source runtime.memmove. +// +// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memcpy(SB), NOSPLIT, $-8-36 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemcpyFault will store a different value in this address. + MOVW $0, sig+32(FP) + + MOVD to+0(FP), R3 + MOVD from+8(FP), R4 + MOVD n+16(FP), R5 + CMP $0, R5 + BNE check + RET + +check: + AND $~7, R5, R7 // R7 is N&~7. + SUB R7, R5, R6 // R6 is N&7. + + // Copying forward proceeds by copying R7/8 words then copying R6 bytes. + // R3 and R4 are advanced as we copy. + + // (There may be implementations of armv8 where copying by bytes until + // at least one of source or dest is word aligned is a worthwhile + // optimization, but the on the one tested so far (xgene) it did not + // make a significance difference.) + + CMP $0, R7 // Do we need to do any word-by-word copying? 
+ BEQ noforwardlarge + ADD R3, R7, R9 // R9 points just past where we copy by word. + +forwardlargeloop: + MOVD.P 8(R4), R8 // R8 is just a scratch register. + MOVD.P R8, 8(R3) + CMP R3, R9 + BNE forwardlargeloop + +noforwardlarge: + CMP $0, R6 // Do we need to do any byte-by-byte copying? + BNE forwardtail + RET + +forwardtail: + ADD R3, R6, R9 // R9 points just past the destination memory. + +forwardtailloop: + MOVBU.P 1(R4), R8 + MOVBU.P R8, 1(R3) + CMP R3, R9 + BNE forwardtailloop + RET diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s new file mode 100644 index 000000000..5e8e193e7 --- /dev/null +++ b/pkg/sentry/platform/safecopy/sighandler_arm64.s @@ -0,0 +1,132 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// The signals handled by sigHandler. +#define SIGBUS 7 +#define SIGSEGV 11 + +// Offsets to the registers in context->uc_mcontext.gregs[]. +#define REG_R0 0xB8 +#define REG_R1 0xC0 +#define REG_PC 0x1B8 + +// Offset to the si_addr field of siginfo. +#define SI_CODE 0x08 +#define SI_ADDR 0x10 + +// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must +// not be set up as a handler to any other signals. 
+// +// If the instruction causing the signal is within a safecopy-protected +// function, the signal is handled such that execution resumes in the +// appropriate fault handling stub with R0 containing the faulting address and +// R1 containing the signal number. Otherwise control is transferred to the +// previously configured signal handler (savedSigSegvHandler or +// savedSigBusHandler). +// +// This function cannot be written in go because it runs whenever a signal is +// received by the thread (preempting whatever was running), which includes when +// garbage collector has stopped or isn't expecting any interactions (like +// barriers). +// +// The arguments are the following: +// R0 - The signal number. +// R1 - Pointer to siginfo_t structure. +// R2 - Pointer to ucontext structure. +TEXT ·signalHandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel, si_code > 0 means a kernel signal. + MOVD SI_CODE(R1), R7 + CMPW $0x0, R7 + BLE original_handler + + // Check if PC is within the area we care about. + MOVD REG_PC(R2), R7 + MOVD ·memcpyBegin(SB), R8 + CMP R8, R7 + BLO not_memcpy + MOVD ·memcpyEnd(SB), R8 + CMP R8, R7 + BHS not_memcpy + + // Modify the context such that execution will resume in the fault handler. 
+ MOVD $handleMemcpyFault(SB), R7 + B handle_fault + +not_memcpy: + MOVD ·memclrBegin(SB), R8 + CMP R8, R7 + BLO not_memclr + MOVD ·memclrEnd(SB), R8 + CMP R8, R7 + BHS not_memclr + + MOVD $handleMemclrFault(SB), R7 + B handle_fault + +not_memclr: + MOVD ·swapUint32Begin(SB), R8 + CMP R8, R7 + BLO not_swapuint32 + MOVD ·swapUint32End(SB), R8 + CMP R8, R7 + BHS not_swapuint32 + + MOVD $handleSwapUint32Fault(SB), R7 + B handle_fault + +not_swapuint32: + MOVD ·swapUint64Begin(SB), R8 + CMP R8, R7 + BLO not_swapuint64 + MOVD ·swapUint64End(SB), R8 + CMP R8, R7 + BHS not_swapuint64 + + MOVD $handleSwapUint64Fault(SB), R7 + B handle_fault + +not_swapuint64: + MOVD ·compareAndSwapUint32Begin(SB), R8 + CMP R8, R7 + BLO not_casuint32 + MOVD ·compareAndSwapUint32End(SB), R8 + CMP R8, R7 + BHS not_casuint32 + + MOVD $handleCompareAndSwapUint32Fault(SB), R7 + B handle_fault + +not_casuint32: +original_handler: + // Jump to the previous signal handler, which is likely the golang one. + MOVD ·savedSigBusHandler(SB), R7 + MOVD ·savedSigSegVHandler(SB), R8 + CMPW $SIGSEGV, R0 + CSEL EQ, R8, R7, R7 + B (R7) + +handle_fault: + // Entered with the address of the fault handler in R7; store it in PC. + MOVD R7, REG_PC(R2) + + // Store the faulting address in R0. + MOVD SI_ADDR(R1), R7 + MOVD R7, REG_R0(R2) + + // Store the signal number in R1. + MOVW R0, REG_R1(R2) + + RET -- cgit v1.2.3 From 5934fad1d781f13d04184c7585014a98a3b86958 Mon Sep 17 00:00:00 2001 From: Christopher Koch Date: Tue, 11 Dec 2018 11:39:17 -0800 Subject: Remove unused envv variable from two funcs. 
PiperOrigin-RevId: 225041520 Change-Id: Ib1afc693e592d308d60db82022c5b7743fd3c646 --- pkg/sentry/loader/interpreter.go | 2 +- pkg/sentry/loader/loader.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 06a3c7156..35b83654d 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -37,7 +37,7 @@ const ( ) // parseInterpreterScript returns the interpreter path and argv. -func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv, envv []string) (newpath string, newargv []string, err error) { +func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv []string) (newpath string, newargv []string, err error) { line := make([]byte, interpMaxLineLength) n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0) // Short read is OK. diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 69a090844..e955502e3 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -136,7 +136,7 @@ const ( // * arch.Context matching the binary arch // * fs.Dirent of the binary file // * Possibly updated argv -func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { +func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, fs *cpuid.FeatureSet, filename string, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { d, f, err := openPath(ctx, mounts, root, wd, remainingTraversals, filename) if err != nil { @@ -172,7 +172,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac d.IncRef() return loaded, 
ac, d, argv, err case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): - newpath, newargv, err := parseInterpreterScript(ctx, filename, f, argv, envv) + newpath, newargv, err := parseInterpreterScript(ctx, filename, f, argv) if err != nil { ctx.Infof("Error loading interpreter script: %v", err) return loadedELF{}, nil, nil, nil, err @@ -198,7 +198,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac // * Load is called on the Task goroutine. func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) { // Load the binary itself. - loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv) + loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv) if err != nil { ctx.Infof("Failed to load %s: %v", filename, err) return 0, nil, "", err -- cgit v1.2.3 From 2b6df6a2049e839e39717f90c1760f3d410c98f1 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 11 Dec 2018 15:32:23 -0800 Subject: Format unshare flags unshare actually takes a subset of clone flags, but has no unique flags, so formatting as clone flags is close enough. 
PiperOrigin-RevId: 225082774 Change-Id: I5b580f18607c7785f323e37809094115520a17c0 --- pkg/sentry/strace/linux64.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 9457e24b5..e8fb711a5 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -289,7 +289,7 @@ var linuxAMD64 = SyscallMap{ 269: makeSyscallInfo("faccessat", Hex, Path, Oct, Hex), 270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex), 271: makeSyscallInfo("ppoll", Hex, Hex, Timespec, SigSet, Hex), - 272: makeSyscallInfo("unshare", Hex), + 272: makeSyscallInfo("unshare", CloneFlags), 273: makeSyscallInfo("set_robust_list", Hex, Hex), 274: makeSyscallInfo("get_robust_list", Hex, Hex, Hex), 275: makeSyscallInfo("splice", Hex, Hex, Hex, Hex, Hex, Hex), -- cgit v1.2.3 From 75e39eaa74c65b6f7cfb95addb6ac0cbcc7d951a Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Wed, 12 Dec 2018 13:09:10 -0800 Subject: Pass information about map writableness to filesystems. This is necessary to implement file seals for memfds. 
PiperOrigin-RevId: 225239394 Change-Id: Ib3f1ab31385afc4b24e96cd81a05ef1bebbcbb70 --- pkg/sentry/fs/binder/binder.go | 6 +- pkg/sentry/fs/copy_up.go | 8 +-- pkg/sentry/fs/fsutil/inode_cached.go | 12 ++-- pkg/sentry/fs/fsutil/inode_cached_test.go | 8 +-- pkg/sentry/fs/overlay.go | 18 +++--- pkg/sentry/fs/tmpfs/inode_file.go | 12 ++-- pkg/sentry/kernel/shm/shm.go | 6 +- pkg/sentry/memmap/mapping_set.go | 18 ++++-- pkg/sentry/memmap/mapping_set_test.go | 102 ++++++++++++++++++++++++++---- pkg/sentry/memmap/memmap.go | 17 +++-- pkg/sentry/mm/aio_context.go | 6 +- pkg/sentry/mm/lifecycle.go | 2 +- pkg/sentry/mm/mm.go | 4 ++ pkg/sentry/mm/special_mappable.go | 6 +- pkg/sentry/mm/syscalls.go | 4 +- pkg/sentry/mm/vma.go | 4 +- 16 files changed, 162 insertions(+), 71 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 42b9e8b26..e642c7f22 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -302,7 +302,7 @@ func (bp *Proc) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgum } // AddMapping implements memmap.Mappable.AddMapping. -func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { +func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error { bp.mu.Lock() defer bp.mu.Unlock() if bp.mapped.Length() != 0 { @@ -320,12 +320,12 @@ func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar userm } // RemoveMapping implements memmap.Mappable.RemoveMapping. -func (bp *Proc) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +func (*Proc) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { // Nothing to do. Notably, we don't free bp.mapped to allow another mmap. } // CopyMapping implements memmap.Mappable.CopyMapping. 
-func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { +func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error { // Nothing to do. Notably, this is one case where CopyMapping isn't // equivalent to AddMapping, as AddMapping would return EBUSY. return nil diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index d65dc74bf..6d4ebaaa4 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -270,13 +270,13 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { for seg := next.Inode.overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { added := make(memmap.MappingsOfRange) for m := range seg.Value() { - if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()); err != nil { + if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil { for m := range added { - upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()) + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) } for mr, mappings := range allAdded { for m := range mappings { - upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start) + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable) } } return err @@ -301,7 +301,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { if lowerMappable != nil { for seg := next.Inode.overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { for m := range seg.Value() { - lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()) + lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) } } } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index b0af44ddd..707ca76d2 
100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -686,10 +686,10 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error } // AddMapping implements memmap.Mappable.AddMapping. -func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { +func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { // Hot path. Avoid defers. c.mapsMu.Lock() - mapped := c.mappings.AddMapping(ms, ar, offset) + mapped := c.mappings.AddMapping(ms, ar, offset, writable) // Do this unconditionally since whether we have c.backingFile.FD() >= 0 // can change across save/restore. for _, r := range mapped { @@ -705,10 +705,10 @@ func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.Mappi } // RemoveMapping implements memmap.Mappable.RemoveMapping. -func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { // Hot path. Avoid defers. c.mapsMu.Lock() - unmapped := c.mappings.RemoveMapping(ms, ar, offset) + unmapped := c.mappings.RemoveMapping(ms, ar, offset, writable) for _, r := range unmapped { c.hostFileMapper.DecRefOn(r) } @@ -739,8 +739,8 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma } // CopyMapping implements memmap.Mappable.CopyMapping. 
-func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { - return c.AddMapping(ctx, ms, dstAR, offset) +func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return c.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index e388ec3d7..ce5201a40 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -305,7 +305,7 @@ func TestRead(t *testing.T) { // be cached. var ms noopMappingSpace ar := usermem.AddrRange{usermem.PageSize, 2 * usermem.PageSize} - if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil { + if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize, true); err != nil { t.Fatalf("AddMapping got %v, want nil", err) } mr := memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize} @@ -334,7 +334,7 @@ func TestRead(t *testing.T) { // Delete the memory mapping and expect it to cause the cached page to be // uncached. - iops.RemoveMapping(ctx, ms, ar, usermem.PageSize) + iops.RemoveMapping(ctx, ms, ar, usermem.PageSize, true) if cached := iops.cache.Span(); cached != 0 { t.Fatalf("Span got %d, want 0", cached) } @@ -363,10 +363,10 @@ func TestWrite(t *testing.T) { // Translate to force them to be cached. 
var ms noopMappingSpace ar := usermem.AddrRange{usermem.PageSize, 3 * usermem.PageSize} - if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil { + if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize, true); err != nil { t.Fatalf("AddMapping got %v, want nil", err) } - defer iops.RemoveMapping(ctx, ms, ar, usermem.PageSize) + defer iops.RemoveMapping(ctx, ms, ar, usermem.PageSize, true) mr := memmap.MappableRange{usermem.PageSize, 3 * usermem.PageSize} if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil { t.Fatalf("Translate got %v, want nil", err) diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 8ace4ee64..f3e2d5cbe 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -259,32 +259,32 @@ func (o *overlayEntry) isMappableLocked() bool { } // AddMapping implements memmap.Mappable.AddMapping. -func (o *overlayEntry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { +func (o *overlayEntry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { o.mapsMu.Lock() defer o.mapsMu.Unlock() - if err := o.inodeLocked().Mappable().AddMapping(ctx, ms, ar, offset); err != nil { + if err := o.inodeLocked().Mappable().AddMapping(ctx, ms, ar, offset, writable); err != nil { return err } - o.mappings.AddMapping(ms, ar, offset) + o.mappings.AddMapping(ms, ar, offset, writable) return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. 
-func (o *overlayEntry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +func (o *overlayEntry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { o.mapsMu.Lock() defer o.mapsMu.Unlock() - o.inodeLocked().Mappable().RemoveMapping(ctx, ms, ar, offset) - o.mappings.RemoveMapping(ms, ar, offset) + o.inodeLocked().Mappable().RemoveMapping(ctx, ms, ar, offset, writable) + o.mappings.RemoveMapping(ms, ar, offset, writable) } // CopyMapping implements memmap.Mappable.CopyMapping. -func (o *overlayEntry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { +func (o *overlayEntry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { o.mapsMu.Lock() defer o.mapsMu.Unlock() - if err := o.inodeLocked().Mappable().CopyMapping(ctx, ms, srcAR, dstAR, offset); err != nil { + if err := o.inodeLocked().Mappable().CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil { return err } - o.mappings.AddMapping(ms, dstAR, offset) + o.mappings.AddMapping(ms, dstAR, offset, writable) return nil } diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 42a7d7b9c..e0181c52c 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -426,23 +426,23 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) } // AddMapping implements memmap.Mappable.AddMapping. 
-func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { +func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { f.mapsMu.Lock() defer f.mapsMu.Unlock() - f.mappings.AddMapping(ms, ar, offset) + f.mappings.AddMapping(ms, ar, offset, writable) return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. -func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { f.mapsMu.Lock() defer f.mapsMu.Unlock() - f.mappings.RemoveMapping(ms, ar, offset) + f.mappings.RemoveMapping(ms, ar, offset, writable) } // CopyMapping implements memmap.Mappable.CopyMapping. -func (f *fileInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { - return f.AddMapping(ctx, ms, dstAR, offset) +func (f *fileInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return f.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 4343dee13..2f400cbba 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -375,7 +375,7 @@ func (s *Shm) Msync(context.Context, memmap.MappableRange) error { } // AddMapping implements memmap.Mappable.AddMapping. 
-func (s *Shm) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { +func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error { s.mu.Lock() defer s.mu.Unlock() s.attachTime = ktime.NowFromContext(ctx) @@ -390,7 +390,7 @@ func (s *Shm) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem } // RemoveMapping implements memmap.Mappable.RemoveMapping. -func (s *Shm) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() // TODO: RemoveMapping may be called during task exit, when ctx @@ -411,7 +411,7 @@ func (s *Shm) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar user } // CopyMapping implements memmap.Mappable.CopyMapping. -func (s *Shm) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { +func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { return nil } diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index 33cf16f91..bd07e9aac 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -40,6 +40,7 @@ type MappingsOfRange map[MappingOfRange]struct{} type MappingOfRange struct { MappingSpace MappingSpace AddrRange usermem.AddrRange + Writable bool } func (r MappingOfRange) invalidate(opts InvalidateOpts) { @@ -92,6 +93,7 @@ func (mappingSetFunctions) Merge(r1 MappableRange, val1 MappingsOfRange, r2 Mapp Start: k1.AddrRange.End, End: k1.AddrRange.End + usermem.Addr(r2.Length()), }, + Writable: k1.Writable, } if _, ok := val2[k2]; !ok { return nil, false @@ -104,6 +106,7 @@ func (mappingSetFunctions) Merge(r1 MappableRange, val1 MappingsOfRange, r2 Mapp Start: 
k1.AddrRange.Start, End: k2.AddrRange.End, }, + Writable: k1.Writable, }] = struct{}{} } @@ -129,6 +132,7 @@ func (mappingSetFunctions) Split(r MappableRange, val MappingsOfRange, split uin Start: k.AddrRange.Start, End: k.AddrRange.Start + offset, }, + Writable: k.Writable, } m1[k1] = struct{}{} @@ -138,6 +142,7 @@ func (mappingSetFunctions) Split(r MappableRange, val MappingsOfRange, split uin Start: k.AddrRange.Start + offset, End: k.AddrRange.End, }, + Writable: k.Writable, } m2[k2] = struct{}{} } @@ -152,7 +157,7 @@ func (mappingSetFunctions) Split(r MappableRange, val MappingsOfRange, split uin // indicating that ms maps addresses [0x4000, 0x6000) to MappableRange [0x0, // 0x2000). Then for subsetRange = [0x1000, 0x2000), subsetMapping returns a // MappingOfRange for which AddrRange = [0x5000, 0x6000). -func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr usermem.Addr) MappingOfRange { +func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr usermem.Addr, writable bool) MappingOfRange { if !wholeRange.IsSupersetOf(subsetRange) { panic(fmt.Sprintf("%v is not a superset of %v", wholeRange, subsetRange)) } @@ -165,6 +170,7 @@ func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr Start: start, End: start + usermem.Addr(subsetRange.Length()), }, + Writable: writable, } } @@ -172,7 +178,7 @@ func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr // previously had no mappings. // // Preconditions: As for Mappable.AddMapping. 
-func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64) []MappableRange { +func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var mapped []MappableRange seg, gap := s.Find(mr.Start) @@ -180,7 +186,7 @@ func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset ui switch { case seg.Ok() && seg.Start() < mr.End: seg = s.Isolate(seg, mr) - seg.Value()[subsetMapping(mr, seg.Range(), ms, ar.Start)] = struct{}{} + seg.Value()[subsetMapping(mr, seg.Range(), ms, ar.Start, writable)] = struct{}{} seg, gap = seg.NextNonEmpty() case gap.Ok() && gap.Start() < mr.End: @@ -199,7 +205,7 @@ func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset ui // MappableRanges that now have no mappings. // // Preconditions: As for Mappable.RemoveMapping. -func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64) []MappableRange { +func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var unmapped []MappableRange @@ -213,7 +219,7 @@ func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset // Remove this part of the mapping. 
mappings := seg.Value() - delete(mappings, subsetMapping(mr, seg.Range(), ms, ar.Start)) + delete(mappings, subsetMapping(mr, seg.Range(), ms, ar.Start, writable)) if len(mappings) == 0 { unmapped = append(unmapped, seg.Range()) @@ -231,7 +237,7 @@ func (s *MappingSet) Invalidate(mr MappableRange, opts InvalidateOpts) { for seg := s.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { segMR := seg.Range() for m := range seg.Value() { - region := subsetMapping(segMR, segMR.Intersect(mr), m.MappingSpace, m.AddrRange.Start) + region := subsetMapping(segMR, segMR.Intersect(mr), m.MappingSpace, m.AddrRange.Start, m.Writable) region.invalidate(opts) } } diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go index 49ee34548..45d1d4688 100644 --- a/pkg/sentry/memmap/mapping_set_test.go +++ b/pkg/sentry/memmap/mapping_set_test.go @@ -40,7 +40,7 @@ func TestAddRemoveMapping(t *testing.T) { set := MappingSet{} ms := &testMappingSpace{} - mapped := set.AddMapping(ms, usermem.AddrRange{0x10000, 0x12000}, 0x1000) + mapped := set.AddMapping(ms, usermem.AddrRange{0x10000, 0x12000}, 0x1000, true) if got, want := mapped, []MappableRange{{0x1000, 0x3000}}; !reflect.DeepEqual(got, want) { t.Errorf("AddMapping: got %+v, wanted %+v", got, want) } @@ -49,7 +49,7 @@ func TestAddRemoveMapping(t *testing.T) { // [0x10000, 0x12000) => [0x1000, 0x3000) t.Log(&set) - mapped = set.AddMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000) + mapped = set.AddMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000, true) if len(mapped) != 0 { t.Errorf("AddMapping: got %+v, wanted []", mapped) } @@ -59,7 +59,7 @@ func TestAddRemoveMapping(t *testing.T) { // [0x11000, 0x12000) and [0x20000, 0x21000) => [0x2000, 0x3000) t.Log(&set) - mapped = set.AddMapping(ms, usermem.AddrRange{0x30000, 0x31000}, 0x4000) + mapped = set.AddMapping(ms, usermem.AddrRange{0x30000, 0x31000}, 0x4000, true) if got, want := mapped, 
[]MappableRange{{0x4000, 0x5000}}; !reflect.DeepEqual(got, want) { t.Errorf("AddMapping: got %+v, wanted %+v", got, want) } @@ -70,7 +70,7 @@ func TestAddRemoveMapping(t *testing.T) { // [0x30000, 0x31000) => [0x4000, 0x5000) t.Log(&set) - mapped = set.AddMapping(ms, usermem.AddrRange{0x12000, 0x15000}, 0x3000) + mapped = set.AddMapping(ms, usermem.AddrRange{0x12000, 0x15000}, 0x3000, true) if got, want := mapped, []MappableRange{{0x3000, 0x4000}, {0x5000, 0x6000}}; !reflect.DeepEqual(got, want) { t.Errorf("AddMapping: got %+v, wanted %+v", got, want) } @@ -83,7 +83,7 @@ func TestAddRemoveMapping(t *testing.T) { // [0x14000, 0x15000) => [0x5000, 0x6000) t.Log(&set) - unmapped := set.RemoveMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0x1000) + unmapped := set.RemoveMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0x1000, true) if got, want := unmapped, []MappableRange{{0x1000, 0x2000}}; !reflect.DeepEqual(got, want) { t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) } @@ -95,7 +95,7 @@ func TestAddRemoveMapping(t *testing.T) { // [0x14000, 0x15000) => [0x5000, 0x6000) t.Log(&set) - unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000) + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000, true) if len(unmapped) != 0 { t.Errorf("RemoveMapping: got %+v, wanted []", unmapped) } @@ -106,7 +106,7 @@ func TestAddRemoveMapping(t *testing.T) { // [0x14000, 0x15000) => [0x5000, 0x6000) t.Log(&set) - unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x11000, 0x15000}, 0x2000) + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x11000, 0x15000}, 0x2000, true) if got, want := unmapped, []MappableRange{{0x2000, 0x4000}, {0x5000, 0x6000}}; !reflect.DeepEqual(got, want) { t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) } @@ -115,7 +115,7 @@ func TestAddRemoveMapping(t *testing.T) { // [0x30000, 0x31000) => [0x4000, 0x5000) t.Log(&set) - unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x30000, 0x31000}, 
0x4000) + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x30000, 0x31000}, 0x4000, true) if got, want := unmapped, []MappableRange{{0x4000, 0x5000}}; !reflect.DeepEqual(got, want) { t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) } @@ -125,7 +125,7 @@ func TestInvalidateWholeMapping(t *testing.T) { set := MappingSet{} ms := &testMappingSpace{} - set.AddMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0) + set.AddMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0, true) // Mappings: // [0x10000, 0x11000) => [0, 0x1000) t.Log(&set) @@ -139,7 +139,7 @@ func TestInvalidatePartialMapping(t *testing.T) { set := MappingSet{} ms := &testMappingSpace{} - set.AddMapping(ms, usermem.AddrRange{0x10000, 0x13000}, 0) + set.AddMapping(ms, usermem.AddrRange{0x10000, 0x13000}, 0, true) // Mappings: // [0x10000, 0x13000) => [0, 0x3000) t.Log(&set) @@ -153,8 +153,8 @@ func TestInvalidateMultipleMappings(t *testing.T) { set := MappingSet{} ms := &testMappingSpace{} - set.AddMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0) - set.AddMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000) + set.AddMapping(ms, usermem.AddrRange{0x10000, 0x11000}, 0, true) + set.AddMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000, true) // Mappings: // [0x10000, 0x11000) => [0, 0x1000) // [0x12000, 0x13000) => [0x2000, 0x3000) @@ -170,8 +170,8 @@ func TestInvalidateOverlappingMappings(t *testing.T) { ms1 := &testMappingSpace{} ms2 := &testMappingSpace{} - set.AddMapping(ms1, usermem.AddrRange{0x10000, 0x12000}, 0) - set.AddMapping(ms2, usermem.AddrRange{0x20000, 0x22000}, 0x1000) + set.AddMapping(ms1, usermem.AddrRange{0x10000, 0x12000}, 0, true) + set.AddMapping(ms2, usermem.AddrRange{0x20000, 0x22000}, 0x1000, true) // Mappings: // ms1:[0x10000, 0x12000) => [0, 0x2000) // ms2:[0x11000, 0x13000) => [0x1000, 0x3000) @@ -184,3 +184,77 @@ func TestInvalidateOverlappingMappings(t *testing.T) { t.Errorf("Invalidate: ms1: got %+v, wanted %+v", got, want) } } + +func 
TestMixedWritableMappings(t *testing.T) { + set := MappingSet{} + ms := &testMappingSpace{} + + mapped := set.AddMapping(ms, usermem.AddrRange{0x10000, 0x12000}, 0x1000, true) + if got, want := mapped, []MappableRange{{0x1000, 0x3000}}; !reflect.DeepEqual(got, want) { + t.Errorf("AddMapping: got %+v, wanted %+v", got, want) + } + + // Mappings: + // [0x10000, 0x12000) writable => [0x1000, 0x3000) + t.Log(&set) + + mapped = set.AddMapping(ms, usermem.AddrRange{0x20000, 0x22000}, 0x2000, false) + if got, want := mapped, []MappableRange{{0x3000, 0x4000}}; !reflect.DeepEqual(got, want) { + t.Errorf("AddMapping: got %+v, wanted %+v", got, want) + } + + // Mappings: + // [0x10000, 0x11000) writable => [0x1000, 0x2000) + // [0x11000, 0x12000) writable and [0x20000, 0x21000) readonly => [0x2000, 0x3000) + // [0x21000, 0x22000) readonly => [0x3000, 0x4000) + t.Log(&set) + + // Unmap should fail because we specified the readonly map address range, but + // asked to unmap a writable segment. + unmapped := set.RemoveMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000, true) + if len(unmapped) != 0 { + t.Errorf("RemoveMapping: got %+v, wanted []", unmapped) + } + + // Readonly mapping removed, but writable mapping still exists in the range, + // so no mappable range fully unmapped. 
+ unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x20000, 0x21000}, 0x2000, false) + if len(unmapped) != 0 { + t.Errorf("RemoveMapping: got %+v, wanted []", unmapped) + } + + // Mappings: + // [0x10000, 0x12000) writable => [0x1000, 0x3000) + // [0x21000, 0x22000) readonly => [0x3000, 0x4000) + t.Log(&set) + + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x11000, 0x12000}, 0x2000, true) + if got, want := unmapped, []MappableRange{{0x2000, 0x3000}}; !reflect.DeepEqual(got, want) { + t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) + } + + // Mappings: + // [0x10000, 0x12000) writable => [0x1000, 0x3000) + // [0x21000, 0x22000) readonly => [0x3000, 0x4000) + t.Log(&set) + + // Unmap should fail since writable bit doesn't match. + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x10000, 0x12000}, 0x1000, false) + if len(unmapped) != 0 { + t.Errorf("RemoveMapping: got %+v, wanted []", unmapped) + } + + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x10000, 0x12000}, 0x1000, true) + if got, want := unmapped, []MappableRange{{0x1000, 0x2000}}; !reflect.DeepEqual(got, want) { + t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) + } + + // Mappings: + // [0x21000, 0x22000) readonly => [0x3000, 0x4000) + t.Log(&set) + + unmapped = set.RemoveMapping(ms, usermem.AddrRange{0x21000, 0x22000}, 0x3000, false) + if got, want := unmapped, []MappableRange{{0x3000, 0x4000}}; !reflect.DeepEqual(got, want) { + t.Errorf("RemoveMapping: got %+v, wanted %+v", got, want) + } +} diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 05349a77f..28e2bed9b 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -36,16 +36,22 @@ type Mappable interface { // AddMapping notifies the Mappable of a mapping from addresses ar in ms to // offsets [offset, offset+ar.Length()) in this Mappable. // + // The writable flag indicates whether the backing data for a Mappable can + // be modified through the mapping. 
Effectively, this means a shared mapping + // where Translate may be called with at.Write == true. This is a property + // established at mapping creation and must remain constant throughout the + // lifetime of the mapping. + // // Preconditions: offset+ar.Length() does not overflow. - AddMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64) error + AddMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error // RemoveMapping notifies the Mappable of the removal of a mapping from // addresses ar in ms to offsets [offset, offset+ar.Length()) in this // Mappable. // // Preconditions: offset+ar.Length() does not overflow. The removed mapping - // must exist. - RemoveMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64) + // must exist. writable must match the corresponding call to AddMapping. + RemoveMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) // CopyMapping notifies the Mappable of an attempt to copy a mapping in ms // from srcAR to dstAR. For most Mappables, this is equivalent to @@ -56,8 +62,9 @@ type Mappable interface { // MappingSpace; it is analogous to Linux's vm_operations_struct::mremap. // // Preconditions: offset+srcAR.Length() and offset+dstAR.Length() do not - // overflow. The mapping at srcAR must exist. - CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error + // overflow. The mapping at srcAR must exist. writable must match the + // corresponding call to AddMapping. 
+ CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error // Translate returns the Mappable's current mappings for at least the range // of offsets specified by required, and at most the range of offsets diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 87942af0e..5e86d3b49 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -244,7 +244,7 @@ func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error } // AddMapping implements memmap.Mappable.AddMapping. -func (m *aioMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { +func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error { // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { @@ -254,11 +254,11 @@ func (m *aioMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar } // RemoveMapping implements memmap.Mappable.RemoveMapping. -func (m *aioMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. -func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { +func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error { // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). 
if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index b248b76e7..1613ce11d 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -81,7 +81,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { - if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off); err != nil { + if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.isMappableAsWritable()); err != nil { mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange()) return nil, err } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index aab697f9e..b1e39e898 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -349,6 +349,10 @@ func (v *vma) loadRealPerms(b int) { } } +func (v *vma) isMappableAsWritable() bool { + return !v.private && v.maxPerms.Write +} + // pma represents a platform mapping area. // // +stateify savable diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index e511472f4..64d0dd3f6 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -76,16 +76,16 @@ func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) er } // AddMapping implements memmap.Mappable.AddMapping. -func (m *SpecialMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { +func (*SpecialMappable) AddMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. 
-func (m *SpecialMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { +func (*SpecialMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. -func (m *SpecialMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { +func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { return nil } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 1a46c2105..daaae4da1 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -443,7 +443,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi return 0, syserror.EINVAL } // Inform the Mappable, if any, of the new mapping. - if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start)); err != nil { + if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.isMappableAsWritable()); err != nil { return 0, err } } @@ -498,7 +498,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // Now that pmas have been moved to newAR, we can notify vma.mappable that // oldAR is no longer mapped. if vma.mappable != nil { - vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off) + vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } return newAR.Start, nil diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index dafdbd0e4..5c2c802f6 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -65,7 +65,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp // Inform the Mappable, if any, of the new mapping. 
if opts.Mappable != nil { - if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset); err != nil { + if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil { return vmaIterator{}, usermem.AddrRange{}, err } } @@ -332,7 +332,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vmaAR := vseg.Range() vma := vseg.ValuePtr() if vma.mappable != nil { - vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off) + vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.isMappableAsWritable()) } if vma.id != nil { vma.id.DecRef() -- cgit v1.2.3 From f93c288dd70846f335239e2d0cb351135a756f51 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Wed, 12 Dec 2018 13:17:46 -0800 Subject: Fix a data race on Shm.key. PiperOrigin-RevId: 225240907 Change-Id: Ie568ce3cd643f3e4a0eaa0444f4ed589dcf6031f --- pkg/sentry/kernel/shm/shm.go | 89 +++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 30 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 2f400cbba..96414d060 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -66,24 +66,30 @@ type Registry struct { // userNS owns the IPC namespace this registry belong to. Immutable. userNS *auth.UserNamespace + // mu protects all fields below. mu sync.Mutex `state:"nosave"` - // shms maps segment ids to segments. Protected by mu. + // shms maps segment ids to segments. shms map[ID]*Shm + // keysToShms maps segment keys to segments. + keysToShms map[Key]*Shm + // Sum of the sizes of all existing segments rounded up to page size, in - // units of page size. Protected by mu. + // units of page size. totalPages uint64 - // lastIDUsed is protected by mu. + // ID assigned to the last created segment. Used to quickly find the next + // unused ID. lastIDUsed ID } // NewRegistry creates a new shm registry. 
func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ - userNS: userNS, - shms: make(map[ID]*Shm), + userNS: userNS, + shms: make(map[ID]*Shm), + keysToShms: make(map[Key]*Shm), } } @@ -94,14 +100,20 @@ func (r *Registry) FindByID(id ID) *Shm { return r.shms[id] } -// Precondition: Caller must hold r.mu. -func (r *Registry) findByKey(key Key) *Shm { - for _, v := range r.shms { - if v.key == key { - return v - } +// dissociateKey removes the association between a segment and its key, +// preventing it from being discovered in the registry. This doesn't necessarily +// mean the segment is about to be destroyed. This is analogous to unlinking a +// file; the segment can still be used by a process already referencing it, but +// cannot be discovered by a new process. +func (r *Registry) dissociateKey(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + s.mu.Lock() + defer s.mu.Unlock() + if s.key != linux.IPC_PRIVATE { + delete(r.keysToShms, s.key) + s.key = linux.IPC_PRIVATE } - return nil } // FindOrCreate looks up or creates a segment in the registry. It's functionally @@ -127,7 +139,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui if !private { // Look up an existing segment. - if shm := r.findByKey(key); shm != nil { + if shm := r.keysToShms[key]; shm != nil { shm.mu.Lock() defer shm.mu.Unlock() @@ -184,6 +196,8 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui } // newShm creates a new segment in the registry. +// +// Precondition: Caller must hold r.mu. 
func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { p := platform.FromContext(ctx) if p == nil { @@ -219,8 +233,10 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi } if r.shms[id] == nil { r.lastIDUsed = id - r.shms[id] = shm + shm.ID = id + r.shms[id] = shm + r.keysToShms[key] = shm r.totalPages += effectiveSize / usermem.PageSize @@ -258,13 +274,20 @@ func (r *Registry) ShmInfo() *linux.ShmInfo { } } -// remove unregisters a segment from this registry, preventing it from being -// discovered in the future. Caller is responsible for ensuring s is destroyed. +// remove deletes a segment from this registry, deaccounting the memory used by +// the segment. // -// Precondition: To preserve lock ordering, caller must not hold s.mu. +// Precondition: Must follow a call to r.dissociateKey(s). func (r *Registry) remove(s *Shm) { r.mu.Lock() defer r.mu.Unlock() + s.mu.Lock() + defer s.mu.Unlock() + + if s.key != linux.IPC_PRIVATE { + panic(fmt.Sprintf("Attempted to remove shm segment %+v from the registry whose key is still associated", s)) + } + delete(r.shms, s.ID) r.totalPages -= s.effectiveSize / usermem.PageSize } @@ -314,12 +337,12 @@ type Shm struct { // segment. Immutable. fr platform.FileRange - // key is the public identifier for this segment. - key Key - // mu protects all fields below. mu sync.Mutex `state:"nosave"` + // key is the public identifier for this segment. + key Key + // perms is the access permissions for the segment. perms fs.FilePermissions @@ -342,12 +365,14 @@ type Shm struct { // pendingDestruction indicates the segment was marked as destroyed through // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found // in the registry and can no longer be attached. When the last user - // detaches from the segment, it is destroyed. Protected by mu. + // detaches from the segment, it is destroyed. 
pendingDestruction bool } // MappedName implements memmap.MappingIdentity.MappedName. func (s *Shm) MappedName(ctx context.Context) string { + s.mu.Lock() + defer s.mu.Unlock() return fmt.Sprintf("SYSV%08d", s.key) } @@ -364,6 +389,8 @@ func (s *Shm) InodeID() uint64 { } // DecRef overrides refs.RefCount.DecRef with a destructor. +// +// Precondition: Caller must not hold s.mu. func (s *Shm) DecRef() { s.DecRefWithDestructor(s.destroy) } @@ -572,28 +599,30 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { } func (s *Shm) destroy() { - s.registry.remove(s) s.p.Memory().DecRef(s.fr) + s.registry.remove(s) } -// MarkDestroyed marks a shm for destruction. The shm is actually destroyed once -// it has no references. See shmctl(IPC_RMID). +// MarkDestroyed marks a segment for destruction. The segment is actually +// destroyed once it has no references. MarkDestroyed may be called multiple +// times, and is safe to call after a segment has already been destroyed. See +// shmctl(IPC_RMID). func (s *Shm) MarkDestroyed() { - s.mu.Lock() - defer s.mu.Unlock() - - // Prevent the segment from being found in the registry. - s.key = linux.IPC_PRIVATE + s.registry.dissociateKey(s) + s.mu.Lock() // Only drop the segment's self-reference once, when destruction is - // requested. Otherwise, repeated calls shmctl(IPC_RMID) would force a + // requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a // segment to be destroyed prematurely, potentially with active maps to the // segment's address range. Remaining references are dropped when the // segment is detached or unmaped. if !s.pendingDestruction { s.pendingDestruction = true + s.mu.Unlock() // Must release s.mu before calling s.DecRef. 
s.DecRef() + return } + s.mu.Unlock() } // checkOwnership verifies whether a segment may be accessed by ctx as an -- cgit v1.2.3 From ccce1d4281ce82fe551d7c8569fe3a545c62e296 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Wed, 12 Dec 2018 17:47:01 -0800 Subject: Filesystems shouldn't be saving references to Platform. Platform objects are not savable, storing references to them in filesystem datastructures would cause save to fail if someone actually passed in a Platform. Current implementations work because everywhere a Platform is expected, we currently pass in a Kernel object which embeds Platform and thus satisfies the interface. Eliminate this indirection and save pointers to Kernel directly. PiperOrigin-RevId: 225288336 Change-Id: Ica399ff43f425e15bc150a0d7102196c3d54a2ab --- pkg/sentry/context/contexttest/contexttest.go | 32 ++++++++++++++++------ pkg/sentry/fs/BUILD | 2 +- pkg/sentry/fs/ashmem/area.go | 8 +++--- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dev/BUILD | 1 + pkg/sentry/fs/dev/dev.go | 4 +-- pkg/sentry/fs/file_overlay_test.go | 2 +- pkg/sentry/fs/inode_overlay_test.go | 2 +- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/tmpfs/BUILD | 6 ++--- pkg/sentry/fs/tmpfs/file_test.go | 6 ++--- pkg/sentry/fs/tmpfs/fs.go | 4 +-- pkg/sentry/fs/tmpfs/inode_file.go | 20 +++++++------- pkg/sentry/fs/tmpfs/tmpfs.go | 14 +++++----- pkg/sentry/kernel/contexttest/BUILD | 17 ++++++++++++ pkg/sentry/kernel/contexttest/contexttest.go | 38 +++++++++++++++++++++++++++ 16 files changed, 116 insertions(+), 44 deletions(-) create mode 100644 pkg/sentry/kernel/contexttest/BUILD create mode 100644 pkg/sentry/kernel/contexttest/contexttest.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index d2f084ed7..d5fd9f165 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -31,23 +31,30 @@ import ( // Context returns a 
Context that may be used in tests. Uses ptrace as the // platform.Platform. +// +// Note that some filesystems may require a minimal kernel for testing, which +// this test context does not provide. For such tests, see kernel/contexttest. func Context(tb testing.TB) context.Context { p, err := ptrace.New() if err != nil { tb.Fatal(err) } // Test usage of context.Background is fine. - return &testContext{ - Context: context.Background(), - l: limits.NewLimitSet(), - platform: p, + return &TestContext{ + Context: context.Background(), + l: limits.NewLimitSet(), + platform: p, + otherValues: make(map[interface{}]interface{}), } } -type testContext struct { +// TestContext represents a context with minimal functionality suitable for +// running tests. +type TestContext struct { context.Context - l *limits.LimitSet - platform platform.Platform + l *limits.LimitSet + platform platform.Platform + otherValues map[interface{}]interface{} } // globalUniqueID tracks incremental unique identifiers for tests. @@ -76,8 +83,14 @@ func (hostClock) Now() ktime.Time { return ktime.FromNanoseconds(time.Now().UnixNano()) } +// RegisterValue registers additional values with this test context. Useful for +// providing values from external packages that contexttest can't depend on. +func (t *TestContext) RegisterValue(key, value interface{}) { + t.otherValues[key] = value +} + // Value implements context.Context. 
-func (t *testContext) Value(key interface{}) interface{} { +func (t *TestContext) Value(key interface{}) interface{} { switch key { case limits.CtxLimits: return t.l @@ -92,6 +105,9 @@ func (t *testContext) Value(key interface{}) interface{} { case ktime.CtxRealtimeClock: return hostClock{} default: + if val, ok := t.otherValues[key]; ok { + return val + } return t.Context.Value(key) } } diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 548898aaa..0fe2b14bf 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -94,9 +94,9 @@ go_test( deps = [ ":fs", "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", "//pkg/sentry/fs/ramfs/test", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usermem", "//pkg/syserror", ], diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index 5372875ac..d7dd2c084 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -23,8 +23,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -117,11 +117,11 @@ func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MM opts.MaxPerms = opts.MaxPerms.Intersect(a.perms) if a.tmpfsFile == nil { - p := platform.FromContext(ctx) - if p == nil { + k := kernel.KernelFromContext(ctx) + if k == nil { return syserror.ENOMEM } - tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}, p) + tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}, k) // This is not backed by a real filesystem, so we pass in nil. 
tmpfsInode := fs.NewInode(tmpfsInodeOps, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{}) dirent := fs.NewDirent(tmpfsInode, namePrefix+"/"+a.name) diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index fcba14ed4..2b2f4bb8f 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -22,9 +22,9 @@ import ( "sync" "testing" - "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index fc069bb5f..b17b5202c 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -25,6 +25,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel", "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/platform", diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 05a5005ad..3e127bf04 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -22,7 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/binder" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -86,7 +86,7 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn "random": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), "urandom": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), - "shm": tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc, platform.FromContext(ctx)), + "shm": tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc, 
kernel.KernelFromContext(ctx)), // A devpts is typically mounted at /dev/pts to provide // pseudoterminal support. Place an empty directory there for diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 11e4f7203..f121cbdda 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -19,9 +19,9 @@ import ( "testing" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" ) func TestReaddir(t *testing.T) { diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index acdb2b4f8..9e922d008 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -18,9 +18,9 @@ import ( "testing" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" "gvisor.googlesource.com/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index 2f7a1710f..7d682d99b 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -18,9 +18,9 @@ import ( "testing" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" ) // Creates a new MountNamespace with filesystem: diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 277583113..9065cdd5d 100644 --- 
a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -20,10 +20,10 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", @@ -39,9 +39,9 @@ go_test( embed = [":tmpfs"], deps = [ "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/platform", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usage", "//pkg/sentry/usermem", ], diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index b5830d3df..02da9af82 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -19,16 +19,16 @@ import ( "testing" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) func newFileInode(ctx context.Context) *fs.Inode { m := fs.NewCachingMountSource(&Filesystem{}, fs.MountSourceFlags{}) - iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.WithCurrentTime(ctx, fs.UnstableAttr{}), platform.FromContext(ctx)) + iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.WithCurrentTime(ctx, fs.UnstableAttr{}), kernel.KernelFromContext(ctx)) return fs.NewInode(iops, m, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 3ac0c4dd4..88f85b85a 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -21,8 +21,8 @@ import ( 
"gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" ) const ( @@ -131,5 +131,5 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou msrc := fs.NewCachingMountSource(f, flags) // Construct the tmpfs root. - return NewDir(ctx, nil, owner, perms, msrc, platform.FromContext(ctx)), nil + return NewDir(ctx, nil, owner, perms, msrc, kernel.KernelFromContext(ctx)), nil } diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index e0181c52c..ca2b4aabb 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -21,8 +21,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -52,8 +52,8 @@ type fileInodeOperations struct { fsutil.InodeNotSymlink `state:"nosave"` fsutil.NoopWriteOut `state:"nosave"` - // platform is used to allocate memory that stores the file's contents. - platform platform.Platform + // kernel is used to allocate platform memory that stores the file's contents. + kernel *kernel.Kernel // memUsage is the default memory usage that will be reported by this file. memUsage usage.MemoryKind @@ -84,12 +84,12 @@ type fileInodeOperations struct { } // NewInMemoryFile returns a new file backed by p.Memory(). 
-func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr, p platform.Platform) fs.InodeOperations { +func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr, k *kernel.Kernel) fs.InodeOperations { return &fileInodeOperations{ attr: fsutil.InMemoryAttributes{ Unstable: uattr, }, - platform: p, + kernel: k, memUsage: usage, } } @@ -98,7 +98,7 @@ func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.Unsta func (f *fileInodeOperations) Release(context.Context) { f.dataMu.Lock() defer f.dataMu.Unlock() - f.data.DropAll(f.platform.Memory()) + f.data.DropAll(f.kernel.Platform.Memory()) } // Mappable implements fs.InodeOperations.Mappable. @@ -212,7 +212,7 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, siz // and can remove them. f.dataMu.Lock() defer f.dataMu.Unlock() - f.data.Truncate(uint64(size), f.platform.Memory()) + f.data.Truncate(uint64(size), f.kernel.Platform.Memory()) return nil } @@ -310,7 +310,7 @@ func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return 0, nil } - mem := rw.f.platform.Memory() + mem := rw.f.kernel.Platform.Memory() var done uint64 seg, gap := rw.f.data.Find(uint64(rw.offset)) for rw.offset < end { @@ -376,7 +376,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) } }() - mem := rw.f.platform.Memory() + mem := rw.f.kernel.Platform.Memory() // Page-aligned mr for when we need to allocate memory. RoundUp can't // overflow since end is an int64. 
pgstartaddr := usermem.Addr(rw.offset).RoundDown() @@ -465,7 +465,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional optional.End = pgend } - mem := f.platform.Memory() + mem := f.kernel.Platform.Memory() cerr := f.data.Fill(ctx, required, optional, mem, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 91b782540..40a8c4b1e 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -20,8 +20,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -54,13 +54,13 @@ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent type Dir struct { ramfs.Dir - // platform is used to allocate storage for tmpfs Files. - platform platform.Platform + // kernel is used to allocate platform memory as storage for tmpfs Files. + kernel *kernel.Kernel } // NewDir returns a new directory. -func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource, platform platform.Platform) *fs.Inode { - d := &Dir{platform: platform} +func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource, kernel *kernel.Kernel) *fs.Inode { + d := &Dir{kernel: kernel} d.InitDir(ctx, contents, owner, perms) // Manually set the CreateOps. 
@@ -84,7 +84,7 @@ func (d *Dir) afterLoad() { func (d *Dir) newCreateOps() *ramfs.CreateOps { return &ramfs.CreateOps{ NewDir: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { - return NewDir(ctx, nil, fs.FileOwnerFromContext(ctx), perms, dir.MountSource, d.platform), nil + return NewDir(ctx, nil, fs.FileOwnerFromContext(ctx), perms, dir.MountSource, d.kernel), nil }, NewFile: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{ @@ -93,7 +93,7 @@ func (d *Dir) newCreateOps() *ramfs.CreateOps { // Always start unlinked. Links: 0, }) - iops := NewInMemoryFile(ctx, usage.Tmpfs, uattr, d.platform) + iops := NewInMemoryFile(ctx, usage.Tmpfs, uattr, d.kernel) return fs.NewInode(iops, dir.MountSource, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD new file mode 100644 index 000000000..391986291 --- /dev/null +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -0,0 +1,17 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "contexttest", + testonly = 1, + srcs = ["contexttest.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/kernel", + "//pkg/sentry/platform", + ], +) diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go new file mode 100644 index 000000000..9eb18e7e8 --- /dev/null +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -0,0 +1,38 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package contexttest provides a test context.Context which includes +// a dummy kernel pointing to a valid platform. +package contexttest + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +// Context returns a Context that may be used in tests. Uses ptrace as the +// platform.Platform, and provides a stub kernel that only serves to point to +// the platform. +func Context(tb testing.TB) context.Context { + ctx := contexttest.Context(tb) + k := &kernel.Kernel{ + Platform: platform.FromContext(ctx), + } + ctx.(*contexttest.TestContext).RegisterValue(kernel.CtxKernel, k) + return ctx +} -- cgit v1.2.3 From 4659f7ed1a63f031b5450d065684ef6c32d35f01 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 13 Dec 2018 13:19:39 -0800 Subject: Fix WAITALL and RCVTIMEO interaction PiperOrigin-RevId: 225424296 Change-Id: I60fcc2b859339dca9963cb32227a287e719ab765 --- pkg/sentry/socket/epsocket/epsocket.go | 3 +++ pkg/sentry/socket/socket.go | 2 ++ pkg/sentry/socket/unix/unix.go | 10 ++++++++-- test/syscalls/linux/socket_generic.cc | 26 ++++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index b49ef21ad..19af7bc45 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ 
-1352,6 +1352,9 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags dst = dst.DropFirst(rn) if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if n > 0 { + return n, senderAddr, senderAddrLen, controlMessages, nil + } if err == syserror.ETIMEDOUT { return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index f31729819..f73127ea6 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -86,6 +86,8 @@ type Socket interface { // // senderAddrLen is the address length to be returned to the application, // not necessarily the actual length of the address. + // + // If err != nil, the recv was not successful. RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error) // SendMsg implements the sendmsg(2) linux syscall. 
SendMsg does not take diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 11cad411d..4c9dcbd61 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -538,6 +538,9 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } total += n if err != nil || !waitAll || s.isPacket || n >= dst.NumBytes() { + if total > 0 { + err = nil + } return int(total), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } @@ -546,10 +549,13 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if total > 0 { + err = nil + } if err == syserror.ETIMEDOUT { - return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + return int(total), nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + return int(total), nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc index fdc346d4d..a9edbb950 100644 --- a/test/syscalls/linux/socket_generic.cc +++ b/test/syscalls/linux/socket_generic.cc @@ -395,6 +395,8 @@ TEST_P(AllSocketPairTest, RecvWaitAll) { ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, sizeof(received_data), MSG_WAITALL), SyscallSucceedsWithValue(sizeof(sent_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); } TEST_P(AllSocketPairTest, RecvWaitAllDontWait) { @@ -406,5 +408,29 @@ TEST_P(AllSocketPairTest, RecvWaitAllDontWait) { SyscallFailsWithErrno(EAGAIN)); } +TEST_P(AllSocketPairTest, RecvTimeoutWaitAll) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 200000 // 200ms + }; + EXPECT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, + sizeof(tv)), + SyscallSucceeds()); + + 
char sent_data[100]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + + ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data) * 2] = {}; + ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data, + sizeof(received_data), MSG_WAITALL), + SyscallSucceedsWithValue(sizeof(sent_data))); + + EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data))); +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From e1dcf92ec5cf7d9bf58fb322f46f6ae2d98699d2 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 14 Dec 2018 16:12:51 -0800 Subject: Implement SO_SNDTIMEO PiperOrigin-RevId: 225620490 Change-Id: Ia726107b3f58093a5f881634f90b071b32d2c269 --- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/socket_test.go | 3 +- pkg/sentry/socket/epsocket/epsocket.go | 35 +++++++- pkg/sentry/socket/hostinet/socket.go | 9 ++- pkg/sentry/socket/netlink/socket.go | 4 +- pkg/sentry/socket/rpcinet/socket.go | 41 ++++++++-- pkg/sentry/socket/socket.go | 49 +++++++++--- pkg/sentry/socket/unix/unix.go | 16 ++-- pkg/sentry/syscalls/linux/sys_socket.go | 36 +++++++-- test/syscalls/linux/socket_generic.cc | 110 ++++++++++++++++++++++++++ test/syscalls/linux/socket_stream_blocking.cc | 22 ++++++ test/syscalls/linux/socket_unix_non_stream.cc | 22 ++++++ 12 files changed, 307 insertions(+), 41 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 89d7b2fe7..73d9cc71a 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -71,6 +71,7 @@ go_test( "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 17bf397ef..6ddf63a6a 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ 
b/pkg/sentry/fs/host/socket_test.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -142,7 +143,7 @@ func TestSocketSendMsgLen0(t *testing.T) { defer sfile.DecRef() s := sfile.FileOperations.(socket.Socket) - n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, socket.ControlMessages{}) + n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, false, ktime.Time{}, socket.ControlMessages{}) if n != 0 { t.Fatalf("socket sendmsg() failed: %v wrote: %d", terr, n) } diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 19af7bc45..ab5d82183 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -30,6 +30,7 @@ import ( "strings" "sync" "syscall" + "time" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/binary" @@ -137,12 +138,12 @@ type commonEndpoint interface { // // +stateify savable type SocketOperations struct { - socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` fsutil.NoFsync `state:"nosave"` fsutil.NoopFlush `state:"nosave"` fsutil.NoMMap `state:"nosave"` + socket.SendReceiveTimeout *waiter.Queue family int @@ -643,7 +644,16 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family } return syscall.Linger{}, nil + case linux.SO_SNDTIMEO: + // TODO: Linux allows shorter lengths for partial results. + if outLen < linux.SizeOfTimeval { + return nil, syserr.ErrInvalidArgument + } + + return linux.NsecToTimeval(s.SendTimeout()), nil + case linux.SO_RCVTIMEO: + // TODO: Linux allows shorter lengths for partial results. 
if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } @@ -833,6 +843,19 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v))) + case linux.SO_SNDTIMEO: + if len(optVal) < linux.SizeOfTimeval { + return syserr.ErrInvalidArgument + } + + var v linux.Timeval + binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { + return syserr.ErrDomain + } + s.SetSendTimeout(v.ToNsecCapped()) + return nil + case linux.SO_RCVTIMEO: if len(optVal) < linux.SizeOfTimeval { return syserr.ErrInvalidArgument @@ -840,6 +863,9 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i var v linux.Timeval binary.Unmarshal(optVal[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { + return syserr.ErrDomain + } s.SetRecvTimeout(v.ToNsecCapped()) return nil @@ -1365,7 +1391,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // SendMsg implements the linux syscall sendmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Reject Unix control messages. 
if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument @@ -1431,7 +1457,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return int(total), nil } - if err := t.Block(ch); err != nil { + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return int(total), syserr.ErrTryAgain + } // handleIOError will consume errors from t.Block if needed. return int(total), syserr.FromError(err) } diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index e4e950fbb..34281cac0 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -46,12 +46,12 @@ const ( // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. type socketOperations struct { - socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` fsutil.NoFsync `state:"nosave"` fsutil.NoopFlush `state:"nosave"` fsutil.NoMMap `state:"nosave"` + socket.SendReceiveTimeout fd int // must be O_NONBLOCK queue waiter.Queue @@ -418,7 +418,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. 
if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument @@ -468,7 +468,10 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] panic(fmt.Sprintf("CopyInTo: got (%d, %v), wanted (0, %v)", n, err, err)) } if ch != nil { - if err = t.Block(ch); err != nil { + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } break } } else { diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index c4798839e..0a7d4772c 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -65,12 +65,12 @@ var netlinkSocketDevice = device.NewAnonDevice() // // +stateify savable type Socket struct { - socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` fsutil.NoFsync `state:"nosave"` fsutil.NoopFlush `state:"nosave"` fsutil.NoMMap `state:"nosave"` + socket.SendReceiveTimeout // ports provides netlink port allocation. ports *port.Manager @@ -593,7 +593,7 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, } // SendMsg implements socket.Socket.SendMsg. 
-func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { return s.sendMsg(t, src, to, flags, controlMessages) } diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 90844f10f..257bc2d71 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -17,6 +17,7 @@ package rpcinet import ( "sync/atomic" "syscall" + "time" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/binary" @@ -44,12 +45,12 @@ import ( // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. type socketOperations struct { - socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` fsutil.NoFsync `state:"nosave"` fsutil.NoopFlush `state:"nosave"` fsutil.NoMMap `state:"nosave"` + socket.SendReceiveTimeout fd uint32 // must be O_NONBLOCK wq *waiter.Queue @@ -379,7 +380,8 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements socket.Socket.GetSockOpt. func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) { - // SO_RCVTIMEO is special because blocking is performed within the sentry. + // SO_RCVTIMEO and SO_SNDTIMEO are special because blocking is performed + // within the sentry. 
if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO { if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument @@ -387,6 +389,13 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLe return linux.NsecToTimeval(s.RecvTimeout()), nil } + if level == linux.SOL_SOCKET && name == linux.SO_SNDTIMEO { + if outLen < linux.SizeOfTimeval { + return nil, syserr.ErrInvalidArgument + } + + return linux.NsecToTimeval(s.SendTimeout()), nil + } stack := t.NetworkContext().(*Stack) id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockOpt{&pb.GetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Length: uint32(outLen)}}}, false /* ignoreResult */) @@ -403,8 +412,9 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLe // SetSockOpt implements socket.Socket.SetSockOpt. func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { // Because blocking actually happens within the sentry we need to inspect - // this socket option to determine if it's a SO_RCVTIMEO, and if so, we will - // save it and use it as the deadline for recv(2) related syscalls. + // this socket option to determine if it's a SO_RCVTIMEO or SO_SNDTIMEO, + // and if so, we will save it and use it as the deadline for recv(2) + // or send(2) related syscalls. 
if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO { if len(opt) < linux.SizeOfTimeval { return syserr.ErrInvalidArgument @@ -412,9 +422,25 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ var v linux.Timeval binary.Unmarshal(opt[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { + return syserr.ErrDomain + } s.SetRecvTimeout(v.ToNsecCapped()) return nil } + if level == linux.SOL_SOCKET && name == linux.SO_SNDTIMEO { + if len(opt) < linux.SizeOfTimeval { + return syserr.ErrInvalidArgument + } + + var v linux.Timeval + binary.Unmarshal(opt[:linux.SizeOfTimeval], usermem.ByteOrder, &v) + if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { + return syserr.ErrDomain + } + s.SetSendTimeout(v.ToNsecCapped()) + return nil + } stack := t.NetworkContext().(*Stack) id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_SetSockOpt{&pb.SetSockOptRequest{Fd: s.fd, Level: int64(level), Name: int64(name), Opt: opt}}}, false /* ignoreResult */) @@ -720,7 +746,7 @@ func rpcSendMsg(t *kernel.Task, req *pb.SyscallRequest_Sendmsg) (uint32, *syserr } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. 
if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument @@ -787,7 +813,10 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return int(totalWritten), nil } - if err := t.Block(ch); err != nil { + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + return int(totalWritten), syserr.ErrTryAgain + } return int(totalWritten), syserr.FromError(err) } } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index f73127ea6..9d4aaeb9d 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -94,15 +94,23 @@ type Socket interface { // ownership of the ControlMessage on error. // // If n > 0, err will either be nil or an error from t.Block. - SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages ControlMessages) (n int, err *syserr.Error) + SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages ControlMessages) (n int, err *syserr.Error) // SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means - // no timeout. + // no timeout, and negative means DONTWAIT. SetRecvTimeout(nanoseconds int64) // RecvTimeout gets the current timeout (in ns) for recv operations. Zero - // means no timeout. + // means no timeout, and negative means DONTWAIT. RecvTimeout() int64 + + // SetSendTimeout sets the timeout (in ns) for send operations. Zero means + // no timeout, and negative means DONTWAIT. + SetSendTimeout(nanoseconds int64) + + // SendTimeout gets the current timeout (in ns) for send operations. Zero + // means no timeout, and negative means DONTWAIT. 
+ SendTimeout() int64 } // Provider is the interface implemented by providers of sockets for specific @@ -192,30 +200,45 @@ func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { return fs.NewDirent(inode, fmt.Sprintf("socket:[%d]", ino)) } -// ReceiveTimeout stores a timeout for receive calls. +// SendReceiveTimeout stores timeouts for send and receive calls. // // It is meant to be embedded into Socket implementations to help satisfy the // interface. // -// Care must be taken when copying ReceiveTimeout as it contains atomic +// Care must be taken when copying SendReceiveTimeout as it contains atomic // variables. // // +stateify savable -type ReceiveTimeout struct { - // ns is length of the timeout in nanoseconds. +type SendReceiveTimeout struct { + // send is length of the send timeout in nanoseconds. + // + // send must be accessed atomically. + send int64 + + // recv is length of the receive timeout in nanoseconds. // - // ns must be accessed atomically. - ns int64 + // recv must be accessed atomically. + recv int64 } // SetRecvTimeout implements Socket.SetRecvTimeout. -func (rt *ReceiveTimeout) SetRecvTimeout(nanoseconds int64) { - atomic.StoreInt64(&rt.ns, nanoseconds) +func (to *SendReceiveTimeout) SetRecvTimeout(nanoseconds int64) { + atomic.StoreInt64(&to.recv, nanoseconds) } // RecvTimeout implements Socket.RecvTimeout. -func (rt *ReceiveTimeout) RecvTimeout() int64 { - return atomic.LoadInt64(&rt.ns) +func (to *SendReceiveTimeout) RecvTimeout() int64 { + return atomic.LoadInt64(&to.recv) +} + +// SetSendTimeout implements Socket.SetSendTimeout. +func (to *SendReceiveTimeout) SetSendTimeout(nanoseconds int64) { + atomic.StoreInt64(&to.send, nanoseconds) +} + +// SendTimeout implements Socket.SendTimeout. +func (to *SendReceiveTimeout) SendTimeout() int64 { + return atomic.LoadInt64(&to.send) } // GetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. 
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 4c9dcbd61..da225eabb 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -45,15 +45,16 @@ import ( // // +stateify savable type SocketOperations struct { - refs.AtomicRefCount - socket.ReceiveTimeout fsutil.PipeSeek `state:"nosave"` fsutil.NotDirReaddir `state:"nosave"` fsutil.NoFsync `state:"nosave"` fsutil.NoopFlush `state:"nosave"` fsutil.NoMMap `state:"nosave"` - ep transport.Endpoint - isPacket bool + refs.AtomicRefCount + socket.SendReceiveTimeout + + ep transport.Endpoint + isPacket bool } // New creates a new unix socket. @@ -367,7 +368,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by // a transport.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Endpoint: s.ep, Control: controlMessages.Unix, @@ -404,7 +405,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] break } - if err := t.Block(ch); err != nil { + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } break } } diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 1165d4566..3049fe6e5 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -612,9 +612,11 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca var haveDeadline bool var deadline ktime.Time - if dl := s.RecvTimeout(); dl != 0 { + if dl 
:= s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT } n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline) @@ -671,10 +673,11 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } if !haveDeadline { - dl := s.RecvTimeout() - if dl != 0 { + if dl := s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT } } @@ -821,10 +824,11 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f var haveDeadline bool var deadline ktime.Time - - if dl := s.RecvTimeout(); dl != 0 { + if dl := s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT } n, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) @@ -1001,8 +1005,17 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme return 0, err } + var haveDeadline bool + var deadline ktime.Time + if dl := s.SendTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + // Call the syscall implementation. 
- n, e := s.SendMsg(t, src, to, int(flags), socket.ControlMessages{Unix: controlMessages}) + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: controlMessages}) err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { controlMessages.Release() @@ -1052,8 +1065,17 @@ func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, fla return 0, err } + var haveDeadline bool + var deadline ktime.Time + if dl := s.SendTimeout(); dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } else if dl < 0 { + flags |= linux.MSG_DONTWAIT + } + // Call the syscall implementation. - n, e := s.SendMsg(t, src, to, int(flags), socket.ControlMessages{Unix: control.New(t, s, nil)}) + n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) } diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc index a9edbb950..c65b29112 100644 --- a/test/syscalls/linux/socket_generic.cc +++ b/test/syscalls/linux/socket_generic.cc @@ -332,6 +332,35 @@ TEST_P(AllSocketPairTest, RecvmsgTimeoutSucceeds) { SyscallFailsWithErrno(EAGAIN)); } +TEST_P(AllSocketPairTest, SendTimeoutAllowsSend) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 10 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[20] = {}; + ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(buf))); +} + +TEST_P(AllSocketPairTest, SendmsgTimeoutAllowsSend) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 10 + }; + EXPECT_THAT( + 
setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[20] = {}; + ASSERT_NO_FATAL_FAILURE(SendNullCmsg(sockets->first_fd(), buf, sizeof(buf))); +} + TEST_P(AllSocketPairTest, SoRcvTimeoIsSet) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -382,6 +411,87 @@ TEST_P(AllSocketPairTest, RecvmsgTimeoutOneSecondSucceeds) { SyscallFailsWithErrno(EAGAIN)); } +TEST_P(AllSocketPairTest, RecvTimeoutUsecTooLarge) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 2000000 // 2 seconds. + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallFailsWithErrno(EDOM)); +} + +TEST_P(AllSocketPairTest, SendTimeoutUsecTooLarge) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 2000000 // 2 seconds. + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)), + SyscallFailsWithErrno(EDOM)); +} + +TEST_P(AllSocketPairTest, RecvTimeoutUsecNeg) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = -1 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallFailsWithErrno(EDOM)); +} + +TEST_P(AllSocketPairTest, SendTimeoutUsecNeg) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = -1 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)), + SyscallFailsWithErrno(EDOM)); +} + +TEST_P(AllSocketPairTest, RecvTimeoutNegSec) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = -1, .tv_usec = 0 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[20] = {}; + 
EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), buf, sizeof(buf), 0), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(AllSocketPairTest, RecvmsgTimeoutNegSec) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = -1, .tv_usec = 0 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + struct msghdr msg = {}; + char buf[20] = {}; + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + EXPECT_THAT(RetryEINTR(recvmsg)(sockets->first_fd(), &msg, 0), + SyscallFailsWithErrno(EAGAIN)); +} + TEST_P(AllSocketPairTest, RecvWaitAll) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc index 3fbbe54d8..6cfadc9da 100644 --- a/test/syscalls/linux/socket_stream_blocking.cc +++ b/test/syscalls/linux/socket_stream_blocking.cc @@ -125,5 +125,27 @@ TEST_P(BlockingStreamSocketPairTest, RecvLessThanBufferWaitAll) { EXPECT_GE(after - before, kDuration); } +TEST_P(BlockingStreamSocketPairTest, SendTimeout) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 10 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[100] = {}; + for (;;) { + int ret; + ASSERT_THAT( + ret = RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0), + ::testing::AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(EAGAIN))); + if (ret == -1) { + break; + } + } +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc index 620397746..264b7fe6a 100644 --- a/test/syscalls/linux/socket_unix_non_stream.cc +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -225,5 +225,27 @@ 
TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) { EXPECT_EQ(0, memcmp(write_buf.data(), ptr, buffer_size)); } +TEST_P(UnixNonStreamSocketPairTest, SendTimeout) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 10 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[100] = {}; + for (;;) { + int ret; + ASSERT_THAT( + ret = RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0), + ::testing::AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(EAGAIN))); + if (ret == -1) { + break; + } + } +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 3cf84e3bef865214fcf2b080a05065f0a9e976ec Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 14 Dec 2018 16:23:03 -0800 Subject: Mark sync.Mutex in TTYFileOperations as nosave PiperOrigin-RevId: 225621767 Change-Id: Ie3a42cdf0b0de22a020ff43e307bf86409cff329 --- pkg/sentry/fs/host/tty.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index f0bcdc908..ac6ad1b87 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -35,7 +35,7 @@ type TTYFileOperations struct { fileOperations // mu protects the fields below. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // FGProcessGroup is the foreground process group this TTY. Will be // nil if not set or if this file has been released. -- cgit v1.2.3 From 5d8cf31346376eb7c6a93bad3eab7666f145fa0e Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 14 Dec 2018 18:03:43 -0800 Subject: Move fdnotifier package to reduce internal confusion. 
PiperOrigin-RevId: 225632398 Change-Id: I909e7e2925aa369adc28e844c284d9a6108e85ce --- pkg/fdnotifier/BUILD | 14 +++ pkg/fdnotifier/fdnotifier.go | 200 ++++++++++++++++++++++++++++++++++ pkg/fdnotifier/poll_unsafe.go | 74 +++++++++++++ pkg/sentry/fs/fdpipe/BUILD | 4 +- pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_test.go | 2 +- pkg/sentry/fs/host/BUILD | 4 +- pkg/sentry/fs/host/descriptor.go | 2 +- pkg/sentry/fs/host/descriptor_test.go | 2 +- pkg/sentry/fs/host/file.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/socket_test.go | 2 +- pkg/sentry/kernel/eventfd/BUILD | 2 +- pkg/sentry/kernel/eventfd/eventfd.go | 2 +- pkg/sentry/socket/hostinet/BUILD | 2 +- pkg/sentry/socket/hostinet/socket.go | 2 +- pkg/waiter/fdnotifier/BUILD | 14 --- pkg/waiter/fdnotifier/fdnotifier.go | 200 ---------------------------------- pkg/waiter/fdnotifier/poll_unsafe.go | 74 ------------- 19 files changed, 303 insertions(+), 303 deletions(-) create mode 100644 pkg/fdnotifier/BUILD create mode 100644 pkg/fdnotifier/fdnotifier.go create mode 100644 pkg/fdnotifier/poll_unsafe.go delete mode 100644 pkg/waiter/fdnotifier/BUILD delete mode 100644 pkg/waiter/fdnotifier/fdnotifier.go delete mode 100644 pkg/waiter/fdnotifier/poll_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD new file mode 100644 index 000000000..27d378d5b --- /dev/null +++ b/pkg/fdnotifier/BUILD @@ -0,0 +1,14 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) # Apache 2.0 + +go_library( + name = "fdnotifier", + srcs = [ + "fdnotifier.go", + "poll_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/fdnotifier", + visibility = ["//:sandbox"], + deps = ["//pkg/waiter"], +) diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go new file mode 100644 index 000000000..624b1a0c5 --- /dev/null +++ b/pkg/fdnotifier/fdnotifier.go @@ -0,0 +1,200 @@ +// Copyright 2018 Google LLC +// +// 
Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fdnotifier contains an adapter that translates IO events (e.g., a +// file became readable/writable) from native FDs to the notifications in the +// waiter package. It uses epoll in edge-triggered mode to receive notifications +// for registered FDs. +package fdnotifier + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +type fdInfo struct { + queue *waiter.Queue + waiting bool +} + +// notifier holds all the state necessary to issue notifications when IO events +// occur in the observed FDs. +type notifier struct { + // epFD is the epoll file descriptor used to register for io + // notifications. + epFD int + + // mu protects fdMap. + mu sync.Mutex + + // fdMap maps file descriptors to their notification queues and waiting + // status. + fdMap map[int32]*fdInfo +} + +// newNotifier creates a new notifier object. +func newNotifier() (*notifier, error) { + epfd, err := syscall.EpollCreate1(0) + if err != nil { + return nil, err + } + + w := ¬ifier{ + epFD: epfd, + fdMap: make(map[int32]*fdInfo), + } + + go w.waitAndNotify() // S/R-SAFE: no waiter exists during save / load. + + return w, nil +} + +// waitFD waits on mask for fd. The fdMap mutex must be hold. 
+func (n *notifier) waitFD(fd int32, fi *fdInfo, mask waiter.EventMask) error { + if !fi.waiting && mask == 0 { + return nil + } + + e := syscall.EpollEvent{ + Events: uint32(mask) | -syscall.EPOLLET, + Fd: fd, + } + + switch { + case !fi.waiting && mask != 0: + if err := syscall.EpollCtl(n.epFD, syscall.EPOLL_CTL_ADD, int(fd), &e); err != nil { + return err + } + fi.waiting = true + case fi.waiting && mask == 0: + syscall.EpollCtl(n.epFD, syscall.EPOLL_CTL_DEL, int(fd), nil) + fi.waiting = false + case fi.waiting && mask != 0: + if err := syscall.EpollCtl(n.epFD, syscall.EPOLL_CTL_MOD, int(fd), &e); err != nil { + return err + } + } + + return nil +} + +// addFD adds an FD to the list of FDs observed by n. +func (n *notifier) addFD(fd int32, queue *waiter.Queue) { + n.mu.Lock() + defer n.mu.Unlock() + + // Panic if we're already notifying on this FD. + if _, ok := n.fdMap[fd]; ok { + panic(fmt.Sprintf("File descriptor %v added twice", fd)) + } + + // We have nothing to wait for at the moment. Just add it to the map. + n.fdMap[fd] = &fdInfo{queue: queue} +} + +// updateFD updates the set of events the fd needs to be notified on. +func (n *notifier) updateFD(fd int32) error { + n.mu.Lock() + defer n.mu.Unlock() + + if fi, ok := n.fdMap[fd]; ok { + return n.waitFD(fd, fi, fi.queue.Events()) + } + + return nil +} + +// RemoveFD removes an FD from the list of FDs observed by n. +func (n *notifier) removeFD(fd int32) { + n.mu.Lock() + defer n.mu.Unlock() + + // Remove from map, then from epoll object. + n.waitFD(fd, n.fdMap[fd], 0) + delete(n.fdMap, fd) +} + +// hasFD returns true if the fd is in the list of observed FDs. +func (n *notifier) hasFD(fd int32) bool { + n.mu.Lock() + defer n.mu.Unlock() + + _, ok := n.fdMap[fd] + return ok +} + +// waitAndNotify run is its own goroutine and loops waiting for io event +// notifications from the epoll object. Once notifications arrive, they are +// dispatched to the registered queue. 
+func (n *notifier) waitAndNotify() error { + e := make([]syscall.EpollEvent, 100) + for { + v, err := epollWait(n.epFD, e, -1) + if err == syscall.EINTR { + continue + } + + if err != nil { + return err + } + + n.mu.Lock() + for i := 0; i < v; i++ { + if fi, ok := n.fdMap[e[i].Fd]; ok { + fi.queue.Notify(waiter.EventMask(e[i].Events)) + } + } + n.mu.Unlock() + } +} + +var shared struct { + notifier *notifier + once sync.Once + initErr error +} + +// AddFD adds an FD to the list of observed FDs. +func AddFD(fd int32, queue *waiter.Queue) error { + shared.once.Do(func() { + shared.notifier, shared.initErr = newNotifier() + }) + + if shared.initErr != nil { + return shared.initErr + } + + shared.notifier.addFD(fd, queue) + return nil +} + +// UpdateFD updates the set of events the fd needs to be notified on. +func UpdateFD(fd int32) error { + return shared.notifier.updateFD(fd) +} + +// RemoveFD removes an FD from the list of observed FDs. +func RemoveFD(fd int32) { + shared.notifier.removeFD(fd) +} + +// HasFD returns true if the FD is in the list of observed FDs. +// +// This should only be used by tests to assert that FDs are correctly registered. +func HasFD(fd int32) bool { + return shared.notifier.hasFD(fd) +} diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go new file mode 100644 index 000000000..8459d4c74 --- /dev/null +++ b/pkg/fdnotifier/poll_unsafe.go @@ -0,0 +1,74 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fdnotifier + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// NonBlockingPoll polls the given FD in non-blocking fashion. It is used just +// to query the FD's current state. +func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask { + e := struct { + fd int32 + events int16 + revents int16 + }{ + fd: fd, + events: int16(mask), + } + + for { + n, _, err := syscall.RawSyscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&e)), 1, 0) + // Interrupted by signal, try again. + if err == syscall.EINTR { + continue + } + // If an error occur we'll conservatively say the FD is ready for + // whatever is being checked. + if err != 0 { + return mask + } + + // If no FDs were returned, it wasn't ready for anything. + if n == 0 { + return 0 + } + + // Otherwise we got the ready events in the revents field. + return waiter.EventMask(e.revents) + } +} + +// epollWait performs a blocking wait on epfd. +// +// Preconditions: +// * len(events) > 0 +func epollWait(epfd int, events []syscall.EpollEvent, msec int) (int, error) { + if len(events) == 0 { + panic("Empty events passed to EpollWait") + } + + // We actually use epoll_pwait with NULL sigmask instead of epoll_wait + // since that is what the Go >= 1.11 runtime prefers. 
+ r, _, e := syscall.Syscall6(syscall.SYS_EPOLL_PWAIT, uintptr(epfd), uintptr(unsafe.Pointer(&events[0])), uintptr(len(events)), uintptr(msec), 0, 0) + if e != 0 { + return 0, e + } + return int(r), nil +} diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index ffe4204bc..8a0937cda 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -14,6 +14,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/fd", + "//pkg/fdnotifier", "//pkg/log", "//pkg/secio", "//pkg/sentry/context", @@ -23,7 +24,6 @@ go_library( "//pkg/sentry/usermem", "//pkg/syserror", "//pkg/waiter", - "//pkg/waiter/fdnotifier", ], ) @@ -37,12 +37,12 @@ go_test( embed = [":fdpipe"], deps = [ "//pkg/fd", + "//pkg/fdnotifier", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/usermem", "//pkg/syserror", - "//pkg/waiter/fdnotifier", "@com_github_google_uuid//:go_default_library", ], ) diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index bfafff5ec..e3b830747 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -21,6 +21,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/secio" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -30,7 +31,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) // pipeOperations are the fs.FileOperations of a host pipe. 
diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index d3f15be6b..7e3ee5257 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -22,11 +22,11 @@ import ( "testing" "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) func singlePipeFD() (int, error) { diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 73d9cc71a..6877eb161 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -27,6 +27,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/fd", + "//pkg/fdnotifier", "//pkg/log", "//pkg/refs", "//pkg/secio", @@ -51,7 +52,6 @@ go_library( "//pkg/tcpip", "//pkg/unet", "//pkg/waiter", - "//pkg/waiter/fdnotifier", ], ) @@ -68,6 +68,7 @@ go_test( embed = [":host"], deps = [ "//pkg/fd", + "//pkg/fdnotifier", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", @@ -78,6 +79,5 @@ go_test( "//pkg/syserr", "//pkg/tcpip", "//pkg/waiter", - "//pkg/waiter/fdnotifier", ], ) diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 7c9d2b299..554e1693a 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -19,9 +19,9 @@ import ( "path" "syscall" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/waiter" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) // descriptor wraps a host fd. 
diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go index 6bc1bd2ae..5dec84ab2 100644 --- a/pkg/sentry/fs/host/descriptor_test.go +++ b/pkg/sentry/fs/host/descriptor_test.go @@ -20,8 +20,8 @@ import ( "syscall" "testing" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/waiter" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) func TestDescriptorRelease(t *testing.T) { diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 975084c86..bc6ee7aa4 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/secio" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -29,7 +30,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) // fileOperations implements fs.FileOperations for a host file descriptor. diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index b9e2aa705..be2c3581f 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -32,7 +33,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/pkg/waiter" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) // maxSendBufferSize is the maximum host send buffer size allowed for endpoint. 
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 6ddf63a6a..83e8e1b3c 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -20,6 +20,7 @@ import ( "testing" "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" @@ -28,7 +29,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) var ( diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index cc1120b4f..d96803fc9 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -9,6 +9,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/fdnotifier", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", @@ -16,7 +17,6 @@ go_library( "//pkg/sentry/usermem", "//pkg/syserror", "//pkg/waiter", - "//pkg/waiter/fdnotifier", ], ) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 26dc59a85..063a1d5f5 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -22,6 +22,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" @@ -29,7 +30,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) // EventOperations represents an event with the semantics 
of Linux's file-based event diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index c30220a46..b8dceb102 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -17,6 +17,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/fdnotifier", "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", @@ -34,6 +35,5 @@ go_library( "//pkg/syserr", "//pkg/syserror", "//pkg/waiter", - "//pkg/waiter/fdnotifier", ], ) diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 34281cac0..f3ecb6dc3 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" @@ -32,7 +33,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" - "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" ) const ( diff --git a/pkg/waiter/fdnotifier/BUILD b/pkg/waiter/fdnotifier/BUILD deleted file mode 100644 index af6baa303..000000000 --- a/pkg/waiter/fdnotifier/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") - -package(licenses = ["notice"]) # Apache 2.0 - -go_library( - name = "fdnotifier", - srcs = [ - "fdnotifier.go", - "poll_unsafe.go", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier", - visibility = ["//:sandbox"], - deps = ["//pkg/waiter"], -) diff --git a/pkg/waiter/fdnotifier/fdnotifier.go b/pkg/waiter/fdnotifier/fdnotifier.go deleted file mode 100644 index 624b1a0c5..000000000 --- a/pkg/waiter/fdnotifier/fdnotifier.go +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the 
Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package fdnotifier contains an adapter that translates IO events (e.g., a -// file became readable/writable) from native FDs to the notifications in the -// waiter package. It uses epoll in edge-triggered mode to receive notifications -// for registered FDs. -package fdnotifier - -import ( - "fmt" - "sync" - "syscall" - - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -type fdInfo struct { - queue *waiter.Queue - waiting bool -} - -// notifier holds all the state necessary to issue notifications when IO events -// occur in the observed FDs. -type notifier struct { - // epFD is the epoll file descriptor used to register for io - // notifications. - epFD int - - // mu protects fdMap. - mu sync.Mutex - - // fdMap maps file descriptors to their notification queues and waiting - // status. - fdMap map[int32]*fdInfo -} - -// newNotifier creates a new notifier object. -func newNotifier() (*notifier, error) { - epfd, err := syscall.EpollCreate1(0) - if err != nil { - return nil, err - } - - w := ¬ifier{ - epFD: epfd, - fdMap: make(map[int32]*fdInfo), - } - - go w.waitAndNotify() // S/R-SAFE: no waiter exists during save / load. - - return w, nil -} - -// waitFD waits on mask for fd. The fdMap mutex must be hold. 
-func (n *notifier) waitFD(fd int32, fi *fdInfo, mask waiter.EventMask) error { - if !fi.waiting && mask == 0 { - return nil - } - - e := syscall.EpollEvent{ - Events: uint32(mask) | -syscall.EPOLLET, - Fd: fd, - } - - switch { - case !fi.waiting && mask != 0: - if err := syscall.EpollCtl(n.epFD, syscall.EPOLL_CTL_ADD, int(fd), &e); err != nil { - return err - } - fi.waiting = true - case fi.waiting && mask == 0: - syscall.EpollCtl(n.epFD, syscall.EPOLL_CTL_DEL, int(fd), nil) - fi.waiting = false - case fi.waiting && mask != 0: - if err := syscall.EpollCtl(n.epFD, syscall.EPOLL_CTL_MOD, int(fd), &e); err != nil { - return err - } - } - - return nil -} - -// addFD adds an FD to the list of FDs observed by n. -func (n *notifier) addFD(fd int32, queue *waiter.Queue) { - n.mu.Lock() - defer n.mu.Unlock() - - // Panic if we're already notifying on this FD. - if _, ok := n.fdMap[fd]; ok { - panic(fmt.Sprintf("File descriptor %v added twice", fd)) - } - - // We have nothing to wait for at the moment. Just add it to the map. - n.fdMap[fd] = &fdInfo{queue: queue} -} - -// updateFD updates the set of events the fd needs to be notified on. -func (n *notifier) updateFD(fd int32) error { - n.mu.Lock() - defer n.mu.Unlock() - - if fi, ok := n.fdMap[fd]; ok { - return n.waitFD(fd, fi, fi.queue.Events()) - } - - return nil -} - -// RemoveFD removes an FD from the list of FDs observed by n. -func (n *notifier) removeFD(fd int32) { - n.mu.Lock() - defer n.mu.Unlock() - - // Remove from map, then from epoll object. - n.waitFD(fd, n.fdMap[fd], 0) - delete(n.fdMap, fd) -} - -// hasFD returns true if the fd is in the list of observed FDs. -func (n *notifier) hasFD(fd int32) bool { - n.mu.Lock() - defer n.mu.Unlock() - - _, ok := n.fdMap[fd] - return ok -} - -// waitAndNotify run is its own goroutine and loops waiting for io event -// notifications from the epoll object. Once notifications arrive, they are -// dispatched to the registered queue. 
-func (n *notifier) waitAndNotify() error { - e := make([]syscall.EpollEvent, 100) - for { - v, err := epollWait(n.epFD, e, -1) - if err == syscall.EINTR { - continue - } - - if err != nil { - return err - } - - n.mu.Lock() - for i := 0; i < v; i++ { - if fi, ok := n.fdMap[e[i].Fd]; ok { - fi.queue.Notify(waiter.EventMask(e[i].Events)) - } - } - n.mu.Unlock() - } -} - -var shared struct { - notifier *notifier - once sync.Once - initErr error -} - -// AddFD adds an FD to the list of observed FDs. -func AddFD(fd int32, queue *waiter.Queue) error { - shared.once.Do(func() { - shared.notifier, shared.initErr = newNotifier() - }) - - if shared.initErr != nil { - return shared.initErr - } - - shared.notifier.addFD(fd, queue) - return nil -} - -// UpdateFD updates the set of events the fd needs to be notified on. -func UpdateFD(fd int32) error { - return shared.notifier.updateFD(fd) -} - -// RemoveFD removes an FD from the list of observed FDs. -func RemoveFD(fd int32) { - shared.notifier.removeFD(fd) -} - -// HasFD returns true if the FD is in the list of observed FDs. -// -// This should only be used by tests to assert that FDs are correctly registered. -func HasFD(fd int32) bool { - return shared.notifier.hasFD(fd) -} diff --git a/pkg/waiter/fdnotifier/poll_unsafe.go b/pkg/waiter/fdnotifier/poll_unsafe.go deleted file mode 100644 index 8459d4c74..000000000 --- a/pkg/waiter/fdnotifier/poll_unsafe.go +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package fdnotifier - -import ( - "syscall" - "unsafe" - - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// NonBlockingPoll polls the given FD in non-blocking fashion. It is used just -// to query the FD's current state. -func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask { - e := struct { - fd int32 - events int16 - revents int16 - }{ - fd: fd, - events: int16(mask), - } - - for { - n, _, err := syscall.RawSyscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&e)), 1, 0) - // Interrupted by signal, try again. - if err == syscall.EINTR { - continue - } - // If an error occur we'll conservatively say the FD is ready for - // whatever is being checked. - if err != 0 { - return mask - } - - // If no FDs were returned, it wasn't ready for anything. - if n == 0 { - return 0 - } - - // Otherwise we got the ready events in the revents field. - return waiter.EventMask(e.revents) - } -} - -// epollWait performs a blocking wait on epfd. -// -// Preconditions: -// * len(events) > 0 -func epollWait(epfd int, events []syscall.EpollEvent, msec int) (int, error) { - if len(events) == 0 { - panic("Empty events passed to EpollWait") - } - - // We actually use epoll_pwait with NULL sigmask instead of epoll_wait - // since that is what the Go >= 1.11 runtime prefers. - r, _, e := syscall.Syscall6(syscall.SYS_EPOLL_PWAIT, uintptr(epfd), uintptr(unsafe.Pointer(&events[0])), uintptr(len(events)), uintptr(msec), 0, 0) - if e != 0 { - return 0, e - } - return int(r), nil -} -- cgit v1.2.3 From 2421006426445a1827422c2dbdd6fc6a47087147 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 17 Dec 2018 11:37:38 -0800 Subject: Implement mlock(), kind of. Currently mlock() and friends do nothing whatsoever. However, mlocking is directly application-visible in a number of ways; for example, madvise(MADV_DONTNEED) and msync(MS_INVALIDATE) both fail on mlocked regions. 
We handle this inconsistently: MADV_DONTNEED is too important to not work, but MS_INVALIDATE is rejected. Change MM to track mlocked regions in a manner consistent with Linux. It still will not actually pin pages into host physical memory, but: - mlock() will now cause sentry memory management to precommit mlocked pages. - MADV_DONTNEED and MS_INVALIDATE will interact with mlocked pages as described above. PiperOrigin-RevId: 225861605 Change-Id: Iee187204979ac9a4d15d0e037c152c0902c8d0ee --- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/mm.go | 12 + pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/memmap/memmap.go | 37 +++ pkg/sentry/mm/BUILD | 1 + pkg/sentry/mm/address_space.go | 12 +- pkg/sentry/mm/lifecycle.go | 24 +- pkg/sentry/mm/mm.go | 24 +- pkg/sentry/mm/syscalls.go | 423 +++++++++++++++++++++++++++----- pkg/sentry/mm/vma.go | 38 +++ pkg/sentry/syscalls/linux/linux64.go | 15 +- pkg/sentry/syscalls/linux/sys_mmap.go | 106 +++++--- pkg/sentry/syscalls/linux/sys_rlimit.go | 1 + runsc/boot/limits.go | 4 +- test/syscalls/linux/BUILD | 15 ++ test/syscalls/linux/mlock.cc | 344 ++++++++++++++++++++++++++ test/syscalls/linux/msync.cc | 20 +- 18 files changed, 947 insertions(+), 135 deletions(-) create mode 100644 test/syscalls/linux/mlock.cc (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index b2e51b9bd..e0aa5b31d 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -60,7 +60,7 @@ const ( DefaultNofileHardLimit = 4096 // DefaultMemlockLimit is called MLOCK_LIMIT in Linux. - DefaultMemlockLimit = 64 * 1094 + DefaultMemlockLimit = 64 * 1024 // DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux. DefaultMsgqueueLimit = 819200 diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index 3fcdf8235..eda8d9788 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -49,6 +49,18 @@ const ( MREMAP_FIXED = 1 << 1 ) +// Flags for mlock2(2). 
+const ( + MLOCK_ONFAULT = 0x01 +) + +// Flags for mlockall(2). +const ( + MCL_CURRENT = 1 + MCL_FUTURE = 2 + MCL_ONFAULT = 4 +) + // Advice for madvise(2). const ( MADV_NORMAL = 0 diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index ba0b7d4fd..eeca01876 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryPagesLocked + MemoryLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 511db6733..295f9c398 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryPagesLocked, + linux.RLIMIT_MEMLOCK: MemoryLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 28e2bed9b..cf20b11e3 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -243,6 +243,40 @@ type MappingIdentity interface { Msync(ctx context.Context, mr MappableRange) error } +// MLockMode specifies the memory locking behavior of a memory mapping. +type MLockMode int + +// Note that the ordering of MLockModes is significant; see +// mm.MemoryManager.defMLockMode. +const ( + // MLockNone specifies that a mapping has no memory locking behavior. + // + // This must be the zero value for MLockMode. + MLockNone MLockMode = iota + + // MLockEager specifies that a mapping is memory-locked, as by mlock() or + // similar. Pages in the mapping should be made, and kept, resident in + // physical memory as soon as possible. + // + // As of this writing, MLockEager does not cause memory-locking to be + // requested from the host; it only affects the sentry's memory management + // behavior. 
+ // + // MLockEager is analogous to Linux's VM_LOCKED. + MLockEager + + // MLockLazy specifies that a mapping is memory-locked, as by mlock() or + // similar. Pages in the mapping should be kept resident in physical memory + // once they have been made resident due to e.g. a page fault. + // + // As of this writing, MLockLazy does not cause memory-locking to be + // requested from the host; in fact, it has virtually no effect, except for + // interactions between mlocked pages and other syscalls. + // + // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. + MLockLazy +) + // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. @@ -303,6 +337,9 @@ type MMapOpts struct { // mapping (see platform.AddressSpace.MapFile). Precommit bool + // MLockMode specifies the memory locking behavior of the mapping. + MLockMode MLockMode + // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 744e73a39..5a9185e5d 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,6 +106,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 7488f7c4a..e7aa24c69 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() { // for all addresses in ar should be precommitted. // // Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg.Range().Contains(ar.Start). +// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). 
func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. @@ -173,7 +173,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre } } - for { + // Since this checks ar.End and not mapAR.End, we will never map a pma that + // is not required. + for pseg.Ok() && pseg.Start() < ar.End { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) @@ -184,13 +186,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } - // Since this checks ar.End and not mapAR.End, we will never map a pma - // that is not required. - if ar.End <= pmaAR.End { - return nil - } pseg = pseg.NextSegment() } + return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. 
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 1613ce11d..a42e32b43 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -58,13 +59,17 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ - p: mm.p, - haveASIO: mm.haveASIO, - layout: mm.layout, - privateRefs: mm.privateRefs, - users: 1, - usageAS: mm.usageAS, - brk: mm.brk, + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + brk: mm.brk, + usageAS: mm.usageAS, + // "The child does not inherit its parent's memory locks (mlock(2), + // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is + // MLockNone, both of which are zero values. vma.mlockMode is reset + // when copied below. captureInvalidations: true, argv: mm.argv, envv: mm.envv, @@ -77,7 +82,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // Copy vmas. dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { - vma := srcvseg.ValuePtr() + vma := srcvseg.Value() // makes a copy of the vma vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. 
if vma.mappable != nil { @@ -89,7 +94,8 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if vma.id != nil { vma.id.IncRef() } - dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() + vma.mlockMode = memmap.MLockNone + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index b1e39e898..c0632d232 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -95,11 +95,6 @@ type MemoryManager struct { // vmas is protected by mappingMu. vmas vmaSet - // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. - // - // usageAS is protected by mappingMu. - usageAS uint64 - // brk is the mm's brk, which is manipulated using the brk(2) system call. // The brk is initially set up by the loader which maps an executable // binary into the mm. @@ -107,6 +102,23 @@ type MemoryManager struct { // brk is protected by mappingMu. brk usermem.AddrRange + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. + usageAS uint64 + + // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != + // memmap.MLockNone. + // + // lockedAS is protected by mappingMu. + lockedAS uint64 + + // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or + // defMLockMode is greater. + // + // defMLockMode is protected by mappingMu. + defMLockMode memmap.MLockMode + // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. activeMu ssync.DowngradableRWMutex `state:"nosave"` @@ -252,6 +264,8 @@ type vma struct { // metag, none of which we currently support. growsDown bool `state:"manual"` + mlockMode memmap.MLockMode + // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. 
id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index daaae4da1..383703ec3 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,6 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -128,16 +129,24 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // Get the new vma. mm.mappingMu.Lock() + if opts.MLockMode < mm.defMLockMode { + opts.MLockMode = mm.defMLockMode + } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } + // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new + // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears + // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in + // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => + // populate_vma_page_range(). Confirm this behavior. switch { - case opts.Precommit: + case opts.Precommit || opts.MLockMode == memmap.MLockEager: // Get pmas and map with precommit as requested. - mm.populateAndUnlock(ctx, vseg, ar, true) + mm.populateVMAAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE: Get pmas and map eagerly in the hope @@ -146,7 +155,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. 
- mm.populateAndUnlock(ctx, vseg, ar, false) + mm.populateVMAAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() @@ -155,31 +164,29 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return ar.Start, nil } -// Preconditions: mm.mappingMu must be locked for writing. +// populateVMA obtains pmas for addresses in ar in the given vma, and maps them +// into mm.as if it is active. // -// Postconditions: mm.mappingMu will be unlocked. -func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { +// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). +func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. - mm.mappingMu.Unlock() return } mm.activeMu.Lock() + // Can't defer mm.activeMu.Unlock(); see below. - // Even if we get a new pma, we can't actually map it if we don't have an + // Even if we get new pmas, we can't actually map them if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() - mm.mappingMu.Unlock() return } // Ensure that we have usable pmas. - mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) - mm.mappingMu.RUnlock() if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when @@ -197,6 +204,45 @@ func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator mm.activeMu.RUnlock() } +// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally +// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is +// preferable to populateVMA since it unlocks mm.mappingMu before performing +// expensive operations that don't require it to be locked. 
+// +// Preconditions: mm.mappingMu must be locked for writing. +// vseg.Range().IsSupersetOf(ar). +// +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + // See populateVMA above for commentary. + if !vseg.ValuePtr().effectivePerms.Any() { + mm.mappingMu.Unlock() + return + } + + mm.activeMu.Lock() + + if mm.as == nil { + mm.activeMu.Unlock() + mm.mappingMu.Unlock() + return + } + + // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it + // isn't needed at all for mapASLocked. + mm.mappingMu.DowngradeLock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() + if err != nil { + mm.activeMu.Unlock() + return + } + + mm.activeMu.DowngradeLock() + mm.mapASLocked(pseg, ar, precommit) + mm.activeMu.RUnlock() +} + // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. @@ -236,6 +282,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error MaxPerms: usermem.AnyAccess, Private: true, GrowsDown: true, + MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err @@ -334,6 +381,19 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. + if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, + // mremap in Linux does not check mm/mlock.c:can_do_mlock() and + // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and + // !CAP_IPC_LOCK. 
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { + return 0, syserror.EAGAIN + } + } + } + if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all @@ -360,7 +420,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.mappable != nil { newOffset = vseg.mappableRange().End } - _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, @@ -371,9 +431,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, + MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, ar, true) + } return oldAddr, nil } // In-place growth failed. 
In the MRemapMayMove case, fall through to @@ -462,8 +526,14 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.id != nil { vma.id.IncRef() } - mm.vmas.Add(newAR, vma) + vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS += uint64(newAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS += uint64(newAR.Length()) + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + } return newAR.Start, nil } @@ -485,8 +555,11 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - mm.vmas.Add(newAR, vma) + vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required @@ -501,6 +574,10 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + return newAR.Start, nil } @@ -611,9 +688,10 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { mm.mappingMu.Lock() - defer mm.mappingMu.Unlock() + // Can't defer mm.mappingMu.Unlock(); see below. if addr < mm.brk.Start { + mm.mappingMu.Unlock() return mm.brk.End, syserror.EINVAL } @@ -623,21 +701,24 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // heap + data + bss. 
The segment sizes need to be plumbed from the // loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { + mm.mappingMu.Unlock() return mm.brk.End, syserror.EFAULT } switch { case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + mm.mappingMu.Unlock() case oldbrkpg < newbrkpg: - _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, @@ -646,17 +727,221 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, Private: true, - Hint: "[heap]", + // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes + // mm->def_flags. + MLockMode: mm.defMLockMode, + Hint: "[heap]", }) if err != nil { + mm.mappingMu.Unlock() return mm.brk.End, err } + if mm.defMLockMode == memmap.MLockEager { + mm.populateVMAAndUnlock(ctx, vseg, ar, true) + } else { + mm.mappingMu.Unlock() + } + + default: + // Nothing to do. + mm.mappingMu.Unlock() } mm.brk.End = addr return addr, nil } +// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), +// depending on mode. +func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { + // Linux allows this to overflow. + la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() + ar, ok := addr.RoundDown().ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + + // Check this after RLIMIT_MEMLOCK for consistency with Linux. + if ar.Length() == 0 { + mm.mappingMu.Unlock() + return nil + } + + // Apply the new mlock mode to vmas. + var unmapped bool + vseg := mm.vmas.FindSegment(ar.Start) + for { + if !vseg.Ok() { + unmapped = true + break + } + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = mode + if mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + if ar.End <= vseg.End() { + break + } + vseg, _ = vseg.NextNonEmpty() + } + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + if unmapped { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + + if mode == memmap.MLockEager { + // Ensure that we have usable pmas. Since we didn't return ENOMEM + // above, ar must be fully covered by vmas, so we can just use + // NextSegment below. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this + // case, which is converted to ENOMEM by mlock. 
+ mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) + if err != nil { + mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + // Linux: mm/mlock.c:__mlock_posix_error_return() + if err == syserror.EFAULT { + return syserror.ENOMEM + } + if err == syserror.ENOMEM { + return syserror.EAGAIN + } + return err + } + } + + // Map pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) + mm.activeMu.RUnlock() + if err != nil { + return err + } + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + + return nil +} + +// MLockAllOpts holds options to MLockAll. +type MLockAllOpts struct { + // If Current is true, change the memory-locking behavior of all mappings + // to Mode. If Future is true, upgrade the memory-locking behavior of all + // future mappings to Mode. At least one of Current or Future must be true. + Current bool + Future bool + Mode memmap.MLockMode +} + +// MLockAll implements the semantics of Linux's mlockall()/munlockall(), +// depending on opts. +func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { + if !opts.Current && !opts.Future { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if opts.Current { + if opts.Mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if uint64(mm.vmas.Span()) > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = opts.Mode + if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + } + } + + if opts.Future { + mm.defMLockMode = opts.Mode + } + + if opts.Current && opts.Mode == memmap.MLockEager { + // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() + // ignores the return value of __mm_populate(), so all errors below are + // ignored. + // + // Try to get usable pmas. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vseg.ValuePtr().effectivePerms.Any() { + mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) + } + } + + // Map all pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) + mm.activeMu.RUnlock() + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + return nil +} + // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). 
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) @@ -680,46 +965,49 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) - vseg := mm.vmas.LowerBoundSegment(ar.Start) mem := mm.p.Memory() - for pseg.Ok() && pseg.Start() < ar.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { - psegAR := pseg.Range().Intersect(ar) - vseg = vseg.seekNextLowerBound(psegAR.Start) - if checkInvariants { - if !vseg.Ok() { - panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) - } - if psegAR.Start < vseg.Start() { - panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) - } + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + if vma.mlockMode != memmap.MLockNone { + return syserror.EINVAL + } + vsegAR := vseg.Range().Intersect(ar) + // pseg should already correspond to either this vma or a later one, + // since there can't be a pma without a corresponding vma. + if checkInvariants { + if pseg.Ok() && pseg.End() <= vsegAR.Start { + panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) } - if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue + } + for pseg.Ok() && pseg.Start() < vsegAR.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. 
} - // If an error occurs, fall through to the general - // invalidation case below. } + pseg = mm.pmas.Isolate(pseg, vsegAR) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + pseg = mm.pmas.Remove(pseg).NextSegment() } - pseg = mm.pmas.Isolate(pseg, ar) - pma = pseg.ValuePtr() - if !didUnmapAS { - // Unmap all of ar, not just pseg.Range(), to minimize host - // syscalls. AddressSpace mappings must be removed before - // mm.decPrivateRef(). - mm.unmapASLocked(ar) - didUnmapAS = true - } - if pma.private { - mm.decPrivateRef(pseg.fileRange()) - } - pma.file.DecRef(pseg.fileRange()) - mm.removeRSSLocked(pseg.Range()) - - pseg = mm.pmas.Remove(pseg).NextSegment() } // "If there are some parts of the specified address space that are not @@ -732,9 +1020,28 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { return nil } -// Sync implements the semantics of Linux's msync(MS_SYNC). -func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { - ar, ok := addr.ToRange(length) +// MSyncOpts holds options to MSync. +type MSyncOpts struct { + // Sync has the semantics of MS_SYNC. + Sync bool + + // Invalidate has the semantics of MS_INVALIDATE. + Invalidate bool +} + +// MSync implements the semantics of Linux's msync(). 
+func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { + if addr != addr.RoundDown() { + return syserror.EINVAL + } + if length == 0 { + return nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) if !ok { return syserror.ENOMEM } @@ -759,10 +1066,14 @@ func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uin } lastEnd = vseg.End() vma := vseg.ValuePtr() + if opts.Invalidate && vma.mlockMode != memmap.MLockNone { + mm.mappingMu.RUnlock() + return syserror.EBUSY + } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. - if id := vma.id; id != nil && vma.mappable != nil && !vma.private { + if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 5c2c802f6..28ba9f2f5 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -17,8 +17,10 @@ package mm import ( "fmt" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -53,6 +55,23 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM } + if opts.MLockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM + } + newLockedAS := mm.lockedAS + opts.Length + if opts.Unmap { + newLockedAS -= mm.mlockedBytesRangeLocked(ar) + } + if newLockedAS > mlockLimit { + return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN + } + } + } + // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). @@ -85,10 +104,14 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, + mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, }) mm.usageAS += opts.Length + if opts.MLockMode != memmap.MLockNone { + mm.lockedAS += opts.Length + } return vseg, ar, nil } @@ -201,6 +224,17 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return 0, syserror.ENOMEM } +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { + var total uint64 + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if vseg.ValuePtr().mlockMode != memmap.MLockNone { + total += uint64(vseg.Range().Intersect(ar).Length()) + } + } + return total +} + // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). 
It returns: // @@ -338,6 +372,9 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS -= uint64(vmaAR.Length()) + } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } @@ -368,6 +405,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || + vma1.mlockMode != vma2.mlockMode || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 2aab948da..cc5ebb955 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{ 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, - 149: syscalls.Error(nil), // Mlock, TODO - 150: syscalls.Error(nil), // Munlock, TODO - 151: syscalls.Error(nil), // Mlockall, TODO - 152: syscalls.Error(nil), // Munlockall, TODO + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: Mlock, + 150: Munlock, + 151: Mlockall, + 152: Munlockall, 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, 154: syscalls.Error(syscall.EPERM), // ModifyLdt, 155: syscalls.Error(syscall.EPERM), // PivotRoot, @@ -373,8 +373,9 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO - // Syscalls after 325 are backports from 4.6. - 325: syscalls.Error(nil), // Mlock2, TODO + 325: Mlock2, + // Syscalls after 325 are "backports" from versions of Linux after 4.4. 
+ // 326: CopyFileRange, 327: Preadv2, // 328: Pwritev2, // Pwritev2, TODO }, diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 145f7846c..8732861e0 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -69,6 +69,9 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef() @@ -384,16 +387,6 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall length := args[1].SizeT() flags := args[2].Int() - if addr != addr.RoundDown() { - return 0, nil, syserror.EINVAL - } - if length == 0 { - return 0, nil, nil - } - la, ok := usermem.Addr(length).RoundUp() - if !ok { - return 0, nil, syserror.ENOMEM - } // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with @@ -406,39 +399,72 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, syserror.EINVAL } + err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ + Sync: sync, + Invalidate: flags&linux.MS_INVALIDATE != 0, + }) + // MSync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Mlock implements linux syscall mlock(2). 
+func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) +} - // MS_INVALIDATE "asks to invalidate other mappings of the same file (so - // that they can be updated with the fresh values just written)". This is a - // no-op given that shared memory exists. However, MS_INVALIDATE can also - // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, - // and a memory lock exists for the specified address range." Given that - // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since - // some user program could be using it for synchronization. - if flags&linux.MS_INVALIDATE != 0 { +// Mlock2 implements linux syscall mlock2(2). +func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if flags&^(linux.MLOCK_ONFAULT) != 0 { return 0, nil, syserror.EINVAL } - // MS_SYNC "requests an update and waits for it to complete." - if sync { - err := t.MemoryManager().Sync(t, addr, uint64(la)) - // Sync calls fsync, the same interrupt conversion rules apply, see - // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) - } - // MS_ASYNC "specifies that an update be scheduled, but the call returns - // immediately". As long as dirty pages are tracked and eventually written - // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC - // is in fact a no-op, since the kernel properly tracks dirty pages and - // flushes them to storage as necessary.") - // - // However: "ENOMEM: The indicated memory (or part of it) was not mapped." - // This applies even for MS_ASYNC. 
- ar, ok := addr.ToRange(uint64(la)) - if !ok { - return 0, nil, syserror.ENOMEM + + mode := memmap.MLockEager + if flags&linux.MLOCK_ONFAULT != 0 { + mode = memmap.MLockLazy } - mapped := t.MemoryManager().VirtualMemorySizeRange(ar) - if mapped != uint64(la) { - return 0, nil, syserror.ENOMEM + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) +} + +// Munlock implements linux syscall munlock(2). +func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) +} + +// Mlockall implements linux syscall mlockall(2). +func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + + if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { + return 0, nil, syserror.EINVAL } - return 0, nil, nil + + mode := memmap.MLockEager + if flags&linux.MCL_ONFAULT != 0 { + mode = memmap.MLockLazy + } + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: flags&linux.MCL_CURRENT != 0, + Future: flags&linux.MCL_FUTURE != 0, + Mode: mode, + }) +} + +// Munlockall implements linux syscall munlockall(2). 
+func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: true, + Future: true, + Mode: memmap.MLockNone, + }) } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 2f16e1791..b0b216045 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,6 +90,7 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, + limits.MemoryLocked: {}, limits.Stack: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 8ecda6d0e..e3e716bf9 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -29,7 +29,7 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_DATA": limits.Data, "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_LOCKS": limits.Locks, - "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, + "RLIMIT_MEMLOCK": limits.MemoryLocked, "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, "RLIMIT_NICE": limits.Nice, "RLIMIT_NOFILE": limits.NumberOfFiles, @@ -55,7 +55,7 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) - ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) diff --git 
a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 784997c18..aca55f492 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1019,6 +1019,21 @@ cc_binary( ], ) +cc_binary( + name = "mlock_test", + testonly = 1, + srcs = ["mlock.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:memory_util", + "//test/util:multiprocess_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "mmap_test", testonly = 1, diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc new file mode 100644 index 000000000..a0d876c2e --- /dev/null +++ b/test/syscalls/linux/mlock.cc @@ -0,0 +1,344 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "test/util/capability_util.h" +#include "test/util/cleanup.h" +#include "test/util/memory_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/test_util.h" + +using ::testing::_; + +namespace gvisor { +namespace testing { + +namespace { + +PosixErrorOr CanMlock() { + struct rlimit rlim; + if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + return PosixError(errno, "getrlimit(RLIMIT_MEMLOCK)"); + } + if (rlim.rlim_cur != 0) { + return true; + } + return HaveCapability(CAP_IPC_LOCK); +} + +// Returns true if the page containing addr is mlocked. 
+bool IsPageMlocked(uintptr_t addr) { + // This relies on msync(MS_INVALIDATE) interacting correctly with mlocked + // pages, which is tested for by the MsyncInvalidate case below. + int const rv = msync(reinterpret_cast(addr & ~(kPageSize - 1)), + kPageSize, MS_ASYNC | MS_INVALIDATE); + if (rv == 0) { + return false; + } + // This uses TEST_PCHECK_MSG since it's used in subprocesses. + TEST_PCHECK_MSG(errno == EBUSY, "msync failed with unexpected errno"); + return true; +} + +PosixErrorOr ScopedSetSoftRlimit(int resource, rlim_t newval) { + struct rlimit old_rlim; + if (getrlimit(resource, &old_rlim) != 0) { + return PosixError(errno, "getrlimit failed"); + } + struct rlimit new_rlim = old_rlim; + new_rlim.rlim_cur = newval; + if (setrlimit(resource, &new_rlim) != 0) { + return PosixError(errno, "setrlimit failed"); + } + return Cleanup([resource, old_rlim] { + TEST_PCHECK(setrlimit(resource, &old_rlim) == 0); + }); +} + +TEST(MlockTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(MlockTest, ProtNone) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), + SyscallFailsWithErrno(ENOMEM)); + // ENOMEM is returned because mlock can't populate the page, but it's still + // considered locked. 
+ EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(MlockTest, MadviseDontneed) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_THAT(madvise(mapping.ptr(), mapping.len(), MADV_DONTNEED), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(MlockTest, MsyncInvalidate) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_ASYNC | MS_INVALIDATE), + SyscallFailsWithErrno(EBUSY)); + EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_SYNC | MS_INVALIDATE), + SyscallFailsWithErrno(EBUSY)); +} + +TEST(MlockTest, Fork) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + EXPECT_THAT( + InForkedProcess([&] { TEST_CHECK(!IsPageMlocked(mapping.addr())); }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(MlockTest, RlimitMemlockZero) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), + SyscallFailsWithErrno(EPERM)); +} + +TEST(MlockTest, RlimitMemlockInsufficient) { + if 
(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), + SyscallFailsWithErrno(ENOMEM)); +} + +TEST(MunlockTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +TEST(MunlockTest, NotLocked) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +// There is currently no test for mlockall(MCL_CURRENT) because the default +// RLIMIT_MEMLOCK of 64 KB is insufficient to actually invoke +// mlockall(MCL_CURRENT). + +TEST(MlockallTest, Future) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + + // Run this test in a separate (single-threaded) subprocess to ensure that a + // background thread doesn't try to mmap a large amount of memory, fail due + // to hitting RLIMIT_MEMLOCK, and explode the process violently. 
+ EXPECT_THAT(InForkedProcess([] { + auto const mapping = + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE) + .ValueOrDie(); + TEST_CHECK(!IsPageMlocked(mapping.addr())); + TEST_PCHECK(mlockall(MCL_FUTURE) == 0); + // Ensure that mlockall(MCL_FUTURE) is turned off before the end + // of the test, as otherwise mmaps may fail unexpectedly. + Cleanup do_munlockall([] { TEST_PCHECK(munlockall() == 0); }); + auto const mapping2 = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + TEST_CHECK(IsPageMlocked(mapping2.addr())); + // Fire munlockall() and check that it disables + // mlockall(MCL_FUTURE). + do_munlockall.Release()(); + auto const mapping3 = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + TEST_CHECK(!IsPageMlocked(mapping2.addr())); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(MunlockallTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(munlockall(), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +#ifndef SYS_mlock2 +#ifdef __x86_64__ +#define SYS_mlock2 325 +#endif +#endif + +#ifndef MLOCK_ONFAULT +#define MLOCK_ONFAULT 0x01 // Linux: include/uapi/asm-generic/mman-common.h +#endif + +#ifdef SYS_mlock2 + +int mlock2(void const* addr, size_t len, int flags) { + return syscall(SYS_mlock2, addr, len, flags); +} + +TEST(Mlock2Test, NoFlags) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), 0), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(Mlock2Test, MlockOnfault) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); 
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), MLOCK_ONFAULT), + SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(Mlock2Test, UnknownFlags) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_THAT(mlock2(mapping.ptr(), mapping.len(), ~0), + SyscallFailsWithErrno(EINVAL)); +} + +#endif // defined(SYS_mlock2) + +TEST(MapLockedTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +TEST(MapLockedTest, RlimitMemlockZero) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + EXPECT_THAT( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), + PosixErrorIs(EPERM, _)); +} + +TEST(MapLockedTest, RlimitMemlockInsufficient) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); + EXPECT_THAT( + MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), + PosixErrorIs(EAGAIN, _)); +} + +TEST(MremapLockedTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + 
EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + if (addr == MAP_FAILED) { + FAIL() << "mremap failed: " << errno << " (" << strerror(errno) << ")"; + } + mapping.release(); + mapping.reset(addr, 2 * mapping.len()); + EXPECT_TRUE(IsPageMlocked(reinterpret_cast(addr))); +} + +TEST(MremapLockedTest, RlimitMemlockZero) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) + << "addr = " << addr << ", errno = " << errno; +} + +TEST(MremapLockedTest, RlimitMemlockInsufficient) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = ASSERT_NO_ERRNO_AND_VALUE( + ScopedSetSoftRlimit(RLIMIT_MEMLOCK, mapping.len())); + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) + << "addr = " << addr << ", errno = " << errno; +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc index 0ddc621aa..72d90dc78 100644 --- a/test/syscalls/linux/msync.cc +++ 
b/test/syscalls/linux/msync.cc @@ -43,14 +43,13 @@ class MsyncParameterizedTest : public ::testing::TestWithParam { protected: int msync_flags() const { return std::get<0>(GetParam()); } - PosixErrorOr GetMapping() const { - auto rv = std::get<1>(GetParam())(); - return rv; - } + PosixErrorOr GetMapping() const { return std::get<1>(GetParam())(); } }; -// All valid msync(2) flag combinations (not including MS_INVALIDATE, which -// gVisor doesn't implement). +// All valid msync(2) flag combinations, not including MS_INVALIDATE. ("Linux +// permits a call to msync() that specifies neither [MS_SYNC or MS_ASYNC], with +// semantics that are (currently) equivalent to specifying MS_ASYNC." - +// msync(2)) constexpr std::initializer_list kMsyncFlags = {MS_SYNC, MS_ASYNC, 0}; // Returns functions that return mappings that should be successfully @@ -134,6 +133,15 @@ TEST_P(MsyncFullParamTest, UnalignedAddressFails) { SyscallFailsWithErrno(EINVAL)); } +TEST_P(MsyncFullParamTest, InvalidateUnlockedSucceeds) { + auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); + EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags() | MS_INVALIDATE), + SyscallSucceeds()); +} + +// The test for MS_INVALIDATE on mlocked pages is in mlock.cc since it requires +// probing for mlock support. + INSTANTIATE_TEST_CASE_P( All, MsyncFullParamTest, ::testing::Combine(::testing::ValuesIn(kMsyncFlags), -- cgit v1.2.3 From d3ae74d2a5f5933981abeae10e676a2f0cccf67e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 17 Dec 2018 13:45:59 -0800 Subject: overlayBoundEndpoint must be recursive if there is an overlay in the lower. The old overlayBoundEndpoint assumed that the lower is not an overlay. It should check if the lower is an overlay and handle that case. 
PiperOrigin-RevId: 225882303 Change-Id: I60660c587d91db2826e0719da0983ec8ad024cb8 --- pkg/sentry/fs/inode_overlay.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 78923fb5b..512a0da28 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -390,8 +390,12 @@ func overlayBoundEndpoint(o *overlayEntry, path string) transport.BoundEndpoint if o.upper != nil { return o.upper.InodeOperations.BoundEndpoint(o.upper, path) } - // If a socket is already in the lower file system, allow connections - // to it. + + // If the lower is itself an overlay, recurse. + if o.lower.overlay != nil { + return overlayBoundEndpoint(o.lower.overlay, path) + } + // Lower is not an overlay. Call BoundEndpoint directly. return o.lower.InodeOperations.BoundEndpoint(o.lower, path) } -- cgit v1.2.3 From 12c7430a01ad2b484987dd8ee24b6f2907e7366d Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 17 Dec 2018 17:52:05 -0800 Subject: Fix recv blocking for connectionless Unix sockets. Connectionless Unix sockets (DGRAM Unix sockets created with the socket system call) inherently only have a read queue. They do not establish bidirectional connections, instead, the connect system call only sets a default send location. Writes give the data to the other endpoint which has its own read queue. To simplify the code, connectionless Unix sockets still get read and write queues, but the write queue is a dummy and never waited on. The read queue is the connectionless endpoint's queue. This change fixes a bug where the dummy queue was incorrectly set as the read queue and the endpoint's queue was incorrectly set as the write queue. This meant that read notifications went to the dummy queue and were black holed. 
PiperOrigin-RevId: 225921042 Change-Id: I8d9059def787a2c3c305185b92d05093fbd2be2a --- pkg/sentry/socket/unix/transport/connectioned.go | 8 ++++---- pkg/sentry/socket/unix/transport/connectionless.go | 2 +- pkg/sentry/socket/unix/transport/queue.go | 5 ----- test/syscalls/linux/socket_unix_blocking_local.cc | 6 ++---- 4 files changed, 7 insertions(+), 14 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 7cfbbfe8a..62641bb34 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -135,8 +135,8 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { stype: stype, } - q1 := newQueue(a.Queue, b.Queue, initialLimit) - q2 := newQueue(b.Queue, a.Queue, initialLimit) + q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} + q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} if stype == SockStream { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -286,13 +286,13 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur stype: e.stype, } - readQueue := newQueue(ce.WaiterQueue(), ne.Queue, initialLimit) + readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } - writeQueue := newQueue(ne.Queue, ce.WaiterQueue(), initialLimit) + writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} if e.stype == SockStream { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index f432a9717..728863f3f 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ 
b/pkg/sentry/socket/unix/transport/connectionless.go @@ -34,7 +34,7 @@ type connectionlessEndpoint struct { // NewConnectionless creates a new unbound dgram endpoint. func NewConnectionless() Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} - ep.receiver = &queueReceiver{readQueue: newQueue(&waiter.Queue{}, ep.Queue, initialLimit)} + ep.receiver = &queueReceiver{readQueue: &queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}} return ep } diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index 05d1bdeef..45a58c600 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -38,11 +38,6 @@ type queue struct { dataList messageList } -// newQueue allocates and initializes a new queue. -func newQueue(ReaderQueue *waiter.Queue, WriterQueue *waiter.Queue, limit int64) *queue { - return &queue{ReaderQueue: ReaderQueue, WriterQueue: WriterQueue, limit: limit} -} - // Close closes q for reading and writing. It is immediately not writable and // will become unreadable when no more data is pending. // diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc index f79e04b33..3c2105cc7 100644 --- a/test/syscalls/linux/socket_unix_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_blocking_local.cc @@ -33,14 +33,12 @@ std::vector GetSocketPairs() { ApplyVec( FilesystemBoundUnixDomainSocketPair, AllBitwiseCombinations( - // FIXME: Add SOCK_DGRAM once blocking is fixed. - List{SOCK_STREAM, SOCK_SEQPACKET}, + List{SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM}, List{0, SOCK_CLOEXEC})), ApplyVec( AbstractBoundUnixDomainSocketPair, AllBitwiseCombinations( - // FIXME: Add SOCK_DGRAM once blocking is fixed. 
- List{SOCK_STREAM, SOCK_SEQPACKET}, + List{SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM}, List{0, SOCK_CLOEXEC}))); } -- cgit v1.2.3 From 03226cd95055aee73d4e4dfcb4954490b4fd8a2d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 18 Dec 2018 10:27:16 -0800 Subject: Add BPFAction type with Stringer PiperOrigin-RevId: 226018694 Change-Id: I98965e26fe565f37e98e5df5f997363ab273c91b --- pkg/abi/linux/seccomp.go | 48 ++++++++++++++++++----- pkg/seccomp/seccomp.go | 32 ++++++---------- pkg/seccomp/seccomp_test.go | 14 +++---- pkg/sentry/kernel/seccomp.go | 53 +++++++++----------------- pkg/sentry/kernel/task_syscall.go | 18 +++++---- pkg/sentry/platform/ptrace/subprocess_linux.go | 16 ++++---- test/syscalls/linux/seccomp.cc | 19 +++++++++ 7 files changed, 112 insertions(+), 88 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index 785f2f284..8673a27bf 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -14,22 +14,52 @@ package linux +import "fmt" + // Seccomp constants taken from . 
const ( SECCOMP_MODE_NONE = 0 SECCOMP_MODE_FILTER = 2 - SECCOMP_RET_KILL_PROCESS = 0x80000000 - SECCOMP_RET_KILL_THREAD = 0x00000000 - SECCOMP_RET_TRAP = 0x00030000 - SECCOMP_RET_ERRNO = 0x00050000 - SECCOMP_RET_TRACE = 0x7ff00000 - SECCOMP_RET_ALLOW = 0x7fff0000 - - SECCOMP_RET_ACTION = 0x7fff0000 - SECCOMP_RET_DATA = 0x0000ffff + SECCOMP_RET_ACTION_FULL = 0xffff0000 + SECCOMP_RET_ACTION = 0x7fff0000 + SECCOMP_RET_DATA = 0x0000ffff SECCOMP_SET_MODE_FILTER = 1 SECCOMP_FILTER_FLAG_TSYNC = 1 SECCOMP_GET_ACTION_AVAIL = 2 ) + +type BPFAction uint32 + +const ( + SECCOMP_RET_KILL_PROCESS BPFAction = 0x80000000 + SECCOMP_RET_KILL_THREAD = 0x00000000 + SECCOMP_RET_TRAP = 0x00030000 + SECCOMP_RET_ERRNO = 0x00050000 + SECCOMP_RET_TRACE = 0x7ff00000 + SECCOMP_RET_ALLOW = 0x7fff0000 +) + +func (a BPFAction) String() string { + switch a & SECCOMP_RET_ACTION_FULL { + case SECCOMP_RET_KILL_PROCESS: + return "kill process" + case SECCOMP_RET_KILL_THREAD: + return "kill thread" + case SECCOMP_RET_TRAP: + return fmt.Sprintf("trap (%d)", a.Data()) + case SECCOMP_RET_ERRNO: + return fmt.Sprintf("errno (%d)", a.Data()) + case SECCOMP_RET_TRACE: + return fmt.Sprintf("trace (%d)", a.Data()) + case SECCOMP_RET_ALLOW: + return "allow" + } + return fmt.Sprintf("invalid action: %#x", a) +} + +// Data returns the SECCOMP_RET_DATA portion of the action. +func (a BPFAction) Data() uint16 { + return uint16(a & SECCOMP_RET_DATA) +} diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index 9d714d02d..ba2955752 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -33,16 +33,6 @@ const ( defaultLabel = "default_action" ) -func actionName(a uint32) string { - switch a { - case linux.SECCOMP_RET_KILL_PROCESS: - return "kill process" - case linux.SECCOMP_RET_TRAP: - return "trap" - } - panic(fmt.Sprintf("invalid action: %d", a)) -} - // Install generates BPF code based on the set of syscalls provided. It only // allows syscalls that conform to the specification. 
Syscalls that violate the // specification will trigger RET_KILL_PROCESS, except for the cases below. @@ -67,12 +57,12 @@ func Install(rules SyscallRules) error { // Uncomment to get stack trace when there is a violation. // defaultAction = uint32(linux.SECCOMP_RET_TRAP) - log.Infof("Installing seccomp filters for %d syscalls (action=%s)", len(rules), actionName(defaultAction)) + log.Infof("Installing seccomp filters for %d syscalls (action=%v)", len(rules), defaultAction) instrs, err := BuildProgram([]RuleSet{ RuleSet{ Rules: rules, - Action: uint32(linux.SECCOMP_RET_ALLOW), + Action: linux.SECCOMP_RET_ALLOW, }, }, defaultAction) if log.IsLogging(log.Debug) { @@ -95,21 +85,21 @@ func Install(rules SyscallRules) error { return nil } -func defaultAction() (uint32, error) { +func defaultAction() (linux.BPFAction, error) { available, err := isKillProcessAvailable() if err != nil { return 0, err } if available { - return uint32(linux.SECCOMP_RET_KILL_PROCESS), nil + return linux.SECCOMP_RET_KILL_PROCESS, nil } - return uint32(linux.SECCOMP_RET_TRAP), nil + return linux.SECCOMP_RET_TRAP, nil } // RuleSet is a set of rules and associated action. type RuleSet struct { Rules SyscallRules - Action uint32 + Action linux.BPFAction // Vsyscall indicates that a check is made for a function being called // from kernel mappings. This is where the vsyscall page is located @@ -127,7 +117,7 @@ var SyscallName = func(sysno uintptr) string { // BuildProgram builds a BPF program from the given map of actions to matching // SyscallRules. The single generated program covers all provided RuleSets. -func BuildProgram(rules []RuleSet, defaultAction uint32) ([]linux.BPFInstruction, error) { +func BuildProgram(rules []RuleSet, defaultAction linux.BPFAction) ([]linux.BPFInstruction, error) { program := bpf.NewProgramBuilder() // Be paranoid and check that syscall is done in the expected architecture. 
@@ -147,7 +137,7 @@ func BuildProgram(rules []RuleSet, defaultAction uint32) ([]linux.BPFInstruction if err := program.AddLabel(defaultLabel); err != nil { return nil, err } - program.AddStmt(bpf.Ret|bpf.K, defaultAction) + program.AddStmt(bpf.Ret|bpf.K, uint32(defaultAction)) return program.Instructions() } @@ -217,7 +207,7 @@ func checkArgsLabel(sysno uintptr) string { // not insert a jump to the default action at the end and it is the // responsibility of the caller to insert an appropriate jump after calling // this function. -func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action uint32, ruleSetIdx int, sysno uintptr) error { +func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action linux.BPFAction, ruleSetIdx int, sysno uintptr) error { for ruleidx, rule := range rules { labelled := false for i, arg := range rule { @@ -240,7 +230,7 @@ func addSyscallArgsCheck(p *bpf.ProgramBuilder, rules []Rule, action uint32, rul } // Matched, emit the given action. - p.AddStmt(bpf.Ret|bpf.K, action) + p.AddStmt(bpf.Ret|bpf.K, uint32(action)) // Label the end of the rule if necessary. This is added for // the jumps above when the argument check fails. @@ -319,7 +309,7 @@ func buildBSTProgram(n *node, rules []RuleSet, program *bpf.ProgramBuilder) erro // Emit matchers. if len(rs.Rules[sysno]) == 0 { // This is a blanket action. - program.AddStmt(bpf.Ret|bpf.K, rs.Action) + program.AddStmt(bpf.Ret|bpf.K, uint32(rs.Action)) emitted = true } else { // Add an argument check for these particular diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index f2b903e42..11ed90eb4 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -72,12 +72,12 @@ func TestBasic(t *testing.T) { data seccompData // want is the expected return value of the BPF program. 
- want uint32 + want linux.BPFAction } for _, test := range []struct { ruleSets []RuleSet - defaultAction uint32 + defaultAction linux.BPFAction specs []spec }{ { @@ -357,7 +357,7 @@ func TestBasic(t *testing.T) { t.Errorf("%s: bpf.Exec() got error: %v", spec.desc, err) continue } - if got != spec.want { + if got != uint32(spec.want) { t.Errorf("%s: bpd.Exec() = %d, want: %d", spec.desc, got, spec.want) } } @@ -380,9 +380,9 @@ func TestRandom(t *testing.T) { instrs, err := BuildProgram([]RuleSet{ RuleSet{ Rules: syscallRules, - Action: uint32(linux.SECCOMP_RET_ALLOW), + Action: linux.SECCOMP_RET_ALLOW, }, - }, uint32(linux.SECCOMP_RET_TRAP)) + }, linux.SECCOMP_RET_TRAP) if err != nil { t.Fatalf("buildProgram() got error: %v", err) } @@ -397,11 +397,11 @@ func TestRandom(t *testing.T) { t.Errorf("bpf.Exec() got error: %v, for syscall %d", err, i) continue } - want := uint32(linux.SECCOMP_RET_TRAP) + want := linux.SECCOMP_RET_TRAP if _, ok := syscallRules[uintptr(i)]; ok { want = linux.SECCOMP_RET_ALLOW } - if got != want { + if got != uint32(want) { t.Errorf("bpf.Exec() = %d, want: %d, for syscall %d", got, want, i) } } diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index d6dc45bbd..cec179246 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -27,24 +27,6 @@ import ( const maxSyscallFilterInstructions = 1 << 15 -type seccompResult int - -const ( - // seccompResultDeny indicates that a syscall should not be executed. - seccompResultDeny seccompResult = iota - - // seccompResultAllow indicates that a syscall should be executed. - seccompResultAllow - - // seccompResultKill indicates that the task should be killed immediately, - // with the exit status indicating that the task was killed by SIGSYS. - seccompResultKill - - // seccompResultTrace indicates that a ptracer was successfully notified as - // a result of a SECCOMP_RET_TRACE. 
- seccompResultTrace -) - // seccompData is equivalent to struct seccomp_data, which contains the data // passed to seccomp-bpf filters. type seccompData struct { @@ -83,48 +65,47 @@ func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalIn // in because vsyscalls do not use the values in t.Arch().) // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) seccompResult { - result := t.evaluateSyscallFilters(sysno, args, ip) - switch result & linux.SECCOMP_RET_ACTION { +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) linux.BPFAction { + result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) + action := result & linux.SECCOMP_RET_ACTION + switch action { case linux.SECCOMP_RET_TRAP: // "Results in the kernel sending a SIGSYS signal to the triggering // task without executing the system call. ... The SECCOMP_RET_DATA // portion of the return value will be passed as si_errno." - // Documentation/prctl/seccomp_filter.txt - t.SendSignal(seccompSiginfo(t, int32(result&linux.SECCOMP_RET_DATA), sysno, ip)) - return seccompResultDeny + t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) case linux.SECCOMP_RET_ERRNO: // "Results in the lower 16-bits of the return value being passed to // userland as the errno without executing the system call." - t.Arch().SetReturn(-uintptr(result & linux.SECCOMP_RET_DATA)) - return seccompResultDeny + t.Arch().SetReturn(-uintptr(result.Data())) case linux.SECCOMP_RET_TRACE: // "When returned, this value will cause the kernel to attempt to // notify a ptrace()-based tracer prior to executing the system call. // If there is no tracer present, -ENOSYS is returned to userland and // the system call is not executed." 
- if t.ptraceSeccomp(uint16(result & linux.SECCOMP_RET_DATA)) { - return seccompResultTrace + if !t.ptraceSeccomp(result.Data()) { + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + return linux.SECCOMP_RET_ERRNO } - // This useless-looking temporary is needed because Go. - tmp := uintptr(syscall.ENOSYS) - t.Arch().SetReturn(-tmp) - return seccompResultDeny case linux.SECCOMP_RET_ALLOW: // "Results in the system call being executed." - return seccompResultAllow case linux.SECCOMP_RET_KILL_THREAD: // "Results in the task exiting immediately without executing the // system call. The exit status of the task will be SIGSYS, not // SIGKILL." - fallthrough - default: // consistent with Linux - return seccompResultKill + + default: + // consistent with Linux + return linux.SECCOMP_RET_KILL_THREAD } + return action } func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { @@ -155,7 +136,7 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) if err != nil { t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) - thisRet = linux.SECCOMP_RET_KILL_THREAD + thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD) } // "If multiple filters exist, the return value for the evaluation of a // given system call will always use the highest precedent value." - diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 2a39ebc68..9e43f089a 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -199,16 +199,16 @@ func (t *Task) doSyscall() taskRunState { // is rare), not needed for correctness. 
if t.syscallFilters.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { - case seccompResultDeny: + case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: t.Debugf("Syscall %d: denied by seccomp", sysno) return (*runSyscallExit)(nil) - case seccompResultAllow: + case linux.SECCOMP_RET_ALLOW: // ok - case seccompResultKill: + case linux.SECCOMP_RET_KILL_THREAD: t.Debugf("Syscall %d: killed by seccomp", sysno) t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) return (*runExit)(nil) - case seccompResultTrace: + case linux.SECCOMP_RET_TRACE: t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) return (*runSyscallAfterPtraceEventSeccomp)(nil) default: @@ -345,14 +345,18 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { args := t.Arch().SyscallArgs() if t.syscallFilters.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { - case seccompResultDeny: + case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) return (*runApp)(nil) - case seccompResultAllow: + case linux.SECCOMP_RET_ALLOW: // ok - case seccompResultTrace: + case linux.SECCOMP_RET_TRACE: t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} + case linux.SECCOMP_RET_KILL_THREAD: + t.Debugf("vsyscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) default: panic(fmt.Sprintf("Unknown seccomp result %d", r)) } diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 25b8e8cb7..e2aab8135 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -38,7 +38,7 @@ const syscallEvent syscall.Signal = 0x80 // Precondition: the runtime OS 
thread must be locked. func probeSeccomp() bool { // Create a completely new, destroyable process. - t, err := attachedThread(0, uint32(linux.SECCOMP_RET_ERRNO)) + t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO) if err != nil { panic(fmt.Sprintf("seccomp probe failed: %v", err)) } @@ -112,14 +112,14 @@ func createStub() (*thread, error) { // ptrace emulation check. This simplifies using SYSEMU, since seccomp // will never run for emulation. Seccomp will only run for injected // system calls, and thus we can use RET_KILL as our violation action. - var defaultAction uint32 + var defaultAction linux.BPFAction if probeSeccomp() { log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") - defaultAction = uint32(linux.SECCOMP_RET_KILL_THREAD) + defaultAction = linux.SECCOMP_RET_KILL_THREAD } else { // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") - defaultAction = uint32(linux.SECCOMP_RET_ALLOW) + defaultAction = linux.SECCOMP_RET_ALLOW } // When creating the new child process, we specify SIGKILL as the @@ -135,7 +135,7 @@ func createStub() (*thread, error) { // attachedThread returns a new attached thread. // // Precondition: the runtime OS thread must be locked. -func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) { +func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) { // Create a BPF program that allows only the system calls needed by the // stub and all its children. This is used to create child stubs // (below), so we must include the ability to fork, but otherwise lock @@ -148,11 +148,11 @@ func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) { syscall.SYS_TIME: {}, 309: {}, // SYS_GETCPU. 
}, - Action: uint32(linux.SECCOMP_RET_TRAP), + Action: linux.SECCOMP_RET_TRAP, Vsyscall: true, }, } - if defaultAction != uint32(linux.SECCOMP_RET_ALLOW) { + if defaultAction != linux.SECCOMP_RET_ALLOW { rules = append(rules, seccomp.RuleSet{ Rules: seccomp.SyscallRules{ syscall.SYS_CLONE: []seccomp.Rule{ @@ -191,7 +191,7 @@ func attachedThread(flags uintptr, defaultAction uint32) (*thread, error) { syscall.SYS_MMAP: {}, syscall.SYS_MUNMAP: {}, }, - Action: uint32(linux.SECCOMP_RET_ALLOW), + Action: linux.SECCOMP_RET_ALLOW, }) } instrs, err := seccomp.BuildProgram(rules, defaultAction) diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc index d6ac166a4..ac416b75f 100644 --- a/test/syscalls/linux/seccomp.cc +++ b/test/syscalls/linux/seccomp.cc @@ -215,6 +215,25 @@ TEST(SeccompTest, SeccompAppliesToVsyscall) { << "status " << status; } +TEST(SeccompTest, RetKillVsyscallCausesDeathBySIGSYS) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled())); + + pid_t const pid = fork(); + if (pid == 0) { + // Register a signal handler for SIGSYS that we don't expect to be invoked. + RegisterSignalHandler( + SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); }); + ApplySeccompFilter(SYS_time, SECCOMP_RET_KILL); + vsyscall_time(nullptr); // Should result in death. + TEST_CHECK_MSG(false, "Survived invocation of test syscall"); + } + ASSERT_THAT(pid, SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS) + << "status " << status; +} + TEST(SeccompTest, RetTraceWithoutPtracerReturnsENOSYS) { pid_t const pid = fork(); if (pid == 0) { -- cgit v1.2.3 From 3b3f02627870a06de4e1fc3178d5bd23f627a97a Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 18 Dec 2018 11:51:22 -0800 Subject: Truncate ar before calling mm.breakCopyOnWriteLocked(). ... as required by the latter's precondition. 
PiperOrigin-RevId: 226033824 Change-Id: I6bc46d0e100c61cc58cb5fc69e70c4ca905cd92d --- pkg/sentry/mm/pma.go | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 5690fe6b4..63c50f719 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -138,6 +138,10 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar var cowerr error if opts.breakCOW { + if pend.Start() < ar.End { + // Adjust ar to reflect missing pmas. + ar.End = pend.Start() + } var invalidated bool pend, invalidated, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) if pend.Start() <= ar.Start { @@ -189,6 +193,10 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrR if !pstart.Ok() { pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) } + if pend.Start() < ar.End { + // Adjust ar to reflect missing pmas. + ar.End = pend.Start() + } pend, _, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) } -- cgit v1.2.3 From 898838e34d1b0c76405f3e7f7f5fa7f1a444da0e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 18 Dec 2018 13:49:22 -0800 Subject: Fix mremap expansion with mm.checkInvariants = true. Also remove useless RSS changes in mm.movePMAsLocked(). PiperOrigin-RevId: 226052996 Change-Id: If59fd259b93238fb2f15c1c8ebfeda14cb590a87 --- pkg/sentry/mm/pma.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 63c50f719..d102035d8 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -679,7 +679,7 @@ func Unpin(prs []PinnedRange) { // movePMAsLocked moves all pmas in oldAR to newAR. // // Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0. -// oldAR.Length() == newAR.Length(). !oldAR.Overlaps(newAR). +// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR). // mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned. 
func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { if checkInvariants { @@ -689,8 +689,8 @@ func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() { panic(fmt.Sprintf("invalid newAR: %v", newAR)) } - if oldAR.Length() != newAR.Length() { - panic(fmt.Sprintf("old and new address ranges have different lengths: %v, %v", oldAR, newAR)) + if oldAR.Length() > newAR.Length() { + panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR)) } if oldAR.Overlaps(newAR) { panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR)) @@ -710,8 +710,9 @@ func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { oldAR: pseg.Range(), pma: pseg.Value(), }) - mm.removeRSSLocked(pseg.Range()) pseg = mm.pmas.Remove(pseg).NextSegment() + // No RSS change is needed since we're re-inserting the same pmas + // below. } off := newAR.Start - oldAR.Start @@ -719,7 +720,6 @@ func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { for i := range movedPMAs { mpma := &movedPMAs[i] pmaNewAR := usermem.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off} - mm.addRSSLocked(pmaNewAR) pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap() } -- cgit v1.2.3 From ff7178a4d10f9f1fb34e54fed5ef27cfbff5d6f9 Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Wed, 19 Dec 2018 13:14:53 -0800 Subject: Implement pwritev2. Implement pwritev2 and associated unit tests. Clean up preadv2 unit tests. Tag RWF_ flags in both preadv2 and pwritev2 with associated bug tickets. 
PiperOrigin-RevId: 226222119 Change-Id: Ieb22672418812894ba114bbc88e67f1dd50de620 --- pkg/abi/linux/file.go | 7 +- pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_read.go | 22 +-- pkg/sentry/syscalls/linux/sys_write.go | 66 +++++++ test/syscalls/BUILD | 2 + test/syscalls/linux/BUILD | 27 ++- test/syscalls/linux/preadv2.cc | 156 ++++++++++----- test/syscalls/linux/pwritev2.cc | 337 +++++++++++++++++++++++++++++++++ test/util/test_util.h | 7 - 9 files changed, 553 insertions(+), 73 deletions(-) create mode 100644 test/syscalls/linux/pwritev2.cc (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index ac49ae9a6..ae33f4a4d 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -152,9 +152,10 @@ const ( // Values for preadv2/pwritev2. const ( - RWF_HIPRI = 0x0001 - RWF_DSYNC = 0X0002 - RWF_SYNC = 0x0004 + RWF_HIPRI = 0x00000001 + RWF_DSYNC = 0x00000002 + RWF_SYNC = 0x00000004 + RWF_VALID = RWF_HIPRI | RWF_DSYNC | RWF_SYNC ) // Stat represents struct stat. diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index cc5ebb955..e855590e6 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -377,7 +377,7 @@ var AMD64 = &kernel.SyscallTable{ // Syscalls after 325 are "backports" from versions of Linux after 4.4. // 326: CopyFileRange, 327: Preadv2, - // 328: Pwritev2, // Pwritev2, TODO + 328: Pwritev2, }, Emulate: map[usermem.Addr]uintptr{ diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index cbb9eb9f8..b6df4d9d4 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -188,14 +188,20 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // Preadv2 implements linux syscall preadv2(2). +// TODO: Implement RWF_HIPRI functionality. 
func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the syscall is + // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the linux internal call + // (https://elixir.bootlin.com/linux/v4.18/source/fs/read_write.c#L1248) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 5th argument. + fd := kdefs.FD(args[0].Int()) addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() - flags := int(args[4].Int()) - - validFlags := linux.RWF_HIPRI + flags := int(args[5].Int()) file := t.FDMap().GetFile(fd) if file == nil { @@ -219,14 +225,8 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Check flags field. - if flags != 0 { - if flags&^validFlags != 0 { - return 0, nil, syserror.EINVAL - } - // RWF_HIPRI must be called on a file with O_DIRECT flag set. - if flags&linux.RWF_HIPRI != 0 && !file.Flags().Direct { - return 0, nil, syserror.EINVAL - } + if flags&^linux.RWF_VALID != 0 { + return 0, nil, syserror.EOPNOTSUPP } // Read the iovecs that specify the destination of the read. diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 08e263112..750a098cd 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -187,6 +187,72 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) } +// Pwritev2 implements linux syscall pwritev2(2). +// TODO: Implement RWF_HIPRI functionality. +// TODO: Implement O_SYNC and D_SYNC functionality. 
+func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the syscall is + // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the linux internal call + // (https://elixir.bootlin.com/linux/v4.18/source/fs/read_write.c#L1354) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 5th argument. + + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := int(args[5].Int()) + + if int(args[4].Int())&0x4 == 1 { + return 0, nil, syserror.EACCES + } + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Is writing at an offset supported? + if offset > -1 && !file.Flags().Pwrite { + return 0, nil, syserror.ESPIPE + } + + if flags&^linux.RWF_VALID != 0 { + return uintptr(flags), nil, syserror.EOPNOTSUPP + } + + // Check that the file is writeable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + // If pwritev2 is called with an offset of -1, writev is called. 
+ if offset == -1 { + n, err := writev(t, file, src) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) + } + + n, err := pwritev(t, file, src, offset) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) +} + func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { n, err := f.Writev(t, src) if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 711b68c76..12c7049e7 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -219,6 +219,8 @@ syscall_test( test = "//test/syscalls/linux:pty_test", ) +syscall_test(test = "//test/syscalls/linux:pwritev2_test") + syscall_test(test = "//test/syscalls/linux:pwrite64_test") syscall_test(test = "//test/syscalls/linux:read_test") diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index aca55f492..f13e32daa 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1338,18 +1338,18 @@ cc_binary( name = "preadv2_test", testonly = 1, srcs = [ + "file_base.h", "preadv2.cc", - "readv_common.cc", - "readv_common.h", ], linkstatic = 1, deps = [ - ":file_base", "//test/util:file_descriptor", - "//test/util:memory_util", + "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", "@com_google_googletest//:gtest", ], ) @@ -1452,6 +1452,25 @@ cc_binary( ], ) +cc_binary( + name = "pwritev2_test", + testonly = 1, + srcs = [ + "pwritev2.cc", + ], + linkstatic = 1, + deps = [ + ":file_base", + "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "read_test", 
testonly = 1, diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc index 642eed624..58a4f9224 100644 --- a/test/syscalls/linux/preadv2.cc +++ b/test/syscalls/linux/preadv2.cc @@ -13,23 +13,18 @@ // limitations under the License. #include -#include #include #include #include -#include -#include -#include #include #include #include "gtest/gtest.h" #include "gtest/gtest.h" +#include "absl/memory/memory.h" #include "test/syscalls/linux/file_base.h" -#include "test/syscalls/linux/readv_common.h" #include "test/util/file_descriptor.h" -#include "test/util/memory_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -60,11 +55,17 @@ std::string SetContent() { return content; } +ssize_t preadv2(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, + off_t offset, unsigned long flags) { + // syscall on preadv2 does some weird things (see man syscall and search + // preadv2), so we insert a 0 to word align the flags argument on native. + return syscall(SYS_preadv2, fd, iov, iovcnt, offset, 0, flags); +} + // This test is the base case where we call preadv (no offset, no flags). 
TEST(Preadv2Test, TestBaseCall) { - if (!IsRunningOnGvisor()) { - SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); - } + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + std::string content = SetContent(); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( @@ -73,12 +74,13 @@ TEST(Preadv2Test, TestBaseCall) { ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); std::vector buf(kBufSize); - struct iovec iov; - iov.iov_base = buf.data(); - iov.iov_len = buf.size(); + struct iovec iov[2]; + iov[0].iov_base = buf.data(); + iov[0].iov_len = buf.size() / 2; + iov[1].iov_base = static_cast(iov[0].iov_base) + (content.size() / 2); + iov[1].iov_len = content.size() / 2; - EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt*/ 1, - /*offset=*/0, /*flags=*/0), + EXPECT_THAT(preadv2(fd.get(), iov, /*iovcnt*/ 2, /*offset=*/0, /*flags=*/0), SyscallSucceedsWithValue(kBufSize)); EXPECT_EQ(content, std::string(buf.data(), buf.size())); @@ -86,9 +88,8 @@ TEST(Preadv2Test, TestBaseCall) { // This test is where we call preadv with an offset and no flags. 
TEST(Preadv2Test, TestValidPositiveOffset) { - if (!IsRunningOnGvisor()) { - SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); - } + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + std::string content = SetContent(); const std::string prefix = "0"; @@ -102,10 +103,12 @@ TEST(Preadv2Test, TestValidPositiveOffset) { iov.iov_base = buf.data(); iov.iov_len = buf.size(); - EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, - /*offset=*/prefix.size(), /*flags=*/0), + EXPECT_THAT(preadv2(fd.get(), &iov, /*iovcnt=*/1, /*offset=*/prefix.size(), + /*flags=*/0), SyscallSucceedsWithValue(kBufSize)); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + EXPECT_EQ(content, std::string(buf.data(), buf.size())); } @@ -113,9 +116,8 @@ TEST(Preadv2Test, TestValidPositiveOffset) { // read should use the file offset, so the test increments it by one prior to // calling preadv2. TEST(Preadv2Test, TestNegativeOneOffset) { - if (!IsRunningOnGvisor()) { - SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); - } + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + std::string content = SetContent(); const std::string prefix = "231"; @@ -123,6 +125,7 @@ TEST(Preadv2Test, TestNegativeOneOffset) { GetAbsoluteTestTmpdir(), prefix + content, TempPath::kDefaultFileMode)); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + ASSERT_THAT(lseek(fd.get(), prefix.size(), SEEK_SET), SyscallSucceedsWithValue(prefix.size())); @@ -131,79 +134,111 @@ TEST(Preadv2Test, TestNegativeOneOffset) { iov.iov_base = buf.data(); iov.iov_len = buf.size(); - EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, - /*offset=*/static_cast(-1), /*flags=*/0), + EXPECT_THAT(preadv2(fd.get(), &iov, /*iovcnt=*/1, /*offset=*/-1, /*flags=*/0), SyscallSucceedsWithValue(kBufSize)); + EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), + SyscallSucceedsWithValue(prefix.size() + buf.size())); + EXPECT_EQ(content, 
std::string(buf.data(), buf.size())); } +// preadv2 requires if the RWF_HIPRI flag is passed, the fd must be opened with +// O_DIRECT. This test implements a correct call with the RWF_HIPRI flag. +TEST(Preadv2Test, TestCallWithRWF_HIPRI) { + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + + std::string content = SetContent(); + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), content, TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + EXPECT_THAT(fsync(fd.get()), SyscallSucceeds()); + + std::vector buf(kBufSize, '0'); + struct iovec iov; + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); + + EXPECT_THAT( + preadv2(fd.get(), &iov, /*iovcnt=*/1, /*offset=*/0, /*flags=*/RWF_HIPRI), + SyscallSucceedsWithValue(kBufSize)); + + EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); + + EXPECT_EQ(content, std::string(buf.data(), buf.size())); +} // This test calls preadv2 with an invalid flag. TEST(Preadv2Test, TestInvalidFlag) { - if (!IsRunningOnGvisor()) { - SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); - } + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY | O_DIRECT)); + std::vector buf(kBufSize, '0'); struct iovec iov; + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); - EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, - /*offset=*/0, /*flags=*/RWF_HIPRI << 1), - SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(preadv2(fd.get(), &iov, /*iovcnt=*/1, + /*offset=*/0, /*flags=*/0xF0), + SyscallFailsWithErrno(EOPNOTSUPP)); } // This test calls preadv2 with an invalid offset. 
TEST(Preadv2Test, TestInvalidOffset) { - if (!IsRunningOnGvisor()) { - SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); - } + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY | O_DIRECT)); - struct iovec iov; - EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, - /*offset=*/static_cast(-8), /*flags=*/RWF_HIPRI), + auto iov = absl::make_unique(1); + iov[0].iov_base = nullptr; + iov[0].iov_len = 0; + + EXPECT_THAT(preadv2(fd.get(), iov.get(), /*iovcnt=*/1, /*offset=*/-8, + /*flags=*/RWF_HIPRI), SyscallFailsWithErrno(EINVAL)); } // This test calls preadv with a file set O_WRONLY. TEST(Preadv2Test, TestUnreadableFile) { - if (!IsRunningOnGvisor()) { - SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); - } + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY)); - struct iovec iov; - EXPECT_THAT(syscall(SYS_preadv2, fd.get(), &iov, /*iov_cnt=*/1, + auto iov = absl::make_unique(1); + iov[0].iov_base = nullptr; + iov[0].iov_len = 0; + + EXPECT_THAT(preadv2(fd.get(), iov.get(), /*iovcnt=*/1, /*offset=*/0, /*flags=*/0), SyscallFailsWithErrno(EBADF)); } // Calling preadv2 with a non-negative offset calls preadv. Calling preadv with // an unseekable file is not allowed. A pipe is used for an unseekable file. 
-TEST(Preadv2Test, TestUnseekableFile) { - if (!IsRunningOnGvisor()) { - SKIP_BEFORE_KERNEL(/*major_version=*/4, /*minor_version=*/6); - } +TEST(Preadv2Test, TestUnseekableFileInvalid) { + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); int pipe_fds[2]; ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); - struct iovec iov; + auto iov = absl::make_unique(1); + iov[0].iov_base = nullptr; + iov[0].iov_len = 0; - EXPECT_THAT(syscall(SYS_preadv2, pipe_fds[0], &iov, /*iov_cnt=*/1, + EXPECT_THAT(preadv2(pipe_fds[0], iov.get(), /*iovcnt=*/1, /*offset=*/2, /*flags=*/0), SyscallFailsWithErrno(ESPIPE)); @@ -211,6 +246,33 @@ TEST(Preadv2Test, TestUnseekableFile) { EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds()); } +TEST(Preadv2Test, TestUnseekableFileValid) { + SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + + int pipe_fds[2]; + + ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); + + std::vector content(32, 'X'); + + EXPECT_THAT(write(pipe_fds[1], content.data(), content.size()), + SyscallSucceedsWithValue(content.size())); + + std::vector buf(content.size()); + auto iov = absl::make_unique(1); + iov[0].iov_base = buf.data(); + iov[0].iov_len = buf.size(); + + EXPECT_THAT(preadv2(pipe_fds[0], iov.get(), /*iovcnt=*/1, + /*offset=*/static_cast(-1), /*flags=*/0), + SyscallSucceedsWithValue(buf.size())); + + EXPECT_EQ(content, buf); + + EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds()); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc new file mode 100644 index 000000000..a6949f08e --- /dev/null +++ b/test/syscalls/linux/pwritev2.cc @@ -0,0 +1,337 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#ifndef SYS_pwritev2
+#if defined(__x86_64__)
+#define SYS_pwritev2 328
+#else
+#error "Unknown architecture"
+#endif
+#endif  // SYS_pwrite2
+
+#ifndef RWF_HIPRI
+#define RWF_HIPRI 0x1
+#endif  // RWF_HIPRI
+
+#ifndef RWF_DSYNC
+#define RWF_DSYNC 0x2
+#endif  // RWF_DSYNC
+
+#ifndef RWF_SYNC
+#define RWF_SYNC 0x4
+#endif  // RWF_SYNC
+
+constexpr int kBufSize = 1024;
+
+void SetContent(std::vector<char>& content) {
+  for (uint i = 0; i < content.size(); i++) {
+    content[i] = static_cast<char>((i % 10) + '0');
+  }
+}
+
+ssize_t pwritev2(unsigned long fd, const struct iovec* iov,
+                 unsigned long iovcnt, off_t offset, unsigned long flags) {
+  // syscall on pwritev2 does some weird things (see man syscall and search
+  // pwritev2), so we insert a 0 to word align the flags argument on native.
+  return syscall(SYS_pwritev2, fd, iov, iovcnt, offset, 0, flags);
+}
+
+// This test is the base case where we call pwritev (no offset, no flags).
+TEST(Writev2Test, TestBaseCall) {
+  SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  std::vector<char> content(kBufSize);
+  SetContent(content);
+  struct iovec iov[2];
+  iov[0].iov_base = content.data();
+  iov[0].iov_len = content.size() / 2;
+  iov[1].iov_base = static_cast<char*>(iov[0].iov_base) + (content.size() / 2);
+  iov[1].iov_len = content.size() / 2;
+
+  ASSERT_THAT(pwritev2(fd.get(), iov, /*iovcnt=*/2,
+                       /*offset=*/0, /*flags=*/0),
+              SyscallSucceedsWithValue(kBufSize));
+
+  std::vector<char> buf(kBufSize);
+  EXPECT_THAT(read(fd.get(), buf.data(), kBufSize),
+              SyscallSucceedsWithValue(kBufSize));
+
+  EXPECT_EQ(content, buf);
+}
+
+// This test is where we call pwritev2 with a positive offset and no flags.
+TEST(Pwritev2Test, TestValidPositiveOffset) {
+  SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+  std::string prefix(kBufSize, '0');
+
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), prefix, TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  std::vector<char> content(kBufSize);
+  SetContent(content);
+  struct iovec iov;
+  iov.iov_base = content.data();
+  iov.iov_len = content.size();
+
+  ASSERT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+                       /*offset=*/prefix.size(), /*flags=*/0),
+              SyscallSucceedsWithValue(content.size()));
+
+  std::vector<char> buf(prefix.size() + content.size());
+  EXPECT_THAT(read(fd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  std::vector<char> want(prefix.begin(), prefix.end());
+  want.insert(want.end(), content.begin(), content.end());
+  EXPECT_EQ(want, buf);
+}
+
+// This test is the base case where we call writev by using -1 as the offset.
+// The write should use the file offset, so the test increments the file offset
+// prior to call pwritev2.
+TEST(Pwritev2Test, TestNegativeOneOffset) {
+  SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+  const std::string prefix = "00";
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), prefix.data(), TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+  ASSERT_THAT(lseek(fd.get(), prefix.size(), SEEK_SET),
+              SyscallSucceedsWithValue(prefix.size()));
+
+  std::vector<char> content(kBufSize);
+  SetContent(content);
+  struct iovec iov;
+  iov.iov_base = content.data();
+  iov.iov_len = content.size();
+
+  ASSERT_THAT(pwritev2(fd.get(), &iov, /*iovcnt*/ 1,
+                       /*offset=*/static_cast<off_t>(-1), /*flags=*/0),
+              SyscallSucceedsWithValue(content.size()));
+
+  ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR),
+              SyscallSucceedsWithValue(prefix.size() + content.size()));
+
+  std::vector<char> buf(prefix.size() + content.size());
+  EXPECT_THAT(pread(fd.get(), buf.data(), buf.size(), /*offset=*/0),
+              SyscallSucceedsWithValue(buf.size()));
+
+  std::vector<char> want(prefix.begin(), prefix.end());
+  want.insert(want.end(), content.begin(), content.end());
+  EXPECT_EQ(want, buf);
+}
+
+// pwritev2 requires if the RWF_HIPRI flag is passed, the fd must be opened with
+// O_DIRECT. This test implements a correct call with the RWF_HIPRI flag.
+TEST(Pwritev2Test, TestCallWithRWF_HIPRI) {
+  SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  std::vector<char> content(kBufSize);
+  SetContent(content);
+  struct iovec iov;
+  iov.iov_base = content.data();
+  iov.iov_len = content.size();
+
+  EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+                       /*offset=*/0, /*flags=*/RWF_HIPRI),
+              SyscallSucceedsWithValue(kBufSize));
+
+  std::vector<char> buf(content.size());
+  EXPECT_THAT(read(fd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  EXPECT_EQ(buf, content);
+}
+
+// This test checks that pwritev2 can be called with valid flags
+TEST(Pwritev2Test, TestCallWithValidFlags) {
+  SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  std::vector<char> content(kBufSize, '0');
+  struct iovec iov;
+  iov.iov_base = content.data();
+  iov.iov_len = content.size();
+
+  EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+                       /*offset=*/0, /*flags=*/RWF_DSYNC),
+              SyscallSucceedsWithValue(kBufSize));
+
+  std::vector<char> buf(content.size());
+  EXPECT_THAT(read(fd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  EXPECT_EQ(buf, content);
+
+  SetContent(content);
+
+  EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+                       /*offset=*/0, /*flags=*/0x4),
+              SyscallSucceedsWithValue(kBufSize));
+
+  ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR),
+              SyscallSucceedsWithValue(content.size()));
+
+  EXPECT_THAT(pread(fd.get(), buf.data(), buf.size(), /*offset=*/0),
+              SyscallSucceedsWithValue(buf.size()));
+
+  EXPECT_EQ(buf, content);
+}
+
+// This test
calls pwritev2 with a bad file descriptor.
+TEST(Writev2Test, TestBadFile) {
+  SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+  ASSERT_THAT(pwritev2(/*fd=*/-1, /*iov=*/nullptr, /*iovcnt=*/0,
+                       /*offset=*/0, /*flags=*/0),
+              SyscallFailsWithErrno(EBADF));
+}
+
+// This test calls pwrite2 with an invalid offset.
+TEST(Pwritev2Test, TestInvalidOffset) {
+  SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  struct iovec iov;
+  iov.iov_base = nullptr;
+
+  EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+                       /*offset=*/static_cast<off_t>(-8), /*flags=*/0),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(Pwritev2Test, TestUnseekableFileValid) {
+  SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+  int pipe_fds[2];
+
+  ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+  std::vector<char> content(32, '0');
+  SetContent(content);
+  struct iovec iov;
+  iov.iov_base = content.data();
+  iov.iov_len = content.size();
+
+  EXPECT_THAT(pwritev2(pipe_fds[1], &iov, /*iovcnt=*/1,
+                       /*offset=*/static_cast<off_t>(-1), /*flags=*/0),
+              SyscallSucceedsWithValue(content.size()));
+
+  std::vector<char> buf(content.size());
+  EXPECT_THAT(read(pipe_fds[0], buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  EXPECT_EQ(content, buf);
+
+  EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds());
+  EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds());
+}
+
+// Calling pwritev2 with a non-negative offset calls pwritev. Calling pwritev
+// with an unseekable file is not allowed. A pipe is used for an unseekable
+// file.
+TEST(Pwritev2Test, TestUnseekableFileInValid) { + SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + + int pipe_fds[2]; + struct iovec iov; + iov.iov_base = nullptr; + + ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); + + EXPECT_THAT(pwritev2(pipe_fds[1], &iov, /*iovcnt=*/1, + /*offset=*/2, /*flags=*/0), + SyscallFailsWithErrno(ESPIPE)); + + EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds()); +} + +TEST(Pwritev2Test, TestReadOnlyFile) { + SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + struct iovec iov; + iov.iov_base = nullptr; + + EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1, + /*offset=*/0, /*flags=*/0), + SyscallFailsWithErrno(EBADF)); +} + +// This test calls pwritev2 with an invalid flag. 
+TEST(Pwritev2Test, TestInvalidFlag) { + SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( + GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR | O_DIRECT)); + + struct iovec iov; + iov.iov_base = nullptr; + + EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1, + /*offset=*/0, /*flags=*/0xF0), + SyscallFailsWithErrno(EOPNOTSUPP)); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/util/test_util.h b/test/util/test_util.h index 2a7609e5c..cd71fdd64 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -217,13 +217,6 @@ void TestInit(int* argc, char*** argv); } \ } while (0) -#define SKIP_BEFORE_KERNEL(maj, min) \ - do { \ - auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion()); \ - SKIP_IF(version.major < (maj) || \ - (version.major == (maj) && version.minor < (min))); \ - } while (0) - enum class Platform { kNative, kKVM, -- cgit v1.2.3 From 86c9bd254749ebf65270aa60f728d9c847ac02d4 Mon Sep 17 00:00:00 2001 From: Googler Date: Wed, 19 Dec 2018 13:29:10 -0800 Subject: Automated rollback of changelist 225861605 PiperOrigin-RevId: 226224230 Change-Id: Id24c7d3733722fd41d5fe74ef64e0ce8c68f0b12 --- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/mm.go | 12 - pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/memmap/memmap.go | 37 --- pkg/sentry/mm/BUILD | 1 - pkg/sentry/mm/address_space.go | 12 +- pkg/sentry/mm/lifecycle.go | 24 +- pkg/sentry/mm/mm.go | 24 +- pkg/sentry/mm/syscalls.go | 423 +++++--------------------------- pkg/sentry/mm/vma.go | 38 --- pkg/sentry/syscalls/linux/linux64.go | 15 +- pkg/sentry/syscalls/linux/sys_mmap.go | 106 +++----- pkg/sentry/syscalls/linux/sys_rlimit.go | 1 - runsc/boot/limits.go | 4 +- test/syscalls/linux/BUILD | 15 -- test/syscalls/linux/mlock.cc | 344 -------------------------- 
test/syscalls/linux/msync.cc | 20 +- 18 files changed, 135 insertions(+), 947 deletions(-) delete mode 100644 test/syscalls/linux/mlock.cc (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index e0aa5b31d..b2e51b9bd 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -60,7 +60,7 @@ const ( DefaultNofileHardLimit = 4096 // DefaultMemlockLimit is called MLOCK_LIMIT in Linux. - DefaultMemlockLimit = 64 * 1024 + DefaultMemlockLimit = 64 * 1094 // DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux. DefaultMsgqueueLimit = 819200 diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index eda8d9788..3fcdf8235 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -49,18 +49,6 @@ const ( MREMAP_FIXED = 1 << 1 ) -// Flags for mlock2(2). -const ( - MLOCK_ONFAULT = 0x01 -) - -// Flags for mlockall(2). -const ( - MCL_CURRENT = 1 - MCL_FUTURE = 2 - MCL_ONFAULT = 4 -) - // Advice for madvise(2). const ( MADV_NORMAL = 0 diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index eeca01876..ba0b7d4fd 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryLocked + MemoryPagesLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 295f9c398..511db6733 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryLocked, + linux.RLIMIT_MEMLOCK: MemoryPagesLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index cf20b11e3..28e2bed9b 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -243,40 +243,6 @@ type MappingIdentity interface { 
Msync(ctx context.Context, mr MappableRange) error } -// MLockMode specifies the memory locking behavior of a memory mapping. -type MLockMode int - -// Note that the ordering of MLockModes is significant; see -// mm.MemoryManager.defMLockMode. -const ( - // MLockNone specifies that a mapping has no memory locking behavior. - // - // This must be the zero value for MLockMode. - MLockNone MLockMode = iota - - // MLockEager specifies that a mapping is memory-locked, as by mlock() or - // similar. Pages in the mapping should be made, and kept, resident in - // physical memory as soon as possible. - // - // As of this writing, MLockEager does not cause memory-locking to be - // requested from the host; it only affects the sentry's memory management - // behavior. - // - // MLockEager is analogous to Linux's VM_LOCKED. - MLockEager - - // MLockLazy specifies that a mapping is memory-locked, as by mlock() or - // similar. Pages in the mapping should be kept resident in physical memory - // once they have been made resident due to e.g. a page fault. - // - // As of this writing, MLockLazy does not cause memory-locking to be - // requested from the host; in fact, it has virtually no effect, except for - // interactions between mlocked pages and other syscalls. - // - // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. - MLockLazy -) - // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. @@ -337,9 +303,6 @@ type MMapOpts struct { // mapping (see platform.AddressSpace.MapFile). Precommit bool - // MLockMode specifies the memory locking behavior of the mapping. - MLockMode MLockMode - // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. 
// diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 5a9185e5d..744e73a39 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,7 +106,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", - "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index e7aa24c69..7488f7c4a 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() { // for all addresses in ar should be precommitted. // // Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). +// ar must be page-aligned. pseg.Range().Contains(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. @@ -173,9 +173,7 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre } } - // Since this checks ar.End and not mapAR.End, we will never map a pma that - // is not required. - for pseg.Ok() && pseg.Start() < ar.End { + for { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) @@ -186,9 +184,13 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } + // Since this checks ar.End and not mapAR.End, we will never map a pma + // that is not required. + if ar.End <= pmaAR.End { + return nil + } pseg = pseg.NextSegment() } - return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. 
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a42e32b43..1613ce11d 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -59,17 +58,13 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ - p: mm.p, - haveASIO: mm.haveASIO, - layout: mm.layout, - privateRefs: mm.privateRefs, - users: 1, - brk: mm.brk, - usageAS: mm.usageAS, - // "The child does not inherit its parent's memory locks (mlock(2), - // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is - // MLockNone, both of which are zero values. vma.mlockMode is reset - // when copied below. + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + usageAS: mm.usageAS, + brk: mm.brk, captureInvalidations: true, argv: mm.argv, envv: mm.envv, @@ -82,7 +77,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // Copy vmas. dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { - vma := srcvseg.Value() // makes a copy of the vma + vma := srcvseg.ValuePtr() vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. 
if vma.mappable != nil { @@ -94,8 +89,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if vma.id != nil { vma.id.IncRef() } - vma.mlockMode = memmap.MLockNone - dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index c0632d232..b1e39e898 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -95,29 +95,17 @@ type MemoryManager struct { // vmas is protected by mappingMu. vmas vmaSet - // brk is the mm's brk, which is manipulated using the brk(2) system call. - // The brk is initially set up by the loader which maps an executable - // binary into the mm. - // - // brk is protected by mappingMu. - brk usermem.AddrRange - // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. // // usageAS is protected by mappingMu. usageAS uint64 - // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != - // memmap.MLockNone. - // - // lockedAS is protected by mappingMu. - lockedAS uint64 - - // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or - // defMLockMode is greater. + // brk is the mm's brk, which is manipulated using the brk(2) system call. + // The brk is initially set up by the loader which maps an executable + // binary into the mm. // - // defMLockMode is protected by mappingMu. - defMLockMode memmap.MLockMode + // brk is protected by mappingMu. + brk usermem.AddrRange // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. @@ -264,8 +252,6 @@ type vma struct { // metag, none of which we currently support. growsDown bool `state:"manual"` - mlockMode memmap.MLockMode - // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. 
id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 383703ec3..daaae4da1 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,7 +20,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -129,24 +128,16 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // Get the new vma. mm.mappingMu.Lock() - if opts.MLockMode < mm.defMLockMode { - opts.MLockMode = mm.defMLockMode - } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } - // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new - // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears - // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in - // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => - // populate_vma_page_range(). Confirm this behavior. switch { - case opts.Precommit || opts.MLockMode == memmap.MLockEager: + case opts.Precommit: // Get pmas and map with precommit as requested. - mm.populateVMAAndUnlock(ctx, vseg, ar, true) + mm.populateAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE: Get pmas and map eagerly in the hope @@ -155,7 +146,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. 
- mm.populateVMAAndUnlock(ctx, vseg, ar, false) + mm.populateAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() @@ -164,29 +155,31 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return ar.Start, nil } -// populateVMA obtains pmas for addresses in ar in the given vma, and maps them -// into mm.as if it is active. +// Preconditions: mm.mappingMu must be locked for writing. // -// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). -func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. + mm.mappingMu.Unlock() return } mm.activeMu.Lock() - // Can't defer mm.activeMu.Unlock(); see below. - // Even if we get new pmas, we can't actually map them if we don't have an + // Even if we get a new pma, we can't actually map it if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() + mm.mappingMu.Unlock() return } // Ensure that we have usable pmas. + mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when @@ -204,45 +197,6 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u mm.activeMu.RUnlock() } -// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally -// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is -// preferable to populateVMA since it unlocks mm.mappingMu before performing -// expensive operations that don't require it to be locked. 
-// -// Preconditions: mm.mappingMu must be locked for writing. -// vseg.Range().IsSupersetOf(ar). -// -// Postconditions: mm.mappingMu will be unlocked. -func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { - // See populateVMA above for commentary. - if !vseg.ValuePtr().effectivePerms.Any() { - mm.mappingMu.Unlock() - return - } - - mm.activeMu.Lock() - - if mm.as == nil { - mm.activeMu.Unlock() - mm.mappingMu.Unlock() - return - } - - // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it - // isn't needed at all for mapASLocked. - mm.mappingMu.DowngradeLock() - pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) - mm.mappingMu.RUnlock() - if err != nil { - mm.activeMu.Unlock() - return - } - - mm.activeMu.DowngradeLock() - mm.mapASLocked(pseg, ar, precommit) - mm.activeMu.RUnlock() -} - // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. @@ -282,7 +236,6 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error MaxPerms: usermem.AnyAccess, Private: true, GrowsDown: true, - MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err @@ -381,19 +334,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. - if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, - // mremap in Linux does not check mm/mlock.c:can_do_mlock() and - // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and - // !CAP_IPC_LOCK. 
- mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { - return 0, syserror.EAGAIN - } - } - } - if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all @@ -420,7 +360,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.mappable != nil { newOffset = vseg.mappableRange().End } - vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, @@ -431,13 +371,9 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, - MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, ar, true) - } return oldAddr, nil } // In-place growth failed. 
In the MRemapMayMove case, fall through to @@ -526,14 +462,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.id != nil { vma.id.IncRef() } - vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.vmas.Add(newAR, vma) mm.usageAS += uint64(newAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS += uint64(newAR.Length()) - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, newAR, true) - } - } return newAR.Start, nil } @@ -555,11 +485,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.vmas.Add(newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) - } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required @@ -574,10 +501,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, newAR, true) - } - return newAR.Start, nil } @@ -688,10 +611,9 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. + defer mm.mappingMu.Unlock() if addr < mm.brk.Start { - mm.mappingMu.Unlock() return mm.brk.End, syserror.EINVAL } @@ -701,24 +623,21 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // heap + data + bss. 
The segment sizes need to be plumbed from the // loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { - mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { - mm.mappingMu.Unlock() return mm.brk.End, syserror.EFAULT } switch { case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) - mm.mappingMu.Unlock() case oldbrkpg < newbrkpg: - vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, @@ -727,221 +646,17 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, Private: true, - // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes - // mm->def_flags. - MLockMode: mm.defMLockMode, - Hint: "[heap]", + Hint: "[heap]", }) if err != nil { - mm.mappingMu.Unlock() return mm.brk.End, err } - if mm.defMLockMode == memmap.MLockEager { - mm.populateVMAAndUnlock(ctx, vseg, ar, true) - } else { - mm.mappingMu.Unlock() - } - - default: - // Nothing to do. - mm.mappingMu.Unlock() } mm.brk.End = addr return addr, nil } -// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), -// depending on mode. -func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { - // Linux allows this to overflow. - la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() - ar, ok := addr.RoundDown().ToRange(uint64(la)) - if !ok { - return syserror.EINVAL - } - - mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. - - if mode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. 
- if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - mm.mappingMu.Unlock() - return syserror.EPERM - } - if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - } - } - - // Check this after RLIMIT_MEMLOCK for consistency with Linux. - if ar.Length() == 0 { - mm.mappingMu.Unlock() - return nil - } - - // Apply the new mlock mode to vmas. - var unmapped bool - vseg := mm.vmas.FindSegment(ar.Start) - for { - if !vseg.Ok() { - unmapped = true - break - } - vseg = mm.vmas.Isolate(vseg, ar) - vma := vseg.ValuePtr() - prevMode := vma.mlockMode - vma.mlockMode = mode - if mode != memmap.MLockNone && prevMode == memmap.MLockNone { - mm.lockedAS += uint64(vseg.Range().Length()) - } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { - mm.lockedAS -= uint64(vseg.Range().Length()) - } - if ar.End <= vseg.End() { - break - } - vseg, _ = vseg.NextNonEmpty() - } - mm.vmas.MergeRange(ar) - mm.vmas.MergeAdjacent(ar) - if unmapped { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - - if mode == memmap.MLockEager { - // Ensure that we have usable pmas. Since we didn't return ENOMEM - // above, ar must be fully covered by vmas, so we can just use - // NextSegment below. - mm.activeMu.Lock() - mm.mappingMu.DowngradeLock() - for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - if !vseg.ValuePtr().effectivePerms.Any() { - // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this - // case, which is converted to ENOMEM by mlock. 
- mm.activeMu.Unlock() - mm.mappingMu.RUnlock() - return syserror.ENOMEM - } - _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) - if err != nil { - mm.activeMu.Unlock() - mm.mappingMu.RUnlock() - // Linux: mm/mlock.c:__mlock_posix_error_return() - if err == syserror.EFAULT { - return syserror.ENOMEM - } - if err == syserror.ENOMEM { - return syserror.EAGAIN - } - return err - } - } - - // Map pmas into the active AddressSpace, if we have one. - mm.mappingMu.RUnlock() - if mm.as != nil { - mm.activeMu.DowngradeLock() - err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) - mm.activeMu.RUnlock() - if err != nil { - return err - } - } else { - mm.activeMu.Unlock() - } - } else { - mm.mappingMu.Unlock() - } - - return nil -} - -// MLockAllOpts holds options to MLockAll. -type MLockAllOpts struct { - // If Current is true, change the memory-locking behavior of all mappings - // to Mode. If Future is true, upgrade the memory-locking behavior of all - // future mappings to Mode. At least one of Current or Future must be true. - Current bool - Future bool - Mode memmap.MLockMode -} - -// MLockAll implements the semantics of Linux's mlockall()/munlockall(), -// depending on opts. -func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { - if !opts.Current && !opts.Future { - return syserror.EINVAL - } - - mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. - - if opts.Current { - if opts.Mode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. 
- if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - mm.mappingMu.Unlock() - return syserror.EPERM - } - if uint64(mm.vmas.Span()) > mlockLimit { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - } - } - for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { - vma := vseg.ValuePtr() - prevMode := vma.mlockMode - vma.mlockMode = opts.Mode - if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { - mm.lockedAS += uint64(vseg.Range().Length()) - } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { - mm.lockedAS -= uint64(vseg.Range().Length()) - } - } - } - - if opts.Future { - mm.defMLockMode = opts.Mode - } - - if opts.Current && opts.Mode == memmap.MLockEager { - // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() - // ignores the return value of __mm_populate(), so all errors below are - // ignored. - // - // Try to get usable pmas. - mm.activeMu.Lock() - mm.mappingMu.DowngradeLock() - for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { - if vseg.ValuePtr().effectivePerms.Any() { - mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) - } - } - - // Map all pmas into the active AddressSpace, if we have one. - mm.mappingMu.RUnlock() - if mm.as != nil { - mm.activeMu.DowngradeLock() - mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) - mm.activeMu.RUnlock() - } else { - mm.activeMu.Unlock() - } - } else { - mm.mappingMu.Unlock() - } - return nil -} - // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). 
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) @@ -965,49 +680,46 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) + vseg := mm.vmas.LowerBoundSegment(ar.Start) mem := mm.p.Memory() - for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - vma := vseg.ValuePtr() - if vma.mlockMode != memmap.MLockNone { - return syserror.EINVAL - } - vsegAR := vseg.Range().Intersect(ar) - // pseg should already correspond to either this vma or a later one, - // since there can't be a pma without a corresponding vma. - if checkInvariants { - if pseg.Ok() && pseg.End() <= vsegAR.Start { - panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) - } - } - for pseg.Ok() && pseg.Start() < vsegAR.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { - psegAR := pseg.Range().Intersect(ar) - if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue - } - // If an error occurs, fall through to the general - // invalidation case below. + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(psegAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) + } + if psegAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) } } - pseg = mm.pmas.Isolate(pseg, vsegAR) - pma = pseg.ValuePtr() - if !didUnmapAS { - // Unmap all of ar, not just pseg.Range(), to minimize host - // syscalls. AddressSpace mappings must be removed before - // mm.decPrivateRef(). 
- mm.unmapASLocked(ar) - didUnmapAS = true - } - if pma.private { - mm.decPrivateRef(pseg.fileRange()) + if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. } - pma.file.DecRef(pseg.fileRange()) - mm.removeRSSLocked(pseg.Range()) - pseg = mm.pmas.Remove(pseg).NextSegment() } + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + + pseg = mm.pmas.Remove(pseg).NextSegment() } // "If there are some parts of the specified address space that are not @@ -1020,28 +732,9 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { return nil } -// MSyncOpts holds options to MSync. -type MSyncOpts struct { - // Sync has the semantics of MS_SYNC. - Sync bool - - // Invalidate has the semantics of MS_INVALIDATE. - Invalidate bool -} - -// MSync implements the semantics of Linux's msync(). -func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { - if addr != addr.RoundDown() { - return syserror.EINVAL - } - if length == 0 { - return nil - } - la, ok := usermem.Addr(length).RoundUp() - if !ok { - return syserror.ENOMEM - } - ar, ok := addr.ToRange(uint64(la)) +// Sync implements the semantics of Linux's msync(MS_SYNC). 
+func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) if !ok { return syserror.ENOMEM } @@ -1066,14 +759,10 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui } lastEnd = vseg.End() vma := vseg.ValuePtr() - if opts.Invalidate && vma.mlockMode != memmap.MLockNone { - mm.mappingMu.RUnlock() - return syserror.EBUSY - } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. - if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { + if id := vma.id; id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 28ba9f2f5..5c2c802f6 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -17,10 +17,8 @@ package mm import ( "fmt" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -55,23 +53,6 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM } - if opts.MLockMode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. 
- if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM - } - newLockedAS := mm.lockedAS + opts.Length - if opts.Unmap { - newLockedAS -= mm.mlockedBytesRangeLocked(ar) - } - if newLockedAS > mlockLimit { - return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN - } - } - } - // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). @@ -104,14 +85,10 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, - mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, }) mm.usageAS += opts.Length - if opts.MLockMode != memmap.MLockNone { - mm.lockedAS += opts.Length - } return vseg, ar, nil } @@ -224,17 +201,6 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return 0, syserror.ENOMEM } -// Preconditions: mm.mappingMu must be locked. -func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { - var total uint64 - for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - if vseg.ValuePtr().mlockMode != memmap.MLockNone { - total += uint64(vseg.Range().Intersect(ar).Length()) - } - } - return total -} - // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). 
It returns: // @@ -372,9 +338,6 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS -= uint64(vmaAR.Length()) - } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } @@ -405,7 +368,6 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || - vma1.mlockMode != vma2.mlockMode || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index e855590e6..7a5c93f9b 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{ 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, - 149: Mlock, - 150: Munlock, - 151: Mlockall, - 152: Munlockall, + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: syscalls.Error(nil), // Mlock, TODO + 150: syscalls.Error(nil), // Munlock, TODO + 151: syscalls.Error(nil), // Mlockall, TODO + 152: syscalls.Error(nil), // Munlockall, TODO 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, 154: syscalls.Error(syscall.EPERM), // ModifyLdt, 155: syscalls.Error(syscall.EPERM), // PivotRoot, @@ -373,9 +373,8 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO - 325: Mlock2, - // Syscalls after 325 are "backports" from versions of Linux after 4.4. - // 326: CopyFileRange, + // Syscalls after 325 are backports from 4.6. 
+ 325: syscalls.Error(nil), // Mlock2, TODO 327: Preadv2, 328: Pwritev2, }, diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 8732861e0..145f7846c 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -69,9 +69,6 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } - if linux.MAP_LOCKED&flags != 0 { - opts.MLockMode = memmap.MLockEager - } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef() @@ -387,6 +384,16 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall length := args[1].SizeT() flags := args[2].Int() + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with @@ -399,72 +406,39 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, syserror.EINVAL } - err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ - Sync: sync, - Invalidate: flags&linux.MS_INVALIDATE != 0, - }) - // MSync calls fsync, the same interrupt conversion rules apply, see - // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) -} - -// Mlock implements linux syscall mlock(2). 
-func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) -} -// Mlock2 implements linux syscall mlock2(2). -func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - flags := args[2].Int() - - if flags&^(linux.MLOCK_ONFAULT) != 0 { + // MS_INVALIDATE "asks to invalidate other mappings of the same file (so + // that they can be updated with the fresh values just written)". This is a + // no-op given that shared memory exists. However, MS_INVALIDATE can also + // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, + // and a memory lock exists for the specified address range." Given that + // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since + // some user program could be using it for synchronization. + if flags&linux.MS_INVALIDATE != 0 { return 0, nil, syserror.EINVAL } - - mode := memmap.MLockEager - if flags&linux.MLOCK_ONFAULT != 0 { - mode = memmap.MLockLazy - } - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) -} - -// Munlock implements linux syscall munlock(2). -func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) -} - -// Mlockall implements linux syscall mlockall(2). -func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - flags := args[0].Int() - - if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { - return 0, nil, syserror.EINVAL + // MS_SYNC "requests an update and waits for it to complete." 
+ if sync { + err := t.MemoryManager().Sync(t, addr, uint64(la)) + // Sync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + // MS_ASYNC "specifies that an update be scheduled, but the call returns + // immediately". As long as dirty pages are tracked and eventually written + // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC + // is in fact a no-op, since the kernel properly tracks dirty pages and + // flushes them to storage as necessary.") + // + // However: "ENOMEM: The indicated memory (or part of it) was not mapped." + // This applies even for MS_ASYNC. + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM } - - mode := memmap.MLockEager - if flags&linux.MCL_ONFAULT != 0 { - mode = memmap.MLockLazy + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM } - return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ - Current: flags&linux.MCL_CURRENT != 0, - Future: flags&linux.MCL_FUTURE != 0, - Mode: mode, - }) -} - -// Munlockall implements linux syscall munlockall(2). -func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ - Current: true, - Future: true, - Mode: memmap.MLockNone, - }) + return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index b0b216045..2f16e1791 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,7 +90,6 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, - limits.MemoryLocked: {}, limits.Stack: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. 
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index e3e716bf9..8ecda6d0e 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -29,7 +29,7 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_DATA": limits.Data, "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_LOCKS": limits.Locks, - "RLIMIT_MEMLOCK": limits.MemoryLocked, + "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, "RLIMIT_NICE": limits.Nice, "RLIMIT_NOFILE": limits.NumberOfFiles, @@ -55,7 +55,7 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) - ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536}) ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index f13e32daa..c0b8246b5 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1019,21 +1019,6 @@ cc_binary( ], ) -cc_binary( - name = "mlock_test", - testonly = 1, - srcs = ["mlock.cc"], - linkstatic = 1, - deps = [ - "//test/util:capability_util", - "//test/util:cleanup", - "//test/util:memory_util", - "//test/util:multiprocess_util", - "//test/util:test_util", - "@com_google_googletest//:gtest", - ], -) - cc_binary( name = "mmap_test", testonly = 1, diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc deleted file mode 100644 index a0d876c2e..000000000 --- a/test/syscalls/linux/mlock.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright 2018 
Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include - -#include "test/util/capability_util.h" -#include "test/util/cleanup.h" -#include "test/util/memory_util.h" -#include "test/util/multiprocess_util.h" -#include "test/util/test_util.h" - -using ::testing::_; - -namespace gvisor { -namespace testing { - -namespace { - -PosixErrorOr CanMlock() { - struct rlimit rlim; - if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { - return PosixError(errno, "getrlimit(RLIMIT_MEMLOCK)"); - } - if (rlim.rlim_cur != 0) { - return true; - } - return HaveCapability(CAP_IPC_LOCK); -} - -// Returns true if the page containing addr is mlocked. -bool IsPageMlocked(uintptr_t addr) { - // This relies on msync(MS_INVALIDATE) interacting correctly with mlocked - // pages, which is tested for by the MsyncInvalidate case below. - int const rv = msync(reinterpret_cast(addr & ~(kPageSize - 1)), - kPageSize, MS_ASYNC | MS_INVALIDATE); - if (rv == 0) { - return false; - } - // This uses TEST_PCHECK_MSG since it's used in subprocesses. 
- TEST_PCHECK_MSG(errno == EBUSY, "msync failed with unexpected errno"); - return true; -} - -PosixErrorOr ScopedSetSoftRlimit(int resource, rlim_t newval) { - struct rlimit old_rlim; - if (getrlimit(resource, &old_rlim) != 0) { - return PosixError(errno, "getrlimit failed"); - } - struct rlimit new_rlim = old_rlim; - new_rlim.rlim_cur = newval; - if (setrlimit(resource, &new_rlim) != 0) { - return PosixError(errno, "setrlimit failed"); - } - return Cleanup([resource, old_rlim] { - TEST_PCHECK(setrlimit(resource, &old_rlim) == 0); - }); -} - -TEST(MlockTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(MlockTest, ProtNone) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = - ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(ENOMEM)); - // ENOMEM is returned because mlock can't populate the page, but it's still - // considered locked. 
- EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(MlockTest, MadviseDontneed) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_THAT(madvise(mapping.ptr(), mapping.len(), MADV_DONTNEED), - SyscallFailsWithErrno(EINVAL)); -} - -TEST(MlockTest, MsyncInvalidate) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_ASYNC | MS_INVALIDATE), - SyscallFailsWithErrno(EBUSY)); - EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_SYNC | MS_INVALIDATE), - SyscallFailsWithErrno(EBUSY)); -} - -TEST(MlockTest, Fork) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - EXPECT_THAT( - InForkedProcess([&] { TEST_CHECK(!IsPageMlocked(mapping.addr())); }), - IsPosixErrorOkAndHolds(0)); -} - -TEST(MlockTest, RlimitMemlockZero) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(EPERM)); -} - -TEST(MlockTest, RlimitMemlockInsufficient) { - if 
(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), - SyscallFailsWithErrno(ENOMEM)); -} - -TEST(MunlockTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -TEST(MunlockTest, NotLocked) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -// There is currently no test for mlockall(MCL_CURRENT) because the default -// RLIMIT_MEMLOCK of 64 KB is insufficient to actually invoke -// mlockall(MCL_CURRENT). - -TEST(MlockallTest, Future) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - - // Run this test in a separate (single-threaded) subprocess to ensure that a - // background thread doesn't try to mmap a large amount of memory, fail due - // to hitting RLIMIT_MEMLOCK, and explode the process violently. 
- EXPECT_THAT(InForkedProcess([] { - auto const mapping = - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE) - .ValueOrDie(); - TEST_CHECK(!IsPageMlocked(mapping.addr())); - TEST_PCHECK(mlockall(MCL_FUTURE) == 0); - // Ensure that mlockall(MCL_FUTURE) is turned off before the end - // of the test, as otherwise mmaps may fail unexpectedly. - Cleanup do_munlockall([] { TEST_PCHECK(munlockall() == 0); }); - auto const mapping2 = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - TEST_CHECK(IsPageMlocked(mapping2.addr())); - // Fire munlockall() and check that it disables - // mlockall(MCL_FUTURE). - do_munlockall.Release()(); - auto const mapping3 = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - TEST_CHECK(!IsPageMlocked(mapping2.addr())); - }), - IsPosixErrorOkAndHolds(0)); -} - -TEST(MunlockallTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(munlockall(), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -#ifndef SYS_mlock2 -#ifdef __x86_64__ -#define SYS_mlock2 325 -#endif -#endif - -#ifndef MLOCK_ONFAULT -#define MLOCK_ONFAULT 0x01 // Linux: include/uapi/asm-generic/mman-common.h -#endif - -#ifdef SYS_mlock2 - -int mlock2(void const* addr, size_t len, int flags) { - return syscall(SYS_mlock2, addr, len, flags); -} - -TEST(Mlock2Test, NoFlags) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), 0), SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(Mlock2Test, MlockOnfault) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); 
- auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); - ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), MLOCK_ONFAULT), - SyscallSucceeds()); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); -} - -TEST(Mlock2Test, UnknownFlags) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); - EXPECT_THAT(mlock2(mapping.ptr(), mapping.len(), ~0), - SyscallFailsWithErrno(EINVAL)); -} - -#endif // defined(SYS_mlock2) - -TEST(MapLockedTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); - EXPECT_FALSE(IsPageMlocked(mapping.addr())); -} - -TEST(MapLockedTest, RlimitMemlockZero) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - EXPECT_THAT( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), - PosixErrorIs(EPERM, _)); -} - -TEST(MapLockedTest, RlimitMemlockInsufficient) { - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); - EXPECT_THAT( - MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), - PosixErrorIs(EAGAIN, _)); -} - -TEST(MremapLockedTest, Basic) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - 
EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - if (addr == MAP_FAILED) { - FAIL() << "mremap failed: " << errno << " (" << strerror(errno) << ")"; - } - mapping.release(); - mapping.reset(addr, 2 * mapping.len()); - EXPECT_TRUE(IsPageMlocked(reinterpret_cast(addr))); -} - -TEST(MremapLockedTest, RlimitMemlockZero) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = - ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) - << "addr = " << addr << ", errno = " << errno; -} - -TEST(MremapLockedTest, RlimitMemlockInsufficient) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); - auto mapping = ASSERT_NO_ERRNO_AND_VALUE( - MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); - EXPECT_TRUE(IsPageMlocked(mapping.addr())); - - if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { - ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); - } - Cleanup reset_rlimit = ASSERT_NO_ERRNO_AND_VALUE( - ScopedSetSoftRlimit(RLIMIT_MEMLOCK, mapping.len())); - void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), - MREMAP_MAYMOVE, nullptr); - EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) - << "addr = " << addr << ", errno = " << errno; -} - -} // namespace - -} // namespace testing -} // namespace gvisor diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc index 72d90dc78..0ddc621aa 100644 --- a/test/syscalls/linux/msync.cc +++ 
b/test/syscalls/linux/msync.cc @@ -43,13 +43,14 @@ class MsyncParameterizedTest : public ::testing::TestWithParam { protected: int msync_flags() const { return std::get<0>(GetParam()); } - PosixErrorOr GetMapping() const { return std::get<1>(GetParam())(); } + PosixErrorOr GetMapping() const { + auto rv = std::get<1>(GetParam())(); + return rv; + } }; -// All valid msync(2) flag combinations, not including MS_INVALIDATE. ("Linux -// permits a call to msync() that specifies neither [MS_SYNC or MS_ASYNC], with -// semantics that are (currently) equivalent to specifying MS_ASYNC." - -// msync(2)) +// All valid msync(2) flag combinations (not including MS_INVALIDATE, which +// gVisor doesn't implement). constexpr std::initializer_list kMsyncFlags = {MS_SYNC, MS_ASYNC, 0}; // Returns functions that return mappings that should be successfully @@ -133,15 +134,6 @@ TEST_P(MsyncFullParamTest, UnalignedAddressFails) { SyscallFailsWithErrno(EINVAL)); } -TEST_P(MsyncFullParamTest, InvalidateUnlockedSucceeds) { - auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); - EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags() | MS_INVALIDATE), - SyscallSucceeds()); -} - -// The test for MS_INVALIDATE on mlocked pages is in mlock.cc since it requires -// probing for mlock support. - INSTANTIATE_TEST_CASE_P( All, MsyncFullParamTest, ::testing::Combine(::testing::ValuesIn(kMsyncFlags), -- cgit v1.2.3 From 194ef586fcb1bec049ee8777c2e5f70997de7a87 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 20 Dec 2018 13:27:25 -0800 Subject: Rename limits.MemoryPagesLocked to limits.MemoryLocked. "RLIMIT_MEMLOCK: This is the maximum number of bytes of memory that may be locked into RAM." 
- getrlimit(2) PiperOrigin-RevId: 226384346 Change-Id: Iefac4a1bb69f7714dc813b5b871226a8344dc800 --- pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/linux.go | 2 +- runsc/boot/limits.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index ba0b7d4fd..eeca01876 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryPagesLocked + MemoryLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 511db6733..295f9c398 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryPagesLocked, + linux.RLIMIT_MEMLOCK: MemoryLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 8ecda6d0e..e3e716bf9 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -29,7 +29,7 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_DATA": limits.Data, "RLIMIT_FSIZE": limits.FileSize, "RLIMIT_LOCKS": limits.Locks, - "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, + "RLIMIT_MEMLOCK": limits.MemoryLocked, "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, "RLIMIT_NICE": limits.Nice, "RLIMIT_NOFILE": limits.NumberOfFiles, @@ -55,7 +55,7 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) - ls.SetUnchecked(limits.MemoryPagesLocked, limits.Limit{Cur: 65536, Max: 65536}) + 
ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) -- cgit v1.2.3 From f6274804e14ece853c952cb71fde73dfb06b733a Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 20 Dec 2018 13:47:46 -0800 Subject: Make read and write respect SO_RCVTIMEO and SO_SNDTIMEO PiperOrigin-RevId: 226387521 Change-Id: I0579ab262320fde6c72d2994dd38437f01a99ea5 --- pkg/sentry/syscalls/linux/sys_read.go | 23 ++++++++++++- pkg/sentry/syscalls/linux/sys_write.go | 23 ++++++++++++- test/syscalls/linux/socket_generic.cc | 59 ++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 9 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index b6df4d9d4..8105e9b43 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -15,11 +15,15 @@ package linux import ( + "time" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -259,6 +263,20 @@ func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { return n, err } + // Sockets support read timeouts. 
+ var haveDeadline bool + var deadline ktime.Time + if s, ok := f.FileOperations.(socket.Socket); ok { + dl := s.RecvTimeout() + if dl < 0 && err == syserror.ErrWouldBlock { + return n, err + } + if dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } + } + // Register for notifications. w, ch := waiter.NewChannelEntry(nil) f.EventRegister(&w, EventMaskRead) @@ -277,7 +295,10 @@ func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { } // Wait for a notification that we should retry. - if err = t.Block(ch); err != nil { + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } break } } diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 750a098cd..a5ad7efb2 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -15,11 +15,15 @@ package linux import ( + "time" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -263,6 +267,20 @@ func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { return n, err } + // Sockets support write timeouts. 
+ var haveDeadline bool + var deadline ktime.Time + if s, ok := f.FileOperations.(socket.Socket); ok { + dl := s.SendTimeout() + if dl < 0 && err == syserror.ErrWouldBlock { + return n, err + } + if dl > 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } + } + // Register for notifications. w, ch := waiter.NewChannelEntry(nil) f.EventRegister(&w, EventMaskWrite) @@ -281,7 +299,10 @@ func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { } // Wait for a notification that we should retry. - if err = t.Block(ch); err != nil { + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } break } } diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc index c65b29112..974c0dd7b 100644 --- a/test/syscalls/linux/socket_generic.cc +++ b/test/syscalls/linux/socket_generic.cc @@ -280,7 +280,22 @@ TEST_P(AllSocketPairTest, SndBufSucceeds) { EXPECT_GT(size, 0); } -TEST_P(AllSocketPairTest, RecvTimeoutSucceeds) { +TEST_P(AllSocketPairTest, RecvTimeoutReadSucceeds) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 10 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[20] = {}; + EXPECT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(AllSocketPairTest, RecvTimeoutRecvSucceeds) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); struct timeval tv { @@ -295,7 +310,7 @@ TEST_P(AllSocketPairTest, RecvTimeoutSucceeds) { SyscallFailsWithErrno(EAGAIN)); } -TEST_P(AllSocketPairTest, RecvTimeoutOneSecondSucceeds) { +TEST_P(AllSocketPairTest, RecvTimeoutRecvOneSecondSucceeds) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); struct timeval tv { @@ -310,7 +325,7 @@ 
TEST_P(AllSocketPairTest, RecvTimeoutOneSecondSucceeds) { SyscallFailsWithErrno(EAGAIN)); } -TEST_P(AllSocketPairTest, RecvmsgTimeoutSucceeds) { +TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgSucceeds) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); struct timeval tv { @@ -332,6 +347,21 @@ TEST_P(AllSocketPairTest, RecvmsgTimeoutSucceeds) { SyscallFailsWithErrno(EAGAIN)); } +TEST_P(AllSocketPairTest, SendTimeoutAllowsWrite) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = 0, .tv_usec = 10 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[20] = {}; + ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); +} + TEST_P(AllSocketPairTest, SendTimeoutAllowsSend) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -347,7 +377,7 @@ TEST_P(AllSocketPairTest, SendTimeoutAllowsSend) { SyscallSucceedsWithValue(sizeof(buf))); } -TEST_P(AllSocketPairTest, SendmsgTimeoutAllowsSend) { +TEST_P(AllSocketPairTest, SendTimeoutAllowsSendmsg) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); struct timeval tv { @@ -389,7 +419,7 @@ TEST_P(AllSocketPairTest, SoRcvTimeoIsSetLargerArg) { SyscallSucceeds()); } -TEST_P(AllSocketPairTest, RecvmsgTimeoutOneSecondSucceeds) { +TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgOneSecondSucceeds) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); struct timeval tv { @@ -455,7 +485,22 @@ TEST_P(AllSocketPairTest, SendTimeoutUsecNeg) { SyscallFailsWithErrno(EDOM)); } -TEST_P(AllSocketPairTest, RecvTimeoutNegSec) { +TEST_P(AllSocketPairTest, RecvTimeoutNegSecRead) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct timeval tv { + .tv_sec = -1, .tv_usec = 0 + }; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)), + SyscallSucceeds()); + + char buf[20] = {}; + 
EXPECT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST_P(AllSocketPairTest, RecvTimeoutNegSecRecv) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); struct timeval tv { @@ -470,7 +515,7 @@ TEST_P(AllSocketPairTest, RecvTimeoutNegSec) { SyscallFailsWithErrno(EAGAIN)); } -TEST_P(AllSocketPairTest, RecvmsgTimeoutNegSec) { +TEST_P(AllSocketPairTest, RecvTimeoutNegSecRecvmsg) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); struct timeval tv { -- cgit v1.2.3 From 8ba450363fed5aa44676c23b737404c52da26a5f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 20 Dec 2018 17:22:15 -0800 Subject: Deflake gofer_test. We must wait for all lazy resources to be released before closing the rootFile. PiperOrigin-RevId: 226419499 Change-Id: I1d4d961a92b3816e02690cf3eaf0a88944d730cc --- pkg/sentry/fs/gofer/gofer_test.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index b450778ca..36201f017 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -68,7 +68,13 @@ func rootTest(t *testing.T, name string, cp cachePolicy, fn func(context.Context // Ensure that the cache is fully invalidated, so that any // close actions actually take place before the full harness is // torn down. - defer m.FlushDirentRefs() + defer func() { + m.FlushDirentRefs() + + // Wait for all resources to be released, otherwise the + // operations may fail after we close the rootFile. + fs.AsyncBarrier() + }() // Execute the test. 
fn(ctx, h, root, rootInode) -- cgit v1.2.3 From 9a442fa4b5f64bde6554118ed5b340e6b53e8d6e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 21 Dec 2018 08:22:24 -0800 Subject: Automated rollback of changelist 226224230 PiperOrigin-RevId: 226493053 Change-Id: Ia98d1cb6dd0682049e4d907ef69619831de5c34a --- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/mm.go | 12 + pkg/sentry/memmap/memmap.go | 37 +++ pkg/sentry/mm/BUILD | 1 + pkg/sentry/mm/address_space.go | 12 +- pkg/sentry/mm/lifecycle.go | 24 +- pkg/sentry/mm/mm.go | 24 +- pkg/sentry/mm/syscalls.go | 423 +++++++++++++++++++++++++++----- pkg/sentry/mm/vma.go | 38 +++ pkg/sentry/syscalls/linux/linux64.go | 15 +- pkg/sentry/syscalls/linux/sys_mmap.go | 106 +++++--- pkg/sentry/syscalls/linux/sys_rlimit.go | 1 + test/syscalls/linux/BUILD | 16 ++ test/syscalls/linux/mlock.cc | 345 ++++++++++++++++++++++++++ test/syscalls/linux/msync.cc | 20 +- 15 files changed, 945 insertions(+), 131 deletions(-) create mode 100644 test/syscalls/linux/mlock.cc (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index b2e51b9bd..e0aa5b31d 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -60,7 +60,7 @@ const ( DefaultNofileHardLimit = 4096 // DefaultMemlockLimit is called MLOCK_LIMIT in Linux. - DefaultMemlockLimit = 64 * 1094 + DefaultMemlockLimit = 64 * 1024 // DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux. DefaultMsgqueueLimit = 819200 diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index 3fcdf8235..eda8d9788 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -49,6 +49,18 @@ const ( MREMAP_FIXED = 1 << 1 ) +// Flags for mlock2(2). +const ( + MLOCK_ONFAULT = 0x01 +) + +// Flags for mlockall(2). +const ( + MCL_CURRENT = 1 + MCL_FUTURE = 2 + MCL_ONFAULT = 4 +) + // Advice for madvise(2). 
const ( MADV_NORMAL = 0 diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 28e2bed9b..cf20b11e3 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -243,6 +243,40 @@ type MappingIdentity interface { Msync(ctx context.Context, mr MappableRange) error } +// MLockMode specifies the memory locking behavior of a memory mapping. +type MLockMode int + +// Note that the ordering of MLockModes is significant; see +// mm.MemoryManager.defMLockMode. +const ( + // MLockNone specifies that a mapping has no memory locking behavior. + // + // This must be the zero value for MLockMode. + MLockNone MLockMode = iota + + // MLockEager specifies that a mapping is memory-locked, as by mlock() or + // similar. Pages in the mapping should be made, and kept, resident in + // physical memory as soon as possible. + // + // As of this writing, MLockEager does not cause memory-locking to be + // requested from the host; it only affects the sentry's memory management + // behavior. + // + // MLockEager is analogous to Linux's VM_LOCKED. + MLockEager + + // MLockLazy specifies that a mapping is memory-locked, as by mlock() or + // similar. Pages in the mapping should be kept resident in physical memory + // once they have been made resident due to e.g. a page fault. + // + // As of this writing, MLockLazy does not cause memory-locking to be + // requested from the host; in fact, it has virtually no effect, except for + // interactions between mlocked pages and other syscalls. + // + // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. + MLockLazy +) + // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. @@ -303,6 +337,9 @@ type MMapOpts struct { // mapping (see platform.AddressSpace.MapFile). Precommit bool + // MLockMode specifies the memory locking behavior of the mapping. + MLockMode MLockMode + // Hint is the name used for the mapping in /proc/[pid]/maps. 
If Hint is // empty, MappingIdentity.MappedName() will be used instead. // diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 744e73a39..5a9185e5d 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,6 +106,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 7488f7c4a..e7aa24c69 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() { // for all addresses in ar should be precommitted. // // Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg.Range().Contains(ar.Start). +// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. @@ -173,7 +173,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre } } - for { + // Since this checks ar.End and not mapAR.End, we will never map a pma that + // is not required. + for pseg.Ok() && pseg.Start() < ar.End { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) @@ -184,13 +186,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } - // Since this checks ar.End and not mapAR.End, we will never map a pma - // that is not required. - if ar.End <= pmaAR.End { - return nil - } pseg = pseg.NextSegment() } + return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. 
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 1613ce11d..a42e32b43 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -58,13 +59,17 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ - p: mm.p, - haveASIO: mm.haveASIO, - layout: mm.layout, - privateRefs: mm.privateRefs, - users: 1, - usageAS: mm.usageAS, - brk: mm.brk, + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + brk: mm.brk, + usageAS: mm.usageAS, + // "The child does not inherit its parent's memory locks (mlock(2), + // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is + // MLockNone, both of which are zero values. vma.mlockMode is reset + // when copied below. captureInvalidations: true, argv: mm.argv, envv: mm.envv, @@ -77,7 +82,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // Copy vmas. dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { - vma := srcvseg.ValuePtr() + vma := srcvseg.Value() // makes a copy of the vma vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. 
if vma.mappable != nil { @@ -89,7 +94,8 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if vma.id != nil { vma.id.IncRef() } - dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() + vma.mlockMode = memmap.MLockNone + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index b1e39e898..c0632d232 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -95,11 +95,6 @@ type MemoryManager struct { // vmas is protected by mappingMu. vmas vmaSet - // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. - // - // usageAS is protected by mappingMu. - usageAS uint64 - // brk is the mm's brk, which is manipulated using the brk(2) system call. // The brk is initially set up by the loader which maps an executable // binary into the mm. @@ -107,6 +102,23 @@ type MemoryManager struct { // brk is protected by mappingMu. brk usermem.AddrRange + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. + usageAS uint64 + + // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != + // memmap.MLockNone. + // + // lockedAS is protected by mappingMu. + lockedAS uint64 + + // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or + // defMLockMode is greater. + // + // defMLockMode is protected by mappingMu. + defMLockMode memmap.MLockMode + // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. activeMu ssync.DowngradableRWMutex `state:"nosave"` @@ -252,6 +264,8 @@ type vma struct { // metag, none of which we currently support. growsDown bool `state:"manual"` + mlockMode memmap.MLockMode + // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. 
id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index daaae4da1..383703ec3 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,6 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -128,16 +129,24 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // Get the new vma. mm.mappingMu.Lock() + if opts.MLockMode < mm.defMLockMode { + opts.MLockMode = mm.defMLockMode + } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } + // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new + // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears + // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in + // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => + // populate_vma_page_range(). Confirm this behavior. switch { - case opts.Precommit: + case opts.Precommit || opts.MLockMode == memmap.MLockEager: // Get pmas and map with precommit as requested. - mm.populateAndUnlock(ctx, vseg, ar, true) + mm.populateVMAAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE: Get pmas and map eagerly in the hope @@ -146,7 +155,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. 
- mm.populateAndUnlock(ctx, vseg, ar, false) + mm.populateVMAAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() @@ -155,31 +164,29 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return ar.Start, nil } -// Preconditions: mm.mappingMu must be locked for writing. +// populateVMA obtains pmas for addresses in ar in the given vma, and maps them +// into mm.as if it is active. // -// Postconditions: mm.mappingMu will be unlocked. -func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { +// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). +func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. - mm.mappingMu.Unlock() return } mm.activeMu.Lock() + // Can't defer mm.activeMu.Unlock(); see below. - // Even if we get a new pma, we can't actually map it if we don't have an + // Even if we get new pmas, we can't actually map them if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() - mm.mappingMu.Unlock() return } // Ensure that we have usable pmas. - mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) - mm.mappingMu.RUnlock() if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when @@ -197,6 +204,45 @@ func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator mm.activeMu.RUnlock() } +// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally +// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is +// preferable to populateVMA since it unlocks mm.mappingMu before performing +// expensive operations that don't require it to be locked. 
+// +// Preconditions: mm.mappingMu must be locked for writing. +// vseg.Range().IsSupersetOf(ar). +// +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + // See populateVMA above for commentary. + if !vseg.ValuePtr().effectivePerms.Any() { + mm.mappingMu.Unlock() + return + } + + mm.activeMu.Lock() + + if mm.as == nil { + mm.activeMu.Unlock() + mm.mappingMu.Unlock() + return + } + + // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it + // isn't needed at all for mapASLocked. + mm.mappingMu.DowngradeLock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() + if err != nil { + mm.activeMu.Unlock() + return + } + + mm.activeMu.DowngradeLock() + mm.mapASLocked(pseg, ar, precommit) + mm.activeMu.RUnlock() +} + // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. @@ -236,6 +282,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error MaxPerms: usermem.AnyAccess, Private: true, GrowsDown: true, + MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err @@ -334,6 +381,19 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. + if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, + // mremap in Linux does not check mm/mlock.c:can_do_mlock() and + // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and + // !CAP_IPC_LOCK. 
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { + return 0, syserror.EAGAIN + } + } + } + if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all @@ -360,7 +420,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.mappable != nil { newOffset = vseg.mappableRange().End } - _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, @@ -371,9 +431,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, + MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, ar, true) + } return oldAddr, nil } // In-place growth failed. 
In the MRemapMayMove case, fall through to @@ -462,8 +526,14 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.id != nil { vma.id.IncRef() } - mm.vmas.Add(newAR, vma) + vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS += uint64(newAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS += uint64(newAR.Length()) + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + } return newAR.Start, nil } @@ -485,8 +555,11 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - mm.vmas.Add(newAR, vma) + vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required @@ -501,6 +574,10 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + return newAR.Start, nil } @@ -611,9 +688,10 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { mm.mappingMu.Lock() - defer mm.mappingMu.Unlock() + // Can't defer mm.mappingMu.Unlock(); see below. if addr < mm.brk.Start { + mm.mappingMu.Unlock() return mm.brk.End, syserror.EINVAL } @@ -623,21 +701,24 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // heap + data + bss. 
The segment sizes need to be plumbed from the // loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { + mm.mappingMu.Unlock() return mm.brk.End, syserror.EFAULT } switch { case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + mm.mappingMu.Unlock() case oldbrkpg < newbrkpg: - _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, @@ -646,17 +727,221 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, Private: true, - Hint: "[heap]", + // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes + // mm->def_flags. + MLockMode: mm.defMLockMode, + Hint: "[heap]", }) if err != nil { + mm.mappingMu.Unlock() return mm.brk.End, err } + if mm.defMLockMode == memmap.MLockEager { + mm.populateVMAAndUnlock(ctx, vseg, ar, true) + } else { + mm.mappingMu.Unlock() + } + + default: + // Nothing to do. + mm.mappingMu.Unlock() } mm.brk.End = addr return addr, nil } +// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), +// depending on mode. +func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { + // Linux allows this to overflow. + la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() + ar, ok := addr.RoundDown().ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + + // Check this after RLIMIT_MEMLOCK for consistency with Linux. + if ar.Length() == 0 { + mm.mappingMu.Unlock() + return nil + } + + // Apply the new mlock mode to vmas. + var unmapped bool + vseg := mm.vmas.FindSegment(ar.Start) + for { + if !vseg.Ok() { + unmapped = true + break + } + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = mode + if mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + if ar.End <= vseg.End() { + break + } + vseg, _ = vseg.NextNonEmpty() + } + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + if unmapped { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + + if mode == memmap.MLockEager { + // Ensure that we have usable pmas. Since we didn't return ENOMEM + // above, ar must be fully covered by vmas, so we can just use + // NextSegment below. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this + // case, which is converted to ENOMEM by mlock. 
+ mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) + if err != nil { + mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + // Linux: mm/mlock.c:__mlock_posix_error_return() + if err == syserror.EFAULT { + return syserror.ENOMEM + } + if err == syserror.ENOMEM { + return syserror.EAGAIN + } + return err + } + } + + // Map pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) + mm.activeMu.RUnlock() + if err != nil { + return err + } + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + + return nil +} + +// MLockAllOpts holds options to MLockAll. +type MLockAllOpts struct { + // If Current is true, change the memory-locking behavior of all mappings + // to Mode. If Future is true, upgrade the memory-locking behavior of all + // future mappings to Mode. At least one of Current or Future must be true. + Current bool + Future bool + Mode memmap.MLockMode +} + +// MLockAll implements the semantics of Linux's mlockall()/munlockall(), +// depending on opts. +func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { + if !opts.Current && !opts.Future { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if opts.Current { + if opts.Mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if uint64(mm.vmas.Span()) > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = opts.Mode + if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + } + } + + if opts.Future { + mm.defMLockMode = opts.Mode + } + + if opts.Current && opts.Mode == memmap.MLockEager { + // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() + // ignores the return value of __mm_populate(), so all errors below are + // ignored. + // + // Try to get usable pmas. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vseg.ValuePtr().effectivePerms.Any() { + mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) + } + } + + // Map all pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) + mm.activeMu.RUnlock() + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + return nil +} + // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). 
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) @@ -680,46 +965,49 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) - vseg := mm.vmas.LowerBoundSegment(ar.Start) mem := mm.p.Memory() - for pseg.Ok() && pseg.Start() < ar.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { - psegAR := pseg.Range().Intersect(ar) - vseg = vseg.seekNextLowerBound(psegAR.Start) - if checkInvariants { - if !vseg.Ok() { - panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) - } - if psegAR.Start < vseg.Start() { - panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) - } + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + if vma.mlockMode != memmap.MLockNone { + return syserror.EINVAL + } + vsegAR := vseg.Range().Intersect(ar) + // pseg should already correspond to either this vma or a later one, + // since there can't be a pma without a corresponding vma. + if checkInvariants { + if pseg.Ok() && pseg.End() <= vsegAR.Start { + panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) } - if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue + } + for pseg.Ok() && pseg.Start() < vsegAR.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. 
} - // If an error occurs, fall through to the general - // invalidation case below. } + pseg = mm.pmas.Isolate(pseg, vsegAR) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + pseg = mm.pmas.Remove(pseg).NextSegment() } - pseg = mm.pmas.Isolate(pseg, ar) - pma = pseg.ValuePtr() - if !didUnmapAS { - // Unmap all of ar, not just pseg.Range(), to minimize host - // syscalls. AddressSpace mappings must be removed before - // mm.decPrivateRef(). - mm.unmapASLocked(ar) - didUnmapAS = true - } - if pma.private { - mm.decPrivateRef(pseg.fileRange()) - } - pma.file.DecRef(pseg.fileRange()) - mm.removeRSSLocked(pseg.Range()) - - pseg = mm.pmas.Remove(pseg).NextSegment() } // "If there are some parts of the specified address space that are not @@ -732,9 +1020,28 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { return nil } -// Sync implements the semantics of Linux's msync(MS_SYNC). -func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { - ar, ok := addr.ToRange(length) +// MSyncOpts holds options to MSync. +type MSyncOpts struct { + // Sync has the semantics of MS_SYNC. + Sync bool + + // Invalidate has the semantics of MS_INVALIDATE. + Invalidate bool +} + +// MSync implements the semantics of Linux's msync(). 
+func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { + if addr != addr.RoundDown() { + return syserror.EINVAL + } + if length == 0 { + return nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) if !ok { return syserror.ENOMEM } @@ -759,10 +1066,14 @@ func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uin } lastEnd = vseg.End() vma := vseg.ValuePtr() + if opts.Invalidate && vma.mlockMode != memmap.MLockNone { + mm.mappingMu.RUnlock() + return syserror.EBUSY + } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. - if id := vma.id; id != nil && vma.mappable != nil && !vma.private { + if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 5c2c802f6..28ba9f2f5 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -17,8 +17,10 @@ package mm import ( "fmt" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -53,6 +55,23 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM } + if opts.MLockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM + } + newLockedAS := mm.lockedAS + opts.Length + if opts.Unmap { + newLockedAS -= mm.mlockedBytesRangeLocked(ar) + } + if newLockedAS > mlockLimit { + return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN + } + } + } + // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). @@ -85,10 +104,14 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, + mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, }) mm.usageAS += opts.Length + if opts.MLockMode != memmap.MLockNone { + mm.lockedAS += opts.Length + } return vseg, ar, nil } @@ -201,6 +224,17 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return 0, syserror.ENOMEM } +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { + var total uint64 + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if vseg.ValuePtr().mlockMode != memmap.MLockNone { + total += uint64(vseg.Range().Intersect(ar).Length()) + } + } + return total +} + // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). 
It returns: // @@ -338,6 +372,9 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS -= uint64(vmaAR.Length()) + } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } @@ -368,6 +405,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || + vma1.mlockMode != vma2.mlockMode || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 7a5c93f9b..e855590e6 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{ 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, - 149: syscalls.Error(nil), // Mlock, TODO - 150: syscalls.Error(nil), // Munlock, TODO - 151: syscalls.Error(nil), // Mlockall, TODO - 152: syscalls.Error(nil), // Munlockall, TODO + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: Mlock, + 150: Munlock, + 151: Mlockall, + 152: Munlockall, 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, 154: syscalls.Error(syscall.EPERM), // ModifyLdt, 155: syscalls.Error(syscall.EPERM), // PivotRoot, @@ -373,8 +373,9 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO - // Syscalls after 325 are backports from 4.6. - 325: syscalls.Error(nil), // Mlock2, TODO + 325: Mlock2, + // Syscalls after 325 are "backports" from versions of Linux after 4.4. 
+ // 326: CopyFileRange, 327: Preadv2, 328: Pwritev2, }, diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 145f7846c..8732861e0 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -69,6 +69,9 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef() @@ -384,16 +387,6 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall length := args[1].SizeT() flags := args[2].Int() - if addr != addr.RoundDown() { - return 0, nil, syserror.EINVAL - } - if length == 0 { - return 0, nil, nil - } - la, ok := usermem.Addr(length).RoundUp() - if !ok { - return 0, nil, syserror.ENOMEM - } // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with @@ -406,39 +399,72 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, syserror.EINVAL } + err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ + Sync: sync, + Invalidate: flags&linux.MS_INVALIDATE != 0, + }) + // MSync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Mlock implements linux syscall mlock(2). 
+func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) +} - // MS_INVALIDATE "asks to invalidate other mappings of the same file (so - // that they can be updated with the fresh values just written)". This is a - // no-op given that shared memory exists. However, MS_INVALIDATE can also - // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, - // and a memory lock exists for the specified address range." Given that - // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since - // some user program could be using it for synchronization. - if flags&linux.MS_INVALIDATE != 0 { +// Mlock2 implements linux syscall mlock2(2). +func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if flags&^(linux.MLOCK_ONFAULT) != 0 { return 0, nil, syserror.EINVAL } - // MS_SYNC "requests an update and waits for it to complete." - if sync { - err := t.MemoryManager().Sync(t, addr, uint64(la)) - // Sync calls fsync, the same interrupt conversion rules apply, see - // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) - } - // MS_ASYNC "specifies that an update be scheduled, but the call returns - // immediately". As long as dirty pages are tracked and eventually written - // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC - // is in fact a no-op, since the kernel properly tracks dirty pages and - // flushes them to storage as necessary.") - // - // However: "ENOMEM: The indicated memory (or part of it) was not mapped." - // This applies even for MS_ASYNC. 
- ar, ok := addr.ToRange(uint64(la)) - if !ok { - return 0, nil, syserror.ENOMEM + + mode := memmap.MLockEager + if flags&linux.MLOCK_ONFAULT != 0 { + mode = memmap.MLockLazy } - mapped := t.MemoryManager().VirtualMemorySizeRange(ar) - if mapped != uint64(la) { - return 0, nil, syserror.ENOMEM + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) +} + +// Munlock implements linux syscall munlock(2). +func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) +} + +// Mlockall implements linux syscall mlockall(2). +func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + + if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { + return 0, nil, syserror.EINVAL } - return 0, nil, nil + + mode := memmap.MLockEager + if flags&linux.MCL_ONFAULT != 0 { + mode = memmap.MLockLazy + } + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: flags&linux.MCL_CURRENT != 0, + Future: flags&linux.MCL_FUTURE != 0, + Mode: mode, + }) +} + +// Munlockall implements linux syscall munlockall(2). 
+func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: true, + Future: true, + Mode: memmap.MLockNone, + }) } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 2f16e1791..b0b216045 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,6 +90,7 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, + limits.MemoryLocked: {}, limits.Stack: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index c0b8246b5..03e586688 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1019,6 +1019,22 @@ cc_binary( ], ) +cc_binary( + name = "mlock_test", + testonly = 1, + srcs = ["mlock.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:memory_util", + "//test/util:multiprocess_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "mmap_test", testonly = 1, diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc new file mode 100644 index 000000000..1d93bff58 --- /dev/null +++ b/test/syscalls/linux/mlock.cc @@ -0,0 +1,345 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "test/util/capability_util.h" +#include "test/util/cleanup.h" +#include "test/util/memory_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/test_util.h" + +using ::testing::_; + +namespace gvisor { +namespace testing { + +namespace { + +PosixErrorOr CanMlock() { + struct rlimit rlim; + if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + return PosixError(errno, "getrlimit(RLIMIT_MEMLOCK)"); + } + if (rlim.rlim_cur != 0) { + return true; + } + return HaveCapability(CAP_IPC_LOCK); +} + +// Returns true if the page containing addr is mlocked. +bool IsPageMlocked(uintptr_t addr) { + // This relies on msync(MS_INVALIDATE) interacting correctly with mlocked + // pages, which is tested for by the MsyncInvalidate case below. + int const rv = msync(reinterpret_cast(addr & ~(kPageSize - 1)), + kPageSize, MS_ASYNC | MS_INVALIDATE); + if (rv == 0) { + return false; + } + // This uses TEST_PCHECK_MSG since it's used in subprocesses. 
+ TEST_PCHECK_MSG(errno == EBUSY, "msync failed with unexpected errno"); + return true; +} + +PosixErrorOr ScopedSetSoftRlimit(int resource, rlim_t newval) { + struct rlimit old_rlim; + if (getrlimit(resource, &old_rlim) != 0) { + return PosixError(errno, "getrlimit failed"); + } + struct rlimit new_rlim = old_rlim; + new_rlim.rlim_cur = newval; + if (setrlimit(resource, &new_rlim) != 0) { + return PosixError(errno, "setrlimit failed"); + } + return Cleanup([resource, old_rlim] { + TEST_PCHECK(setrlimit(resource, &old_rlim) == 0); + }); +} + +TEST(MlockTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(MlockTest, ProtNone) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = + ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), + SyscallFailsWithErrno(ENOMEM)); + // ENOMEM is returned because mlock can't populate the page, but it's still + // considered locked. 
+ EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(MlockTest, MadviseDontneed) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_THAT(madvise(mapping.ptr(), mapping.len(), MADV_DONTNEED), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(MlockTest, MsyncInvalidate) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_ASYNC | MS_INVALIDATE), + SyscallFailsWithErrno(EBUSY)); + EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_SYNC | MS_INVALIDATE), + SyscallFailsWithErrno(EBUSY)); +} + +TEST(MlockTest, Fork) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + EXPECT_THAT( + InForkedProcess([&] { TEST_CHECK(!IsPageMlocked(mapping.addr())); }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(MlockTest, RlimitMemlockZero) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), + SyscallFailsWithErrno(EPERM)); +} + +TEST(MlockTest, RlimitMemlockInsufficient) { + if 
(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), + SyscallFailsWithErrno(ENOMEM)); +} + +TEST(MunlockTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +TEST(MunlockTest, NotLocked) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +// There is currently no test for mlockall(MCL_CURRENT) because the default +// RLIMIT_MEMLOCK of 64 KB is insufficient to actually invoke +// mlockall(MCL_CURRENT). + +TEST(MlockallTest, Future) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + + // Run this test in a separate (single-threaded) subprocess to ensure that a + // background thread doesn't try to mmap a large amount of memory, fail due + // to hitting RLIMIT_MEMLOCK, and explode the process violently. 
+ EXPECT_THAT(InForkedProcess([] { + auto const mapping = + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE) + .ValueOrDie(); + TEST_CHECK(!IsPageMlocked(mapping.addr())); + TEST_PCHECK(mlockall(MCL_FUTURE) == 0); + // Ensure that mlockall(MCL_FUTURE) is turned off before the end + // of the test, as otherwise mmaps may fail unexpectedly. + Cleanup do_munlockall([] { TEST_PCHECK(munlockall() == 0); }); + auto const mapping2 = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + TEST_CHECK(IsPageMlocked(mapping2.addr())); + // Fire munlockall() and check that it disables + // mlockall(MCL_FUTURE). + do_munlockall.Release()(); + auto const mapping3 = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + TEST_CHECK(!IsPageMlocked(mapping2.addr())); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST(MunlockallTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(munlockall(), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +#ifndef SYS_mlock2 +#ifdef __x86_64__ +#define SYS_mlock2 325 +#endif +#endif + +#ifndef MLOCK_ONFAULT +#define MLOCK_ONFAULT 0x01 // Linux: include/uapi/asm-generic/mman-common.h +#endif + +#ifdef SYS_mlock2 + +int mlock2(void const* addr, size_t len, int flags) { + return syscall(SYS_mlock2, addr, len, flags); +} + +TEST(Mlock2Test, NoFlags) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), 0), SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(Mlock2Test, MlockOnfault) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); 
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); + ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), MLOCK_ONFAULT), + SyscallSucceeds()); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); +} + +TEST(Mlock2Test, UnknownFlags) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + EXPECT_THAT(mlock2(mapping.ptr(), mapping.len(), ~0), + SyscallFailsWithErrno(EINVAL)); +} + +#endif // defined(SYS_mlock2) + +TEST(MapLockedTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds()); + EXPECT_FALSE(IsPageMlocked(mapping.addr())); +} + +TEST(MapLockedTest, RlimitMemlockZero) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + EXPECT_THAT( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), + PosixErrorIs(EPERM, _)); +} + +TEST(MapLockedTest, RlimitMemlockInsufficient) { + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize)); + EXPECT_THAT( + MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED), + PosixErrorIs(EAGAIN, _)); +} + +TEST(MremapLockedTest, Basic) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + 
EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + if (addr == MAP_FAILED) { + FAIL() << "mremap failed: " << errno << " (" << strerror(errno) << ")"; + } + mapping.release(); + mapping.reset(addr, 2 * mapping.len()); + EXPECT_TRUE(IsPageMlocked(reinterpret_cast(addr))); +} + +TEST(MremapLockedTest, RlimitMemlockZero) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0)); + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) + << "addr = " << addr << ", errno = " << errno; +} + +TEST(MremapLockedTest, RlimitMemlockInsufficient) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); + auto mapping = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED)); + EXPECT_TRUE(IsPageMlocked(mapping.addr())); + + if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) { + ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false)); + } + Cleanup reset_rlimit = ASSERT_NO_ERRNO_AND_VALUE( + ScopedSetSoftRlimit(RLIMIT_MEMLOCK, mapping.len())); + void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(), + MREMAP_MAYMOVE, nullptr); + EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN) + << "addr = " << addr << ", errno = " << errno; +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc index 0ddc621aa..72d90dc78 100644 --- a/test/syscalls/linux/msync.cc +++ 
b/test/syscalls/linux/msync.cc @@ -43,14 +43,13 @@ class MsyncParameterizedTest : public ::testing::TestWithParam { protected: int msync_flags() const { return std::get<0>(GetParam()); } - PosixErrorOr GetMapping() const { - auto rv = std::get<1>(GetParam())(); - return rv; - } + PosixErrorOr GetMapping() const { return std::get<1>(GetParam())(); } }; -// All valid msync(2) flag combinations (not including MS_INVALIDATE, which -// gVisor doesn't implement). +// All valid msync(2) flag combinations, not including MS_INVALIDATE. ("Linux +// permits a call to msync() that specifies neither [MS_SYNC or MS_ASYNC], with +// semantics that are (currently) equivalent to specifying MS_ASYNC." - +// msync(2)) constexpr std::initializer_list kMsyncFlags = {MS_SYNC, MS_ASYNC, 0}; // Returns functions that return mappings that should be successfully @@ -134,6 +133,15 @@ TEST_P(MsyncFullParamTest, UnalignedAddressFails) { SyscallFailsWithErrno(EINVAL)); } +TEST_P(MsyncFullParamTest, InvalidateUnlockedSucceeds) { + auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping()); + EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags() | MS_INVALIDATE), + SyscallSucceeds()); +} + +// The test for MS_INVALIDATE on mlocked pages is in mlock.cc since it requires +// probing for mlock support. + INSTANTIATE_TEST_CASE_P( All, MsyncFullParamTest, ::testing::Combine(::testing::ValuesIn(kMsyncFlags), -- cgit v1.2.3 From 1679ef31ef15344eba218a5251fa1fb1438b4cb7 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 21 Dec 2018 11:52:39 -0800 Subject: inotify notifies watchers when control events bit are set The code that matches the event being published with events watchers was wronly matching all watchers in case any of the control event bits were set. 
Issue #121 PiperOrigin-RevId: 226521230 Change-Id: Ie2c42bc4366faaf59fbf80a74e9297499bd93f9e --- pkg/sentry/fs/inotify_watch.go | 12 +++++++----- pkg/sentry/strace/linux64.go | 2 +- test/syscalls/linux/inotify.cc | 28 ++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index b83544c9f..d33e7e498 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -76,15 +76,17 @@ func isRenameEvent(eventMask uint32) bool { // Notify queues a new event on this watch. func (w *Watch) Notify(name string, events uint32, cookie uint32) { - unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS - effectiveMask := unmaskableBits | atomic.LoadUint32(&w.mask) - matchedEvents := effectiveMask & events - - if matchedEvents == 0 { + mask := atomic.LoadUint32(&w.mask) + if mask&events == 0 { // We weren't watching for this event. return } + // Event mask should include bits matched from the watch plus all control + // event bits. 
+ unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS + effectiveMask := unmaskableBits | mask + matchedEvents := effectiveMask & events w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) } diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index e8fb711a5..de2da9369 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -271,7 +271,7 @@ var linuxAMD64 = SyscallMap{ 251: makeSyscallInfo("ioprio_set", Hex, Hex, Hex), 252: makeSyscallInfo("ioprio_get", Hex, Hex), 253: makeSyscallInfo("inotify_init"), - 254: makeSyscallInfo("inotify_add_watch", Hex, Hex, Hex), + 254: makeSyscallInfo("inotify_add_watch", Hex, Path, Hex), 255: makeSyscallInfo("inotify_rm_watch", Hex, Hex), 256: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex), 257: makeSyscallInfo("openat", Hex, Path, OpenFlags, Mode), diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 62fc55c72..0e361496c 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1484,6 +1484,34 @@ TEST(Inotify, MaskAddMergesWithExistingEventMask) { ASSERT_THAT(events, Are({Event(IN_CLOSE_WRITE, wd)})); } +// Test that control events bits are not considered when checking event mask. +TEST(Inotify, ControlEvents) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), dir.path(), IN_ACCESS)); + + // Check that events in the mask are dispatched and that control bits are + // part of the event mask. + std::vector files = + ASSERT_NO_ERRNO_AND_VALUE(ListDir(dir.path(), false)); + ASSERT_EQ(files.size(), 2); + + const std::vector events1 = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events1, Are({Event(IN_ACCESS | IN_ISDIR, wd)})); + + // Check that events not in the mask are discarded. 
+ const FileDescriptor dir_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY)); + + const std::vector events2 = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events2, Are({})); +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From b515556519a44d4b6a23590e236bb4f30726b5bf Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 21 Dec 2018 13:12:32 -0800 Subject: Implement SO_KEEPALIVE, TCP_KEEPIDLE, and TCP_KEEPINTVL. Within gVisor, plumb new socket options to netstack. Within netstack, fix GetSockOpt and SetSockOpt return value logic. PiperOrigin-RevId: 226532229 Change-Id: If40734e119eed633335f40b4c26facbebc791c74 --- pkg/abi/linux/tcp.go | 6 ++ pkg/sentry/socket/epsocket/epsocket.go | 62 +++++++++++- pkg/sentry/socket/unix/transport/unix.go | 13 ++- pkg/tcpip/transport/ping/endpoint.go | 10 +- pkg/tcpip/transport/tcp/endpoint.go | 22 +++-- pkg/tcpip/transport/udp/endpoint.go | 10 +- test/syscalls/linux/socket_ip_tcp_generic.cc | 136 ++++++++++++++++++++++++++- 7 files changed, 243 insertions(+), 16 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go index 7586ada42..67908deb9 100644 --- a/pkg/abi/linux/tcp.go +++ b/pkg/abi/linux/tcp.go @@ -52,3 +52,9 @@ const ( TCP_ZEROCOPY_RECEIVE = 35 TCP_INQ = 36 ) + +// Socket constants from include/net/tcp.h. 
+const ( + MAX_TCP_KEEPIDLE = 32767 + MAX_TCP_KEEPINTVL = 32767 +) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index ab5d82183..89580e83a 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -636,7 +636,13 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - return int32(0), nil + + var v tcpip.KeepaliveEnabledOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil case linux.SO_LINGER: if outLen < syscall.SizeofLinger { @@ -720,6 +726,30 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(v), nil + case linux.TCP_KEEPIDLE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.KeepaliveIdleOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + + case linux.TCP_KEEPINTVL: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.KeepaliveIntervalOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + case linux.TCP_INFO: var v tcpip.TCPInfoOption if err := ep.GetSockOpt(&v); err != nil { @@ -843,6 +873,14 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v))) + case linux.SO_KEEPALIVE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveEnabledOption(v))) + case linux.SO_SNDTIMEO: if len(optVal) < linux.SizeOfTimeval { 
return syserr.ErrInvalidArgument @@ -916,6 +954,28 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v))) + case linux.TCP_KEEPIDLE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + if v < 1 || v > linux.MAX_TCP_KEEPIDLE { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)))) + + case linux.TCP_KEEPINTVL: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + if v < 1 || v > linux.MAX_TCP_KEEPINTVL { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)))) + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index e98096d7b..12b1576bd 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -837,6 +837,7 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch o := opt.(type) { case tcpip.ErrorOption: return nil + case *tcpip.SendQueueSizeOption: e.Lock() if !e.Connected() { @@ -850,6 +851,7 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { } *o = qs return nil + case *tcpip.ReceiveQueueSizeOption: e.Lock() if !e.Connected() { @@ -863,6 +865,7 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { } *o = qs return nil + case *tcpip.PasscredOption: if e.Passcred() { *o = tcpip.PasscredOption(1) @@ -870,6 +873,7 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = tcpip.PasscredOption(0) } return nil + case *tcpip.SendBufferSizeOption: e.Lock() if !e.Connected() { @@ -883,6 
+887,7 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { } *o = qs return nil + case *tcpip.ReceiveBufferSizeOption: e.Lock() if e.receiver == nil { @@ -896,8 +901,14 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { } *o = qs return nil + + case *tcpip.KeepaliveEnabledOption: + *o = 0 + return nil + + default: + return tcpip.ErrUnknownProtocolOption } - return tcpip.ErrUnknownProtocolOption } // Shutdown closes the read and/or write end of the endpoint connection to its diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index 10d4d138e..d1b9b136c 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -358,9 +358,15 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = 1 } e.rcvMu.Unlock() - } + return nil - return tcpip.ErrUnknownProtocolOption + case *tcpip.KeepaliveEnabledOption: + *o = 0 + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } func sendPing4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 37d4c8f9e..c549132f0 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -662,7 +662,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } else { atomic.StoreUint32(&e.delay, 1) } - return nil case tcpip.CorkOption: @@ -674,7 +673,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } else { atomic.StoreUint32(&e.cork, 1) } - return nil case tcpip.ReuseAddressOption: @@ -689,7 +687,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } else { atomic.StoreUint32(&e.slowAck, 0) } - return nil case tcpip.ReceiveBufferSizeOption: @@ -754,7 +751,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.sndBufMu.Lock() e.sndBufSize = size e.sndBufMu.Unlock() - return nil case tcpip.V6OnlyOption: @@ -772,34 +768,39 @@ 
func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } e.v6only = v != 0 + return nil case tcpip.KeepaliveEnabledOption: e.keepalive.Lock() e.keepalive.enabled = v != 0 e.keepalive.Unlock() e.notifyProtocolGoroutine(notifyKeepaliveChanged) + return nil case tcpip.KeepaliveIdleOption: e.keepalive.Lock() e.keepalive.idle = time.Duration(v) e.keepalive.Unlock() e.notifyProtocolGoroutine(notifyKeepaliveChanged) + return nil case tcpip.KeepaliveIntervalOption: e.keepalive.Lock() e.keepalive.interval = time.Duration(v) e.keepalive.Unlock() e.notifyProtocolGoroutine(notifyKeepaliveChanged) + return nil case tcpip.KeepaliveCountOption: e.keepalive.Lock() e.keepalive.count = int(v) e.keepalive.Unlock() e.notifyProtocolGoroutine(notifyKeepaliveChanged) + return nil + default: + return nil } - - return nil } // readyReceiveSize returns the number of bytes ready to be received. @@ -908,7 +909,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { o.RTTVar = snd.rtt.rttvar snd.rtt.Unlock() } - return nil case *tcpip.KeepaliveEnabledOption: @@ -920,25 +920,29 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { if v { *o = 1 } + return nil case *tcpip.KeepaliveIdleOption: e.keepalive.Lock() *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) e.keepalive.Unlock() + return nil case *tcpip.KeepaliveIntervalOption: e.keepalive.Lock() *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) e.keepalive.Unlock() + return nil case *tcpip.KeepaliveCountOption: e.keepalive.Lock() *o = tcpip.KeepaliveCountOption(e.keepalive.count) e.keepalive.Unlock() + return nil + default: + return tcpip.ErrUnknownProtocolOption } - - return tcpip.ErrUnknownProtocolOption } func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) { diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 57b875680..67e9ca0ac 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ 
b/pkg/tcpip/transport/udp/endpoint.go @@ -505,15 +505,21 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = 1 } e.rcvMu.Unlock() + return nil case *tcpip.MulticastTTLOption: e.mu.Lock() *o = tcpip.MulticastTTLOption(e.multicastTTL) e.mu.Unlock() return nil - } - return tcpip.ErrUnknownProtocolOption + case *tcpip.KeepaliveEnabledOption: + *o = 0 + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } // sendUDP sends a UDP segment via the provided network endpoint and under the diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index bb5a83c9a..81508263b 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -296,7 +296,7 @@ TEST_P(TCPSocketPairTest, TCPCorkDefault) { getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, &get, &get_len), SyscallSucceedsWithValue(0)); EXPECT_EQ(get_len, sizeof(get)); - EXPECT_EQ(get, 0); + EXPECT_EQ(get, kSockOptOff); } TEST_P(TCPSocketPairTest, SetTCPCork) { @@ -388,5 +388,139 @@ TEST_P(TCPSocketPairTest, SetTCPQuickAck) { EXPECT_EQ(get, kSockOptOn); } +TEST_P(TCPSocketPairTest, SoKeepaliveDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); +} + +TEST_P(TCPSocketPairTest, SetSoKeepalive) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); + + 
ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); +} + +TEST_P(TCPSocketPairTest, TCPKeepidleDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE, &get, + &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 2 * 60 * 60); // 2 hours. +} + +TEST_P(TCPSocketPairTest, TCPKeepintvlDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL, &get, + &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 75); // 75 seconds. +} + +TEST_P(TCPSocketPairTest, SetTCPKeepidleZero) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kZero = 0; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE, &kZero, + sizeof(kZero)), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(TCPSocketPairTest, SetTCPKeepintvlZero) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kZero = 0; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL, + &kZero, sizeof(kZero)), + SyscallFailsWithErrno(EINVAL)); +} + +// Copied from include/net/tcp.h. 
+constexpr int MAX_TCP_KEEPIDLE = 32767; +constexpr int MAX_TCP_KEEPINTVL = 32767; + +TEST_P(TCPSocketPairTest, SetTCPKeepidleAboveMax) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kAboveMax = MAX_TCP_KEEPIDLE + 1; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE, + &kAboveMax, sizeof(kAboveMax)), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(TCPSocketPairTest, SetTCPKeepintvlAboveMax) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kAboveMax = MAX_TCP_KEEPINTVL + 1; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL, + &kAboveMax, sizeof(kAboveMax)), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(TCPSocketPairTest, SetTCPKeepidleToMax) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE, + &MAX_TCP_KEEPIDLE, sizeof(MAX_TCP_KEEPIDLE)), + SyscallSucceedsWithValue(0)); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE, &get, + &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, MAX_TCP_KEEPIDLE); +} + +TEST_P(TCPSocketPairTest, SetTCPKeepintvlToMax) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL, + &MAX_TCP_KEEPINTVL, sizeof(MAX_TCP_KEEPINTVL)), + SyscallSucceedsWithValue(0)); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL, &get, + &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, MAX_TCP_KEEPINTVL); +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 0df0df35fc4aa4d69dc01f7c7e2d9e0530a34db7 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 21 Dec 2018 19:45:43 -0800 Subject: Stub out SO_OOBINLINE. 
We don't explicitly support out-of-band data and treat it like normal in-band data. This is equilivent to SO_OOBINLINE being enabled, so always report that it is enabled. PiperOrigin-RevId: 226572742 Change-Id: I4c30ccb83265e76c30dea631cbf86822e6ee1c1b --- pkg/sentry/socket/epsocket/epsocket.go | 12 ++++++++++++ pkg/tcpip/tcpip.go | 4 ++++ pkg/tcpip/transport/tcp/endpoint.go | 5 +++++ test/syscalls/linux/socket_ip_tcp_generic.cc | 16 ++++++++++++++++ 4 files changed, 37 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 89580e83a..283f1839d 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -678,6 +678,18 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return int32(v), nil + case linux.SO_OOBINLINE: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.OutOfBandInlineOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + default: socket.GetSockOptEmitUnimplementedEvent(t, name) } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index f6dd29e77..627786808 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -497,6 +497,10 @@ type AddMembershipOption MembershipOption // the given interface address. type RemoveMembershipOption MembershipOption +// OutOfBandInlineOption is used by SetSockOpt/GetSockOpt to specify whether +// TCP out-of-band data is delivered along with the normal in-band data. +type OutOfBandInlineOption int + // Route is a row in the routing table. It specifies through which NIC (and // gateway) sets of packets should be routed. A row is considered viable if the // masked target address matches the destination adddress in the row. 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index c549132f0..d4eda50ec 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -940,6 +940,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.keepalive.Unlock() return nil + case *tcpip.OutOfBandInlineOption: + // We don't currently support disabling this option. + *o = 1 + return nil + default: return tcpip.ErrUnknownProtocolOption } diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index 81508263b..e8a6210e1 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -522,5 +522,21 @@ TEST_P(TCPSocketPairTest, SetTCPKeepintvlToMax) { EXPECT_EQ(get, MAX_TCP_KEEPINTVL); } +TEST_P(TCPSocketPairTest, SetOOBInline) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_OOBINLINE, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT( + getsockopt(sockets->first_fd(), SOL_SOCKET, SO_OOBINLINE, &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From bfa2f314ca05854b0d08aa2f5c2b93b16542d95f Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Wed, 26 Dec 2018 11:24:29 -0800 Subject: Add EventChannel messages for uncaught signals. 
PiperOrigin-RevId: 226936778 Change-Id: I2a6dda157c55d39d81e1b543ab11a58a0bfe5c05 --- pkg/sentry/kernel/BUILD | 18 ++++++++++++++++ pkg/sentry/kernel/task_signals.go | 19 +++++++++++++++++ pkg/sentry/kernel/uncaught_signal.proto | 37 +++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 pkg/sentry/kernel/uncaught_signal.proto (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 10d7b97c2..490f674c0 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,3 +1,5 @@ +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") + package(licenses = ["notice"]) # Apache 2.0 load("//tools/go_generics:defs.bzl", "go_template_instance") @@ -62,6 +64,21 @@ go_template_instance( }, ) +proto_library( + name = "uncaught_signal_proto", + srcs = ["uncaught_signal.proto"], + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_proto"], +) + +go_proto_library( + name = "uncaught_signal_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto", + proto = ":uncaught_signal_proto", + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_go_proto"], +) + go_library( name = "kernel", srcs = [ @@ -122,6 +139,7 @@ go_library( ], visibility = ["//:sandbox"], deps = [ + ":uncaught_signal_go_proto", "//pkg/abi", "//pkg/abi/linux", "//pkg/amutex", diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index fe24f7542..583acddb1 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -22,8 +22,10 @@ import ( "time" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" 
"gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -184,6 +186,23 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS case SignalActionTerm, SignalActionCore: // "Default action is to terminate the process." - signal(7) t.Debugf("Signal %d: terminating thread group", info.Signo) + + // Emit an event channel messages related to this uncaught signal. + ucs := &ucspb.UncaughtSignal{ + Tid: int32(t.Kernel().TaskSet().Root.IDOfTask(t)), + Pid: int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())), + Registers: t.Arch().StateData().Proto(), + SignalNumber: info.Signo, + } + + // Attach an fault address if appropriate. + switch linux.Signal(info.Signo) { + case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS: + ucs.FaultAddr = info.Addr() + } + + eventchannel.Emit(ucs) + t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) return (*runExit)(nil) diff --git a/pkg/sentry/kernel/uncaught_signal.proto b/pkg/sentry/kernel/uncaught_signal.proto new file mode 100644 index 000000000..c7f6a1978 --- /dev/null +++ b/pkg/sentry/kernel/uncaught_signal.proto @@ -0,0 +1,37 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +import "pkg/sentry/arch/registers.proto"; + +message UncaughtSignal { + // Thread ID. + int32 tid = 1; + + // Process ID. + int32 pid = 2; + + // Registers at the time of the fault or signal. 
+ Registers registers = 3; + + // Signal number. + int32 signal_number = 4; + + // The memory location which caused the fault (set if applicable, 0 + // otherwise). This will be set for SIGILL, SIGFPE, SIGSEGV, and SIGBUS. + uint64 fault_addr = 5; +} -- cgit v1.2.3 From bce2f9751f415da869d04ccb53833b024373666d Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Wed, 26 Dec 2018 23:51:00 -0800 Subject: Plumb IP_MULTICAST_TTL to netstack. PiperOrigin-RevId: 226993086 Change-Id: I71757f231436538081d494da32ca69f709bc71c7 --- pkg/sentry/socket/epsocket/epsocket.go | 113 +++++++++++++++++++++--- test/syscalls/linux/BUILD | 19 ++++ test/syscalls/linux/socket_ip_udp_generic.cc | 121 ++++++++++++++++++++++++++ test/syscalls/linux/socket_ip_udp_generic.h | 29 ++++++ test/syscalls/linux/socket_ip_udp_loopback.cc | 5 ++ 5 files changed, 276 insertions(+), 11 deletions(-) create mode 100644 test/syscalls/linux/socket_ip_udp_generic.cc create mode 100644 test/syscalls/linux/socket_ip_udp_generic.h (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 283f1839d..1b9c75949 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -531,8 +531,10 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, case linux.SOL_IPV6: return getSockOptIPv6(t, ep, name, outLen) - case linux.SOL_IP, - linux.SOL_UDP, + case linux.SOL_IP: + return getSockOptIP(t, ep, name, outLen) + + case linux.SOL_UDP, linux.SOL_ICMPV6, linux.SOL_RAW, linux.SOL_PACKET: @@ -787,7 +789,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa t.Kernel().EmitUnimplementedEvent(t) default: - emitUmplementedEventTCP(t, name) + emitUnimplementedEventTCP(t, name) } return nil, syserr.ErrProtocolNotAvailable } @@ -811,7 +813,28 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf t.Kernel().EmitUnimplementedEvent(t) default: - 
emitUmplementedEventIPv6(t, name) + emitUnimplementedEventIPv6(t, name) + } + return nil, syserr.ErrProtocolNotAvailable +} + +// getSockOptIP implements GetSockOpt when level is SOL_IP. +func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { + switch name { + case linux.IP_MULTICAST_TTL: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.MulticastTTLOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + default: + emitUnimplementedEventIP(t, name) } return nil, syserr.ErrProtocolNotAvailable } @@ -992,7 +1015,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * t.Kernel().EmitUnimplementedEvent(t) default: - emitUmplementedEventTCP(t, name) + emitUnimplementedEventTCP(t, name) } // Default to the old behavior; hand off to network stack. @@ -1028,7 +1051,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) t.Kernel().EmitUnimplementedEvent(t) default: - emitUmplementedEventIPv6(t, name) + emitUnimplementedEventIPv6(t, name) } // Default to the old behavior; hand off to network stack. @@ -1038,6 +1061,21 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) // setSockOptIP implements SetSockOpt when level is SOL_IP. func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { + case linux.IP_MULTICAST_TTL: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v == -1 { + // Linux translates -1 to 1. + v = 1 + } + if v < 0 || v > 255 { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastTTLOption(v))) + case linux.IP_ADD_MEMBERSHIP, linux.MCAST_JOIN_GROUP, linux.IP_MULTICAST_IF: // FIXME: Disallow IP-level multicast group options by // default. 
These will need to be supported by appropriately plumbing @@ -1060,7 +1098,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s linux.IP_MTU_DISCOVER, linux.IP_MULTICAST_ALL, linux.IP_MULTICAST_LOOP, - linux.IP_MULTICAST_TTL, linux.IP_NODEFRAG, linux.IP_OPTIONS, linux.IP_PASSSEC, @@ -1092,10 +1129,10 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) } -// emitUmplementedEventTCP emits unimplemented event if name is valid. This +// emitUnimplementedEventTCP emits unimplemented event if name is valid. This // function contains names that are common between Get and SetSockOpt when // level is SOL_TCP. -func emitUmplementedEventTCP(t *kernel.Task, name int) { +func emitUnimplementedEventTCP(t *kernel.Task, name int) { switch name { case linux.TCP_CONGESTION, linux.TCP_CORK, @@ -1129,10 +1166,10 @@ func emitUmplementedEventTCP(t *kernel.Task, name int) { } } -// emitUmplementedEventIPv6 emits unimplemented event if name is valid. It +// emitUnimplementedEventIPv6 emits unimplemented event if name is valid. It // contains names that are common between Get and SetSockOpt when level is // SOL_IPV6. -func emitUmplementedEventIPv6(t *kernel.Task, name int) { +func emitUnimplementedEventIPv6(t *kernel.Task, name int) { switch name { case linux.IPV6_2292DSTOPTS, linux.IPV6_2292HOPLIMIT, @@ -1179,6 +1216,60 @@ func emitUmplementedEventIPv6(t *kernel.Task, name int) { } } +// emitUnimplementedEventIP emits unimplemented event if name is valid. It +// contains names that are common between Get and SetSockOpt when level is +// SOL_IP. 
+func emitUnimplementedEventIP(t *kernel.Task, name int) { + switch name { + case linux.IP_TOS, + linux.IP_TTL, + linux.IP_HDRINCL, + linux.IP_OPTIONS, + linux.IP_ROUTER_ALERT, + linux.IP_RECVOPTS, + linux.IP_RETOPTS, + linux.IP_PKTINFO, + linux.IP_PKTOPTIONS, + linux.IP_MTU_DISCOVER, + linux.IP_RECVERR, + linux.IP_RECVTTL, + linux.IP_RECVTOS, + linux.IP_MTU, + linux.IP_FREEBIND, + linux.IP_IPSEC_POLICY, + linux.IP_XFRM_POLICY, + linux.IP_PASSSEC, + linux.IP_TRANSPARENT, + linux.IP_ORIGDSTADDR, + linux.IP_MINTTL, + linux.IP_NODEFRAG, + linux.IP_CHECKSUM, + linux.IP_BIND_ADDRESS_NO_PORT, + linux.IP_RECVFRAGSIZE, + linux.IP_MULTICAST_IF, + linux.IP_MULTICAST_TTL, + linux.IP_MULTICAST_LOOP, + linux.IP_ADD_MEMBERSHIP, + linux.IP_DROP_MEMBERSHIP, + linux.IP_UNBLOCK_SOURCE, + linux.IP_BLOCK_SOURCE, + linux.IP_ADD_SOURCE_MEMBERSHIP, + linux.IP_DROP_SOURCE_MEMBERSHIP, + linux.IP_MSFILTER, + linux.MCAST_JOIN_GROUP, + linux.MCAST_BLOCK_SOURCE, + linux.MCAST_UNBLOCK_SOURCE, + linux.MCAST_LEAVE_GROUP, + linux.MCAST_JOIN_SOURCE_GROUP, + linux.MCAST_LEAVE_SOURCE_GROUP, + linux.MCAST_MSFILTER, + linux.IP_MULTICAST_ALL, + linux.IP_UNICAST_IF: + + t.Kernel().EmitUnimplementedEvent(t) + } +} + // isLinkLocal determines if the given IPv6 address is link-local. This is the // case when it has the fe80::/10 prefix. This check is used to determine when // the NICID is relevant for a given IPv6 address. 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 00d521400..0f9b406d8 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1875,6 +1875,24 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "socket_ip_udp_test_cases", + testonly = 1, + srcs = [ + "socket_ip_udp_generic.cc", + ], + hdrs = [ + "socket_ip_udp_generic.h", + ], + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + cc_binary( name = "socket_abstract_test", testonly = 1, @@ -2044,6 +2062,7 @@ cc_binary( deps = [ ":ip_socket_test_util", ":socket_generic_test_cases", + ":socket_ip_udp_test_cases", ":socket_non_stream_test_cases", ":socket_test_util", "//test/util:test_main", diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc new file mode 100644 index 000000000..789154fb3 --- /dev/null +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -0,0 +1,121 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/syscalls/linux/socket_ip_udp_generic.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +TEST_P(UDPSocketPairTest, MulticastTTLDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 1); +} + +TEST_P(UDPSocketPairTest, SetUDPMulticastTTLMin) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kMin = 0; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &kMin, sizeof(kMin)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kMin); +} + +TEST_P(UDPSocketPairTest, SetUDPMulticastTTLMax) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kMax = 255; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &kMax, sizeof(kMax)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kMax); +} + +TEST_P(UDPSocketPairTest, SetUDPMulticastTTLNegativeOne) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kArbitrary = 6; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &kArbitrary, sizeof(kArbitrary)), + SyscallSucceeds()); + + constexpr int 
kNegOne = -1; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &kNegOne, sizeof(kNegOne)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, 1); +} + +TEST_P(UDPSocketPairTest, SetUDPMulticastTTLBelowMin) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kBelowMin = -2; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &kBelowMin, sizeof(kBelowMin)), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(UDPSocketPairTest, SetUDPMulticastTTLAboveMax) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr int kAboveMax = 256; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &kAboveMax, sizeof(kAboveMax)), + SyscallFailsWithErrno(EINVAL)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_udp_generic.h b/test/syscalls/linux/socket_ip_udp_generic.h new file mode 100644 index 000000000..8b8fc7c6e --- /dev/null +++ b/test/syscalls/linux/socket_ip_udp_generic.h @@ -0,0 +1,29 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_GENERIC_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_GENERIC_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of connected UDP sockets. +using UDPSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_GENERIC_H_ diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc index 8a98fa8df..f3548469f 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback.cc @@ -16,6 +16,7 @@ #include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_generic.h" +#include "test/syscalls/linux/socket_ip_udp_generic.h" #include "test/syscalls/linux/socket_non_stream.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/util/test_util.h" @@ -44,5 +45,9 @@ INSTANTIATE_TEST_CASE_P( AllUnixDomainSockets, NonStreamSocketPairTest, ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); +INSTANTIATE_TEST_CASE_P( + UDPSockets, UDPSocketPairTest, + ::testing::ValuesIn(IncludeReversals(GetSocketPairs()))); + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 46e6577014c849d7306c63905db25f3c695fa7e7 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 27 Dec 2018 14:58:41 -0800 Subject: Fix deadlock between epoll_wait and getdents epoll_wait acquires EventPoll.listsMu (in EventPoll.ReadEvents) and then calls Inotify.Readiness which tries to acquire Inotify.evMu. getdents acquires Inotify.evMu (in Inotify.queueEvent) and then calls readyCallback.Callback which tries to acquire EventPoll.listsMu. The fix is to release Inotify.evMu before calling Queue.Notify. Queue is thread-safe and doesn't require Inotify.evMu to be held. 
Closes #121 PiperOrigin-RevId: 227066695 Change-Id: Id29364bb940d1727f33a5dff9a3c52f390c15761 --- pkg/sentry/fs/inotify.go | 13 +++++++--- test/syscalls/linux/BUILD | 3 +++ test/syscalls/linux/epoll.cc | 29 +--------------------- test/syscalls/linux/inotify.cc | 55 ++++++++++++++++++++++++++++++++++++++++++ test/util/BUILD | 13 ++++++++++ test/util/epoll_util.cc | 52 +++++++++++++++++++++++++++++++++++++++ test/util/epoll_util.h | 36 +++++++++++++++++++++++++++ 7 files changed, 169 insertions(+), 32 deletions(-) create mode 100644 test/util/epoll_util.cc create mode 100644 test/util/epoll_util.h (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index f251df0d1..51ece5ed0 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -42,14 +42,14 @@ type Inotify struct { // user, since we may aggressively reuse an id on S/R. id uint64 - // evMu *only* protects the event queue. We need a separate lock because + waiter.Queue `state:"nosave"` + + // evMu *only* protects the events list. We need a separate lock because // while queuing events, a watch needs to lock the event queue, and using mu // for that would violate lock ordering since at that point the calling // goroutine already holds Watch.target.Watches.mu. evMu sync.Mutex `state:"nosave"` - waiter.Queue `state:"nosave"` - // A list of pending events for this inotify instance. Protected by evMu. events ilist.List @@ -212,7 +212,6 @@ func (i *Inotify) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArg func (i *Inotify) queueEvent(ev *Event) { i.evMu.Lock() - defer i.evMu.Unlock() // Check if we should coalesce the event we're about to queue with the last // one currently in the queue. Events are coalesced if they are identical. @@ -221,11 +220,17 @@ func (i *Inotify) queueEvent(ev *Event) { // "Coalesce" the two events by simply not queuing the new one. 
We // don't need to raise a waiter.EventIn notification because no new // data is available for reading. + i.evMu.Unlock() return } } i.events.PushBack(ev) + + // Release mutex before notifying waiters because we don't control what they + // can do. + i.evMu.Unlock() + i.Queue.Notify(waiter.EventIn) } diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 0f9b406d8..ae33d14da 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -496,6 +496,7 @@ cc_binary( srcs = ["epoll.cc"], linkstatic = 1, deps = [ + "//test/util:epoll_util", "//test/util:file_descriptor", "//test/util:posix_error", "//test/util:test_main", @@ -844,6 +845,7 @@ cc_binary( srcs = ["inotify.cc"], linkstatic = 1, deps = [ + "//test/util:epoll_util", "//test/util:file_descriptor", "//test/util:fs_util", "//test/util:temp_path", @@ -852,6 +854,7 @@ cc_binary( "//test/util:thread_util", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/time", ], ) diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index 9ae87c00b..46fba7b2d 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -25,6 +25,7 @@ #include #include "gtest/gtest.h" +#include "test/util/epoll_util.h" #include "test/util/file_descriptor.h" #include "test/util/posix_error.h" #include "test/util/test_util.h" @@ -37,18 +38,6 @@ namespace { constexpr int kFDsPerEpoll = 3; constexpr uint64_t kMagicConstant = 0x0102030405060708; -// Returns a new epoll file descriptor. -PosixErrorOr NewEpollFD() { - // "Since Linux 2.6.8, the size argument is ignored, but must be greater than - // zero." - epoll_create(2) - int fd = epoll_create(/* size = */ 1); - MaybeSave(); - if (fd < 0) { - return PosixError(errno, "epoll_create"); - } - return FileDescriptor(fd); -} - // Returns a new eventfd. 
PosixErrorOr NewEventFD() { int fd = eventfd(/* initval = */ 0, /* flags = */ 0); @@ -59,22 +48,6 @@ PosixErrorOr NewEventFD() { return FileDescriptor(fd); } -// Registers `target_fd` with the epoll instance represented by `epoll_fd` for -// the epoll events `events`. Events on `target_fd` will be indicated by setting -// data.u64 to `data` in the returned epoll_event. -PosixError RegisterEpollFD(int epoll_fd, int target_fd, int events, - uint64_t data) { - struct epoll_event event; - event.events = events; - event.data.u64 = data; - int rc = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, target_fd, &event); - MaybeSave(); - if (rc < 0) { - return PosixError(errno, "epoll_ctl"); - } - return NoError(); -} - uint64_t ms_elapsed(const struct timespec* begin, const struct timespec* end) { return (end->tv_sec - begin->tv_sec) * 1000 + (end->tv_nsec - begin->tv_nsec) / 1000000; diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 0e361496c..167ca44a8 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -14,9 +14,12 @@ #include #include +#include +#include #include #include +#include #include #include #include @@ -24,6 +27,9 @@ #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "test/util/epoll_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" #include "test/util/temp_path.h" @@ -1512,6 +1518,55 @@ TEST(Inotify, ControlEvents) { ASSERT_THAT(events2, Are({})); } +// Regression test to ensure epoll and directory access doesn't deadlock. +TEST(Inotify, EpollNoDeadlock) { + const DisableSave ds; // Too many syscalls. + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + // Create lots of directories and watch all of them. 
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + std::vector children; + for (size_t i = 0; i < 1000; ++i) { + auto child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path())); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), child.path(), IN_ACCESS)); + children.emplace_back(std::move(child)); + } + + // Run epoll_wait constantly in a separate thread. + std::atomic done(false); + ScopedThread th([&fd, &done] { + for (auto start = absl::Now(); absl::Now() - start < absl::Seconds(5);) { + FileDescriptor epoll_fd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD()); + ASSERT_NO_ERRNO(RegisterEpollFD(epoll_fd.get(), fd.get(), + EPOLLIN | EPOLLOUT | EPOLLET, 0)); + struct epoll_event result[1]; + EXPECT_THAT(RetryEINTR(epoll_wait)(epoll_fd.get(), result, 1, -1), + SyscallSucceedsWithValue(1)); + + sched_yield(); + } + done = true; + }); + + // While epoll thread is running, constantly access all directories to + // generate inotify events. + while (!done) { + std::vector files = + ASSERT_NO_ERRNO_AND_VALUE(ListDir(root.path(), false)); + ASSERT_EQ(files.size(), 1002); + for (const auto& child : files) { + if (child == "." 
|| child == "..") { + continue; + } + ASSERT_NO_ERRNO_AND_VALUE(ListDir(JoinPath(root.path(), child), false)); + } + sched_yield(); + } +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/util/BUILD b/test/util/BUILD index f981a8d1d..10507eae4 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -259,3 +259,16 @@ cc_library( srcs = ["test_main.cc"], deps = [":test_util"], ) + +cc_library( + name = "epoll_util", + testonly = 1, + srcs = ["epoll_util.cc"], + hdrs = ["epoll_util.h"], + deps = [ + ":file_descriptor", + ":posix_error", + ":save_util", + "@com_google_googletest//:gtest", + ], +) diff --git a/test/util/epoll_util.cc b/test/util/epoll_util.cc new file mode 100644 index 000000000..0b95aa8cd --- /dev/null +++ b/test/util/epoll_util.cc @@ -0,0 +1,52 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/util/epoll_util.h" + +#include + +#include "gmock/gmock.h" +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" + +namespace gvisor { +namespace testing { + +PosixErrorOr NewEpollFD(int size) { + // "Since Linux 2.6.8, the size argument is ignored, but must be greater than + // zero." 
- epoll_create(2) + int fd = epoll_create(size); + MaybeSave(); + if (fd < 0) { + return PosixError(errno, "epoll_create"); + } + return FileDescriptor(fd); +} + +PosixError RegisterEpollFD(int epoll_fd, int target_fd, int events, + uint64_t data) { + struct epoll_event event; + event.events = events; + event.data.u64 = data; + int rc = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, target_fd, &event); + MaybeSave(); + if (rc < 0) { + return PosixError(errno, "epoll_ctl"); + } + return NoError(); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/epoll_util.h b/test/util/epoll_util.h new file mode 100644 index 000000000..521e7a3d3 --- /dev/null +++ b/test/util/epoll_util.h @@ -0,0 +1,36 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_UTIL_EPOLL_UTIL_H_ +#define GVISOR_TEST_UTIL_EPOLL_UTIL_H_ + +#include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" + +namespace gvisor { +namespace testing { + +// Returns a new epoll file descriptor. +PosixErrorOr NewEpollFD(int size = 1); + +// Registers `target_fd` with the epoll instance represented by `epoll_fd` for +// the epoll events `events`. Events on `target_fd` will be indicated by setting +// data.u64 to `data` in the returned epoll_event. 
+PosixError RegisterEpollFD(int epoll_fd, int target_fd, int events, + uint64_t data); + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_UTIL_EPOLL_UTIL_H_ -- cgit v1.2.3 From 652d068119052b0b3bc4a0808a4400a22380a30b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 28 Dec 2018 11:26:01 -0800 Subject: Implement SO_REUSEPORT for TCP and UDP sockets This option allows multiple sockets to be bound to the same port. Incoming packets are distributed to sockets using a hash based on source and destination addresses. This means that all packets from one sender will be received by the same server socket. PiperOrigin-RevId: 227153413 Change-Id: I59b6edda9c2209d5b8968671e9129adb675920cf --- pkg/sentry/socket/epsocket/epsocket.go | 20 ++ pkg/sentry/socket/rpcinet/socket.go | 5 +- pkg/tcpip/hash/jenkins/BUILD | 21 ++ pkg/tcpip/hash/jenkins/jenkins.go | 80 ++++++++ pkg/tcpip/hash/jenkins/jenkins_test.go | 176 +++++++++++++++++ pkg/tcpip/ports/BUILD | 4 +- pkg/tcpip/ports/ports.go | 74 +++++-- pkg/tcpip/ports/ports_test.go | 134 ++++++++----- pkg/tcpip/stack/BUILD | 1 + pkg/tcpip/stack/stack.go | 12 +- pkg/tcpip/stack/transport_demuxer.go | 144 +++++++++++++- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 4 + pkg/tcpip/transport/ping/endpoint.go | 8 +- pkg/tcpip/transport/tcp/accept.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 34 +++- pkg/tcpip/transport/udp/endpoint.go | 30 ++- pkg/tcpip/transport/udp/udp_test.go | 85 ++++++++ test/syscalls/linux/BUILD | 4 + test/syscalls/linux/socket_inet_loopback.cc | 289 ++++++++++++++++++++++++++++ test/syscalls/syscall_test_runner.go | 1 + 21 files changed, 1025 insertions(+), 105 deletions(-) create mode 100644 pkg/tcpip/hash/jenkins/BUILD create mode 100644 pkg/tcpip/hash/jenkins/jenkins.go create mode 100644 pkg/tcpip/hash/jenkins/jenkins_test.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 1b9c75949..d65b5f49e 
100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -634,6 +634,18 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return int32(v), nil + case linux.SO_REUSEPORT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.ReusePortOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + case linux.SO_KEEPALIVE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument @@ -900,6 +912,14 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v))) + case linux.SO_REUSEPORT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v))) + case linux.SO_PASSCRED: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 257bc2d71..8c8ebadb7 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -285,7 +285,10 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if blocking && se == syserr.ErrTryAgain { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) - s.EventRegister(&e, waiter.EventIn) + // FIXME: This waiter.EventHUp is a partial + // measure, need to figure out how to translate linux events to + // internal events. 
+ s.EventRegister(&e, waiter.EventIn|waiter.EventHUp) defer s.EventUnregister(&e) // Try to accept the connection again; if it fails, then wait until we diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD new file mode 100644 index 000000000..bbb764db8 --- /dev/null +++ b/pkg/tcpip/hash/jenkins/BUILD @@ -0,0 +1,21 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) # Apache 2.0 + +go_library( + name = "jenkins", + srcs = ["jenkins.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/hash/jenkins", + visibility = [ + "//visibility:public", + ], +) + +go_test( + name = "jenkins_test", + size = "small", + srcs = [ + "jenkins_test.go", + ], + embed = [":jenkins"], +) diff --git a/pkg/tcpip/hash/jenkins/jenkins.go b/pkg/tcpip/hash/jenkins/jenkins.go new file mode 100644 index 000000000..e66d5f12b --- /dev/null +++ b/pkg/tcpip/hash/jenkins/jenkins.go @@ -0,0 +1,80 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package jenkins implements Jenkins's one_at_a_time, non-cryptographic hash +// functions created by by Bob Jenkins. +// +// See https://en.wikipedia.org/wiki/Jenkins_hash_function#cite_note-dobbsx-1 +// +package jenkins + +import ( + "hash" +) + +// Sum32 represents Jenkins's one_at_a_time hash. +// +// Use the Sum32 type directly (as opposed to New32 below) +// to avoid allocations. 
+type Sum32 uint32 + +// New32 returns a new 32-bit Jenkins's one_at_a_time hash.Hash. +// +// Its Sum method will lay the value out in big-endian byte order. +func New32() hash.Hash32 { + var s Sum32 + return &s +} + +// Reset resets the hash to its initial state. +func (s *Sum32) Reset() { *s = 0 } + +// Sum32 returns the hash value +func (s *Sum32) Sum32() uint32 { + hash := *s + + hash += (hash << 3) + hash ^= hash >> 11 + hash += hash << 15 + + return uint32(hash) +} + +// Write adds more data to the running hash. +// +// It never returns an error. +func (s *Sum32) Write(data []byte) (int, error) { + hash := *s + for _, b := range data { + hash += Sum32(b) + hash += hash << 10 + hash ^= hash >> 6 + } + *s = hash + return len(data), nil +} + +// Size returns the number of bytes Sum will return. +func (s *Sum32) Size() int { return 4 } + +// BlockSize returns the hash's underlying block size. +func (s *Sum32) BlockSize() int { return 1 } + +// Sum appends the current hash to in and returns the resulting slice. +// +// It does not change the underlying hash state. +func (s *Sum32) Sum(in []byte) []byte { + v := s.Sum32() + return append(in, byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) +} diff --git a/pkg/tcpip/hash/jenkins/jenkins_test.go b/pkg/tcpip/hash/jenkins/jenkins_test.go new file mode 100644 index 000000000..9d86174aa --- /dev/null +++ b/pkg/tcpip/hash/jenkins/jenkins_test.go @@ -0,0 +1,176 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +package jenkins + +import ( + "bytes" + "encoding/binary" + "hash" + "hash/fnv" + "math" + "testing" +) + +func TestGolden32(t *testing.T) { + var golden32 = []struct { + out []byte + in string + }{ + {[]byte{0x00, 0x00, 0x00, 0x00}, ""}, + {[]byte{0xca, 0x2e, 0x94, 0x42}, "a"}, + {[]byte{0x45, 0xe6, 0x1e, 0x58}, "ab"}, + {[]byte{0xed, 0x13, 0x1f, 0x5b}, "abc"}, + } + + hash := New32() + + for _, g := range golden32 { + hash.Reset() + done, error := hash.Write([]byte(g.in)) + if error != nil { + t.Fatalf("write error: %s", error) + } + if done != len(g.in) { + t.Fatalf("wrote only %d out of %d bytes", done, len(g.in)) + } + if actual := hash.Sum(nil); !bytes.Equal(g.out, actual) { + t.Errorf("hash(%q) = 0x%x want 0x%x", g.in, actual, g.out) + } + } +} + +func TestIntegrity32(t *testing.T) { + data := []byte{'1', '2', 3, 4, 5} + + h := New32() + h.Write(data) + sum := h.Sum(nil) + + if size := h.Size(); size != len(sum) { + t.Fatalf("Size()=%d but len(Sum())=%d", size, len(sum)) + } + + if a := h.Sum(nil); !bytes.Equal(sum, a) { + t.Fatalf("first Sum()=0x%x, second Sum()=0x%x", sum, a) + } + + h.Reset() + h.Write(data) + if a := h.Sum(nil); !bytes.Equal(sum, a) { + t.Fatalf("Sum()=0x%x, but after Reset() Sum()=0x%x", sum, a) + } + + h.Reset() + h.Write(data[:2]) + h.Write(data[2:]) + if a := h.Sum(nil); !bytes.Equal(sum, a) { + t.Fatalf("Sum()=0x%x, but with partial writes, Sum()=0x%x", sum, a) + } + + sum32 := h.(hash.Hash32).Sum32() + if sum32 != binary.BigEndian.Uint32(sum) { + t.Fatalf("Sum()=0x%x, but Sum32()=0x%x", sum, sum32) + } +} + +func BenchmarkJenkins32KB(b *testing.B) { + h := New32() + + b.SetBytes(1024) + data := make([]byte, 1024) + for i := range data { + data[i] = byte(i) + } + in := make([]byte, 0, h.Size()) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + h.Reset() + h.Write(data) + h.Sum(in) + } +} + +func BenchmarkFnv32(b *testing.B) { 
+ arr := make([]int64, 1000) + for i := 0; i < b.N; i++ { + var payload [8]byte + binary.BigEndian.PutUint32(payload[:4], uint32(i)) + binary.BigEndian.PutUint32(payload[4:], uint32(i)) + + h := fnv.New32() + h.Write(payload[:]) + idx := int(h.Sum32()) % len(arr) + arr[idx]++ + } + b.StopTimer() + c := 0 + if b.N > 1000000 { + for i := 0; i < len(arr)-1; i++ { + if math.Abs(float64(arr[i]-arr[i+1]))/float64(arr[i]) > float64(0.1) { + if c == 0 { + b.Logf("i %d val[i] %d val[i+1] %d b.N %b\n", i, arr[i], arr[i+1], b.N) + } + c++ + } + } + if c > 0 { + b.Logf("Unbalanced buckets: %d", c) + } + } +} + +func BenchmarkSum32(b *testing.B) { + arr := make([]int64, 1000) + for i := 0; i < b.N; i++ { + var payload [8]byte + binary.BigEndian.PutUint32(payload[:4], uint32(i)) + binary.BigEndian.PutUint32(payload[4:], uint32(i)) + h := Sum32(0) + h.Write(payload[:]) + idx := int(h.Sum32()) % len(arr) + arr[idx]++ + } + b.StopTimer() + if b.N > 1000000 { + for i := 0; i < len(arr)-1; i++ { + if math.Abs(float64(arr[i]-arr[i+1]))/float64(arr[i]) > float64(0.1) { + b.Logf("val[%3d]=%8d\tval[%3d]=%8d\tb.N=%b\n", i, arr[i], i+1, arr[i+1], b.N) + break + } + } + } +} + +func BenchmarkNew32(b *testing.B) { + arr := make([]int64, 1000) + for i := 0; i < b.N; i++ { + var payload [8]byte + binary.BigEndian.PutUint32(payload[:4], uint32(i)) + binary.BigEndian.PutUint32(payload[4:], uint32(i)) + h := New32() + h.Write(payload[:]) + idx := int(h.Sum32()) % len(arr) + arr[idx]++ + } + b.StopTimer() + if b.N > 1000000 { + for i := 0; i < len(arr)-1; i++ { + if math.Abs(float64(arr[i]-arr[i+1]))/float64(arr[i]) > float64(0.1) { + b.Logf("val[%3d]=%8d\tval[%3d]=%8d\tb.N=%b\n", i, arr[i], i+1, arr[i+1], b.N) + break + } + } + } +} diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index c69fc0744..a2fa9b84a 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -7,7 +7,9 @@ go_library( srcs = ["ports.go"], importpath = "gvisor.googlesource.com/gvisor/pkg/tcpip/ports", 
visibility = ["//:sandbox"], - deps = ["//pkg/tcpip"], + deps = [ + "//pkg/tcpip", + ], ) go_test( diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index 41ef32921..d212a5792 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -42,23 +42,47 @@ type PortManager struct { allocatedPorts map[portDescriptor]bindAddresses } +type portNode struct { + reuse bool + refs int +} + // bindAddresses is a set of IP addresses. -type bindAddresses map[tcpip.Address]struct{} +type bindAddresses map[tcpip.Address]portNode // isAvailable checks whether an IP address is available to bind to. -func (b bindAddresses) isAvailable(addr tcpip.Address) bool { +func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool) bool { if addr == anyIPAddress { - return len(b) == 0 + if len(b) == 0 { + return true + } + if !reuse { + return false + } + for _, n := range b { + if !n.reuse { + return false + } + } + return true } // If all addresses for this portDescriptor are already bound, no // address is available. - if _, ok := b[anyIPAddress]; ok { - return false + if n, ok := b[anyIPAddress]; ok { + if !reuse { + return false + } + if !n.reuse { + return false + } } - if _, ok := b[addr]; ok { - return false + if n, ok := b[addr]; ok { + if !reuse { + return false + } + return n.reuse } return true } @@ -92,17 +116,17 @@ func (s *PortManager) PickEphemeralPort(testPort func(p uint16) (bool, *tcpip.Er } // IsPortAvailable tests if the given port is available on all given protocols. 
-func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) bool { +func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool) bool { s.mu.Lock() defer s.mu.Unlock() - return s.isPortAvailableLocked(networks, transport, addr, port) + return s.isPortAvailableLocked(networks, transport, addr, port, reuse) } -func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) bool { +func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool) bool { for _, network := range networks { desc := portDescriptor{network, transport, port} if addrs, ok := s.allocatedPorts[desc]; ok { - if !addrs.isAvailable(addr) { + if !addrs.isAvailable(addr, reuse) { return false } } @@ -114,14 +138,14 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb // reserved by another endpoint. If port is zero, ReservePort will search for // an unreserved ephemeral port and reserve it, returning its value in the // "port" return value. -func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) (reservedPort uint16, err *tcpip.Error) { +func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool) (reservedPort uint16, err *tcpip.Error) { s.mu.Lock() defer s.mu.Unlock() // If a port is specified, just try to reserve it for all network // protocols. 
if port != 0 { - if !s.reserveSpecificPort(networks, transport, addr, port) { + if !s.reserveSpecificPort(networks, transport, addr, port, reuse) { return 0, tcpip.ErrPortInUse } return port, nil @@ -129,13 +153,13 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp // A port wasn't specified, so try to find one. return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { - return s.reserveSpecificPort(networks, transport, addr, p), nil + return s.reserveSpecificPort(networks, transport, addr, p, reuse), nil }) } // reserveSpecificPort tries to reserve the given port on all given protocols. -func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) bool { - if !s.isPortAvailableLocked(networks, transport, addr, port) { +func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool) bool { + if !s.isPortAvailableLocked(networks, transport, addr, port, reuse) { return false } @@ -147,7 +171,12 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber m = make(bindAddresses) s.allocatedPorts[desc] = m } - m[addr] = struct{}{} + if n, ok := m[addr]; ok { + n.refs++ + m[addr] = n + } else { + m[addr] = portNode{reuse: reuse, refs: 1} + } } return true @@ -162,7 +191,16 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp for _, network := range networks { desc := portDescriptor{network, transport, port} if m, ok := s.allocatedPorts[desc]; ok { - delete(m, addr) + n, ok := m[addr] + if !ok { + continue + } + n.refs-- + if n.refs == 0 { + delete(m, addr) + } else { + m[addr] = n + } if len(m) == 0 { delete(s.allocatedPorts, desc) } diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go index 72577dfcb..01e7320b4 100644 --- a/pkg/tcpip/ports/ports_test.go +++ 
b/pkg/tcpip/ports/ports_test.go @@ -28,67 +28,99 @@ const ( fakeIPAddress1 = tcpip.Address("\x08\x08\x08\x09") ) -func TestPortReservation(t *testing.T) { - pm := NewPortManager() - net := []tcpip.NetworkProtocolNumber{fakeNetworkNumber} +type portReserveTestAction struct { + port uint16 + ip tcpip.Address + want *tcpip.Error + reuse bool + release bool +} +func TestPortReservation(t *testing.T) { for _, test := range []struct { - port uint16 - ip tcpip.Address - want *tcpip.Error + tname string + actions []portReserveTestAction }{ { - port: 80, - ip: fakeIPAddress, - want: nil, - }, - { - port: 80, - ip: fakeIPAddress1, - want: nil, - }, - { - /* N.B. Order of tests matters! */ - port: 80, - ip: anyIPAddress, - want: tcpip.ErrPortInUse, - }, - { - port: 22, - ip: anyIPAddress, - want: nil, - }, - { - port: 22, - ip: fakeIPAddress, - want: tcpip.ErrPortInUse, - }, - { - port: 0, - ip: fakeIPAddress, - want: nil, + tname: "bind to ip", + actions: []portReserveTestAction{ + {port: 80, ip: fakeIPAddress, want: nil}, + {port: 80, ip: fakeIPAddress1, want: nil}, + /* N.B. Order of tests matters! 
*/ + {port: 80, ip: anyIPAddress, want: tcpip.ErrPortInUse}, + {port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true}, + }, }, { - port: 0, - ip: fakeIPAddress, - want: nil, + tname: "bind to inaddr any", + actions: []portReserveTestAction{ + {port: 22, ip: anyIPAddress, want: nil}, + {port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse}, + /* release fakeIPAddress, but anyIPAddress is still inuse */ + {port: 22, ip: fakeIPAddress, release: true}, + {port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse}, + {port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true}, + /* Release port 22 from any IP address, then try to reserve fake IP address on 22 */ + {port: 22, ip: anyIPAddress, want: nil, release: true}, + {port: 22, ip: fakeIPAddress, want: nil}, + }, + }, { + tname: "bind to zero port", + actions: []portReserveTestAction{ + {port: 00, ip: fakeIPAddress, want: nil}, + {port: 00, ip: fakeIPAddress, want: nil}, + {port: 00, ip: fakeIPAddress, reuse: true, want: nil}, + }, + }, { + tname: "bind to ip with reuseport", + actions: []portReserveTestAction{ + {port: 25, ip: fakeIPAddress, reuse: true, want: nil}, + {port: 25, ip: fakeIPAddress, reuse: true, want: nil}, + + {port: 25, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse}, + {port: 25, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse}, + + {port: 25, ip: anyIPAddress, reuse: true, want: nil}, + }, + }, { + tname: "bind to inaddr any with reuseport", + actions: []portReserveTestAction{ + {port: 24, ip: anyIPAddress, reuse: true, want: nil}, + {port: 24, ip: anyIPAddress, reuse: true, want: nil}, + + {port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse}, + {port: 24, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse}, + + {port: 24, ip: fakeIPAddress, reuse: true, want: nil}, + {port: 24, ip: fakeIPAddress, release: true, want: nil}, + + {port: 24, ip: anyIPAddress, release: true}, + {port: 24, ip: anyIPAddress, reuse: false, want: 
tcpip.ErrPortInUse}, + + {port: 24, ip: anyIPAddress, release: true}, + {port: 24, ip: anyIPAddress, reuse: false, want: nil}, + }, }, } { - gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port) - if err != test.want { - t.Fatalf("ReservePort(.., .., %s, %d) = %v, want %v", test.ip, test.port, err, test.want) - } - if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) { - t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral) - } - } + t.Run(test.tname, func(t *testing.T) { + pm := NewPortManager() + net := []tcpip.NetworkProtocolNumber{fakeNetworkNumber} - // Release port 22 from any IP address, then try to reserve fake IP - // address on 22. - pm.ReleasePort(net, fakeTransNumber, anyIPAddress, 22) + for _, test := range test.actions { + if test.release { + pm.ReleasePort(net, fakeTransNumber, test.ip, test.port) + continue + } + gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.reuse) + if err != test.want { + t.Fatalf("ReservePort(.., .., %s, %d, %t) = %v, want %v", test.ip, test.port, test.release, err, test.want) + } + if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) { + t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral) + } + } + }) - if port, err := pm.ReservePort(net, fakeTransNumber, fakeIPAddress, 22); port != 22 || err != nil { - t.Fatalf("ReservePort(.., .., .., %d) = (port %d, err %v), want (22, nil); failed to reserve port after it should have been released", 22, port, err) } } diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 90cc05cda..9ff1c8731 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -22,6 +22,7 @@ go_library( "//pkg/sleep", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/hash/jenkins", "//pkg/tcpip/header", "//pkg/tcpip/ports", "//pkg/tcpip/seqnum", diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 
0ac116675..7aa9dbd46 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -883,9 +883,9 @@ func (s *Stack) RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep. // transport dispatcher. Received packets that match the provided id will be // delivered to the given endpoint; specifying a nic is optional, but // nic-specific IDs have precedence over global ones. -func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) *tcpip.Error { +func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error { if nicID == 0 { - return s.demux.registerEndpoint(netProtos, protocol, id, ep) + return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort) } s.mu.RLock() @@ -896,14 +896,14 @@ func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.N return tcpip.ErrUnknownNICID } - return nic.demux.registerEndpoint(netProtos, protocol, id, ep) + return nic.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort) } // UnregisterTransportEndpoint removes the endpoint with the given id from the // stack transport dispatcher. 
-func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID) { +func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) { if nicID == 0 { - s.demux.unregisterEndpoint(netProtos, protocol, id) + s.demux.unregisterEndpoint(netProtos, protocol, id, ep) return } @@ -912,7 +912,7 @@ func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip nic := s.nics[nicID] if nic != nil { - nic.demux.unregisterEndpoint(netProtos, protocol, id) + nic.demux.unregisterEndpoint(netProtos, protocol, id, ep) } } diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index c8522ad9e..a5ff2159a 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -15,10 +15,12 @@ package stack import ( + "math/rand" "sync" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" + "gvisor.googlesource.com/gvisor/pkg/tcpip/hash/jenkins" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" ) @@ -34,6 +36,23 @@ type transportEndpoints struct { endpoints map[TransportEndpointID]TransportEndpoint } +// unregisterEndpoint unregisters the endpoint with the given id such that it +// won't receive any more packets. +func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint) { + eps.mu.Lock() + defer eps.mu.Unlock() + e, ok := eps.endpoints[id] + if !ok { + return + } + if multiPortEp, ok := e.(*multiPortEndpoint); ok { + if !multiPortEp.unregisterEndpoint(ep) { + return + } + } + delete(eps.endpoints, id) +} + // transportDemuxer demultiplexes packets targeted at a transport endpoint // (i.e., after they've been parsed by the network layer). 
It does two levels // of demultiplexing: first based on the network and transport protocols, then @@ -57,10 +76,10 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer { // registerEndpoint registers the given endpoint with the dispatcher such that // packets that match the endpoint ID are delivered to it. -func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) *tcpip.Error { +func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error { for i, n := range netProtos { - if err := d.singleRegisterEndpoint(n, protocol, id, ep); err != nil { - d.unregisterEndpoint(netProtos[:i], protocol, id) + if err := d.singleRegisterEndpoint(n, protocol, id, ep, reusePort); err != nil { + d.unregisterEndpoint(netProtos[:i], protocol, id, ep) return err } } @@ -68,7 +87,97 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum return nil } -func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) *tcpip.Error { +// multiPortEndpoint is a container for TransportEndpoints which are bound to +// the same pair of address and port. +type multiPortEndpoint struct { + mu sync.RWMutex + endpointsArr []TransportEndpoint + endpointsMap map[TransportEndpoint]int + // seed is a random secret for a jenkins hash. + seed uint32 +} + +// reciprocalScale scales a value into range [0, n). +// +// This is similar to val % n, but faster. 
+// See http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ +func reciprocalScale(val, n uint32) uint32 { + return uint32((uint64(val) * uint64(n)) >> 32) +} + +// selectEndpoint calculates a hash of destination and source addresses and +// ports then uses it to select a socket. In this case, all packets from one +// address will be sent to same endpoint. +func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID) TransportEndpoint { + ep.mu.RLock() + defer ep.mu.RUnlock() + + payload := []byte{ + byte(id.LocalPort), + byte(id.LocalPort >> 8), + byte(id.RemotePort), + byte(id.RemotePort >> 8), + } + + h := jenkins.Sum32(ep.seed) + h.Write(payload) + h.Write([]byte(id.LocalAddress)) + h.Write([]byte(id.RemoteAddress)) + hash := h.Sum32() + + idx := reciprocalScale(hash, uint32(len(ep.endpointsArr))) + return ep.endpointsArr[idx] +} + +// HandlePacket is called by the stack when new packets arrive to this transport +// endpoint. +func (ep *multiPortEndpoint) HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) { + ep.selectEndpoint(id).HandlePacket(r, id, vv) +} + +// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. +func (ep *multiPortEndpoint) HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) { + ep.selectEndpoint(id).HandleControlPacket(id, typ, extra, vv) +} + +func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint) { + ep.mu.Lock() + defer ep.mu.Unlock() + + // A new endpoint is added into endpointsArr and its index there is + // saved in endpointsMap. This will allows to remove endpoint from + // the array fast. + ep.endpointsMap[ep] = len(ep.endpointsArr) + ep.endpointsArr = append(ep.endpointsArr, t) +} + +// unregisterEndpoint returns true if multiPortEndpoint has to be unregistered. 
+func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint) bool { + ep.mu.Lock() + defer ep.mu.Unlock() + + idx, ok := ep.endpointsMap[t] + if !ok { + return false + } + delete(ep.endpointsMap, t) + l := len(ep.endpointsArr) + if l > 1 { + // The last endpoint in endpointsArr is moved instead of the deleted one. + lastEp := ep.endpointsArr[l-1] + ep.endpointsArr[idx] = lastEp + ep.endpointsMap[lastEp] = idx + ep.endpointsArr = ep.endpointsArr[0 : l-1] + return false + } + return true +} + +func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error { + if id.RemotePort != 0 { + reusePort = false + } + eps, ok := d.protocol[protocolIDs{netProto, protocol}] if !ok { return nil @@ -77,10 +186,29 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol eps.mu.Lock() defer eps.mu.Unlock() + var multiPortEp *multiPortEndpoint if _, ok := eps.endpoints[id]; ok { - return tcpip.ErrPortInUse + if !reusePort { + return tcpip.ErrPortInUse + } + multiPortEp, ok = eps.endpoints[id].(*multiPortEndpoint) + if !ok { + return tcpip.ErrPortInUse + } } + if reusePort { + if multiPortEp == nil { + multiPortEp = &multiPortEndpoint{} + multiPortEp.endpointsMap = make(map[TransportEndpoint]int) + multiPortEp.seed = rand.Uint32() + eps.endpoints[id] = multiPortEp + } + + multiPortEp.singleRegisterEndpoint(ep) + + return nil + } eps.endpoints[id] = ep return nil @@ -88,12 +216,10 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol // unregisterEndpoint unregisters the endpoint with the given id such that it // won't receive any more packets. 
-func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID) { +func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) { for _, n := range netProtos { if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok { - eps.mu.Lock() - delete(eps.endpoints, id) - eps.mu.Unlock() + eps.unregisterEndpoint(id, ep) } } } diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index f09760180..022207081 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -107,7 +107,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // Try to register so that we can start receiving packets. f.id.RemoteAddress = addr.Addr - err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.id, f) + err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.id, f, false) if err != nil { return err } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 627786808..7d4fbe075 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -436,6 +436,10 @@ type CorkOption int // should allow reuse of local address. type ReuseAddressOption int +// ReusePortOption is used by SetSockOpt/GetSockOpt to permit multiple sockets +// to be bound to an identical socket address. +type ReusePortOption int + // QuickAckOption is stubbed out in SetSockOpt/GetSockOpt. 
type QuickAckOption int diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index d1b9b136c..29f6c543d 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -100,7 +100,7 @@ func (e *endpoint) Close() { e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite switch e.state { case stateBound, stateConnected: - e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id) + e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e) } // Close the receive list and drain it. @@ -541,14 +541,14 @@ func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.Networ if id.LocalPort != 0 { // The endpoint already has a local port, just attempt to // register it. - err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e) + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false) return id, err } // We need to find a port for the endpoint. _, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { id.LocalPort = p - err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e) + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, e.transProto, id, e, false) switch err { case nil: return true, nil @@ -597,7 +597,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error if commit != nil { if err := commit(); err != nil { // Unregister, the commit failed. 
- e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, e.transProto, id) + e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, e.transProto, id, e) return err } } diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index d0e1d6782..78d2c76e0 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -215,7 +215,7 @@ func (l *listenContext) createConnectedEndpoint(s *segment, iss seqnum.Value, ir n.maybeEnableSACKPermitted(rcvdSynOpts) // Register new endpoint so that packets are routed to it. - if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.id, n); err != nil { + if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.id, n, n.reusePort); err != nil { n.Close() return nil, err } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index d4eda50ec..5281f8be2 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -162,6 +162,9 @@ type endpoint struct { // sack holds TCP SACK related information for this endpoint. sack SACKInfo + // reusePort is set to true if SO_REUSEPORT is enabled. + reusePort bool + // delay enables Nagle's algorithm. // // delay is a boolean (0 is false) and must be accessed atomically. 
@@ -416,7 +419,7 @@ func (e *endpoint) Close() { e.isPortReserved = false if e.isRegistered { - e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id) + e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e) e.isRegistered = false } } @@ -453,7 +456,7 @@ func (e *endpoint) cleanupLocked() { e.workerCleanup = false if e.isRegistered { - e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id) + e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e) } e.route.Release() @@ -681,6 +684,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.mu.Unlock() return nil + case tcpip.ReusePortOption: + e.mu.Lock() + e.reusePort = v != 0 + e.mu.Unlock() + return nil + case tcpip.QuickAckOption: if v == 0 { atomic.StoreUint32(&e.slowAck, 1) @@ -875,6 +884,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } return nil + case *tcpip.ReusePortOption: + e.mu.RLock() + v := e.reusePort + e.mu.RUnlock() + + *o = 0 + if v { + *o = 1 + } + return nil + case *tcpip.QuickAckOption: *o = 1 if v := atomic.LoadUint32(&e.slowAck); v != 0 { @@ -1057,7 +1077,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er if e.id.LocalPort != 0 { // The endpoint is bound to a port, attempt to register it. 
- err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.id, e) + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.id, e, e.reusePort) if err != nil { return err } @@ -1071,13 +1091,13 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er if sameAddr && p == e.id.RemotePort { return false, nil } - if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.id.LocalAddress, p) { + if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.id.LocalAddress, p, false) { return false, nil } id := e.id id.LocalPort = p - switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e) { + switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort) { case nil: e.id = id return true, nil @@ -1234,7 +1254,7 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) { } // Register the endpoint. - if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e); err != nil { + if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e, e.reusePort); err != nil { return err } @@ -1315,7 +1335,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) (err } } - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port) + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort) if err != nil { return err } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 67e9ca0ac..b2a27a7cb 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -81,6 +81,7 @@ type endpoint struct { dstPort uint16 v6only bool multicastTTL uint8 + reusePort bool // shutdownFlags represent the current shutdown state of the endpoint. 
shutdownFlags tcpip.ShutdownFlags @@ -132,7 +133,7 @@ func NewConnectedEndpoint(stack *stack.Stack, r *stack.Route, id stack.Transport ep := newEndpoint(stack, r.NetProto, waiterQueue) // Register new endpoint so that packets are routed to it. - if err := stack.RegisterTransportEndpoint(r.NICID(), []tcpip.NetworkProtocolNumber{r.NetProto}, ProtocolNumber, id, ep); err != nil { + if err := stack.RegisterTransportEndpoint(r.NICID(), []tcpip.NetworkProtocolNumber{r.NetProto}, ProtocolNumber, id, ep, ep.reusePort); err != nil { ep.Close() return nil, err } @@ -155,7 +156,7 @@ func (e *endpoint) Close() { switch e.state { case stateBound, stateConnected: - e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id) + e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e) e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort) } @@ -449,6 +450,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { break } } + + case tcpip.ReusePortOption: + e.mu.Lock() + e.reusePort = v != 0 + e.mu.Unlock() + return nil } return nil } @@ -513,6 +520,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.mu.Unlock() return nil + case *tcpip.ReusePortOption: + e.mu.RLock() + v := e.reusePort + e.mu.RUnlock() + + *o = 0 + if v { + *o = 1 + } + return nil + case *tcpip.KeepaliveEnabledOption: *o = 0 return nil @@ -648,7 +666,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // Remove the old registration. 
if e.id.LocalPort != 0 { - e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id) + e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e) } e.id = id @@ -711,14 +729,14 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { if e.id.LocalPort == 0 { - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort) if err != nil { return id, err } id.LocalPort = port } - err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e) + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort) if err != nil { e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) } @@ -766,7 +784,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error if commit != nil { if err := commit(); err != nil { // Unregister, the commit failed. 
- e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, ProtocolNumber, id) + e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, ProtocolNumber, id, e) e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) return err } diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 58a346cd9..2a9cf4b57 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -16,6 +16,7 @@ package udp_test import ( "bytes" + "math" "math/rand" "testing" "time" @@ -254,6 +255,90 @@ func newPayload() []byte { return b } +func TestBindPortReuse(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createV6Endpoint(false) + + var eps [5]tcpip.Endpoint + reusePortOpt := tcpip.ReusePortOption(1) + + pollChannel := make(chan tcpip.Endpoint) + for i := 0; i < len(eps); i++ { + // Try to receive the data. + wq := waiter.Queue{} + we, ch := waiter.NewChannelEntry(nil) + wq.EventRegister(&we, waiter.EventIn) + defer wq.EventUnregister(&we) + defer close(ch) + + var err *tcpip.Error + eps[i], err = c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &wq) + if err != nil { + c.t.Fatalf("NewEndpoint failed: %v", err) + } + + go func(ep tcpip.Endpoint) { + for range ch { + pollChannel <- ep + } + }(eps[i]) + + defer eps[i].Close() + if err := eps[i].SetSockOpt(reusePortOpt); err != nil { + c.t.Fatalf("SetSockOpt failed failed: %v", err) + } + if err := eps[i].Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}, nil); err != nil { + t.Fatalf("ep.Bind(...) failed: %v", err) + } + } + + npackets := 100000 + nports := 10000 + ports := make(map[uint16]tcpip.Endpoint) + stats := make(map[tcpip.Endpoint]int) + for i := 0; i < npackets; i++ { + // Send a packet. 
+ port := uint16(i % nports) + payload := newPayload() + c.sendV6Packet(payload, &headers{ + srcPort: testPort + port, + dstPort: stackPort, + }) + + var addr tcpip.FullAddress + ep := <-pollChannel + _, _, err := ep.Read(&addr) + if err != nil { + c.t.Fatalf("Read failed: %v", err) + } + stats[ep]++ + if i < nports { + ports[uint16(i)] = ep + } else { + // Check that all packets from one client are handled + // by the same socket. + if ports[port] != ep { + t.Fatalf("Port mismatch") + } + } + } + + if len(stats) != len(eps) { + t.Fatalf("Only %d(expected %d) sockets received packets", len(stats), len(eps)) + } + + // Check that a packet distribution is fair between sockets. + for _, c := range stats { + n := float64(npackets) / float64(len(eps)) + // The deviation is less than 10%. + if math.Abs(float64(c)-n) > n/10 { + t.Fatal(c, n) + } + } +} + func testV4Read(c *testContext) { // Send a packet. payload := newPayload() diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index ae33d14da..f0e61e083 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -2163,9 +2163,13 @@ cc_binary( ":socket_test_util", "//test/util:file_descriptor", "//test/util:posix_error", + "//test/util:save_util", "//test/util:test_main", "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", "@com_google_googletest//:gtest", ], ) diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index 17a46e149..0893be5a7 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -17,17 +17,24 @@ #include #include +#include +#include #include #include #include #include +#include "gmock/gmock.h" #include "gtest/gtest.h" +#include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/time/time.h" #include "test/syscalls/linux/socket_test_util.h" #include 
"test/util/file_descriptor.h" #include "test/util/posix_error.h" +#include "test/util/save_util.h" #include "test/util/test_util.h" +#include "test/util/thread_util.h" namespace gvisor { namespace testing { @@ -227,6 +234,238 @@ INSTANTIATE_TEST_CASE_P( TestParam{V6Loopback(), V6Loopback()}), DescribeTestParam); +using SocketInetReusePortTest = ::testing::TestWithParam; + +TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { + auto const& param = GetParam(); + + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + sockaddr_storage listen_addr = listener.addr; + sockaddr_storage conn_addr = connector.addr; + constexpr int kThreadCount = 3; + + // Create the listening socket. + FileDescriptor listener_fds[kThreadCount]; + for (int i = 0; i < kThreadCount; i++) { + listener_fds[i] = ASSERT_NO_ERRNO_AND_VALUE( + Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)); + int fd = listener_fds[i].get(); + + ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + ASSERT_THAT( + bind(fd, reinterpret_cast(&listen_addr), listener.addr_len), + SyscallSucceeds()); + ASSERT_THAT(listen(fd, 40), SyscallSucceeds()); + + // On the first bind we need to determine which port was bound. + if (i != 0) continue; + + // Get the port bound by the listening socket. 
+ socklen_t addrlen = listener.addr_len; + ASSERT_THAT( + getsockname(listener_fds[0].get(), + reinterpret_cast(&listen_addr), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port)); + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + } + + constexpr int kConnectAttempts = 10000; + std::atomic connects_received = ATOMIC_VAR_INIT(0); + std::unique_ptr listen_thread[kThreadCount]; + int accept_counts[kThreadCount] = {}; + // TODO: figure how to not disable S/R for the whole test. + // We need to take into account that this test executes a lot of system + // calls from many threads. + DisableSave ds; + + for (int i = 0; i < kThreadCount; i++) { + listen_thread[i] = absl::make_unique( + [&listener_fds, &accept_counts, i, &connects_received]() { + do { + auto fd = Accept(listener_fds[i].get(), nullptr, nullptr); + if (!fd.ok()) { + if (connects_received >= kConnectAttempts) { + // Another thread have shutdown our read side causing the + // accept to fail. + break; + } + ASSERT_NO_ERRNO(fd); + break; + } + // Receive some data from a socket to be sure that the connect() + // system call has been completed on another side. + int data; + EXPECT_THAT( + RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0), + SyscallSucceedsWithValue(sizeof(data))); + accept_counts[i]++; + } while (++connects_received < kConnectAttempts); + + // Shutdown all sockets to wake up other threads. 
+ for (int j = 0; j < kThreadCount; j++) { + shutdown(listener_fds[j].get(), SHUT_RDWR); + } + }); + } + + ScopedThread connecting_thread([&connector, &conn_addr]() { + for (int i = 0; i < kConnectAttempts; i++) { + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP)); + ASSERT_THAT( + RetryEINTR(connect)(fd.get(), reinterpret_cast(&conn_addr), + connector.addr_len), + SyscallSucceeds()); + + EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0), + SyscallSucceedsWithValue(sizeof(i))); + } + }); + + // Join threads to be sure that all connections have been counted + connecting_thread.Join(); + for (int i = 0; i < kThreadCount; i++) { + listen_thread[i]->Join(); + } + // Check that connections are distributed fairly between listening sockets + for (int i = 0; i < kThreadCount; i++) + EXPECT_THAT(accept_counts[i], + EquivalentWithin((kConnectAttempts / kThreadCount), 0.10)); +} + +TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) { + auto const& param = GetParam(); + + TestAddress const& listener = param.listener; + TestAddress const& connector = param.connector; + sockaddr_storage listen_addr = listener.addr; + sockaddr_storage conn_addr = connector.addr; + constexpr int kThreadCount = 3; + + // Create the listening socket. + FileDescriptor listener_fds[kThreadCount]; + for (int i = 0; i < kThreadCount; i++) { + listener_fds[i] = + ASSERT_NO_ERRNO_AND_VALUE(Socket(listener.family(), SOCK_DGRAM, 0)); + int fd = listener_fds[i].get(); + + ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceeds()); + ASSERT_THAT( + bind(fd, reinterpret_cast(&listen_addr), listener.addr_len), + SyscallSucceeds()); + + // On the first bind we need to determine which port was bound. + if (i != 0) continue; + + // Get the port bound by the listening socket. 
+ socklen_t addrlen = listener.addr_len; + ASSERT_THAT( + getsockname(listener_fds[0].get(), + reinterpret_cast(&listen_addr), &addrlen), + SyscallSucceeds()); + uint16_t const port = + ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr)); + ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port)); + ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port)); + } + + constexpr int kConnectAttempts = 10000; + std::atomic packets_received = ATOMIC_VAR_INIT(0); + std::unique_ptr receiver_thread[kThreadCount]; + int packets_per_socket[kThreadCount] = {}; + // TODO: figure how to not disable S/R for the whole test. + DisableSave ds; // Too expensive. + + for (int i = 0; i < kThreadCount; i++) { + receiver_thread[i] = absl::make_unique( + [&listener_fds, &packets_per_socket, i, &packets_received]() { + do { + struct sockaddr_storage addr = {}; + socklen_t addrlen = sizeof(addr); + int data; + + auto ret = RetryEINTR(recvfrom)( + listener_fds[i].get(), &data, sizeof(data), 0, + reinterpret_cast(&addr), &addrlen); + + if (packets_received < kConnectAttempts) { + ASSERT_THAT(ret, SyscallSucceedsWithValue(sizeof(data))); + } + + if (ret != sizeof(data)) { + // Another thread may have shutdown our read side causing the + // recvfrom to fail. + break; + } + + packets_received++; + packets_per_socket[i]++; + + // A response is required to synchronize with the main thread, + // otherwise the main thread can send more than can fit into receive + // queues. + EXPECT_THAT(RetryEINTR(sendto)( + listener_fds[i].get(), &data, sizeof(data), 0, + reinterpret_cast(&addr), addrlen), + SyscallSucceedsWithValue(sizeof(data))); + } while (packets_received < kConnectAttempts); + + // Shutdown all sockets to wake up other threads. 
+ for (int j = 0; j < kThreadCount; j++) + shutdown(listener_fds[j].get(), SHUT_RDWR); + }); + } + + ScopedThread main_thread([&connector, &conn_addr]() { + for (int i = 0; i < kConnectAttempts; i++) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Socket(connector.family(), SOCK_DGRAM, 0)); + EXPECT_THAT(RetryEINTR(sendto)(fd.get(), &i, sizeof(i), 0, + reinterpret_cast(&conn_addr), + connector.addr_len), + SyscallSucceedsWithValue(sizeof(i))); + int data; + EXPECT_THAT(RetryEINTR(recv)(fd.get(), &data, sizeof(data), 0), + SyscallSucceedsWithValue(sizeof(data))); + } + }); + + main_thread.Join(); + + // Join threads to be sure that all connections have been counted + for (int i = 0; i < kThreadCount; i++) { + receiver_thread[i]->Join(); + } + // Check that packets are distributed fairly between listening sockets. + for (int i = 0; i < kThreadCount; i++) + EXPECT_THAT(packets_per_socket[i], + EquivalentWithin((kConnectAttempts / kThreadCount), 0.10)); +} + +INSTANTIATE_TEST_CASE_P( + All, SocketInetReusePortTest, + ::testing::Values( + // Listeners bound to IPv4 addresses refuse connections using IPv6 + // addresses. + TestParam{V4Any(), V4Loopback()}, + TestParam{V4Loopback(), V4MappedLoopback()}, + + // Listeners bound to IN6ADDR_ANY accept all connections. + TestParam{V6Any(), V4Loopback()}, TestParam{V6Any(), V6Loopback()}, + + // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4 + // addresses. 
+ TestParam{V6Loopback(), V6Loopback()}), + DescribeTestParam); + struct ProtocolTestParam { std::string description; int type; @@ -806,6 +1045,56 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) { } } +TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) { + auto const& param = GetParam(); + TestAddress const& test_addr = V4Loopback(); + sockaddr_storage addr = test_addr.addr; + + for (int i = 0; i < 2; i++) { + const int portreuse1 = i % 2; + auto s1 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + int fd1 = s1.get(); + socklen_t addrlen = test_addr.addr_len; + + EXPECT_THAT( + setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &portreuse1, sizeof(int)), + SyscallSucceeds()); + + ASSERT_THAT(bind(fd1, reinterpret_cast(&addr), addrlen), + SyscallSucceeds()); + + ASSERT_THAT(getsockname(fd1, reinterpret_cast(&addr), &addrlen), + SyscallSucceeds()); + if (param.type == SOCK_STREAM) { + ASSERT_THAT(listen(fd1, 1), SyscallSucceeds()); + } + + // j is less than 4 to check that the port reuse logic works correctly after + // closing bound sockets. + for (int j = 0; j < 4; j++) { + const int portreuse2 = j % 2; + auto s2 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0)); + int fd2 = s2.get(); + + EXPECT_THAT( + setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &portreuse2, sizeof(int)), + SyscallSucceeds()); + + LOG(INFO) << portreuse1 << " " << portreuse2; + int ret = bind(fd2, reinterpret_cast(&addr), addrlen); + + // Verify that two sockets can be bound to the same port only if + // SO_REUSEPORT is set for both of them. 
+ if (!portreuse1 || !portreuse2) + ASSERT_THAT(ret, SyscallFailsWithErrno(EADDRINUSE)); + else + ASSERT_THAT(ret, SyscallSucceeds()); + } + } +} + INSTANTIATE_TEST_CASE_P(AllFamlies, SocketMultiProtocolInetLoopbackTest, ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM}, ProtocolTestParam{"UDP", SOCK_DGRAM}), diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go index 9ee0361ee..ec048f10f 100644 --- a/test/syscalls/syscall_test_runner.go +++ b/test/syscalls/syscall_test_runner.go @@ -118,6 +118,7 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { // Mark the root as writeable, as some tests attempt to // write to the rootfs, and expect EACCES, not EROFS. spec.Root.Readonly = false + spec.Mounts = nil // Set environment variable that indicates we are // running in gVisor and with the given platform. -- cgit v1.2.3 From 8e586db16274c2563fb13c95bafa9e20ea3d73ce Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 2 Jan 2019 11:38:51 -0800 Subject: Add /proc/net/psched content FIO reads this file and expects it to be well formed. 
PiperOrigin-RevId: 227554483 Change-Id: Ia48ae2377626dd6a2daf17b5b4f5119f90ece55b --- pkg/sentry/fs/proc/net.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 3ff60aa5b..2806d6035 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -16,6 +16,7 @@ package proc import ( "fmt" + "time" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -42,7 +43,12 @@ func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { d.AddChild(ctx, "netstat", p.newStubProcFSFile(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue 
TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess"))) d.AddChild(ctx, "packet", p.newStubProcFSFile(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode"))) d.AddChild(ctx, "protocols", p.newStubProcFSFile(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em"))) - d.AddChild(ctx, "psched", p.newStubProcFSFile(ctx, msrc, []byte(""))) + + // Linux sets these values to: nsec per usec, psched tick in ns, 1000000, + // high res timer ticks per sec (ClockGetres returns 1ns resolution). + psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) + d.AddChild(ctx, "psched", p.newStubProcFSFile(ctx, msrc, []byte(psched))) + d.AddChild(ctx, "ptype", p.newStubProcFSFile(ctx, msrc, []byte("Type Device Function"))) d.AddChild(ctx, "route", p.newStubProcFSFile(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT"))) d.AddChild(ctx, "tcp", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) -- cgit v1.2.3 From 901ed5da44f1ef28c67ce942eef978342a5f8766 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 7 Jan 2019 15:16:37 -0800 Subject: Implement /proc/[pid]/smaps. 
PiperOrigin-RevId: 228245523 Change-Id: I5a4d0a6570b93958e51437e917e5331d83e23a7e --- pkg/sentry/fs/proc/task.go | 43 +++- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/mm/lifecycle.go | 2 +- pkg/sentry/mm/mm.go | 4 - pkg/sentry/mm/proc_pid_maps.go | 121 --------- pkg/sentry/mm/procfs.go | 289 +++++++++++++++++++++ pkg/sentry/mm/syscalls.go | 4 +- pkg/sentry/mm/vma.go | 17 +- test/syscalls/linux/BUILD | 22 ++ test/syscalls/linux/proc_pid_smaps.cc | 467 ++++++++++++++++++++++++++++++++++ 10 files changed, 840 insertions(+), 131 deletions(-) delete mode 100644 pkg/sentry/mm/proc_pid_maps.go create mode 100644 pkg/sentry/mm/procfs.go create mode 100644 test/syscalls/linux/proc_pid_smaps.cc (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 9f13ff91c..91bda8a95 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -82,6 +82,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), "ns": newNamespaceDir(t, msrc), + "smaps": newSmaps(t, msrc), "stat": newTaskStat(t, msrc, showSubtasks, pidns), "statm": newStatm(t, msrc), "status": newStatus(t, msrc, pidns), @@ -316,7 +317,47 @@ func (md *mapsData) NeedsUpdate(generation int64) bool { // ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. func (md *mapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { if mm := md.mm(); mm != nil { - return mm.ReadSeqFileData(ctx, h) + return mm.ReadMapsSeqFileData(ctx, h) + } + return []seqfile.SeqData{}, 0 +} + +// smapsData implements seqfile.SeqSource for /proc/[pid]/smaps. 
+// +// +stateify savable +type smapsData struct { + t *kernel.Task +} + +func newSmaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &smapsData{t}), msrc, fs.SpecialFile, t) +} + +func (sd *smapsData) mm() *mm.MemoryManager { + var tmm *mm.MemoryManager + sd.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + // No additional reference is taken on mm here. This is safe + // because MemoryManager.destroy is required to leave the + // MemoryManager in a state where it's still usable as a SeqSource. + tmm = mm + } + }) + return tmm +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (sd *smapsData) NeedsUpdate(generation int64) bool { + if mm := sd.mm(); mm != nil { + return mm.NeedsUpdate(generation) + } + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (sd *smapsData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if mm := sd.mm(); mm != nil { + return mm.ReadSmapsSeqFileData(ctx, h) } return []seqfile.SeqData{}, 0 } diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 5a9185e5d..0997ec0a7 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -87,7 +87,7 @@ go_library( "mm.go", "pma.go", "pma_set.go", - "proc_pid_maps.go", + "procfs.go", "save_restore.go", "shm.go", "special_mappable.go", diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a42e32b43..1ee8ae74e 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -86,7 +86,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. 
if vma.mappable != nil { - if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.isMappableAsWritable()); err != nil { + if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil { mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange()) return nil, err } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index c0632d232..2154e7918 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -363,10 +363,6 @@ func (v *vma) loadRealPerms(b int) { } } -func (v *vma) isMappableAsWritable() bool { - return !v.private && v.maxPerms.Write -} - // pma represents a platform mapping area. // // +stateify savable diff --git a/pkg/sentry/mm/proc_pid_maps.go b/pkg/sentry/mm/proc_pid_maps.go deleted file mode 100644 index 247ee45ef..000000000 --- a/pkg/sentry/mm/proc_pid_maps.go +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package mm - -import ( - "bytes" - "fmt" - "strings" - - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -const ( - // devMinorBits is the number of minor bits in a device number. Linux: - // include/linux/kdev_t.h:MINORBITS - devMinorBits = 20 -) - -// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. 
-func (mm *MemoryManager) NeedsUpdate(generation int64) bool { - return true -} - -// ReadSeqFileData is called by fs/proc.mapsData.ReadSeqFileData. -func (mm *MemoryManager) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { - mm.mappingMu.RLock() - defer mm.mappingMu.RUnlock() - var data []seqfile.SeqData - var start usermem.Addr - if handle != nil { - start = *handle.(*usermem.Addr) - } - for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME: If we use a usermem.Addr for the handle, we get - // "panic: autosave error: type usermem.Addr is not registered". - vmaAddr := vseg.End() - data = append(data, seqfile.SeqData{ - Buf: mm.vmaMapsEntryLocked(ctx, vseg), - Handle: &vmaAddr, - }) - } - - // We always emulate vsyscall, so advertise it here. Everything about a - // vsyscall region is static, so just hard code the maps entry since we - // don't have a real vma backing it. The vsyscall region is at the end of - // the virtual address space so nothing should be mapped after it (if - // something is really mapped in the tiny ~10 MiB segment afterwards, we'll - // get the sorting on the maps file wrong at worst; but that's not possible - // on any current platform). - // - // Artifically adjust the seqfile handle so we only output vsyscall entry once. - if vsyscallEnd := usermem.Addr(0xffffffffff601000); start != vsyscallEnd { - data = append(data, seqfile.SeqData{ - Buf: []byte("ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"), - Handle: &vsyscallEnd, - }) - } - return data, 1 -} - -// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by -// vseg, including the trailing newline. -// -// Preconditions: mm.mappingMu must be locked. 
-func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { - vma := vseg.ValuePtr() - private := "p" - if !vma.private { - private = "s" - } - - var dev, ino uint64 - if vma.id != nil { - dev = vma.id.DeviceID() - ino = vma.id.InodeID() - } - devMajor := uint32(dev >> devMinorBits) - devMinor := uint32(dev & ((1 << devMinorBits) - 1)) - - var b bytes.Buffer - // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => - // stack_guard_page_start(). - fmt.Fprintf(&b, "%08x-%08x %s%s %08x %02x:%02x %d ", - vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) - - // Figure out our filename or hint. - var s string - if vma.hint != "" { - s = vma.hint - } else if vma.id != nil { - // FIXME: We are holding mm.mappingMu here, which is - // consistent with Linux's holding mmap_sem in - // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). - // However, it's not clear that fs.File.MappedName() is actually - // consistent with this lock order. - s = vma.id.MappedName(ctx) - } - if s != "" { - // Per linux, we pad until the 74th character. - if pad := 73 - b.Len(); pad > 0 { - b.WriteString(strings.Repeat(" ", pad)) - } - b.WriteString(s) - } - b.WriteString("\n") - return b.Bytes() -} diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go new file mode 100644 index 000000000..0c4b8895d --- /dev/null +++ b/pkg/sentry/mm/procfs.go @@ -0,0 +1,289 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // devMinorBits is the number of minor bits in a device number. Linux: + // include/linux/kdev_t.h:MINORBITS + devMinorBits = 20 + + vsyscallEnd = usermem.Addr(0xffffffffff601000) + vsyscallMapsEntry = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" + vsyscallSmapsEntry = vsyscallMapsEntry + + "Size: 4 kB\n" + + "Rss: 0 kB\n" + + "Pss: 0 kB\n" + + "Shared_Clean: 0 kB\n" + + "Shared_Dirty: 0 kB\n" + + "Private_Clean: 0 kB\n" + + "Private_Dirty: 0 kB\n" + + "Referenced: 0 kB\n" + + "Anonymous: 0 kB\n" + + "AnonHugePages: 0 kB\n" + + "Shared_Hugetlb: 0 kB\n" + + "Private_Hugetlb: 0 kB\n" + + "Swap: 0 kB\n" + + "SwapPss: 0 kB\n" + + "KernelPageSize: 4 kB\n" + + "MMUPageSize: 4 kB\n" + + "Locked: 0 kB\n" + + "VmFlags: rd ex \n" +) + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (mm *MemoryManager) NeedsUpdate(generation int64) bool { + return true +} + +// ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to +// implement /proc/[pid]/maps. +func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var data []seqfile.SeqData + var start usermem.Addr + if handle != nil { + start = *handle.(*usermem.Addr) + } + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + // FIXME: If we use a usermem.Addr for the handle, we get + // "panic: autosave error: type usermem.Addr is not registered". 
+ vmaAddr := vseg.End() + data = append(data, seqfile.SeqData{ + Buf: mm.vmaMapsEntryLocked(ctx, vseg), + Handle: &vmaAddr, + }) + } + + // We always emulate vsyscall, so advertise it here. Everything about a + // vsyscall region is static, so just hard code the maps entry since we + // don't have a real vma backing it. The vsyscall region is at the end of + // the virtual address space so nothing should be mapped after it (if + // something is really mapped in the tiny ~10 MiB segment afterwards, we'll + // get the sorting on the maps file wrong at worst; but that's not possible + // on any current platform). + // + // Artifically adjust the seqfile handle so we only output vsyscall entry once. + if start != vsyscallEnd { + // FIXME: Can't get a pointer to constant vsyscallEnd. + vmaAddr := vsyscallEnd + data = append(data, seqfile.SeqData{ + Buf: []byte(vsyscallMapsEntry), + Handle: &vmaAddr, + }) + } + return data, 1 +} + +// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by +// vseg, including the trailing newline. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { + var b bytes.Buffer + mm.appendVMAMapsEntryLocked(ctx, vseg, &b) + return b.Bytes() +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) { + vma := vseg.ValuePtr() + private := "p" + if !vma.private { + private = "s" + } + + var dev, ino uint64 + if vma.id != nil { + dev = vma.id.DeviceID() + ino = vma.id.InodeID() + } + devMajor := uint32(dev >> devMinorBits) + devMinor := uint32(dev & ((1 << devMinorBits) - 1)) + + // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => + // stack_guard_page_start(). 
+ fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ", + vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) + + // Figure out our filename or hint. + var s string + if vma.hint != "" { + s = vma.hint + } else if vma.id != nil { + // FIXME: We are holding mm.mappingMu here, which is + // consistent with Linux's holding mmap_sem in + // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). + // However, it's not clear that fs.File.MappedName() is actually + // consistent with this lock order. + s = vma.id.MappedName(ctx) + } + if s != "" { + // Per linux, we pad until the 74th character. + if pad := 73 - b.Len(); pad > 0 { + b.WriteString(strings.Repeat(" ", pad)) + } + b.WriteString(s) + } + b.WriteString("\n") +} + +// ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to +// implement /proc/[pid]/smaps. +func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + var data []seqfile.SeqData + var start usermem.Addr + if handle != nil { + start = *handle.(*usermem.Addr) + } + for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { + // FIXME: If we use a usermem.Addr for the handle, we get + // "panic: autosave error: type usermem.Addr is not registered". + vmaAddr := vseg.End() + data = append(data, seqfile.SeqData{ + Buf: mm.vmaSmapsEntryLocked(ctx, vseg), + Handle: &vmaAddr, + }) + } + + // We always emulate vsyscall, so advertise it here. See + // ReadMapsSeqFileData for additional commentary. + if start != vsyscallEnd { + // FIXME: Can't get a pointer to constant vsyscallEnd. + vmaAddr := vsyscallEnd + data = append(data, seqfile.SeqData{ + Buf: []byte(vsyscallSmapsEntry), + Handle: &vmaAddr, + }) + } + return data, 1 +} + +// vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated +// by vseg, including the trailing newline. 
+// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { + var b bytes.Buffer + mm.appendVMAMapsEntryLocked(ctx, vseg, &b) + vma := vseg.ValuePtr() + + // We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of + // requiring it to be locked as a precondition, to reduce the latency + // impact of reading /proc/[pid]/smaps on concurrent performance-sensitive + // operations requiring activeMu for writing like faults. + mm.activeMu.RLock() + var rss uint64 + var anon uint64 + vsegAR := vseg.Range() + for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() { + psegAR := pseg.Range().Intersect(vsegAR) + size := uint64(psegAR.Length()) + rss += size + if pseg.ValuePtr().private { + anon += size + } + } + mm.activeMu.RUnlock() + + fmt.Fprintf(&b, "Size: %8d kB\n", vseg.Range().Length()/1024) + fmt.Fprintf(&b, "Rss: %8d kB\n", rss/1024) + // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma + // is only mapped by that pma. This avoids having to query memmap.Mappables + // for reference count information on each page. As a corollary, all pages + // are accounted as "private" whether or not the vma is private; compare + // Linux's fs/proc/task_mmu.c:smaps_account(). + fmt.Fprintf(&b, "Pss: %8d kB\n", rss/1024) + fmt.Fprintf(&b, "Shared_Clean: %8d kB\n", 0) + fmt.Fprintf(&b, "Shared_Dirty: %8d kB\n", 0) + // Pretend that all pages are dirty if the vma is writable, and clean otherwise. + clean := rss + if vma.effectivePerms.Write { + clean = 0 + } + fmt.Fprintf(&b, "Private_Clean: %8d kB\n", clean/1024) + fmt.Fprintf(&b, "Private_Dirty: %8d kB\n", (rss-clean)/1024) + // Pretend that all pages are "referenced" (recently touched). + fmt.Fprintf(&b, "Referenced: %8d kB\n", rss/1024) + fmt.Fprintf(&b, "Anonymous: %8d kB\n", anon/1024) + // Hugepages (hugetlb and THP) are not implemented. 
+ fmt.Fprintf(&b, "AnonHugePages: %8d kB\n", 0) + fmt.Fprintf(&b, "Shared_Hugetlb: %8d kB\n", 0) + fmt.Fprintf(&b, "Private_Hugetlb: %7d kB\n", 0) + // Swap is not implemented. + fmt.Fprintf(&b, "Swap: %8d kB\n", 0) + fmt.Fprintf(&b, "SwapPss: %8d kB\n", 0) + fmt.Fprintf(&b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024) + fmt.Fprintf(&b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024) + locked := rss + if vma.mlockMode == memmap.MLockNone { + locked = 0 + } + fmt.Fprintf(&b, "Locked: %8d kB\n", locked/1024) + + b.WriteString("VmFlags: ") + if vma.realPerms.Read { + b.WriteString("rd ") + } + if vma.realPerms.Write { + b.WriteString("wr ") + } + if vma.realPerms.Execute { + b.WriteString("ex ") + } + if vma.canWriteMappableLocked() { // VM_SHARED + b.WriteString("sh ") + } + if vma.maxPerms.Read { + b.WriteString("mr ") + } + if vma.maxPerms.Write { + b.WriteString("mw ") + } + if vma.maxPerms.Execute { + b.WriteString("me ") + } + if !vma.private { // VM_MAYSHARE + b.WriteString("ms ") + } + if vma.growsDown { + b.WriteString("gd ") + } + if vma.mlockMode != memmap.MLockNone { // VM_LOCKED + b.WriteString("lo ") + } + if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT + b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags() + } + if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT + b.WriteString("ac ") + } + b.WriteString("\n") + + return b.Bytes() +} diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 383703ec3..fd6929e08 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -507,7 +507,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi return 0, syserror.EINVAL } // Inform the Mappable, if any, of the new mapping. 
- if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.isMappableAsWritable()); err != nil { + if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil { return 0, err } } @@ -571,7 +571,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // Now that pmas have been moved to newAR, we can notify vma.mappable that // oldAR is no longer mapped. if vma.mappable != nil { - vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) + vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked()) } if vma.mlockMode == memmap.MLockEager { diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 28ba9f2f5..e9c9a80ea 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -84,6 +84,8 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp // Inform the Mappable, if any, of the new mapping. if opts.Mappable != nil { + // The expression for writable is vma.canWriteMappableLocked(), but we + // don't yet have a vma. if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil { return vmaIterator{}, usermem.AddrRange{}, err } @@ -366,7 +368,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vmaAR := vseg.Range() vma := vseg.ValuePtr() if vma.mappable != nil { - vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.isMappableAsWritable()) + vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { vma.id.DecRef() @@ -381,6 +383,19 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa return vgap } +// canWriteMappableLocked returns true if it is possible for vma.mappable to be +// written to via this vma, i.e. 
if it is possible that +// vma.mappable.Translate(at.Write=true) may be called as a result of this vma. +// This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as +// PTRACE_POKEDATA. +// +// canWriteMappableLocked is equivalent to Linux's VM_SHARED. +// +// Preconditions: mm.mappingMu must be locked. +func (vma *vma) canWriteMappableLocked() bool { + return !vma.private && vma.maxPerms.Write +} + // vmaSetFunctions implements segment.Functions for vmaSet. type vmaSetFunctions struct{} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index f0e61e083..028c686a8 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1413,6 +1413,28 @@ cc_binary( ], ) +cc_binary( + name = "proc_pid_smaps_test", + testonly = 1, + srcs = ["proc_pid_smaps.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:memory_util", + "//test/util:posix_error", + "//test/util:proc_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "pselect_test", testonly = 1, diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc new file mode 100644 index 000000000..4aefc1b41 --- /dev/null +++ b/test/syscalls/linux/proc_pid_smaps.cc @@ -0,0 +1,467 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/proc_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::Contains;
+using ::testing::ElementsAreArray;
+using ::testing::IsSupersetOf;
+using ::testing::Not;
+using ::testing::Optional;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+struct ProcPidSmapsEntry {
+  ProcMapsEntry maps_entry;
+
+  // These fields should always exist, as they were included in e070ad49f311
+  // "[PATCH] add /proc/pid/smaps".
+  size_t size_kb;
+  size_t rss_kb;
+  size_t shared_clean_kb;
+  size_t shared_dirty_kb;
+  size_t private_clean_kb;
+  size_t private_dirty_kb;
+
+  // These fields were added later and may not be present.
+  absl::optional<size_t> pss_kb;
+  absl::optional<size_t> referenced_kb;
+  absl::optional<size_t> anonymous_kb;
+  absl::optional<size_t> anon_huge_pages_kb;
+  absl::optional<size_t> shared_hugetlb_kb;
+  absl::optional<size_t> private_hugetlb_kb;
+  absl::optional<size_t> swap_kb;
+  absl::optional<size_t> swap_pss_kb;
+  absl::optional<size_t> kernel_page_size_kb;
+  absl::optional<size_t> mmu_page_size_kb;
+  absl::optional<size_t> locked_kb;
+
+  // Caution: "Note that there is no guarantee that every flag and associated
+  // mnemonic will be present in all further kernel releases. Things get
+  // changed, the flags may be vanished or the reverse -- new added." - Linux
+  // Documentation/filesystems/proc.txt, on VmFlags. Avoid checking for any
+  // flags that are not extremely well-established.
+  absl::optional<std::vector<std::string>> vm_flags;
+};
+
+// Given the value part of a /proc/[pid]/smaps field containing a value in kB
+// (for example, " 4 kB", returns the value in kB (in this example, 4).
+PosixErrorOr<size_t> SmapsValueKb(absl::string_view value) {
+  // TODO: let us use RE2 or <regex>
+  std::pair<absl::string_view, absl::string_view> parts =
+      absl::StrSplit(value, ' ', absl::SkipEmpty());
+  if (parts.second != "kB") {
+    return PosixError(EINVAL,
+                      absl::StrCat("invalid smaps field value: ", value));
+  }
+  ASSIGN_OR_RETURN_ERRNO(auto val_kb, Atoi<size_t>(parts.first));
+  return val_kb;
+}
+
+PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
+    absl::string_view contents) {
+  std::vector<ProcPidSmapsEntry> entries;
+  absl::optional<ProcPidSmapsEntry> entry;
+  bool have_size_kb = false;
+  bool have_rss_kb = false;
+  bool have_shared_clean_kb = false;
+  bool have_shared_dirty_kb = false;
+  bool have_private_clean_kb = false;
+  bool have_private_dirty_kb = false;
+
+  auto const finish_entry = [&] {
+    if (entry) {
+      if (!have_size_kb) {
+        return PosixError(EINVAL, "smaps entry is missing Size");
+      }
+      if (!have_rss_kb) {
+        return PosixError(EINVAL, "smaps entry is missing Rss");
+      }
+      if (!have_shared_clean_kb) {
+        return PosixError(EINVAL, "smaps entry is missing Shared_Clean");
+      }
+      if (!have_shared_dirty_kb) {
+        return PosixError(EINVAL, "smaps entry is missing Shared_Dirty");
+      }
+      if (!have_private_clean_kb) {
+        return PosixError(EINVAL, "smaps entry is missing Private_Clean");
+      }
+      if (!have_private_dirty_kb) {
+        return PosixError(EINVAL, "smaps entry is missing Private_Dirty");
+      }
+      // std::move(entry.value()) instead of std::move(entry).value(), because
+      // otherwise tools may report a "use-after-move" warning, which is
+      // spurious because entry.emplace() below resets entry to a new
+      // ProcPidSmapsEntry.
+      entries.emplace_back(std::move(entry.value()));
+    }
+    entry.emplace();
+    have_size_kb = false;
+    have_rss_kb = false;
+    have_shared_clean_kb = false;
+    have_shared_dirty_kb = false;
+    have_private_clean_kb = false;
+    have_private_dirty_kb = false;
+    return NoError();
+  };
+
+  // Holds key/value pairs from smaps field lines. Declared here so it can be
+  // captured by reference by the following lambdas.
+  std::vector<absl::string_view> key_value;
+
+  auto const on_required_field_kb = [&](size_t* field, bool* have_field) {
+    if (*have_field) {
+      return PosixError(
+          EINVAL,
+          absl::StrFormat("smaps entry has duplicate %s line", key_value[0]));
+    }
+    ASSIGN_OR_RETURN_ERRNO(*field, SmapsValueKb(key_value[1]));
+    *have_field = true;
+    return NoError();
+  };
+
+  auto const on_optional_field_kb = [&](absl::optional<size_t>* field) {
+    if (*field) {
+      return PosixError(
+          EINVAL,
+          absl::StrFormat("smaps entry has duplicate %s line", key_value[0]));
+    }
+    ASSIGN_OR_RETURN_ERRNO(*field, SmapsValueKb(key_value[1]));
+    return NoError();
+  };
+
+  absl::flat_hash_set<std::string> unknown_fields;
+  auto const on_unknown_field = [&] {
+    absl::string_view key = key_value[0];
+    // Don't mention unknown fields more than once.
+    if (unknown_fields.count(key)) {
+      return;
+    }
+    unknown_fields.insert(std::string(key));
+    LOG(INFO) << "skipping unknown smaps field " << key;
+  };
+
+  auto lines = absl::StrSplit(contents, '\n', absl::SkipEmpty());
+  for (absl::string_view l : lines) {
+    // Is this line a valid /proc/[pid]/maps entry?
+ auto maybe_maps_entry = ParseProcMapsLine(l); + if (maybe_maps_entry.ok()) { + // This marks the beginning of a new /proc/[pid]/smaps entry. + RETURN_IF_ERRNO(finish_entry()); + entry->maps_entry = std::move(maybe_maps_entry).ValueOrDie(); + continue; + } + // Otherwise it's a field in an existing /proc/[pid]/smaps entry of the form + // "key:value" (where value in practice will be preceded by a variable + // amount of whitespace). + if (!entry) { + LOG(WARNING) << "smaps line not considered a maps line: " + << maybe_maps_entry.error_message(); + return PosixError( + EINVAL, + absl::StrCat("smaps field line without preceding maps line: ", l)); + } + key_value = absl::StrSplit(l, absl::MaxSplits(':', 1)); + if (key_value.size() != 2) { + return PosixError(EINVAL, absl::StrCat("invalid smaps field line: ", l)); + } + absl::string_view const key = key_value[0]; + if (key == "Size") { + RETURN_IF_ERRNO(on_required_field_kb(&entry->size_kb, &have_size_kb)); + } else if (key == "Rss") { + RETURN_IF_ERRNO(on_required_field_kb(&entry->rss_kb, &have_rss_kb)); + } else if (key == "Shared_Clean") { + RETURN_IF_ERRNO( + on_required_field_kb(&entry->shared_clean_kb, &have_shared_clean_kb)); + } else if (key == "Shared_Dirty") { + RETURN_IF_ERRNO( + on_required_field_kb(&entry->shared_dirty_kb, &have_shared_dirty_kb)); + } else if (key == "Private_Clean") { + RETURN_IF_ERRNO(on_required_field_kb(&entry->private_clean_kb, + &have_private_clean_kb)); + } else if (key == "Private_Dirty") { + RETURN_IF_ERRNO(on_required_field_kb(&entry->private_dirty_kb, + &have_private_dirty_kb)); + } else if (key == "Pss") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->pss_kb)); + } else if (key == "Referenced") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->referenced_kb)); + } else if (key == "Anonymous") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->anonymous_kb)); + } else if (key == "AnonHugePages") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->anon_huge_pages_kb)); + } else if 
(key == "Shared_Hugetlb") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->shared_hugetlb_kb)); + } else if (key == "Private_Hugetlb") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->private_hugetlb_kb)); + } else if (key == "Swap") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->swap_kb)); + } else if (key == "SwapPss") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->swap_pss_kb)); + } else if (key == "KernelPageSize") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->kernel_page_size_kb)); + } else if (key == "MMUPageSize") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->mmu_page_size_kb)); + } else if (key == "Locked") { + RETURN_IF_ERRNO(on_optional_field_kb(&entry->locked_kb)); + } else if (key == "VmFlags") { + if (entry->vm_flags) { + return PosixError(EINVAL, "duplicate VmFlags line"); + } + entry->vm_flags = absl::StrSplit(key_value[1], ' ', absl::SkipEmpty()); + } else { + on_unknown_field(); + } + } + RETURN_IF_ERRNO(finish_entry()); + return entries; +}; + +TEST(ParseProcPidSmapsTest, Correctness) { + auto entries = ASSERT_NO_ERRNO_AND_VALUE( + ParseProcPidSmaps("0-10000 rw-s 00000000 00:00 0 " + " /dev/zero (deleted)\n" + "Size: 0 kB\n" + "Rss: 1 kB\n" + "Pss: 2 kB\n" + "Shared_Clean: 3 kB\n" + "Shared_Dirty: 4 kB\n" + "Private_Clean: 5 kB\n" + "Private_Dirty: 6 kB\n" + "Referenced: 7 kB\n" + "Anonymous: 8 kB\n" + "AnonHugePages: 9 kB\n" + "Shared_Hugetlb: 10 kB\n" + "Private_Hugetlb: 11 kB\n" + "Swap: 12 kB\n" + "SwapPss: 13 kB\n" + "KernelPageSize: 14 kB\n" + "MMUPageSize: 15 kB\n" + "Locked: 16 kB\n" + "FutureUnknownKey: 17 kB\n" + "VmFlags: rd wr sh mr mw me ms lo ?? 
sd \n")); + ASSERT_EQ(entries.size(), 1); + auto& entry = entries[0]; + EXPECT_EQ(entry.maps_entry.filename, "/dev/zero (deleted)"); + EXPECT_EQ(entry.size_kb, 0); + EXPECT_EQ(entry.rss_kb, 1); + EXPECT_THAT(entry.pss_kb, Optional(2)); + EXPECT_EQ(entry.shared_clean_kb, 3); + EXPECT_EQ(entry.shared_dirty_kb, 4); + EXPECT_EQ(entry.private_clean_kb, 5); + EXPECT_EQ(entry.private_dirty_kb, 6); + EXPECT_THAT(entry.referenced_kb, Optional(7)); + EXPECT_THAT(entry.anonymous_kb, Optional(8)); + EXPECT_THAT(entry.anon_huge_pages_kb, Optional(9)); + EXPECT_THAT(entry.shared_hugetlb_kb, Optional(10)); + EXPECT_THAT(entry.private_hugetlb_kb, Optional(11)); + EXPECT_THAT(entry.swap_kb, Optional(12)); + EXPECT_THAT(entry.swap_pss_kb, Optional(13)); + EXPECT_THAT(entry.kernel_page_size_kb, Optional(14)); + EXPECT_THAT(entry.mmu_page_size_kb, Optional(15)); + EXPECT_THAT(entry.locked_kb, Optional(16)); + EXPECT_THAT(entry.vm_flags, + Optional(ElementsAreArray({"rd", "wr", "sh", "mr", "mw", "me", + "ms", "lo", "??", "sd"}))); +} + +// Returns the unique entry in entries containing the given address. 
+PosixErrorOr<ProcPidSmapsEntry> FindUniqueSmapsEntry(
+    std::vector<ProcPidSmapsEntry> const& entries, uintptr_t addr) {
+  auto const pred = [&](ProcPidSmapsEntry const& entry) {
+    return entry.maps_entry.start <= addr && addr < entry.maps_entry.end;
+  };
+  auto const it = std::find_if(entries.begin(), entries.end(), pred);
+  if (it == entries.end()) {
+    return PosixError(EINVAL,
+                      absl::StrFormat("no entry contains address %#x", addr));
+  }
+  auto const it2 = std::find_if(it + 1, entries.end(), pred);
+  if (it2 != entries.end()) {
+    return PosixError(
+        EINVAL,
+        absl::StrFormat("overlapping entries [%#x-%#x) and [%#x-%#x) both "
+                        "contain address %#x",
+                        it->maps_entry.start, it->maps_entry.end,
+                        it2->maps_entry.start, it2->maps_entry.end, addr));
+  }
+  return *it;
+}
+
+PosixErrorOr<std::vector<ProcPidSmapsEntry>> ReadProcSelfSmaps() {
+  ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents("/proc/self/smaps"));
+  return ParseProcPidSmaps(contents);
+}
+
+TEST(ProcPidSmapsTest, SharedAnon) {
+  // Map with MAP_POPULATE so we get some RSS.
+  Mapping const m = ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(
+      2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE));
+  auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps());
+  auto const entry =
+      ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr()));
+
+  EXPECT_EQ(entry.size_kb, m.len() / 1024);
+  // It's possible that populated pages have been swapped out, so RSS might be
+  // less than size.
+  EXPECT_LE(entry.rss_kb, entry.size_kb);
+
+  if (entry.pss_kb) {
+    // PSS should be exactly equal to RSS since no other address spaces should
+    // be sharing our new mapping.
+    EXPECT_EQ(entry.pss_kb.value(), entry.rss_kb);
+  }
+
+  // "Shared" and "private" in smaps refers to whether or not *physical pages*
+  // are shared; thus all pages in our MAP_SHARED mapping should nevertheless
+  // be private.
+ EXPECT_EQ(entry.shared_clean_kb, 0); + EXPECT_EQ(entry.shared_dirty_kb, 0); + EXPECT_EQ(entry.private_clean_kb + entry.private_dirty_kb, entry.rss_kb) + << "Private_Clean = " << entry.private_clean_kb + << " kB, Private_Dirty = " << entry.private_dirty_kb << " kB"; + + // Shared anonymous mappings are implemented as a shmem file, so their pages + // are not PageAnon. + if (entry.anonymous_kb) { + EXPECT_EQ(entry.anonymous_kb.value(), 0); + } + + if (entry.vm_flags) { + EXPECT_THAT(entry.vm_flags.value(), + IsSupersetOf({"rd", "wr", "sh", "mr", "mw", "me", "ms"})); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex"))); + } +} + +TEST(ProcPidSmapsTest, PrivateAnon) { + // Map with MAP_POPULATE so we get some RSS. + Mapping const m = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(2 * kPageSize, PROT_WRITE, MAP_PRIVATE | MAP_POPULATE)); + auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps()); + auto const entry = + ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr())); + + // It's possible that our mapping was merged with another vma, so the smaps + // entry might be bigger than our original mapping. + EXPECT_GE(entry.size_kb, m.len() / 1024); + EXPECT_LE(entry.rss_kb, entry.size_kb); + if (entry.pss_kb) { + EXPECT_LE(entry.pss_kb.value(), entry.rss_kb); + } + + if (entry.anonymous_kb) { + EXPECT_EQ(entry.anonymous_kb.value(), entry.rss_kb); + } + + if (entry.vm_flags) { + EXPECT_THAT(entry.vm_flags.value(), IsSupersetOf({"wr", "mr", "mw", "me"})); + // We passed PROT_WRITE to mmap. On at least x86, the mapping is in + // practice readable because there is no way to configure the MMU to make + // pages writable but not readable. However, VmFlags should reflect the + // flags set on the VMA, so "rd" (VM_READ) should not appear in VmFlags. 
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("rd"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("sh"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ms"))); + } +} + +TEST(ProcPidSmapsTest, SharedReadOnlyFile) { + size_t const kFileSize = kPageSize; + + auto const temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_THAT(truncate(temp_file.path().c_str(), kFileSize), SyscallSucceeds()); + auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY)); + + auto const m = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kFileSize, PROT_READ, MAP_SHARED | MAP_POPULATE, fd.get(), 0)); + auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps()); + auto const entry = + ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr())); + + // Most of the same logic as the SharedAnon case applies. + EXPECT_EQ(entry.size_kb, kFileSize / 1024); + EXPECT_LE(entry.rss_kb, entry.size_kb); + if (entry.pss_kb) { + EXPECT_EQ(entry.pss_kb.value(), entry.rss_kb); + } + EXPECT_EQ(entry.shared_clean_kb, 0); + EXPECT_EQ(entry.shared_dirty_kb, 0); + EXPECT_EQ(entry.private_clean_kb + entry.private_dirty_kb, entry.rss_kb) + << "Private_Clean = " << entry.private_clean_kb + << " kB, Private_Dirty = " << entry.private_dirty_kb << " kB"; + if (entry.anonymous_kb) { + EXPECT_EQ(entry.anonymous_kb.value(), 0); + } + + if (entry.vm_flags) { + EXPECT_THAT(entry.vm_flags.value(), IsSupersetOf({"rd", "mr", "me", "ms"})); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("wr"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex"))); + // Because the mapped file was opened O_RDONLY, the VMA is !VM_MAYWRITE and + // also !VM_SHARED. 
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("sh"))); + EXPECT_THAT(entry.vm_flags.value(), Not(Contains("mw"))); + } +} + +// Tests that gVisor's /proc/[pid]/smaps provides all of the fields we expect it +// to, which as of this writing is all fields provided by Linux 4.4. +TEST(ProcPidSmapsTest, GvisorFields) { + SKIP_IF(!IsRunningOnGvisor()); + auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps()); + for (auto const& entry : entries) { + EXPECT_TRUE(entry.pss_kb); + EXPECT_TRUE(entry.referenced_kb); + EXPECT_TRUE(entry.anonymous_kb); + EXPECT_TRUE(entry.anon_huge_pages_kb); + EXPECT_TRUE(entry.shared_hugetlb_kb); + EXPECT_TRUE(entry.private_hugetlb_kb); + EXPECT_TRUE(entry.swap_kb); + EXPECT_TRUE(entry.swap_pss_kb); + EXPECT_THAT(entry.kernel_page_size_kb, Optional(kPageSize / 1024)); + EXPECT_THAT(entry.mmu_page_size_kb, Optional(kPageSize / 1024)); + EXPECT_TRUE(entry.locked_kb); + EXPECT_TRUE(entry.vm_flags); + } +} + +} // namespace + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From dc4849e49c354ee43256a4a117a217f86767a059 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 7 Jan 2019 15:39:14 -0800 Subject: Add usermem support for arm64 platform. 
Signed-off-by: Haibo Xu PiperOrigin-RevId: 228249611 Change-Id: I1046e70bec4274f18b9948eefd6b0d546e4c48bb --- pkg/sentry/usermem/BUILD | 1 + pkg/sentry/usermem/usermem_arm64.go | 53 +++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 pkg/sentry/usermem/usermem_arm64.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index 69ba919e0..dae41ed0e 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -24,6 +24,7 @@ go_library( "bytes_io.go", "bytes_io_unsafe.go", "usermem.go", + "usermem_arm64.go", "usermem_x86.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", diff --git a/pkg/sentry/usermem/usermem_arm64.go b/pkg/sentry/usermem/usermem_arm64.go new file mode 100644 index 000000000..7fd4ce963 --- /dev/null +++ b/pkg/sentry/usermem/usermem_arm64.go @@ -0,0 +1,53 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package usermem + +import ( + "encoding/binary" + "syscall" +) + +const ( + // PageSize is the system page size. + // arm64 support 4K/16K/64K page size, + // which can be get by syscall.Getpagesize(). + // Currently, only 4K page size is supported. + PageSize = 1 << PageShift + + // HugePageSize is the system huge page size. + HugePageSize = 1 << HugePageShift + + // PageShift is the binary log of the system page size. 
+ PageShift = 12 + + // HugePageShift is the binary log of the system huge page size. + // Should be calculated by "PageShift + (PageShift - 3)" + // when multiple page size support is ready. + HugePageShift = 21 +) + +var ( + // ByteOrder is the native byte order (little endian). + ByteOrder = binary.LittleEndian +) + +func init() { + // Make sure the page size is 4K on arm64 platform. + if size := syscall.Getpagesize(); size != PageSize { + panic("Only 4K page size is supported on arm64!") + } +} -- cgit v1.2.3 From f95b94fbe3e557b16ed2b78c87e8936c0aeab6c5 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 8 Jan 2019 12:51:04 -0800 Subject: Grant no initial capabilities to non-root UIDs. See modified comment in auth.NewUserCredentials(); compare to the behavior of setresuid(2) as implemented by //pkg/sentry/kernel/task_identity.go:kernel.Task.setKUIDsUncheckedLocked(). PiperOrigin-RevId: 228381765 Change-Id: I45238777c8f63fcf41b99fce3969caaf682fe408 --- pkg/sentry/kernel/auth/credentials.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index de33f1953..a843b9aab 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -119,19 +119,24 @@ func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *T // Set additional GIDs. creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) - // Set capabilities. If capabilities aren't specified, we default to - // all capabilities. + // Set capabilities. if capabilities != nil { creds.PermittedCaps = capabilities.PermittedCaps creds.EffectiveCaps = capabilities.EffectiveCaps creds.BoundingCaps = capabilities.BoundingCaps creds.InheritableCaps = capabilities.InheritableCaps - // // TODO: Support ambient capabilities. + // TODO: Support ambient capabilities. 
} else { - // If no capabilities are specified, grant the same capabilities - // that NewRootCredentials does. - creds.PermittedCaps = AllCapabilities - creds.EffectiveCaps = AllCapabilities + // If no capabilities are specified, grant capabilities consistent with + // setresuid + setresgid from NewRootCredentials to the given uid and + // gid. + if kuid == RootKUID { + creds.PermittedCaps = AllCapabilities + creds.EffectiveCaps = AllCapabilities + } else { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 + } creds.BoundingCaps = AllCapabilities } -- cgit v1.2.3 From 3676b7ff1ca07e9fec1e380a0c2068390ce5d8de Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 8 Jan 2019 12:56:59 -0800 Subject: Improve loader related error messages returned to users. PiperOrigin-RevId: 228382827 Change-Id: Ica1d30e0df826bdd77f180a5092b2b735ea5c804 --- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 7 ++++--- pkg/sentry/kernel/task_context.go | 13 +++++++------ pkg/sentry/loader/BUILD | 1 + pkg/sentry/loader/loader.go | 28 +++++++++++----------------- pkg/sentry/syscalls/linux/sys_thread.go | 6 +++--- 6 files changed, 27 insertions(+), 29 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 490f674c0..7d41626dc 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -184,6 +184,7 @@ go_library( "//pkg/state", "//pkg/state/statefile", "//pkg/sync", + "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index cb61e27f1..43e9823cb 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -31,6 +31,7 @@ package kernel import ( + "errors" "fmt" "io" "path/filepath" @@ -658,9 +659,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, // Create a fresh task context. 
remainingTraversals = uint(args.MaxSymlinkTraversals) - tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) - if err != nil { - return nil, 0, err + tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + if se != nil { + return nil, 0, errors.New(se.String()) } // Take a reference on the FDMap, which will be transferred to diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index aaff309f0..ee3e49d17 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -15,9 +15,9 @@ package kernel import ( - "errors" "fmt" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -26,10 +26,10 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/loader" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" ) -// ErrNoSyscalls is returned if there is no syscall table. -var ErrNoSyscalls = errors.New("no syscall table found") +var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) // Auxmap contains miscellaneous data for the task. 
type Auxmap map[string]interface{} @@ -142,7 +142,7 @@ func (t *Task) Stack() *arch.Stack { // * argv: Binary argv // * envv: Binary envv // * fs: Binary FeatureSet -func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) { +func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) { // Prepare a new user address space to load into. m := mm.NewMemoryManager(k) defer m.DecUsers(ctx) @@ -155,8 +155,9 @@ func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, r // Lookup our new syscall table. st, ok := LookupSyscallTable(os, ac.Arch()) if !ok { - // No syscall table found. Yikes. - return nil, ErrNoSyscalls + // No syscall table found. This means that the ELF binary does not match + // the architecture. 
+ return nil, errNoSyscalls } if !m.IncUsers() { diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 83cad186a..24e734b49 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -44,6 +44,7 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/syserr", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index e955502e3..deb8892f6 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -17,6 +17,7 @@ package loader import ( "bytes" + "fmt" "io" "path" @@ -30,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -196,20 +198,18 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac // Preconditions: // * The Task MemoryManager is empty. // * Load is called on the Task goroutine. -func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, error) { +func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { // Load the binary itself. loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv) if err != nil { - ctx.Infof("Failed to load %s: %v", filename, err) - return 0, nil, "", err + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux()) } defer d.DecRef() // Load the VDSO. 
vdsoAddr, err := loadVDSO(ctx, m, vdso, loaded) if err != nil { - ctx.Infof("Error loading VDSO: %v", err) - return 0, nil, "", err + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux()) } // Setup the heap. brk starts at the next page after the end of the @@ -217,35 +217,30 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r // loaded.end is available for its use. e, ok := loaded.end.RoundUp() if !ok { - ctx.Warningf("brk overflows: %#x", loaded.end) - return 0, nil, "", syserror.ENOEXEC + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("brk overflows: %#x", loaded.end), linux.ENOEXEC) } m.BrkSetup(ctx, e) // Allocate our stack. stack, err := allocStack(ctx, m, ac) if err != nil { - ctx.Infof("Failed to allocate stack: %v", err) - return 0, nil, "", err + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to allocate stack: %v", err), syserr.FromError(err).ToLinux()) } // Push the original filename to the stack, for AT_EXECFN. execfn, err := stack.Push(filename) if err != nil { - ctx.Infof("Failed to push exec filename: %v", err) - return 0, nil, "", err + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux()) } // Push 16 random bytes on the stack which AT_RANDOM will point to. 
var b [16]byte if _, err := rand.Read(b[:]); err != nil { - ctx.Infof("Failed to read random bytes: %v", err) - return 0, nil, "", err + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to read random bytes: %v", err), syserr.FromError(err).ToLinux()) } random, err := stack.Push(b) if err != nil { - ctx.Infof("Failed to push random bytes: %v", err) - return 0, nil, "", err + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push random bytes: %v", err), syserr.FromError(err).ToLinux()) } c := auth.CredentialsFromContext(ctx) @@ -266,8 +261,7 @@ func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, r sl, err := stack.Load(argv, envv, auxv) if err != nil { - ctx.Infof("Failed to load stack: %v", err) - return 0, nil, "", err + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load stack: %v", err), syserr.FromError(err).ToLinux()) } m.SetArgvStart(sl.ArgvStart) diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index c12693ee2..61cafefb9 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -104,9 +104,9 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Load the new TaskContext. maxTraversals := uint(linux.MaxSymlinkTraversals) - tc, err := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, argv, envv, t.Arch().FeatureSet()) - if err != nil { - return 0, nil, err + tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, argv, envv, t.Arch().FeatureSet()) + if se != nil { + return 0, nil, se.ToError() } ctrl, err := t.Execve(tc) -- cgit v1.2.3 From dd761c170cc2d44eee20757a6088f80a9322342c Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Tue, 8 Jan 2019 17:12:14 -0800 Subject: Allow MSG_OOB and MSG_DONTROUTE to be no-ops on recvmsg(2). 
PiperOrigin-RevId: 228428223 Change-Id: I433ba5ffc15ea4c2706ec944901b8269b1f364f8 --- pkg/sentry/syscalls/linux/sys_socket.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 3049fe6e5..1513f28e7 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -63,6 +63,10 @@ var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) +// baseRecvFlags are the flags that are accepted across recvmsg(2), +// recvmmsg(2), and recvfrom(2). +const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC + // MessageHeader64 is the 64-bit representation of the msghdr struct used in // the recvmsg and sendmsg syscalls. type MessageHeader64 struct { @@ -602,7 +606,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE|linux.MSG_WAITALL) != 0 { + if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { return 0, nil, syscall.EINVAL } @@ -637,7 +641,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Reject flags that we don't handle yet. 
- if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE|linux.MSG_WAITALL) != 0 { + if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { return 0, nil, syscall.EINVAL } @@ -794,7 +798,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f } // Reject flags that we don't handle yet. - if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CTRUNC|linux.MSG_CONFIRM|linux.MSG_WAITALL) != 0 { + if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { return 0, syscall.EINVAL } -- cgit v1.2.3 From d321f575e2acec6f1077ab09ff0a089fa6ac0ec6 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 9 Jan 2019 10:28:20 -0800 Subject: Fix lock order violation. overlayFileOperations.Readdir was holding overlay.copyMu while calling DirentReaddir, which then attempts to take take the corresponding Dirent.mu, causing a lock order violation. (See lock order documentation in fs/copy_up.go.) We only actually need to hold copyMu during readdirEntries(), so holding the lock is moved in there, thus avoiding the lock order violation. A new lock was added to protect overlayFileOperations.dirCache. We were inadvertently relying on copyMu to protect this. There is no reason it should not have its own lock. PiperOrigin-RevId: 228542473 Change-Id: I03c3a368c8cbc0b5a79d50cc486fc94adaddc1c2 --- pkg/sentry/fs/file_overlay.go | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 9b958b64b..cd231bdef 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -82,9 +82,14 @@ type overlayFileOperations struct { upper *File lower *File - // dirCursor is a directory cursor for a directory in an overlay. 
+ // dirCursor is a directory cursor for a directory in an overlay. It is + // protected by File.mu of the owning file, which is held during + // Readdir and Seek calls. dirCursor string + // dirCacheMu protects dirCache. + dirCacheMu sync.RWMutex `state:"nosave"` + // dirCache is cache of DentAttrs from upper and lower Inodes. dirCache *SortedDentryMap } @@ -180,21 +185,38 @@ func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, seriali // Otherwise proceed with usual overlay readdir. o := file.Dirent.Inode.overlay - o.copyMu.RLock() - defer o.copyMu.RUnlock() - - var err error - f.dirCache, err = readdirEntries(ctx, o) + // readdirEntries holds o.copyUpMu to ensure that copy-up does not + // occur while calculating the readir results. + // + // However, it is possible for a copy-up to occur after the call to + // readdirEntries, but before setting f.dirCache. This is OK, since + // copy-up only does not change the children in a way that would affect + // the children returned in dirCache. Copy-up only moves + // files/directories between layers in the overlay. + // + // It is also possible for Readdir to race with a Create operation + // (which may trigger a copy-up during it's execution). Depending on + // whether the Create happens before or after the readdirEntries call, + // the newly created file may or may not appear in the readdir results. + // But this can only be caused by a real race between readdir and + // create syscalls, so it's also OK. + dirCache, err := readdirEntries(ctx, o) if err != nil { return file.Offset(), err } + f.dirCacheMu.Lock() + f.dirCache = dirCache + f.dirCacheMu.Unlock() + return DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) } // IterateDir implements DirIterator.IterateDir. 
func (f *overlayFileOperations) IterateDir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) { + f.dirCacheMu.RLock() n, err := GenericReaddir(dirCtx, f.dirCache) + f.dirCacheMu.RUnlock() return offset + n, err } @@ -323,6 +345,9 @@ func (*overlayFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arc // readdirEntries returns a sorted map of directory entries from the // upper and/or lower filesystem. func readdirEntries(ctx context.Context, o *overlayEntry) (*SortedDentryMap, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + // Assert that there is at least one upper or lower entry. if o.upper == nil && o.lower == nil { panic("invalid overlayEntry, needs at least one Inode") -- cgit v1.2.3 From 9270d940eb1a6e31587c34f4644189f3b2c002e1 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 9 Jan 2019 17:53:31 -0800 Subject: Minor memevent fixes. - Call MemoryEvents.done.Add(1) outside of MemoryEvents.run() so that if MemoryEvents.Stop() => MemoryEvents.done.Wait() is called before the goroutine starts running, it still waits for the goroutine to stop. - Use defer to call MemoryEvents.done.Done() in MemoryEvents.run() so that it's called even if the goroutine panics. PiperOrigin-RevId: 228623307 Change-Id: I1b0459e7999606c1a1a271b16092b1ca87005015 --- pkg/sentry/kernel/memevent/memory_events.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index 1a8e86827..b6283c5d1 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -69,11 +69,12 @@ func (m *MemoryEvents) Start() { if m.period == 0 { return } + m.done.Add(1) go m.run() // S/R-SAFE: doesn't interact with saved state. } func (m *MemoryEvents) run() { - m.done.Add(1) + defer m.done.Done() // Emit the first event immediately on startup. 
totalTicks.Increment() @@ -85,7 +86,6 @@ func (m *MemoryEvents) run() { for { select { case <-m.stop: - m.done.Done() return case <-ticker.C: totalTicks.Increment() -- cgit v1.2.3 From 7f8de3bf92decbd745a4bc4e8aebf1ba1159ed4b Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Thu, 10 Jan 2019 09:43:43 -0800 Subject: Fixing select call to not enforce RLIMIT_NOFILE. Removing check to RLIMIT_NOFILE in select call. Adding unit test to select suite to document behavior. Moving setrlimit class from mlock to a util file for reuse. Fixing flaky test based on comments from Jamie. PiperOrigin-RevId: 228726131 Change-Id: Ie9dbe970bbf835ba2cca6e17eec7c2ee6fadf459 --- pkg/sentry/syscalls/linux/sys_poll.go | 3 +- test/syscalls/linux/BUILD | 6 ++++ test/syscalls/linux/mlock.cc | 19 ++--------- test/syscalls/linux/select.cc | 62 ++++++++++++++++++++++++++++------- test/util/BUILD | 13 ++++++++ test/util/rlimit_util.cc | 44 +++++++++++++++++++++++++ test/util/rlimit_util.h | 32 ++++++++++++++++++ 7 files changed, 151 insertions(+), 28 deletions(-) create mode 100644 test/util/rlimit_util.cc create mode 100644 test/util/rlimit_util.h (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index bf0958435..0cf6aad7f 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -82,7 +82,7 @@ func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Durati } func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { - if nfds < 0 || uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + if nfds < 0 || nfds > fileCap { return 0, syserror.EINVAL } @@ -90,6 +90,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // // N.B. This only works on little-endian architectures. 
byteCount := (nfds + 7) / 8 + bitsInLastPartialByte := uint(nfds % 8) r := make([]byte, byteCount) w := make([]byte, byteCount) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 028c686a8..a8a6e15ee 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1033,6 +1033,7 @@ cc_binary( "//test/util:cleanup", "//test/util:memory_util", "//test/util:multiprocess_util", + "//test/util:rlimit_util", "//test/util:test_main", "//test/util:test_util", "@com_google_googletest//:gtest", @@ -1650,6 +1651,11 @@ cc_binary( linkstatic = 1, deps = [ ":base_poll_test", + "//test/util:file_descriptor", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:rlimit_util", + "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", "@com_google_absl//absl/time", diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc index 1d93bff58..a492b2404 100644 --- a/test/syscalls/linux/mlock.cc +++ b/test/syscalls/linux/mlock.cc @@ -12,18 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include #include #include #include #include +#include +#include #include "gmock/gmock.h" #include "test/util/capability_util.h" #include "test/util/cleanup.h" #include "test/util/memory_util.h" #include "test/util/multiprocess_util.h" +#include "test/util/rlimit_util.h" #include "test/util/test_util.h" using ::testing::_; @@ -58,20 +59,6 @@ bool IsPageMlocked(uintptr_t addr) { return true; } -PosixErrorOr ScopedSetSoftRlimit(int resource, rlim_t newval) { - struct rlimit old_rlim; - if (getrlimit(resource, &old_rlim) != 0) { - return PosixError(errno, "getrlimit failed"); - } - struct rlimit new_rlim = old_rlim; - new_rlim.rlim_cur = newval; - if (setrlimit(resource, &new_rlim) != 0) { - return PosixError(errno, "setrlimit failed"); - } - return Cleanup([resource, old_rlim] { - TEST_PCHECK(setrlimit(resource, &old_rlim) == 0); - }); -} TEST(MlockTest, Basic) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock())); diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc index 6b6fa9217..41e6043cc 100644 --- a/test/syscalls/linux/select.cc +++ b/test/syscalls/linux/select.cc @@ -12,14 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include +#include +#include #include +#include +#include +#include +#include #include "gtest/gtest.h" #include "gtest/gtest.h" #include "absl/time/time.h" #include "test/syscalls/linux/base_poll_test.h" +#include "test/util/file_descriptor.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/rlimit_util.h" +#include "test/util/temp_path.h" #include "test/util/test_util.h" namespace gvisor { @@ -57,15 +66,27 @@ TEST_F(SelectTest, NegativeNfds) { } TEST_F(SelectTest, ClosedFds) { - fd_set read_set; - FD_ZERO(&read_set); - int fd; - ASSERT_THAT(fd = dup(1), SyscallSucceeds()); - ASSERT_THAT(close(fd), SyscallSucceeds()); - FD_SET(fd, &read_set); - struct timeval timeout = absl::ToTimeval(absl::Milliseconds(10)); - EXPECT_THAT(select(fd + 1, &read_set, nullptr, nullptr, &timeout), - SyscallFailsWithErrno(EBADF)); + auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY)); + + // We can't rely on a file descriptor being closed in a multi threaded + // application so fork to get a clean process. + EXPECT_THAT(InForkedProcess([&] { + int fd_num = fd.get(); + fd.reset(); + + fd_set read_set; + FD_ZERO(&read_set); + FD_SET(fd_num, &read_set); + + struct timeval timeout = + absl::ToTimeval(absl::Milliseconds(10)); + TEST_PCHECK(select(fd_num + 1, &read_set, nullptr, nullptr, + &timeout) != 0); + TEST_PCHECK(errno == EBADF); + }), + IsPosixErrorOkAndHolds(0)); } TEST_F(SelectTest, ZeroTimeout) { @@ -123,6 +144,25 @@ TEST_F(SelectTest, IgnoreBitsAboveNfds) { SyscallSucceedsWithValue(0)); } +// This test illustrates Linux's behavior of 'select' calls passing after +// setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on +// this behavior. 
+TEST_F(SelectTest, SetrlimitCallNOFILE) { + fd_set read_set; + FD_ZERO(&read_set); + timeval timeout = {}; + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(NewTempAbsPath(), O_RDONLY | O_CREAT, S_IRUSR)); + + Cleanup reset_rlimit = + ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_NOFILE, 0)); + + FD_SET(fd.get(), &read_set); + // this call with zero timeout should return immediately + EXPECT_THAT(select(fd.get() + 1, &read_set, nullptr, nullptr, &timeout), + SyscallSucceeds()); +} + } // namespace } // namespace testing } // namespace gvisor diff --git a/test/util/BUILD b/test/util/BUILD index 10507eae4..6316fec6e 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -272,3 +272,16 @@ cc_library( "@com_google_googletest//:gtest", ], ) + +cc_library( + name = "rlimit_util", + testonly = 1, + srcs = ["rlimit_util.cc"], + hdrs = ["rlimit_util.h"], + deps = [ + ":cleanup", + ":logging", + ":posix_error", + ":test_util", + ], +) diff --git a/test/util/rlimit_util.cc b/test/util/rlimit_util.cc new file mode 100644 index 000000000..a9912c372 --- /dev/null +++ b/test/util/rlimit_util.cc @@ -0,0 +1,44 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/util/rlimit_util.h" + +#include +#include + +#include "test/util/cleanup.h" +#include "test/util/logging.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +PosixErrorOr ScopedSetSoftRlimit(int resource, rlim_t newval) { + struct rlimit old_rlim; + if (getrlimit(resource, &old_rlim) != 0) { + return PosixError(errno, "getrlimit failed"); + } + struct rlimit new_rlim = old_rlim; + new_rlim.rlim_cur = newval; + if (setrlimit(resource, &new_rlim) != 0) { + return PosixError(errno, "setrlimit failed"); + } + return Cleanup([resource, old_rlim] { + TEST_PCHECK(setrlimit(resource, &old_rlim) == 0); + }); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/util/rlimit_util.h b/test/util/rlimit_util.h new file mode 100644 index 000000000..fa5cc70dc --- /dev/null +++ b/test/util/rlimit_util.h @@ -0,0 +1,32 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef GVISOR_TEST_UTIL_RLIMIT_UTIL_H_ +#define GVISOR_TEST_UTIL_RLIMIT_UTIL_H_ + +#include +#include + +#include "test/util/cleanup.h" +#include "test/util/posix_error.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +PosixErrorOr ScopedSetSoftRlimit(int resource, rlim_t newval); + +} // namespace testing +} // namespace gvisor +#endif // GVISOR_TEST_UTIL_RLIMIT_UTIL_H_ -- cgit v1.2.3 From dc8450b5676d4c4ac9bcfa23cabd862e0060527d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 14 Jan 2019 20:33:29 -0800 Subject: Remove fs.Handle, ramfs.Entry, and all the DeprecatedFileOperations. More helper structs have been added to the fsutil package to make it easier to implement fs.InodeOperations and fs.FileOperations. PiperOrigin-RevId: 229305982 Change-Id: Ib6f8d3862f4216745116857913dbfa351530223b --- pkg/abi/linux/fs.go | 2 + pkg/sentry/fs/BUILD | 3 +- pkg/sentry/fs/anon/anon.go | 16 +- pkg/sentry/fs/ashmem/BUILD | 1 + pkg/sentry/fs/ashmem/area.go | 16 +- pkg/sentry/fs/ashmem/device.go | 133 +-------- pkg/sentry/fs/attr.go | 47 +++ pkg/sentry/fs/binder/BUILD | 2 +- pkg/sentry/fs/binder/binder.go | 135 +-------- pkg/sentry/fs/dev/BUILD | 1 + pkg/sentry/fs/dev/dev.go | 29 +- pkg/sentry/fs/dev/fs.go | 2 + pkg/sentry/fs/dev/full.go | 55 ++-- pkg/sentry/fs/dev/null.go | 88 ++++-- pkg/sentry/fs/dev/random.go | 55 ++-- pkg/sentry/fs/dirent.go | 8 +- pkg/sentry/fs/fdpipe/pipe.go | 14 +- pkg/sentry/fs/file_operations.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 7 +- pkg/sentry/fs/filetest/filetest.go | 16 +- pkg/sentry/fs/fsutil/BUILD | 15 - pkg/sentry/fs/fsutil/file.go | 226 ++++++++++----- pkg/sentry/fs/fsutil/fsutil.go | 2 - pkg/sentry/fs/fsutil/handle.go | 128 --------- pkg/sentry/fs/fsutil/handle_test.go | 227 --------------- pkg/sentry/fs/fsutil/inode.go | 409 ++++++++++++++------------ pkg/sentry/fs/fsutil/inode_cached_test.go | 14 +- pkg/sentry/fs/gofer/file.go | 2 +- pkg/sentry/fs/gofer/fs.go | 14 +- 
pkg/sentry/fs/gofer/inode.go | 1 - pkg/sentry/fs/host/file.go | 4 +- pkg/sentry/fs/host/fs.go | 2 + pkg/sentry/fs/host/inode.go | 1 - pkg/sentry/fs/inode.go | 29 +- pkg/sentry/fs/inode_operations.go | 80 ------ pkg/sentry/fs/inode_overlay.go | 13 - pkg/sentry/fs/inode_overlay_test.go | 62 +++- pkg/sentry/fs/mock.go | 11 - pkg/sentry/fs/mount.go | 14 +- pkg/sentry/fs/mounts_test.go | 15 +- pkg/sentry/fs/proc/BUILD | 4 +- pkg/sentry/fs/proc/cpuinfo.go | 41 +-- pkg/sentry/fs/proc/exec_args.go | 57 +++- pkg/sentry/fs/proc/fds.go | 138 +++++---- pkg/sentry/fs/proc/file.go | 58 ---- pkg/sentry/fs/proc/inode.go | 96 +++++++ pkg/sentry/fs/proc/net.go | 55 ++-- pkg/sentry/fs/proc/proc.go | 152 +++++----- pkg/sentry/fs/proc/rpcinet_proc.go | 246 ++++++++-------- pkg/sentry/fs/proc/seqfile/BUILD | 7 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 133 ++++++--- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 45 +-- pkg/sentry/fs/proc/sys.go | 116 +++++--- pkg/sentry/fs/proc/sys_net.go | 325 +++++++++++++-------- pkg/sentry/fs/proc/sys_net_state.go | 17 +- pkg/sentry/fs/proc/sys_net_test.go | 28 +- pkg/sentry/fs/proc/task.go | 264 +++++++++++------ pkg/sentry/fs/proc/uid_gid_map.go | 17 +- pkg/sentry/fs/proc/uptime.go | 40 ++- pkg/sentry/fs/ramfs/BUILD | 6 +- pkg/sentry/fs/ramfs/dir.go | 223 +++++++++++---- pkg/sentry/fs/ramfs/file.go | 150 ---------- pkg/sentry/fs/ramfs/ramfs.go | 441 ----------------------------- pkg/sentry/fs/ramfs/socket.go | 48 +++- pkg/sentry/fs/ramfs/symlink.go | 67 +++-- pkg/sentry/fs/ramfs/test/BUILD | 16 -- pkg/sentry/fs/ramfs/test/test.go | 46 --- pkg/sentry/fs/ramfs/tree.go | 3 +- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/sys/BUILD | 3 +- pkg/sentry/fs/sys/devices.go | 51 ++-- pkg/sentry/fs/sys/fs.go | 2 + pkg/sentry/fs/sys/sys.go | 10 +- pkg/sentry/fs/timerfd/timerfd.go | 12 +- pkg/sentry/fs/tmpfs/BUILD | 2 + pkg/sentry/fs/tmpfs/file_regular.go | 14 +- pkg/sentry/fs/tmpfs/file_test.go | 6 +- pkg/sentry/fs/tmpfs/fs.go | 2 + 
pkg/sentry/fs/tmpfs/inode_file.go | 112 ++++---- pkg/sentry/fs/tmpfs/tmpfs.go | 164 +++++++++-- pkg/sentry/fs/tty/BUILD | 2 - pkg/sentry/fs/tty/dir.go | 108 ++----- pkg/sentry/fs/tty/inode.go | 145 ---------- pkg/sentry/fs/tty/master.go | 23 +- pkg/sentry/fs/tty/slave.go | 25 +- pkg/sentry/kernel/epoll/epoll.go | 12 +- pkg/sentry/kernel/eventfd/eventfd.go | 14 +- pkg/sentry/kernel/pipe/node.go | 40 +-- pkg/sentry/kernel/pipe/node_test.go | 36 +-- pkg/sentry/kernel/pipe/pipe.go | 31 +- pkg/sentry/kernel/pipe/reader_writer.go | 10 +- pkg/sentry/loader/vdso.go | 48 ++-- pkg/sentry/socket/epsocket/epsocket.go | 10 +- pkg/sentry/socket/hostinet/socket.go | 10 +- pkg/sentry/socket/netlink/socket.go | 10 +- pkg/sentry/socket/rpcinet/socket.go | 10 +- pkg/sentry/socket/socket.go | 18 +- pkg/sentry/socket/unix/unix.go | 10 +- runsc/boot/fs.go | 4 +- test/syscalls/linux/proc.cc | 5 + 100 files changed, 2547 insertions(+), 3144 deletions(-) delete mode 100644 pkg/sentry/fs/fsutil/handle.go delete mode 100644 pkg/sentry/fs/fsutil/handle_test.go delete mode 100644 pkg/sentry/fs/proc/file.go create mode 100644 pkg/sentry/fs/proc/inode.go delete mode 100644 pkg/sentry/fs/ramfs/file.go delete mode 100644 pkg/sentry/fs/ramfs/ramfs.go delete mode 100644 pkg/sentry/fs/ramfs/test/BUILD delete mode 100644 pkg/sentry/fs/ramfs/test/test.go delete mode 100644 pkg/sentry/fs/tty/inode.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 0b1c9f3db..a9f2ba132 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -22,8 +22,10 @@ const ( DEVPTS_SUPER_MAGIC = 0x00001cd1 OVERLAYFS_SUPER_MAGIC = 0x794c7630 PIPEFS_MAGIC = 0x50495045 + PROC_SUPER_MAGIC = 0x9fa0 RAMFS_MAGIC = 0x09041934 SOCKFS_MAGIC = 0x534F434B + SYSFS_MAGIC = 0x62656572 TMPFS_MAGIC = 0x01021994 V9FS_MAGIC = 0x01021997 ) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 0fe2b14bf..6f368b0da 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -94,7 +94,8 @@ go_test( 
deps = [ ":fs", "//pkg/sentry/context", - "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go index 743cf511f..a5e8c4f0d 100644 --- a/pkg/sentry/fs/anon/anon.go +++ b/pkg/sentry/fs/anon/anon.go @@ -28,16 +28,12 @@ import ( // with any real filesystem. Some types depend on completely pseudo // "anon" inodes (eventfds, epollfds, etc). func NewInode(ctx context.Context) *fs.Inode { - return fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ - FSType: linux.ANON_INODE_FS_MAGIC, - UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), - Perms: fs.FilePermissions{ - User: fs.PermMask{Read: true, Write: true}, - }, - Links: 1, - }), - }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + iops := &fsutil.SimpleFileInode{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, linux.ANON_INODE_FS_MAGIC), + } + return fs.NewInode(iops, fs.NewPseudoMountSource(), fs.StableAttr{ Type: fs.Anonymous, DeviceID: PseudoDevice.DeviceID(), InodeID: PseudoDevice.NextIno(), diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index 44ef82e64..2463111a8 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index d7dd2c084..7c1b11464 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -28,6 +28,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + 
"gvisor.googlesource.com/gvisor/pkg/waiter" ) const ( @@ -42,9 +43,10 @@ const ( // // +stateify savable type Area struct { - fsutil.NoFsync `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` ad *Device @@ -98,11 +100,6 @@ func (a *Area) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, return 0, syserror.ENOSYS } -// Flush implements fs.FileOperations.Flush. -func (a *Area) Flush(ctx context.Context, file *fs.File) error { - return nil -} - // ConfigureMMap implements fs.FileOperations.ConfigureMMap. func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { a.mu.Lock() @@ -122,8 +119,7 @@ func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MM return syserror.ENOMEM } tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}, k) - // This is not backed by a real filesystem, so we pass in nil. - tmpfsInode := fs.NewInode(tmpfsInodeOps, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{}) + tmpfsInode := fs.NewInode(tmpfsInodeOps, fs.NewPseudoMountSource(), fs.StableAttr{}) dirent := fs.NewDirent(tmpfsInode, namePrefix+"/"+a.name) tmpfsFile, err := tmpfsInode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}) // Drop the extra reference on the Dirent. 
diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index 962da141b..5369d1b0d 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -16,49 +16,40 @@ package ashmem import ( - "sync" - + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" ) // Device implements fs.InodeOperations. // // +stateify savable type Device struct { - fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` - mu sync.Mutex `state:"nosave"` - unstable fs.UnstableAttr + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*Device)(nil) + // NewDevice creates and intializes a Device structure. func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { return &Device{ - unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: fp, - Links: 1, - }), + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, linux.ANON_INODE_FS_MAGIC), } } -// Release implements fs.InodeOperations.Release. 
-func (ad *Device) Release(context.Context) {} - // GetFile implements fs.InodeOperations.GetFile. func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { return fs.NewFile(ctx, d, flags, &Area{ @@ -67,105 +58,3 @@ func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) perms: usermem.AnyAccess, }), nil } - -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (ad *Device) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - ad.mu.Lock() - defer ad.mu.Unlock() - return ad.unstable, nil -} - -// Check implements fs.InodeOperations.Check. -func (ad *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// SetPermissions implements fs.InodeOperations.SetPermissions. -func (ad *Device) SetPermissions(ctx context.Context, inode *fs.Inode, fp fs.FilePermissions) bool { - ad.mu.Lock() - defer ad.mu.Unlock() - ad.unstable.Perms = fp - ad.unstable.StatusChangeTime = time.NowFromContext(ctx) - return true -} - -// SetOwner implements fs.InodeOperations.SetOwner. -func (ad *Device) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - ad.mu.Lock() - defer ad.mu.Unlock() - if owner.UID.Ok() { - ad.unstable.Owner.UID = owner.UID - } - if owner.GID.Ok() { - ad.unstable.Owner.GID = owner.GID - } - return nil -} - -// SetTimestamps implements fs.InodeOperations.SetTimestamps. 
-func (ad *Device) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - ad.mu.Lock() - defer ad.mu.Unlock() - - now := time.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATimeSetSystemTime { - ad.unstable.AccessTime = now - } else { - ad.unstable.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTimeSetSystemTime { - ad.unstable.ModificationTime = now - } else { - ad.unstable.ModificationTime = ts.MTime - } - } - ad.unstable.StatusChangeTime = now - return nil -} - -// Truncate implements fs.InodeOperations.WriteOut. -// -// Ignored by ashmem. -func (ad *Device) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return nil -} - -// AddLink implements fs.InodeOperations.AddLink. -// -// Ashmem doesn't support links, no-op. -func (ad *Device) AddLink() {} - -// DropLink implements fs.InodeOperations.DropLink. -// -// Ashmem doesn't support links, no-op. -func (ad *Device) DropLink() {} - -// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -func (ad *Device) NotifyStatusChange(ctx context.Context) { - ad.mu.Lock() - defer ad.mu.Unlock() - now := time.NowFromContext(ctx) - ad.unstable.ModificationTime = now - ad.unstable.StatusChangeTime = now -} - -// IsVirtual implements fs.InodeOperations.IsVirtual. -// -// Ashmem is virtual. -func (ad *Device) IsVirtual() bool { - return true -} - -// StatFS implements fs.InodeOperations.StatFS. -// -// Ashmem doesn't support querying for filesystem info. -func (ad *Device) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syserror.ENOSYS -} diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 59e060e3c..3523b068a 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -180,6 +180,53 @@ type UnstableAttr struct { Links uint64 } +// SetOwner sets the owner and group if they are valid. +// +// This method is NOT thread-safe. Callers must prevent concurrent calls. 
+func (ua *UnstableAttr) SetOwner(ctx context.Context, owner FileOwner) { + if owner.UID.Ok() { + ua.Owner.UID = owner.UID + } + if owner.GID.Ok() { + ua.Owner.GID = owner.GID + } + ua.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// SetPermissions sets the permissions. +// +// This method is NOT thread-safe. Callers must prevent concurrent calls. +func (ua *UnstableAttr) SetPermissions(ctx context.Context, p FilePermissions) { + ua.Perms = p + ua.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// SetTimestamps sets the timestamps according to the TimeSpec. +// +// This method is NOT thread-safe. Callers must prevent concurrent calls. +func (ua *UnstableAttr) SetTimestamps(ctx context.Context, ts TimeSpec) { + if ts.ATimeOmit && ts.MTimeOmit { + return + } + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + ua.AccessTime = now + } else { + ua.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + ua.ModificationTime = now + } else { + ua.ModificationTime = ts.MTime + } + } + ua.StatusChangeTime = now +} + // WithCurrentTime returns u with AccessTime == ModificationTime == current time. 
func WithCurrentTime(ctx context.Context, u UnstableAttr) UnstableAttr { t := ktime.NowFromContext(ctx) diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index a077b91d2..27155819e 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -16,11 +16,11 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index e642c7f22..19cd55e65 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -24,12 +24,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) const ( @@ -43,34 +43,29 @@ const ( // // +stateify savable type Device struct { + fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` - // mu protects unstable. 
- mu sync.Mutex `state:"nosave"` - unstable fs.UnstableAttr + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*Device)(nil) + // NewDevice creates and intializes a Device structure. func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { return &Device{ - unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: fp, - Links: 1, - }), + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fp, 0), } } -// Release implements fs.InodeOperations.Release. -func (bd *Device) Release(context.Context) {} - // GetFile implements fs.InodeOperations.GetFile. // // TODO: Add functionality to GetFile: Additional fields will be @@ -85,115 +80,13 @@ func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) }), nil } -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (bd *Device) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - bd.mu.Lock() - defer bd.mu.Unlock() - return bd.unstable, nil -} - -// Check implements fs.InodeOperations.Check. -func (bd *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// SetPermissions implements fs.InodeOperations.SetPermissions. -func (bd *Device) SetPermissions(ctx context.Context, inode *fs.Inode, fp fs.FilePermissions) bool { - bd.mu.Lock() - defer bd.mu.Unlock() - bd.unstable.Perms = fp - bd.unstable.StatusChangeTime = time.NowFromContext(ctx) - return true -} - -// SetOwner implements fs.InodeOperations.SetOwner. -func (bd *Device) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - bd.mu.Lock() - defer bd.mu.Unlock() - if owner.UID.Ok() { - bd.unstable.Owner.UID = owner.UID - } - if owner.GID.Ok() { - bd.unstable.Owner.GID = owner.GID - } - return nil -} - -// SetTimestamps implements fs.InodeOperations.SetTimestamps. 
-func (bd *Device) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - bd.mu.Lock() - defer bd.mu.Unlock() - - now := time.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATimeSetSystemTime { - bd.unstable.AccessTime = now - } else { - bd.unstable.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTimeSetSystemTime { - bd.unstable.ModificationTime = now - } else { - bd.unstable.ModificationTime = ts.MTime - } - } - bd.unstable.StatusChangeTime = now - return nil -} - -// Truncate implements fs.InodeOperations.WriteOut. -// -// Ignored for a character device, such as Binder. -func (bd *Device) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return nil -} - -// AddLink implements fs.InodeOperations.AddLink. -// -// Binder doesn't support links, no-op. -func (bd *Device) AddLink() {} - -// DropLink implements fs.InodeOperations.DropLink. -// -// Binder doesn't support links, no-op. -func (bd *Device) DropLink() {} - -// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -func (bd *Device) NotifyStatusChange(ctx context.Context) { - bd.mu.Lock() - defer bd.mu.Unlock() - now := time.NowFromContext(ctx) - bd.unstable.ModificationTime = now - bd.unstable.StatusChangeTime = now -} - -// IsVirtual implements fs.InodeOperations.IsVirtual. -// -// Binder is virtual. -func (bd *Device) IsVirtual() bool { - return true -} - -// StatFS implements fs.InodeOperations.StatFS. -// -// Binder doesn't support querying for filesystem info. -func (bd *Device) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syserror.ENOSYS -} - // Proc implements fs.FileOperations and fs.IoctlGetter. 
// // +stateify savable type Proc struct { - fsutil.NoFsync `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` bd *Device task *kernel.Task diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index b17b5202c..b9cfae05f 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -32,5 +32,6 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 3e127bf04..f8e8099f7 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -16,6 +16,8 @@ package dev import ( + "math" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ashmem" @@ -26,13 +28,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// Dev is the root node. 
-// -// +stateify savable -type Dev struct { - ramfs.Dir -} - func newCharacterDevice(iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), @@ -43,8 +38,7 @@ func newCharacterDevice(iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode } func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - iops := &ramfs.Dir{} - iops.InitDir(ctx, map[string]*fs.Inode{}, fs.RootOwner, fs.FilePermsFromMode(0555)) + iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), InodeID: devDevice.NextIno(), @@ -54,8 +48,7 @@ func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { } func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.Inode { - iops := &ramfs.Symlink{} - iops.InitSymlink(ctx, fs.RootOwner, target) + iops := ramfs.NewSymlink(ctx, fs.RootOwner, target) return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), InodeID: devDevice.NextIno(), @@ -66,8 +59,6 @@ func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.In // New returns the root node of a device filesystem. 
func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEnabled bool) *fs.Inode { - d := &Dev{} - contents := map[string]*fs.Inode{ "fd": newSymlink(ctx, "/proc/self/fd", msrc), "stdin": newSymlink(ctx, "/proc/self/fd/0", msrc), @@ -114,11 +105,19 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn contents["ashmem"] = newCharacterDevice(ashmem, msrc) } - d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return fs.NewInode(d, msrc, fs.StableAttr{ + iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), InodeID: devDevice.NextIno(), BlockSize: usermem.PageSize, Type: fs.Directory, }) } + +// readZeros implements fs.FileOperations.Read with infinite null bytes. +type readZeros struct{} + +// Read implements fs.FileOperations.Read. +func (readZeros) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + return dst.ZeroOut(ctx, math.MaxInt64) +} diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index d96f4f423..abfe689f0 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -33,6 +33,8 @@ const ashmemEnabledKey = "ashmem_enabled" // +stateify savable type filesystem struct{} +var _ fs.Filesystem = (*filesystem)(nil) + func init() { fs.RegisterFilesystem(&filesystem{}) } diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index eeda646ab..cbdd40161 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -15,41 +15,64 @@ package dev import ( - "math" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" 
"gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // fullDevice is used to implement /dev/full. // // +stateify savable type fullDevice struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*fullDevice)(nil) + func newFullDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *fullDevice { - f := &fullDevice{} - f.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + f := &fullDevice{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } return f } -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev by -// returining ENOSPC. -func (f *fullDevice) DeprecatedPwritev(_ context.Context, _ usermem.IOSequence, _ int64) (int64, error) { - return 0, syserror.ENOSPC +// GetFile implements fs.InodeOperations.GetFile. +func (f *fullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + return fs.NewFile(ctx, dirent, flags, &fullFileOperations{}), nil } -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. 
-func (f *fullDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, _ int64) (int64, error) { - return dst.ZeroOut(ctx, math.MaxInt64) +// +stateify savable +type fullFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + readZeros `state:"nosave"` } -// Truncate should be simply ignored for character devices on linux. -func (f *fullDevice) Truncate(context.Context, *fs.Inode, int64) error { - return nil +var _ fs.FileOperations = (*fullFileOperations)(nil) + +// Write implements FileOperations.Write. +func (fullFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.ENOSPC } diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 68090f353..73fd09058 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -15,78 +15,104 @@ package dev import ( - "io" - "math" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // +stateify savable type nullDevice struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory 
`state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*nullDevice)(nil) + func newNullDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *nullDevice { - n := &nullDevice{} - n.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + n := &nullDevice{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } return n } -// DeprecatedPreadv reads data from the device. -func (n *nullDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - return 0, io.EOF -} +// GetFile implements fs.FileOperations.GetFile. +func (n *nullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true -// DeprecatedPwritev discards writes. -func (n *nullDevice) DeprecatedPwritev(_ context.Context, src usermem.IOSequence, offset int64) (int64, error) { - return src.NumBytes(), nil + return fs.NewFile(ctx, dirent, flags, &nullFileOperations{}), nil } -// Truncate should be simply ignored for character devices on linux. 
-func (n *nullDevice) Truncate(context.Context, *fs.Inode, int64) error { - return nil +// +stateify savable +type nullFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRead `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` } +var _ fs.FileOperations = (*nullFileOperations)(nil) + // +stateify savable type zeroDevice struct { nullDevice } +var _ fs.InodeOperations = (*zeroDevice)(nil) + func newZeroDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *zeroDevice { - zd := &zeroDevice{} - zd.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + zd := &zeroDevice{ + nullDevice: nullDevice{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + }, + } return zd } -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (zd *zeroDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - return dst.ZeroOut(ctx, math.MaxInt64) -} - -// GetFile overrides ramfs.Entry.GetFile and returns a zeroFile instead. +// GetFile implements fs.FileOperations.GetFile. func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - // Allow pread(2) and pwrite(2) on this file. 
flags.Pread = true flags.Pwrite = true - return fs.NewFile(ctx, dirent, flags, &zeroFileOperations{ - FileOperations: &fsutil.Handle{HandleOperations: dirent.Inode.HandleOps()}, - }), nil + return fs.NewFile(ctx, dirent, flags, &zeroFileOperations{}), nil } // +stateify savable type zeroFileOperations struct { - fs.FileOperations + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + readZeros `state:"nosave"` } +var _ fs.FileOperations = (*zeroFileOperations)(nil) + // ConfigureMMap implements fs.FileOperations.ConfigureMMap. func (*zeroFileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { m, err := mm.NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 33e4913e4..837b7793a 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -19,37 +19,58 @@ import ( "gvisor.googlesource.com/gvisor/pkg/rand" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // +stateify savable type randomDevice struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket 
`state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes } +var _ fs.InodeOperations = (*randomDevice)(nil) + func newRandomDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *randomDevice { - r := &randomDevice{} - r.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + r := &randomDevice{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } return r } -// DeprecatedPreadv reads random data. -func (*randomDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) +// GetFile implements fs.InodeOperations.GetFile. +func (randomDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &randomFileOperations{}), nil } -// DeprecatedPwritev implements fs.HandleOperations.DeprecatedPwritev. -func (*randomDevice) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - // On Linux, "Writing to /dev/random or /dev/urandom will update the - // entropy pool with the data written, but this will not result in a higher - // entropy count" - random(4). We don't need to support this, but we do - // need to support the write, so just make it a no-op a la /dev/null. - return src.NumBytes(), nil +// +stateify savable +type randomFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` } -// Truncate should be simply ignored for character devices on linux. 
-func (r *randomDevice) Truncate(context.Context, *fs.Inode, int64) error { - return nil +var _ fs.FileOperations = (*randomFileOperations)(nil) + +// Read implements fs.FileOperations.Read. +func (randomFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) } diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index c4918a11b..d6a19dc81 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -837,8 +837,8 @@ func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perm }) } -// getDotAttrs returns the DentAttrs corresponding to "." and ".." directories. -func (d *Dirent) getDotAttrs(root *Dirent) (DentAttr, DentAttr) { +// GetDotAttrs returns the DentAttrs corresponding to "." and ".." directories. +func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) { // Get '.'. sattr := d.Inode.StableAttr dot := DentAttr{ @@ -870,7 +870,7 @@ func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int6 // Collect attrs for "." and "..". attrs := make(map[string]DentAttr) names := []string{".", ".."} - attrs["."], attrs[".."] = d.getDotAttrs(root) + attrs["."], attrs[".."] = d.GetDotAttrs(root) // Get info from all children. d.mu.Lock() @@ -965,7 +965,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, } // Collect attrs for "." and "..". - dot, dotdot := d.getDotAttrs(root) + dot, dotdot := d.GetDotAttrs(root) // Emit "." and ".." if the offset is low enough. 
if offset == 0 { diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index e3b830747..b4d11cb45 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -37,13 +37,13 @@ import ( // // +stateify savable type pipeOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` - waiter.Queue `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + waiter.Queue `state:"nosave"` // flags are the flags used to open the pipe. flags fs.FileFlags `state:".(fs.FileFlags)"` diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index 28e8e233d..81c6e2b5d 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -91,7 +91,7 @@ type FileOperations interface { Flush(ctx context.Context, file *File) error // ConfigureMMap mutates opts to implement mmap(2) for the file. Most - // implementations can either embed fsutil.NoMMap (if they don't support + // implementations can either embed fsutil.FileNoMMap (if they don't support // memory mapping) or call fsutil.GenericConfigureMMap with the appropriate // memmap.Mappable. 
ConfigureMMap(ctx context.Context, file *File, opts *memmap.MMapOpts) error diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index f121cbdda..a4ac58763 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -20,7 +20,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" ) @@ -135,7 +136,7 @@ func TestReaddirRevalidation(t *testing.T) { // Get a handle to the dirent in the upper filesystem so that we can // modify it without going through the dirent. - upperDir := upper.InodeOperations.(*dir).InodeOperations.(*ramfstest.Dir) + upperDir := upper.InodeOperations.(*dir).InodeOperations.(*ramfs.Dir) // Check that overlay returns the files from both upper and lower. openDir, err := overlay.GetFile(ctx, fs.NewDirent(overlay, "stub"), fs.FileFlags{Read: true}) @@ -155,7 +156,7 @@ func TestReaddirRevalidation(t *testing.T) { if err := upperDir.Remove(ctx, upper, "a"); err != nil { t.Fatalf("error removing child: %v", err) } - upperDir.AddChild(ctx, "c", fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), + upperDir.AddChild(ctx, "c", fs.NewInode(fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermissions{}, 0), upper.MountSource, fs.StableAttr{Type: fs.RegularFile})) // Seek to beginning of the directory and do the readdir again. diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 65ca196d9..40d84d9f2 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -31,14 +31,14 @@ import ( // TestFileOperations is an implementation of the File interface. It provides all // required methods. 
type TestFileOperations struct { - fsutil.NoopRelease `state:"nosave"` - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } // NewTestFile creates and initializes a new test file. diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 6834e1272..4965e1a5f 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -67,7 +67,6 @@ go_library( "frame_ref_set.go", "frame_ref_set_impl.go", "fsutil.go", - "handle.go", "host_file_mapper.go", "host_file_mapper_state.go", "host_file_mapper_unsafe.go", @@ -96,20 +95,6 @@ go_library( ], ) -go_test( - name = "fsutil_x_test", - size = "small", - srcs = ["handle_test.go"], - deps = [ - ":fsutil", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", - "//pkg/sentry/fs", - "//pkg/sentry/fs/ramfs/test", - "//pkg/sentry/usermem", - ], -) - go_test( name = "fsutil_test", size = "small", diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 46db2e51c..0970f782b 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -24,12 +24,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// NoopRelease implements FileOperations.Release for files that have no +// FileNoopRelease implements fs.FileOperations.Release for files that have no // resources to release. -type NoopRelease struct{} +type FileNoopRelease struct{} // Release is a no-op. 
-func (NoopRelease) Release() {} +func (FileNoopRelease) Release() {} // SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor // is not nil and the seek was on a directory, the cursor will be updated. @@ -127,71 +127,81 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, return current, syserror.EINVAL } -// GenericSeek implements FileOperations.Seek for files that use a generic -// seek implementation. -type GenericSeek struct{} +// FileGenericSeek implements fs.FileOperations.Seek for files that use a +// generic seek implementation. +type FileGenericSeek struct{} // Seek implements fs.FileOperations.Seek. -func (GenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { +func (FileGenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { return SeekWithDirCursor(ctx, file, whence, offset, nil) } -// ZeroSeek implements FileOperations.Seek for files that maintain a constant -// zero-value offset and require a no-op Seek. -type ZeroSeek struct{} +// FileZeroSeek implements fs.FileOperations.Seek for files that maintain a +// constant zero-value offset and require a no-op Seek. +type FileZeroSeek struct{} -// Seek implements FileOperations.Seek. -func (ZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { +// Seek implements fs.FileOperations.Seek. +func (FileZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { return 0, nil } -// PipeSeek implements FileOperations.Seek and can be used for files that behave -// like pipes (seeking is not supported). -type PipeSeek struct{} +// FileNoSeek implements fs.FileOperations.Seek to return EINVAL. +type FileNoSeek struct{} + +// Seek implements fs.FileOperations.Seek. 
+func (FileNoSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { + return 0, syserror.EINVAL +} -// Seek implements FileOperations.Seek. -func (PipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { +// FilePipeSeek implements fs.FileOperations.Seek and can be used for files +// that behave like pipes (seeking is not supported). +type FilePipeSeek struct{} + +// Seek implements fs.FileOperations.Seek. +func (FilePipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { return 0, syserror.ESPIPE } -// NotDirReaddir implements FileOperations.Readdir for non-directories. -type NotDirReaddir struct{} +// FileNotDirReaddir implements fs.FileOperations.Readdir for non-directories. +type FileNotDirReaddir struct{} -// Readdir implements FileOperations.NotDirReaddir. -func (NotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) { +// Readdir implements fs.FileOperations.FileNotDirReaddir. +func (FileNotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) { return 0, syserror.ENOTDIR } -// NoFsync implements FileOperations.Fsync for files that don't support syncing. -type NoFsync struct{} +// FileNoFsync implements fs.FileOperations.Fsync for files that don't support +// syncing. +type FileNoFsync struct{} -// Fsync implements FileOperations.Fsync. -func (NoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { +// Fsync implements fs.FileOperations.Fsync. +func (FileNoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { return syserror.EINVAL } -// NoopFsync implements FileOperations.Fsync for files that don't need to synced. -type NoopFsync struct{} +// FileNoopFsync implements fs.FileOperations.Fsync for files that don't need +// to synced. +type FileNoopFsync struct{} -// Fsync implements FileOperations.Fsync. 
-func (NoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { +// Fsync implements fs.FileOperations.Fsync. +func (FileNoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { return nil } -// NoopFlush implements FileOperations.Flush as a no-op. -type NoopFlush struct{} +// FileNoopFlush implements fs.FileOperations.Flush as a no-op. +type FileNoopFlush struct{} -// Flush implements FileOperations.Flush. -func (NoopFlush) Flush(context.Context, *fs.File) error { +// Flush implements fs.FileOperations.Flush. +func (FileNoopFlush) Flush(context.Context, *fs.File) error { return nil } -// NoMMap implements fs.FileOperations.Mappable for files that cannot +// FileNoMMap implements fs.FileOperations.Mappable for files that cannot // be memory mapped. -type NoMMap struct{} +type FileNoMMap struct{} // ConfigureMMap implements fs.FileOperations.ConfigureMMap. -func (NoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error { +func (FileNoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error { return syserror.ENODEV } @@ -204,26 +214,43 @@ func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpt return nil } -// NoIoctl implements fs.FileOperations.Ioctl for files that don't implement -// the ioctl syscall. -type NoIoctl struct{} +// FileNoIoctl implements fs.FileOperations.Ioctl for files that don't +// implement the ioctl syscall. +type FileNoIoctl struct{} // Ioctl implements fs.FileOperations.Ioctl. -func (NoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (FileNoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { return 0, syserror.ENOTTY } -// DirFileOperations implements FileOperations for directories. +// DirFileOperations implements most of fs.FileOperations for directories, +// except for Readdir which the embedding type must implement. 
+type DirFileOperations struct { + waiter.AlwaysReady + FileGenericSeek + FileNoFsync + FileNoIoctl + FileNoMMap + FileNoopFlush + FileNoopRelease +} + +// Read implements fs.FileOperations.Read. +func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements fs.FileOperations.Write. +func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} + +// StaticDirFileOperations implements fs.FileOperations for directories with +// static children. // // +stateify savable -type DirFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - NoopRelease `state:"nosave"` - GenericSeek `state:"nosave"` - NoFsync `state:"nosave"` - NoopFlush `state:"nosave"` - NoMMap `state:"nosave"` - NoIoctl `state:"nosave"` +type StaticDirFileOperations struct { + DirFileOperations // dentryMap is a SortedDentryMap used to implement Readdir. dentryMap *fs.SortedDentryMap @@ -233,37 +260,106 @@ type DirFileOperations struct { dirCursor string } -// NewDirFileOperations returns a new DirFileOperations that will iterate the -// given denty map. -func NewDirFileOperations(dentries *fs.SortedDentryMap) *DirFileOperations { - return &DirFileOperations{ +// NewStaticDirFileOperations returns a new StaticDirFileOperations that will +// iterate the given dentry map. +func NewStaticDirFileOperations(dentries *fs.SortedDentryMap) *StaticDirFileOperations { + return &StaticDirFileOperations{ dentryMap: dentries, } } // IterateDir implements DirIterator.IterateDir. 
-func (dfo *DirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - n, err := fs.GenericReaddir(dirCtx, dfo.dentryMap) +func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + n, err := fs.GenericReaddir(dirCtx, sdfo.dentryMap) return offset + n, err } -// Readdir implements FileOperations.Readdir. -func (dfo *DirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { +// Readdir implements fs.FileOperations.Readdir. +func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) defer root.DecRef() dirCtx := &fs.DirCtx{ Serializer: serializer, - DirCursor: &dfo.dirCursor, + DirCursor: &sdfo.dirCursor, } - return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset()) + return fs.DirentReaddir(ctx, file.Dirent, sdfo, root, dirCtx, file.Offset()) } -// Read implements FileOperations.Read -func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR +// NoReadWriteFile is a file that does not support reading or writing. +// +// +stateify savable +type NoReadWriteFile struct { + waiter.AlwaysReady `state:"nosave"` + FileGenericSeek `state:"nosave"` + FileNoIoctl `state:"nosave"` + FileNoMMap `state:"nosave"` + FileNoopFsync `state:"nosave"` + FileNoopFlush `state:"nosave"` + FileNoopRelease `state:"nosave"` + FileNoRead `state:"nosave"` + FileNoWrite `state:"nosave"` + FileNotDirReaddir `state:"nosave"` } -// Write implements FileOperations.Write. -func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { - return 0, syserror.EISDIR +var _ fs.FileOperations = (*NoReadWriteFile)(nil) + +// FileStaticContentReader is a helper to implement fs.FileOperations.Read with +// static content. 
+// +// +stateify savable +type FileStaticContentReader struct { + // content is immutable. + content []byte +} + +// NewFileStaticContentReader initializes a FileStaticContentReader with the +// given content. +func NewFileStaticContentReader(b []byte) FileStaticContentReader { + return FileStaticContentReader{ + content: b, + } +} + +// Read implements fs.FileOperations.Read. +func (scr *FileStaticContentReader) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if offset >= int64(len(scr.content)) { + return 0, nil + } + n, err := dst.CopyOut(ctx, scr.content[offset:]) + return int64(n), err +} + +// FileNoopWrite implements fs.FileOperations.Write as a noop. +type FileNoopWrite struct{} + +// Write implements fs.FileOperations.Write. +func (FileNoopWrite) Write(_ context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return src.NumBytes(), nil +} + +// FileNoRead implements fs.FileOperations.Read to return EINVAL. +type FileNoRead struct{} + +// Read implements fs.FileOperations.Read. +func (FileNoRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EINVAL +} + +// FileNoWrite implements fs.FileOperations.Write to return EINVAL. +type FileNoWrite struct{} + +// Write implements fs.FileOperations.Write. +func (FileNoWrite) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EINVAL +} + +// FileNoopRead implements fs.FileOperations.Read as a noop. +type FileNoopRead struct{} + +// Read implements fs.FileOperations.Read. 
+func (FileNoopRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, nil } diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go index 3d7f3732d..319c4841b 100644 --- a/pkg/sentry/fs/fsutil/fsutil.go +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -20,7 +20,5 @@ // - For fs.Inodes that require a page cache to be memory mapped, see // inode_cache.go. // -// - For fs.Files that implement fs.HandleOps, see handle.go. -// // - For anon fs.Inodes, see anon.go. package fsutil diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go deleted file mode 100644 index 8920b72ee..000000000 --- a/pkg/sentry/fs/fsutil/handle.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fsutil - -import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// Handle implements FileOperations. -// -// FIXME: Remove Handle entirely in favor of individual fs.File -// implementations using simple generic utilities. 
-// -// +stateify savable -type Handle struct { - NoopRelease `state:"nosave"` - NoIoctl `state:"nosave"` - HandleOperations fs.HandleOperations - - // dirCursor is the directory cursor. - dirCursor string -} - -// NewHandle returns a File backed by the Dirent and FileFlags. -func NewHandle(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, hops fs.HandleOperations) *fs.File { - if !fs.IsPipe(dirent.Inode.StableAttr) && !fs.IsSocket(dirent.Inode.StableAttr) { - // Allow reading/writing at an arbitrary offset for non-pipes - // and non-sockets. - flags.Pread = true - flags.Pwrite = true - } - - return fs.NewFile(ctx, dirent, flags, &Handle{HandleOperations: hops}) -} - -// Readiness implements waiter.Waitable.Readiness. -func (h *Handle) Readiness(mask waiter.EventMask) waiter.EventMask { - return h.HandleOperations.Readiness(mask) -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (h *Handle) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - h.HandleOperations.EventRegister(e, mask) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (h *Handle) EventUnregister(e *waiter.Entry) { - h.HandleOperations.EventUnregister(e) -} - -// Readdir implements FileOperations.Readdir. -func (h *Handle) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { - root := fs.RootFromContext(ctx) - defer root.DecRef() - dirCtx := &fs.DirCtx{ - Serializer: serializer, - DirCursor: &h.dirCursor, - } - n, err := fs.DirentReaddir(ctx, file.Dirent, h, root, dirCtx, file.Offset()) - return n, err -} - -// Seek implements FileOperations.Seek. -func (h *Handle) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { - return SeekWithDirCursor(ctx, file, whence, offset, &h.dirCursor) -} - -// IterateDir implements DirIterator.IterateDir. 
-func (h *Handle) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - return h.HandleOperations.DeprecatedReaddir(ctx, dirCtx, offset) -} - -// Read implements FileOperations.Read. -func (h *Handle) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - return h.HandleOperations.DeprecatedPreadv(ctx, dst, offset) -} - -// Write implements FileOperations.Write. -func (h *Handle) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - return h.HandleOperations.DeprecatedPwritev(ctx, src, offset) -} - -// Fsync implements FileOperations.Fsync. -func (h *Handle) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { - switch syncType { - case fs.SyncAll, fs.SyncData: - // Write out metadata. - if err := file.Dirent.Inode.WriteOut(ctx); err != nil { - return err - } - fallthrough - case fs.SyncBackingStorage: - // Use DeprecatedFsync to sync disks. - return h.HandleOperations.DeprecatedFsync() - } - panic("invalid sync type") -} - -// Flush implements FileOperations.Flush. -func (h *Handle) Flush(context.Context, *fs.File) error { - return h.HandleOperations.DeprecatedFlush() -} - -// ConfigureMMap implements FileOperations.ConfigureMMap. -func (h *Handle) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - mappable := file.Dirent.Inode.Mappable() - if mappable == nil { - return syserror.ENODEV - } - return GenericConfigureMMap(file, mappable, opts) -} diff --git a/pkg/sentry/fs/fsutil/handle_test.go b/pkg/sentry/fs/fsutil/handle_test.go deleted file mode 100644 index 43e1a3bdf..000000000 --- a/pkg/sentry/fs/fsutil/handle_test.go +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fsutil_test - -import ( - "io" - "syscall" - "testing" - - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -type testInodeOperations struct { - fs.InodeOperations - fs.InodeType - FileSize int64 - writes uint - reads uint -} - -func (t *testInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - return fs.UnstableAttr{Size: t.FileSize}, nil -} - -// Check implements InodeOperations.Check. -func (t *testInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -func (t *testInodeOperations) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - t.reads++ - return t.InodeOperations.DeprecatedPreadv(ctx, dst, offset) -} - -func (t *testInodeOperations) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - t.writes++ - return t.InodeOperations.DeprecatedPwritev(ctx, src, offset) -} - -// testHandle returns a handle for a test node. -// -// The size of the node is fixed at 20 bytes. 
-func testHandle(t *testing.T, flags fs.FileFlags, nt fs.InodeType) (*fs.File, *testInodeOperations) { - ctx := contexttest.Context(t) - m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) - n := &testInodeOperations{ - InodeOperations: ramfstest.NewFile(ctx, fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}), - FileSize: 20, - } - d := fs.NewDirent(fs.NewInode(n, m, fs.StableAttr{Type: nt}), "test") - return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), n -} - -func TestHandleOps(t *testing.T) { - h, n := testHandle(t, fs.FileFlags{Read: true, Write: true}, fs.RegularFile) - defer h.DecRef() - - // Make sure a write request works. - if n, err := h.Writev(contexttest.Context(t), usermem.BytesIOSequence([]byte("a"))); n != 1 || err != nil { - t.Fatalf("Writev: got (%d, %v), wanted (1, nil)", n, err) - } - if n.writes != 1 { - t.Errorf("found %d writes, expected 1", n.writes) - } - - // Make sure a read request works. - dst := make([]byte, 1) - if n, err := h.Preadv(contexttest.Context(t), usermem.BytesIOSequence(dst), 0); n != 1 || (err != nil && err != io.EOF) { - t.Errorf("Preadv: got (%d, %v), wanted (1, nil or EOF)", n, err) - } - if dst[0] != 'a' { - t.Errorf("Preadv: read %q, wanted 'a'", dst[0]) - } - if n.reads != 1 { - t.Errorf("found %d reads, expected 1", n.reads) - } -} - -type seekTest struct { - whence fs.SeekWhence - offset int64 - result int64 - err error -} - -type seekSuite struct { - nodeType fs.InodeType - cases []seekTest -} - -// FIXME: This is currently missing fs.SeekEnd tests due to the -// fact that NullInodeOperations returns an error on stat. 
-func TestHandleSeek(t *testing.T) { - ts := []seekSuite{ - { - nodeType: fs.RegularFile, - cases: []seekTest{ - {fs.SeekSet, 0, 0, nil}, - {fs.SeekSet, 10, 10, nil}, - {fs.SeekSet, -5, 10, syscall.EINVAL}, - {fs.SeekCurrent, -1, 9, nil}, - {fs.SeekCurrent, 2, 11, nil}, - {fs.SeekCurrent, -12, 11, syscall.EINVAL}, - {fs.SeekEnd, -1, 19, nil}, - {fs.SeekEnd, 0, 20, nil}, - {fs.SeekEnd, 2, 22, nil}, - }, - }, - { - nodeType: fs.Directory, - cases: []seekTest{ - {fs.SeekSet, 0, 0, nil}, - {fs.SeekSet, 10, 0, syscall.EINVAL}, - {fs.SeekSet, -5, 0, syscall.EINVAL}, - {fs.SeekCurrent, 0, 0, nil}, - {fs.SeekCurrent, 11, 0, syscall.EINVAL}, - {fs.SeekCurrent, -6, 0, syscall.EINVAL}, - {fs.SeekEnd, 0, 0, syscall.EINVAL}, - {fs.SeekEnd, -1, 0, syscall.EINVAL}, - {fs.SeekEnd, 2, 0, syscall.EINVAL}, - }, - }, - { - nodeType: fs.Symlink, - cases: []seekTest{ - {fs.SeekSet, 5, 0, syscall.EINVAL}, - {fs.SeekSet, -5, 0, syscall.EINVAL}, - {fs.SeekSet, 0, 0, syscall.EINVAL}, - {fs.SeekCurrent, 5, 0, syscall.EINVAL}, - {fs.SeekCurrent, -5, 0, syscall.EINVAL}, - {fs.SeekCurrent, 0, 0, syscall.EINVAL}, - {fs.SeekEnd, 5, 0, syscall.EINVAL}, - {fs.SeekEnd, -5, 0, syscall.EINVAL}, - {fs.SeekEnd, 0, 0, syscall.EINVAL}, - }, - }, - { - nodeType: fs.Pipe, - cases: []seekTest{ - {fs.SeekSet, 5, 0, syscall.ESPIPE}, - {fs.SeekSet, -5, 0, syscall.ESPIPE}, - {fs.SeekSet, 0, 0, syscall.ESPIPE}, - {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, - {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, - {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, - {fs.SeekEnd, 5, 0, syscall.ESPIPE}, - {fs.SeekEnd, -5, 0, syscall.ESPIPE}, - {fs.SeekEnd, 0, 0, syscall.ESPIPE}, - }, - }, - { - nodeType: fs.Socket, - cases: []seekTest{ - {fs.SeekSet, 5, 0, syscall.ESPIPE}, - {fs.SeekSet, -5, 0, syscall.ESPIPE}, - {fs.SeekSet, 0, 0, syscall.ESPIPE}, - {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, - {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, - {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, - {fs.SeekEnd, 5, 0, syscall.ESPIPE}, - {fs.SeekEnd, -5, 0, 
syscall.ESPIPE}, - {fs.SeekEnd, 0, 0, syscall.ESPIPE}, - }, - }, - { - nodeType: fs.CharacterDevice, - cases: []seekTest{ - {fs.SeekSet, 5, 0, nil}, - {fs.SeekSet, -5, 0, nil}, - {fs.SeekSet, 0, 0, nil}, - {fs.SeekCurrent, 5, 0, nil}, - {fs.SeekCurrent, -5, 0, nil}, - {fs.SeekCurrent, 0, 0, nil}, - {fs.SeekEnd, 5, 0, nil}, - {fs.SeekEnd, -5, 0, nil}, - {fs.SeekEnd, 0, 0, nil}, - }, - }, - { - nodeType: fs.BlockDevice, - cases: []seekTest{ - {fs.SeekSet, 0, 0, nil}, - {fs.SeekSet, 10, 10, nil}, - {fs.SeekSet, -5, 10, syscall.EINVAL}, - {fs.SeekCurrent, -1, 9, nil}, - {fs.SeekCurrent, 2, 11, nil}, - {fs.SeekCurrent, -12, 11, syscall.EINVAL}, - {fs.SeekEnd, -1, 19, nil}, - {fs.SeekEnd, 0, 20, nil}, - {fs.SeekEnd, 2, 22, nil}, - }, - }, - } - - for _, s := range ts { - h, _ := testHandle(t, fs.FileFlags{Read: true, Write: true}, s.nodeType) - defer h.DecRef() - - for _, c := range s.cases { - // Try the given seek. - offset, err := h.Seek(contexttest.Context(t), c.whence, c.offset) - if err != c.err { - t.Errorf("seek(%s, %d) on %s had unexpected error: expected %v, got %v", c.whence, c.offset, s.nodeType, c.err, err) - } - if err == nil && offset != c.result { - t.Errorf("seek(%s, %d) on %s had bad result: expected %v, got %v", c.whence, c.offset, s.nodeType, c.result, offset) - } - } - } -} diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index d4db1c2de..f1f5ec1de 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -15,213 +15,270 @@ package fsutil import ( + "sync" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// 
NewSimpleInodeOperations constructs fs.InodeOperations from InodeSimpleAttributes. -func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations { - return &simpleInodeOperations{InodeSimpleAttributes: i} +// SimpleFileInode is a simple implementation of InodeOperations. +// +// +stateify savable +type SimpleFileInode struct { + InodeGenericChecker `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + InodeNoopRelease `state:"nosave"` + InodeNoopWriteOut `state:"nosave"` + InodeNotDirectory `state:"nosave"` + InodeNotMappable `state:"nosave"` + InodeNotOpenable `state:"nosave"` + InodeNotSocket `state:"nosave"` + InodeNotSymlink `state:"nosave"` + InodeNotTruncatable `state:"nosave"` + InodeNotVirtual `state:"nosave"` + + InodeSimpleAttributes +} + +// NewSimpleFileInode returns a new SimpleFileInode. +func NewSimpleFileInode(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) *SimpleFileInode { + return &SimpleFileInode{ + InodeSimpleAttributes: NewInodeSimpleAttributes(ctx, owner, perms, typ), + } } -// simpleInodeOperations is a simple implementation of Inode. +// NoReadWriteFileInode is an implementation of InodeOperations that supports +// opening files that are not readable or writeable. 
// // +stateify savable -type simpleInodeOperations struct { - DeprecatedFileOperations `state:"nosave"` +type NoReadWriteFileInode struct { + InodeGenericChecker `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + InodeNoopRelease `state:"nosave"` + InodeNoopWriteOut `state:"nosave"` InodeNotDirectory `state:"nosave"` + InodeNotMappable `state:"nosave"` InodeNotSocket `state:"nosave"` - InodeNotRenameable `state:"nosave"` - InodeNotOpenable `state:"nosave"` - InodeNotVirtual `state:"nosave"` InodeNotSymlink `state:"nosave"` - InodeNoExtendedAttributes `state:"nosave"` - NoMappable `state:"nosave"` - NoopWriteOut `state:"nosave"` + InodeNotTruncatable `state:"nosave"` + InodeNotVirtual `state:"nosave"` InodeSimpleAttributes } -// InodeSimpleAttributes implements a subset of the Inode interface. It provides -// read-only access to attributes. +// NewNoReadWriteFileInode returns a new NoReadWriteFileInode. +func NewNoReadWriteFileInode(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) *NoReadWriteFileInode { + return &NoReadWriteFileInode{ + InodeSimpleAttributes: NewInodeSimpleAttributes(ctx, owner, perms, typ), + } +} + +// GetFile implements fs.InodeOperations.GetFile. +func (*NoReadWriteFileInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &NoReadWriteFile{}), nil +} + +// InodeSimpleAttributes implements methods for updating in-memory unstable +// attributes. // // +stateify savable type InodeSimpleAttributes struct { - // FSType is the filesystem type reported by StatFS. + // FSType is the immutable filesystem type that will be returned by + // StatFS. FSType uint64 - // UAttr are the unstable attributes of the Inode. - UAttr fs.UnstableAttr + // mu protects unstable. + mu sync.RWMutex `state:"nosave"` + Unstable fs.UnstableAttr } -// Release implements fs.InodeOperations.Release. 
-func (i *InodeSimpleAttributes) Release(context.Context) {} - -// StatFS implements fs.InodeOperations.StatFS. -func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { - return fs.Info{Type: i.FSType}, nil +// NewInodeSimpleAttributes returns a new InodeSimpleAttributes. +func NewInodeSimpleAttributes(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) InodeSimpleAttributes { + return InodeSimpleAttributes{ + FSType: typ, + Unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: perms, + }), + } } // UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (i *InodeSimpleAttributes) UnstableAttr(context.Context, *fs.Inode) (fs.UnstableAttr, error) { - return i.UAttr, nil +func (i *InodeSimpleAttributes) UnstableAttr(ctx context.Context, _ *fs.Inode) (fs.UnstableAttr, error) { + i.mu.RLock() + u := i.Unstable + i.mu.RUnlock() + return u, nil } -// Check implements fs.InodeOperations.Check. -func (i *InodeSimpleAttributes) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *InodeSimpleAttributes) SetPermissions(ctx context.Context, _ *fs.Inode, p fs.FilePermissions) bool { + i.mu.Lock() + i.Unstable.SetPermissions(ctx, p) + i.mu.Unlock() + return true +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *InodeSimpleAttributes) SetOwner(ctx context.Context, _ *fs.Inode, owner fs.FileOwner) error { + i.mu.Lock() + i.Unstable.SetOwner(ctx, owner) + i.mu.Unlock() + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *InodeSimpleAttributes) SetTimestamps(ctx context.Context, _ *fs.Inode, ts fs.TimeSpec) error { + i.mu.Lock() + i.Unstable.SetTimestamps(ctx, ts) + i.mu.Unlock() + return nil } // AddLink implements fs.InodeOperations.AddLink. 
-func (*InodeSimpleAttributes) AddLink() {} +func (i *InodeSimpleAttributes) AddLink() { + i.mu.Lock() + i.Unstable.Links++ + i.mu.Unlock() +} // DropLink implements fs.InodeOperations.DropLink. -func (*InodeSimpleAttributes) DropLink() {} - -// NotifyStatusChange implements fs.fs.InodeOperations. -func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { - i.UAttr.StatusChangeTime = ktime.NowFromContext(ctx) +func (i *InodeSimpleAttributes) DropLink() { + i.mu.Lock() + i.Unstable.Links-- + i.mu.Unlock() } -// SetPermissions implements fs.InodeOperations.SetPermissions. -func (*InodeSimpleAttributes) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool { - return false +// StatFS implements fs.InodeOperations.StatFS. +func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { + if i.FSType == 0 { + return fs.Info{}, syserror.ENOSYS + } + return fs.Info{Type: i.FSType}, nil } -// SetOwner implements fs.InodeOperations.SetOwner. -func (*InodeSimpleAttributes) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { - return syserror.EINVAL +// NotifyAccess updates the access time. +func (i *InodeSimpleAttributes) NotifyAccess(ctx context.Context) { + i.mu.Lock() + i.Unstable.AccessTime = ktime.NowFromContext(ctx) + i.mu.Unlock() } -// SetTimestamps implements fs.InodeOperations.SetTimestamps. -func (*InodeSimpleAttributes) SetTimestamps(context.Context, *fs.Inode, fs.TimeSpec) error { - return syserror.EINVAL +// NotifyModification updates the modification time. +func (i *InodeSimpleAttributes) NotifyModification(ctx context.Context) { + i.mu.Lock() + i.Unstable.ModificationTime = ktime.NowFromContext(ctx) + i.mu.Unlock() } -// Truncate implements fs.InodeOperations.Truncate. -func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error { - return syserror.EINVAL +// NotifyStatusChange updates the status change time. 
+func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { + i.mu.Lock() + i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) + i.mu.Unlock() } -// InMemoryAttributes implements utilities for updating in-memory unstable -// attributes and extended attributes. It is not thread-safe. -// -// Users need not initialize Xattrs to non-nil (it will be initialized -// when the first extended attribute is set. +// InodeSimpleExtendedAttributes implements +// fs.InodeOperations.{Get,Set,List}xattr. // // +stateify savable -type InMemoryAttributes struct { - Unstable fs.UnstableAttr - Xattrs map[string][]byte +type InodeSimpleExtendedAttributes struct { + // mu protects xattrs. + mu sync.RWMutex `state:"nosave"` + xattrs map[string][]byte } -// SetPermissions updates the permissions to p. -func (i *InMemoryAttributes) SetPermissions(ctx context.Context, p fs.FilePermissions) bool { - i.Unstable.Perms = p - i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) - return true +// Getxattr implements fs.InodeOperations.Getxattr. +func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) ([]byte, error) { + i.mu.RLock() + value, ok := i.xattrs[name] + i.mu.RUnlock() + if !ok { + return nil, syserror.ENOATTR + } + return value, nil } -// SetOwner updates the file owner to owner. -func (i *InMemoryAttributes) SetOwner(ctx context.Context, owner fs.FileOwner) error { - if owner.UID.Ok() { - i.Unstable.Owner.UID = owner.UID - } - if owner.GID.Ok() { - i.Unstable.Owner.GID = owner.GID +// Setxattr implements fs.InodeOperations.Setxattr. +func (i *InodeSimpleExtendedAttributes) Setxattr(_ *fs.Inode, name string, value []byte) error { + i.mu.Lock() + if i.xattrs == nil { + i.xattrs = make(map[string][]byte) } + i.xattrs[name] = value + i.mu.Unlock() return nil } -// SetTimestamps sets the timestamps to ts. 
-func (i *InMemoryAttributes) SetTimestamps(ctx context.Context, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - now := ktime.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATimeSetSystemTime { - i.Unstable.AccessTime = now - } else { - i.Unstable.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTimeSetSystemTime { - i.Unstable.ModificationTime = now - } else { - i.Unstable.ModificationTime = ts.MTime - } +// Listxattr implements fs.InodeOperations.Listxattr. +func (i *InodeSimpleExtendedAttributes) Listxattr(_ *fs.Inode) (map[string]struct{}, error) { + i.mu.RLock() + names := make(map[string]struct{}, len(i.xattrs)) + for name := range i.xattrs { + names[name] = struct{}{} } - i.Unstable.StatusChangeTime = now - return nil + i.mu.RUnlock() + return names, nil } -// TouchAccessTime updates access time to the current time. -func (i *InMemoryAttributes) TouchAccessTime(ctx context.Context) { - i.Unstable.AccessTime = ktime.NowFromContext(ctx) -} +// staticFile is a file with static contents. It is returned by +// InodeStaticFileGetter.GetFile. +// +// +stateify savable +type staticFile struct { + waiter.AlwaysReady `state:"nosave"` + FileGenericSeek `state:"nosave"` + FileNoIoctl `state:"nosave"` + FileNoMMap `state:"nosave"` + FileNoopFsync `state:"nosave"` + FileNoopFlush `state:"nosave"` + FileNoopRelease `state:"nosave"` + FileNoopWrite `state:"nosave"` + FileNotDirReaddir `state:"nosave"` -// TouchModificationTime updates modification and status change -// time to the current time. -func (i *InMemoryAttributes) TouchModificationTime(ctx context.Context) { - now := ktime.NowFromContext(ctx) - i.Unstable.ModificationTime = now - i.Unstable.StatusChangeTime = now + FileStaticContentReader } -// TouchStatusChangeTime updates status change time to the current time. 
-func (i *InMemoryAttributes) TouchStatusChangeTime(ctx context.Context) { - i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) -} +// InodeNoStatFS implements StatFS by returning ENOSYS. +type InodeNoStatFS struct{} -// Getxattr returns the extended attribute at name or ENOATTR if -// it isn't set. -func (i *InMemoryAttributes) Getxattr(name string) ([]byte, error) { - if value, ok := i.Xattrs[name]; ok { - return value, nil - } - return nil, syserror.ENOATTR +// StatFS implements fs.InodeOperations.StatFS. +func (InodeNoStatFS) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syserror.ENOSYS } -// Setxattr sets the extended attribute at name to value. -func (i *InMemoryAttributes) Setxattr(name string, value []byte) error { - if i.Xattrs == nil { - i.Xattrs = make(map[string][]byte) - } - i.Xattrs[name] = value - return nil +// InodeStaticFileGetter implements GetFile for a file with static contents. +// +// +stateify savable +type InodeStaticFileGetter struct { + Contents []byte } -// Listxattr returns the set of all currently set extended attributes. -func (i *InMemoryAttributes) Listxattr() (map[string]struct{}, error) { - names := make(map[string]struct{}, len(i.Xattrs)) - for name := range i.Xattrs { - names[name] = struct{}{} - } - return names, nil +// GetFile implements fs.InodeOperations.GetFile. +func (i *InodeStaticFileGetter) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &staticFile{ + FileStaticContentReader: NewFileStaticContentReader(i.Contents), + }), nil } -// NoMappable returns a nil memmap.Mappable. -type NoMappable struct{} +// InodeNotMappable returns a nil memmap.Mappable. +type InodeNotMappable struct{} // Mappable implements fs.InodeOperations.Mappable. 
-func (NoMappable) Mappable(*fs.Inode) memmap.Mappable { +func (InodeNotMappable) Mappable(*fs.Inode) memmap.Mappable { return nil } -// NoopWriteOut is a no-op implementation of Inode.WriteOut. -type NoopWriteOut struct{} +// InodeNoopWriteOut is a no-op implementation of fs.InodeOperations.WriteOut. +type InodeNoopWriteOut struct{} // WriteOut is a no-op. -func (NoopWriteOut) WriteOut(context.Context, *fs.Inode) error { +func (InodeNoopWriteOut) WriteOut(context.Context, *fs.Inode) error { return nil } @@ -273,6 +330,11 @@ func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) err return syserror.ENOTDIR } +// Rename implements fs.InodeOperations.Rename. +func (InodeNotDirectory) Rename(context.Context, *fs.Inode, string, *fs.Inode, string) error { + return syserror.EINVAL +} + // InodeNotSocket can be used by Inodes that are not sockets. type InodeNotSocket struct{} @@ -281,7 +343,31 @@ func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint { return nil } -// InodeNotRenameable can be used by Inodes that cannot be renamed. +// InodeNotTruncatable can be used by Inodes that cannot be truncated. +type InodeNotTruncatable struct{} + +// Truncate implements fs.InodeOperations.Truncate. +func (InodeNotTruncatable) Truncate(context.Context, *fs.Inode, int64) error { + return syserror.EINVAL +} + +// InodeIsDirTruncate implements fs.InodeOperations.Truncate for directories. +type InodeIsDirTruncate struct{} + +// Truncate implements fs.InodeOperations.Truncate. +func (InodeIsDirTruncate) Truncate(context.Context, *fs.Inode, int64) error { + return syserror.EISDIR +} + +// InodeNoopTruncate implements fs.InodeOperations.Truncate as a noop. +type InodeNoopTruncate struct{} + +// Truncate implements fs.InodeOperations.Truncate. +func (InodeNoopTruncate) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// InodeNotRenameable can be used by Inodes that cannot be renamed. 
type InodeNotRenameable struct{} // Rename implements fs.InodeOperations.Rename. @@ -305,6 +391,14 @@ func (InodeNotVirtual) IsVirtual() bool { return false } +// InodeVirtual can be used by Inodes that are virtual. +type InodeVirtual struct{} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (InodeVirtual) IsVirtual() bool { + return true +} + // InodeNotSymlink can be used by Inodes that are not symlinks. type InodeNotSymlink struct{} @@ -337,50 +431,17 @@ func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, erro return nil, syserror.EOPNOTSUPP } -// DeprecatedFileOperations panics if any deprecated Inode method is called. -type DeprecatedFileOperations struct{} +// InodeNoopRelease implements fs.InodeOperations.Release as a noop. +type InodeNoopRelease struct{} -// Readiness implements fs.InodeOperations.Waitable.Readiness. -func (DeprecatedFileOperations) Readiness(waiter.EventMask) waiter.EventMask { - panic("not implemented") -} - -// EventRegister implements fs.InodeOperations.Waitable.EventRegister. -func (DeprecatedFileOperations) EventRegister(*waiter.Entry, waiter.EventMask) { - panic("not implemented") -} - -// EventUnregister implements fs.InodeOperations.Waitable.EventUnregister. -func (DeprecatedFileOperations) EventUnregister(*waiter.Entry) { - panic("not implemented") -} - -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (DeprecatedFileOperations) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { - panic("not implemented") -} - -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (DeprecatedFileOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - panic("not implemented") -} - -// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. 
-func (DeprecatedFileOperations) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) { - panic("not implemented") -} - -// DeprecatedFsync implements fs.InodeOperations.DeprecatedFsync. -func (DeprecatedFileOperations) DeprecatedFsync() error { - panic("not implemented") -} +// Release implements fs.InodeOperations.Release. +func (InodeNoopRelease) Release(context.Context) {} -// DeprecatedFlush implements fs.InodeOperations.DeprecatedFlush. -func (DeprecatedFileOperations) DeprecatedFlush() error { - panic("not implemented") -} +// InodeGenericChecker implements fs.InodeOperations.Check with a generic +// implementation. +type InodeGenericChecker struct{} -// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. -func (DeprecatedFileOperations) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { - panic("not implemented") +// Check implements fs.InodeOperations.Check. +func (InodeGenericChecker) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) } diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index ce5201a40..9c9391511 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -261,15 +261,11 @@ func (noopMappingSpace) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateO } func anonInode(ctx context.Context) *fs.Inode { - return fs.NewInode(NewSimpleInodeOperations(InodeSimpleAttributes{ - UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), - Perms: fs.FilePermissions{ - User: fs.PermMask{Read: true, Write: true}, - }, - Links: 1, - }), - }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + return fs.NewInode(&SimpleFileInode{ + InodeSimpleAttributes: NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, 0), + }, 
fs.NewPseudoMountSource(), fs.StableAttr{ Type: fs.Anonymous, BlockSize: usermem.PageSize, }) diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 6d961813d..3578b07a0 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -37,7 +37,7 @@ var openedWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_fil // // +stateify savable type fileOperations struct { - fsutil.NoIoctl `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` waiter.AlwaysReady `state:"nosave"` // inodeOperations is the inodeOperations backing the file. It is protected diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index ed30cb1f1..2dc000c6f 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -75,11 +75,11 @@ var ( // ErrNoTransport is returned when there is no 'trans' option. ErrNoTransport = errors.New("missing required option: 'trans='") - // ErrNoReadFD is returned when there is no 'rfdno' option. - ErrNoReadFD = errors.New("missing required option: 'rfdno='") + // ErrFileNoReadFD is returned when there is no 'rfdno' option. + ErrFileNoReadFD = errors.New("missing required option: 'rfdno='") - // ErrNoWriteFD is returned when there is no 'wfdno' option. - ErrNoWriteFD = errors.New("missing required option: 'wfdno='") + // ErrFileNoWriteFD is returned when there is no 'wfdno' option. + ErrFileNoWriteFD = errors.New("missing required option: 'wfdno='") ) // filesystem is a 9p client. @@ -87,6 +87,8 @@ var ( // +stateify savable type filesystem struct{} +var _ fs.Filesystem = (*filesystem)(nil) + func init() { fs.RegisterFilesystem(&filesystem{}) } @@ -160,14 +162,14 @@ func options(data string) (opts, error) { // Check for the required 'rfdno=' option. srfd, ok := options[readFDKey] if !ok { - return o, ErrNoReadFD + return o, ErrFileNoReadFD } delete(options, readFDKey) // Check for the required 'wfdno=' option. 
swfd, ok := options[writeFDKey] if !ok { - return o, ErrNoWriteFD + return o, ErrFileNoWriteFD } delete(options, writeFDKey) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 7c6e5b025..f0dc99fd0 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -40,7 +40,6 @@ import ( type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` // fileState implements fs.CachedFileObject. It exists // to break a circular load dependency between inodeOperations diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index bc6ee7aa4..4e84d1d6c 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -36,8 +36,8 @@ import ( // // +stateify savable type fileOperations struct { - fsutil.NoIoctl `state:"nosave"` - fsutil.NoopRelease `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` // iops are the Inode operations for this file. iops *inodeOperations `state:"wait"` diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index 54cbb94f9..d2ba38449 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -58,6 +58,8 @@ type Filesystem struct { paths []string } +var _ fs.Filesystem = (*Filesystem)(nil) + // Name is the identifier of this file system. func (*Filesystem) Name() string { return FilesystemName diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 08754bd6b..6ff6c3254 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -39,7 +39,6 @@ import ( type inodeOperations struct { fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.DeprecatedFileOperations `state:"nosave"` // fileState implements fs.CachedFileObject. 
It exists // to break a circular load dependency between inodeOperations diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index fa3beb111..d32f52d55 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -356,11 +356,10 @@ func (i *Inode) AddLink() { if i.overlay != nil { // FIXME: Remove this from InodeOperations altogether. // - // This interface (including DropLink and NotifyStatusChange) - // is only used by ramfs to update metadata of children. These - // filesystems should _never_ have overlay Inodes cached as - // children. So explicitly disallow this scenario and avoid plumbing - // Dirents through to do copy up. + // This interface is only used by ramfs to update metadata of + // children. These filesystems should _never_ have overlay + // Inodes cached as children. So explicitly disallow this + // scenario and avoid plumbing Dirents through to do copy up. panic("overlay Inodes cached in ramfs directories are not supported") } i.InodeOperations.AddLink() @@ -375,15 +374,6 @@ func (i *Inode) DropLink() { i.InodeOperations.DropLink() } -// NotifyStatusChange calls i.InodeOperations.NotifyStatusChange. -func (i *Inode) NotifyStatusChange(ctx context.Context) { - if i.overlay != nil { - // Same as AddLink. - panic("overlay Inodes cached in ramfs directories are not supported") - } - i.InodeOperations.NotifyStatusChange(ctx) -} - // IsVirtual calls i.InodeOperations.IsVirtual. func (i *Inode) IsVirtual() bool { if i.overlay != nil { @@ -401,17 +391,6 @@ func (i *Inode) StatFS(ctx context.Context) (Info, error) { return i.InodeOperations.StatFS(ctx) } -// HandleOps extracts HandleOperations from i. -func (i *Inode) HandleOps() HandleOperations { - if i.overlay != nil { - return overlayHandleOps(i.overlay) - } - if h, ok := i.InodeOperations.(HandleOperations); ok { - return h - } - return nil -} - // CheckOwnership checks whether `ctx` owns this Inode or may act as its owner. // Compare Linux's fs/inode.c:inode_owner_or_capable(). 
func (i *Inode) CheckOwnership(ctx context.Context) bool { diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 77973ce79..db40b5256 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -21,8 +21,6 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/waiter" ) var ( @@ -303,83 +301,5 @@ type InodeOperations interface { // StatFS returns a filesystem Info implementation or an error. If // the filesystem does not support this operation (maybe in the future // it will), then ENOSYS should be returned. - // - // Move to MountSourceOperations. StatFS(context.Context) (Info, error) - - HandleOperations -} - -// HandleOperations are extended InodeOperations that are only implemented -// for file systems that use fs/handle.go:Handle to generate open Files. -// -// Handle is deprecated; these methods are deprecated as well. -// -// Filesystems are encouraged to implement the File interface directly -// instead of using Handle. To indicate that the below methods should never -// be called, embed DeprecatedFileOperations to satisfy this interface. -type HandleOperations interface { - waiter.Waitable - - // DeprecatedPreadv is deprecated in favor of filesystems - // implementing File.Preadv directly. - // - // DeprecatedPreadv reads up to dst.NumBytes() bytes into dst, starting at - // the given offset, and returns the number of bytes read. - // - // Preadv may return a partial read result before EOF is reached. - // - // If a symlink, Preadv reads the target value of the symlink. - // - // Preadv should not check for readable permissions. 
- DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) - - // DeprecatedPwritev is deprecated in favor of filesystems - // implementing File.Pwritev directly. - // - // DeprecatedPwritev writes up to src.NumBytes() bytes from src to the - // Inode, starting at the given offset and returns the number of bytes - // written. - // - // Pwritev should not check that the Inode has writable permissions. - DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) - - // DeprecatedReaddir is deprecated in favor of filesystems - // implementing File.Readdir directly. - // - // DeprecatedReaddir emits directory entries by calling dirCtx.EmitDir, - // beginning with the entry at offset. - // - // Entries for "." and ".." must *not* be included. - // - // If the offset returned is the same as the argument offset, then - // nothing has been serialized. This is equivalent to reaching EOF. - // In this case serializer.Written() should return 0. - // - // The order of entries to emit must be consistent between Readdir - // calls, and must start with the given offset. - // - // The caller must ensure that this operation is permitted. - DeprecatedReaddir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) - - // DeprecatedFsync is deprecated in favor of filesystems implementing - // File.Fsync directly. - // - // DeprecatedFsync syncs a file. - DeprecatedFsync() error - - // DeprecatedMappable is deprecated in favor of filesystems implementing - // File.Mappable directly. - // - // DeprecatedMappable returns a Mappable if the Inode can be mapped. - DeprecatedMappable(ctx context.Context, inode *Inode) (memmap.Mappable, bool) - - // DeprecatedFlush is deprecated in favor of filesystems implementing - // File.Flush directly. - // - // DeprecatedFlush flushes a file. - // - // Implementations may choose to free up memory or complete pending I/O - // but also may implement Flush as a no-op. 
- DeprecatedFlush() error } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 512a0da28..77a2623ef 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -596,19 +596,6 @@ func overlayStatFS(ctx context.Context, o *overlayEntry) (Info, error) { return i, nil } -func overlayHandleOps(o *overlayEntry) HandleOperations { - // Hot path. Avoid defers. - var hops HandleOperations - o.copyMu.RLock() - if o.upper != nil { - hops = o.upper.HandleOps() - } else { - hops = o.lower.HandleOps() - } - o.copyMu.RUnlock() - return hops -} - // NewTestOverlayDir returns an overlay Inode for tests. // // If `revalidate` is true, then the upper filesystem will require diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 9e922d008..bc91be226 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -19,7 +19,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -376,7 +377,8 @@ type dir struct { // List of negative child names. negative []string - // Whether DeprecatedReaddir has been called on this dir. + // ReaddirCalled records whether Readdir was called on a file + // corresponding to this inode. ReaddirCalled bool } @@ -390,10 +392,19 @@ func (d *dir) Getxattr(inode *fs.Inode, name string) ([]byte, error) { return nil, syserror.ENOATTR } -// DeprecatedReaddir implements InodeOperations.DeprecatedReaddir. 
-func (d *dir) DeprecatedReaddir(ctx context.Context, dirctx *fs.DirCtx, offset int) (int, error) { - d.ReaddirCalled = true - return d.InodeOperations.DeprecatedReaddir(ctx, dirctx, offset) +// GetFile implements InodeOperations.GetFile. +func (d *dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + file, err := d.InodeOperations.GetFile(ctx, dirent, flags) + if err != nil { + return nil, err + } + defer file.DecRef() + // Wrap the file's FileOperations in a dirFile. + fops := &dirFile{ + FileOperations: file.FileOperations, + inode: d, + } + return fs.NewFile(ctx, dirent, flags, fops), nil } type dirContent struct { @@ -401,12 +412,45 @@ type dirContent struct { dir bool } +type dirFile struct { + fs.FileOperations + inode *dir +} + +type inode struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeStaticFileGetter +} + +// Readdir implements fs.FileOperations.Readdir. It sets the ReaddirCalled +// field on the inode. 
+func (f *dirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) { + f.inode.ReaddirCalled = true + return f.FileOperations.Readdir(ctx, file, ser) +} + func newTestRamfsInode(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - return fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), msrc, fs.StableAttr{Type: fs.RegularFile}) + inode := fs.NewInode(&inode{ + InodeStaticFileGetter: fsutil.InodeStaticFileGetter{ + Contents: []byte("foobar"), + }, + }, msrc, fs.StableAttr{Type: fs.RegularFile}) + return inode } func newTestRamfsDir(ctx context.Context, contains []dirContent, negative []string) *fs.Inode { - msrc := fs.NewCachingMountSource(nil, fs.MountSourceFlags{}) + msrc := fs.NewPseudoMountSource() contents := make(map[string]*fs.Inode) for _, c := range contains { if c.dir { @@ -415,7 +459,7 @@ func newTestRamfsDir(ctx context.Context, contains []dirContent, negative []stri contents[c.name] = newTestRamfsInode(ctx, msrc) } } - dops := ramfstest.NewDir(ctx, contents, fs.FilePermissions{ + dops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermissions{ User: fs.PermMask{Read: true, Execute: true}, }) return fs.NewInode(&dir{ diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 6bfcda6bb..abfdc6a25 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -16,7 +16,6 @@ package fs import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -151,16 +150,6 @@ func (n *MockInodeOperations) Truncate(ctx context.Context, inode *Inode, size i return nil } -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (n *MockInodeOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, nil -} - -// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. 
-func (n *MockInodeOperations) DeprecatedReaddir(context.Context, *DirCtx, int) (int, error) { - return 0, nil -} - // Remove implements fs.InodeOperations.Remove. func (n *MockInodeOperations) Remove(context.Context, *Inode, string) error { return nil diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 24e28ddb2..dd6e64b4c 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -43,8 +43,6 @@ type DirentOperations interface { // MountSourceOperations contains filesystem specific operations. type MountSourceOperations interface { // TODO: Add: - // - // StatFS() (Info, error) // BlockSize() int64 // FS() Filesystem @@ -249,7 +247,7 @@ func (msrc *MountSource) FlushDirentRefs() { } // NewCachingMountSource returns a generic mount that will cache dirents -// aggressively. Filesystem may be nil if there is no backing filesystem. +// aggressively. func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(&SimpleMountSourceOperations{ keep: true, @@ -258,7 +256,6 @@ func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *Mount } // NewNonCachingMountSource returns a generic mount that will never cache dirents. -// Filesystem may be nil if there is no backing filesystem. func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { return NewMountSource(&SimpleMountSourceOperations{ keep: false, @@ -275,6 +272,15 @@ func NewRevalidatingMountSource(filesystem Filesystem, flags MountSourceFlags) * }, filesystem, flags) } +// NewPseudoMountSource returns a "pseudo" mount source that is not backed by +// an actual filesystem. It is always non-caching. +func NewPseudoMountSource() *MountSource { + return NewMountSource(&SimpleMountSourceOperations{ + keep: false, + revalidate: false, + }, nil, MountSourceFlags{}) +} + // SimpleMountSourceOperations implements MountSourceOperations. 
// // +stateify savable diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index 7d682d99b..54000614f 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -19,7 +19,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" ) @@ -29,15 +30,15 @@ import ( // |-bar (file) func createMountNamespace(ctx context.Context) (*fs.MountNamespace, error) { perms := fs.FilePermsFromMode(0777) - m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + m := fs.NewPseudoMountSource() - barFile := ramfstest.NewFile(ctx, perms) - fooDir := ramfstest.NewDir(ctx, map[string]*fs.Inode{ + barFile := fsutil.NewSimpleFileInode(ctx, fs.RootOwner, perms, 0) + fooDir := ramfs.NewDir(ctx, map[string]*fs.Inode{ "bar": fs.NewInode(barFile, m, fs.StableAttr{Type: fs.RegularFile}), - }, perms) - rootDir := ramfstest.NewDir(ctx, map[string]*fs.Inode{ + }, fs.RootOwner, perms) + rootDir := ramfs.NewDir(ctx, map[string]*fs.Inode{ "foo": fs.NewInode(fooDir, m, fs.StableAttr{Type: fs.Directory}), - }, perms) + }, fs.RootOwner, perms) return fs.NewMountNamespace(ctx, fs.NewInode(rootDir, m, fs.StableAttr{Type: fs.Directory})) } diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index aff3c3c01..74954f213 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -8,9 +8,9 @@ go_library( "cpuinfo.go", "exec_args.go", "fds.go", - "file.go", "filesystems.go", "fs.go", + "inode.go", "loadavg.go", "meminfo.go", "mounts.go", @@ -32,6 +32,7 @@ go_library( "//pkg/abi/linux", "//pkg/sentry/context", "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", "//pkg/sentry/fs/proc/seqfile", "//pkg/sentry/fs/ramfs", @@ 
-45,6 +46,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index f8be06dc3..f756c45bf 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -15,52 +15,21 @@ package proc import ( - "io" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// cpuinfo is a file describing the CPU capabilities. -// -// Presently cpuinfo never changes, so it doesn't need to be a SeqFile. -// -// +stateify savable -type cpuinfo struct { - ramfs.Entry - - // k is the system kernel. - k *kernel.Kernel -} - -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (c *cpuinfo) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - features := c.k.FeatureSet() +func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + k := kernel.KernelFromContext(ctx) + features := k.FeatureSet() if features == nil { // Kernel is always initialized with a FeatureSet. panic("cpuinfo read with nil FeatureSet") } - contents := make([]byte, 0, 1024) - for i, max := uint(0), c.k.ApplicationCores(); i < max; i++ { + for i, max := uint(0), k.ApplicationCores(); i < max; i++ { contents = append(contents, []byte(features.CPUInfo(i))...) 
} - if offset >= int64(len(contents)) { - return 0, io.EOF - } - - n, err := dst.CopyOut(ctx, contents[offset:]) - return int64(n), err -} - -func (p *proc) newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - f := &cpuinfo{ - k: p.k, - } - f.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - - return newFile(f, msrc, fs.SpecialFile, nil) + return newStaticProcInode(ctx, msrc, contents) } diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index b4896053f..ddda67f54 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -18,12 +18,14 @@ import ( "fmt" "io" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // execArgType enumerates the types of exec arguments that are exposed through @@ -35,12 +37,12 @@ const ( environExecArg ) -// execArgFile is a file containing the exec args (either cmdline or environ) +// execArgInode is a inode containing the exec args (either cmdline or environ) // for a given task. // // +stateify savable -type execArgFile struct { - ramfs.Entry +type execArgInode struct { + fsutil.SimpleFileInode // arg is the type of exec argument this file contains. arg execArgType @@ -49,21 +51,52 @@ type execArgFile struct { t *kernel.Task } +var _ fs.InodeOperations = (*execArgInode)(nil) + // newExecArgFile creates a file containing the exec args of the given type. 
-func newExecArgFile(t *kernel.Task, msrc *fs.MountSource, arg execArgType) *fs.Inode { +func newExecArgInode(t *kernel.Task, msrc *fs.MountSource, arg execArgType) *fs.Inode { if arg != cmdlineExecArg && arg != environExecArg { panic(fmt.Sprintf("unknown exec arg type %v", arg)) } - f := &execArgFile{ - arg: arg, - t: t, + f := &execArgInode{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + arg: arg, + t: t, } - f.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(f, msrc, fs.SpecialFile, t) + return newProcInode(f, msrc, fs.SpecialFile, t) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *execArgInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &execArgFile{ + arg: i.arg, + t: i.t, + }), nil } -// DeprecatedPreadv reads the exec arg from the process's address space.. -func (f *execArgFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// +stateify savable +type execArgFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + + // arg is the type of exec argument this file contains. + arg execArgType + + // t is the Task to read the exec arg line from. + t *kernel.Task +} + +var _ fs.FileOperations = (*execArgFile)(nil) + +// Read reads the exec arg from the process's address space.. 
+func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 5acbce75e..b8a0a5eff 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -21,11 +21,11 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -55,7 +55,7 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // readDescriptors reads fds in the task starting at offset, and calls the // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. -func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int, toDentAttr func(int) fs.DentAttr) (int, error) { +func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { var fds kernel.FDs t.WithMuLocked(func(t *kernel.Task) { if fdm := t.FDMap(); fdm != nil { @@ -69,7 +69,7 @@ func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int, toDentAttr func(i } // Find the fd to start at. - idx := sort.SearchInts(fdInts, offset) + idx := sort.SearchInts(fdInts, int(offset)) if idx == len(fdInts) { return offset, nil } @@ -80,28 +80,32 @@ func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int, toDentAttr func(i name := strconv.FormatUint(uint64(fd), 10) if err := c.DirEmit(name, toDentAttr(fd)); err != nil { // Returned offset is the next fd to serialize. 
- return fd, err + return int64(fd), err } } // We serialized them all. Next offset should be higher than last // serialized fd. - return fd + 1, nil + return int64(fd + 1), nil } -// fd is a single file in /proc/TID/fd/. +// fd implements fs.InodeOperations for a file in /proc/TID/fd/. type fd struct { ramfs.Symlink *fs.File } +var _ fs.InodeOperations = (*fd)(nil) + // newFd returns a new fd based on an existing file. // // This inherits one reference to the file. func newFd(t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { - fd := &fd{File: f} - // RootOwner by default, is overridden in UnstableAttr() - fd.InitSymlink(t, fs.RootOwner, "") - return newFile(fd, msrc, fs.Symlink, t) + fd := &fd{ + // RootOwner overridden by taskOwnedInodeOps.UnstableAttrs(). + Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + File: f, + } + return newProcInode(fd, msrc, fs.Symlink, t) } // GetFile returns the fs.File backing this fd. The dirent and flags @@ -142,7 +146,7 @@ func (f *fd) Close() error { return nil } -// fdDir implements /proc/TID/fd. +// fdDir is an InodeOperations for /proc/TID/fd. // // +stateify savable type fdDir struct { @@ -154,11 +158,15 @@ type fdDir struct { t *kernel.Task } +var _ fs.InodeOperations = (*fdDir)(nil) + // newFdDir creates a new fdDir. func newFdDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - f := &fdDir{t: t} - f.InitDir(t, nil, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true, Execute: true}}) - return newFile(f, msrc, fs.SpecialDirectory, t) + f := &fdDir{ + Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true, Execute: true}}), + t: t, + } + return newProcInode(f, msrc, fs.SpecialDirectory, t) } // Check implements InodeOperations.Check. @@ -191,49 +199,55 @@ func (f *fdDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent return fs.NewDirent(n, p), nil } -// DeprecatedReaddir lists fds in /proc/TID/fd. 
-func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - return readDescriptors(f.t, dirCtx, offset, func(fd int) fs.DentAttr { - return fs.GenericDentAttr(fs.Symlink, device.ProcDevice) +// GetFile implements fs.FileOperations.GetFile. +func (f *fdDir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + fops := &fdDirFile{ + isInfoFile: false, + t: f.t, + } + return fs.NewFile(ctx, dirent, flags, fops), nil +} + +// +stateify savable +type fdDirFile struct { + fsutil.DirFileOperations `state:"nosave"` + + isInfoFile bool + + t *kernel.Task +} + +var _ fs.FileOperations = (*fdDirFile)(nil) + +// Readdir implements fs.FileOperations.Readdir. +func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) { + dirCtx := &fs.DirCtx{ + Serializer: ser, + } + typ := fs.RegularFile + if f.isInfoFile { + typ = fs.Symlink + } + return readDescriptors(f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { + return fs.GenericDentAttr(typ, device.ProcDevice) }) } -// fdInfo is a single file in /proc/TID/fdinfo/. +// fdInfoInode is a single file in /proc/TID/fdinfo/. // // +stateify savable -type fdInfo struct { - ramfs.File +type fdInfoInode struct { + staticFileInodeOps file *fs.File flags fs.FileFlags fdFlags kernel.FDFlags } -// newFdInfo returns a new fdInfo based on an existing file. -func newFdInfo(t *kernel.Task, file *fs.File, fdFlags kernel.FDFlags, msrc *fs.MountSource) *fs.Inode { - fdi := &fdInfo{file: file, flags: file.Flags(), fdFlags: fdFlags} - fdi.InitFile(t, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true}}) - // TODO: Get pos, locks, and other data. For now we only - // have flags. 
- // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt - - flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() - fdi.Append([]byte(fmt.Sprintf("flags:\t0%o\n", flags))) - return newFile(fdi, msrc, fs.SpecialFile, t) -} - -// DeprecatedPwritev implements fs.HandleOperations.DeprecatedPwritev. -func (*fdInfo) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - return 0, ramfs.ErrInvalidOp -} - -// Truncate implements fs.InodeOperations.Truncate. -func (*fdInfo) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return ramfs.ErrInvalidOp -} +var _ fs.InodeOperations = (*fdInfoInode)(nil) -func (f *fdInfo) Release(ctx context.Context) { - f.File.Release(ctx) +// Release implements fs.InodeOperations.Release. +func (f *fdInfoInode) Release(ctx context.Context) { f.file.DecRef() } @@ -249,25 +263,37 @@ type fdInfoDir struct { // newFdInfoDir creates a new fdInfoDir. func newFdInfoDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - fdid := &fdInfoDir{t: t} - fdid.InitDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0500)) - return newFile(fdid, msrc, fs.SpecialDirectory, t) + fdid := &fdInfoDir{ + Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0500)), + t: t, + } + return newProcInode(fdid, msrc, fs.SpecialDirectory, t) } // Lookup loads an fd in /proc/TID/fdinfo into a Dirent. func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { - n, err := walkDescriptors(fdid.t, p, func(file *fs.File, fdFlags kernel.FDFlags) *fs.Inode { - return newFdInfo(fdid.t, file, fdFlags, dir.MountSource) + inode, err := walkDescriptors(fdid.t, p, func(file *fs.File, fdFlags kernel.FDFlags) *fs.Inode { + // TODO: Using a static inode here means that the + // data can be out-of-date if, for instance, the flags on the + // FD change before we read this file. We should switch to + // generating the data on Read(). 
Also, we should include pos, + // locks, and other data. For now we only have flags. + // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() + contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags)) + return newStaticProcInode(ctx, dir.MountSource, contents) }) if err != nil { return nil, err } - return fs.NewDirent(n, p), nil + return fs.NewDirent(inode, p), nil } -// DeprecatedReaddir lists fds in /proc/TID/fdinfo. -func (fdid *fdInfoDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - return readDescriptors(fdid.t, dirCtx, offset, func(fd int) fs.DentAttr { - return fs.GenericDentAttr(fs.RegularFile, device.ProcDevice) - }) +// GetFile implements fs.FileOperations.GetFile. +func (fdid *fdInfoDir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + fops := &fdDirFile{ + isInfoFile: true, + t: fdid.t, + } + return fs.NewFile(ctx, dirent, flags, fops), nil } diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go deleted file mode 100644 index f659e590a..000000000 --- a/pkg/sentry/fs/proc/file.go +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package proc - -import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -// +stateify savable -type file struct { - fs.InodeOperations - - // nodeType is the file type of this file. - nodeType fs.InodeType - - // t is the associated kernel task that owns this file. - t *kernel.Task -} - -func newFile(node fs.InodeOperations, msrc *fs.MountSource, nodeType fs.InodeType, t *kernel.Task) *fs.Inode { - iops := &file{node, nodeType, t} - sattr := fs.StableAttr{ - DeviceID: device.ProcDevice.DeviceID(), - InodeID: device.ProcDevice.NextIno(), - BlockSize: usermem.PageSize, - Type: nodeType, - } - return fs.NewInode(iops, msrc, sattr) -} - -// UnstableAttr returns all attributes of this file. -func (f *file) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - uattr, err := f.InodeOperations.UnstableAttr(ctx, inode) - if err != nil { - return fs.UnstableAttr{}, err - } - if f.t != nil { - creds := f.t.Credentials() - uattr.Owner = fs.FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} - } - return uattr, nil -} diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go new file mode 100644 index 000000000..3c36af5ea --- /dev/null +++ b/pkg/sentry/fs/proc/inode.go @@ -0,0 +1,96 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// taskOwnedInodeOps wraps an fs.InodeOperations and overrides the UnstableAttr +// method to return the task as the owner. +// +// +stateify savable +type taskOwnedInodeOps struct { + fs.InodeOperations + + // t is the task that owns this file. + t *kernel.Task +} + +// UnstableAttr implement fs.InodeOperations.UnstableAttr. +func (i *taskOwnedInodeOps) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, err := i.InodeOperations.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } + // Set the task owner as the file owner. + creds := i.t.Credentials() + uattr.Owner = fs.FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} + return uattr, nil +} + +// staticFileInodeOps is an InodeOperations implementation that can be used to +// return file contents which are constant. This file is not writable and will +// always have mode 0444. 
+// +// +stateify savable +type staticFileInodeOps struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeStaticFileGetter +} + +var _ fs.InodeOperations = (*staticFileInodeOps)(nil) + +// newStaticFileInode returns a procfs InodeOperations with static contents. +func newStaticProcInode(ctx context.Context, msrc *fs.MountSource, contents []byte) *fs.Inode { + iops := &staticFileInodeOps{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + InodeStaticFileGetter: fsutil.InodeStaticFileGetter{ + Contents: contents, + }, + } + return newProcInode(iops, msrc, fs.SpecialFile, nil) +} + +// newProcInode creates a new inode from the given inode operations. +func newProcInode(iops fs.InodeOperations, msrc *fs.MountSource, typ fs.InodeType, t *kernel.Task) *fs.Inode { + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: typ, + } + if t != nil { + iops = &taskOwnedInodeOps{iops, t} + } + return fs.NewInode(iops, msrc, sattr) +} diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 2806d6035..3ed85a538 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -28,35 +28,36 @@ import ( // newNet creates a new proc net entry. 
func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + var contents map[string]*fs.Inode if s := p.k.NetworkStack(); s != nil && s.SupportsIPv6() { - d.AddChild(ctx, "dev", seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc)) - d.AddChild(ctx, "if_inet6", seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)) - - // The following files are simple stubs until they are implemented in - // netstack, if the file contains a header the stub is just the header - // otherwise it is an empty file. - d.AddChild(ctx, "arp", p.newStubProcFSFile(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device"))) - d.AddChild(ctx, "ipv6_route", p.newStubProcFSFile(ctx, msrc, []byte(""))) - d.AddChild(ctx, "netlink", p.newStubProcFSFile(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode"))) - d.AddChild(ctx, "netstat", p.newStubProcFSFile(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld 
TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess"))) - d.AddChild(ctx, "packet", p.newStubProcFSFile(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode"))) - d.AddChild(ctx, "protocols", p.newStubProcFSFile(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em"))) - - // Linux sets these values to: nsec per usec, psched tick in ns, 1000000, - // high res timer ticks per sec (ClockGetres returns 1ns resolution). 
- psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) - d.AddChild(ctx, "psched", p.newStubProcFSFile(ctx, msrc, []byte(psched))) - - d.AddChild(ctx, "ptype", p.newStubProcFSFile(ctx, msrc, []byte("Type Device Function"))) - d.AddChild(ctx, "route", p.newStubProcFSFile(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT"))) - d.AddChild(ctx, "tcp", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) - d.AddChild(ctx, "tcp6", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) - d.AddChild(ctx, "udp", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops"))) - d.AddChild(ctx, "udp6", p.newStubProcFSFile(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode"))) + contents = map[string]*fs.Inode{ + "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), + "if_inet6": seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc), + + // The following files are simple stubs until they are + // implemented in netstack, if the file contains a + // header the stub is just the header otherwise it is + // an empty file. 
+ "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device")), + "ipv6_route": newStaticProcInode(ctx, msrc, []byte("")), + "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode")), + "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect 
TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess")), + "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode")), + "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em")), + // Linux sets psched values to: nsec per usec, psched + // tick in ns, 1000000, high res timer ticks per sec + // (ClockGetres returns 1ns resolution). + "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), + "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function")), + "route": newStaticProcInode(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT")), + "tcp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), + "tcp6": newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), + "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), + "udp6": newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), + } } - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. 
diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 70e549c31..d1c699418 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -17,18 +17,17 @@ package proc import ( "fmt" - "io" "sort" "strconv" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -46,32 +45,6 @@ type proc struct { pidns *kernel.PIDNamespace } -// stubProcFSFile is a file type that can be used to return file contents -// which are constant. This file is not writable and will always have mode -// 0444. -// -// +stateify savable -type stubProcFSFile struct { - ramfs.Entry - - // contents are the immutable file contents that will always be returned. - contents []byte -} - -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (s *stubProcFSFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } - - if offset >= int64(len(s.contents)) { - return 0, io.EOF - } - - n, err := dst.CopyOut(ctx, s.contents[offset:]) - return int64(n), err -} - // New returns the root node of a partial simple procfs. 
func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { k := kernel.KernelFromContext(ctx) @@ -83,29 +56,39 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { return nil, fmt.Errorf("procfs requires a PID namespace") } - p := &proc{k: k, pidns: pidns} - p.InitDir(ctx, map[string]*fs.Inode{ + // Note that these are just the static members. There are dynamic + // members populated in Readdir and Lookup below. + contents := map[string]*fs.Inode{ + "cpuinfo": newCPUInfo(ctx, msrc), "filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), "loadavg": seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc), "meminfo": seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc), - "mounts": newMountsSymlink(ctx, msrc), + "mounts": newProcInode(ramfs.NewSymlink(ctx, fs.RootOwner, "self/mounts"), msrc, fs.Symlink, nil), + "self": newSelf(ctx, pidns, msrc), "stat": seqfile.NewSeqFileInode(ctx, &statData{k}, msrc), + "thread-self": newThreadSelf(ctx, pidns, msrc), + "uptime": newUptime(ctx, msrc), "version": seqfile.NewSeqFileInode(ctx, &versionData{k}, msrc), - }, fs.RootOwner, fs.FilePermsFromMode(0555)) + } + + // Construct the proc InodeOperations. + p := &proc{ + Dir: *ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), + k: k, + pidns: pidns, + } + + // Add more contents that need proc to be initialized. + p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) - p.AddChild(ctx, "cpuinfo", p.newCPUInfo(ctx, msrc)) // If we're using rpcinet we will let it manage /proc/net. 
if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - p.AddChild(ctx, "net", newRPCInetProcNet(ctx, msrc)) + contents["net"] = newRPCInetProcNet(ctx, msrc) } else { - p.AddChild(ctx, "net", p.newNetDir(ctx, msrc)) + contents["net"] = p.newNetDir(ctx, msrc) } - p.AddChild(ctx, "self", p.newSelf(ctx, msrc)) - p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) - p.AddChild(ctx, "thread-self", p.newThreadSelf(ctx, msrc)) - p.AddChild(ctx, "uptime", p.newUptime(ctx, msrc)) - return newFile(p, msrc, fs.SpecialDirectory, nil), nil + return newProcInode(p, msrc, fs.SpecialDirectory, nil), nil } // self is a magical link. @@ -118,26 +101,21 @@ type self struct { } // newSelf returns a new "self" node. -func (p *proc) newSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - s := &self{pidns: p.pidns} - s.InitSymlink(ctx, fs.RootOwner, "") - return newFile(s, msrc, fs.Symlink, nil) +func newSelf(ctx context.Context, pidns *kernel.PIDNamespace, msrc *fs.MountSource) *fs.Inode { + s := &self{ + Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""), + pidns: pidns, + } + return newProcInode(s, msrc, fs.Symlink, nil) } // newThreadSelf returns a new "threadSelf" node. -func (p *proc) newThreadSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - s := &threadSelf{pidns: p.pidns} - s.InitSymlink(ctx, fs.RootOwner, "") - return newFile(s, msrc, fs.Symlink, nil) -} - -// newStubProcFsFile returns a procfs file with constant contents. -func (p *proc) newStubProcFSFile(ctx context.Context, msrc *fs.MountSource, c []byte) *fs.Inode { - u := &stubProcFSFile{ - contents: c, +func newThreadSelf(ctx context.Context, pidns *kernel.PIDNamespace, msrc *fs.MountSource) *fs.Inode { + s := &threadSelf{ + Symlink: *ramfs.NewSymlink(ctx, fs.RootOwner, ""), + pidns: pidns, } - u.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(u, msrc, fs.SpecialFile, nil) + return newProcInode(s, msrc, fs.Symlink, nil) } // Readlink implements fs.InodeOperations.Readlink. 
@@ -145,13 +123,13 @@ func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if t := kernel.TaskFromContext(ctx); t != nil { tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { - return "", ramfs.ErrNotFound + return "", syserror.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } // Who is reading this link? - return "", ramfs.ErrInvalidOp + return "", syserror.EINVAL } // threadSelf is more magical than "self" link. @@ -169,13 +147,13 @@ func (s *threadSelf) Readlink(ctx context.Context, inode *fs.Inode) (string, err tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { - return "", ramfs.ErrNotFound + return "", syserror.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } // Who is reading this link? - return "", ramfs.ErrInvalidOp + return "", syserror.EINVAL } // Lookup loads an Inode at name into a Dirent. @@ -204,25 +182,44 @@ func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dire return fs.NewDirent(td, name), nil } -// Readdir synthesizes proc contents. -func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - // Serialize normal contents. - _, err := p.Dir.DeprecatedReaddir(ctx, dirCtx, offset) - if err != nil { - return offset, err +// GetFile implements fs.InodeOperations. +func (p *proc) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &rootProcFile{iops: p}), nil +} + +// rootProcFile implements fs.FileOperations for the proc directory. +// +// +stateify savable +type rootProcFile struct { + fsutil.DirFileOperations `state:"nosave"` + + iops *proc +} + +var _ fs.FileOperations = (*rootProcFile)(nil) + +// Readdir implements fs.FileOperations.Readdir. 
+func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) { + offset := file.Offset() + dirCtx := &fs.DirCtx{ + Serializer: ser, } - m := make(map[string]fs.DentAttr) - var names []string + // Get normal directory contents from ramfs dir. + names, m := rpf.iops.Dir.Children() - // Add special files. - m["sys"] = fs.GenericDentAttr(fs.SpecialFile, device.ProcDevice) - names = append(names, "sys") + // Add dot and dotdot. + root := fs.RootFromContext(ctx) + defer root.DecRef() + dot, dotdot := file.Dirent.GetDotAttrs(root) + names = append(names, ".", "..") + m["."] = dot + m[".."] = dotdot // Collect tasks. // Per linux we only include it in directory listings if it's the leader. // But for whatever crazy reason, you can still walk to the given node. - for _, tg := range p.pidns.ThreadGroups() { + for _, tg := range rpf.iops.pidns.ThreadGroups() { if leader := tg.Leader(); leader != nil { name := strconv.FormatUint(uint64(tg.ID()), 10) m[name] = fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) @@ -230,7 +227,7 @@ func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset } } - if offset >= len(m) { + if offset >= int64(len(m)) { return offset, nil } sort.Strings(names) @@ -241,12 +238,5 @@ func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset } offset++ } - return offset, err -} - -// newMountsSymlink returns a symlink to "self/mounts". 
-func newMountsSymlink(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - s := &ramfs.Symlink{} - s.InitSymlink(ctx, fs.RootOwner, "self/mounts") - return newFile(s, msrc, fs.Symlink, nil) + return offset, nil } diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index d025069df..65faa21f2 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -20,32 +20,72 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// rpcinetFile implments fs.InodeOperations as RPCs. -type rpcinetFile struct { - ramfs.Entry +// rpcInetInode implments fs.InodeOperations. +type rpcInetInode struct { + fsutil.SimpleFileInode - // filepath is the full path of this rpcinetFile. + // filepath is the full path of this rpcInetInode. filepath string k *kernel.Kernel } -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -// This method can panic if an rpcinetFile was created without an rpcinet +func newRPCInetInode(ctx context.Context, msrc *fs.MountSource, filepath string, mode linux.FileMode) *fs.Inode { + f := &rpcInetInode{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(mode), linux.PROC_SUPER_MAGIC), + filepath: filepath, + k: kernel.KernelFromContext(ctx), + } + return newProcInode(f, msrc, fs.SpecialFile, nil) +} + +// GetFile implements fs.InodeOperations.GetFile. 
+func (i *rpcInetInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + fops := &rpcInetFile{ + inode: i, + } + return fs.NewFile(ctx, dirent, flags, fops), nil +} + +// rpcInetFile implements fs.FileOperations as RPCs. +type rpcInetFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + inode *rpcInetInode +} + +// Read implements fs.FileOperations.Read. +// +// This method can panic if an rpcInetInode was created without an rpcinet // stack. -func (r rpcinetFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - s, ok := r.k.NetworkStack().(*rpcinet.Stack) +func (f *rpcInetFile) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + s, ok := f.inode.k.NetworkStack().(*rpcinet.Stack) if !ok { panic("Network stack is not a rpcinet.") } - contents, se := s.RPCReadFile(r.filepath) + contents, se := s.RPCReadFile(f.inode.filepath) if se != nil || offset >= int64(len(contents)) { return 0, io.EOF } @@ -54,16 +94,12 @@ func (r rpcinetFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequenc return int64(n), err } -// Truncate implements fs.InodeOperations.Truncate. -func (r rpcinetFile) Truncate(context.Context, *fs.Inode, int64) error { - return nil -} - -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -// This method can panic if an rpcinetFile was created without an rpcinet +// Write implements fs.FileOperations.Write. +// +// This method can panic if an rpcInetInode was created without an rpcInet // stack. 
-func (r rpcinetFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - s, ok := r.k.NetworkStack().(*rpcinet.Stack) +func (f *rpcInetFile) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + s, ok := f.inode.k.NetworkStack().(*rpcinet.Stack) if !ok { panic("Network stack is not a rpcinet.") } @@ -78,116 +114,102 @@ func (r rpcinetFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequen return int64(n), err } - written, se := s.RPCWriteFile(r.filepath, b) + written, se := s.RPCWriteFile(f.inode.filepath, b) return int64(written), se.ToError() } -func newRPCProcFSFile(ctx context.Context, msrc *fs.MountSource, filepath string, mode linux.FileMode) *fs.Inode { - f := &rpcinetFile{ - filepath: filepath, - k: kernel.KernelFromContext(ctx), - } - f.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(mode)) - - fi := newFile(f, msrc, fs.SpecialFile, nil) - return fi -} - // newRPCInetProcNet will build an inode for /proc/net. func newRPCInetProcNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - - // Add all the files we want to forward for /proc/net. 
- d.AddChild(ctx, "arp", newRPCProcFSFile(ctx, msrc, "/proc/net/arp", 0444)) - d.AddChild(ctx, "dev", newRPCProcFSFile(ctx, msrc, "/proc/net/dev", 0444)) - d.AddChild(ctx, "if_inet6", newRPCProcFSFile(ctx, msrc, "/proc/net/if_inet6", 0444)) - d.AddChild(ctx, "ipv6_route", newRPCProcFSFile(ctx, msrc, "/proc/net/ipv6_route", 0444)) - d.AddChild(ctx, "netlink", newRPCProcFSFile(ctx, msrc, "/proc/net/netlink", 0444)) - d.AddChild(ctx, "netstat", newRPCProcFSFile(ctx, msrc, "/proc/net/netstat", 0444)) - d.AddChild(ctx, "packet", newRPCProcFSFile(ctx, msrc, "/proc/net/packet", 0444)) - d.AddChild(ctx, "protocols", newRPCProcFSFile(ctx, msrc, "/proc/net/protocols", 0444)) - d.AddChild(ctx, "psched", newRPCProcFSFile(ctx, msrc, "/proc/net/psched", 0444)) - d.AddChild(ctx, "ptype", newRPCProcFSFile(ctx, msrc, "/proc/net/ptype", 0444)) - d.AddChild(ctx, "route", newRPCProcFSFile(ctx, msrc, "/proc/net/route", 0444)) - d.AddChild(ctx, "tcp", newRPCProcFSFile(ctx, msrc, "/proc/net/tcp", 0444)) - d.AddChild(ctx, "tcp6", newRPCProcFSFile(ctx, msrc, "/proc/net/tcp6", 0444)) - d.AddChild(ctx, "udp", newRPCProcFSFile(ctx, msrc, "/proc/net/udp", 0444)) - d.AddChild(ctx, "udp6", newRPCProcFSFile(ctx, msrc, "/proc/net/udp6", 0444)) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + contents := map[string]*fs.Inode{ + "arp": newRPCInetInode(ctx, msrc, "/proc/net/arp", 0444), + "dev": newRPCInetInode(ctx, msrc, "/proc/net/dev", 0444), + "if_inet6": newRPCInetInode(ctx, msrc, "/proc/net/if_inet6", 0444), + "ipv6_route": newRPCInetInode(ctx, msrc, "/proc/net/ipv6_route", 0444), + "netlink": newRPCInetInode(ctx, msrc, "/proc/net/netlink", 0444), + "netstat": newRPCInetInode(ctx, msrc, "/proc/net/netstat", 0444), + "packet": newRPCInetInode(ctx, msrc, "/proc/net/packet", 0444), + "protocols": newRPCInetInode(ctx, msrc, "/proc/net/protocols", 0444), + "psched": newRPCInetInode(ctx, msrc, "/proc/net/psched", 0444), + "ptype": newRPCInetInode(ctx, msrc, "/proc/net/ptype", 0444), + "route": 
newRPCInetInode(ctx, msrc, "/proc/net/route", 0444), + "tcp": newRPCInetInode(ctx, msrc, "/proc/net/tcp", 0444), + "tcp6": newRPCInetInode(ctx, msrc, "/proc/net/tcp6", 0444), + "udp": newRPCInetInode(ctx, msrc, "/proc/net/udp", 0444), + "udp6": newRPCInetInode(ctx, msrc, "/proc/net/udp6", 0444), + } + + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } // newRPCInetProcSysNet will build an inode for /proc/sys/net. func newRPCInetProcSysNet(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - d.AddChild(ctx, "ipv4", newRPCInetSysNetIPv4Dir(ctx, msrc)) - d.AddChild(ctx, "core", newRPCInetSysNetCore(ctx, msrc)) + contents := map[string]*fs.Inode{ + "ipv4": newRPCInetSysNetIPv4Dir(ctx, msrc), + "core": newRPCInetSysNetCore(ctx, msrc), + } - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } // newRPCInetSysNetCore builds the /proc/sys/net/core directory. 
func newRPCInetSysNetCore(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - - // Add all the files we want to forward over RPC for /proc/sys/net/core - d.AddChild(ctx, "default_qdisc", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/default_qdisc", 0444)) - d.AddChild(ctx, "message_burst", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/message_burst", 0444)) - d.AddChild(ctx, "message_cost", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/message_cost", 0444)) - d.AddChild(ctx, "optmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/optmem_max", 0444)) - d.AddChild(ctx, "rmem_default", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/rmem_default", 0444)) - d.AddChild(ctx, "rmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/rmem_max", 0444)) - d.AddChild(ctx, "somaxconn", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/somaxconn", 0444)) - d.AddChild(ctx, "wmem_default", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/wmem_default", 0444)) - d.AddChild(ctx, "wmem_max", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/core/wmem_max", 0444)) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + contents := map[string]*fs.Inode{ + "default_qdisc": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/default_qdisc", 0444), + "message_burst": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/message_burst", 0444), + "message_cost": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/message_cost", 0444), + "optmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/optmem_max", 0444), + "rmem_default": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/rmem_default", 0444), + "rmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/rmem_max", 0444), + "somaxconn": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/somaxconn", 0444), + "wmem_default": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/wmem_default", 0444), + "wmem_max": newRPCInetInode(ctx, msrc, "/proc/sys/net/core/wmem_max", 
0444), + } + + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } // newRPCInetSysNetIPv4Dir builds the /proc/sys/net/ipv4 directory. func newRPCInetSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - - // Add all the files we want to forward over RPC for /proc/sys/net/ipv4. - d.AddChild(ctx, "ip_local_port_range", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_local_port_range", 0444)) - d.AddChild(ctx, "ip_local_reserved_ports", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_local_reserved_ports", 0444)) - d.AddChild(ctx, "ipfrag_time", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ipfrag_time", 0444)) - d.AddChild(ctx, "ip_nonlocal_bind", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_nonlocal_bind", 0444)) - d.AddChild(ctx, "ip_no_pmtu_disc", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/ip_no_pmtu_disc", 0444)) - - d.AddChild(ctx, "tcp_allowed_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_allowed_congestion_control", 0444)) - d.AddChild(ctx, "tcp_available_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_available_congestion_control", 0444)) - d.AddChild(ctx, "tcp_base_mss", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_base_mss", 0444)) - d.AddChild(ctx, "tcp_congestion_control", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_congestion_control", 0644)) - d.AddChild(ctx, "tcp_dsack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_dsack", 0644)) - d.AddChild(ctx, "tcp_early_retrans", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_early_retrans", 0644)) - d.AddChild(ctx, "tcp_fack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fack", 0644)) - d.AddChild(ctx, "tcp_fastopen", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen", 0644)) - d.AddChild(ctx, "tcp_fastopen_key", 
newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen_key", 0444)) - d.AddChild(ctx, "tcp_fin_timeout", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_fin_timeout", 0644)) - d.AddChild(ctx, "tcp_invalid_ratelimit", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_invalid_ratelimit", 0444)) - d.AddChild(ctx, "tcp_keepalive_intvl", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_intvl", 0644)) - d.AddChild(ctx, "tcp_keepalive_probes", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_probes", 0644)) - d.AddChild(ctx, "tcp_keepalive_time", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_time", 0644)) - d.AddChild(ctx, "tcp_mem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_mem", 0444)) - d.AddChild(ctx, "tcp_mtu_probing", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_mtu_probing", 0644)) - d.AddChild(ctx, "tcp_no_metrics_save", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_no_metrics_save", 0444)) - d.AddChild(ctx, "tcp_probe_interval", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_interval", 0444)) - d.AddChild(ctx, "tcp_probe_threshold", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_threshold", 0444)) - d.AddChild(ctx, "tcp_retries1", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries1", 0644)) - d.AddChild(ctx, "tcp_retries2", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries2", 0644)) - d.AddChild(ctx, "tcp_rfc1337", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_rfc1337", 0444)) - d.AddChild(ctx, "tcp_rmem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_rmem", 0444)) - d.AddChild(ctx, "tcp_sack", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_sack", 0644)) - d.AddChild(ctx, "tcp_slow_start_after_idle", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_slow_start_after_idle", 0644)) - d.AddChild(ctx, "tcp_synack_retries", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_synack_retries", 0644)) - d.AddChild(ctx, 
"tcp_syn_retries", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_syn_retries", 0644)) - d.AddChild(ctx, "tcp_timestamps", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_timestamps", 0644)) - d.AddChild(ctx, "tcp_wmem", newRPCProcFSFile(ctx, msrc, "/proc/sys/net/ipv4/tcp_wmem", 0444)) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + contents := map[string]*fs.Inode{ + "ip_local_port_range": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_local_port_range", 0444), + "ip_local_reserved_ports": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_local_reserved_ports", 0444), + "ipfrag_time": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ipfrag_time", 0444), + "ip_nonlocal_bind": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_nonlocal_bind", 0444), + "ip_no_pmtu_disc": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/ip_no_pmtu_disc", 0444), + "tcp_allowed_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_allowed_congestion_control", 0444), + "tcp_available_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_available_congestion_control", 0444), + "tcp_base_mss": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_base_mss", 0444), + "tcp_congestion_control": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_congestion_control", 0644), + "tcp_dsack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_dsack", 0644), + "tcp_early_retrans": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_early_retrans", 0644), + "tcp_fack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fack", 0644), + "tcp_fastopen": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen", 0644), + "tcp_fastopen_key": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fastopen_key", 0444), + "tcp_fin_timeout": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_fin_timeout", 0644), + "tcp_invalid_ratelimit": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_invalid_ratelimit", 0444), + "tcp_keepalive_intvl": newRPCInetInode(ctx, 
msrc, "/proc/sys/net/ipv4/tcp_keepalive_intvl", 0644), + "tcp_keepalive_probes": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_probes", 0644), + "tcp_keepalive_time": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_keepalive_time", 0644), + "tcp_mem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_mem", 0444), + "tcp_mtu_probing": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_mtu_probing", 0644), + "tcp_no_metrics_save": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_no_metrics_save", 0444), + "tcp_probe_interval": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_interval", 0444), + "tcp_probe_threshold": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_probe_threshold", 0444), + "tcp_retries1": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries1", 0644), + "tcp_retries2": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_retries2", 0644), + "tcp_rfc1337": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_rfc1337", 0444), + "tcp_rmem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_rmem", 0444), + "tcp_sack": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_sack", 0644), + "tcp_slow_start_after_idle": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_slow_start_after_idle", 0644), + "tcp_synack_retries": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_synack_retries", 0644), + "tcp_syn_retries": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_syn_retries", 0644), + "tcp_timestamps": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_timestamps", 0644), + "tcp_wmem": newRPCInetInode(ctx, msrc, "/proc/sys/net/ipv4/tcp_wmem", 0444), + } + + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 53c475652..b4ba64e10 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -8,12 +8,15 @@ go_library( importpath = 
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/sentry/context", "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", - "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", ], ) @@ -26,7 +29,7 @@ go_test( "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/fs/ramfs", "//pkg/sentry/usermem", ], ) diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 0499ba65b..16fc6789e 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -18,12 +18,15 @@ import ( "io" "sync" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // SeqHandle is a helper handle to seek in the file. 
@@ -87,7 +90,18 @@ func (s *SeqGenerationCounter) IsCurrent(generation int64) bool { // // +stateify savable type SeqFile struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleExtendedAttributes + fsutil.InodeSimpleAttributes // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -99,11 +113,14 @@ type SeqFile struct { lastRead int64 } +var _ fs.InodeOperations = (*SeqFile)(nil) + // NewSeqFile returns a seqfile suitable for use by external consumers. func NewSeqFile(ctx context.Context, source SeqSource) *SeqFile { - s := &SeqFile{SeqSource: source} - s.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - return s + return &SeqFile{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + SeqSource: source, + } } // NewSeqFileInode returns an Inode with SeqFile InodeOperations. @@ -120,11 +137,19 @@ func NewSeqFileInode(ctx context.Context, source SeqSource, msrc *fs.MountSource // UnstableAttr returns unstable attributes of the SeqFile. func (s *SeqFile) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - uattr, _ := s.Entry.UnstableAttr(ctx, inode) + uattr, err := s.InodeSimpleAttributes.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } uattr.ModificationTime = ktime.NowFromContext(ctx) return uattr, nil } +// GetFile implements fs.InodeOperations.GetFile. 
+func (s *SeqFile) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &seqFileOperations{seqFile: s}), nil +} + // findIndexAndOffset finds the unit that corresponds to a certain offset. // Returns the unit and the offset within the unit. If there are not enough // units len(data) and leftover offset is returned. @@ -139,36 +164,74 @@ func findIndexAndOffset(data []SeqData, offset int64) (int, int64) { return len(data), offset } -// DeprecatedPreadv reads from the file at the given offset. -func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - s.mu.Lock() - defer s.mu.Unlock() +// updateSourceLocked requires that s.mu is held. +func (s *SeqFile) updateSourceLocked(ctx context.Context, record int) { + var h SeqHandle + if record == 0 { + h = nil + } else { + h = s.source[record-1].Handle + } + // Save what we have previously read. + s.source = s.source[:record] + var newSource []SeqData + newSource, s.generation = s.SeqSource.ReadSeqFileData(ctx, h) + s.source = append(s.source, newSource...) +} + +// seqFileOperations implements fs.FileOperations. +// +// +stateify savable +type seqFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + seqFile *SeqFile +} + +var _ fs.FileOperations = (*seqFileOperations)(nil) + +// Write implements fs.FileOperations.Write. +func (*seqFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EACCES +} + +// Read implements fs.FileOperations.Read. 
+func (sfo *seqFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + sfo.seqFile.mu.Lock() + defer sfo.seqFile.mu.Unlock() - s.Entry.NotifyAccess(ctx) - defer func() { s.lastRead = offset }() + sfo.seqFile.NotifyAccess(ctx) + defer func() { sfo.seqFile.lastRead = offset }() updated := false // Try to find where we should start reading this file. - i, recordOffset := findIndexAndOffset(s.source, offset) - if i == len(s.source) { + i, recordOffset := findIndexAndOffset(sfo.seqFile.source, offset) + if i == len(sfo.seqFile.source) { // Ok, we're at EOF. Let's first check to see if there might be // more data available to us. If there is more data, add it to // the end and try reading again. - if !s.SeqSource.NeedsUpdate(s.generation) { + if !sfo.seqFile.SeqSource.NeedsUpdate(sfo.seqFile.generation) { return 0, io.EOF } - oldLen := len(s.source) - s.updateSourceLocked(ctx, len(s.source)) + oldLen := len(sfo.seqFile.source) + sfo.seqFile.updateSourceLocked(ctx, len(sfo.seqFile.source)) updated = true // We know that we had consumed everything up until this point // so we search in the new slice instead of starting over. - i, recordOffset = findIndexAndOffset(s.source[oldLen:], recordOffset) + i, recordOffset = findIndexAndOffset(sfo.seqFile.source[oldLen:], recordOffset) i += oldLen // i is at most the length of the slice which is - // len(s.source) - oldLen. So at most i will be equal to - // len(s.source). - if i == len(s.source) { + // len(sfo.seqFile.source) - oldLen. So at most i will be equal to + // len(sfo.seqFile.source). + if i == len(sfo.seqFile.source) { return 0, io.EOF } } @@ -178,7 +241,7 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, // before continuing on to the next. We don't refresh our data source // before this record is completed. 
if recordOffset != 0 { - n, err := dst.CopyOut(ctx, s.source[i].Buf[recordOffset:]) + n, err := dst.CopyOut(ctx, sfo.seqFile.source[i].Buf[recordOffset:]) done += int64(n) dst = dst.DropFirst(n) if dst.NumBytes() == 0 || err != nil { @@ -190,15 +253,15 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, // Next/New unit, update the source file if necessary. Make an extra // check to see if we've seeked backwards and if so always update our // data source. - if !updated && (s.SeqSource.NeedsUpdate(s.generation) || s.lastRead > offset) { - s.updateSourceLocked(ctx, i) + if !updated && (sfo.seqFile.SeqSource.NeedsUpdate(sfo.seqFile.generation) || sfo.seqFile.lastRead > offset) { + sfo.seqFile.updateSourceLocked(ctx, i) // recordOffset is 0 here and we won't update records behind the // current one so recordOffset is still 0 even though source // just got updated. Just read the next record. } // Finish by reading all the available data. - for _, buf := range s.source[i:] { + for _, buf := range sfo.seqFile.source[i:] { n, err := dst.CopyOut(ctx, buf.Buf) done += int64(n) dst = dst.DropFirst(n) @@ -214,23 +277,3 @@ func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, } return done, nil } - -// updateSourceLocked requires that s.mu is held. -func (s *SeqFile) updateSourceLocked(ctx context.Context, record int) { - var h SeqHandle - if record == 0 { - h = nil - } else { - h = s.source[record-1].Handle - } - // Save what we have previously read. - s.source = s.source[:record] - var newSource []SeqData - newSource, s.generation = s.SeqSource.ReadSeqFileData(ctx, h) - s.source = append(s.source, newSource...) -} - -// DeprecatedPwritev is always denied. 
-func (*SeqFile) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ramfs.ErrDenied -} diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index f9a2ca38e..35403ab7f 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -23,7 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -91,10 +91,15 @@ type testTable struct { expectedError error } -func runTableTests(ctx context.Context, table []testTable, n fs.InodeOperations) error { +func runTableTests(ctx context.Context, table []testTable, dirent *fs.Dirent) error { for _, tt := range table { + file, err := dirent.Inode.InodeOperations.GetFile(ctx, dirent, fs.FileFlags{Read: true}) + if err != nil { + return fmt.Errorf("GetFile returned error: %v", err) + } + data := make([]byte, tt.readBufferSize) - resultLen, err := n.DeprecatedPreadv(ctx, usermem.BytesIOSequence(data), tt.offset) + resultLen, err := file.Preadv(ctx, usermem.BytesIOSequence(data), tt.offset) if err != tt.expectedError { return fmt.Errorf("t.Preadv(len: %v, offset: %v) (error) => %v expected %v", tt.readBufferSize, tt.offset, err, tt.expectedError) } @@ -115,12 +120,12 @@ func TestSeqFile(t *testing.T) { testSource.Init() // Create a file that can be R/W. 
- m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + m := fs.NewPseudoMountSource() ctx := contexttest.Context(t) contents := map[string]*fs.Inode{ "foo": NewSeqFileInode(ctx, testSource, m), } - root := ramfstest.NewDir(ctx, contents, fs.FilePermsFromMode(0777)) + root := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0777)) // How about opening it? inode := fs.NewInode(root, m, fs.StableAttr{Type: fs.Directory}) @@ -129,9 +134,13 @@ func TestSeqFile(t *testing.T) { t.Fatalf("failed to walk to foo for n2: %v", err) } n2 := dirent2.Inode.InodeOperations + file2, err := n2.GetFile(ctx, dirent2, fs.FileFlags{Read: true, Write: true}) + if err != nil { + t.Fatalf("GetFile returned error: %v", err) + } // Writing? - if _, err := n2.DeprecatedPwritev(nil, usermem.BytesIOSequence([]byte("test")), 0); err == nil { + if _, err := file2.Writev(ctx, usermem.BytesIOSequence([]byte("test"))); err == nil { t.Fatalf("managed to write to n2: %v", err) } @@ -141,7 +150,6 @@ func TestSeqFile(t *testing.T) { t.Fatalf("failed to walk to foo: %v", err) } n3 := dirent3.Inode.InodeOperations - if n2 != n3 { t.Error("got n2 != n3, want same") } @@ -170,13 +178,13 @@ func TestSeqFile(t *testing.T) { // Read the last 3 bytes. {97, 10, testSource.actual[9].Buf[7:], nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed with testSource.update = %v : %v", testSource.update, err) } // Disable updates and do it again. testSource.update = false - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed with testSource.update = %v: %v", testSource.update, err) } } @@ -188,25 +196,24 @@ func TestSeqFileFileUpdated(t *testing.T) { testSource.update = true // Create a file that can be R/W. 
- m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + m := fs.NewPseudoMountSource() ctx := contexttest.Context(t) contents := map[string]*fs.Inode{ "foo": NewSeqFileInode(ctx, testSource, m), } - root := ramfstest.NewDir(ctx, contents, fs.FilePermsFromMode(0777)) + root := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0777)) // How about opening it? inode := fs.NewInode(root, m, fs.StableAttr{Type: fs.Directory}) dirent2, err := root.Lookup(ctx, inode, "foo") if err != nil { - t.Fatalf("failed to walk to foo for n2: %v", err) + t.Fatalf("failed to walk to foo for dirent2: %v", err) } - n2 := dirent2.Inode.InodeOperations table := []testTable{ {0, 16, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf[:6]), nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed: %v", err) } // Delete the first entry. @@ -224,7 +231,7 @@ func TestSeqFileFileUpdated(t *testing.T) { // Read the following two lines. {30, 20, flatten(testSource.actual[3].Buf, testSource.actual[4].Buf), nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after removing first entry: %v", err) } @@ -238,7 +245,7 @@ func TestSeqFileFileUpdated(t *testing.T) { table = []testTable{ {50, 20, flatten(testSource.actual[4].Buf, testSource.actual[5].Buf), nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after adding middle entry: %v", err) } // This will be used in a later test. 
@@ -249,7 +256,7 @@ func TestSeqFileFileUpdated(t *testing.T) { table = []testTable{ {20, 20, []byte{}, io.EOF}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after removing all entries: %v", err) } // Restore some of the data. @@ -257,7 +264,7 @@ func TestSeqFileFileUpdated(t *testing.T) { table = []testTable{ {6, 20, testSource.actual[0].Buf[6:], nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after adding first entry back: %v", err) } @@ -266,7 +273,7 @@ func TestSeqFileFileUpdated(t *testing.T) { table = []testTable{ {30, 20, flatten(testSource.actual[3].Buf, testSource.actual[4].Buf), nil}, } - if err := runTableTests(ctx, table, n2); err != nil { + if err := runTableTests(ctx, table, dirent2); err != nil { t.Errorf("runTableTest failed after extending testSource: %v", err) } } diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 54562508d..ee6b9f262 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -22,39 +22,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// hostname is a file containing the system hostname. -// -// +stateify savable -type hostname struct { - ramfs.Entry -} - -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. 
-func (hostname) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - utsns := kernel.UTSNamespaceFromContext(ctx) - contents := []byte(utsns.HostName() + "\n") - - if offset >= int64(len(contents)) { - return 0, io.EOF - } - - n, err := dst.CopyOut(ctx, contents[offset:]) - return int64(n), err -} - -func (p *proc) newHostname(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - h := &hostname{} - h.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(h, msrc, fs.SpecialFile, nil) -} - // mmapMinAddrData backs /proc/sys/vm/mmap_min_addr. // // +stateify savable @@ -101,36 +77,84 @@ func (*overcommitMemory) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandl } func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - d.AddChild(ctx, "hostname", p.newHostname(ctx, msrc)) - - d.AddChild(ctx, "shmmax", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10)))) - d.AddChild(ctx, "shmall", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10)))) - d.AddChild(ctx, "shmmni", p.newStubProcFSFile(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10)))) - return newFile(d, msrc, fs.SpecialDirectory, nil) + h := hostname{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + } + + children := map[string]*fs.Inode{ + "hostname": newProcInode(&h, msrc, fs.SpecialFile, nil), + "shmall": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMALL, 10))), + "shmmax": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMAX, 10))), + "shmmni": newStaticProcInode(ctx, msrc, []byte(strconv.FormatUint(linux.SHMMNI, 10))), + } + + d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } func (p *proc) 
newVMDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - d.AddChild(ctx, "mmap_min_addr", seqfile.NewSeqFileInode(ctx, &mmapMinAddrData{p.k}, msrc)) - d.AddChild(ctx, "overcommit_memory", seqfile.NewSeqFileInode(ctx, &overcommitMemory{}, msrc)) - return newFile(d, msrc, fs.SpecialDirectory, nil) + children := map[string]*fs.Inode{ + "mmap_min_addr": seqfile.NewSeqFileInode(ctx, &mmapMinAddrData{p.k}, msrc), + "overcommit_memory": seqfile.NewSeqFileInode(ctx, &overcommitMemory{}, msrc), + } + d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } func (p *proc) newSysDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - d.AddChild(ctx, "kernel", p.newKernelDir(ctx, msrc)) - d.AddChild(ctx, "vm", p.newVMDir(ctx, msrc)) + children := map[string]*fs.Inode{ + "kernel": p.newKernelDir(ctx, msrc), + "vm": p.newVMDir(ctx, msrc), + } // If we're using rpcinet we will let it manage /proc/sys/net. if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - d.AddChild(ctx, "net", newRPCInetProcSysNet(ctx, msrc)) + children["net"] = newRPCInetProcSysNet(ctx, msrc) } else { - d.AddChild(ctx, "net", p.newSysNetDir(ctx, msrc)) + children["net"] = p.newSysNetDir(ctx, msrc) } - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, children, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) +} + +// hostname is the inode for a file containing the system hostname. +// +// +stateify savable +type hostname struct { + fsutil.SimpleFileInode +} + +// GetFile implements fs.InodeOperations.GetFile. 
+func (h *hostname) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &hostnameFile{}), nil +} + +var _ fs.InodeOperations = (*hostname)(nil) + +// +stateify savable +type hostnameFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` } + +// Read implements fs.FileOperations.Read. +func (hf *hostnameFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + utsns := kernel.UTSNamespaceFromContext(ctx) + contents := []byte(utsns.HostName() + "\n") + if offset >= int64(len(contents)) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, contents[offset:]) + return int64(n), err + +} + +var _ fs.FileOperations = (*hostnameFile)(nil) diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index b50d43d70..42e9bc47f 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -17,13 +17,17 @@ package proc import ( "fmt" "io" + "sync" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) type tcpMemDir int @@ -33,21 +37,37 @@ const ( tcpWMem ) +// tcpMemInode is used to read/write the size of netstack tcp buffers. 
+// +// TODO: If we have multiple proc mounts, concurrent writes can +// leave netstack and the proc files in an inconsistent state. Since we set the +// buffer size from these proc files on restore, we may also race and end up in +// an inconsistent state on restore. +// // +stateify savable -type tcpMem struct { - ramfs.Entry - s inet.Stack `state:"wait"` +type tcpMemInode struct { + fsutil.SimpleFileInode + dir tcpMemDir + s inet.Stack `state:"wait"` + + // size stores the tcp buffer size during save, and sets the buffer + // size in netstack in restore. We must save/restore this here, since + // netstack itself is stateless. size inet.TCPBufferSize - dir tcpMemDir -} -func newTCPMem(s inet.Stack, size inet.TCPBufferSize, dir tcpMemDir) *tcpMem { - return &tcpMem{s: s, size: size, dir: dir} + // mu protects against concurrent reads/writes to files based on this + // inode. + mu sync.Mutex `state:"nosave"` } -func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, size inet.TCPBufferSize, dir tcpMemDir) *fs.Inode { - tm := newTCPMem(s, size, dir) - tm.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0644)) +var _ fs.InodeOperations = (*tcpMemInode)(nil) + +func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir tcpMemDir) *fs.Inode { + tm := &tcpMemInode{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + s: s, + dir: dir, + } sattr := fs.StableAttr{ DeviceID: device.ProcDevice.DeviceID(), InodeID: device.ProcDevice.NextIno(), @@ -57,62 +77,105 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, siz return fs.NewInode(tm, msrc, sattr) } -// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. -func (m *tcpMem) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// GetFile implements fs.InodeOperations.GetFile. 
+func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + return fs.NewFile(ctx, dirent, flags, &tcpMemFile{tcpMemInode: m}), nil +} + +// +stateify savable +type tcpMemFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + tcpMemInode *tcpMemInode +} + +var _ fs.FileOperations = (*tcpMemFile)(nil) + +// Read implements fs.FileOperations.Read. +func (f *tcpMemFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { return 0, io.EOF } - s := fmt.Sprintf("%d\t%d\t%d\n", m.size.Min, m.size.Default, m.size.Max) + f.tcpMemInode.mu.Lock() + defer f.tcpMemInode.mu.Unlock() + + size, err := readSize(f.tcpMemInode.dir, f.tcpMemInode.s) + if err != nil { + return 0, err + } + s := fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max) n, err := dst.CopyOut(ctx, []byte(s)) return int64(n), err } -// Truncate implements fs.InodeOperations.Truncate. -func (*tcpMem) Truncate(context.Context, *fs.Inode, int64) error { - return nil -} - -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { +// Write implements fs.FileOperations.Write. 
+func (f *tcpMemFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { if src.NumBytes() == 0 { return 0, nil } - src = src.TakeFirst(usermem.PageSize - 1) + f.tcpMemInode.mu.Lock() + defer f.tcpMemInode.mu.Unlock() - buf := []int32{int32(m.size.Min), int32(m.size.Default), int32(m.size.Max)} + src = src.TakeFirst(usermem.PageSize - 1) + size, err := readSize(f.tcpMemInode.dir, f.tcpMemInode.s) + if err != nil { + return 0, err + } + buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)} n, cperr := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) - m.size = inet.TCPBufferSize{ + newSize := inet.TCPBufferSize{ Min: int(buf[0]), Default: int(buf[1]), Max: int(buf[2]), } - if err := m.writeSize(); err != nil { + if err := writeSize(f.tcpMemInode.dir, f.tcpMemInode.s, newSize); err != nil { return n, err } return n, cperr } -func (m *tcpMem) writeSize() error { - switch m.dir { +func readSize(dirType tcpMemDir, s inet.Stack) (inet.TCPBufferSize, error) { + switch dirType { + case tcpRMem: + return s.TCPReceiveBufferSize() + case tcpWMem: + return s.TCPSendBufferSize() + default: + panic(fmt.Sprintf("unknown tcpMemFile type: %v", dirType)) + } +} + +func writeSize(dirType tcpMemDir, s inet.Stack, size inet.TCPBufferSize) error { + switch dirType { case tcpRMem: - return m.s.SetTCPReceiveBufferSize(m.size) + return s.SetTCPReceiveBufferSize(size) case tcpWMem: - return m.s.SetTCPSendBufferSize(m.size) + return s.SetTCPSendBufferSize(size) default: - panic(fmt.Sprintf("unknown tcpMem.dir: %v", m.dir)) + panic(fmt.Sprintf("unknown tcpMemFile type: %v", dirType)) } } // +stateify savable type tcpSack struct { - ramfs.Entry - s inet.Stack `state:"wait"` + stack inet.Stack `state:"wait"` enabled *bool + fsutil.SimpleFileInode } func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { - ts := &tcpSack{s: s} - ts.InitEntry(ctx, fs.RootOwner, 
fs.FilePermsFromMode(0644)) + ts := &tcpSack{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + stack: s, + } sattr := fs.StableAttr{ DeviceID: device.ProcDevice.DeviceID(), InodeID: device.ProcDevice.NextIno(), @@ -122,21 +185,48 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f return fs.NewInode(ts, msrc, sattr) } -func (s *tcpSack) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// GetFile implements fs.InodeOperations.GetFile. +func (s *tcpSack) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, dirent, flags, &tcpSackFile{ + tcpSack: s, + stack: s.stack, + }), nil +} + +// +stateify savable +type tcpSackFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + tcpSack *tcpSack + + stack inet.Stack `state:"wait"` +} + +// Read implements fs.FileOperations.Read. +func (f *tcpSackFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { return 0, io.EOF } - if s.enabled == nil { - sack, err := s.s.TCPSACKEnabled() + if f.tcpSack.enabled == nil { + sack, err := f.stack.TCPSACKEnabled() if err != nil { return 0, err } - s.enabled = &sack + f.tcpSack.enabled = &sack } val := "0\n" - if *s.enabled { + if *f.tcpSack.enabled { // Technically, this is not quite compatible with Linux. Linux // stores these as an integer, so if you write "2" into // tcp_sack, you should get 2 back. Tough luck. 
@@ -146,13 +236,8 @@ func (s *tcpSack) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, return int64(n), err } -// Truncate implements fs.InodeOperations.Truncate. -func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error { - return nil -} - -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (s *tcpSack) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { +// Write implements fs.FileOperations.Write. +func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { if src.NumBytes() == 0 { return 0, nil } @@ -163,100 +248,104 @@ func (s *tcpSack) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, if err != nil { return n, err } - if s.enabled == nil { - s.enabled = new(bool) + if f.tcpSack.enabled == nil { + f.tcpSack.enabled = new(bool) } - *s.enabled = v != 0 - return n, s.s.SetTCPSACKEnabled(*s.enabled) + *f.tcpSack.enabled = v != 0 + return n, f.tcpSack.stack.SetTCPSACKEnabled(*f.tcpSack.enabled) } func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, // all of these files will have mode 0444 (read-only for all users). 
- d.AddChild(ctx, "default_qdisc", p.newStubProcFSFile(ctx, msrc, []byte("pfifo_fast"))) - d.AddChild(ctx, "message_burst", p.newStubProcFSFile(ctx, msrc, []byte("10"))) - d.AddChild(ctx, "message_cost", p.newStubProcFSFile(ctx, msrc, []byte("5"))) - d.AddChild(ctx, "optmem_max", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "rmem_default", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) - d.AddChild(ctx, "rmem_max", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) - d.AddChild(ctx, "somaxconn", p.newStubProcFSFile(ctx, msrc, []byte("128"))) - d.AddChild(ctx, "wmem_default", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) - d.AddChild(ctx, "wmem_max", p.newStubProcFSFile(ctx, msrc, []byte("212992"))) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + contents := map[string]*fs.Inode{ + "default_qdisc": newStaticProcInode(ctx, msrc, []byte("pfifo_fast")), + "message_burst": newStaticProcInode(ctx, msrc, []byte("10")), + "message_cost": newStaticProcInode(ctx, msrc, []byte("5")), + "optmem_max": newStaticProcInode(ctx, msrc, []byte("0")), + "rmem_default": newStaticProcInode(ctx, msrc, []byte("212992")), + "rmem_max": newStaticProcInode(ctx, msrc, []byte("212992")), + "somaxconn": newStaticProcInode(ctx, msrc, []byte("128")), + "wmem_default": newStaticProcInode(ctx, msrc, []byte("212992")), + "wmem_max": newStaticProcInode(ctx, msrc, []byte("212992")), + } + + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + contents := map[string]*fs.Inode{ + // Add tcp_sack. + "tcp_sack": newTCPSackInode(ctx, msrc, s), + + // The following files are simple stubs until they are + // implemented in netstack, most of these files are + // configuration related. 
We use the value closest to the + // actual netstack behavior or any empty file, all of these + // files will have mode 0444 (read-only for all users). + "ip_local_port_range": newStaticProcInode(ctx, msrc, []byte("16000 65535")), + "ip_local_reserved_ports": newStaticProcInode(ctx, msrc, []byte("")), + "ipfrag_time": newStaticProcInode(ctx, msrc, []byte("30")), + "ip_nonlocal_bind": newStaticProcInode(ctx, msrc, []byte("0")), + "ip_no_pmtu_disc": newStaticProcInode(ctx, msrc, []byte("1")), + + // tcp_allowed_congestion_control tells the user what they are + // able to do as an unprivileged process so we leave it empty. + "tcp_allowed_congestion_control": newStaticProcInode(ctx, msrc, []byte("")), + "tcp_available_congestion_control": newStaticProcInode(ctx, msrc, []byte("reno")), + "tcp_congestion_control": newStaticProcInode(ctx, msrc, []byte("reno")), + + // Many of the following stub files are features netstack + // doesn't support. The unsupported features return "0" to + // indicate they are disabled. 
+ "tcp_base_mss": newStaticProcInode(ctx, msrc, []byte("1280")), + "tcp_dsack": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_early_retrans": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_fack": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_fastopen": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_fastopen_key": newStaticProcInode(ctx, msrc, []byte("")), + "tcp_invalid_ratelimit": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_keepalive_intvl": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_keepalive_probes": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_keepalive_time": newStaticProcInode(ctx, msrc, []byte("7200")), + "tcp_mtu_probing": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_no_metrics_save": newStaticProcInode(ctx, msrc, []byte("1")), + "tcp_probe_interval": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_probe_threshold": newStaticProcInode(ctx, msrc, []byte("0")), + "tcp_retries1": newStaticProcInode(ctx, msrc, []byte("3")), + "tcp_retries2": newStaticProcInode(ctx, msrc, []byte("15")), + "tcp_rfc1337": newStaticProcInode(ctx, msrc, []byte("1")), + "tcp_slow_start_after_idle": newStaticProcInode(ctx, msrc, []byte("1")), + "tcp_synack_retries": newStaticProcInode(ctx, msrc, []byte("5")), + "tcp_syn_retries": newStaticProcInode(ctx, msrc, []byte("3")), + "tcp_timestamps": newStaticProcInode(ctx, msrc, []byte("1")), + } // Add tcp_rmem. - if rs, err := s.TCPReceiveBufferSize(); err == nil { - d.AddChild(ctx, "tcp_rmem", newTCPMemInode(ctx, msrc, s, rs, tcpRMem)) + if _, err := s.TCPReceiveBufferSize(); err == nil { + contents["tcp_rmem"] = newTCPMemInode(ctx, msrc, s, tcpRMem) } // Add tcp_wmem. - if ss, err := s.TCPSendBufferSize(); err == nil { - d.AddChild(ctx, "tcp_wmem", newTCPMemInode(ctx, msrc, s, ss, tcpWMem)) + if _, err := s.TCPSendBufferSize(); err == nil { + contents["tcp_wmem"] = newTCPMemInode(ctx, msrc, s, tcpWMem) } - // Add tcp_sack. 
- d.AddChild(ctx, "tcp_sack", newTCPSackInode(ctx, msrc, s)) - - // The following files are simple stubs until they are implemented in - // netstack, most of these files are configuration related. We use the - // value closest to the actual netstack behavior or any empty file, - // all of these files will have mode 0444 (read-only for all users). - d.AddChild(ctx, "ip_local_port_range", p.newStubProcFSFile(ctx, msrc, []byte("16000 65535"))) - d.AddChild(ctx, "ip_local_reserved_ports", p.newStubProcFSFile(ctx, msrc, []byte(""))) - d.AddChild(ctx, "ipfrag_time", p.newStubProcFSFile(ctx, msrc, []byte("30"))) - d.AddChild(ctx, "ip_nonlocal_bind", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "ip_no_pmtu_disc", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - - // tcp_allowed_congestion_control tell the user what they are able to do as an - // unprivledged process so we leave it empty. - d.AddChild(ctx, "tcp_allowed_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte(""))) - d.AddChild(ctx, "tcp_available_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte("reno"))) - d.AddChild(ctx, "tcp_congestion_control", p.newStubProcFSFile(ctx, msrc, []byte("reno"))) - - // Many of the following stub files are features netstack doesn't support - // and are therefore "0" for disabled. 
- d.AddChild(ctx, "tcp_base_mss", p.newStubProcFSFile(ctx, msrc, []byte("1280"))) - d.AddChild(ctx, "tcp_dsack", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_early_retrans", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_fack", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_fastopen", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_fastopen_key", p.newStubProcFSFile(ctx, msrc, []byte(""))) - d.AddChild(ctx, "tcp_invalid_ratelimit", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_keepalive_intvl", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_keepalive_probes", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_keepalive_time", p.newStubProcFSFile(ctx, msrc, []byte("7200"))) - d.AddChild(ctx, "tcp_mtu_probing", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_no_metrics_save", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - d.AddChild(ctx, "tcp_probe_interval", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_probe_threshold", p.newStubProcFSFile(ctx, msrc, []byte("0"))) - d.AddChild(ctx, "tcp_retries1", p.newStubProcFSFile(ctx, msrc, []byte("3"))) - d.AddChild(ctx, "tcp_retries2", p.newStubProcFSFile(ctx, msrc, []byte("15"))) - d.AddChild(ctx, "tcp_rfc1337", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - d.AddChild(ctx, "tcp_slow_start_after_idle", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - d.AddChild(ctx, "tcp_synack_retries", p.newStubProcFSFile(ctx, msrc, []byte("5"))) - d.AddChild(ctx, "tcp_syn_retries", p.newStubProcFSFile(ctx, msrc, []byte("3"))) - d.AddChild(ctx, "tcp_timestamps", p.newStubProcFSFile(ctx, msrc, []byte("1"))) - - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } func (p *proc) newSysNetDir(ctx context.Context, msrc 
*fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + var contents map[string]*fs.Inode if s := p.k.NetworkStack(); s != nil { - d.AddChild(ctx, "ipv4", p.newSysNetIPv4Dir(ctx, msrc, s)) - d.AddChild(ctx, "core", p.newSysNetCore(ctx, msrc, s)) + contents = map[string]*fs.Inode{ + "ipv4": p.newSysNetIPv4Dir(ctx, msrc, s), + "core": p.newSysNetCore(ctx, msrc, s), + } } - return newFile(d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go index 7f46776c0..5f481a1cf 100644 --- a/pkg/sentry/fs/proc/sys_net_state.go +++ b/pkg/sentry/fs/proc/sys_net_state.go @@ -16,17 +16,26 @@ package proc import "fmt" +// beforeSave is invoked by stateify. +func (t *tcpMemInode) beforeSave() { + size, err := readSize(t.dir, t.s) + if err != nil { + panic(fmt.Sprintf("failed to read TCP send / receive buffer sizes: %v", err)) + } + t.size = size +} + // afterLoad is invoked by stateify. -func (m *tcpMem) afterLoad() { - if err := m.writeSize(); err != nil { - panic(fmt.Sprintf("failed to write previous TCP send / receive buffer sizes [%v]: %v", m.size, err)) +func (t *tcpMemInode) afterLoad() { + if err := writeSize(t.dir, t.s, t.size); err != nil { + panic(fmt.Sprintf("failed to write previous TCP send / receive buffer sizes [%v]: %v", t.size, err)) } } // afterLoad is invoked by stateify. 
func (s *tcpSack) afterLoad() { if s.enabled != nil { - if err := s.s.SetTCPSACKEnabled(*s.enabled); err != nil { + if err := s.stack.SetTCPSACKEnabled(*s.enabled); err != nil { panic(fmt.Sprintf("failed to set previous TCP sack configuration [%v]: %v", *s.enabled, err)) } } diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index 0ce9d30f1..ea0d94fce 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -26,13 +26,14 @@ func TestQuerySendBufferSize(t *testing.T) { ctx := context.Background() s := inet.NewTestStack() s.TCPSendBufSize = inet.TCPBufferSize{100, 200, 300} - tm := newTCPMem(s, s.TCPSendBufSize, tcpWMem) + tmi := &tcpMemInode{s: s, dir: tcpWMem} + tmf := &tcpMemFile{tcpMemInode: tmi} buf := make([]byte, 100) dst := usermem.BytesIOSequence(buf) - n, err := tm.DeprecatedPreadv(ctx, dst, 0) + n, err := tmf.Read(ctx, nil, dst, 0) if err != nil { - t.Fatalf("DeprecatedPreadv failed: %v", err) + t.Fatalf("Read failed: %v", err) } if got, want := string(buf[:n]), "100\t200\t300\n"; got != want { @@ -44,13 +45,14 @@ func TestQueryRecvBufferSize(t *testing.T) { ctx := context.Background() s := inet.NewTestStack() s.TCPRecvBufSize = inet.TCPBufferSize{100, 200, 300} - tm := newTCPMem(s, s.TCPRecvBufSize, tcpRMem) + tmi := &tcpMemInode{s: s, dir: tcpRMem} + tmf := &tcpMemFile{tcpMemInode: tmi} buf := make([]byte, 100) dst := usermem.BytesIOSequence(buf) - n, err := tm.DeprecatedPreadv(ctx, dst, 0) + n, err := tmf.Read(ctx, nil, dst, 0) if err != nil { - t.Fatalf("DeprecatedPreadv failed: %v", err) + t.Fatalf("Read failed: %v", err) } if got, want := string(buf[:n]), "100\t200\t300\n"; got != want { @@ -85,12 +87,13 @@ func TestConfigureSendBufferSize(t *testing.T) { s := inet.NewTestStack() for _, c := range cases { s.TCPSendBufSize = c.initial - tm := newTCPMem(s, c.initial, tcpWMem) + tmi := &tcpMemInode{s: s, dir: tcpWMem} + tmf := &tcpMemFile{tcpMemInode: tmi} // Write the values. 
src := usermem.BytesIOSequence([]byte(c.str)) - if n, err := tm.DeprecatedPwritev(ctx, src, 0); n != int64(len(c.str)) || err != nil { - t.Errorf("DeprecatedPwritev, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) + if n, err := tmf.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("Write, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) } // Read the values from the stack and check them. @@ -105,12 +108,13 @@ func TestConfigureRecvBufferSize(t *testing.T) { s := inet.NewTestStack() for _, c := range cases { s.TCPRecvBufSize = c.initial - tm := newTCPMem(s, c.initial, tcpRMem) + tmi := &tcpMemInode{s: s, dir: tcpRMem} + tmf := &tcpMemFile{tcpMemInode: tmi} // Write the values. src := usermem.BytesIOSequence([]byte(c.str)) - if n, err := tm.DeprecatedPwritev(ctx, src, 0); n != int64(len(c.str)) || err != nil { - t.Errorf("DeprecatedPwritev, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) + if n, err := tmf.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("Write, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) } // Read the values from the stack and check them. 
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 91bda8a95..41981a973 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -24,6 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" @@ -32,6 +33,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // getTaskMM returns t's MemoryManager. If getTaskMM succeeds, the MemoryManager's @@ -57,19 +59,19 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { type taskDir struct { ramfs.Dir - // t is the associated kernel task that owns this file. - t *kernel.Task + t *kernel.Task + pidns *kernel.PIDNamespace } +var _ fs.InodeOperations = (*taskDir)(nil) + // newTaskDir creates a new proc task entry. func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace, showSubtasks bool) *fs.Inode { - d := &taskDir{t: t} - // TODO: Set EUID/EGID based on dumpability. 
- d.InitDir(t, map[string]*fs.Inode{ + contents := map[string]*fs.Inode{ "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgFile(t, msrc, cmdlineExecArg), + "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), "comm": newComm(t, msrc), - "environ": newExecArgFile(t, msrc, environExecArg), + "environ": newExecArgInode(t, msrc, environExecArg), "exe": newExe(t, msrc), "fd": newFdDir(t, msrc), "fdinfo": newFdInfoDir(t, msrc), @@ -87,11 +89,18 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "statm": newStatm(t, msrc), "status": newStatus(t, msrc, pidns), "uid_map": newUIDMap(t, msrc), - }, fs.RootOwner, fs.FilePermsFromMode(0555)) + } if showSubtasks { - d.AddChild(t, "task", newSubtasks(t, msrc, pidns)) + contents["task"] = newSubtasks(t, msrc, pidns) } - return newFile(d, msrc, fs.SpecialDirectory, t) + + // TODO: Set EUID/EGID based on dumpability. + d := &taskDir{ + Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), + t: t, + pidns: pidns, + } + return newProcInode(d, msrc, fs.SpecialDirectory, t) } // subtasks represents a /proc/TID/task directory. @@ -100,15 +109,19 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace type subtasks struct { ramfs.Dir - t *kernel.Task - + t *kernel.Task pidns *kernel.PIDNamespace } +var _ fs.InodeOperations = (*subtasks)(nil) + func newSubtasks(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { - s := &subtasks{t: t, pidns: pidns} - s.InitDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newFile(s, msrc, fs.SpecialDirectory, t) + s := &subtasks{ + Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)), + t: t, + pidns: pidns, + } + return newProcInode(s, msrc, fs.SpecialDirectory, t) } // UnstableAttr returns unstable attributes of the subtasks. 
@@ -123,35 +136,52 @@ func (s *subtasks) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.Unstab return uattr, nil } -// Lookup loads an Inode in a task's subtask directory into a Dirent. -func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { - tid, err := strconv.ParseUint(p, 10, 32) - if err != nil { - return nil, syserror.ENOENT - } +// GetFile implements fs.InodeOperations.GetFile. +func (s *subtasks) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &subtasksFile{t: s.t, pidns: s.pidns}), nil +} - task := s.pidns.TaskWithID(kernel.ThreadID(tid)) - if task == nil { - return nil, syserror.ENOENT - } - if task.ThreadGroup() != s.t.ThreadGroup() { - return nil, syserror.ENOENT - } +// +stateify savable +type subtasksFile struct { + fsutil.DirFileOperations `state:"nosave"` - td := newTaskDir(task, dir.MountSource, s.pidns, false) - return fs.NewDirent(td, p), nil + t *kernel.Task + pidns *kernel.PIDNamespace } -// DeprecatedReaddir lists a task's subtask directory. -func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - tasks := s.t.ThreadGroup().MemberIDs(s.pidns) +// Readdir implements fs.FileOperations.Readdir. +func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySerializer) (int64, error) { + dirCtx := fs.DirCtx{ + Serializer: ser, + } + + // Note that unlike most Readdir implementations, the offset here is + // not an index into the subtasks, but rather the TID of the next + // subtask to emit. + offset := file.Offset() + + if offset == 0 { + // Serialize "." and "..". + root := fs.RootFromContext(ctx) + defer root.DecRef() + dot, dotdot := file.Dirent.GetDotAttrs(root) + if err := dirCtx.DirEmit(".", dot); err != nil { + return offset, err + } + if err := dirCtx.DirEmit("..", dotdot); err != nil { + return offset, err + } + } + + // Serialize tasks. 
+ tasks := f.t.ThreadGroup().MemberIDs(f.pidns) taskInts := make([]int, 0, len(tasks)) for _, tid := range tasks { taskInts = append(taskInts, int(tid)) } // Find the task to start at. - idx := sort.SearchInts(taskInts, offset) + idx := sort.SearchInts(taskInts, int(offset)) if idx == len(taskInts) { return offset, nil } @@ -163,12 +193,33 @@ func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, off attr := fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) if err := dirCtx.DirEmit(name, attr); err != nil { // Returned offset is next tid to serialize. - return tid, err + return int64(tid), err } } // We serialized them all. Next offset should be higher than last // serialized tid. - return tid + 1, nil + return int64(tid) + 1, nil +} + +var _ fs.FileOperations = (*subtasksFile)(nil) + +// Lookup loads an Inode in a task's subtask directory into a Dirent. +func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + tid, err := strconv.ParseUint(p, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + + task := s.pidns.TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return nil, syserror.ENOENT + } + if task.ThreadGroup() != s.t.ThreadGroup() { + return nil, syserror.ENOENT + } + + td := newTaskDir(task, dir.MountSource, s.pidns, false) + return fs.NewDirent(td, p), nil } // exe is an fs.InodeOperations symlink for the /proc/PID/exe file. 
@@ -181,9 +232,11 @@ type exe struct { } func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - exeSymlink := &exe{t: t} - exeSymlink.InitSymlink(t, fs.RootOwner, "") - return newFile(exeSymlink, msrc, fs.Symlink, t) + exeSymlink := &exe{ + Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + t: t, + } + return newProcInode(exeSymlink, msrc, fs.Symlink, t) } func (e *exe) executable() (d *fs.Dirent, err error) { @@ -231,55 +284,48 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { return n, nil } -// namespaceFile represents a file in the namespacefs, such as the files in -// /proc//ns. +// namespaceSymlink represents a symlink in the namespacefs, such as the files +// in /proc//ns. // // +stateify savable -type namespaceFile struct { +type namespaceSymlink struct { ramfs.Symlink t *kernel.Task } -func newNamespaceFile(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { - n := &namespaceFile{t: t} - n.InitSymlink(t, fs.RootOwner, "") - +func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { // TODO: Namespace symlinks should contain the namespace name and the // inode number for the namespace instance, so for example user:[123456]. We // currently fake the inode number by sticking the symlink inode in its // place. - n.Target = fmt.Sprintf("%s:[%d]", name, device.ProcDevice.NextIno()) - - return newFile(n, msrc, fs.Symlink, t) + target := fmt.Sprintf("%s:[%d]", name, device.ProcDevice.NextIno()) + n := &namespaceSymlink{ + Symlink: *ramfs.NewSymlink(t, fs.RootOwner, target), + t: t, + } + return newProcInode(n, msrc, fs.Symlink, t) } // Getlink implements fs.InodeOperations.Getlink. 
-func (n *namespaceFile) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { +func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { if !kernel.ContextCanTrace(ctx, n.t, false) { return nil, syserror.EACCES } // Create a new regular file to fake the namespace file. - node := &ramfs.Entry{} - node.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0777)) - sattr := fs.StableAttr{ - DeviceID: device.ProcDevice.DeviceID(), - InodeID: device.ProcDevice.NextIno(), - BlockSize: usermem.PageSize, - Type: fs.RegularFile, - } - return fs.NewDirent(fs.NewInode(node, inode.MountSource, sattr), n.Symlink.Target), nil + iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC) + return fs.NewDirent(newProcInode(iops, inode.MountSource, fs.RegularFile, nil), n.Symlink.Target), nil } func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - d := &ramfs.Dir{} - d.InitDir(t, map[string]*fs.Inode{ - "net": newNamespaceFile(t, msrc, "net"), - "pid": newNamespaceFile(t, msrc, "pid"), - "user": newNamespaceFile(t, msrc, "user"), - }, fs.RootOwner, fs.FilePermsFromMode(0511)) - return newFile(d, msrc, fs.SpecialDirectory, t) + contents := map[string]*fs.Inode{ + "net": newNamespaceSymlink(t, msrc, "net"), + "pid": newNamespaceSymlink(t, msrc, "pid"), + "user": newNamespaceSymlink(t, msrc, "user"), + } + d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0511)) + return newProcInode(d, msrc, fs.SpecialDirectory, t) } // mapsData implements seqfile.SeqSource for /proc/[pid]/maps. 
@@ -290,7 +336,7 @@ type mapsData struct { } func newMaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &mapsData{t}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &mapsData{t}), msrc, fs.SpecialFile, t) } func (md *mapsData) mm() *mm.MemoryManager { @@ -330,7 +376,7 @@ type smapsData struct { } func newSmaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &smapsData{t}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &smapsData{t}), msrc, fs.SpecialFile, t) } func (sd *smapsData) mm() *mm.MemoryManager { @@ -376,7 +422,7 @@ type taskStatData struct { } func newTaskStat(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool, pidns *kernel.PIDNamespace) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t) } // NeedsUpdate returns whether the generation is old or not. @@ -450,7 +496,7 @@ type statmData struct { } func newStatm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &statmData{t}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &statmData{t}), msrc, fs.SpecialFile, t) } // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. @@ -487,7 +533,7 @@ type statusData struct { } func newStatus(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &statusData{t, pidns}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &statusData{t, pidns}), msrc, fs.SpecialFile, t) } // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. 
@@ -552,7 +598,7 @@ type ioData struct { } func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - return newFile(seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) + return newProcInode(seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) } // NeedsUpdate returns whether the generation is old or not. @@ -590,25 +636,49 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se // // +stateify savable type comm struct { - ramfs.Entry + fsutil.SimpleFileInode t *kernel.Task } // newComm returns a new comm file. func newComm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - c := &comm{t: t} - c.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(c, msrc, fs.SpecialFile, t) + c := &comm{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(c, msrc, fs.SpecialFile, t) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (c *comm) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &commFile{t: c.t}), nil +} + +// +stateify savable +type commFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + + t *kernel.Task } -// DeprecatedPreadv reads the current command name. -func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +var _ fs.FileOperations = (*commFile)(nil) + +// Read implements fs.FileOperations.Read. 
+func (f *commFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } - buf := []byte(c.t.Name() + "\n") + buf := []byte(f.t.Name() + "\n") if offset >= int64(len(buf)) { return 0, io.EOF } @@ -621,25 +691,47 @@ func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, off // // +stateify savable type auxvec struct { - ramfs.Entry + fsutil.SimpleFileInode t *kernel.Task } // newAuxvec returns a new auxvec file. func newAuxvec(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { - a := &auxvec{t: t} - a.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0400)) - return newFile(a, msrc, fs.SpecialFile, t) + a := &auxvec{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(a, msrc, fs.SpecialFile, t) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (a *auxvec) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &auxvecFile{t: a.t}), nil +} + +// +stateify savable +type auxvecFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + + t *kernel.Task } -// DeprecatedPreadv reads the current auxiliary vector. -func (a *auxvec) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// Read implements fs.FileOperations.Read. 
+func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } - m, err := getTaskMM(a.t) + m, err := getTaskMM(f.t) if err != nil { return 0, err } diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index f70399686..815c40b7f 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -90,12 +90,13 @@ func newGIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { } func newIDMap(t *kernel.Task, msrc *fs.MountSource, gids bool) *fs.Inode { - imsf := &idMapSeqFile{seqfile.SeqFile{SeqSource: &idMapSeqSource{ - t: t, - gids: gids, - }}} - imsf.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0644)) - return newFile(imsf, msrc, fs.SpecialFile, t) + imsf := &idMapSeqFile{ + *seqfile.NewSeqFile(t, &idMapSeqSource{ + t: t, + gids: gids, + }), + } + return newProcInode(imsf, msrc, fs.SpecialFile, t) } func (imsf *idMapSeqFile) source() *idMapSeqSource { @@ -106,8 +107,8 @@ func (imsf *idMapSeqFile) source() *idMapSeqSource { // Linux 3.18, the limit is five lines." - user_namespaces(7) const maxIDMapLines = 5 -// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. -func (imsf *idMapSeqFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { +// Write implements fs.FileOperations.Write. +func (imsf *idMapSeqFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { // "In addition, the number of bytes written to the file must be less than // the system page size, and the write must be performed at the start of // the file ..." 
- user_namespaces(7) diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index 80c7ce0b4..40d0fd1fd 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -18,42 +18,64 @@ import ( "fmt" "io" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // uptime is a file containing the system uptime. // // +stateify savable type uptime struct { - ramfs.Entry + fsutil.SimpleFileInode // The "start time" of the sandbox. startTime ktime.Time } // newUptime returns a new uptime file. -func (p *proc) newUptime(ctx context.Context, msrc *fs.MountSource) *fs.Inode { +func newUptime(ctx context.Context, msrc *fs.MountSource) *fs.Inode { u := &uptime{ - startTime: ktime.NowFromContext(ctx), + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + startTime: ktime.NowFromContext(ctx), } - u.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) - return newFile(u, msrc, fs.SpecialFile, nil) + return newProcInode(u, msrc, fs.SpecialFile, nil) } -// DeprecatedPreadv reads the current uptime. -func (u *uptime) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +// GetFile implements fs.InodeOperations.GetFile. 
+func (u *uptime) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &uptimeFile{startTime: u.startTime}), nil +} + +// +stateify savable +type uptimeFile struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + + startTime ktime.Time +} + +// Read implements fs.FileOperations.Read. +func (f *uptimeFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } now := ktime.NowFromContext(ctx) // Pretend that we've spent zero time sleeping (second number). - s := []byte(fmt.Sprintf("%.2f 0.00\n", now.Sub(u.startTime).Seconds())) + s := []byte(fmt.Sprintf("%.2f 0.00\n", now.Sub(f.startTime).Seconds())) if offset >= int64(len(s)) { return 0, io.EOF } diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index a93ad6240..a476c9cce 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -6,8 +6,6 @@ go_library( name = "ramfs", srcs = [ "dir.go", - "file.go", - "ramfs.go", "socket.go", "symlink.go", "tree.go", @@ -15,14 +13,12 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/secio", + "//pkg/abi/linux", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", - "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 0a911b155..729f37694 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ 
b/pkg/sentry/fs/ramfs/dir.go @@ -18,10 +18,12 @@ import ( "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -47,7 +49,17 @@ type CreateOps struct { // // +stateify savable type Dir struct { - Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeIsDirTruncate `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeSimpleExtendedAttributes // CreateOps may be provided. // @@ -64,17 +76,23 @@ type Dir struct { children map[string]*fs.Inode // dentryMap is a sortedDentryMap containing entries for all children. - // Its entries ar kept up-to-date with d.children. + // Its entries are kept up-to-date with d.children. dentryMap *fs.SortedDentryMap } -// InitDir initializes a directory. -func (d *Dir) InitDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions) { - d.InitEntry(ctx, owner, perms) +var _ fs.InodeOperations = (*Dir)(nil) + +// NewDir returns a new Dir with the given contents and attributes. 
+func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions) *Dir { + d := &Dir{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, perms, linux.RAMFS_MAGIC), + } + if contents == nil { contents = make(map[string]*fs.Inode) } d.children = contents + // Build the entries map ourselves, rather than calling addChildLocked, // because it will be faster. entries := make(map[string]fs.DentAttr, len(contents)) @@ -88,6 +106,8 @@ func (d *Dir) InitDir(ctx context.Context, contents map[string]*fs.Inode, owner // Directories have an extra link, corresponding to '.'. d.AddLink() + + return d } // addChildLocked add the child inode, inheriting its reference. @@ -124,17 +144,24 @@ func (d *Dir) FindChild(name string) (*fs.Inode, bool) { return child, ok } +// Children returns the names and DentAttrs of all children. It can be used to +// implement Readdir for types that embed ramfs.Dir. +func (d *Dir) Children() ([]string, map[string]fs.DentAttr) { + d.mu.Lock() + defer d.mu.Unlock() + return d.dentryMap.GetAll() +} + // removeChildLocked attempts to remove an entry from this directory. -// This Entry's mutex must be held. It returns the removed Inode. func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, error) { inode, ok := d.children[name] if !ok { - return nil, ErrNotFound + return nil, syserror.EACCES } delete(d.children, name) d.dentryMap.Remove(name) - d.Entry.NotifyModification(ctx) + d.NotifyModification(ctx) // If the child was a subdirectory, then we must decrement this dir's // link count which was the child's ".." directory entry. @@ -143,7 +170,7 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er } // Update ctime. - inode.NotifyStatusChange(ctx) + inode.InodeOperations.NotifyStatusChange(ctx) // Given we're now removing this inode to the directory we must also // decrease its link count. Similarly it is increased in addChildLocked. 
@@ -152,8 +179,8 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er return inode, nil } -// RemoveEntry attempts to remove an entry from this directory. -func (d *Dir) RemoveEntry(ctx context.Context, name string) error { +// Remove removes the named non-directory. +func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { d.mu.Lock() defer d.mu.Unlock() inode, err := d.removeChildLocked(ctx, name) @@ -166,27 +193,23 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er return nil } -// Remove removes the named non-directory. -func (d *Dir) Remove(ctx context.Context, dir *fs.Inode, name string) error { - return d.RemoveEntry(ctx, name) -} - // RemoveDirectory removes the named directory. -func (d *Dir) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { +func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) error { d.mu.Lock() defer d.mu.Unlock() - n, err := d.walkLocked(ctx, name) + // Get the child and make sure it is empty (removal of a non-empty directory fails with ENOTEMPTY). + childInode, err := d.walkLocked(ctx, name) if err != nil { return err } - dirCtx := &fs.DirCtx{} - if _, err := n.HandleOps().DeprecatedReaddir(ctx, dirCtx, 0); err != nil { + if ok, err := hasChildren(ctx, childInode); err != nil { return err + } else if ok { + return syserror.ENOTEMPTY } - if len(dirCtx.DentAttrs()) > 0 { - return ErrNotEmpty - } + + // Child was empty. Proceed with removal. inode, err := d.removeChildLocked(ctx, name) if err != nil { return err @@ -195,11 +218,11 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er // Remove our reference on the inode. inode.DecRef() - return err + return nil } // Lookup loads an inode at p into a Dirent. 
-func (d *Dir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { +func (d *Dir) Lookup(ctx context.Context, _ *fs.Inode, p string) (*fs.Dirent, error) { d.mu.Lock() defer d.mu.Unlock() @@ -214,9 +237,9 @@ func (d *Dir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, return fs.NewDirent(inode, p), nil } -// walkLocked must be called with this Entry's mutex held. +// walkLocked must be called with d.mu held. func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { - d.Entry.NotifyAccess(ctx) + d.NotifyAccess(ctx) // Lookup a child node. if inode, ok := d.children[p]; ok { @@ -244,7 +267,7 @@ func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, make } d.addChildLocked(name, inode) - d.Entry.NotifyModification(ctx) + d.NotifyModification(ctx) return inode, nil } @@ -252,7 +275,7 @@ func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, make // Create creates a new Inode with the given name and returns its File. func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perms fs.FilePermissions) (*fs.File, error) { if d.CreateOps == nil || d.CreateOps.NewFile == nil { - return nil, ErrDenied + return nil, syserror.EACCES } inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { @@ -274,7 +297,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F // CreateLink returns a new link. func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { if d.CreateOps == nil || d.CreateOps.NewSymlink == nil { - return ErrDenied + return syserror.EACCES } _, err := d.createInodeOperationsCommon(ctx, newname, func() (*fs.Inode, error) { return d.NewSymlink(ctx, dir, oldname) @@ -292,10 +315,10 @@ func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inod // The link count will be incremented in addChildLocked. 
d.addChildLocked(name, target) - d.Entry.NotifyModification(ctx) + d.NotifyModification(ctx) // Update ctime. - target.NotifyStatusChange(ctx) + target.InodeOperations.NotifyStatusChange(ctx) return nil } @@ -303,7 +326,7 @@ func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inod // CreateDirectory returns a new subdirectory. func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { if d.CreateOps == nil || d.CreateOps.NewDir == nil { - return ErrDenied + return syserror.EACCES } _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewDir(ctx, dir, perms) @@ -316,7 +339,7 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p // Bind implements fs.InodeOperations.Bind. func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { - return nil, ErrDenied + return nil, syserror.EACCES } inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewBoundEndpoint(ctx, dir, ep, perms) @@ -335,7 +358,7 @@ func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport // CreateFifo implements fs.InodeOperations.CreateFifo. func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { if d.CreateOps == nil || d.CreateOps.NewFifo == nil { - return ErrDenied + return syserror.EACCES } _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewFifo(ctx, dir, perms) @@ -343,29 +366,125 @@ func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms return err } -func (d *Dir) readdirLocked(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - // Serialize the entries in dentryMap. 
- n, err := fs.GenericReaddir(dirCtx, d.dentryMap) +// GetFile implements fs.InodeOperations.GetFile. +func (d *Dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + return fs.NewFile(ctx, dirent, flags, &dirFileOperations{dir: d}), nil +} - // Touch the access time. - d.Entry.NotifyAccess(ctx) +// Rename implements fs.InodeOperations.Rename. +func (*Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName) +} +// dirFileOperations implements fs.FileOperations for a ramfs directory. +// +// +stateify savable +type dirFileOperations struct { + fsutil.DirFileOperations `state:"nosave"` + + // dirCursor contains the name of the last directory entry that was + // serialized. + dirCursor string + + // dir is the ramfs dir that this file corresponds to. + dir *Dir +} + +var _ fs.FileOperations = (*dirFileOperations)(nil) + +// Seek implements fs.FileOperations.Seek. +func (dfo *dirFileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &dfo.dirCursor) +} + +// IterateDir implements DirIterator.IterateDir. +func (dfo *dirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + dfo.dir.mu.Lock() + defer dfo.dir.mu.Unlock() + + n, err := fs.GenericReaddir(dirCtx, dfo.dir.dentryMap) return offset + n, err } -// DeprecatedReaddir emits the entries contained in this directory. -func (d *Dir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { - d.mu.Lock() - defer d.mu.Unlock() - return d.readdirLocked(ctx, dirCtx, offset) +// Readdir implements FileOperations.Readdir. 
+func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &dfo.dirCursor, + } + dfo.dir.mu.Lock() + dfo.dir.InodeSimpleAttributes.Unstable.AccessTime = ktime.NowFromContext(ctx) + dfo.dir.mu.Unlock() + return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset()) } -// DeprecatedPreadv always returns ErrIsDirectory -func (*Dir) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ErrIsDirectory +// hasChildren is a helper method that determines whether an arbitrary inode +// (not necessarily ramfs) has any children. +func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { + // Take an extra ref on inode which will be given to the dirent and + // dropped when that dirent is destroyed. + inode.IncRef() + d := fs.NewTransientDirent(inode) + defer d.DecRef() + + file, err := inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + return false, err + } + defer file.DecRef() + + ser := &fs.CollectEntriesSerializer{} + if err := file.Readdir(ctx, ser); err != nil { + return false, err + } + // We will always write "." and "..", so ignore those two. + if ser.Written() > 2 { + return true, nil + } + return false, nil } -// DeprecatedPwritev always returns ErrIsDirectory -func (*Dir) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ErrIsDirectory +// Rename renames from a *ramfs.Dir to another *ramfs.Dir. 
+func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string) error { + op, ok := oldParent.(*Dir) + if !ok { + return syserror.EXDEV + } + np, ok := newParent.(*Dir) + if !ok { + return syserror.EXDEV + } + + np.mu.Lock() + defer np.mu.Unlock() + + // Check whether the ramfs entry to be replaced is a non-empty directory. + if replaced, ok := np.children[newName]; ok { + if fs.IsDir(replaced.StableAttr) { + if ok, err := hasChildren(ctx, replaced); err != nil { + return err + } else if ok { + return syserror.ENOTEMPTY + } + } + } + + // Be careful, we may have already grabbed this mutex above. + if op != np { + op.mu.Lock() + defer op.mu.Unlock() + } + + // Do the swap. + n := op.children[oldName] + op.removeChildLocked(ctx, oldName) + np.addChildLocked(newName, n) + + // Update ctime. + n.InodeOperations.NotifyStatusChange(ctx) + + return nil } diff --git a/pkg/sentry/fs/ramfs/file.go b/pkg/sentry/fs/ramfs/file.go deleted file mode 100644 index b7fc98ffc..000000000 --- a/pkg/sentry/fs/ramfs/file.go +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package ramfs - -import ( - "io" - "sync" - - "gvisor.googlesource.com/gvisor/pkg/secio" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" -) - -// File represents a unique file. It uses a simple byte slice as storage, and -// thus should only be used for small files. -// -// A File is not mappable. -// -// +stateify savable -type File struct { - Entry - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // data tracks backing data for the file. - data []byte -} - -// InitFile initializes a file. -func (f *File) InitFile(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions) { - f.InitEntry(ctx, owner, perms) -} - -// UnstableAttr returns unstable attributes of this ramfs file. -func (f *File) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - f.mu.Lock() - defer f.mu.Unlock() - - uattr, _ := f.Entry.UnstableAttr(ctx, inode) - uattr.Size = int64(len(f.data)) - uattr.Usage = f.usageLocked() - - return uattr, nil -} - -// usageLocked returns the disk usage. Caller must hold f.mu. -func (f *File) usageLocked() int64 { - return int64(len(f.data)) -} - -// Append appends the given data. This is for internal use. -func (f *File) Append(data []byte) { - f.mu.Lock() - defer f.mu.Unlock() - f.data = append(f.data, data...) -} - -// Truncate truncates this node. -func (f *File) Truncate(ctx context.Context, inode *fs.Inode, l int64) error { - f.mu.Lock() - defer f.mu.Unlock() - if l < int64(len(f.data)) { - // Remove excess bytes. - f.data = f.data[:l] - return nil - } else if l > int64(len(f.data)) { - // Create a new slice with size l, and copy f.data into it. - d := make([]byte, l) - copy(d, f.data) - f.data = d - } - f.Entry.NotifyModification(ctx) - return nil -} - -// ReadAt implements io.ReaderAt. 
-func (f *File) ReadAt(data []byte, offset int64) (int, error) { - if offset < 0 { - return 0, ErrInvalidOp - } - if offset >= int64(len(f.data)) { - return 0, io.EOF - } - n := copy(data, f.data[offset:]) - // Did we read past the end? - if offset+int64(len(data)) >= int64(len(f.data)) { - return n, io.EOF - } - return n, nil -} - -// DeprecatedPreadv reads into a collection of slices from a given offset. -func (f *File) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - f.mu.Lock() - defer f.mu.Unlock() - if offset >= int64(len(f.data)) { - return 0, io.EOF - } - n, err := dst.CopyOut(ctx, f.data[offset:]) - if n > 0 { - f.Entry.NotifyAccess(ctx) - } - return int64(n), err -} - -// WriteAt implements io.WriterAt. -func (f *File) WriteAt(data []byte, offset int64) (int, error) { - if offset < 0 { - return 0, ErrInvalidOp - } - newLen := offset + int64(len(data)) - if newLen < 0 { - // Overflow. - return 0, syserror.EINVAL - } - if newLen > int64(len(f.data)) { - // Copy f.data into new slice with expanded length. - d := make([]byte, newLen) - copy(d, f.data) - f.data = d - } - return copy(f.data[offset:], data), nil -} - -// DeprecatedPwritev writes from a collection of slices at a given offset. -func (f *File) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - f.mu.Lock() - defer f.mu.Unlock() - n, err := src.CopyInTo(ctx, safemem.FromIOWriter{secio.NewOffsetWriter(f, offset)}) - if n > 0 { - f.Entry.NotifyModification(ctx) - } - return n, err -} diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go deleted file mode 100644 index d77688a34..000000000 --- a/pkg/sentry/fs/ramfs/ramfs.go +++ /dev/null @@ -1,441 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ramfs implements an in-memory file system that can be associated with -// any device. -package ramfs - -import ( - "errors" - "sync" - "syscall" - - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -var ( - // ErrInvalidOp indicates the operation is not valid. - ErrInvalidOp = errors.New("invalid operation") - - // ErrDenied indicates the operation was denied. - ErrDenied = errors.New("operation denied") - - // ErrNotFound indicates that a node was not found on a walk. - ErrNotFound = errors.New("node not found") - - // ErrCrossDevice indicates a cross-device link or rename. - ErrCrossDevice = errors.New("can't link across filesystems") - - // ErrIsDirectory indicates that the operation failed because - // the node is a directory. - ErrIsDirectory = errors.New("is a directory") - - // ErrNotDirectory indicates that the operation failed because - // the node is a not directory. - ErrNotDirectory = errors.New("not a directory") - - // ErrNotEmpty indicates that the operation failed because the - // directory is not empty. 
- ErrNotEmpty = errors.New("directory not empty") -) - -// Entry represents common internal state for file and directory nodes. -// This may be used by other packages to easily create ramfs files. -// -// +stateify savable -type Entry struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // unstable is unstable attributes. - unstable fs.UnstableAttr - - // xattrs are the extended attributes of the Entry. - xattrs map[string][]byte -} - -// InitEntry initializes an entry. -func (e *Entry) InitEntry(ctx context.Context, owner fs.FileOwner, p fs.FilePermissions) { - e.InitEntryWithAttr(ctx, fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: p, - // Always start unlinked. - Links: 0, - })) -} - -// InitEntryWithAttr initializes an entry with a complete set of attributes. -func (e *Entry) InitEntryWithAttr(ctx context.Context, uattr fs.UnstableAttr) { - e.unstable = uattr - e.xattrs = make(map[string][]byte) -} - -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (e *Entry) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - e.mu.Lock() - attr := e.unstable - e.mu.Unlock() - return attr, nil -} - -// Check implements fs.InodeOperations.Check. -func (*Entry) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// Getxattr implements fs.InodeOperations.Getxattr. -func (e *Entry) Getxattr(inode *fs.Inode, name string) ([]byte, error) { - // Hot path. Avoid defers. - e.mu.Lock() - value, ok := e.xattrs[name] - e.mu.Unlock() - if ok { - return value, nil - } - return nil, syserror.ENOATTR -} - -// Setxattr implements fs.InodeOperations.Setxattr. 
-func (e *Entry) Setxattr(inode *fs.Inode, name string, value []byte) error { - e.mu.Lock() - e.xattrs[name] = value - e.mu.Unlock() - return nil -} - -// Listxattr implements fs.InodeOperations.Listxattr. -func (e *Entry) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { - e.mu.Lock() - names := make(map[string]struct{}, len(e.xattrs)) - for name := range e.xattrs { - names[name] = struct{}{} - } - e.mu.Unlock() - return names, nil -} - -// GetFile returns a fs.File backed by the dirent argument and flags. -func (*Entry) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), nil -} - -// SetPermissions always sets the permissions. -func (e *Entry) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { - e.mu.Lock() - e.unstable.Perms = p - e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) - e.mu.Unlock() - return true -} - -// SetOwner always sets ownership. -func (e *Entry) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - e.mu.Lock() - if owner.UID.Ok() { - e.unstable.Owner.UID = owner.UID - } - if owner.GID.Ok() { - e.unstable.Owner.GID = owner.GID - } - e.mu.Unlock() - return nil -} - -// SetTimestamps sets the timestamps. -func (e *Entry) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - e.mu.Lock() - now := ktime.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATimeSetSystemTime { - e.unstable.AccessTime = now - } else { - e.unstable.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTimeSetSystemTime { - e.unstable.ModificationTime = now - } else { - e.unstable.ModificationTime = ts.MTime - } - } - e.unstable.StatusChangeTime = now - e.mu.Unlock() - return nil -} - -// NotifyStatusChange updates the status change time (ctime). 
-func (e *Entry) NotifyStatusChange(ctx context.Context) { - e.mu.Lock() - e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) - e.mu.Unlock() -} - -// StatusChangeTime returns the last status change time for this node. -func (e *Entry) StatusChangeTime() ktime.Time { - e.mu.Lock() - t := e.unstable.StatusChangeTime - e.mu.Unlock() - return t -} - -// NotifyModification updates the modification time and the status change time. -func (e *Entry) NotifyModification(ctx context.Context) { - e.mu.Lock() - now := ktime.NowFromContext(ctx) - e.unstable.ModificationTime = now - e.unstable.StatusChangeTime = now - e.mu.Unlock() -} - -// ModificationTime returns the last modification time for this node. -func (e *Entry) ModificationTime() ktime.Time { - e.mu.Lock() - t := e.unstable.ModificationTime - e.mu.Unlock() - return t -} - -// NotifyAccess updates the access time. -func (e *Entry) NotifyAccess(ctx context.Context) { - e.mu.Lock() - now := ktime.NowFromContext(ctx) - e.unstable.AccessTime = now - e.mu.Unlock() -} - -// AccessTime returns the last access time for this node. -func (e *Entry) AccessTime() ktime.Time { - e.mu.Lock() - t := e.unstable.AccessTime - e.mu.Unlock() - return t -} - -// Permissions returns permissions on this entry. -func (e *Entry) Permissions() fs.FilePermissions { - e.mu.Lock() - p := e.unstable.Perms - e.mu.Unlock() - return p -} - -// Lookup is not supported by default. -func (*Entry) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { - return nil, ErrInvalidOp -} - -// Create is not supported by default. -func (*Entry) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { - return nil, ErrInvalidOp -} - -// CreateLink is not supported by default. -func (*Entry) CreateLink(context.Context, *fs.Inode, string, string) error { - return ErrInvalidOp -} - -// CreateHardLink is not supported by default. 
-func (*Entry) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { - return ErrInvalidOp -} - -// IsVirtual returns true. -func (*Entry) IsVirtual() bool { - return true -} - -// CreateDirectory is not supported by default. -func (*Entry) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return ErrInvalidOp -} - -// Bind is not supported by default. -func (*Entry) Bind(context.Context, *fs.Inode, string, transport.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) { - return nil, ErrInvalidOp -} - -// CreateFifo implements fs.InodeOperations.CreateFifo. CreateFifo is not supported by -// default. -func (*Entry) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return ErrInvalidOp -} - -// Remove is not supported by default. -func (*Entry) Remove(context.Context, *fs.Inode, string) error { - return ErrInvalidOp -} - -// RemoveDirectory is not supported by default. -func (*Entry) RemoveDirectory(context.Context, *fs.Inode, string) error { - return ErrInvalidOp -} - -// StatFS always returns ENOSYS. -func (*Entry) StatFS(context.Context) (fs.Info, error) { - return fs.Info{}, syscall.ENOSYS -} - -// Rename implements fs.InodeOperations.Rename. -func (e *Entry) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { - return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName) -} - -// Rename renames from a *ramfs.Dir to another *ramfs.Dir. -func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string) error { - op, ok := oldParent.(*Dir) - if !ok { - return ErrCrossDevice - } - np, ok := newParent.(*Dir) - if !ok { - return ErrCrossDevice - } - - np.mu.Lock() - defer np.mu.Unlock() - - // Check whether the ramfs entry to be replaced is a non-empty directory. 
- if replaced, ok := np.children[newName]; ok { - if fs.IsDir(replaced.StableAttr) { - // FIXME: simplify by pinning children of ramfs-backed directories - // in the Dirent tree: this allows us to generalize ramfs operations without - // relying on an implementation of Readdir (which may do anything, like require - // that the file be open ... which would be reasonable). - dirCtx := &fs.DirCtx{} - _, err := replaced.HandleOps().DeprecatedReaddir(ctx, dirCtx, 0) - if err != nil { - return err - } - attrs := dirCtx.DentAttrs() - - // ramfs-backed directories should not contain "." and "..", but we do this - // just in case. - delete(attrs, ".") - delete(attrs, "..") - - // If the directory to be replaced is not empty, reject the rename. - if len(attrs) != 0 { - return ErrNotEmpty - } - } - } - - // Be careful, we may have already grabbed this mutex above. - if op != np { - op.mu.Lock() - defer op.mu.Unlock() - } - - // Do the swap. - n := op.children[oldName] - op.removeChildLocked(ctx, oldName) - np.addChildLocked(newName, n) - - // Update ctime. - n.NotifyStatusChange(ctx) - - return nil -} - -// Truncate is not supported by default. -func (*Entry) Truncate(context.Context, *fs.Inode, int64) error { - return ErrInvalidOp -} - -// Readlink always returns ENOLINK. -func (*Entry) Readlink(context.Context, *fs.Inode) (string, error) { - return "", syscall.ENOLINK -} - -// Getlink always returns ENOLINK. -func (*Entry) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { - return nil, syscall.ENOLINK -} - -// Release is a no-op. -func (e *Entry) Release(context.Context) {} - -// AddLink implements InodeOperationss.AddLink. -func (e *Entry) AddLink() { - e.mu.Lock() - e.unstable.Links++ - e.mu.Unlock() -} - -// DropLink implements InodeOperationss.DropLink. -func (e *Entry) DropLink() { - e.mu.Lock() - e.unstable.Links-- - e.mu.Unlock() -} - -// DeprecatedReaddir is not supported by default. 
-func (*Entry) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) { - return 0, ErrNotDirectory -} - -// DeprecatedPreadv always returns ErrInvalidOp. -func (*Entry) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ErrInvalidOp -} - -// DeprecatedPwritev always returns ErrInvalidOp. -func (*Entry) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { - return 0, ErrInvalidOp -} - -// DeprecatedFsync is a noop. -func (*Entry) DeprecatedFsync() error { - // Ignore, this is in memory. - return nil -} - -// DeprecatedFlush always returns nil. -func (*Entry) DeprecatedFlush() error { - return nil -} - -// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. -func (*Entry) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { - return nil, false -} - -func init() { - // Register ramfs errors. - syserror.AddErrorTranslation(ErrInvalidOp, syscall.EINVAL) - syserror.AddErrorTranslation(ErrDenied, syscall.EACCES) - syserror.AddErrorTranslation(ErrNotFound, syscall.ENOENT) - syserror.AddErrorTranslation(ErrCrossDevice, syscall.EXDEV) - syserror.AddErrorTranslation(ErrIsDirectory, syscall.EISDIR) - syserror.AddErrorTranslation(ErrNotDirectory, syscall.ENOTDIR) - syserror.AddErrorTranslation(ErrNotEmpty, syscall.ENOTEMPTY) -} diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 8c81478c8..2c1295897 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -15,25 +15,42 @@ package ramfs import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // Socket represents a socket. 
// // +stateify savable type Socket struct { - Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeSimpleExtendedAttributes // ep is the bound endpoint. ep transport.BoundEndpoint } -// InitSocket initializes a socket. -func (s *Socket) InitSocket(ctx context.Context, ep transport.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions) { - s.InitEntry(ctx, owner, perms) - s.ep = ep +var _ fs.InodeOperations = (*Socket)(nil) + +// NewSocket returns a new Socket. +func NewSocket(ctx context.Context, ep transport.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions) *Socket { + return &Socket{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, perms, linux.SOCKFS_MAGIC), + ep: ep, + } } // BoundEndpoint returns the socket data. @@ -42,3 +59,24 @@ func (s *Socket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint { // care about the path argument. return s.ep } + +// GetFile implements fs.FileOperations.GetFile. 
+func (s *Socket) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &socketFileOperations{}), nil +} + +// +stateify savable +type socketFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoRead `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` +} + +var _ fs.FileOperations = (*socketFileOperations)(nil) diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index a21fac2c7..47dae380b 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -15,44 +15,55 @@ package ramfs import ( - "sync" - + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) // Symlink represents a symlink. // // +stateify savable type Symlink struct { - Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` - mu sync.Mutex `state:"nosave"` + fsutil.InodeSimpleAttributes + fsutil.InodeSimpleExtendedAttributes // Target is the symlink target. Target string } -// InitSymlink initializes a symlink, pointing to the given target. -// A symlink is assumed to always have permissions 0777. 
-func (s *Symlink) InitSymlink(ctx context.Context, owner fs.FileOwner, target string) { - s.InitEntry(ctx, owner, fs.FilePermsFromMode(0777)) - s.Target = target +var _ fs.InodeOperations = (*Symlink)(nil) + +// NewSymlink returns a new Symlink. +func NewSymlink(ctx context.Context, owner fs.FileOwner, target string) *Symlink { + // A symlink is assumed to always have permissions 0777. + return &Symlink{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(0777), linux.RAMFS_MAGIC), + Target: target, + } } // UnstableAttr returns all attributes of this ramfs symlink. func (s *Symlink) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - uattr, _ := s.Entry.UnstableAttr(ctx, inode) + uattr, err := s.InodeSimpleAttributes.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } uattr.Size = int64(len(s.Target)) uattr.Usage = uattr.Size return uattr, nil } -// Check implements InodeOperations.Check. -func (s *Symlink) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - // SetPermissions on a symlink is always rejected. func (s *Symlink) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool { return false @@ -60,10 +71,7 @@ func (s *Symlink) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) // Readlink reads the symlink value. func (s *Symlink) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { - s.mu.Lock() - defer s.mu.Unlock() - - s.Entry.NotifyAccess(ctx) + s.NotifyAccess(ctx) return s.Target, nil } @@ -72,3 +80,24 @@ func (s *Symlink) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { func (*Symlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { return nil, fs.ErrResolveViaReadlink } + +// GetFile implements fs.FileOperations.GetFile. 
+func (s *Symlink) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &symlinkFileOperations{}), nil +} + +// +stateify savable +type symlinkFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoRead `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` +} + +var _ fs.FileOperations = (*symlinkFileOperations)(nil) diff --git a/pkg/sentry/fs/ramfs/test/BUILD b/pkg/sentry/fs/ramfs/test/BUILD deleted file mode 100644 index 187eac49d..000000000 --- a/pkg/sentry/fs/ramfs/test/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -package(licenses = ["notice"]) # Apache 2.0 - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "test", - testonly = 1, - srcs = ["test.go"], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/context", - "//pkg/sentry/fs", - "//pkg/sentry/fs/ramfs", - ], -) diff --git a/pkg/sentry/fs/ramfs/test/test.go b/pkg/sentry/fs/ramfs/test/test.go deleted file mode 100644 index 11bff7729..000000000 --- a/pkg/sentry/fs/ramfs/test/test.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -// Package test provides a simple ramfs-based filesystem for use in testing. -package test - -import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" -) - -// Dir is a simple ramfs.Dir that supports save/restore as-is. -type Dir struct { - ramfs.Dir -} - -// NewDir returns a simple ramfs directory with the passed contents. -func NewDir(ctx context.Context, contents map[string]*fs.Inode, perms fs.FilePermissions) *Dir { - d := &Dir{} - d.InitDir(ctx, contents, fs.RootOwner, perms) - return d -} - -// File is a simple ramfs.File that supports save/restore as-is. -type File struct { - ramfs.File -} - -// NewFile returns a simple ramfs File. -func NewFile(ctx context.Context, perms fs.FilePermissions) *File { - f := &File{} - f.InitFile(ctx, fs.RootOwner, perms) - return f -} diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index 29a70f698..f6d5ffdec 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -60,8 +60,7 @@ func makeSubdir(ctx context.Context, msrc *fs.MountSource, root *Dir, subdir str // emptyDir returns an empty *ramfs.Dir that is traversable but not writable. 
func emptyDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - dir := &Dir{} - dir.InitDir(ctx, make(map[string]*fs.Inode), fs.RootOwner, fs.FilePermsFromMode(0555)) + dir := NewDir(ctx, make(map[string]*fs.Inode), fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(dir, msrc, fs.StableAttr{ DeviceID: anon.PseudoDevice.DeviceID(), InodeID: anon.PseudoDevice.NextIno(), diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index 54df2143c..8bee9cfc1 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -22,7 +22,7 @@ import ( ) func TestMakeDirectoryTree(t *testing.T) { - mount := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + mount := fs.NewPseudoMountSource() for _, test := range []struct { name string diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 5ba23d5da..7de928e16 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -13,12 +13,13 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel", "//pkg/sentry/usermem", - "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index e64aa0edc..8b728a4e4 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -16,43 +16,50 @@ package sys import ( "fmt" - "io" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" ) // +stateify savable type cpunum 
struct { - ramfs.Entry + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeStaticFileGetter + + // k is the system kernel. + k *kernel.Kernel } -func (c *cpunum) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { - if offset < 0 { - return 0, syserror.EINVAL - } +var _ fs.InodeOperations = (*cpunum)(nil) +func newPossible(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + var maxCore uint k := kernel.KernelFromContext(ctx) - if k == nil { - return 0, io.EOF + if k != nil { + maxCore = k.ApplicationCores() - 1 } + contents := []byte(fmt.Sprintf("0-%d\n", maxCore)) - str := []byte(fmt.Sprintf("0-%d\n", k.ApplicationCores()-1)) - if offset >= int64(len(str)) { - return 0, io.EOF + c := &cpunum{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.SYSFS_MAGIC), + InodeStaticFileGetter: fsutil.InodeStaticFileGetter{ + Contents: contents, + }, } - - n, err := dst.CopyOut(ctx, str[offset:]) - return int64(n), err -} - -func newPossible(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - c := &cpunum{} - c.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) return newFile(c, msrc) } diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 5ce33f87f..301fef038 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -24,6 +24,8 @@ import ( // +stateify savable type filesystem struct{} +var _ fs.Filesystem = (*filesystem)(nil) + func init() { fs.RegisterFilesystem(&filesystem{}) } diff --git 
a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index 7cc1942c7..c5b56fe69 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -22,13 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// sys is a root sys node. -// -// +stateify savable -type sys struct { - ramfs.Dir -} - func newFile(node fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { sattr := fs.StableAttr{ DeviceID: sysfsDevice.DeviceID(), @@ -40,8 +33,7 @@ func newFile(node fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { } func newDir(ctx context.Context, msrc *fs.MountSource, contents map[string]*fs.Inode) *fs.Inode { - d := &sys{} - d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(d, msrc, fs.StableAttr{ DeviceID: sysfsDevice.DeviceID(), InodeID: sysfsDevice.NextIno(), diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 7423e816c..b26466b9d 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -33,12 +33,12 @@ import ( // // +stateify savable type TimerOperations struct { - fsutil.ZeroSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` + fsutil.FileZeroSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` events waiter.Queue `state:"zerovalue"` timer *ktime.Timer diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 9065cdd5d..14c7a9e62 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -23,11 +23,13 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", 
"//pkg/sentry/memmap", "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/syserror", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 1f9d69909..2c1eb0fd2 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -28,13 +28,13 @@ import ( // // +stateify savable type regularFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.NoopRelease `state:"nosave"` - fsutil.GenericSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoopFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` // iops is the InodeOperations of a regular tmpfs file. 
It is // guaranteed to be the same as file.Dirent.Inode.InodeOperations, diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index 02da9af82..e7bbdc404 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -52,19 +52,19 @@ func TestGrow(t *testing.T) { abuf := bytes.Repeat([]byte{'a'}, 68) n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0) if n != int64(len(abuf)) || err != nil { - t.Fatalf("DeprecatedPwritev got (%d, %v) want (%d, nil)", n, err, len(abuf)) + t.Fatalf("Pwritev got (%d, %v) want (%d, nil)", n, err, len(abuf)) } bbuf := bytes.Repeat([]byte{'b'}, 856) n, err = f.Pwritev(ctx, usermem.BytesIOSequence(bbuf), 68) if n != int64(len(bbuf)) || err != nil { - t.Fatalf("DeprecatedPwritev got (%d, %v) want (%d, nil)", n, err, len(bbuf)) + t.Fatalf("Pwritev got (%d, %v) want (%d, nil)", n, err, len(bbuf)) } rbuf := make([]byte, len(abuf)+len(bbuf)) n, err = f.Preadv(ctx, usermem.BytesIOSequence(rbuf), 0) if n != int64(len(rbuf)) || err != nil { - t.Fatalf("DeprecatedPreadv got (%d, %v) want (%d, nil)", n, err, len(rbuf)) + t.Fatalf("Preadv got (%d, %v) want (%d, nil)", n, err, len(rbuf)) } if want := append(abuf, bbuf...); !bytes.Equal(rbuf, want) { diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 88f85b85a..caa3220ee 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -50,6 +50,8 @@ const ( // +stateify savable type Filesystem struct{} +var _ fs.Filesystem = (*Filesystem)(nil) + func init() { fs.RegisterFilesystem(&Filesystem{}) } diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index ca2b4aabb..42d4bc76f 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime 
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -46,11 +47,13 @@ import ( // // +stateify savable type fileInodeOperations struct { - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + + fsutil.InodeSimpleExtendedAttributes // kernel is used to allocate platform memory that stores the file's contents. kernel *kernel.Kernel @@ -62,10 +65,10 @@ type fileInodeOperations struct { // attr contains the unstable metadata for the file. // - // attr is protected by attrMu. attr.Unstable.Size is protected by both - // attrMu and dataMu; reading it requires locking either mutex, while - // mutating it requires locking both. - attr fsutil.InMemoryAttributes + // attr is protected by attrMu. attr.Size is protected by both attrMu + // and dataMu; reading it requires locking either mutex, while mutating + // it requires locking both. + attr fs.UnstableAttr mapsMu sync.Mutex `state:"nosave"` @@ -83,12 +86,12 @@ type fileInodeOperations struct { data fsutil.FileRangeSet } +var _ fs.InodeOperations = (*fileInodeOperations)(nil) + // NewInMemoryFile returns a new file backed by p.Memory(). 
func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr, k *kernel.Kernel) fs.InodeOperations { return &fileInodeOperations{ - attr: fsutil.InMemoryAttributes{ - Unstable: uattr, - }, + attr: uattr, kernel: k, memUsage: usage, } @@ -121,71 +124,56 @@ func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags f // UnstableAttr returns unstable attributes of this tmpfs file. func (f *fileInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { f.attrMu.Lock() - defer f.attrMu.Unlock() f.dataMu.RLock() - defer f.dataMu.RUnlock() - attr := f.attr.Unstable + attr := f.attr attr.Usage = int64(f.data.Span()) + f.dataMu.RUnlock() + f.attrMu.Unlock() return attr, nil } -// Getxattr implements fs.InodeOperations.Getxattr. -func (f *fileInodeOperations) Getxattr(inode *fs.Inode, name string) ([]byte, error) { - f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.Getxattr(name) -} - -// Setxattr implements fs.InodeOperations.Setxattr. -func (f *fileInodeOperations) Setxattr(inode *fs.Inode, name string, value []byte) error { - f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.Setxattr(name, value) -} - -// Listxattr implements fs.InodeOperations.Listxattr. -func (f *fileInodeOperations) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { - f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.Listxattr() -} - // Check implements fs.InodeOperations.Check. func (f *fileInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { return fs.ContextCanAccessFile(ctx, inode, p) } // SetPermissions implements fs.InodeOperations.SetPermissions. 
-func (f *fileInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { +func (f *fileInodeOperations) SetPermissions(ctx context.Context, _ *fs.Inode, p fs.FilePermissions) bool { f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.SetPermissions(ctx, p) + f.attr.SetPermissions(ctx, p) + f.attrMu.Unlock() + return true } // SetTimestamps implements fs.InodeOperations.SetTimestamps. -func (f *fileInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { +func (f *fileInodeOperations) SetTimestamps(ctx context.Context, _ *fs.Inode, ts fs.TimeSpec) error { f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.SetTimestamps(ctx, ts) + f.attr.SetTimestamps(ctx, ts) + f.attrMu.Unlock() + return nil } // SetOwner implements fs.InodeOperations.SetOwner. -func (f *fileInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { +func (f *fileInodeOperations) SetOwner(ctx context.Context, _ *fs.Inode, owner fs.FileOwner) error { f.attrMu.Lock() - defer f.attrMu.Unlock() - return f.attr.SetOwner(ctx, owner) + f.attr.SetOwner(ctx, owner) + f.attrMu.Unlock() + return nil } // Truncate implements fs.InodeOperations.Truncate. -func (f *fileInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { +func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size int64) error { f.attrMu.Lock() defer f.attrMu.Unlock() f.dataMu.Lock() - oldSize := f.attr.Unstable.Size + oldSize := f.attr.Size if oldSize != size { - f.attr.Unstable.Size = size - f.attr.TouchModificationTime(ctx) + f.attr.Size = size + // Update mtime and ctime. + now := ktime.NowFromContext(ctx) + f.attr.ModificationTime = now + f.attr.StatusChangeTime = now } f.dataMu.Unlock() @@ -220,21 +208,21 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, siz // AddLink implements fs.InodeOperations.AddLink. 
func (f *fileInodeOperations) AddLink() { f.attrMu.Lock() - f.attr.Unstable.Links++ + f.attr.Links++ f.attrMu.Unlock() } // DropLink implements fs.InodeOperations.DropLink. func (f *fileInodeOperations) DropLink() { f.attrMu.Lock() - f.attr.Unstable.Links-- + f.attr.Links-- f.attrMu.Unlock() } // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. func (f *fileInodeOperations) NotifyStatusChange(ctx context.Context) { f.attrMu.Lock() - f.attr.TouchStatusChangeTime(ctx) + f.attr.StatusChangeTime = ktime.NowFromContext(ctx) f.attrMu.Unlock() } @@ -264,7 +252,7 @@ func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, // TODO: Separate out f.attr.Size and use atomics instead of // f.dataMu. f.dataMu.RLock() - size := f.attr.Unstable.Size + size := f.attr.Size f.dataMu.RUnlock() if offset >= size { return 0, io.EOF @@ -273,7 +261,7 @@ func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, n, err := dst.CopyOutFrom(ctx, &fileReadWriter{f, offset}) // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). f.attrMu.Lock() - f.attr.TouchAccessTime(ctx) + f.attr.AccessTime = ktime.NowFromContext(ctx) f.attrMu.Unlock() return n, err } @@ -287,7 +275,9 @@ func (f *fileInodeOperations) write(ctx context.Context, src usermem.IOSequence, f.attrMu.Lock() defer f.attrMu.Unlock() // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). - f.attr.TouchModificationTime(ctx) + now := ktime.NowFromContext(ctx) + f.attr.ModificationTime = now + f.attr.StatusChangeTime = now return src.CopyInTo(ctx, &fileReadWriter{f, offset}) } @@ -302,10 +292,10 @@ func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { defer rw.f.dataMu.RUnlock() // Compute the range to read. 
- if rw.offset >= rw.f.attr.Unstable.Size { + if rw.offset >= rw.f.attr.Size { return 0, io.EOF } - end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.f.attr.Unstable.Size) + end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.f.attr.Size) if end == rw.offset { // dsts.NumBytes() == 0? return 0, nil } @@ -371,8 +361,8 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) defer func() { // If the write ends beyond the file's previous size, it causes the // file to grow. - if rw.offset > rw.f.attr.Unstable.Size { - rw.f.attr.Unstable.Size = rw.offset + if rw.offset > rw.f.attr.Size { + rw.f.attr.Size = rw.offset } }() @@ -450,9 +440,9 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional f.dataMu.Lock() defer f.dataMu.Unlock() - // Constrain translations to f.attr.Unstable.Size (rounded up) to prevent + // Constrain translations to f.attr.Size (rounded up) to prevent // translation to pages that may be concurrently truncated. 
- pgend := fs.OffsetPageEnd(f.attr.Unstable.Size) + pgend := fs.OffsetPageEnd(f.attr.Size) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 40a8c4b1e..a0277a132 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -19,12 +19,14 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) var fsInfo = fs.Info{ @@ -39,32 +41,54 @@ var fsInfo = fs.Info{ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { op, ok := oldParent.InodeOperations.(*Dir) if !ok { - return ramfs.ErrCrossDevice + return syserror.EXDEV } np, ok := newParent.InodeOperations.(*Dir) if !ok { - return ramfs.ErrCrossDevice + return syserror.EXDEV } - return ramfs.Rename(ctx, &op.Dir, oldName, &np.Dir, newName) + return ramfs.Rename(ctx, op.ramfsDir, oldName, np.ramfsDir, newName) } // Dir is a directory. 
// // +stateify savable type Dir struct { - ramfs.Dir + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeIsDirTruncate `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + // Ideally this would be embedded, so that we "inherit" all of the + // InodeOperations implemented by ramfs.Dir for free. + // + // However, ramfs.dirFileOperations stores a pointer to a ramfs.Dir, + // and our save/restore package does not allow saving a pointer to an + // embedded field elsewhere. + // + // Thus, we must make the ramfs.Dir is a field, and we delegate all the + // InodeOperation methods to it. + ramfsDir *ramfs.Dir // kernel is used to allocate platform memory as storage for tmpfs Files. kernel *kernel.Kernel } +var _ fs.InodeOperations = (*Dir)(nil) + // NewDir returns a new directory. func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource, kernel *kernel.Kernel) *fs.Inode { - d := &Dir{kernel: kernel} - d.InitDir(ctx, contents, owner, perms) + d := &Dir{ + ramfsDir: ramfs.NewDir(ctx, contents, owner, perms), + kernel: kernel, + } // Manually set the CreateOps. - d.CreateOps = d.newCreateOps() + d.ramfsDir.CreateOps = d.newCreateOps() return fs.NewInode(d, msrc, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), @@ -77,7 +101,107 @@ func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwn // afterLoad is invoked by stateify. func (d *Dir) afterLoad() { // Per NewDir, manually set the CreateOps. - d.Dir.CreateOps = d.newCreateOps() + d.ramfsDir.CreateOps = d.newCreateOps() +} + +// GetFile implements fs.InodeOperations.GetFile. 
+func (d *Dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return d.ramfsDir.GetFile(ctx, dirent, flags) +} + +// AddLink implements fs.InodeOperations.AddLink. +func (d *Dir) AddLink() { + d.ramfsDir.AddLink() +} + +// DropLink implements fs.InodeOperations.DropLink. +func (d *Dir) DropLink() { + d.ramfsDir.DropLink() +} + +// Bind implements fs.InodeOperations.Bind. +func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perms fs.FilePermissions) (*fs.Dirent, error) { + return d.ramfsDir.Bind(ctx, dir, name, ep, perms) +} + +// Create implements fs.InodeOperations.Create. +func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perms fs.FilePermissions) (*fs.File, error) { + return d.ramfsDir.Create(ctx, dir, name, flags, perms) +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { + return d.ramfsDir.CreateLink(ctx, dir, oldname, newname) +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { + return d.ramfsDir.CreateHardLink(ctx, dir, target, name) +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { + return d.ramfsDir.CreateDirectory(ctx, dir, name, perms) +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { + return d.ramfsDir.CreateFifo(ctx, dir, name, perms) +} + +// Getxattr implements fs.InodeOperations.Getxattr. 
+func (d *Dir) Getxattr(i *fs.Inode, name string) ([]byte, error) { + return d.ramfsDir.Getxattr(i, name) +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (d *Dir) Setxattr(i *fs.Inode, name string, value []byte) error { + return d.ramfsDir.Setxattr(i, name, value) +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (d *Dir) Listxattr(i *fs.Inode) (map[string]struct{}, error) { + return d.ramfsDir.Listxattr(i) +} + +// Lookup implements fs.InodeOperations.Lookup. +func (d *Dir) Lookup(ctx context.Context, i *fs.Inode, p string) (*fs.Dirent, error) { + return d.ramfsDir.Lookup(ctx, i, p) +} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (d *Dir) NotifyStatusChange(ctx context.Context) { + d.ramfsDir.NotifyStatusChange(ctx) +} + +// Remove implements fs.InodeOperations.Remove. +func (d *Dir) Remove(ctx context.Context, i *fs.Inode, name string) error { + return d.ramfsDir.Remove(ctx, i, name) +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (d *Dir) RemoveDirectory(ctx context.Context, i *fs.Inode, name string) error { + return d.ramfsDir.RemoveDirectory(ctx, i, name) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (d *Dir) UnstableAttr(ctx context.Context, i *fs.Inode) (fs.UnstableAttr, error) { + return d.ramfsDir.UnstableAttr(ctx, i) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (d *Dir) SetPermissions(ctx context.Context, i *fs.Inode, p fs.FilePermissions) bool { + return d.ramfsDir.SetPermissions(ctx, i, p) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (d *Dir) SetOwner(ctx context.Context, i *fs.Inode, owner fs.FileOwner) error { + return d.ramfsDir.SetOwner(ctx, i, owner) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. 
+func (d *Dir) SetTimestamps(ctx context.Context, i *fs.Inode, ts fs.TimeSpec) error { + return d.ramfsDir.SetTimestamps(ctx, i, ts) } // newCreateOps builds the custom CreateOps for this Dir. @@ -132,8 +256,7 @@ type Symlink struct { // NewSymlink returns a new symlink with the provided permissions. func NewSymlink(ctx context.Context, target string, owner fs.FileOwner, msrc *fs.MountSource) *fs.Inode { - s := &Symlink{} - s.InitSymlink(ctx, owner, target) + s := &Symlink{Symlink: *ramfs.NewSymlink(ctx, owner, target)} return fs.NewInode(s, msrc, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), @@ -157,12 +280,12 @@ func (s *Symlink) StatFS(context.Context) (fs.Info, error) { // +stateify savable type Socket struct { ramfs.Socket + fsutil.InodeNotTruncatable `state:"nosave"` } // NewSocket returns a new socket with the provided permissions. func NewSocket(ctx context.Context, socket transport.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { - s := &Socket{} - s.InitSocket(ctx, socket, owner, perms) + s := &Socket{Socket: *ramfs.NewSocket(ctx, socket, owner, perms)} return fs.NewInode(s, msrc, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), @@ -185,15 +308,22 @@ func (s *Socket) StatFS(context.Context) (fs.Info, error) { // // +stateify savable type Fifo struct { - ramfs.Entry + fs.InodeOperations } // NewFifo creates a new named pipe. func NewFifo(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { - f := &Fifo{} - f.InitEntry(ctx, owner, perms) - iops := pipe.NewInodeOperations(f, pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)) - return fs.NewInode(iops, msrc, fs.StableAttr{ + // First create a pipe. + p := pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) + + // Build pipe InodeOperations. 
+ iops := pipe.NewInodeOperations(ctx, perms, p) + + // Wrap the iops with our Fifo. + fifoIops := &Fifo{iops} + + // Build a new Inode. + return fs.NewInode(fifoIops, msrc, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), BlockSize: usermem.PageSize, diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 2b45069a6..011cb6955 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -7,7 +7,6 @@ go_library( srcs = [ "dir.go", "fs.go", - "inode.go", "line_discipline.go", "master.go", "queue.go", @@ -25,7 +24,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index e32b05c1d..485cdb456 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -52,13 +52,17 @@ import ( // // +stateify savable type dirInodeOperations struct { - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotRenameable `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes // msrc is the super block this directory is on. // @@ -68,9 +72,6 @@ type dirInodeOperations struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` - // attr contains the UnstableAttrs. - attr fsutil.InMemoryAttributes - // master is the master PTY inode. 
master *fs.Inode @@ -97,15 +98,10 @@ var _ fs.InodeOperations = (*dirInodeOperations)(nil) // newDir creates a new dir with a ptmx file and no terminals. func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode { d := &dirInodeOperations{ - attr: fsutil.InMemoryAttributes{ - Unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.RootOwner, - Perms: fs.FilePermsFromMode(0555), - }), - }, - msrc: m, - slaves: make(map[uint32]*fs.Inode), - dentryMap: fs.NewSortedDentryMap(nil), + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC), + msrc: m, + slaves: make(map[uint32]*fs.Inode), + dentryMap: fs.NewSortedDentryMap(nil), } // Linux devpts uses a default mode of 0000 for ptmx which can be // changed with the ptmxmode mount option. However, that default is not @@ -224,70 +220,6 @@ func (d *dirInodeOperations) GetFile(ctx context.Context, dirent *fs.Dirent, fla return fs.NewFile(ctx, dirent, flags, &dirFileOperations{di: d}), nil } -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (d *dirInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - d.mu.Lock() - defer d.mu.Unlock() - return d.attr.Unstable, nil -} - -// Check implements fs.InodeOperations.Check. -func (d *dirInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// SetPermissions implements fs.InodeOperations.SetPermissions. -func (d *dirInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { - d.mu.Lock() - defer d.mu.Unlock() - return d.attr.SetPermissions(ctx, p) -} - -// SetOwner implements fs.InodeOperations.SetOwner. 
-func (d *dirInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - d.mu.Lock() - defer d.mu.Unlock() - return d.attr.SetOwner(ctx, owner) -} - -// SetTimestamps implements fs.InodeOperations.SetTimestamps. -func (d *dirInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - d.mu.Lock() - defer d.mu.Unlock() - return d.attr.SetTimestamps(ctx, ts) -} - -// Truncate implements fs.InodeOperations.Truncate. -func (d *dirInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return syserror.EINVAL -} - -// AddLink implements fs.InodeOperations.AddLink. -func (d *dirInodeOperations) AddLink() {} - -// DropLink implements fs.InodeOperations.DropLink. -func (d *dirInodeOperations) DropLink() {} - -// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -func (d *dirInodeOperations) NotifyStatusChange(ctx context.Context) { - d.mu.Lock() - defer d.mu.Unlock() - - d.attr.TouchStatusChangeTime(ctx) -} - -// IsVirtual implements fs.InodeOperations.IsVirtual. -func (d *dirInodeOperations) IsVirtual() bool { - return true -} - -// StatFS implements fs.InodeOperations.StatFS. -func (d *dirInodeOperations) StatFS(ctx context.Context) (fs.Info, error) { - return fs.Info{ - Type: linux.DEVPTS_SUPER_MAGIC, - }, nil -} - // allocateTerminal creates a new Terminal and installs a pts node for it. // // The caller must call DecRef when done with the returned Terminal. 
@@ -353,13 +285,13 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { // // +stateify savable type dirFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.NoopRelease `state:"nosave"` - fsutil.GenericSeek `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` // di is the inode operations. di *dirInodeOperations diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go deleted file mode 100644 index d5d1caafc..000000000 --- a/pkg/sentry/fs/tty/inode.go +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package tty - -import ( - "sync" - - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/syserror" -) - -// inodeOperations are the base fs.InodeOperations for master and slave Inodes. 
-// -// inodeOperations does not implement: -// -// * fs.InodeOperations.Release -// * fs.InodeOperations.GetFile -// -// +stateify savable -type inodeOperations struct { - fsutil.DeprecatedFileOperations `state:"nosave"` - fsutil.InodeNoExtendedAttributes `state:"nosave"` - fsutil.InodeNotDirectory `state:"nosave"` - fsutil.InodeNotRenameable `state:"nosave"` - fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` - fsutil.NoMappable `state:"nosave"` - fsutil.NoopWriteOut `state:"nosave"` - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // uattr is the inode's UnstableAttr. - uattr fs.UnstableAttr -} - -// UnstableAttr implements fs.InodeOperations.UnstableAttr. -func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { - i.mu.Lock() - defer i.mu.Unlock() - return i.uattr, nil -} - -// Check implements fs.InodeOperations.Check. -func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { - return fs.ContextCanAccessFile(ctx, inode, p) -} - -// SetPermissions implements fs.InodeOperations.SetPermissions -func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { - i.mu.Lock() - defer i.mu.Unlock() - i.uattr.Perms = p - i.uattr.StatusChangeTime = ktime.NowFromContext(ctx) - return true -} - -// SetOwner implements fs.InodeOperations.SetOwner. -func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { - i.mu.Lock() - defer i.mu.Unlock() - if owner.UID.Ok() { - i.uattr.Owner.UID = owner.UID - } - if owner.GID.Ok() { - i.uattr.Owner.GID = owner.GID - } - return nil -} - -// SetTimestamps implements fs.InodeOperations.SetTimestamps. 
-func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { - if ts.ATimeOmit && ts.MTimeOmit { - return nil - } - - i.mu.Lock() - defer i.mu.Unlock() - - now := ktime.NowFromContext(ctx) - if !ts.ATimeOmit { - if ts.ATime.IsZero() { - i.uattr.AccessTime = now - } else { - i.uattr.AccessTime = ts.ATime - } - } - if !ts.MTimeOmit { - if ts.MTime.IsZero() { - i.uattr.ModificationTime = now - } else { - i.uattr.ModificationTime = ts.MTime - } - } - i.uattr.StatusChangeTime = now - return nil -} - -// Truncate implements fs.InodeOperations.Truncate. -func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { - return syserror.EINVAL -} - -// AddLink implements fs.InodeOperations.AddLink. -func (i *inodeOperations) AddLink() { -} - -// DropLink implements fs.InodeOperations.DropLink. -func (i *inodeOperations) DropLink() { -} - -// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -func (i *inodeOperations) NotifyStatusChange(ctx context.Context) { - i.mu.Lock() - defer i.mu.Unlock() - i.uattr.StatusChangeTime = ktime.NowFromContext(ctx) -} - -// IsVirtual implements fs.InodeOperations.IsVirtual. -func (i *inodeOperations) IsVirtual() bool { - return true -} - -// StatFS implements fs.InodeOperations.StatFS. -func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { - return fs.Info{ - Type: linux.DEVPTS_SUPER_MAGIC, - }, nil -} diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 00bec4c2c..b5e13ab36 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -31,7 +31,7 @@ import ( // // +stateify savable type masterInodeOperations struct { - inodeOperations + fsutil.SimpleFileInode // d is the containing dir. d *dirInodeOperations @@ -42,15 +42,8 @@ var _ fs.InodeOperations = (*masterInodeOperations)(nil) // newMasterInode creates an Inode for the master end of a terminal. 
func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { iops := &masterInodeOperations{ - inodeOperations: inodeOperations{ - uattr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: p, - Links: 1, - // Size and Blocks are always 0. - }), - }, - d: d, + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC), + d: d, } return fs.NewInode(iops, d.msrc, fs.StableAttr{ @@ -102,11 +95,11 @@ func (mi *masterInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flag // // +stateify savable type masterFileOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` // d is the containing dir. d *dirInodeOperations diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index a696fbb51..6dbce90b4 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -30,7 +30,7 @@ import ( // // +stateify savable type slaveInodeOperations struct { - inodeOperations + fsutil.SimpleFileInode // d is the containing dir. d *dirInodeOperations @@ -46,16 +46,9 @@ var _ fs.InodeOperations = (*slaveInodeOperations)(nil) // newSlaveInode takes ownership of t. func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { iops := &slaveInodeOperations{ - inodeOperations: inodeOperations{ - uattr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: p, - Links: 1, - // Size and Blocks are always 0. 
- }), - }, - d: d, - t: t, + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC), + d: d, + t: t, } return fs.NewInode(iops, d.msrc, fs.StableAttr{ @@ -91,11 +84,11 @@ func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags // // +stateify savable type slaveFileOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` // si is the inode operations. si *slaveInodeOperations diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 9c13ecfcc..502395f18 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -99,12 +99,12 @@ func (p *pollEntry) WeakRefGone() { // // +stateify savable type EventPoll struct { - fsutil.PipeSeek `state:"zerovalue"` - fsutil.NotDirReaddir `state:"zerovalue"` - fsutil.NoFsync `state:"zerovalue"` - fsutil.NoopFlush `state:"zerovalue"` - fsutil.NoMMap `state:"zerovalue"` - fsutil.NoIoctl `state:"zerovalue"` + fsutil.FilePipeSeek `state:"zerovalue"` + fsutil.FileNotDirReaddir `state:"zerovalue"` + fsutil.FileNoFsync `state:"zerovalue"` + fsutil.FileNoopFlush `state:"zerovalue"` + fsutil.FileNoMMap `state:"zerovalue"` + fsutil.FileNoIoctl `state:"zerovalue"` // Wait queue is used to notify interested parties when the event poll // object itself becomes readable or writable. 
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 063a1d5f5..2d43c986d 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -38,13 +38,13 @@ import ( // // +stateify savable type EventOperations struct { - fsutil.NoopRelease `state:"nosave"` - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` - fsutil.NoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` // Mutex that protects accesses to the fields of this event. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4b0e00b85..1336b6293 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -17,17 +17,30 @@ package pipe import ( "sync" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/amutex" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/syserror" ) -// inodeOperations wraps fs.InodeOperations operations with common pipe opening semantics. +// inodeOperations implements fs.InodeOperations for pipes. 
// // +stateify savable type inodeOperations struct { - fs.InodeOperations + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -46,12 +59,15 @@ type inodeOperations struct { wWakeup chan struct{} `state:"nosave"` } -// NewInodeOperations creates a new pipe fs.InodeOperations. -func NewInodeOperations(base fs.InodeOperations, p *Pipe) fs.InodeOperations { +var _ fs.InodeOperations = (*inodeOperations)(nil) + +// NewInodeOperations returns a new fs.InodeOperations for a given pipe. +func NewInodeOperations(ctx context.Context, perms fs.FilePermissions, p *Pipe) *inodeOperations { return &inodeOperations{ - InodeOperations: base, - p: p, + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), perms, linux.PIPEFS_MAGIC), + p: p, } + } // GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking @@ -164,18 +180,6 @@ func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Slee } } -// Truncate implements fs.InodeOperations.Truncate -// -// This method is required to override the default i.InodeOperations.Truncate -// which may return ErrInvalidOperation, this allows open related -// syscalls to set the O_TRUNC flag without returning an error by -// calling Truncate directly during openat. The ftruncate and truncate -// system calls will check that the file is an actual file and return -// EINVAL because it's a PIPE, making this behavior consistent with linux. 
-func (i *inodeOperations) Truncate(context.Context, *fs.Inode, int64) error { - return nil -} - // newHandleLocked signals a new pipe reader or writer depending on where // 'wakeupChan' points. This unblocks any corresponding reader or writer // waiting for the other end of the channel to be opened, see Fifo.waitFor. diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index eda551594..ad103b195 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -53,6 +53,10 @@ type openResult struct { error } +var perms fs.FilePermissions = fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, +} + func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) { file, err := n.GetFile(ctx, nil, flags) if err != nil { @@ -93,8 +97,8 @@ func assertRecvBlocks(t *testing.T, c <-chan struct{}, blockDuration time.Durati } func TestReadOpenBlocksForWriteOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) @@ -111,8 +115,8 @@ func TestReadOpenBlocksForWriteOpen(t *testing.T) { } func TestWriteOpenBlocksForReadOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) wDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) @@ -129,8 +133,8 @@ func TestWriteOpenBlocksForReadOpen(t *testing.T) { } func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rDone1 := make(chan struct{}) rDone2 := make(chan struct{}) @@ -151,8 +155,8 @@ func TestMultipleWriteOpenDoesntCountAsReadOpen(t 
*testing.T) { } func TestClosedReaderBlocksWriteOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) rFile.DecRef() @@ -172,8 +176,8 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) { } func TestReadWriteOpenNeverBlocks(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rwDone := make(chan struct{}) // Open for read-write never wait for a reader or writer, even if the @@ -183,8 +187,8 @@ func TestReadWriteOpenNeverBlocks(t *testing.T) { } func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) @@ -197,8 +201,8 @@ func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { } func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) wDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) @@ -211,8 +215,8 @@ func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { } func TestBlockedOpenIsCancellable(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) done := make(chan openResult) go testOpen(ctx, t, f, fs.FileFlags{Read: true}, done) @@ -233,18 +237,18 @@ func TestBlockedOpenIsCancellable(t *testing.T) { } } -func TestNonblockingReadOpenNoWriters(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) +func TestNonblockingReadOpenFileNoWriters(t *testing.T) { ctx := newSleeperContext(t) + f := 
NewInodeOperations(ctx, perms, newNamedPipe(t)) if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { t.Fatalf("Nonblocking open for read failed with error %v.", err) } } -func TestNonblockingWriteOpenNoReaders(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) +func TestNonblockingWriteOpenFileNoReaders(t *testing.T) { ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO { t.Fatalf("Nonblocking open for write failed unexpected error %v.", err) @@ -252,8 +256,8 @@ func TestNonblockingWriteOpenNoReaders(t *testing.T) { } func TestNonBlockingReadOpenWithWriter(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) wDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) @@ -271,8 +275,8 @@ func TestNonBlockingReadOpenWithWriter(t *testing.T) { } func TestNonBlockingWriteOpenWithReader(t *testing.T) { - f := NewInodeOperations(nil, newNamedPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rDone := make(chan struct{}) go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) @@ -290,8 +294,8 @@ func TestNonBlockingWriteOpenWithReader(t *testing.T) { } func TestAnonReadOpen(t *testing.T) { - f := NewInodeOperations(nil, newAnonPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newAnonPipe(t)) if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true}, nil); err != nil { t.Fatalf("open anon pipe for read failed: %v", err) @@ -299,8 +303,8 @@ func TestAnonReadOpen(t *testing.T) { } func TestAnonWriteOpen(t *testing.T) { - f := NewInodeOperations(nil, newAnonPipe(t)) ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newAnonPipe(t)) if _, err := testOpen(ctx, t, f, 
fs.FileFlags{Write: true}, nil); err != nil { t.Fatalf("open anon pipe for write failed: %v", err) diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 126054826..fad077d2d 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -25,11 +25,9 @@ import ( "sync/atomic" "syscall" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -50,7 +48,7 @@ type Pipe struct { isNamed bool // The dirent backing this pipe. Shared by all readers and writers. - dirent *fs.Dirent + Dirent *fs.Dirent // The buffered byte queue. data ilist.List @@ -97,28 +95,19 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *P // Build the fs.Dirent of this pipe, shared by all fs.Files associated // with this pipe. + perms := fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + } + iops := NewInodeOperations(ctx, perms, p) ino := pipeDevice.NextIno() - base := fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ - FSType: linux.PIPEFS_MAGIC, - UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), - Perms: fs.FilePermissions{ - User: fs.PermMask{Read: true, Write: true}, - }, - Links: 1, - }), - }) sattr := fs.StableAttr{ Type: fs.Pipe, DeviceID: pipeDevice.DeviceID(), InodeID: ino, BlockSize: int64(atomicIOBytes), } - // There is no real filesystem backing this pipe, so we pass in a nil - // Filesystem. 
- sb := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) - p.dirent = fs.NewDirent(fs.NewInode(NewInodeOperations(base, p), sb, sattr), fmt.Sprintf("pipe:[%d]", ino)) - + ms := fs.NewPseudoMountSource() + p.Dirent = fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) return p } @@ -135,7 +124,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes int, atomicIOBytes int) (*f // ROpen opens the pipe for reading. func (p *Pipe) ROpen(ctx context.Context) *fs.File { p.rOpen() - return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true}, &Reader{ + return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Read: true}, &Reader{ ReaderWriter: ReaderWriter{Pipe: p}, }) } @@ -143,7 +132,7 @@ func (p *Pipe) ROpen(ctx context.Context) *fs.File { // WOpen opens the pipe for writing. func (p *Pipe) WOpen(ctx context.Context) *fs.File { p.wOpen() - return fs.NewFile(ctx, p.dirent, fs.FileFlags{Write: true}, &Writer{ + return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Write: true}, &Writer{ ReaderWriter: ReaderWriter{Pipe: p}, }) } @@ -152,7 +141,7 @@ func (p *Pipe) WOpen(ctx context.Context) *fs.File { func (p *Pipe) RWOpen(ctx context.Context) *fs.File { p.rOpen() p.wOpen() - return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{ + return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{ Pipe: p, }) } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 36be1efc3..028175530 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -35,11 +35,11 @@ import ( // // +stateify savable type ReaderWriter struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + 
fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` *Pipe } diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 437cc5da1..c070c7316 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -20,7 +20,6 @@ import ( "io" "gvisor.googlesource.com/gvisor/pkg/abi" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -38,20 +37,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// byteReaderFileOperations implements fs.FileOperations for reading -// from a []byte source. -type byteReader struct { - fsutil.NoopRelease - fsutil.PipeSeek - fsutil.NotDirReaddir - fsutil.NoFsync - fsutil.NoopFlush - fsutil.NoMMap - fsutil.NoIoctl - waiter.AlwaysReady - data []byte -} - type fileContext struct { context.Context } @@ -65,17 +50,34 @@ func (f *fileContext) Value(key interface{}) interface{} { } } +// byteReader implements fs.FileOperations for reading from a []byte source. +type byteReader struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + + data []byte +} + +var _ fs.FileOperations = (*byteReader)(nil) + // newByteReaderFile creates a fake file to read data from. func newByteReaderFile(data []byte) *fs.File { // Create a fake inode. 
- inode := fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ - FSType: linux.ANON_INODE_FS_MAGIC, - }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ - Type: fs.Anonymous, - DeviceID: anon.PseudoDevice.DeviceID(), - InodeID: anon.PseudoDevice.NextIno(), - BlockSize: usermem.PageSize, - }) + inode := fs.NewInode( + &fsutil.SimpleFileInode{}, + fs.NewPseudoMountSource(), + fs.StableAttr{ + Type: fs.Anonymous, + DeviceID: anon.PseudoDevice.DeviceID(), + InodeID: anon.PseudoDevice.NextIno(), + BlockSize: usermem.PageSize, + }) // Use the fake inode to create a fake dirent. dirent := fs.NewTransientDirent(inode) diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index d65b5f49e..ca865b111 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -138,11 +138,11 @@ type commonEndpoint interface { // // +stateify savable type SocketOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout *waiter.Queue diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index f3ecb6dc3..2c54e8de2 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -46,11 +46,11 @@ const ( // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. 
type socketOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout fd int // must be O_NONBLOCK diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 0a7d4772c..5b0c11c84 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -65,11 +65,11 @@ var netlinkSocketDevice = device.NewAnonDevice() // // +stateify savable type Socket struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout // ports provides netlink port allocation. diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 8c8ebadb7..13681100e 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -45,11 +45,11 @@ import ( // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. 
type socketOperations struct { - fsutil.PipeSeek `state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout fd uint32 // must be O_NONBLOCK diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 9d4aaeb9d..e28d2c4fa 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -178,18 +178,12 @@ func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (* // NewDirent returns a sockfs fs.Dirent that resides on device d. func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { ino := d.NextIno() - // There is no real filesystem backing this pipe, so we pass in a nil - // Filesystem. - inode := fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ - FSType: linux.SOCKFS_MAGIC, - UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), - Perms: fs.FilePermissions{ - User: fs.PermMask{Read: true, Write: true}, - }, - Links: 1, - }), - }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + iops := &fsutil.SimpleFileInode{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, linux.SOCKFS_MAGIC), + } + inode := fs.NewInode(iops, fs.NewPseudoMountSource(), fs.StableAttr{ Type: fs.Socket, DeviceID: d.DeviceID(), InodeID: ino, diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index da225eabb..19258e692 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -45,11 +45,11 @@ import ( // // +stateify savable type SocketOperations struct { - fsutil.PipeSeek 
`state:"nosave"` - fsutil.NotDirReaddir `state:"nosave"` - fsutil.NoFsync `state:"nosave"` - fsutil.NoopFlush `state:"nosave"` - fsutil.NoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` refs.AtomicRefCount socket.SendReceiveTimeout diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 1e75b0efc..942315d6e 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -489,9 +489,7 @@ func mustFindFilesystem(name string) fs.Filesystem { // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { - // There is no real filesystem backing this ramfs tree, so we pass in - // "nil" here. - msrc := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + msrc := fs.NewPseudoMountSource() mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts) if err != nil { return nil, fmt.Errorf("error creating mount tree: %v", err) diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index e64df97b0..6ffe9aed6 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -334,6 +334,11 @@ int ReadlinkWhileExited(std::string const& basename, char* buf, size_t count) { return ret; } +TEST(ProcTest, NotFoundInRoot) { + struct stat s; + EXPECT_THAT(stat("/proc/foobar", &s), SyscallFailsWithErrno(ENOENT)); +} + TEST(ProcSelfTest, IsThreadGroupLeader) { ScopedThread([] { const pid_t tgid = getpid(); -- cgit v1.2.3 From 12bc7834dccdcd21353fc476dc76a5c9bc0d47bb Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 17 Jan 2019 11:05:40 -0800 Subject: Allow fsync on a directory. 
PiperOrigin-RevId: 229781337 Change-Id: I1f946cff2771714fb1abd83a83ed454e9febda0a --- pkg/sentry/fs/fsutil/file.go | 2 +- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/fsync.cc | 23 +++++++++++++---------- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 0970f782b..32f8133fb 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -228,10 +228,10 @@ func (FileNoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallAr type DirFileOperations struct { waiter.AlwaysReady FileGenericSeek - FileNoFsync FileNoIoctl FileNoMMap FileNoopFlush + FileNoopFsync FileNoopRelease } diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 41d476481..19884d55e 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -783,6 +783,7 @@ cc_binary( srcs = ["fsync.cc"], linkstatic = 1, deps = [ + "//test/util:file_descriptor", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/fsync.cc b/test/syscalls/linux/fsync.cc index 536a73bf1..b34229248 100644 --- a/test/syscalls/linux/fsync.cc +++ b/test/syscalls/linux/fsync.cc @@ -19,6 +19,7 @@ #include #include "gtest/gtest.h" +#include "test/util/file_descriptor.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -28,22 +29,24 @@ namespace testing { namespace { TEST(FsyncTest, TempFileSucceeds) { - std::string path = NewTempAbsPath(); - int fd; - EXPECT_THAT(fd = open(path.c_str(), O_RDWR | O_CREAT, 0666), - SyscallSucceeds()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666)); const std::string data = "some data to sync"; - EXPECT_THAT(write(fd, data.c_str(), data.size()), + EXPECT_THAT(write(fd.get(), data.c_str(), data.size()), SyscallSucceedsWithValue(data.size())); - EXPECT_THAT(fsync(fd), 
SyscallSucceeds()); - ASSERT_THAT(close(fd), SyscallSucceeds()); - ASSERT_THAT(unlink(path.c_str()), SyscallSucceeds()); + EXPECT_THAT(fsync(fd.get()), SyscallSucceeds()); +} + +TEST(FsyncTest, TempDirSucceeds) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY)); + EXPECT_THAT(fsync(fd.get()), SyscallSucceeds()); } TEST(FsyncTest, CannotFsyncOnUnopenedFd) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); int fd; - auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - ASSERT_THAT(fd = open(f.path().c_str(), O_RDONLY), SyscallSucceeds()); + ASSERT_THAT(fd = open(file.path().c_str(), O_RDONLY), SyscallSucceeds()); ASSERT_THAT(close(fd), SyscallSucceeds()); // fd is now invalid. -- cgit v1.2.3 From 8d7c10e90840cfecf53089e7cc3507cac2804fd1 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Fri, 18 Jan 2019 10:01:08 -0800 Subject: Display /proc/net entries for all network configurations. Most of the entries are stubbed out at the moment, but even those were only displayed if IPv6 support was enabled. The entries should be displayed with IPv4-support only, and with only loopback devices. PiperOrigin-RevId: 229946441 Change-Id: I18afaa3af386322787f91bf9d168ab66c01d5a4c --- pkg/sentry/fs/proc/net.go | 29 +++++++++++++++++------------ pkg/sentry/fs/proc/proc.go | 4 ++-- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 3ed85a538..219eea7f8 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -29,21 +29,20 @@ import ( // newNet creates a new proc net entry. 
func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil && s.SupportsIPv6() { + if s := p.k.NetworkStack(); s != nil { contents = map[string]*fs.Inode{ - "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), - "if_inet6": seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc), + "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), // The following files are simple stubs until they are // implemented in netstack, if the file contains a // header the stub is just the header otherwise it is // an empty file. - "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device")), - "ipv6_route": newStaticProcInode(ctx, msrc, []byte("")), - "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode")), - "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound 
TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess")), - "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode")), - "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em")), + "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device")), + + "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode")), + "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures 
TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess")), + "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode")), + "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em")), // Linux sets psched values to: nsec per usec, psched // tick in ns, 1000000, high res timer ticks per sec // (ClockGetres returns 1ns resolution). 
@@ -51,9 +50,15 @@ func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function")), "route": newStaticProcInode(ctx, msrc, []byte("Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT")), "tcp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), - "tcp6": newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), - "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), - "udp6": newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), + + "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), + } + + if s.SupportsIPv6() { + contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc) + contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte("")) + contents["tcp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")) + contents["udp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")) } } d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index d1c699418..be04f94af 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -83,9 +83,9 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { // If we're using rpcinet we will let it manage /proc/net. 
if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { - contents["net"] = newRPCInetProcNet(ctx, msrc) + p.AddChild(ctx, "net", newRPCInetProcNet(ctx, msrc)) } else { - contents["net"] = p.newNetDir(ctx, msrc) + p.AddChild(ctx, "net", p.newNetDir(ctx, msrc)) } return newProcInode(p, msrc, fs.SpecialDirectory, nil), nil -- cgit v1.2.3 From b5088ba59c4b6e6fe19a38e15a5472d36f80b397 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 24 Jan 2019 17:01:20 -0800 Subject: cleanup: extract the kernel from context Change-Id: I94704a90beebb53164325e0cce1fcb9a0b97d65c PiperOrigin-RevId: 230817308 --- pkg/sentry/fs/ashmem/BUILD | 1 - pkg/sentry/fs/ashmem/area.go | 7 +------ pkg/sentry/fs/dev/BUILD | 1 - pkg/sentry/fs/dev/dev.go | 3 +-- pkg/sentry/fs/tmpfs/BUILD | 1 - pkg/sentry/fs/tmpfs/file_test.go | 3 +-- pkg/sentry/fs/tmpfs/fs.go | 3 +-- pkg/sentry/fs/tmpfs/inode_file.go | 4 ++-- pkg/sentry/fs/tmpfs/tmpfs.go | 8 ++++---- 9 files changed, 10 insertions(+), 21 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index 2463111a8..e5bb661b5 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -21,7 +21,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/tmpfs", - "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/platform", diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index 7c1b11464..710b5185f 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -23,7 +23,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -114,11 +113,7 @@ func (a *Area) ConfigureMMap(ctx 
context.Context, file *fs.File, opts *memmap.MM opts.MaxPerms = opts.MaxPerms.Intersect(a.perms) if a.tmpfsFile == nil { - k := kernel.KernelFromContext(ctx) - if k == nil { - return syserror.ENOMEM - } - tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}, k) + tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}) tmpfsInode := fs.NewInode(tmpfsInodeOps, fs.NewPseudoMountSource(), fs.StableAttr{}) dirent := fs.NewDirent(tmpfsInode, namePrefix+"/"+a.name) tmpfsFile, err := tmpfsInode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}) diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index b9cfae05f..85371032a 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -25,7 +25,6 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", - "//pkg/sentry/kernel", "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/platform", diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index f8e8099f7..2ec4c9bff 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -24,7 +24,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/binder" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -77,7 +76,7 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn "random": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), "urandom": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), - "shm": tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc, kernel.KernelFromContext(ctx)), + "shm": tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc), // A devpts is typically mounted at /dev/pts to provide // pseudoterminal support. 
Place an empty directory there for diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 14c7a9e62..c5ec85460 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -42,7 +42,6 @@ go_test( deps = [ "//pkg/sentry/context", "//pkg/sentry/fs", - "//pkg/sentry/kernel", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index e7bbdc404..743061190 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -20,7 +20,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -28,7 +27,7 @@ import ( func newFileInode(ctx context.Context) *fs.Inode { m := fs.NewCachingMountSource(&Filesystem{}, fs.MountSourceFlags{}) - iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.WithCurrentTime(ctx, fs.UnstableAttr{}), kernel.KernelFromContext(ctx)) + iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.WithCurrentTime(ctx, fs.UnstableAttr{})) return fs.NewInode(iops, m, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index caa3220ee..d495430e9 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -21,7 +21,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ) @@ -133,5 +132,5 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou msrc := fs.NewCachingMountSource(f, flags) // Construct the tmpfs 
root. - return NewDir(ctx, nil, owner, perms, msrc, kernel.KernelFromContext(ctx)), nil + return NewDir(ctx, nil, owner, perms, msrc), nil } diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 42d4bc76f..2505e2c69 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -89,10 +89,10 @@ type fileInodeOperations struct { var _ fs.InodeOperations = (*fileInodeOperations)(nil) // NewInMemoryFile returns a new file backed by p.Memory(). -func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr, k *kernel.Kernel) fs.InodeOperations { +func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr) fs.InodeOperations { return &fileInodeOperations{ attr: uattr, - kernel: k, + kernel: kernel.KernelFromContext(ctx), memUsage: usage, } } diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index a0277a132..4b1762ce4 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -81,10 +81,10 @@ type Dir struct { var _ fs.InodeOperations = (*Dir)(nil) // NewDir returns a new directory. -func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource, kernel *kernel.Kernel) *fs.Inode { +func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { d := &Dir{ ramfsDir: ramfs.NewDir(ctx, contents, owner, perms), - kernel: kernel, + kernel: kernel.KernelFromContext(ctx), } // Manually set the CreateOps. 
@@ -208,7 +208,7 @@ func (d *Dir) SetTimestamps(ctx context.Context, i *fs.Inode, ts fs.TimeSpec) er func (d *Dir) newCreateOps() *ramfs.CreateOps { return &ramfs.CreateOps{ NewDir: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { - return NewDir(ctx, nil, fs.FileOwnerFromContext(ctx), perms, dir.MountSource, d.kernel), nil + return NewDir(ctx, nil, fs.FileOwnerFromContext(ctx), perms, dir.MountSource), nil }, NewFile: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{ @@ -217,7 +217,7 @@ func (d *Dir) newCreateOps() *ramfs.CreateOps { // Always start unlinked. Links: 0, }) - iops := NewInMemoryFile(ctx, usage.Tmpfs, uattr, d.kernel) + iops := NewInMemoryFile(ctx, usage.Tmpfs, uattr) return fs.NewInode(iops, dir.MountSource, fs.StableAttr{ DeviceID: tmpfsDevice.DeviceID(), InodeID: tmpfsDevice.NextIno(), -- cgit v1.2.3 From 55e8eb775b422a7485d6d1dc4f8e4c8fd32096da Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 25 Jan 2019 17:22:04 -0800 Subject: Make cacheRemoteRevalidating detect changes to file size When file size changes outside the sandbox, page cache was not refreshing file size which is required for cacheRemoteRevalidating. In fact, cacheRemoteRevalidating should be skipping the cache completely since it's not really benefiting from it. The cache is cache is already bypassed for unstable attributes (see cachePolicy.cacheUAttrs). And althought the cache is called to map pages, they will always miss the cache and map directly from the host. Created a HostMappable struct that maps directly to the host and use it for files with cacheRemoteRevalidating. 
Closes #124 PiperOrigin-RevId: 230998440 Change-Id: Ic5f632eabe33b47241e05e98c95e9b2090ae08fc --- pkg/sentry/fs/fsutil/BUILD | 2 + pkg/sentry/fs/fsutil/host_mappable.go | 136 ++++++++++++++ pkg/sentry/fs/fsutil/host_mappable_state.go | 22 +++ pkg/sentry/fs/gofer/cache_policy.go | 22 ++- pkg/sentry/fs/gofer/file.go | 9 +- pkg/sentry/fs/gofer/inode.go | 30 +++- pkg/sentry/fs/gofer/path.go | 2 +- pkg/sentry/fs/gofer/session.go | 14 +- runsc/container/BUILD | 1 + runsc/container/container_test.go | 137 -------------- runsc/container/shared_volume_test.go | 267 ++++++++++++++++++++++++++++ 11 files changed, 486 insertions(+), 156 deletions(-) create mode 100644 pkg/sentry/fs/fsutil/host_mappable.go create mode 100644 pkg/sentry/fs/fsutil/host_mappable_state.go create mode 100644 runsc/container/shared_volume_test.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 4965e1a5f..d4767642b 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -70,6 +70,8 @@ go_library( "host_file_mapper.go", "host_file_mapper_state.go", "host_file_mapper_unsafe.go", + "host_mappable.go", + "host_mappable_state.go", "inode.go", "inode_cached.go", ], diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go new file mode 100644 index 000000000..4e4bcf4a4 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -0,0 +1,136 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// HostMappable implements memmap.Mappable and platform.File over an arbitrary +// host file descriptor. +// +// +stateify savable +type HostMappable struct { + hostFileMapper *HostFileMapper + + mu sync.Mutex `state:"nosave"` + + // fd is the file descriptor to the host. Protected by mu. + fd int `state:"nosave"` + + // mappings tracks mappings of the cached file object into + // memmap.MappingSpaces so it can invalidated upon save. Protected by mu. + mappings memmap.MappingSet +} + +// NewHostMappable creates a new mappable that maps directly to host FD. +func NewHostMappable() *HostMappable { + return &HostMappable{ + hostFileMapper: NewHostFileMapper(), + fd: -1, + } +} + +func (h *HostMappable) getFD() int { + h.mu.Lock() + defer h.mu.Unlock() + if h.fd < 0 { + panic("HostMappable FD isn't set") + } + return h.fd +} + +// UpdateFD sets the host FD iff FD hasn't been set before or if there are +// no mappings. +func (h *HostMappable) UpdateFD(fd int) { + h.mu.Lock() + defer h.mu.Unlock() + h.fd = fd +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (h *HostMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + // Hot path. Avoid defers. + h.mu.Lock() + mapped := h.mappings.AddMapping(ms, ar, offset, writable) + for _, r := range mapped { + h.hostFileMapper.IncRefOn(r) + } + h.mu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. 
+func (h *HostMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + // Hot path. Avoid defers. + h.mu.Lock() + unmapped := h.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + h.hostFileMapper.DecRefOn(r) + } + h.mu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (h *HostMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return h.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (h *HostMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + return []memmap.Translation{ + { + Source: optional, + File: h, + Offset: optional.Start, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (h *HostMappable) InvalidateUnsavable(ctx context.Context) error { + h.mu.Lock() + h.mappings.InvalidateAll(memmap.InvalidateOpts{}) + h.mu.Unlock() + return nil +} + +// MapInto implements platform.File.MapInto. +func (h *HostMappable) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + return as.MapFile(addr, h.getFD(), fr, at, precommit) +} + +// MapInternal implements platform.File.MapInternal. +func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return h.hostFileMapper.MapInternal(fr, h.getFD(), at.Write) +} + +// IncRef implements platform.File.IncRef. +func (h *HostMappable) IncRef(fr platform.FileRange) { + mr := memmap.MappableRange{Start: fr.Start, End: fr.End} + h.hostFileMapper.IncRefOn(mr) +} + +// DecRef implements platform.File.DecRef. 
+func (h *HostMappable) DecRef(fr platform.FileRange) { + mr := memmap.MappableRange{Start: fr.Start, End: fr.End} + h.hostFileMapper.DecRefOn(mr) +} diff --git a/pkg/sentry/fs/fsutil/host_mappable_state.go b/pkg/sentry/fs/fsutil/host_mappable_state.go new file mode 100644 index 000000000..765f1ec87 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_mappable_state.go @@ -0,0 +1,22 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +// afterLoad is invoked by stateify. +func (h *HostMappable) afterLoad() { + h.mu.Lock() + defer h.mu.Unlock() + h.fd = -1 +} diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 3d380f0e8..507d6900f 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -90,17 +90,29 @@ func (cp cachePolicy) cacheReaddir() bool { return cp == cacheAll || cp == cacheAllWritethrough } -// usePageCache determines whether the page cache should be used for the given -// inode. If the remote filesystem donates host FDs to the sentry, then the -// host kernel's page cache will be used, otherwise we will use a +// useCachingInodeOps determines whether the page cache should be used for the +// given inode. If the remote filesystem donates host FDs to the sentry, then +// the host kernel's page cache will be used, otherwise we will use a // sentry-internal page cache. 
-func (cp cachePolicy) usePageCache(inode *fs.Inode) bool { +func (cp cachePolicy) useCachingInodeOps(inode *fs.Inode) bool { // Do cached IO for regular files only. Some "character devices" expect // no caching. if !fs.IsFile(inode.StableAttr) { return false } - return cp == cacheAll || cp == cacheAllWritethrough || cp == cacheRemoteRevalidating + return cp == cacheAll || cp == cacheAllWritethrough +} + +// cacheHandles determine whether handles need to be cached with the given +// inode. Handles must be cached when inode can be mapped into memory to +// implement InodeOperations.Mappable with stable handles. +func (cp cachePolicy) cacheHandles(inode *fs.Inode) bool { + // Do cached IO for regular files only. Some "character devices" expect + // no caching. + if !fs.IsFile(inode.StableAttr) { + return false + } + return cp.useCachingInodeOps(inode) || cp == cacheRemoteRevalidating } // writeThough indicates whether writes to the file should be synced to the diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 3578b07a0..2181ddc68 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -204,7 +204,7 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I return 0, syserror.EISDIR } cp := f.inodeOperations.session().cachePolicy - if cp.usePageCache(file.Dirent.Inode) { + if cp.useCachingInodeOps(file.Dirent.Inode) { n, err := f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) if err != nil { return n, err @@ -225,7 +225,7 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO return 0, syserror.EISDIR } - if f.inodeOperations.session().cachePolicy.usePageCache(file.Dirent.Inode) { + if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { return f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) } return dst.CopyOutFrom(ctx, f.handles.readWriterAt(ctx, offset)) @@ -267,10 +267,7 @@ func (f *fileOperations) Flush(ctx 
context.Context, file *fs.File) error { // ConfigureMMap implements fs.FileOperations.ConfigureMMap. func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - if !f.inodeOperations.session().cachePolicy.usePageCache(file.Dirent.Inode) { - return syserror.ENODEV - } - return fsutil.GenericConfigureMMap(file, f.inodeOperations.cachingInodeOps, opts) + return f.inodeOperations.configureMMap(file, opts) } // Seek implements fs.FileOperations.Seek. diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index f0dc99fd0..043705c58 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -125,6 +125,10 @@ type inodeFileState struct { // failures. S/R is transparent to Sentry and the latter will continue // using its cached values after restore. savedUAttr *fs.UnstableAttr + + // hostMappable is created when using 'cacheRemoteRevalidating' to map pages + // directly from host. + hostMappable *fsutil.HostMappable } // Release releases file handles. @@ -166,6 +170,9 @@ func (i *inodeFileState) setHandlesForCachedIO(flags fs.FileFlags, h *handles) { i.writebackRW = true } } + if i.hostMappable != nil { + i.hostMappable.UpdateFD(i.fdLocked()) + } } // getCachedHandles returns any cached handles which would accelerate @@ -287,7 +294,10 @@ func (i *inodeFileState) Sync(ctx context.Context) error { func (i *inodeFileState) FD() int { i.handlesMu.RLock() defer i.handlesMu.RUnlock() + return i.fdLocked() +} +func (i *inodeFileState) fdLocked() int { // Assert that the file was actually opened. if i.writeback == nil && i.readthrough == nil { panic("cannot get host FD for a file that was never opened") @@ -344,9 +354,13 @@ func (i *inodeOperations) Release(ctx context.Context) { // Mappable implements fs.InodeOperations.Mappable. 
func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { - if i.session().cachePolicy.usePageCache(inode) { + if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps } + // This check is necessary because it's returning an interface type. + if i.fileState.hostMappable != nil { + return i.fileState.hostMappable + } return nil } @@ -434,7 +448,7 @@ func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (* } func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - if !i.session().cachePolicy.usePageCache(d.Inode) { + if !i.session().cachePolicy.cacheHandles(d.Inode) { h, err := newHandles(ctx, i.fileState.file, flags) if err != nil { return nil, err @@ -503,7 +517,7 @@ func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts // Truncate implements fs.InodeOperations.Truncate. func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length int64) error { // This can only be called for files anyway. 
- if i.session().cachePolicy.usePageCache(inode) { + if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps.Truncate(ctx, inode, length) } @@ -561,6 +575,16 @@ func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { return info, nil } +func (i *inodeOperations) configureMMap(file *fs.File, opts *memmap.MMapOpts) error { + if i.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { + return fsutil.GenericConfigureMMap(file, i.cachingInodeOps, opts) + } + if i.fileState.hostMappable != nil { + return fsutil.GenericConfigureMMap(file, i.fileState.hostMappable, opts) + } + return syserror.ENODEV +} + func init() { syserror.AddErrorUnwrapper(func(err error) (syscall.Errno, bool) { if _, ok := err.(p9.ErrSocket); ok { diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index a324dc990..faedfb81c 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -128,7 +128,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string File: newFile, Host: hostFile, } - if iops.session().cachePolicy.usePageCache(d.Inode) { + if iops.session().cachePolicy.cacheHandles(d.Inode) { iops.fileState.setHandlesForCachedIO(flags, h) } return NewFile(ctx, d, name, flags, iops, h), nil diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index f76a83cd9..b5b1c8202 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -197,11 +197,17 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p } } + var hm *fsutil.HostMappable + if s.cachePolicy == cacheRemoteRevalidating && fs.IsFile(sattr) { + hm = fsutil.NewHostMappable() + } + fileState := &inodeFileState{ - s: s, - file: file, - sattr: sattr, - key: deviceKey, + s: s, + file: file, + sattr: sattr, + key: deviceKey, + hostMappable: hm, } uattr := unstable(ctx, valid, attr, s.mounter, s.client) diff --git a/runsc/container/BUILD b/runsc/container/BUILD 
index 5dfff5c5e..354ce2661 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -36,6 +36,7 @@ go_test( "container_test.go", "fs_test.go", "multi_container_test.go", + "shared_volume_test.go", ], data = [ ":test_app", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 9f3d6b454..06a25de6d 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1353,143 +1353,6 @@ func TestAbbreviatedIDs(t *testing.T) { } } -// Check that modifications to a volume mount are propigated into and out of -// the sandbox. -func TestContainerVolumeContentsShared(t *testing.T) { - // Only run this test with shared file access, since that is the only - // behavior it is testing. - conf := testutil.TestConfig() - conf.FileAccess = boot.FileAccessShared - t.Logf("Running test with conf: %+v", conf) - - // Main process just sleeps. We will use "exec" to probe the state of - // the filesystem. - spec := testutil.NewSpecWithArgs("sleep", "1000") - - dir, err := ioutil.TempDir(testutil.TmpDir(), "root-fs-test") - if err != nil { - t.Fatalf("TempDir failed: %v", err) - } - - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // Create and start the container. - c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - - // File that will be used to check consistency inside/outside sandbox. - filename := filepath.Join(dir, "file") - - // File does not exist yet. Reading from the sandbox should fail. 
- argsTestFile := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", filename}, - } - if ws, err := c.executeSync(argsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", filename, err) - } else if ws.ExitStatus() == 0 { - t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) - } - - // Create the file from outside of the sandbox. - if err := ioutil.WriteFile(filename, []byte("foobar"), 0777); err != nil { - t.Fatalf("error writing to file %q: %v", filename, err) - } - - // Now we should be able to test the file from within the sandbox. - if ws, err := c.executeSync(argsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", filename, err) - } else if ws.ExitStatus() != 0 { - t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) - } - - // Rename the file from outside of the sandbox. - newFilename := filepath.Join(dir, "newfile") - if err := os.Rename(filename, newFilename); err != nil { - t.Fatalf("os.Rename(%q, %q) failed: %v", filename, newFilename, err) - } - - // File should no longer exist at the old path within the sandbox. - if ws, err := c.executeSync(argsTestFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", filename, err) - } else if ws.ExitStatus() == 0 { - t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) - } - - // We should be able to test the new filename from within the sandbox. - argsTestNewFile := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", newFilename}, - } - if ws, err := c.executeSync(argsTestNewFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", newFilename, err) - } else if ws.ExitStatus() != 0 { - t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) - } - - // Delete the renamed file from outside of the sandbox. 
- if err := os.Remove(newFilename); err != nil { - t.Fatalf("error removing file %q: %v", filename, err) - } - - // Renamed file should no longer exist at the old path within the sandbox. - if ws, err := c.executeSync(argsTestNewFile); err != nil { - t.Fatalf("unexpected error testing file %q: %v", newFilename, err) - } else if ws.ExitStatus() == 0 { - t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) - } - - // Now create the file from WITHIN the sandbox. - argsTouch := &control.ExecArgs{ - Filename: "/usr/bin/touch", - Argv: []string{"touch", filename}, - KUID: auth.KUID(os.Getuid()), - KGID: auth.KGID(os.Getgid()), - } - if ws, err := c.executeSync(argsTouch); err != nil { - t.Fatalf("unexpected error touching file %q: %v", filename, err) - } else if ws.ExitStatus() != 0 { - t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) - } - - // File should exist outside the sandbox. - if _, err := os.Stat(filename); err != nil { - t.Errorf("stat %q got error %v, wanted nil", filename, err) - } - - // File should exist outside the sandbox. - if _, err := os.Stat(filename); err != nil { - t.Errorf("stat %q got error %v, wanted nil", filename, err) - } - - // Delete the file from within the sandbox. - argsRemove := &control.ExecArgs{ - Filename: "/bin/rm", - Argv: []string{"rm", filename}, - } - if ws, err := c.executeSync(argsRemove); err != nil { - t.Fatalf("unexpected error removing file %q: %v", filename, err) - } else if ws.ExitStatus() != 0 { - t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) - } - - // File should not exist outside the sandbox. 
- if _, err := os.Stat(filename); !os.IsNotExist(err) { - t.Errorf("stat %q got error %v, wanted ErrNotExist", filename, err) - } -} - func TestGoferExits(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") conf := testutil.TestConfig() diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go new file mode 100644 index 000000000..8f81ed630 --- /dev/null +++ b/runsc/container/shared_volume_test.go @@ -0,0 +1,267 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +// TestSharedVolume checks that modifications to a volume mount are propagated +// into and out of the sandbox. +func TestSharedVolume(t *testing.T) { + conf := testutil.TestConfig() + conf.FileAccess = boot.FileAccessShared + t.Logf("Running test with conf: %+v", conf) + + // Main process just sleeps. We will use "exec" to probe the state of + // the filesystem. 
+ spec := testutil.NewSpecWithArgs("sleep", "1000") + + dir, err := ioutil.TempDir(testutil.TmpDir(), "shared-volume-test") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // File that will be used to check consistency inside/outside sandbox. + filename := filepath.Join(dir, "file") + + // File does not exist yet. Reading from the sandbox should fail. + argsTestFile := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", filename}, + } + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) + } + + // Create the file from outside of the sandbox. + if err := ioutil.WriteFile(filename, []byte("foobar"), 0777); err != nil { + t.Fatalf("error writing to file %q: %v", filename, err) + } + + // Now we should be able to test the file from within the sandbox. + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // Rename the file from outside of the sandbox. 
+ newFilename := filepath.Join(dir, "newfile") + if err := os.Rename(filename, newFilename); err != nil { + t.Fatalf("os.Rename(%q, %q) failed: %v", filename, newFilename, err) + } + + // File should no longer exist at the old path within the sandbox. + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) + } + + // We should be able to test the new filename from within the sandbox. + argsTestNewFile := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", newFilename}, + } + if ws, err := c.executeSync(argsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) + } + + // Delete the renamed file from outside of the sandbox. + if err := os.Remove(newFilename); err != nil { + t.Fatalf("error removing file %q: %v", filename, err) + } + + // Renamed file should no longer exist at the old path within the sandbox. + if ws, err := c.executeSync(argsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) + } + + // Now create the file from WITHIN the sandbox. + argsTouch := &control.ExecArgs{ + Filename: "/usr/bin/touch", + Argv: []string{"touch", filename}, + KUID: auth.KUID(os.Getuid()), + KGID: auth.KGID(os.Getgid()), + } + if ws, err := c.executeSync(argsTouch); err != nil { + t.Fatalf("unexpected error touching file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // File should exist outside the sandbox. 
+ if _, err := os.Stat(filename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", filename, err) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(filename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", filename, err) + } + + // Delete the file from within the sandbox. + argsRemove := &control.ExecArgs{ + Filename: "/bin/rm", + Argv: []string{"rm", filename}, + } + if ws, err := c.executeSync(argsRemove); err != nil { + t.Fatalf("unexpected error removing file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // File should not exist outside the sandbox. + if _, err := os.Stat(filename); !os.IsNotExist(err) { + t.Errorf("stat %q got error %v, wanted ErrNotExist", filename, err) + } +} + +func checkFile(c *Container, filename string, want []byte) error { + cpy := filename + ".copy" + argsCp := &control.ExecArgs{ + Filename: "/bin/cp", + Argv: []string{"cp", "-f", filename, cpy}, + } + if _, err := c.executeSync(argsCp); err != nil { + return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err) + } + got, err := ioutil.ReadFile(cpy) + if err != nil { + return fmt.Errorf("Error reading file %q: %v", filename, err) + } + if !bytes.Equal(got, want) { + return fmt.Errorf("file content inside the sandbox is wrong, got: %q, want: %q", got, want) + } + return nil +} + +// TestSharedVolumeFile tests that changes to file content outside the sandbox +// is reflected inside. +func TestSharedVolumeFile(t *testing.T) { + conf := testutil.TestConfig() + conf.FileAccess = boot.FileAccessShared + t.Logf("Running test with conf: %+v", conf) + + // Main process just sleeps. We will use "exec" to probe the state of + // the filesystem. 
+ spec := testutil.NewSpecWithArgs("sleep", "1000") + + dir, err := ioutil.TempDir(testutil.TmpDir(), "shared-volume-test") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // File that will be used to check consistency inside/outside sandbox. + filename := filepath.Join(dir, "file") + + // Write file from outside the container and check that the same content is + // read inside. + want := []byte("host-") + if err := ioutil.WriteFile(filename, []byte(want), 0666); err != nil { + t.Fatalf("Error writing to %q: %v", filename, err) + } + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Append to file inside the container and check that content is not lost. + argsAppend := &control.ExecArgs{ + Filename: "/bin/bash", + Argv: []string{"bash", "-c", "echo -n sandbox- >> " + filename}, + } + if _, err := c.executeSync(argsAppend); err != nil { + t.Fatalf("unexpected error appending file %q: %v", filename, err) + } + want = []byte("host-sandbox-") + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Write again from outside the container and check that the same content is + // read inside. 
+ f, err := os.OpenFile(filename, os.O_APPEND|os.O_WRONLY, 0) + if err != nil { + t.Fatalf("Error openning file %q: %v", filename, err) + } + defer f.Close() + if _, err := f.Write([]byte("host")); err != nil { + t.Fatalf("Error writing to file %q: %v", filename, err) + } + want = []byte("host-sandbox-host") + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Shrink file outside and check that the same content is read inside. + if err := f.Truncate(5); err != nil { + t.Fatalf("Error truncating file %q: %v", filename, err) + } + want = want[:5] + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } +} -- cgit v1.2.3 From 1cedccf8e9b63757c182477f803afcb27bd9e17e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 28 Jan 2019 10:59:01 -0800 Subject: Drop the one-page limit for /proc/[pid]/{cmdline,environ}. It never actually should have applied to environ (the relevant change in Linux 4.2 is c2c0bb44620d "proc: fix PAGE_SIZE limit of /proc/$PID/cmdline"), and we claim to be Linux 4.4 now anyway. PiperOrigin-RevId: 231250661 Change-Id: I37f9c4280a533d1bcb3eebb7803373ac3c7b9f15 --- pkg/sentry/fs/proc/exec_args.go | 6 ------ 1 file changed, 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index ddda67f54..a716eb5f5 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -101,12 +101,6 @@ func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen return 0, syserror.EINVAL } - // N.B. Linux 4.2 eliminates the arbitrary one page limit. 
- if offset > usermem.PageSize { - return 0, io.EOF - } - dst = dst.TakeFirst64(usermem.PageSize - offset) - m, err := getTaskMM(f.t) if err != nil { return 0, err -- cgit v1.2.3 From 09cf3b40a8994a3f52dfe2a85e5198c5986b8264 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 28 Jan 2019 13:25:27 -0800 Subject: Fix data race in InodeSimpleAttributes.Unstable. We were modifying InodeSimpleAttributes.Unstable.AccessTime without holding the necessary lock. Luckily for us, InodeSimpleAttributes already has a NotifyAccess method that will do the update while holding the lock. In addition, we were holding dfo.dir.mu.Lock while setting AccessTime, which is unnecessary, so that lock has been removed. PiperOrigin-RevId: 231278447 Change-Id: I81ed6d3dbc0b18e3f90c1df5e5a9c06132761769 --- pkg/sentry/fs/fsutil/inode.go | 47 +++++++++++++++++++++++++------------------ pkg/sentry/fs/ramfs/BUILD | 1 - pkg/sentry/fs/ramfs/dir.go | 5 +---- 3 files changed, 28 insertions(+), 25 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index f1f5ec1de..bd3bd1bb2 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -88,30 +88,37 @@ func (*NoReadWriteFileInode) GetFile(ctx context.Context, dirent *fs.Dirent, fla // // +stateify savable type InodeSimpleAttributes struct { - // FSType is the immutable filesystem type that will be returned by + // fsType is the immutable filesystem type that will be returned by // StatFS. - FSType uint64 + fsType uint64 // mu protects unstable. mu sync.RWMutex `state:"nosave"` - Unstable fs.UnstableAttr + unstable fs.UnstableAttr } -// NewInodeSimpleAttributes returns a new InodeSimpleAttributes. +// NewInodeSimpleAttributes returns a new InodeSimpleAttributes with the given +// owner and permissions, and all timestamps set to the current time. 
func NewInodeSimpleAttributes(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) InodeSimpleAttributes { + return NewInodeSimpleAttributesWithUnstable(fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: perms, + }), typ) +} + +// NewInodeSimpleAttributesWithUnstable returns a new InodeSimpleAttributes +// with the given unstable attributes. +func NewInodeSimpleAttributesWithUnstable(uattr fs.UnstableAttr, typ uint64) InodeSimpleAttributes { return InodeSimpleAttributes{ - FSType: typ, - Unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: owner, - Perms: perms, - }), + fsType: typ, + unstable: uattr, } } // UnstableAttr implements fs.InodeOperations.UnstableAttr. func (i *InodeSimpleAttributes) UnstableAttr(ctx context.Context, _ *fs.Inode) (fs.UnstableAttr, error) { i.mu.RLock() - u := i.Unstable + u := i.unstable i.mu.RUnlock() return u, nil } @@ -119,7 +126,7 @@ func (i *InodeSimpleAttributes) UnstableAttr(ctx context.Context, _ *fs.Inode) ( // SetPermissions implements fs.InodeOperations.SetPermissions. func (i *InodeSimpleAttributes) SetPermissions(ctx context.Context, _ *fs.Inode, p fs.FilePermissions) bool { i.mu.Lock() - i.Unstable.SetPermissions(ctx, p) + i.unstable.SetPermissions(ctx, p) i.mu.Unlock() return true } @@ -127,7 +134,7 @@ func (i *InodeSimpleAttributes) SetPermissions(ctx context.Context, _ *fs.Inode, // SetOwner implements fs.InodeOperations.SetOwner. func (i *InodeSimpleAttributes) SetOwner(ctx context.Context, _ *fs.Inode, owner fs.FileOwner) error { i.mu.Lock() - i.Unstable.SetOwner(ctx, owner) + i.unstable.SetOwner(ctx, owner) i.mu.Unlock() return nil } @@ -135,7 +142,7 @@ func (i *InodeSimpleAttributes) SetOwner(ctx context.Context, _ *fs.Inode, owner // SetTimestamps implements fs.InodeOperations.SetTimestamps. 
func (i *InodeSimpleAttributes) SetTimestamps(ctx context.Context, _ *fs.Inode, ts fs.TimeSpec) error { i.mu.Lock() - i.Unstable.SetTimestamps(ctx, ts) + i.unstable.SetTimestamps(ctx, ts) i.mu.Unlock() return nil } @@ -143,43 +150,43 @@ func (i *InodeSimpleAttributes) SetTimestamps(ctx context.Context, _ *fs.Inode, // AddLink implements fs.InodeOperations.AddLink. func (i *InodeSimpleAttributes) AddLink() { i.mu.Lock() - i.Unstable.Links++ + i.unstable.Links++ i.mu.Unlock() } // DropLink implements fs.InodeOperations.DropLink. func (i *InodeSimpleAttributes) DropLink() { i.mu.Lock() - i.Unstable.Links-- + i.unstable.Links-- i.mu.Unlock() } // StatFS implements fs.InodeOperations.StatFS. func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { - if i.FSType == 0 { + if i.fsType == 0 { return fs.Info{}, syserror.ENOSYS } - return fs.Info{Type: i.FSType}, nil + return fs.Info{Type: i.fsType}, nil } // NotifyAccess updates the access time. func (i *InodeSimpleAttributes) NotifyAccess(ctx context.Context) { i.mu.Lock() - i.Unstable.AccessTime = ktime.NowFromContext(ctx) + i.unstable.AccessTime = ktime.NowFromContext(ctx) i.mu.Unlock() } // NotifyModification updates the modification time. func (i *InodeSimpleAttributes) NotifyModification(ctx context.Context) { i.mu.Lock() - i.Unstable.ModificationTime = ktime.NowFromContext(ctx) + i.unstable.ModificationTime = ktime.NowFromContext(ctx) i.mu.Unlock() } // NotifyStatusChange updates the status change time. 
func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { i.mu.Lock() - i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) + i.unstable.StatusChangeTime = ktime.NowFromContext(ctx) i.mu.Unlock() } diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index a476c9cce..4a629e38e 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -18,7 +18,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 729f37694..696825eb5 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -415,9 +414,7 @@ func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serial Serializer: serializer, DirCursor: &dfo.dirCursor, } - dfo.dir.mu.Lock() - dfo.dir.InodeSimpleAttributes.Unstable.AccessTime = ktime.NowFromContext(ctx) - dfo.dir.mu.Unlock() + dfo.dir.InodeSimpleAttributes.NotifyAccess(ctx) return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset()) } -- cgit v1.2.3 From ae6e37df2abe450b30aba0908c212e9a1f81b84a Mon Sep 17 00:00:00 2001 From: Zhaozhong Ni Date: Mon, 28 Jan 2019 15:33:09 -0800 Subject: Convert TODO into FIXME. 
PiperOrigin-RevId: 231301228 Change-Id: I3e18f3a12a35fb89a22a8c981188268d5887dc61 --- pkg/sentry/fs/proc/task.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 41981a973..70578d3fa 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -76,9 +76,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "fd": newFdDir(t, msrc), "fdinfo": newFdInfoDir(t, msrc), "gid_map": newGIDMap(t, msrc), - // TODO: This is incorrect for /proc/[pid]/task/[tid]/io, i.e. if - // showSubtasks is false: - // https://elixir.bootlin.com/linux/v4.4/source/fs/proc/base.c#L3154 + // FIXME: create the correct io file for threads. "io": newIO(t, msrc), "maps": newMaps(t, msrc), "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), -- cgit v1.2.3 From cedff8d3aef3bc2055b1a7c3ad47a4c8297367ea Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Wed, 30 Jan 2019 11:48:02 -0800 Subject: Add muldiv/rd_tsc support for arm64 platform. 
Signed-off-by: Haibo Xu Change-Id: If35459be78e023346a140184401172f8e023c7f9 PiperOrigin-RevId: 231638020 --- pkg/sentry/time/BUILD | 5 ++- pkg/sentry/time/LICENSE | 27 ++++++++++++++++ pkg/sentry/time/arith_arm64.go | 70 ++++++++++++++++++++++++++++++++++++++++++ pkg/sentry/time/muldiv_arm64.s | 44 ++++++++++++++++++++++++++ pkg/sentry/time/tsc_arm64.s | 22 +++++++++++++ 5 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 pkg/sentry/time/LICENSE create mode 100644 pkg/sentry/time/arith_arm64.go create mode 100644 pkg/sentry/time/muldiv_arm64.s create mode 100644 pkg/sentry/time/tsc_arm64.s (limited to 'pkg/sentry') diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 5dadb8a2d..1191010e6 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) # Apache 2.0, portions BSD load("//tools/go_generics:defs.bzl", "go_template_instance") @@ -18,15 +18,18 @@ go_template_instance( go_library( name = "time", srcs = [ + "arith_arm64.go", "calibrated_clock.go", "clock_id.go", "clocks.go", "muldiv_amd64.s", + "muldiv_arm64.s", "parameters.go", "sampler.go", "sampler_unsafe.go", "seqatomic_parameters.go", "tsc_amd64.s", + "tsc_arm64.s", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/time", visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/time/LICENSE b/pkg/sentry/time/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/pkg/sentry/time/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/sentry/time/arith_arm64.go b/pkg/sentry/time/arith_arm64.go new file mode 100644 index 000000000..b94740c2a --- /dev/null +++ b/pkg/sentry/time/arith_arm64.go @@ -0,0 +1,70 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file provides a generic Go implementation of uint128 divided by uint64. + +// The code is derived from Go's generic math/big.divWW_g +// (src/math/big/arith.go), but is only used on ARM64. + +package time + +import "math/bits" + +type word uint + +const ( + _W = bits.UintSize // word size in bits + _W2 = _W / 2 // half word size in bits + _B2 = 1 << _W2 // half digit base + _M2 = _B2 - 1 // half digit mask +) + +// nlz returns the number of leading zeros in x. 
+// Wraps bits.LeadingZeros call for convenience. +func nlz(x word) uint { + return uint(bits.LeadingZeros(uint(x))) +} + +// q = (u1<<_W + u0 - r)/y +// Adapted from Warren, Hacker's Delight, p. 152. +func divWW(u1, u0, v word) (q, r word) { + if u1 >= v { + return 1<<_W - 1, 1<<_W - 1 + } + + s := nlz(v) + v <<= s + + vn1 := v >> _W2 + vn0 := v & _M2 + un32 := u1<>(_W-s) + un10 := u0 << s + un1 := un10 >> _W2 + un0 := un10 & _M2 + q1 := un32 / vn1 + rhat := un32 - q1*vn1 + + for q1 >= _B2 || q1*vn0 > _B2*rhat+un1 { + q1-- + rhat += vn1 + + if rhat >= _B2 { + break + } + } + + un21 := un32*_B2 + un1 - q1*v + q0 := un21 / vn1 + rhat = un21 - q0*vn1 + + for q0 >= _B2 || q0*vn0 > _B2*rhat+un0 { + q0-- + rhat += vn1 + if rhat >= _B2 { + break + } + } + + return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s +} diff --git a/pkg/sentry/time/muldiv_arm64.s b/pkg/sentry/time/muldiv_arm64.s new file mode 100644 index 000000000..5fa82a136 --- /dev/null +++ b/pkg/sentry/time/muldiv_arm64.s @@ -0,0 +1,44 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// Documentation is available in parameters.go. 
+// +// func muldiv64(value, multiplier, divisor uint64) (uint64, bool) +TEXT ·muldiv64(SB),NOSPLIT,$40-33 + MOVD value+0(FP), R0 + MOVD multiplier+8(FP), R1 + MOVD divisor+16(FP), R2 + + UMULH R0, R1, R3 + MUL R0, R1, R4 + + CMP R2, R3 + BHS overflow + + MOVD R3, 8(RSP) + MOVD R4, 16(RSP) + MOVD R2, 24(RSP) + CALL ·divWW(SB) + MOVD 32(RSP), R0 + MOVD R0, result+24(FP) + MOVD $1, R0 + MOVB R0, ok+32(FP) + RET + +overflow: + MOVD ZR, result+24(FP) + MOVB ZR, ok+32(FP) + RET diff --git a/pkg/sentry/time/tsc_arm64.s b/pkg/sentry/time/tsc_arm64.s new file mode 100644 index 000000000..c1c9760ef --- /dev/null +++ b/pkg/sentry/time/tsc_arm64.s @@ -0,0 +1,22 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +TEXT ·Rdtsc(SB),NOSPLIT,$0-8 + // Get the virtual counter. + ISB $15 + WORD $0xd53be040 //MRS CNTVCT_EL0, R0 + MOVD R0, ret+0(FP) + RET -- cgit v1.2.3 From 2a0c69b19f4b55c3f9777f0098a72af123ccff3c Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 31 Jan 2019 11:11:44 -0800 Subject: Remove license comments Nothing reads them and they can simply get stale. 
Generated with: $ sed -i "s/licenses(\(.*\)).*/licenses(\1)/" **/BUILD PiperOrigin-RevId: 231818945 Change-Id: Ibc3f9838546b7e94f13f217060d31f4ada9d4bf0 --- pkg/abi/BUILD | 2 +- pkg/abi/linux/BUILD | 2 +- pkg/amutex/BUILD | 2 +- pkg/atomicbitops/BUILD | 2 +- pkg/binary/BUILD | 2 +- pkg/bits/BUILD | 2 +- pkg/bpf/BUILD | 2 +- pkg/compressio/BUILD | 2 +- pkg/control/client/BUILD | 2 +- pkg/control/server/BUILD | 2 +- pkg/cpuid/BUILD | 2 +- pkg/dhcp/BUILD | 2 +- pkg/eventchannel/BUILD | 2 +- pkg/fd/BUILD | 2 +- pkg/fdnotifier/BUILD | 2 +- pkg/gate/BUILD | 2 +- pkg/ilist/BUILD | 2 +- pkg/linewriter/BUILD | 2 +- pkg/log/BUILD | 2 +- pkg/metric/BUILD | 2 +- pkg/p9/BUILD | 2 +- pkg/p9/local_server/BUILD | 2 +- pkg/p9/p9test/BUILD | 2 +- pkg/rand/BUILD | 2 +- pkg/refs/BUILD | 2 +- pkg/seccomp/BUILD | 2 +- pkg/secio/BUILD | 2 +- pkg/segment/BUILD | 2 +- pkg/segment/test/BUILD | 2 +- pkg/sentry/BUILD | 2 +- pkg/sentry/arch/BUILD | 2 +- pkg/sentry/context/BUILD | 2 +- pkg/sentry/context/contexttest/BUILD | 2 +- pkg/sentry/control/BUILD | 2 +- pkg/sentry/device/BUILD | 2 +- pkg/sentry/fs/BUILD | 2 +- pkg/sentry/fs/anon/BUILD | 2 +- pkg/sentry/fs/ashmem/BUILD | 2 +- pkg/sentry/fs/binder/BUILD | 2 +- pkg/sentry/fs/dev/BUILD | 2 +- pkg/sentry/fs/fdpipe/BUILD | 2 +- pkg/sentry/fs/filetest/BUILD | 2 +- pkg/sentry/fs/fsutil/BUILD | 2 +- pkg/sentry/fs/gofer/BUILD | 2 +- pkg/sentry/fs/host/BUILD | 2 +- pkg/sentry/fs/lock/BUILD | 2 +- pkg/sentry/fs/proc/BUILD | 2 +- pkg/sentry/fs/proc/device/BUILD | 2 +- pkg/sentry/fs/proc/seqfile/BUILD | 2 +- pkg/sentry/fs/ramfs/BUILD | 2 +- pkg/sentry/fs/sys/BUILD | 2 +- pkg/sentry/fs/timerfd/BUILD | 2 +- pkg/sentry/fs/tmpfs/BUILD | 2 +- pkg/sentry/fs/tty/BUILD | 2 +- pkg/sentry/hostcpu/BUILD | 2 +- pkg/sentry/inet/BUILD | 2 +- pkg/sentry/kernel/BUILD | 2 +- pkg/sentry/kernel/auth/BUILD | 2 +- pkg/sentry/kernel/contexttest/BUILD | 2 +- pkg/sentry/kernel/epoll/BUILD | 2 +- pkg/sentry/kernel/eventfd/BUILD | 2 +- pkg/sentry/kernel/fasync/BUILD | 2 +- 
pkg/sentry/kernel/futex/BUILD | 2 +- pkg/sentry/kernel/kdefs/BUILD | 2 +- pkg/sentry/kernel/memevent/BUILD | 2 +- pkg/sentry/kernel/pipe/BUILD | 2 +- pkg/sentry/kernel/sched/BUILD | 2 +- pkg/sentry/kernel/semaphore/BUILD | 2 +- pkg/sentry/kernel/shm/BUILD | 2 +- pkg/sentry/kernel/time/BUILD | 2 +- pkg/sentry/limits/BUILD | 2 +- pkg/sentry/loader/BUILD | 2 +- pkg/sentry/memmap/BUILD | 2 +- pkg/sentry/memutil/BUILD | 2 +- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/platform/BUILD | 2 +- pkg/sentry/platform/filemem/BUILD | 2 +- pkg/sentry/platform/interrupt/BUILD | 2 +- pkg/sentry/platform/kvm/BUILD | 2 +- pkg/sentry/platform/kvm/testutil/BUILD | 2 +- pkg/sentry/platform/procid/BUILD | 2 +- pkg/sentry/platform/ptrace/BUILD | 2 +- pkg/sentry/platform/ring0/BUILD | 2 +- pkg/sentry/platform/ring0/gen_offsets/BUILD | 2 +- pkg/sentry/platform/ring0/pagetables/BUILD | 2 +- pkg/sentry/platform/safecopy/BUILD | 2 +- pkg/sentry/safemem/BUILD | 2 +- pkg/sentry/sighandling/BUILD | 2 +- pkg/sentry/socket/BUILD | 2 +- pkg/sentry/socket/control/BUILD | 2 +- pkg/sentry/socket/epsocket/BUILD | 2 +- pkg/sentry/socket/hostinet/BUILD | 2 +- pkg/sentry/socket/netlink/BUILD | 2 +- pkg/sentry/socket/netlink/port/BUILD | 2 +- pkg/sentry/socket/netlink/route/BUILD | 2 +- pkg/sentry/socket/rpcinet/BUILD | 2 +- pkg/sentry/socket/rpcinet/conn/BUILD | 2 +- pkg/sentry/socket/rpcinet/notifier/BUILD | 2 +- pkg/sentry/socket/unix/BUILD | 2 +- pkg/sentry/socket/unix/transport/BUILD | 2 +- pkg/sentry/state/BUILD | 2 +- pkg/sentry/strace/BUILD | 2 +- pkg/sentry/syscalls/BUILD | 2 +- pkg/sentry/syscalls/linux/BUILD | 2 +- pkg/sentry/time/BUILD | 2 +- pkg/sentry/unimpl/BUILD | 2 +- pkg/sentry/uniqueid/BUILD | 2 +- pkg/sentry/usage/BUILD | 2 +- pkg/sentry/usermem/BUILD | 2 +- pkg/sentry/watchdog/BUILD | 2 +- pkg/sleep/BUILD | 2 +- pkg/state/BUILD | 2 +- pkg/state/statefile/BUILD | 2 +- pkg/sync/BUILD | 2 +- pkg/sync/atomicptrtest/BUILD | 2 +- pkg/sync/seqatomictest/BUILD | 2 +- pkg/syserr/BUILD | 2 +- 
pkg/syserror/BUILD | 2 +- pkg/tcpip/BUILD | 2 +- pkg/tcpip/adapters/gonet/BUILD | 2 +- pkg/tcpip/buffer/BUILD | 2 +- pkg/tcpip/checker/BUILD | 2 +- pkg/tcpip/hash/jenkins/BUILD | 2 +- pkg/tcpip/header/BUILD | 2 +- pkg/tcpip/link/channel/BUILD | 2 +- pkg/tcpip/link/fdbased/BUILD | 2 +- pkg/tcpip/link/loopback/BUILD | 2 +- pkg/tcpip/link/rawfile/BUILD | 2 +- pkg/tcpip/link/sharedmem/BUILD | 2 +- pkg/tcpip/link/sharedmem/pipe/BUILD | 2 +- pkg/tcpip/link/sharedmem/queue/BUILD | 2 +- pkg/tcpip/link/sniffer/BUILD | 2 +- pkg/tcpip/link/tun/BUILD | 2 +- pkg/tcpip/link/waitable/BUILD | 2 +- pkg/tcpip/network/BUILD | 2 +- pkg/tcpip/network/arp/BUILD | 2 +- pkg/tcpip/network/fragmentation/BUILD | 2 +- pkg/tcpip/network/hash/BUILD | 2 +- pkg/tcpip/network/ipv4/BUILD | 2 +- pkg/tcpip/network/ipv6/BUILD | 2 +- pkg/tcpip/ports/BUILD | 2 +- pkg/tcpip/sample/tun_tcp_connect/BUILD | 2 +- pkg/tcpip/sample/tun_tcp_echo/BUILD | 2 +- pkg/tcpip/seqnum/BUILD | 2 +- pkg/tcpip/stack/BUILD | 2 +- pkg/tcpip/transport/ping/BUILD | 2 +- pkg/tcpip/transport/tcp/BUILD | 2 +- pkg/tcpip/transport/tcp/testing/context/BUILD | 2 +- pkg/tcpip/transport/tcpconntrack/BUILD | 2 +- pkg/tcpip/transport/udp/BUILD | 2 +- pkg/tmutex/BUILD | 2 +- pkg/unet/BUILD | 2 +- pkg/urpc/BUILD | 2 +- pkg/waiter/BUILD | 2 +- runsc/boot/BUILD | 2 +- runsc/boot/filter/BUILD | 2 +- runsc/cgroup/BUILD | 2 +- runsc/cmd/BUILD | 2 +- runsc/console/BUILD | 2 +- runsc/container/BUILD | 2 +- runsc/fsgofer/BUILD | 2 +- runsc/fsgofer/filter/BUILD | 2 +- runsc/sandbox/BUILD | 2 +- runsc/specutils/BUILD | 2 +- runsc/test/image/BUILD | 2 +- runsc/test/integration/BUILD | 2 +- runsc/test/root/BUILD | 2 +- runsc/test/root/testdata/BUILD | 2 +- runsc/test/testutil/BUILD | 2 +- runsc/tools/dockercfg/BUILD | 2 +- test/syscalls/BUILD | 2 +- test/syscalls/gtest/BUILD | 2 +- test/syscalls/linux/BUILD | 2 +- test/util/BUILD | 2 +- tools/go_generics/BUILD | 2 +- tools/go_generics/globals/BUILD | 2 +- tools/go_generics/go_merge/BUILD | 2 +- 
tools/go_generics/rules_tests/BUILD | 2 +- tools/go_stateify/BUILD | 2 +- vdso/BUILD | 2 +- 180 files changed, 180 insertions(+), 180 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index 1ba4f3a46..323263ebf 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index e6043abf4..7648c9469 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -2,7 +2,7 @@ # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix # when the host OS may not be Linux. -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 7cda07418..bdb6e8f2c 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "amutex", diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index 235188531..9555bf645 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "atomicbitops", diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD index 571151f72..bd37376b0 100644 --- a/pkg/binary/BUILD +++ b/pkg/binary/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "binary", diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD index 46794bdb8..5214b2c24 100644 --- a/pkg/bits/BUILD +++ b/pkg/bits/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", 
"go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD index 564df3af5..3c7ae3103 100644 --- a/pkg/bpf/BUILD +++ b/pkg/bpf/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index 72952d735..3a0ac64e6 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "compressio", diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD index 32853875d..22a4a4a5a 100644 --- a/pkg/control/client/BUILD +++ b/pkg/control/client/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "client", diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index ba2b1be9f..76b2e9787 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "server", diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index 46fc4703b..29cc38778 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/dhcp/BUILD b/pkg/dhcp/BUILD index c97dfc14b..003620b48 100644 --- a/pkg/dhcp/BUILD +++ b/pkg/dhcp/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) 
go_library( name = "dhcp", diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 18348ef54..5c2a44aa1 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "eventchannel", diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD index 06cfd445e..ab1109157 100644 --- a/pkg/fd/BUILD +++ b/pkg/fd/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "fd", diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD index 27d378d5b..8c8d193cc 100644 --- a/pkg/fdnotifier/BUILD +++ b/pkg/fdnotifier/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "fdnotifier", diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index 9a87a3a31..83679f2da 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "gate", diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index a67aa2cff..dbd65ab12 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,7 +1,7 @@ load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ilist", diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index 3f28ba867..d1aa2e7d6 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 
+package(licenses = ["notice"]) go_library( name = "linewriter", diff --git a/pkg/log/BUILD b/pkg/log/BUILD index 94ac66db3..b2d18eddb 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "log", diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index d96e5563b..4b2c7a00e 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "metric", diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index 2c224e65b..5d972309d 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -2,7 +2,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//visibility:public"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) go_library( diff --git a/pkg/p9/local_server/BUILD b/pkg/p9/local_server/BUILD index b17ebb79d..aa6db186c 100644 --- a/pkg/p9/local_server/BUILD +++ b/pkg/p9/local_server/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "local_server", diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index 7c4b875ce..cf22edde8 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) alias( name = "mockgen", diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index 0c9efc709..4eec3a4dd 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # 
Apache 2.0 +package(licenses = ["notice"]) go_library( name = "rand", diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 98150ba8f..fc562f821 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index 657f923ed..0e9c4692d 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "victim", diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD index 29f751725..2b4b87c61 100644 --- a/pkg/secio/BUILD +++ b/pkg/secio/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "secio", diff --git a/pkg/segment/BUILD b/pkg/segment/BUILD index 964d73af8..700385907 100644 --- a/pkg/segment/BUILD +++ b/pkg/segment/BUILD @@ -1,6 +1,6 @@ package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) load("//tools/go_generics:defs.bzl", "go_template") diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD index bdf53e24e..81e929b8c 100644 --- a/pkg/segment/test/BUILD +++ b/pkg/segment/test/BUILD @@ -2,7 +2,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//visibility:private"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD index d18cf3555..53989301f 100644 --- a/pkg/sentry/BUILD +++ b/pkg/sentry/BUILD @@ -1,7 +1,7 @@ # This BUILD file defines a 
package_group that allows for interdependencies for # sentry-internal packages. -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) package_group( name = "internal", diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 9bf04360a..0c044bc33 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD index 02d24defd..a3c8d0177 100644 --- a/pkg/sentry/context/BUILD +++ b/pkg/sentry/context/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "context", diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 01bb40b04..bed156b70 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index c3b682d6f..f54e01ee8 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "control", diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index bebdb2939..01de708d3 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "device", diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 
6f368b0da..e58333da3 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index 4bd912e95..2111df2e8 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "anon", diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index e5bb661b5..dcf620dca 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index 27155819e..8a448175f 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 85371032a..e5b962c8c 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 8a0937cda..098463e97 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/filetest/BUILD 
b/pkg/sentry/fs/filetest/BUILD index d137fee4c..05ca72aa0 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index d4767642b..7dff970ea 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 35ffadd13..f2c79b475 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 6877eb161..ea2ca11bf 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 3159ff1da..7164744b8 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 74954f213..f6bc90634 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff 
--git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD index ff7dacf07..64b0c5a3a 100644 --- a/pkg/sentry/fs/proc/device/BUILD +++ b/pkg/sentry/fs/proc/device/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "device", diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index b4ba64e10..6b44c0075 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 4a629e38e..f36e4a5e8 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 7de928e16..42e98230e 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index ffdd7e0dc..0e06a5028 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index c5ec85460..bf5b68869 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/fs/tty/BUILD 
b/pkg/sentry/fs/tty/BUILD index 011cb6955..bee2db3f3 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD index 33197cf14..b5067ae6d 100644 --- a/pkg/sentry/hostcpu/BUILD +++ b/pkg/sentry/hostcpu/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "hostcpu", diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 159c50efb..e288d34e9 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -1,6 +1,6 @@ package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 7d41626dc..b230aff98 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index a81085372..abd4f2dae 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index 391986291..5769a3b28 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -1,4 +1,4 @@ 
-package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 5e8b36ed6..1567d5050 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index d96803fc9..f2f1a1223 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 17749c0de..5faf95909 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index afd35985f..da24c36c1 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD index 3f8fa206c..38aaca134 100644 --- a/pkg/sentry/kernel/kdefs/BUILD +++ b/pkg/sentry/kernel/kdefs/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "kdefs", diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index dfd8dd062..347a69062 
100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "memevent", diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 19b23c6d2..011a3f349 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index 52e226a39..184e8a35b 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sched", diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index bdcf4ce5c..840943ca8 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 40e641355..f45770eef 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 5d8db2273..584f7c7cc 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 
+package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 90f4395d4..800166675 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 24e734b49..1ea260a4e 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index c9e0b95a0..9c2cbd18b 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD index 88738d65d..68b03d4cc 100644 --- a/pkg/sentry/memutil/BUILD +++ b/pkg/sentry/memutil/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "memutil", diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 0997ec0a7..f679262d0 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index af9ba5394..ac8a6cb7f 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,4 +1,4 @@ 
-package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD index 2a5982763..1a61cfaa5 100644 --- a/pkg/sentry/platform/filemem/BUILD +++ b/pkg/sentry/platform/filemem/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index dbafa3204..eeccd4d0e 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "interrupt", diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 1b71e629f..6e40b3177 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index 1dffe94a4..e10087e8e 100644 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "testutil", diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/sentry/platform/procid/BUILD index 20c8bc02c..277509624 100644 --- a/pkg/sentry/platform/procid/BUILD +++ b/pkg/sentry/platform/procid/BUILD @@ -1,6 +1,6 @@ 
load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "procid", diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 2eb354ad4..f86790942 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ptrace", diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index c35d49f2d..ecb3e9a9c 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index b76d7974e..d7029d5a9 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index de1b920af..fe93d3030 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index cb8347dd8..05a6a61ae 100644 --- 
a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0, portions BSD, MIT +package(licenses = ["notice"]) go_library( name = "safecopy", diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD index 87a9bff12..3ab453718 100644 --- a/pkg/sentry/safemem/BUILD +++ b/pkg/sentry/safemem/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "safemem", diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index 41313d334..cec3af92e 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sentry/sighandling/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sighandling", diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 3a8044b5f..076f953e7 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index d3a63f15f..9f4763906 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index da4aaf510..45e418db3 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git 
a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index b8dceb102..a469af7ac 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index cff922cb8..148306329 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 3a7dbc5ed..a7370a4ec 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index e1bcfe252..be0419679 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 06e121946..4da14a1e0 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "rpcinet", diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD index a16977f29..4336ae9b4 100644 --- a/pkg/sentry/socket/rpcinet/conn/BUILD +++ 
b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # BSD +package(licenses = ["notice"]) go_library( name = "conn", diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD index 2bab01774..b0b107ddb 100644 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # BSD +package(licenses = ["notice"]) go_library( name = "notifier", diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index a12fa93db..fe6871cc6 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 5a90837bc..5a2de0c4c 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index f1f6fdb7d..42c459acc 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "state", diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 8517db1ac..552e79686 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 
+package(licenses = ["notice"]) go_library( name = "strace", diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 35192ff49..6b5469e45 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "syscalls", diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 7621bfdbd..846601881 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 1191010e6..c4b6dcc63 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0, portions BSD +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index 42e24ace5..b608867a9 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) proto_library( name = "unimplemented_syscall_proto", diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index 0929497c3..ccc5a28d3 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "uniqueid", diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index 868dfd400..09198496b 100644 --- a/pkg/sentry/usage/BUILD +++ 
b/pkg/sentry/usage/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index dae41ed0e..1a560b6f3 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD index b2c687b20..0bbf3705c 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "watchdog", diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD index 338fd9336..2b005bf66 100644 --- a/pkg/sleep/BUILD +++ b/pkg/sleep/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sleep", diff --git a/pkg/state/BUILD b/pkg/state/BUILD index dd0f250fa..0a975e162 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -1,7 +1,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD index 66c8f3807..5967781e8 100644 --- a/pkg/state/statefile/BUILD +++ b/pkg/state/statefile/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "statefile", diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 
6ddc6e812..1624e681c 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -2,7 +2,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0, portions BSD + licenses = ["notice"], ) load("//tools/go_generics:defs.bzl", "go_template") diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD index 9cb7f66fe..198fbb895 100644 --- a/pkg/sync/atomicptrtest/BUILD +++ b/pkg/sync/atomicptrtest/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD index 54f8e59b1..23132650a 100644 --- a/pkg/sync/seqatomictest/BUILD +++ b/pkg/sync/seqatomictest/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index 30ae20772..0d65115ef 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "syserr", diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD index d4c6da97a..ac478d0ff 100644 --- a/pkg/syserror/BUILD +++ b/pkg/syserror/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "syserror", diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index daff9a0a0..83524cc8a 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", 
"go_library", "go_test") diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index 723ad668f..ee2417238 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "gonet", diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index 11a725423..648d12cdf 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD index a1de808b9..f597d0b24 100644 --- a/pkg/tcpip/checker/BUILD +++ b/pkg/tcpip/checker/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "checker", diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD index bbb764db8..ce2194a4d 100644 --- a/pkg/tcpip/hash/jenkins/BUILD +++ b/pkg/tcpip/hash/jenkins/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "jenkins", diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index 8e455fe1e..a5c7290ee 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index 25f6c1457..ae285e495 100644 --- a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 
+package(licenses = ["notice"]) go_library( name = "channel", diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index a4aa3feec..0d78c9b15 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "fdbased", diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD index a46ba7f11..710a05ede 100644 --- a/pkg/tcpip/link/loopback/BUILD +++ b/pkg/tcpip/link/loopback/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "loopback", diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD index 2746d4ced..f01bb2c07 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "rawfile", diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index d7f1e66ef..dc8f1543e 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sharedmem", diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index 12e813509..85deafa38 100644 --- a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "pipe", diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD index 
661037bb2..d7dc631eb 100644 --- a/pkg/tcpip/link/sharedmem/queue/BUILD +++ b/pkg/tcpip/link/sharedmem/queue/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "queue", diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index 52e237c25..7d0d1781e 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sniffer", diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index 5ec01cec9..e54852d3f 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "tun", diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index ba495c437..89a9eee23 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "waitable", diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index a2a07f533..f36f49453 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_test( name = "ip_test", diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index f6fb7daf7..ef18bb93d 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) 
go_library( name = "arp", diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index aaabfcb9a..bf0a7b99c 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD index 401dce646..ea520c6ed 100644 --- a/pkg/tcpip/network/hash/BUILD +++ b/pkg/tcpip/network/hash/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "hash", diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index e72317e9f..7a5341def 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ipv4", diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index 808c37df3..000e00dba 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ipv6", diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index a2fa9b84a..3ee80c62b 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "ports", diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD index 32baf2115..996939581 100644 --- 
a/pkg/tcpip/sample/tun_tcp_connect/BUILD +++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "tun_tcp_connect", diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD index 760445843..dad8ef399 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/BUILD +++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "tun_tcp_echo", diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD index c5c889239..a63665efc 100644 --- a/pkg/tcpip/seqnum/BUILD +++ b/pkg/tcpip/seqnum/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 8a598c57d..551c3c73e 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/transport/ping/BUILD b/pkg/tcpip/transport/ping/BUILD index 982b6795c..4d4241d4b 100644 --- a/pkg/tcpip/transport/ping/BUILD +++ b/pkg/tcpip/transport/ping/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 726107739..e5c05f8c0 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") 
load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD index 814e5c1ea..1584e4095 100644 --- a/pkg/tcpip/transport/tcp/testing/context/BUILD +++ b/pkg/tcpip/transport/tcp/testing/context/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "context", diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD index ac1a94d4d..31a845dee 100644 --- a/pkg/tcpip/transport/tcpconntrack/BUILD +++ b/pkg/tcpip/transport/tcpconntrack/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "tcpconntrack", diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 4225e28dc..8ccb79c48 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index c20df7005..69035044d 100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "tmutex", diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index f90e43c89..5e177e78e 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "unet", diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD index 21008cf6c..36cae67e1 100644 --- 
a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -1,6 +1,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "urpc", diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 5e611c54f..b748246da 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,4 +1,4 @@ -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_stateify:defs.bzl", "go_library", "go_test") diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 15a7cdae1..540e99151 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "boot", diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 004222242..3b6020cf3 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "filter", diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index 4f9a25a25..620d33a19 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "cgroup", diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index a908172af..9e2be0d37 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "cmd", diff --git a/runsc/console/BUILD b/runsc/console/BUILD index ff4ccff69..3ff9eba27 100644 --- a/runsc/console/BUILD +++ b/runsc/console/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") 
-package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "console", diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 354ce2661..3b25ff79a 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "container", diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 756c20ad7..4adc9c1bc 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "fsgofer", diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index c7848d10c..78c5b526c 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "filter", diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 899fd99de..2ed793333 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "sandbox", diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 77a10e2b6..372799850 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "specutils", diff --git a/runsc/test/image/BUILD b/runsc/test/image/BUILD index 22b3ebd2a..e8b629c6a 100644 --- a/runsc/test/image/BUILD +++ b/runsc/test/image/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", 
"go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_test( name = "image_test", diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index e7204dc66..779d30ec9 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_test( name = "integration_test", diff --git a/runsc/test/root/BUILD b/runsc/test/root/BUILD index 77dcbd79e..75826a521 100644 --- a/runsc/test/root/BUILD +++ b/runsc/test/root/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "root", diff --git a/runsc/test/root/testdata/BUILD b/runsc/test/root/testdata/BUILD index 6c9fe0aea..7f272dcd3 100644 --- a/runsc/test/root/testdata/BUILD +++ b/runsc/test/root/testdata/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "testdata", diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 8c3919320..ddec81444 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "testutil", diff --git a/runsc/tools/dockercfg/BUILD b/runsc/tools/dockercfg/BUILD index a80b3abab..fd406ab93 100644 --- a/runsc/tools/dockercfg/BUILD +++ b/runsc/tools/dockercfg/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "dockercfg", diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 8c391c8a6..148d9c366 100644 --- 
a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -1,7 +1,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") load("//test/syscalls:build_defs.bzl", "syscall_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) syscall_test(test = "//test/syscalls/linux:32bit_test") diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD index d078fd3d5..22e061652 100644 --- a/test/syscalls/gtest/BUILD +++ b/test/syscalls/gtest/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "gtest", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index e70742875..a311ca12c 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1,6 +1,6 @@ package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) cc_binary( diff --git a/test/util/BUILD b/test/util/BUILD index f2e563507..fac0730b4 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -1,6 +1,6 @@ package( default_visibility = ["//:sandbox"], - licenses = ["notice"], # Apache 2.0 + licenses = ["notice"], ) cc_library( diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD index 2d97d99dc..39318b877 100644 --- a/tools/go_generics/BUILD +++ b/tools/go_generics/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "go_generics", diff --git a/tools/go_generics/globals/BUILD b/tools/go_generics/globals/BUILD index c26ac56d2..6628132f5 100644 --- a/tools/go_generics/globals/BUILD +++ b/tools/go_generics/globals/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_library( name = "globals", diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD index 
a60437962..02b09120e 100644 --- a/tools/go_generics/go_merge/BUILD +++ b/tools/go_generics/go_merge/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "go_merge", diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD index 23b2d656d..a6f8cdd3c 100644 --- a/tools/go_generics/rules_tests/BUILD +++ b/tools/go_generics/rules_tests/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD index 68d37f5d7..bb53f8ae9 100644 --- a/tools/go_stateify/BUILD +++ b/tools/go_stateify/BUILD @@ -1,6 +1,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) go_binary( name = "stateify", diff --git a/vdso/BUILD b/vdso/BUILD index fd395511c..c43d24070 100644 --- a/vdso/BUILD +++ b/vdso/BUILD @@ -3,7 +3,7 @@ # normal system VDSO (time, gettimeofday, clock_gettimeofday) but which uses # timekeeping parameters managed by the sandbox kernel. -package(licenses = ["notice"]) # Apache 2.0 +package(licenses = ["notice"]) genrule( name = "vdso", -- cgit v1.2.3 From a497f5ed5f97e4ad49ed60dd46f0146ae45eefd6 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 31 Jan 2019 12:53:00 -0800 Subject: Invalidate COW mappings when file is truncated This changed required making fsutil.HostMappable use a backing file to ensure the correct FD would be used for read/write operations. RELNOTES: relnotes is needed for the parent CL. 
PiperOrigin-RevId: 231836164 Change-Id: I8ae9639715529874ea7d80a65e2c711a5b4ce254 --- pkg/sentry/fs/fsutil/BUILD | 1 - pkg/sentry/fs/fsutil/host_mappable.go | 104 +++++++++++++++++++++------- pkg/sentry/fs/fsutil/host_mappable_state.go | 22 ------ pkg/sentry/fs/gofer/file.go | 3 + pkg/sentry/fs/gofer/inode.go | 6 +- pkg/sentry/fs/gofer/session.go | 17 ++--- 6 files changed, 91 insertions(+), 62 deletions(-) delete mode 100644 pkg/sentry/fs/fsutil/host_mappable_state.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 7dff970ea..d41fc17cc 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -71,7 +71,6 @@ go_library( "host_file_mapper_state.go", "host_file_mapper_unsafe.go", "host_mappable.go", - "host_mappable_state.go", "inode.go", "inode_cached.go", ], diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 4e4bcf4a4..340f8d288 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -15,57 +15,50 @@ package fsutil import ( + "math" "sync" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// HostMappable implements memmap.Mappable and platform.File over an arbitrary -// host file descriptor. +// HostMappable implements memmap.Mappable and platform.File over a +// CachedFileObject. 
+// +// Lock order (compare the lock order model in mm/mm.go): +// truncateMu ("fs locks") +// mu ("memmap.Mappable locks not taken by Translate") +// ("platform.File locks") +// backingFile ("CachedFileObject locks") // // +stateify savable type HostMappable struct { hostFileMapper *HostFileMapper - mu sync.Mutex `state:"nosave"` + backingFile CachedFileObject - // fd is the file descriptor to the host. Protected by mu. - fd int `state:"nosave"` + mu sync.Mutex `state:"nosave"` // mappings tracks mappings of the cached file object into // memmap.MappingSpaces so it can invalidated upon save. Protected by mu. mappings memmap.MappingSet + + // truncateMu protects writes and truncations. See Truncate() for details. + truncateMu sync.RWMutex `state:"nosave"` } // NewHostMappable creates a new mappable that maps directly to host FD. -func NewHostMappable() *HostMappable { +func NewHostMappable(backingFile CachedFileObject) *HostMappable { return &HostMappable{ hostFileMapper: NewHostFileMapper(), - fd: -1, + backingFile: backingFile, } } -func (h *HostMappable) getFD() int { - h.mu.Lock() - defer h.mu.Unlock() - if h.fd < 0 { - panic("HostMappable FD isn't set") - } - return h.fd -} - -// UpdateFD sets the host FD iff FD hasn't been set before or if there are -// no mappings. -func (h *HostMappable) UpdateFD(fd int) { - h.mu.Lock() - defer h.mu.Unlock() - h.fd = fd -} - // AddMapping implements memmap.Mappable.AddMapping. func (h *HostMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { // Hot path. Avoid defers. @@ -115,12 +108,12 @@ func (h *HostMappable) InvalidateUnsavable(ctx context.Context) error { // MapInto implements platform.File.MapInto. 
func (h *HostMappable) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { - return as.MapFile(addr, h.getFD(), fr, at, precommit) + return as.MapFile(addr, h.backingFile.FD(), fr, at, precommit) } // MapInternal implements platform.File.MapInternal. func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { - return h.hostFileMapper.MapInternal(fr, h.getFD(), at.Write) + return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write) } // IncRef implements platform.File.IncRef. @@ -134,3 +127,62 @@ func (h *HostMappable) DecRef(fr platform.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.DecRefOn(mr) } + +// Truncate truncates the file, invalidating any mapping that may have been +// removed after the size change. +// +// Truncation and writes are synchronized to prevent races where writes make the +// file grow between truncation and invalidation below: +// T1: Calls SetMaskedAttributes and stalls +// T2: Appends to file causing it to grow +// T2: Writes to mapped pages and COW happens +// T1: Continues and wronly invalidates the page mapped in step above. +func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error { + h.truncateMu.Lock() + defer h.truncateMu.Unlock() + + mask := fs.AttrMask{Size: true} + attr := fs.UnstableAttr{Size: newSize} + if err := h.backingFile.SetMaskedAttributes(ctx, mask, attr); err != nil { + return err + } + + // Invalidate COW mappings that may exist beyond the new size in case the file + // is being shrunk. Other mappinsg don't need to be invalidated because + // translate will just return identical mappings after invalidation anyway, + // and SIGBUS will be raised and handled when the mappings are touched. + // + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). 
+ h.mu.Lock() + defer h.mu.Unlock() + mr := memmap.MappableRange{ + Start: fs.OffsetPageEnd(newSize), + End: fs.OffsetPageEnd(math.MaxInt64), + } + h.mappings.Invalidate(mr, memmap.InvalidateOpts{InvalidatePrivate: true}) + + return nil +} + +// Write writes to the file backing this mappable. +func (h *HostMappable) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + h.truncateMu.RLock() + n, err := src.CopyInTo(ctx, &writer{ctx: ctx, hostMappable: h, off: offset}) + h.truncateMu.RUnlock() + return n, err +} + +type writer struct { + ctx context.Context + hostMappable *HostMappable + off int64 +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (w *writer) WriteFromBlocks(src safemem.BlockSeq) (uint64, error) { + n, err := w.hostMappable.backingFile.WriteFromBlocksAt(w.ctx, src, uint64(w.off)) + w.off += int64(n) + return n, err +} diff --git a/pkg/sentry/fs/fsutil/host_mappable_state.go b/pkg/sentry/fs/fsutil/host_mappable_state.go deleted file mode 100644 index 765f1ec87..000000000 --- a/pkg/sentry/fs/fsutil/host_mappable_state.go +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fsutil - -// afterLoad is invoked by stateify. 
-func (h *HostMappable) afterLoad() { - h.mu.Lock() - defer h.mu.Unlock() - h.fd = -1 -} diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 2181ddc68..2bb25daf1 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -215,6 +215,9 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I } return n, err } + if f.inodeOperations.fileState.hostMappable != nil { + return f.inodeOperations.fileState.hostMappable.Write(ctx, src, offset) + } return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) } diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 043705c58..1dc0ca0db 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -170,9 +170,6 @@ func (i *inodeFileState) setHandlesForCachedIO(flags fs.FileFlags, h *handles) { i.writebackRW = true } } - if i.hostMappable != nil { - i.hostMappable.UpdateFD(i.fdLocked()) - } } // getCachedHandles returns any cached handles which would accelerate @@ -520,6 +517,9 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps.Truncate(ctx, inode, length) } + if i.session().cachePolicy == cacheRemoteRevalidating { + return i.fileState.hostMappable.Truncate(ctx, length) + } return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)}) } diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index b5b1c8202..d626b86f5 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -197,17 +197,14 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p } } - var hm *fsutil.HostMappable - if s.cachePolicy == cacheRemoteRevalidating && fs.IsFile(sattr) { - hm = fsutil.NewHostMappable() - } - fileState := &inodeFileState{ - s: s, - file: file, - sattr: sattr, - key: deviceKey, - hostMappable: hm, + s: s, + 
file: file, + sattr: sattr, + key: deviceKey, + } + if s.cachePolicy == cacheRemoteRevalidating && fs.IsFile(sattr) { + fileState.hostMappable = fsutil.NewHostMappable(fileState) } uattr := unstable(ctx, valid, attr, s.mounter, s.client) -- cgit v1.2.3 From 88b4ce8cac9c438da472205e0e710dd75e73b050 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 31 Jan 2019 15:01:57 -0800 Subject: Fix comment PiperOrigin-RevId: 231861005 Change-Id: I134d4e20cc898d44844219db0a8aacda87e11ef0 --- pkg/sentry/fs/proc/mounts.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 0b0e87528..7111e5c0f 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -153,7 +153,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountInfoFile)(nil)}}, 0 } -// mountsFile is used to implement /proc/[pid]/mountinfo. +// mountsFile is used to implement /proc/[pid]/mounts. 
// // +stateify savable type mountsFile struct { -- cgit v1.2.3 From fe1369ac98a4f1d8af5e8be6da71165339e52034 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 31 Jan 2019 17:47:24 -0800 Subject: Move package sync to third_party PiperOrigin-RevId: 231889261 Change-Id: I482f1df055bcedf4edb9fe3fe9b8e9c80085f1a0 --- pkg/sentry/kernel/BUILD | 4 +- pkg/sentry/kernel/futex/BUILD | 2 +- pkg/sentry/kernel/task.go | 4 +- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/mm/mm.go | 6 +- pkg/sentry/time/BUILD | 4 +- pkg/sync/BUILD | 50 ------- pkg/sync/LICENSE | 27 ---- pkg/sync/atomicptr_unsafe.go | 46 ------ pkg/sync/atomicptrtest/BUILD | 29 ---- pkg/sync/atomicptrtest/atomicptr_test.go | 40 ----- pkg/sync/downgradable_rwmutex_test.go | 149 ------------------- pkg/sync/downgradable_rwmutex_unsafe.go | 143 ------------------ pkg/sync/memmove_unsafe.go | 32 ---- pkg/sync/norace_unsafe.go | 44 ------ pkg/sync/race_unsafe.go | 50 ------- pkg/sync/seqatomic_unsafe.go | 81 ----------- pkg/sync/seqatomictest/BUILD | 35 ----- pkg/sync/seqatomictest/seqatomic_test.go | 132 ----------------- pkg/sync/seqcount.go | 158 -------------------- pkg/sync/seqcount_test.go | 162 --------------------- pkg/sync/sync.go | 16 -- third_party/gvsync/BUILD | 50 +++++++ third_party/gvsync/LICENSE | 27 ++++ third_party/gvsync/README.md | 3 + third_party/gvsync/atomicptr_unsafe.go | 37 +++++ third_party/gvsync/atomicptrtest/BUILD | 29 ++++ third_party/gvsync/atomicptrtest/atomicptr_test.go | 31 ++++ third_party/gvsync/downgradable_rwmutex_test.go | 150 +++++++++++++++++++ third_party/gvsync/downgradable_rwmutex_unsafe.go | 144 ++++++++++++++++++ third_party/gvsync/gvsync.go | 7 + third_party/gvsync/memmove_unsafe.go | 23 +++ third_party/gvsync/norace_unsafe.go | 35 +++++ third_party/gvsync/race_unsafe.go | 41 ++++++ third_party/gvsync/seqatomic_unsafe.go | 72 +++++++++ third_party/gvsync/seqatomictest/BUILD | 35 +++++ third_party/gvsync/seqatomictest/seqatomic_test.go | 132 +++++++++++++++++ 
third_party/gvsync/seqcount.go | 149 +++++++++++++++++++ third_party/gvsync/seqcount_test.go | 153 +++++++++++++++++++ 39 files changed, 1129 insertions(+), 1205 deletions(-) delete mode 100644 pkg/sync/BUILD delete mode 100644 pkg/sync/LICENSE delete mode 100644 pkg/sync/atomicptr_unsafe.go delete mode 100644 pkg/sync/atomicptrtest/BUILD delete mode 100644 pkg/sync/atomicptrtest/atomicptr_test.go delete mode 100644 pkg/sync/downgradable_rwmutex_test.go delete mode 100644 pkg/sync/downgradable_rwmutex_unsafe.go delete mode 100644 pkg/sync/memmove_unsafe.go delete mode 100644 pkg/sync/norace_unsafe.go delete mode 100644 pkg/sync/race_unsafe.go delete mode 100644 pkg/sync/seqatomic_unsafe.go delete mode 100644 pkg/sync/seqatomictest/BUILD delete mode 100644 pkg/sync/seqatomictest/seqatomic_test.go delete mode 100644 pkg/sync/seqcount.go delete mode 100644 pkg/sync/seqcount_test.go delete mode 100644 pkg/sync/sync.go create mode 100644 third_party/gvsync/BUILD create mode 100644 third_party/gvsync/LICENSE create mode 100644 third_party/gvsync/README.md create mode 100644 third_party/gvsync/atomicptr_unsafe.go create mode 100644 third_party/gvsync/atomicptrtest/BUILD create mode 100644 third_party/gvsync/atomicptrtest/atomicptr_test.go create mode 100644 third_party/gvsync/downgradable_rwmutex_test.go create mode 100644 third_party/gvsync/downgradable_rwmutex_unsafe.go create mode 100644 third_party/gvsync/gvsync.go create mode 100644 third_party/gvsync/memmove_unsafe.go create mode 100644 third_party/gvsync/norace_unsafe.go create mode 100644 third_party/gvsync/race_unsafe.go create mode 100644 third_party/gvsync/seqatomic_unsafe.go create mode 100644 third_party/gvsync/seqatomictest/BUILD create mode 100644 third_party/gvsync/seqatomictest/seqatomic_test.go create mode 100644 third_party/gvsync/seqcount.go create mode 100644 third_party/gvsync/seqcount_test.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 
b230aff98..773cb8c91 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -34,7 +34,7 @@ go_template_instance( out = "seqatomic_taskgoroutineschedinfo.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", - template = "//pkg/sync:generic_seqatomic", + template = "//third_party/gvsync:generic_seqatomic", types = { "Value": "TaskGoroutineSchedInfo", }, @@ -183,12 +183,12 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/state/statefile", - "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", "//pkg/waiter", + "//third_party/gvsync", ], ) diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index da24c36c1..91feeb5ed 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -8,7 +8,7 @@ go_template_instance( out = "atomicptr_bucket.go", package = "futex", suffix = "Bucket", - template = "//pkg/sync:generic_atomicptr", + template = "//third_party/gvsync:generic_atomicptr", types = { "Value": "bucket", }, diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2982bc5d1..702e40cce 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -34,7 +34,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - ssync "gvisor.googlesource.com/gvisor/pkg/sync" + "gvisor.googlesource.com/gvisor/third_party/gvsync" ) // Task represents a thread of execution in the untrusted app. It @@ -81,7 +81,7 @@ type Task struct { // // gosched is protected by goschedSeq. gosched is owned by the task // goroutine. 
- goschedSeq ssync.SeqCount `state:"nosave"` + goschedSeq gvsync.SeqCount `state:"nosave"` gosched TaskGoroutineSchedInfo // yieldCount is the number of times the task goroutine has called diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index f679262d0..a85ffdef8 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -116,9 +116,9 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/buffer", + "//third_party/gvsync", ], ) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 2154e7918..a69b8c7be 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -42,7 +42,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - ssync "gvisor.googlesource.com/gvisor/pkg/sync" + "gvisor.googlesource.com/gvisor/third_party/gvsync" ) // MemoryManager implements a virtual address space. @@ -84,7 +84,7 @@ type MemoryManager struct { users int32 // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. - mappingMu ssync.DowngradableRWMutex `state:"nosave"` + mappingMu gvsync.DowngradableRWMutex `state:"nosave"` // vmas stores virtual memory areas. Since vmas are stored by value, // clients should usually use vmaIterator.ValuePtr() instead of @@ -121,7 +121,7 @@ type MemoryManager struct { // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. - activeMu ssync.DowngradableRWMutex `state:"nosave"` + activeMu gvsync.DowngradableRWMutex `state:"nosave"` // pmas stores platform mapping areas used to implement vmas. 
Since pmas // are stored by value, clients should usually use pmaIterator.ValuePtr() diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index c4b6dcc63..b2f8f6832 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -9,7 +9,7 @@ go_template_instance( out = "seqatomic_parameters.go", package = "time", suffix = "Parameters", - template = "//pkg/sync:generic_seqatomic", + template = "//third_party/gvsync:generic_seqatomic", types = { "Value": "Parameters", }, @@ -36,8 +36,8 @@ go_library( deps = [ "//pkg/log", "//pkg/metric", - "//pkg/sync", "//pkg/syserror", + "//third_party/gvsync", ], ) diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD deleted file mode 100644 index 1624e681c..000000000 --- a/pkg/sync/BUILD +++ /dev/null @@ -1,50 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - -package( - default_visibility = ["//:sandbox"], - licenses = ["notice"], -) - -load("//tools/go_generics:defs.bzl", "go_template") - -go_template( - name = "generic_atomicptr", - srcs = ["atomicptr_unsafe.go"], - types = [ - "Value", - ], -) - -go_template( - name = "generic_seqatomic", - srcs = ["seqatomic_unsafe.go"], - types = [ - "Value", - ], - deps = [ - ":sync", - ], -) - -go_library( - name = "sync", - srcs = [ - "downgradable_rwmutex_unsafe.go", - "memmove_unsafe.go", - "norace_unsafe.go", - "race_unsafe.go", - "seqcount.go", - "sync.go", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/sync", -) - -go_test( - name = "sync_test", - size = "small", - srcs = [ - "downgradable_rwmutex_test.go", - "seqcount_test.go", - ], - embed = [":sync"], -) diff --git a/pkg/sync/LICENSE b/pkg/sync/LICENSE deleted file mode 100644 index 6a66aea5e..000000000 --- a/pkg/sync/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/sync/atomicptr_unsafe.go b/pkg/sync/atomicptr_unsafe.go deleted file mode 100644 index d943b7ff4..000000000 --- a/pkg/sync/atomicptr_unsafe.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package template doesn't exist. This file must be instantiated using the -// go_template_instance rule in tools/go_generics/defs.bzl. -package template - -import ( - "sync/atomic" - "unsafe" -) - -// Value is a required type parameter. -type Value struct{} - -// An AtomicPtr is a pointer to a value of type Value that can be atomically -// loaded and stored. The zero value of an AtomicPtr represents nil. -// -// Note that copying AtomicPtr by value performs a non-atomic read of the -// stored pointer, which is unsafe if Store() can be called concurrently; in -// this case, do `dst.Store(src.Load())` instead. -type AtomicPtr struct { - ptr unsafe.Pointer -} - -// Load returns the value set by the most recent Store. It returns nil if there -// has been no previous call to Store. -func (p *AtomicPtr) Load() *Value { - return (*Value)(atomic.LoadPointer(&p.ptr)) -} - -// Store sets the value returned by Load to x. 
-func (p *AtomicPtr) Store(x *Value) { - atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) -} diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD deleted file mode 100644 index 198fbb895..000000000 --- a/pkg/sync/atomicptrtest/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -load("//tools/go_generics:defs.bzl", "go_template_instance") - -go_template_instance( - name = "atomicptr_int", - out = "atomicptr_int.go", - package = "atomicptr", - suffix = "Int", - template = "//pkg/sync:generic_atomicptr", - types = { - "Value": "int", - }, -) - -go_library( - name = "atomicptr", - srcs = ["atomicptr_int.go"], - importpath = "gvisor.googlesource.com/gvisor/pkg/sync/atomicptr", -) - -go_test( - name = "atomicptr_test", - size = "small", - srcs = ["atomicptr_test.go"], - embed = [":atomicptr"], -) diff --git a/pkg/sync/atomicptrtest/atomicptr_test.go b/pkg/sync/atomicptrtest/atomicptr_test.go deleted file mode 100644 index 3262785ce..000000000 --- a/pkg/sync/atomicptrtest/atomicptr_test.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package atomicptr - -import ( - "testing" -) - -func newInt(val int) *int { - return &val -} - -func TestAtomicPtr(t *testing.T) { - var p AtomicPtrInt - if got := p.Load(); got != nil { - t.Errorf("initial value is %p (%v), wanted nil", got, got) - } - want := newInt(42) - p.Store(want) - if got := p.Load(); got != want { - t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) - } - want = newInt(100) - p.Store(want) - if got := p.Load(); got != want { - t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) - } -} diff --git a/pkg/sync/downgradable_rwmutex_test.go b/pkg/sync/downgradable_rwmutex_test.go deleted file mode 100644 index 99c1c8be0..000000000 --- a/pkg/sync/downgradable_rwmutex_test.go +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// GOMAXPROCS=10 go test - -// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the -// addition of downgradingWriter and the renaming of num_iterations to -// numIterations to shut up Golint. - -package sync - -import ( - "fmt" - "runtime" - "sync/atomic" - "testing" -) - -func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { - m.RLock() - clocked <- true - <-cunlock - m.RUnlock() - cdone <- true -} - -func doTestParallelReaders(numReaders, gomaxprocs int) { - runtime.GOMAXPROCS(gomaxprocs) - var m DowngradableRWMutex - clocked := make(chan bool) - cunlock := make(chan bool) - cdone := make(chan bool) - for i := 0; i < numReaders; i++ { - go parallelReader(&m, clocked, cunlock, cdone) - } - // Wait for all parallel RLock()s to succeed. - for i := 0; i < numReaders; i++ { - <-clocked - } - for i := 0; i < numReaders; i++ { - cunlock <- true - } - // Wait for the goroutines to finish. 
- for i := 0; i < numReaders; i++ { - <-cdone - } -} - -func TestParallelReaders(t *testing.T) { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) - doTestParallelReaders(1, 4) - doTestParallelReaders(3, 4) - doTestParallelReaders(4, 2) -} - -func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.RLock() - n := atomic.AddInt32(activity, 1) - if n < 1 || n >= 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -1) - rwm.RUnlock() - } - cdone <- true -} - -func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.Lock() - n := atomic.AddInt32(activity, 10000) - if n != 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -10000) - rwm.Unlock() - } - cdone <- true -} - -func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.Lock() - n := atomic.AddInt32(activity, 10000) - if n != 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -10000) - rwm.DowngradeLock() - n = atomic.AddInt32(activity, 1) - if n < 1 || n >= 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - n = atomic.AddInt32(activity, -1) - rwm.RUnlock() - } - cdone <- true -} - -func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { - runtime.GOMAXPROCS(gomaxprocs) - // Number of active readers + 10000 * number of active writers. 
- var activity int32 - var rwm DowngradableRWMutex - cdone := make(chan bool) - go writer(&rwm, numIterations, &activity, cdone) - go downgradingWriter(&rwm, numIterations, &activity, cdone) - var i int - for i = 0; i < numReaders/2; i++ { - go reader(&rwm, numIterations, &activity, cdone) - } - go writer(&rwm, numIterations, &activity, cdone) - go downgradingWriter(&rwm, numIterations, &activity, cdone) - for ; i < numReaders; i++ { - go reader(&rwm, numIterations, &activity, cdone) - } - // Wait for the 4 writers and all readers to finish. - for i := 0; i < 4+numReaders; i++ { - <-cdone - } -} - -func TestDowngradableRWMutex(t *testing.T) { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) - n := 1000 - if testing.Short() { - n = 5 - } - HammerDowngradableRWMutex(1, 1, n) - HammerDowngradableRWMutex(1, 3, n) - HammerDowngradableRWMutex(1, 10, n) - HammerDowngradableRWMutex(4, 1, n) - HammerDowngradableRWMutex(4, 3, n) - HammerDowngradableRWMutex(4, 10, n) - HammerDowngradableRWMutex(10, 1, n) - HammerDowngradableRWMutex(10, 3, n) - HammerDowngradableRWMutex(10, 10, n) - HammerDowngradableRWMutex(10, 5, n) -} diff --git a/pkg/sync/downgradable_rwmutex_unsafe.go b/pkg/sync/downgradable_rwmutex_unsafe.go deleted file mode 100644 index 9a96056fc..000000000 --- a/pkg/sync/downgradable_rwmutex_unsafe.go +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This is mostly copied from the standard library's sync/rwmutex.go. -// -// Happens-before relationships indicated to the race detector: -// - Unlock -> Lock (via writerSem) -// - Unlock -> RLock (via readerSem) -// - RUnlock -> Lock (via writerSem) -// - DowngradeLock -> RLock (via readerSem) - -package sync - -import ( - "sync" - "sync/atomic" - "unsafe" -) - -// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock -// method. 
-type DowngradableRWMutex struct { - w sync.Mutex // held if there are pending writers - writerSem uint32 // semaphore for writers to wait for completing readers - readerSem uint32 // semaphore for readers to wait for completing writers - readerCount int32 // number of pending readers - readerWait int32 // number of departing readers -} - -const rwmutexMaxReaders = 1 << 30 - -// RLock locks rw for reading. -func (rw *DowngradableRWMutex) RLock() { - if RaceEnabled { - RaceDisable() - } - if atomic.AddInt32(&rw.readerCount, 1) < 0 { - // A writer is pending, wait for it. - runtimeSemacquire(&rw.readerSem) - } - if RaceEnabled { - RaceEnable() - RaceAcquire(unsafe.Pointer(&rw.readerSem)) - } -} - -// RUnlock undoes a single RLock call. -func (rw *DowngradableRWMutex) RUnlock() { - if RaceEnabled { - // TODO: Why does this need to be ReleaseMerge instead of - // Release? IIUC this establishes Unlock happens-before RUnlock, which - // seems unnecessary. - RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) - RaceDisable() - } - if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { - if r+1 == 0 || r+1 == -rwmutexMaxReaders { - panic("RUnlock of unlocked DowngradableRWMutex") - } - // A writer is pending. - if atomic.AddInt32(&rw.readerWait, -1) == 0 { - // The last reader unblocks the writer. - runtimeSemrelease(&rw.writerSem, false) - } - } - if RaceEnabled { - RaceEnable() - } -} - -// Lock locks rw for writing. -func (rw *DowngradableRWMutex) Lock() { - if RaceEnabled { - RaceDisable() - } - // First, resolve competition with other writers. - rw.w.Lock() - // Announce to readers there is a pending writer. - r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders - // Wait for active readers. - if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { - runtimeSemacquire(&rw.writerSem) - } - if RaceEnabled { - RaceEnable() - RaceAcquire(unsafe.Pointer(&rw.writerSem)) - } -} - -// Unlock unlocks rw for writing. 
-func (rw *DowngradableRWMutex) Unlock() { - if RaceEnabled { - RaceRelease(unsafe.Pointer(&rw.writerSem)) - RaceRelease(unsafe.Pointer(&rw.readerSem)) - RaceDisable() - } - // Announce to readers there is no active writer. - r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) - if r >= rwmutexMaxReaders { - panic("Unlock of unlocked DowngradableRWMutex") - } - // Unblock blocked readers, if any. - for i := 0; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false) - } - // Allow other writers to proceed. - rw.w.Unlock() - if RaceEnabled { - RaceEnable() - } -} - -// DowngradeLock atomically unlocks rw for writing and locks it for reading. -func (rw *DowngradableRWMutex) DowngradeLock() { - if RaceEnabled { - RaceRelease(unsafe.Pointer(&rw.readerSem)) - RaceDisable() - } - // Announce to readers there is no active writer and one additional reader. - r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) - if r >= rwmutexMaxReaders+1 { - panic("DowngradeLock of unlocked DowngradableRWMutex") - } - // Unblock blocked readers, if any. Note that this loop starts as 1 since r - // includes this goroutine. - for i := 1; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false) - } - // Allow other writers to proceed to rw.w.Lock(). Note that they will still - // block on rw.writerSem since at least this reader exists, such that - // DowngradeLock() is atomic with the previous write lock. 
- rw.w.Unlock() - if RaceEnabled { - RaceEnable() - } -} - -//go:linkname runtimeSemacquire sync.runtime_Semacquire -func runtimeSemacquire(s *uint32) - -//go:linkname runtimeSemrelease sync.runtime_Semrelease -func runtimeSemrelease(s *uint32, handoff bool) diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go deleted file mode 100644 index cd7a02dca..000000000 --- a/pkg/sync/memmove_unsafe.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sync - -import ( - "unsafe" -) - -// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't -// define it because go_generics can't update the go:linkname annotation. -// Furthermore, go:linkname silently doesn't work if the local name is exported -// (this is of course undocumented), which is why this indirection is -// necessary. -func Memmove(to, from unsafe.Pointer, n uintptr) { - memmove(to, from, n) -} - -//go:linkname memmove runtime.memmove -//go:noescape -func memmove(to, from unsafe.Pointer, n uintptr) diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go deleted file mode 100644 index 1593b9e5d..000000000 --- a/pkg/sync/norace_unsafe.go +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build !race - -package sync - -import ( - "unsafe" -) - -// RaceEnabled is true if the Go data race detector is enabled. -const RaceEnabled = false - -// RaceDisable has the same semantics as runtime.RaceDisable. -func RaceDisable() { -} - -// RaceEnable has the same semantics as runtime.RaceEnable. -func RaceEnable() { -} - -// RaceAcquire has the same semantics as runtime.RaceAcquire. -func RaceAcquire(addr unsafe.Pointer) { -} - -// RaceRelease has the same semantics as runtime.RaceRelease. -func RaceRelease(addr unsafe.Pointer) { -} - -// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. -func RaceReleaseMerge(addr unsafe.Pointer) { -} diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go deleted file mode 100644 index 473eaddc6..000000000 --- a/pkg/sync/race_unsafe.go +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// +build race - -package sync - -import ( - "runtime" - "unsafe" -) - -// RaceEnabled is true if the Go data race detector is enabled. -const RaceEnabled = true - -// RaceDisable has the same semantics as runtime.RaceDisable. -func RaceDisable() { - runtime.RaceDisable() -} - -// RaceEnable has the same semantics as runtime.RaceEnable. -func RaceEnable() { - runtime.RaceEnable() -} - -// RaceAcquire has the same semantics as runtime.RaceAcquire. -func RaceAcquire(addr unsafe.Pointer) { - runtime.RaceAcquire(addr) -} - -// RaceRelease has the same semantics as runtime.RaceRelease. -func RaceRelease(addr unsafe.Pointer) { - runtime.RaceRelease(addr) -} - -// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. -func RaceReleaseMerge(addr unsafe.Pointer) { - runtime.RaceReleaseMerge(addr) -} diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go deleted file mode 100644 index bea31adc5..000000000 --- a/pkg/sync/seqatomic_unsafe.go +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package template doesn't exist. This file must be instantiated using the -// go_template_instance rule in tools/go_generics/defs.bzl. -package template - -import ( - "fmt" - "reflect" - "strings" - "unsafe" - - ssync "gvisor.googlesource.com/gvisor/pkg/sync" -) - -// Value is a required type parameter. 
-// -// Value must not contain any pointers, including interface objects, function -// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs -// containing any of the above. An init() function will panic if this property -// does not hold. -type Value struct{} - -// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race -// with any writer critical sections in sc. -func SeqAtomicLoad(sc *ssync.SeqCount, ptr *Value) Value { - // This function doesn't use SeqAtomicTryLoad because doing so is - // measurably, significantly (~20%) slower; Go is awful at inlining. - var val Value - for { - epoch := sc.BeginRead() - if ssync.RaceEnabled { - // runtime.RaceDisable() doesn't actually stop the race detector, - // so it can't help us here. Instead, call runtime.memmove - // directly, which is not instrumented by the race detector. - ssync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - // This is ~40% faster for short reads than going through memmove. - val = *ptr - } - if sc.ReadOk(epoch) { - break - } - } - return val -} - -// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section -// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read -// would race with a writer critical section, SeqAtomicTryLoad returns -// (unspecified, false). 
-func SeqAtomicTryLoad(sc *ssync.SeqCount, epoch ssync.SeqCountEpoch, ptr *Value) (Value, bool) { - var val Value - if ssync.RaceEnabled { - ssync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - val = *ptr - } - return val, sc.ReadOk(epoch) -} - -func init() { - var val Value - typ := reflect.TypeOf(val) - name := typ.Name() - if ptrs := ssync.PointersInType(typ, name); len(ptrs) != 0 { - panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) - } -} diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD deleted file mode 100644 index 23132650a..000000000 --- a/pkg/sync/seqatomictest/BUILD +++ /dev/null @@ -1,35 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -load("//tools/go_generics:defs.bzl", "go_template_instance") - -go_template_instance( - name = "seqatomic_int", - out = "seqatomic_int.go", - package = "seqatomic", - suffix = "Int", - template = "//pkg/sync:generic_seqatomic", - types = { - "Value": "int", - }, -) - -go_library( - name = "seqatomic", - srcs = ["seqatomic_int.go"], - importpath = "gvisor.googlesource.com/gvisor/pkg/sync/seqatomic", - deps = [ - "//pkg/sync", - ], -) - -go_test( - name = "seqatomic_test", - size = "small", - srcs = ["seqatomic_test.go"], - embed = [":seqatomic"], - deps = [ - "//pkg/sync", - ], -) diff --git a/pkg/sync/seqatomictest/seqatomic_test.go b/pkg/sync/seqatomictest/seqatomic_test.go deleted file mode 100644 index f5e1fbfff..000000000 --- a/pkg/sync/seqatomictest/seqatomic_test.go +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package seqatomic - -import ( - "sync/atomic" - "testing" - "time" - - gvsync "gvisor.googlesource.com/gvisor/pkg/sync" -) - -func TestSeqAtomicLoadUncontended(t *testing.T) { - var seq gvsync.SeqCount - const want = 1 - data := want - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicLoadAfterWrite(t *testing.T) { - var seq gvsync.SeqCount - var data int - const want = 1 - seq.BeginWrite() - data = want - seq.EndWrite() - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicLoadDuringWrite(t *testing.T) { - var seq gvsync.SeqCount - var data int - const want = 1 - seq.BeginWrite() - go func() { - time.Sleep(time.Second) - data = want - seq.EndWrite() - }() - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicTryLoadUncontended(t *testing.T) { - var seq gvsync.SeqCount - const want = 1 - data := want - epoch := seq.BeginRead() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { - t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) - } -} - -func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { - var seq gvsync.SeqCount - var data int - epoch := seq.BeginRead() - seq.BeginWrite() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { - t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) - } - 
seq.EndWrite() -} - -func TestSeqAtomicTryLoadAfterWrite(t *testing.T) { - var seq gvsync.SeqCount - var data int - epoch := seq.BeginRead() - seq.BeginWrite() - seq.EndWrite() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { - t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) - } -} - -func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { - var seq gvsync.SeqCount - const want = 42 - data := want - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - if got := SeqAtomicLoadInt(&seq, &data); got != want { - b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } - } - }) -} - -func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { - var seq gvsync.SeqCount - const want = 42 - data := want - b.RunParallel(func(pb *testing.PB) { - epoch := seq.BeginRead() - for pb.Next() { - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { - b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) - } - } - }) -} - -// For comparison: -func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) { - var a atomic.Value - const want = 42 - a.Store(int(want)) - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - if got := a.Load().(int); got != want { - b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want) - } - } - }) -} diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go deleted file mode 100644 index 732e856a4..000000000 --- a/pkg/sync/seqcount.go +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package sync - -import ( - "fmt" - "reflect" - "runtime" - "sync/atomic" -) - -// SeqCount is a synchronization primitive for optimistic reader/writer -// synchronization in cases where readers can work with stale data and -// therefore do not need to block writers. -// -// Compared to sync/atomic.Value: -// -// - Mutation of SeqCount-protected data does not require memory allocation, -// whereas atomic.Value generally does. This is a significant advantage when -// writes are common. -// -// - Atomic reads of SeqCount-protected data require copying. This is a -// disadvantage when atomic reads are common. -// -// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other -// operations to be made atomic with reads of SeqCount-protected data. -// -// - SeqCount may be less flexible: as of this writing, SeqCount-protected data -// cannot include pointers. -// -// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected -// data require instantiating function templates using go_generics (see -// seqatomic.go). -type SeqCount struct { - // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd - // if a writer critical section is active, and a read from data protected - // by this SeqCount is atomic iff epoch is the same even value before and - // after the read. - epoch uint32 -} - -// SeqCountEpoch tracks writer critical sections in a SeqCount. -type SeqCountEpoch struct { - val uint32 -} - -// We assume that: -// -// - All functions in sync/atomic that perform a memory read are at least a -// read fence: memory reads before calls to such functions cannot be reordered -// after the call, and memory reads after calls to such functions cannot be -// reordered before the call, even if those reads do not use sync/atomic. 
-// -// - All functions in sync/atomic that perform a memory write are at least a -// write fence: memory writes before calls to such functions cannot be -// reordered after the call, and memory writes after calls to such functions -// cannot be reordered before the call, even if those writes do not use -// sync/atomic. -// -// As of this writing, the Go memory model completely fails to describe -// sync/atomic, but these properties are implied by -// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8. - -// BeginRead indicates the beginning of a reader critical section. Reader -// critical sections DO NOT BLOCK writer critical sections, so operations in a -// reader critical section MAY RACE with writer critical sections. Races are -// detected by ReadOk at the end of the reader critical section. Thus, the -// low-level structure of readers is generally: -// -// for { -// epoch := seq.BeginRead() -// // do something idempotent with seq-protected data -// if seq.ReadOk(epoch) { -// break -// } -// } -// -// However, since reader critical sections may race with writer critical -// sections, the Go race detector will (accurately) flag data races in readers -// using this pattern. Most users of SeqCount will need to use the -// SeqAtomicLoad function template in seqatomic.go. -func (s *SeqCount) BeginRead() SeqCountEpoch { - epoch := atomic.LoadUint32(&s.epoch) - for epoch&1 != 0 { - runtime.Gosched() - epoch = atomic.LoadUint32(&s.epoch) - } - return SeqCountEpoch{epoch} -} - -// ReadOk returns true if the reader critical section initiated by a previous -// call to BeginRead() that returned epoch did not race with any writer critical -// sections. -// -// ReadOk may be called any number of times during a reader critical section. -// Reader critical sections do not need to be explicitly terminated; the last -// call to ReadOk is implicitly the end of the reader critical section. 
-func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { - return atomic.LoadUint32(&s.epoch) == epoch.val -} - -// BeginWrite indicates the beginning of a writer critical section. -// -// SeqCount does not support concurrent writer critical sections; clients with -// concurrent writers must synchronize them using e.g. sync.Mutex. -func (s *SeqCount) BeginWrite() { - if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { - panic("SeqCount.BeginWrite during writer critical section") - } -} - -// EndWrite ends the effect of a preceding BeginWrite. -func (s *SeqCount) EndWrite() { - if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { - panic("SeqCount.EndWrite outside writer critical section") - } -} - -// PointersInType returns a list of pointers reachable from values named -// valName of the given type. -// -// PointersInType is not exhaustive, but it is guaranteed that if typ contains -// at least one pointer, then PointersInTypeOf returns a non-empty list. -func PointersInType(typ reflect.Type, valName string) []string { - switch kind := typ.Kind(); kind { - case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - return nil - - case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: - return []string{valName} - - case reflect.Array: - return PointersInType(typ.Elem(), valName+"[]") - - case reflect.Struct: - var ptrs []string - for i, n := 0, typ.NumField(); i < n; i++ { - field := typ.Field(i) - ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) 
- } - return ptrs - - default: - return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} - } -} diff --git a/pkg/sync/seqcount_test.go b/pkg/sync/seqcount_test.go deleted file mode 100644 index b14a8878e..000000000 --- a/pkg/sync/seqcount_test.go +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package sync - -import ( - "reflect" - "testing" - "time" -) - -func TestSeqCountWriteUncontended(t *testing.T) { - var seq SeqCount - seq.BeginWrite() - seq.EndWrite() -} - -func TestSeqCountReadUncontended(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountBeginReadAfterWrite(t *testing.T) { - var seq SeqCount - var data int32 - const want = 1 - seq.BeginWrite() - data = want - seq.EndWrite() - epoch := seq.BeginRead() - if data != want { - t.Errorf("Reader: got %v, wanted %v", data, want) - } - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountBeginReadDuringWrite(t *testing.T) { - var seq SeqCount - var data int - const want = 1 - seq.BeginWrite() - go func() { - time.Sleep(time.Second) - data = want - seq.EndWrite() - }() - epoch := seq.BeginRead() - if data != want { - t.Errorf("Reader: got %v, wanted %v", data, want) - } - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func 
TestSeqCountReadOkAfterWrite(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - seq.BeginWrite() - seq.EndWrite() - if seq.ReadOk(epoch) { - t.Errorf("ReadOk: got true, wanted false") - } -} - -func TestSeqCountReadOkDuringWrite(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - seq.BeginWrite() - if seq.ReadOk(epoch) { - t.Errorf("ReadOk: got true, wanted false") - } - seq.EndWrite() -} - -func BenchmarkSeqCountWriteUncontended(b *testing.B) { - var seq SeqCount - for i := 0; i < b.N; i++ { - seq.BeginWrite() - seq.EndWrite() - } -} - -func BenchmarkSeqCountReadUncontended(b *testing.B) { - var seq SeqCount - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - epoch := seq.BeginRead() - if !seq.ReadOk(epoch) { - b.Fatalf("ReadOk: got false, wanted true") - } - } - }) -} - -func TestPointersInType(t *testing.T) { - for _, test := range []struct { - name string // used for both test and value name - val interface{} - ptrs []string - }{ - { - name: "EmptyStruct", - val: struct{}{}, - }, - { - name: "Int", - val: int(0), - }, - { - name: "MixedStruct", - val: struct { - b bool - I int - ExportedPtr *struct{} - unexportedPtr *struct{} - arr [2]int - ptrArr [2]*int - nestedStruct struct { - nestedNonptr int - nestedPtr *int - } - structArr [1]struct { - nonptr int - ptr *int - } - }{}, - ptrs: []string{ - "MixedStruct.ExportedPtr", - "MixedStruct.unexportedPtr", - "MixedStruct.ptrArr[]", - "MixedStruct.nestedStruct.nestedPtr", - "MixedStruct.structArr[].ptr", - }, - }, - } { - t.Run(test.name, func(t *testing.T) { - typ := reflect.TypeOf(test.val) - ptrs := PointersInType(typ, test.name) - t.Logf("Found pointers: %v", ptrs) - if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { - t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) - } - }) - } -} diff --git a/pkg/sync/sync.go b/pkg/sync/sync.go deleted file mode 100644 index 22c5348d7..000000000 --- a/pkg/sync/sync.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 
2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package sync provides synchronization primitives. -package sync diff --git a/third_party/gvsync/BUILD b/third_party/gvsync/BUILD new file mode 100644 index 000000000..4764eaa83 --- /dev/null +++ b/third_party/gvsync/BUILD @@ -0,0 +1,50 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], +) + +load("//tools/go_generics:defs.bzl", "go_template") + +go_template( + name = "generic_atomicptr", + srcs = ["atomicptr_unsafe.go"], + types = [ + "Value", + ], +) + +go_template( + name = "generic_seqatomic", + srcs = ["seqatomic_unsafe.go"], + types = [ + "Value", + ], + deps = [ + ":sync", + ], +) + +go_library( + name = "gvsync", + srcs = [ + "downgradable_rwmutex_unsafe.go", + "gvsync.go", + "memmove_unsafe.go", + "norace_unsafe.go", + "race_unsafe.go", + "seqcount.go", + ], + importpath = "gvisor.googlesource.com/gvisor/third_party/gvsync", +) + +go_test( + name = "gvsync_test", + size = "small", + srcs = [ + "downgradable_rwmutex_test.go", + "seqcount_test.go", + ], + embed = [":gvsync"], +) diff --git a/third_party/gvsync/LICENSE b/third_party/gvsync/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/third_party/gvsync/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/gvsync/README.md b/third_party/gvsync/README.md new file mode 100644 index 000000000..fcc7e6f44 --- /dev/null +++ b/third_party/gvsync/README.md @@ -0,0 +1,3 @@ +This package provides additional synchronization primitives not provided by the +Go stdlib 'sync' package. It is partially derived from the upstream 'sync' +package. 
diff --git a/third_party/gvsync/atomicptr_unsafe.go b/third_party/gvsync/atomicptr_unsafe.go new file mode 100644 index 000000000..da9f16240 --- /dev/null +++ b/third_party/gvsync/atomicptr_unsafe.go @@ -0,0 +1,37 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package template doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package template + +import ( + "sync/atomic" + "unsafe" +) + +// Value is a required type parameter. +type Value struct{} + +// An AtomicPtr is a pointer to a value of type Value that can be atomically +// loaded and stored. The zero value of an AtomicPtr represents nil. +// +// Note that copying AtomicPtr by value performs a non-atomic read of the +// stored pointer, which is unsafe if Store() can be called concurrently; in +// this case, do `dst.Store(src.Load())` instead. +type AtomicPtr struct { + ptr unsafe.Pointer +} + +// Load returns the value set by the most recent Store. It returns nil if there +// has been no previous call to Store. +func (p *AtomicPtr) Load() *Value { + return (*Value)(atomic.LoadPointer(&p.ptr)) +} + +// Store sets the value returned by Load to x. 
+func (p *AtomicPtr) Store(x *Value) { + atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) +} diff --git a/third_party/gvsync/atomicptrtest/BUILD b/third_party/gvsync/atomicptrtest/BUILD new file mode 100644 index 000000000..74c51fd18 --- /dev/null +++ b/third_party/gvsync/atomicptrtest/BUILD @@ -0,0 +1,29 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "atomicptr_int", + out = "atomicptr_int.go", + package = "atomicptr", + suffix = "Int", + template = "//third_party/gvsync:generic_atomicptr", + types = { + "Value": "int", + }, +) + +go_library( + name = "atomicptr", + srcs = ["atomicptr_int.go"], + importpath = "gvisor.googlesource.com/gvisor/third_party/gvsync/atomicptr", +) + +go_test( + name = "atomicptr_test", + size = "small", + srcs = ["atomicptr_test.go"], + embed = [":atomicptr"], +) diff --git a/third_party/gvsync/atomicptrtest/atomicptr_test.go b/third_party/gvsync/atomicptrtest/atomicptr_test.go new file mode 100644 index 000000000..15d0936d4 --- /dev/null +++ b/third_party/gvsync/atomicptrtest/atomicptr_test.go @@ -0,0 +1,31 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package atomicptr + +import ( + "testing" +) + +func newInt(val int) *int { + return &val +} + +func TestAtomicPtr(t *testing.T) { + var p AtomicPtrInt + if got := p.Load(); got != nil { + t.Errorf("initial value is %p (%v), wanted nil", got, got) + } + want := newInt(42) + p.Store(want) + if got := p.Load(); got != want { + t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) + } + want = newInt(100) + p.Store(want) + if got := p.Load(); got != want { + t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) + } +} diff --git a/third_party/gvsync/downgradable_rwmutex_test.go b/third_party/gvsync/downgradable_rwmutex_test.go new file mode 100644 index 000000000..6517dd5dc --- /dev/null +++ b/third_party/gvsync/downgradable_rwmutex_test.go @@ -0,0 +1,150 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 Google LLC +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// GOMAXPROCS=10 go test + +// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the +// addition of downgradingWriter and the renaming of num_iterations to +// numIterations to shut up Golint. + +package gvsync + +import ( + "fmt" + "runtime" + "sync/atomic" + "testing" +) + +func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { + m.RLock() + clocked <- true + <-cunlock + m.RUnlock() + cdone <- true +} + +func doTestParallelReaders(numReaders, gomaxprocs int) { + runtime.GOMAXPROCS(gomaxprocs) + var m DowngradableRWMutex + clocked := make(chan bool) + cunlock := make(chan bool) + cdone := make(chan bool) + for i := 0; i < numReaders; i++ { + go parallelReader(&m, clocked, cunlock, cdone) + } + // Wait for all parallel RLock()s to succeed. + for i := 0; i < numReaders; i++ { + <-clocked + } + for i := 0; i < numReaders; i++ { + cunlock <- true + } + // Wait for the goroutines to finish. 
+ for i := 0; i < numReaders; i++ { + <-cdone + } +} + +func TestParallelReaders(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) + doTestParallelReaders(1, 4) + doTestParallelReaders(3, 4) + doTestParallelReaders(4, 2) +} + +func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.RLock() + n := atomic.AddInt32(activity, 1) + if n < 1 || n >= 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -1) + rwm.RUnlock() + } + cdone <- true +} + +func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.Lock() + n := atomic.AddInt32(activity, 10000) + if n != 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -10000) + rwm.Unlock() + } + cdone <- true +} + +func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.Lock() + n := atomic.AddInt32(activity, 10000) + if n != 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -10000) + rwm.DowngradeLock() + n = atomic.AddInt32(activity, 1) + if n < 1 || n >= 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + n = atomic.AddInt32(activity, -1) + rwm.RUnlock() + } + cdone <- true +} + +func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { + runtime.GOMAXPROCS(gomaxprocs) + // Number of active readers + 10000 * number of active writers. 
+ var activity int32 + var rwm DowngradableRWMutex + cdone := make(chan bool) + go writer(&rwm, numIterations, &activity, cdone) + go downgradingWriter(&rwm, numIterations, &activity, cdone) + var i int + for i = 0; i < numReaders/2; i++ { + go reader(&rwm, numIterations, &activity, cdone) + } + go writer(&rwm, numIterations, &activity, cdone) + go downgradingWriter(&rwm, numIterations, &activity, cdone) + for ; i < numReaders; i++ { + go reader(&rwm, numIterations, &activity, cdone) + } + // Wait for the 4 writers and all readers to finish. + for i := 0; i < 4+numReaders; i++ { + <-cdone + } +} + +func TestDowngradableRWMutex(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) + n := 1000 + if testing.Short() { + n = 5 + } + HammerDowngradableRWMutex(1, 1, n) + HammerDowngradableRWMutex(1, 3, n) + HammerDowngradableRWMutex(1, 10, n) + HammerDowngradableRWMutex(4, 1, n) + HammerDowngradableRWMutex(4, 3, n) + HammerDowngradableRWMutex(4, 10, n) + HammerDowngradableRWMutex(10, 1, n) + HammerDowngradableRWMutex(10, 3, n) + HammerDowngradableRWMutex(10, 10, n) + HammerDowngradableRWMutex(10, 5, n) +} diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go new file mode 100644 index 000000000..a63a0d084 --- /dev/null +++ b/third_party/gvsync/downgradable_rwmutex_unsafe.go @@ -0,0 +1,144 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 Google LLC +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is mostly copied from the standard library's sync/rwmutex.go. 
+// +// Happens-before relationships indicated to the race detector: +// - Unlock -> Lock (via writerSem) +// - Unlock -> RLock (via readerSem) +// - RUnlock -> Lock (via writerSem) +// - DowngradeLock -> RLock (via readerSem) + +package gvsync + +import ( + "sync" + "sync/atomic" + "unsafe" +) + +// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock +// method. +type DowngradableRWMutex struct { + w sync.Mutex // held if there are pending writers + writerSem uint32 // semaphore for writers to wait for completing readers + readerSem uint32 // semaphore for readers to wait for completing writers + readerCount int32 // number of pending readers + readerWait int32 // number of departing readers +} + +const rwmutexMaxReaders = 1 << 30 + +// RLock locks rw for reading. +func (rw *DowngradableRWMutex) RLock() { + if RaceEnabled { + RaceDisable() + } + if atomic.AddInt32(&rw.readerCount, 1) < 0 { + // A writer is pending, wait for it. + runtimeSemacquire(&rw.readerSem) + } + if RaceEnabled { + RaceEnable() + RaceAcquire(unsafe.Pointer(&rw.readerSem)) + } +} + +// RUnlock undoes a single RLock call. +func (rw *DowngradableRWMutex) RUnlock() { + if RaceEnabled { + // TODO: Why does this need to be ReleaseMerge instead of + // Release? IIUC this establishes Unlock happens-before RUnlock, which + // seems unnecessary. + RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) + RaceDisable() + } + if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { + if r+1 == 0 || r+1 == -rwmutexMaxReaders { + panic("RUnlock of unlocked DowngradableRWMutex") + } + // A writer is pending. + if atomic.AddInt32(&rw.readerWait, -1) == 0 { + // The last reader unblocks the writer. + runtimeSemrelease(&rw.writerSem, false) + } + } + if RaceEnabled { + RaceEnable() + } +} + +// Lock locks rw for writing. +func (rw *DowngradableRWMutex) Lock() { + if RaceEnabled { + RaceDisable() + } + // First, resolve competition with other writers. 
+ rw.w.Lock() + // Announce to readers there is a pending writer. + r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders + // Wait for active readers. + if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { + runtimeSemacquire(&rw.writerSem) + } + if RaceEnabled { + RaceEnable() + RaceAcquire(unsafe.Pointer(&rw.writerSem)) + } +} + +// Unlock unlocks rw for writing. +func (rw *DowngradableRWMutex) Unlock() { + if RaceEnabled { + RaceRelease(unsafe.Pointer(&rw.writerSem)) + RaceRelease(unsafe.Pointer(&rw.readerSem)) + RaceDisable() + } + // Announce to readers there is no active writer. + r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) + if r >= rwmutexMaxReaders { + panic("Unlock of unlocked DowngradableRWMutex") + } + // Unblock blocked readers, if any. + for i := 0; i < int(r); i++ { + runtimeSemrelease(&rw.readerSem, false) + } + // Allow other writers to proceed. + rw.w.Unlock() + if RaceEnabled { + RaceEnable() + } +} + +// DowngradeLock atomically unlocks rw for writing and locks it for reading. +func (rw *DowngradableRWMutex) DowngradeLock() { + if RaceEnabled { + RaceRelease(unsafe.Pointer(&rw.readerSem)) + RaceDisable() + } + // Announce to readers there is no active writer and one additional reader. + r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) + if r >= rwmutexMaxReaders+1 { + panic("DowngradeLock of unlocked DowngradableRWMutex") + } + // Unblock blocked readers, if any. Note that this loop starts as 1 since r + // includes this goroutine. + for i := 1; i < int(r); i++ { + runtimeSemrelease(&rw.readerSem, false) + } + // Allow other writers to proceed to rw.w.Lock(). Note that they will still + // block on rw.writerSem since at least this reader exists, such that + // DowngradeLock() is atomic with the previous write lock. 
+ rw.w.Unlock() + if RaceEnabled { + RaceEnable() + } +} + +//go:linkname runtimeSemacquire sync.runtime_Semacquire +func runtimeSemacquire(s *uint32) + +//go:linkname runtimeSemrelease sync.runtime_Semrelease +func runtimeSemrelease(s *uint32, handoff bool) diff --git a/third_party/gvsync/gvsync.go b/third_party/gvsync/gvsync.go new file mode 100644 index 000000000..46a2565fd --- /dev/null +++ b/third_party/gvsync/gvsync.go @@ -0,0 +1,7 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package gvsync provides synchronization primitives. +package gvsync diff --git a/third_party/gvsync/memmove_unsafe.go b/third_party/gvsync/memmove_unsafe.go new file mode 100644 index 000000000..d483fc739 --- /dev/null +++ b/third_party/gvsync/memmove_unsafe.go @@ -0,0 +1,23 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gvsync + +import ( + "unsafe" +) + +// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't +// define it because go_generics can't update the go:linkname annotation. +// Furthermore, go:linkname silently doesn't work if the local name is exported +// (this is of course undocumented), which is why this indirection is +// necessary. +func Memmove(to, from unsafe.Pointer, n uintptr) { + memmove(to, from, n) +} + +//go:linkname memmove runtime.memmove +//go:noescape +func memmove(to, from unsafe.Pointer, n uintptr) diff --git a/third_party/gvsync/norace_unsafe.go b/third_party/gvsync/norace_unsafe.go new file mode 100644 index 000000000..f9c88d13f --- /dev/null +++ b/third_party/gvsync/norace_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !race + +package gvsync + +import ( + "unsafe" +) + +// RaceEnabled is true if the Go data race detector is enabled. +const RaceEnabled = false + +// RaceDisable has the same semantics as runtime.RaceDisable. +func RaceDisable() { +} + +// RaceEnable has the same semantics as runtime.RaceEnable. +func RaceEnable() { +} + +// RaceAcquire has the same semantics as runtime.RaceAcquire. +func RaceAcquire(addr unsafe.Pointer) { +} + +// RaceRelease has the same semantics as runtime.RaceRelease. +func RaceRelease(addr unsafe.Pointer) { +} + +// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. +func RaceReleaseMerge(addr unsafe.Pointer) { +} diff --git a/third_party/gvsync/race_unsafe.go b/third_party/gvsync/race_unsafe.go new file mode 100644 index 000000000..2cdcdf7f7 --- /dev/null +++ b/third_party/gvsync/race_unsafe.go @@ -0,0 +1,41 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +package gvsync + +import ( + "runtime" + "unsafe" +) + +// RaceEnabled is true if the Go data race detector is enabled. +const RaceEnabled = true + +// RaceDisable has the same semantics as runtime.RaceDisable. +func RaceDisable() { + runtime.RaceDisable() +} + +// RaceEnable has the same semantics as runtime.RaceEnable. +func RaceEnable() { + runtime.RaceEnable() +} + +// RaceAcquire has the same semantics as runtime.RaceAcquire. +func RaceAcquire(addr unsafe.Pointer) { + runtime.RaceAcquire(addr) +} + +// RaceRelease has the same semantics as runtime.RaceRelease. +func RaceRelease(addr unsafe.Pointer) { + runtime.RaceRelease(addr) +} + +// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. 
+func RaceReleaseMerge(addr unsafe.Pointer) { + runtime.RaceReleaseMerge(addr) +} diff --git a/third_party/gvsync/seqatomic_unsafe.go b/third_party/gvsync/seqatomic_unsafe.go new file mode 100644 index 000000000..ef61503e2 --- /dev/null +++ b/third_party/gvsync/seqatomic_unsafe.go @@ -0,0 +1,72 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package template doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package template + +import ( + "fmt" + "reflect" + "strings" + "unsafe" + + "gvisor.googlesource.com/gvisor/third_party/gvsync" +) + +// Value is a required type parameter. +// +// Value must not contain any pointers, including interface objects, function +// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs +// containing any of the above. An init() function will panic if this property +// does not hold. +type Value struct{} + +// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race +// with any writer critical sections in sc. +func SeqAtomicLoad(sc *gvsync.SeqCount, ptr *Value) Value { + // This function doesn't use SeqAtomicTryLoad because doing so is + // measurably, significantly (~20%) slower; Go is awful at inlining. + var val Value + for { + epoch := sc.BeginRead() + if gvsync.RaceEnabled { + // runtime.RaceDisable() doesn't actually stop the race detector, + // so it can't help us here. Instead, call runtime.memmove + // directly, which is not instrumented by the race detector. + gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + // This is ~40% faster for short reads than going through memmove. 
+ val = *ptr + } + if sc.ReadOk(epoch) { + break + } + } + return val +} + +// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section +// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read +// would race with a writer critical section, SeqAtomicTryLoad returns +// (unspecified, false). +func SeqAtomicTryLoad(sc *gvsync.SeqCount, epoch gvsync.SeqCountEpoch, ptr *Value) (Value, bool) { + var val Value + if gvsync.RaceEnabled { + gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + val = *ptr + } + return val, sc.ReadOk(epoch) +} + +func init() { + var val Value + typ := reflect.TypeOf(val) + name := typ.Name() + if ptrs := gvsync.PointersInType(typ, name); len(ptrs) != 0 { + panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) + } +} diff --git a/third_party/gvsync/seqatomictest/BUILD b/third_party/gvsync/seqatomictest/BUILD new file mode 100644 index 000000000..d83149e81 --- /dev/null +++ b/third_party/gvsync/seqatomictest/BUILD @@ -0,0 +1,35 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "seqatomic_int", + out = "seqatomic_int.go", + package = "seqatomic", + suffix = "Int", + template = "//third_party/gvsync:generic_seqatomic", + types = { + "Value": "int", + }, +) + +go_library( + name = "seqatomic", + srcs = ["seqatomic_int.go"], + importpath = "gvisor.googlesource.com/gvisor/third_party/gvsync/seqatomic", + deps = [ + "//third_party/gvsync", + ], +) + +go_test( + name = "seqatomic_test", + size = "small", + srcs = ["seqatomic_test.go"], + embed = [":seqatomic"], + deps = [ + "//third_party/gvsync", + ], +) diff --git a/third_party/gvsync/seqatomictest/seqatomic_test.go b/third_party/gvsync/seqatomictest/seqatomic_test.go new file mode 100644 
index 000000000..d0c373bae --- /dev/null +++ b/third_party/gvsync/seqatomictest/seqatomic_test.go @@ -0,0 +1,132 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seqatomic + +import ( + "sync/atomic" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/third_party/gvsync" +) + +func TestSeqAtomicLoadUncontended(t *testing.T) { + var seq gvsync.SeqCount + const want = 1 + data := want + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicLoadAfterWrite(t *testing.T) { + var seq gvsync.SeqCount + var data int + const want = 1 + seq.BeginWrite() + data = want + seq.EndWrite() + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicLoadDuringWrite(t *testing.T) { + var seq gvsync.SeqCount + var data int + const want = 1 + seq.BeginWrite() + go func() { + time.Sleep(time.Second) + data = want + seq.EndWrite() + }() + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicTryLoadUncontended(t *testing.T) { + var seq gvsync.SeqCount + const want = 1 + data := want + epoch := seq.BeginRead() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { + t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) + } 
+} + +func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { + var seq gvsync.SeqCount + var data int + epoch := seq.BeginRead() + seq.BeginWrite() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { + t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) + } + seq.EndWrite() +} + +func TestSeqAtomicTryLoadAfterWrite(t *testing.T) { + var seq gvsync.SeqCount + var data int + epoch := seq.BeginRead() + seq.BeginWrite() + seq.EndWrite() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { + t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) + } +} + +func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { + var seq gvsync.SeqCount + const want = 42 + data := want + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if got := SeqAtomicLoadInt(&seq, &data); got != want { + b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } + } + }) +} + +func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { + var seq gvsync.SeqCount + const want = 42 + data := want + b.RunParallel(func(pb *testing.PB) { + epoch := seq.BeginRead() + for pb.Next() { + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { + b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) + } + } + }) +} + +// For comparison: +func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) { + var a atomic.Value + const want = 42 + a.Store(int(want)) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if got := a.Load().(int); got != want { + b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want) + } + } + }) +} diff --git a/third_party/gvsync/seqcount.go b/third_party/gvsync/seqcount.go new file mode 100644 index 000000000..c7ae91cfa --- /dev/null +++ b/third_party/gvsync/seqcount.go @@ -0,0 +1,149 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gvsync + +import ( + "fmt" + "reflect" + "runtime" + "sync/atomic" +) + +// SeqCount is a synchronization primitive for optimistic reader/writer +// synchronization in cases where readers can work with stale data and +// therefore do not need to block writers. +// +// Compared to sync/atomic.Value: +// +// - Mutation of SeqCount-protected data does not require memory allocation, +// whereas atomic.Value generally does. This is a significant advantage when +// writes are common. +// +// - Atomic reads of SeqCount-protected data require copying. This is a +// disadvantage when atomic reads are common. +// +// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other +// operations to be made atomic with reads of SeqCount-protected data. +// +// - SeqCount may be less flexible: as of this writing, SeqCount-protected data +// cannot include pointers. +// +// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected +// data require instantiating function templates using go_generics (see +// seqatomic.go). +type SeqCount struct { + // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd + // if a writer critical section is active, and a read from data protected + // by this SeqCount is atomic iff epoch is the same even value before and + // after the read. + epoch uint32 +} + +// SeqCountEpoch tracks writer critical sections in a SeqCount. +type SeqCountEpoch struct { + val uint32 +} + +// We assume that: +// +// - All functions in sync/atomic that perform a memory read are at least a +// read fence: memory reads before calls to such functions cannot be reordered +// after the call, and memory reads after calls to such functions cannot be +// reordered before the call, even if those reads do not use sync/atomic. 
+// +// - All functions in sync/atomic that perform a memory write are at least a +// write fence: memory writes before calls to such functions cannot be +// reordered after the call, and memory writes after calls to such functions +// cannot be reordered before the call, even if those writes do not use +// sync/atomic. +// +// As of this writing, the Go memory model completely fails to describe +// sync/atomic, but these properties are implied by +// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8. + +// BeginRead indicates the beginning of a reader critical section. Reader +// critical sections DO NOT BLOCK writer critical sections, so operations in a +// reader critical section MAY RACE with writer critical sections. Races are +// detected by ReadOk at the end of the reader critical section. Thus, the +// low-level structure of readers is generally: +// +// for { +// epoch := seq.BeginRead() +// // do something idempotent with seq-protected data +// if seq.ReadOk(epoch) { +// break +// } +// } +// +// However, since reader critical sections may race with writer critical +// sections, the Go race detector will (accurately) flag data races in readers +// using this pattern. Most users of SeqCount will need to use the +// SeqAtomicLoad function template in seqatomic.go. +func (s *SeqCount) BeginRead() SeqCountEpoch { + epoch := atomic.LoadUint32(&s.epoch) + for epoch&1 != 0 { + runtime.Gosched() + epoch = atomic.LoadUint32(&s.epoch) + } + return SeqCountEpoch{epoch} +} + +// ReadOk returns true if the reader critical section initiated by a previous +// call to BeginRead() that returned epoch did not race with any writer critical +// sections. +// +// ReadOk may be called any number of times during a reader critical section. +// Reader critical sections do not need to be explicitly terminated; the last +// call to ReadOk is implicitly the end of the reader critical section. 
+func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { + return atomic.LoadUint32(&s.epoch) == epoch.val +} + +// BeginWrite indicates the beginning of a writer critical section. +// +// SeqCount does not support concurrent writer critical sections; clients with +// concurrent writers must synchronize them using e.g. sync.Mutex. +func (s *SeqCount) BeginWrite() { + if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { + panic("SeqCount.BeginWrite during writer critical section") + } +} + +// EndWrite ends the effect of a preceding BeginWrite. +func (s *SeqCount) EndWrite() { + if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { + panic("SeqCount.EndWrite outside writer critical section") + } +} + +// PointersInType returns a list of pointers reachable from values named +// valName of the given type. +// +// PointersInType is not exhaustive, but it is guaranteed that if typ contains +// at least one pointer, then PointersInTypeOf returns a non-empty list. +func PointersInType(typ reflect.Type, valName string) []string { + switch kind := typ.Kind(); kind { + case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: + return nil + + case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: + return []string{valName} + + case reflect.Array: + return PointersInType(typ.Elem(), valName+"[]") + + case reflect.Struct: + var ptrs []string + for i, n := 0, typ.NumField(); i < n; i++ { + field := typ.Field(i) + ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) 
+ } + return ptrs + + default: + return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} + } +} diff --git a/third_party/gvsync/seqcount_test.go b/third_party/gvsync/seqcount_test.go new file mode 100644 index 000000000..ee6579ed8 --- /dev/null +++ b/third_party/gvsync/seqcount_test.go @@ -0,0 +1,153 @@ +// Copyright 2019 Google LLC +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gvsync + +import ( + "reflect" + "testing" + "time" +) + +func TestSeqCountWriteUncontended(t *testing.T) { + var seq SeqCount + seq.BeginWrite() + seq.EndWrite() +} + +func TestSeqCountReadUncontended(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountBeginReadAfterWrite(t *testing.T) { + var seq SeqCount + var data int32 + const want = 1 + seq.BeginWrite() + data = want + seq.EndWrite() + epoch := seq.BeginRead() + if data != want { + t.Errorf("Reader: got %v, wanted %v", data, want) + } + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountBeginReadDuringWrite(t *testing.T) { + var seq SeqCount + var data int + const want = 1 + seq.BeginWrite() + go func() { + time.Sleep(time.Second) + data = want + seq.EndWrite() + }() + epoch := seq.BeginRead() + if data != want { + t.Errorf("Reader: got %v, wanted %v", data, want) + } + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountReadOkAfterWrite(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + seq.BeginWrite() + seq.EndWrite() + if seq.ReadOk(epoch) { + t.Errorf("ReadOk: got true, wanted false") + } +} + +func TestSeqCountReadOkDuringWrite(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + seq.BeginWrite() + if seq.ReadOk(epoch) { + t.Errorf("ReadOk: got true, wanted false") + } + seq.EndWrite() +} + +func 
BenchmarkSeqCountWriteUncontended(b *testing.B) { + var seq SeqCount + for i := 0; i < b.N; i++ { + seq.BeginWrite() + seq.EndWrite() + } +} + +func BenchmarkSeqCountReadUncontended(b *testing.B) { + var seq SeqCount + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + epoch := seq.BeginRead() + if !seq.ReadOk(epoch) { + b.Fatalf("ReadOk: got false, wanted true") + } + } + }) +} + +func TestPointersInType(t *testing.T) { + for _, test := range []struct { + name string // used for both test and value name + val interface{} + ptrs []string + }{ + { + name: "EmptyStruct", + val: struct{}{}, + }, + { + name: "Int", + val: int(0), + }, + { + name: "MixedStruct", + val: struct { + b bool + I int + ExportedPtr *struct{} + unexportedPtr *struct{} + arr [2]int + ptrArr [2]*int + nestedStruct struct { + nestedNonptr int + nestedPtr *int + } + structArr [1]struct { + nonptr int + ptr *int + } + }{}, + ptrs: []string{ + "MixedStruct.ExportedPtr", + "MixedStruct.unexportedPtr", + "MixedStruct.ptrArr[]", + "MixedStruct.nestedStruct.nestedPtr", + "MixedStruct.structArr[].ptr", + }, + }, + } { + t.Run(test.name, func(t *testing.T) { + typ := reflect.TypeOf(test.val) + ptrs := PointersInType(typ, test.name) + t.Logf("Found pointers: %v", ptrs) + if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { + t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) + } + }) + } +} -- cgit v1.2.3 From 92e85623a0cd7b2043a79b757e1874a67796dea9 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 1 Feb 2019 15:22:22 -0800 Subject: Factor the subtargets method into a helper method with tests. 
PiperOrigin-RevId: 232047515 Change-Id: I00f036816e320356219be7b2f2e6d5fe57583a60 --- pkg/sentry/fs/path.go | 27 ++++++++++++++++ pkg/sentry/fs/path_test.go | 78 ++++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/fs.go | 14 ++------- 3 files changed, 107 insertions(+), 12 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/path.go b/pkg/sentry/fs/path.go index 91a9a8ffd..52139b648 100644 --- a/pkg/sentry/fs/path.go +++ b/pkg/sentry/fs/path.go @@ -14,6 +14,11 @@ package fs +import ( + "path/filepath" + "strings" +) + // TrimTrailingSlashes trims any trailing slashes. // // The returned boolean indicates whether any changes were made. @@ -90,3 +95,25 @@ func SplitFirst(path string) (current, remainder string) { return current, remainder } } + +// IsSubpath checks whether the first path is a (strict) descendent of the +// second. If it is a subpath, then true is returned along with a clean +// relative path from the second path to the first. Otherwise false is +// returned. +func IsSubpath(subpath, path string) (string, bool) { + cleanPath := filepath.Clean(path) + cleanSubpath := filepath.Clean(subpath) + + // Add a trailing slash to the path if it does not already have one. + if len(cleanPath) == 0 || cleanPath[len(cleanPath)-1] != '/' { + cleanPath += "/" + } + if cleanPath == cleanSubpath { + // Paths are equal, thus not a strict subpath. + return "", false + } + if strings.HasPrefix(cleanSubpath, cleanPath) { + return strings.TrimPrefix(cleanSubpath, cleanPath), true + } + return "", false +} diff --git a/pkg/sentry/fs/path_test.go b/pkg/sentry/fs/path_test.go index 391b010a7..4ba1498f6 100644 --- a/pkg/sentry/fs/path_test.go +++ b/pkg/sentry/fs/path_test.go @@ -209,3 +209,81 @@ func TestSplitFirst(t *testing.T) { } } } + +// TestIsSubpath tests the IsSubpath method. +func TestIsSubpath(t *testing.T) { + tcs := []struct { + // Two absolute paths. + pathA string + pathB string + + // Whether pathA is a subpath of pathB. 
+ wantIsSubpath bool + + // Relative path from pathA to pathB. Only checked if + // wantIsSubpath is true. + wantRelpath string + }{ + { + pathA: "/foo/bar/baz", + pathB: "/foo", + wantIsSubpath: true, + wantRelpath: "bar/baz", + }, + { + pathA: "/foo", + pathB: "/foo/bar/baz", + wantIsSubpath: false, + }, + { + pathA: "/foo", + pathB: "/foo", + wantIsSubpath: false, + }, + { + pathA: "/foobar", + pathB: "/foo", + wantIsSubpath: false, + }, + { + pathA: "/foo", + pathB: "/foobar", + wantIsSubpath: false, + }, + { + pathA: "/foo", + pathB: "/foobar", + wantIsSubpath: false, + }, + { + pathA: "/", + pathB: "/foo", + wantIsSubpath: false, + }, + { + pathA: "/foo", + pathB: "/", + wantIsSubpath: true, + wantRelpath: "foo", + }, + { + pathA: "/foo/bar/../bar", + pathB: "/foo", + wantIsSubpath: true, + wantRelpath: "bar", + }, + { + pathA: "/foo/bar", + pathB: "/foo/../foo", + wantIsSubpath: true, + wantRelpath: "bar", + }, + } + + for _, tc := range tcs { + gotRelpath, gotIsSubpath := IsSubpath(tc.pathA, tc.pathB) + if gotRelpath != tc.wantRelpath || gotIsSubpath != tc.wantIsSubpath { + t.Errorf("IsSubpath(%q, %q) got %q %t, want %q %t", tc.pathA, tc.pathB, gotRelpath, gotIsSubpath, tc.wantRelpath, tc.wantIsSubpath) + } + } +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 5c5e650ca..ada292c9e 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -515,20 +515,10 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string // subtargets takes a set of Mounts and returns only the targets that are // children of the given root. The returned paths are relative to the root. func subtargets(root string, mnts []specs.Mount) []string { - r := filepath.Clean(root) - if len(r) > 0 && r[len(r)-1] != '/' { - r += "/" - } var targets []string for _, mnt := range mnts { - t := filepath.Clean(mnt.Destination) - if strings.HasPrefix(t, r) { - // Make the mnt path relative to the root path. 
If the - // result is empty, then mnt IS the root mount, not a - // submount. We don't want to include those. - if t := strings.TrimPrefix(t, r); t != "" { - targets = append(targets, t) - } + if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath { + targets = append(targets, relPath) } } return targets -- cgit v1.2.3 From 2d20b121d710fda3ad3382b66cd6c936e20a1119 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 1 Feb 2019 17:50:32 -0800 Subject: CachingInodeOperations was over-dirtying cached attributes Dirty should be set only when the attribute is changed in the cache only. Instances where the change was also sent to the backing file doesn't need to dirty the attribute. Also remove size update during WriteOut as writing dirty page would naturaly grow the file if needed. RELNOTES: relnotes is needed for the parent CL. PiperOrigin-RevId: 232068978 Change-Id: I00ba54693a2c7adc06efa9e030faf8f2e8e7f188 --- pkg/sentry/fs/fsutil/inode_cached.go | 66 ++++++++++++------------------- pkg/sentry/fs/fsutil/inode_cached_test.go | 39 ++++++------------ pkg/sentry/fs/gofer/inode.go | 14 +++++-- 3 files changed, 48 insertions(+), 71 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 707ca76d2..5e7e861d2 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" @@ -141,10 +142,6 @@ type CachedFileObject interface { // // FD is called iff the file has been memory mapped. This implies that // the file was opened (see fs.InodeOperations.GetFile). 
- // - // FIXME: This interface seems to be - // fundamentally broken. We should clarify CachingInodeOperation's - // behavior with metadata. FD() int } @@ -190,16 +187,14 @@ func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.I c.attrMu.Lock() defer c.attrMu.Unlock() + now := ktime.NowFromContext(ctx) masked := fs.AttrMask{Perms: true} if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil { return false } c.attr.Perms = perms - // FIXME: Clarify CachingInodeOperations behavior with metadata. - c.dirtyAttr.Perms = true - c.touchStatusChangeTimeLocked(ctx) + c.touchStatusChangeTimeLocked(now) return true - } // SetOwner implements fs.InodeOperations.SetOwner. @@ -211,6 +206,7 @@ func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, c.attrMu.Lock() defer c.attrMu.Unlock() + now := ktime.NowFromContext(ctx) masked := fs.AttrMask{ UID: owner.UID.Ok(), GID: owner.GID.Ok(), @@ -220,15 +216,11 @@ func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, } if owner.UID.Ok() { c.attr.Owner.UID = owner.UID - // FIXME: Clarify CachingInodeOperations behavior with metadata. - c.dirtyAttr.UID = true } if owner.GID.Ok() { c.attr.Owner.GID = owner.GID - // FIXME: Clarify CachingInodeOperations behavior with metadata. - c.dirtyAttr.GID = true } - c.touchStatusChangeTimeLocked(ctx) + c.touchStatusChangeTimeLocked(now) return nil } @@ -260,15 +252,11 @@ func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.In } if !ts.ATimeOmit { c.attr.AccessTime = ts.ATime - // FIXME: Clarify CachingInodeOperations behavior with metadata. - c.dirtyAttr.AccessTime = true } if !ts.MTimeOmit { c.attr.ModificationTime = ts.MTime - // FIXME: Clarify CachingInodeOperations behavior with metadata. 
- c.dirtyAttr.ModificationTime = true } - c.touchStatusChangeTimeLocked(ctx) + c.touchStatusChangeTimeLocked(now) return nil } @@ -279,21 +267,17 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, // c.attr.Size is protected by both c.attrMu and c.dataMu. c.dataMu.Lock() - if err := c.backingFile.SetMaskedAttributes(ctx, fs.AttrMask{ - Size: true, - }, fs.UnstableAttr{ - Size: size, - }); err != nil { + now := ktime.NowFromContext(ctx) + masked := fs.AttrMask{Size: true} + attr := fs.UnstableAttr{Size: size} + if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr); err != nil { c.dataMu.Unlock() return err } oldSize := c.attr.Size - if oldSize != size { - c.attr.Size = size - // FIXME: Clarify CachingInodeOperations behavior with metadata. - c.dirtyAttr.Size = true - c.touchModificationTimeLocked(ctx) - } + c.attr.Size = size + c.touchModificationTimeLocked(now) + // We drop c.dataMu here so that we can lock c.mapsMu and invalidate // mappings below. This allows concurrent calls to Read/Translate/etc. // These functions synchronize with an in-progress Truncate by refusing to @@ -346,6 +330,10 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) return err } + // SyncDirtyAll above would have grown if needed. On shrinks, the backing + // file is called directly, so size is never needs to be updated. + c.dirtyAttr.Size = false + // Write out cached attributes. 
if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil { c.attrMu.Unlock() @@ -363,7 +351,7 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) func (c *CachingInodeOperations) IncLinks(ctx context.Context) { c.attrMu.Lock() c.attr.Links++ - c.touchModificationTimeLocked(ctx) + c.touchModificationTimeLocked(ktime.NowFromContext(ctx)) c.attrMu.Unlock() } @@ -371,7 +359,7 @@ func (c *CachingInodeOperations) IncLinks(ctx context.Context) { func (c *CachingInodeOperations) DecLinks(ctx context.Context) { c.attrMu.Lock() c.attr.Links-- - c.touchModificationTimeLocked(ctx) + c.touchModificationTimeLocked(ktime.NowFromContext(ctx)) c.attrMu.Unlock() } @@ -384,7 +372,7 @@ func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs. } c.attrMu.Lock() - c.touchAccessTimeLocked(ctx) + c.touchAccessTimeLocked(ktime.NowFromContext(ctx)) c.attrMu.Unlock() } @@ -392,8 +380,8 @@ func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs. // time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchAccessTimeLocked(ctx context.Context) { - c.attr.AccessTime = ktime.NowFromContext(ctx) +func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) { + c.attr.AccessTime = now c.dirtyAttr.AccessTime = true } @@ -401,7 +389,7 @@ func (c *CachingInodeOperations) touchAccessTimeLocked(ctx context.Context) { // in-place to the current time. func (c *CachingInodeOperations) TouchModificationTime(ctx context.Context) { c.attrMu.Lock() - c.touchModificationTimeLocked(ctx) + c.touchModificationTimeLocked(ktime.NowFromContext(ctx)) c.attrMu.Unlock() } @@ -409,8 +397,7 @@ func (c *CachingInodeOperations) TouchModificationTime(ctx context.Context) { // change time in-place to the current time. // // Preconditions: c.attrMu is locked for writing. 
-func (c *CachingInodeOperations) touchModificationTimeLocked(ctx context.Context) { - now := ktime.NowFromContext(ctx) +func (c *CachingInodeOperations) touchModificationTimeLocked(now time.Time) { c.attr.ModificationTime = now c.dirtyAttr.ModificationTime = true c.attr.StatusChangeTime = now @@ -421,8 +408,7 @@ func (c *CachingInodeOperations) touchModificationTimeLocked(ctx context.Context // in-place to the current time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchStatusChangeTimeLocked(ctx context.Context) { - now := ktime.NowFromContext(ctx) +func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) { c.attr.StatusChangeTime = now c.dirtyAttr.StatusChangeTime = true } @@ -513,7 +499,7 @@ func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequen c.attrMu.Lock() // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). - c.touchModificationTimeLocked(ctx) + c.touchModificationTimeLocked(ktime.NowFromContext(ctx)) n, err := src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) c.attrMu.Unlock() return n, err diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 9c9391511..2a8a1639c 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -17,7 +17,6 @@ package fsutil import ( "bytes" "io" - "reflect" "testing" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -66,9 +65,6 @@ func TestSetPermissions(t *testing.T) { } // Did permissions change? 
- if !iops.dirtyAttr.Perms { - t.Fatalf("got perms not dirty, want dirty") - } if iops.attr.Perms != perms { t.Fatalf("got perms +%v, want +%v", iops.attr.Perms, perms) } @@ -85,9 +81,9 @@ func TestSetPermissions(t *testing.T) { func TestSetTimestamps(t *testing.T) { ctx := contexttest.Context(t) for _, test := range []struct { - desc string - ts fs.TimeSpec - wantDirty fs.AttrMask + desc string + ts fs.TimeSpec + wantChanged fs.AttrMask }{ { desc: "noop", @@ -95,7 +91,7 @@ func TestSetTimestamps(t *testing.T) { ATimeOmit: true, MTimeOmit: true, }, - wantDirty: fs.AttrMask{}, + wantChanged: fs.AttrMask{}, }, { desc: "access time only", @@ -103,9 +99,8 @@ func TestSetTimestamps(t *testing.T) { ATime: ktime.NowFromContext(ctx), MTimeOmit: true, }, - wantDirty: fs.AttrMask{ - AccessTime: true, - StatusChangeTime: true, + wantChanged: fs.AttrMask{ + AccessTime: true, }, }, { @@ -114,9 +109,8 @@ func TestSetTimestamps(t *testing.T) { ATimeOmit: true, MTime: ktime.NowFromContext(ctx), }, - wantDirty: fs.AttrMask{ + wantChanged: fs.AttrMask{ ModificationTime: true, - StatusChangeTime: true, }, }, { @@ -125,10 +119,9 @@ func TestSetTimestamps(t *testing.T) { ATime: ktime.NowFromContext(ctx), MTime: ktime.NowFromContext(ctx), }, - wantDirty: fs.AttrMask{ + wantChanged: fs.AttrMask{ AccessTime: true, ModificationTime: true, - StatusChangeTime: true, }, }, { @@ -137,10 +130,9 @@ func TestSetTimestamps(t *testing.T) { ATimeSetSystemTime: true, MTimeSetSystemTime: true, }, - wantDirty: fs.AttrMask{ + wantChanged: fs.AttrMask{ AccessTime: true, ModificationTime: true, - StatusChangeTime: true, }, }, } { @@ -159,10 +151,7 @@ func TestSetTimestamps(t *testing.T) { if err := iops.SetTimestamps(ctx, nil, test.ts); err != nil { t.Fatalf("SetTimestamps got error %v, want nil", err) } - if !reflect.DeepEqual(iops.dirtyAttr, test.wantDirty) { - t.Fatalf("dirty got %+v, want %+v", iops.dirtyAttr, test.wantDirty) - } - if iops.dirtyAttr.AccessTime { + if test.wantChanged.AccessTime { if 
!iops.attr.AccessTime.After(uattr.AccessTime) { t.Fatalf("diritied access time did not advance, want %v > %v", iops.attr.AccessTime, uattr.AccessTime) } @@ -173,7 +162,7 @@ func TestSetTimestamps(t *testing.T) { t.Fatalf("dirtied status change time did not advance") } } - if iops.dirtyAttr.ModificationTime { + if test.wantChanged.ModificationTime { if !iops.attr.ModificationTime.After(uattr.ModificationTime) { t.Fatalf("diritied modification time did not advance") } @@ -200,16 +189,10 @@ func TestTruncate(t *testing.T) { if err := iops.Truncate(ctx, nil, uattr.Size); err != nil { t.Fatalf("Truncate got error %v, want nil", err) } - if iops.dirtyAttr.Size { - t.Fatalf("Truncate caused size to be dirtied") - } var size int64 = 4096 if err := iops.Truncate(ctx, nil, size); err != nil { t.Fatalf("Truncate got error %v, want nil", err) } - if !iops.dirtyAttr.Size { - t.Fatalf("Truncate caused size to not be dirtied") - } if iops.attr.Size != size { t.Fatalf("Truncate got %d, want %d", iops.attr.Size, size) } diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 1dc0ca0db..16435169a 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -247,6 +247,7 @@ func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMa // skipSetAttr checks if attribute change can be skipped. It can be skipped // when: // - Mask is empty +// - Mask contains only attributes that cannot be set in the gofer // - Mask contains only atime and/or mtime, and host FD exists // // Updates to atime and mtime can be skipped because cached value will be @@ -254,15 +255,22 @@ func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMa // Skipping atime updates is particularly important to reduce the number of // operations sent to the Gofer for readonly files. func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool { - if mask.Empty() { + // First remove attributes that cannot be updated. 
+ cpy := mask + cpy.Type = false + cpy.DeviceID = false + cpy.InodeID = false + cpy.BlockSize = false + cpy.Usage = false + cpy.Links = false + if cpy.Empty() { return true } - cpy := mask + // Then check if more than just atime and mtime is being set. cpy.AccessTime = false cpy.ModificationTime = false if !cpy.Empty() { - // More than just atime and mtime is being set. return false } -- cgit v1.2.3 From 0cf7fc4e115c2dcc40901c44b238ab36b5d966fc Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Tue, 5 Feb 2019 10:00:22 -0800 Subject: Change /proc/PID/cmdline to read environment vector. - Change proc to return envp on overwrite of argv with limitations from upstream. - Add unit tests - Change layout of argv/envp on the stack so that end of argv is contiguous with beginning of envp. PiperOrigin-RevId: 232506107 Change-Id: I993880499ab2c1220f6dc456a922235c49304dec --- pkg/sentry/arch/stack.go | 30 +++++++++++++--------- pkg/sentry/fs/proc/exec_args.go | 55 ++++++++++++++++++++++++++++++++++++----- test/util/multiprocess_util.h | 1 + 3 files changed, 68 insertions(+), 18 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 716a3574d..f2cfb0426 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -170,6 +170,24 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) // Make sure we start with a 16-byte alignment. s.Align(16) + // Push the environment vector so the end of the argument vector is adjacent to + // the beginning of the environment vector. + // While the System V abi for x86_64 does not specify an ordering to the + // Information Block (the block holding the arg, env, and aux vectors), + // support features like setproctitle(3) naturally expect these segments + // to be in this order. See: https://www.uclibc.org/docs/psABI-x86_64.pdf + // page 29. 
+ l.EnvvEnd = s.Bottom + envAddrs := make([]usermem.Addr, len(env)) + for i := len(env) - 1; i >= 0; i-- { + addr, err := s.Push(env[i]) + if err != nil { + return StackLayout{}, err + } + envAddrs[i] = addr + } + l.EnvvStart = s.Bottom + // Push our strings. l.ArgvEnd = s.Bottom argAddrs := make([]usermem.Addr, len(args)) @@ -182,18 +200,6 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) } l.ArgvStart = s.Bottom - // Push our environment. - l.EnvvEnd = s.Bottom - envAddrs := make([]usermem.Addr, len(env)) - for i := len(env) - 1; i >= 0; i-- { - addr, err := s.Push(env[i]) - if err != nil { - return StackLayout{}, err - } - envAddrs[i] = addr - } - l.EnvvStart = s.Bottom - // We need to align the arguments appropriately. // // We must finish on a 16-byte alignment, but we'll play it diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index a716eb5f5..9daad5d2b 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -15,6 +15,7 @@ package proc import ( + "bytes" "fmt" "io" @@ -139,20 +140,62 @@ func (f *execArgFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequen // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading // cmdline and environment"). - copyN, copyErr := m.CopyIn(ctx, start, buf, usermem.IOOpts{}) + copyN, err := m.CopyIn(ctx, start, buf, usermem.IOOpts{}) if copyN == 0 { // Nothing to copy. - return 0, copyErr + return 0, err } buf = buf[:copyN] - // TODO: On Linux, if the NUL byte at the end of the - // argument vector has been overwritten, it continues reading the - // environment vector as part of the argument vector. + // On Linux, if the NUL byte at the end of the argument vector has been + // overwritten, it continues reading the environment vector as part of + // the argument vector. 
+ + if f.arg == cmdlineExecArg && buf[copyN-1] != 0 { + // Linux will limit the return up to and including the first null character in argv + + copyN = bytes.IndexByte(buf, 0) + if copyN == -1 { + copyN = len(buf) + } + // If we found a NUL character in argv, return upto and including that character. + if copyN < len(buf) { + buf = buf[:copyN] + } else { // Otherwise return into envp. + lengthEnvv := int(m.EnvvEnd() - m.EnvvStart()) + + // Upstream limits the returned amount to one page of slop. + // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 + // we'll return one page total between argv and envp because of the + // above page restrictions. + if lengthEnvv > usermem.PageSize-len(buf) { + lengthEnvv = usermem.PageSize - len(buf) + } + // Make a new buffer to fit the whole thing + tmp := make([]byte, length+lengthEnvv) + copyNE, err := m.CopyIn(ctx, m.EnvvStart(), tmp[copyN:], usermem.IOOpts{}) + if err != nil { + return 0, err + } + + // Linux will return envp up to and including the first NUL character, so find it. 
+ for i, c := range tmp[copyN:] { + if c == 0 { + copyNE = i + break + } + } + + copy(tmp, buf) + buf = tmp[:copyN+copyNE] + + } + + } n, dstErr := dst.CopyOut(ctx, buf) if dstErr != nil { return int64(n), dstErr } - return int64(n), copyErr + return int64(n), err } diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h index c09d6167f..ba5f2601f 100644 --- a/test/util/multiprocess_util.h +++ b/test/util/multiprocess_util.h @@ -74,6 +74,7 @@ class ExecveArray { ExecveArray& operator=(ExecveArray&&) = delete; char* const* get() const { return ptrs_.data(); } + size_t get_size() { return str_.size(); } private: std::vector str_; -- cgit v1.2.3 From 9ef3427ac14a84002497f3c8bac346486cb36f2b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 7 Feb 2019 11:40:45 -0800 Subject: Implement semctl(2) SETALL and GETALL PiperOrigin-RevId: 232914984 Change-Id: Id2643d7ad8e986ca9be76d860788a71db2674cda --- pkg/sentry/kernel/semaphore/semaphore.go | 65 +++++++++++++++++++++++++++++--- pkg/sentry/syscalls/linux/sys_sem.go | 38 +++++++++++++++++++ test/syscalls/linux/semaphore.cc | 25 +++++++++++- 3 files changed, 122 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 232a276dc..c134931cd 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -16,6 +16,7 @@ package semaphore import ( + "fmt" "sync" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -75,7 +76,10 @@ type Set struct { perms fs.FilePermissions opTime ktime.Time changeTime ktime.Time - sems []sem + + // sems holds all semaphores in the set. The slice itself is immutable after + // it's been set, however each 'sem' object in the slice requires 'mu' lock. + sems []sem // dead is set to true when the set is removed and can't be reached anymore. // All waiters must wake up and fail when set is dead. 
@@ -136,7 +140,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu } // Validate parameters. - if nsems > int32(set.size()) { + if nsems > int32(set.Size()) { return nil, syserror.EINVAL } if create && exclusive { @@ -244,19 +248,20 @@ func (r *Registry) findByKey(key int32) *Set { func (r *Registry) totalSems() int { totalSems := 0 for _, v := range r.semaphores { - totalSems += v.size() + totalSems += v.Size() } return totalSems } func (s *Set) findSem(num int32) *sem { - if num < 0 || int(num) >= s.size() { + if num < 0 || int(num) >= s.Size() { return nil } return &s.sems[num] } -func (s *Set) size() int { +// Size returns the number of semaphores in the set. Size is immutable. +func (s *Set) Size() int { return len(s.sems) } @@ -303,6 +308,39 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred return nil } +// SetValAll overrides all semaphores values, waking up waiters as needed. +// +// 'len(vals)' must be equal to 's.Size()'. +func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials) error { + if len(vals) != s.Size() { + panic(fmt.Sprintf("vals length (%d) different that Set.Size() (%d)", len(vals), s.Size())) + } + + for _, val := range vals { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + for i, val := range vals { + sem := &s.sems[i] + + // TODO: Clear undo entries in all processes + sem.value = int16(val) + sem.wakeWaiters() + } + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + // GetVal returns a semaphore value. 
func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { s.mu.Lock() @@ -320,6 +358,23 @@ func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { return sem.value, nil } +// GetValAll returns value for all semaphores. +func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return nil, syserror.EACCES + } + + vals := make([]uint16, s.Size()) + for i, sem := range s.sems { + vals[i] = uint16(sem.value) + } + return vals, nil +} + // ExecuteOps attempts to execute a list of operations to the set. It only // succeeds when all operations can be applied. No changes are made if it fails. // diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index 4ed52c4a7..6775725ca 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -97,10 +98,18 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } return 0, nil, setVal(t, id, num, int16(val)) + case linux.SETALL: + array := args[3].Pointer() + return 0, nil, setValAll(t, id, array) + case linux.GETVAL: v, err := getVal(t, id, num) return uintptr(v), nil, err + case linux.GETALL: + array := args[3].Pointer() + return 0, nil, getValAll(t, id, array) + case linux.IPC_RMID: return 0, nil, remove(t, id) @@ -155,6 +164,20 @@ func setVal(t *kernel.Task, id int32, num int32, val int16) error { return set.SetVal(t, num, val, creds) } +func setValAll(t *kernel.Task, id int32, array usermem.Addr) error { + r := 
t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + vals := make([]uint16, set.Size()) + if _, err := t.CopyIn(array, vals); err != nil { + return err + } + creds := auth.CredentialsFromContext(t) + return set.SetValAll(t, vals, creds) +} + func getVal(t *kernel.Task, id int32, num int32) (int16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) @@ -164,3 +187,18 @@ func getVal(t *kernel.Task, id int32, num int32) (int16, error) { creds := auth.CredentialsFromContext(t) return set.GetVal(num, creds) } + +func getValAll(t *kernel.Task, id int32, array usermem.Addr) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + vals, err := set.GetValAll(creds) + if err != nil { + return err + } + _, err = t.CopyOut(array, vals) + return err +} diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc index 12e33732d..da3d2c6fe 100644 --- a/test/syscalls/linux/semaphore.cc +++ b/test/syscalls/linux/semaphore.cc @@ -223,7 +223,7 @@ TEST(SemaphoreTest, SemOpRemoveWithWaiter_NoRandomSave) { }); // This must happen before IPC_RMID runs above. Otherwise it fails with EINVAL - // instead because the semaphire has already been removed. + // instead because the semaphore has already been removed. struct sembuf buf = {}; buf.sem_op = -1; ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), @@ -408,6 +408,29 @@ TEST(SemaphoreTest, SemCtlVal) { thZero.Join(); } +TEST(SemaphoreTest, SemCtlValAll) { + AutoSem sem(semget(IPC_PRIVATE, 3, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + // Semaphores must start with 0. + uint16_t get[3] = {10, 10, 10}; + EXPECT_THAT(semctl(sem.get(), 1, GETALL, get), SyscallSucceedsWithValue(0)); + for (auto v : get) { + EXPECT_EQ(v, 0); + } + + // SetAll and check that they were set. 
+ uint16_t vals[3] = {0, 10, 20}; + EXPECT_THAT(semctl(sem.get(), 1, SETALL, vals), SyscallSucceedsWithValue(0)); + EXPECT_THAT(semctl(sem.get(), 1, GETALL, get), SyscallSucceedsWithValue(0)); + for (size_t i = 0; i < ABSL_ARRAYSIZE(vals); ++i) { + EXPECT_EQ(get[i], vals[i]); + } + + EXPECT_THAT(semctl(sem.get(), 1, SETALL, nullptr), + SyscallFailsWithErrno(EFAULT)); +} + TEST(SemaphoreTest, SemIpcSet) { // Drop CAP_IPC_OWNER which allows us to bypass semaphore permissions. ASSERT_NO_ERRNO(SetCapability(CAP_IPC_OWNER, false)); -- cgit v1.2.3 From fcae058a1476a793cd1623907ca5886ccd871edf Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 7 Feb 2019 13:54:13 -0800 Subject: Make context.Background return a global background context. It currently allocates a new context on the heap each time it is called. Some of these are in relatively hot paths like signal delivery and releasing gofer inodes. It is also called very commonly in afterLoad. All of these should benefit from fewer heap allocations. PiperOrigin-RevId: 232938873 Change-Id: I53cec0ca299f56dcd4866b0b4fd2ec4938526849 --- pkg/sentry/context/context.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index 12bdcef85..7ed6a5e8a 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -108,6 +108,9 @@ func (NoopSleeper) UninterruptibleSleepStart(bool) {} // UninterruptibleSleepFinish does nothing. func (NoopSleeper) UninterruptibleSleepFinish(bool) {} +// bgContext is the context returned by context.Background. +var bgContext = &logContext{Logger: log.Log()} + // Background returns an empty context using the default logger. // // Users should be wary of using a Background context. 
Please tag any use with @@ -119,5 +122,5 @@ func (NoopSleeper) UninterruptibleSleepFinish(bool) {} // Using a Background context for tests is fine, as long as no values are // needed from the context in the tested code paths. func Background() Context { - return logContext{Logger: log.Log()} + return bgContext } -- cgit v1.2.3 From 2ba74f84be8b9d3d588fb834414d151607799fd3 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 7 Feb 2019 14:43:18 -0800 Subject: Implement /proc/net/unix. PiperOrigin-RevId: 232948478 Change-Id: Ib830121e5e79afaf5d38d17aeef5a1ef97913d23 --- pkg/abi/linux/socket.go | 16 +++ pkg/sentry/fs/proc/BUILD | 3 + pkg/sentry/fs/proc/net.go | 126 +++++++++++++++++- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/kernel/kernel.go | 58 ++++++++- pkg/sentry/socket/socket.go | 10 +- pkg/sentry/socket/unix/unix.go | 2 + test/syscalls/BUILD | 7 + test/syscalls/linux/BUILD | 17 +++ test/syscalls/linux/proc_net_unix.cc | 246 +++++++++++++++++++++++++++++++++++ 10 files changed, 481 insertions(+), 6 deletions(-) create mode 100644 test/syscalls/linux/proc_net_unix.cc (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 929814752..a5f78506a 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -191,6 +191,15 @@ const ( SO_TXTIME = 61 ) +// enum socket_state, from uapi/linux/net.h. +const ( + SS_FREE = 0 // Not allocated. + SS_UNCONNECTED = 1 // Unconnected to any socket. + SS_CONNECTING = 2 // In process of connecting. + SS_CONNECTED = 3 // Connected to socket. + SS_DISCONNECTING = 4 // In process of disconnecting. +) + // SockAddrMax is the maximum size of a struct sockaddr, from // uapi/linux/socket.h. const SockAddrMax = 128 @@ -343,3 +352,10 @@ const SizeOfControlMessageRight = 4 // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call. // From net/scm.h. 
const SCM_MAX_FD = 253 + +// SO_ACCEPTCON is defined as __SO_ACCEPTCON in +// include/uapi/linux/net.h, which represents a listening socket +// state. Note that this is distinct from SO_ACCEPTCONN, which is a +// socket option for querying whether a socket is in a listening +// state. +const SO_ACCEPTCON = 1 << 16 diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index f6bc90634..666b0ab3a 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -30,6 +30,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -43,6 +44,8 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/mm", "//pkg/sentry/socket/rpcinet", + "//pkg/sentry/socket/unix", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 219eea7f8..55a958f9e 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -15,19 +15,24 @@ package proc import ( + "bytes" "fmt" "time" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" ) // newNet creates a new proc net entry. 
-func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { +func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode if s := p.k.NetworkStack(); s != nil { contents = map[string]*fs.Inode{ @@ -52,6 +57,8 @@ func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { "tcp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")), "udp": newStaticProcInode(ctx, msrc, []byte(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops")), + + "unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc), } if s.SupportsIPv6() { @@ -182,3 +189,120 @@ func (n *netDev) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se return data, 0 } + +// netUnix implements seqfile.SeqSource for /proc/net/unix. +// +// +stateify savable +type netUnix struct { + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*netUnix) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
+func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return []seqfile.SeqData{}, 0 + } + + var buf bytes.Buffer + // Header + fmt.Fprintf(&buf, "Num RefCount Protocol Flags Type St Inode Path\n") + + // Entries + for _, sref := range n.k.ListSockets(linux.AF_UNIX) { + s := sref.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", sref) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(*unix.SocketOperations) + if !ok { + panic(fmt.Sprintf("Found non-unix socket file in unix socket table: %+v", sfile)) + } + + addr, err := sops.Endpoint().GetLocalAddress() + if err != nil { + log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + addr.Addr = "" + } + + sockFlags := 0 + if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { + if ce.Listening() { + // For unix domain sockets, linux reports a single flag + // value if the socket is listening, of __SO_ACCEPTCON. + sockFlags = linux.SO_ACCEPTCON + } + } + + var sockState int + switch sops.Endpoint().Type() { + case linux.SOCK_DGRAM: + sockState = linux.SS_CONNECTING + // Unlike Linux, we don't have unbound connection-less sockets, + // so no SS_DISCONNECTING. + + case linux.SOCK_SEQPACKET: + fallthrough + case linux.SOCK_STREAM: + // Connectioned. + if sops.Endpoint().(transport.ConnectingEndpoint).Connected() { + sockState = linux.SS_CONNECTED + } else { + sockState = linux.SS_UNCONNECTED + } + } + + // In the socket entry below, the value for the 'Num' field requires + // some consideration. Linux prints the address to the struct + // unix_sock representing a socket in the kernel, but may redact the + // value for unprivileged users depending on the kptr_restrict + // sysctl. 
+ // + // One use for this field is to allow a privileged user to + // introspect into the kernel memory to determine information about + // a socket not available through procfs, such as the socket's peer. + // + // On gvisor, returning a pointer to our internal structures would + // be pointless, as it wouldn't match the memory layout for struct + // unix_sock, making introspection difficult. We could populate a + // struct unix_sock with the appropriate data, but even that + // requires consideration for which kernel version to emulate, as + // the definition of this struct changes over time. + // + // For now, we always redact this pointer. + fmt.Fprintf(&buf, "%#016p: %08X %08X %08X %04X %02X %5d", + (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. + sfile.ReadRefs()-1, // RefCount, don't count our own ref. + 0, // Protocol, always 0 for UDS. + sockFlags, // Flags. + sops.Endpoint().Type(), // Type. + sockState, // State. + sfile.InodeID(), // Inode. + ) + + // Path + if len(addr.Addr) != 0 { + if addr.Addr[0] == 0 { + // Abstract path. 
+ fmt.Fprintf(&buf, " @%s", string(addr.Addr[1:])) + } else { + fmt.Fprintf(&buf, " %s", string(addr.Addr)) + } + } + fmt.Fprintf(&buf, "\n") + + sfile.DecRef() + } + + data := []seqfile.SeqData{{ + Buf: buf.Bytes(), + Handle: (*netUnix)(nil), + }} + return data, 0 +} diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index be04f94af..88018e707 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -85,7 +85,7 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { if _, ok := p.k.NetworkStack().(*rpcinet.Stack); ok { p.AddChild(ctx, "net", newRPCInetProcNet(ctx, msrc)) } else { - p.AddChild(ctx, "net", p.newNetDir(ctx, msrc)) + p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc)) } return newProcInode(p, msrc, fs.SpecialDirectory, nil), nil diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 43e9823cb..e7e5ff777 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -43,6 +43,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/eventchannel" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -164,7 +165,7 @@ type Kernel struct { // nextInotifyCookie is a monotonically increasing counter used for // generating unique inotify event cookies. // - // nextInotifyCookie is mutable, and is accesed using atomic memory + // nextInotifyCookie is mutable, and is accessed using atomic memory // operations. nextInotifyCookie uint32 @@ -177,6 +178,10 @@ type Kernel struct { // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` + + // socketTable is used to track all sockets on the system. Protected by + // extMu. 
+ socketTable map[int]map[*refs.WeakRef]struct{} } // InitKernelArgs holds arguments to Init. @@ -266,6 +271,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() + k.socketTable = make(map[int]map[*refs.WeakRef]struct{}) return nil } @@ -1051,6 +1057,56 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { }) } +// socketEntry represents a socket recorded in Kernel.socketTable. It implements +// refs.WeakRefUser for sockets stored in the socket table. +// +// +stateify savable +type socketEntry struct { + k *Kernel + sock *refs.WeakRef + family int +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (s *socketEntry) WeakRefGone() { + s.k.extMu.Lock() + // k.socketTable is guaranteed to point to a valid socket table for s.family + // at this point, since we made sure of the fact when we created this + // socketEntry, and we never delete socket tables. + delete(s.k.socketTable[s.family], s.sock) + s.k.extMu.Unlock() +} + +// RecordSocket adds a socket to the system-wide socket table for tracking. +// +// Precondition: Caller must hold a reference to sock. +func (k *Kernel) RecordSocket(sock *fs.File, family int) { + k.extMu.Lock() + table, ok := k.socketTable[family] + if !ok { + table = make(map[*refs.WeakRef]struct{}) + k.socketTable[family] = table + } + se := socketEntry{k: k, family: family} + se.sock = refs.NewWeakRef(sock, &se) + table[se.sock] = struct{}{} + k.extMu.Unlock() +} + +// ListSockets returns a snapshot of all sockets of a given family. 
+func (k *Kernel) ListSockets(family int) []*refs.WeakRef { + k.extMu.Lock() + socks := []*refs.WeakRef{} + if table, ok := k.socketTable[family]; ok { + socks = make([]*refs.WeakRef, 0, len(table)) + for s, _ := range table { + socks = append(socks, s) + } + } + k.extMu.Unlock() + return socks +} + type supervisorContext struct { context.NoopSleeper log.Logger diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index e28d2c4fa..5ab423f3c 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -147,6 +147,7 @@ func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*f return nil, err } if s != nil { + t.Kernel().RecordSocket(s, family) return s, nil } } @@ -163,12 +164,15 @@ func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (* } for _, p := range providers { - s, t, err := p.Pair(t, stype, protocol) + s1, s2, err := p.Pair(t, stype, protocol) if err != nil { return nil, nil, err } - if s != nil && t != nil { - return s, t, nil + if s1 != nil && s2 != nil { + k := t.Kernel() + k.RecordSocket(s1, family) + k.RecordSocket(s2, family) + return s1, s2, nil } } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 19258e692..c857a0f33 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -219,6 +219,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, return 0, nil, 0, syserr.FromError(e) } + t.Kernel().RecordSocket(ns, linux.AF_UNIX) + return fd, addr, addrLen, nil } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 148d9c366..53da121ec 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -534,6 +534,13 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:write_test") +syscall_test( + test = "//test/syscalls/linux:proc_net_unix_test", + # Unix domain socket creation isn't supported on all file systems. The + # sentry-internal tmpfs is known to support it. 
+ use_tmpfs = True, +) + go_binary( name = "syscall_test_runner", srcs = ["syscall_test_runner.go"], diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index a311ca12c..590ee1659 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3102,3 +3102,20 @@ cc_binary( "@com_google_googletest//:gtest", ], ) + +cc_binary( + name = "proc_net_unix_test", + testonly = 1, + srcs = ["proc_net_unix.cc"], + linkstatic = 1, + deps = [ + ":unix_domain_socket_test_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest", + ], +) diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc new file mode 100644 index 000000000..ea7c93012 --- /dev/null +++ b/test/syscalls/linux/proc_net_unix.cc @@ -0,0 +1,246 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +using absl::StrCat; +using absl::StreamFormat; +using absl::StrFormat; + +constexpr char kProcNetUnixHeader[] = + "Num RefCount Protocol Flags Type St Inode Path"; + +// UnixEntry represents a single entry from /proc/net/unix. +struct UnixEntry { + uintptr_t addr; + uint64_t refs; + uint64_t protocol; + uint64_t flags; + uint64_t type; + uint64_t state; + uint64_t inode; + std::string path; +}; + +std::string ExtractPath(const struct sockaddr* addr) { + const char* path = + reinterpret_cast(addr)->sun_path; + // Note: sockaddr_un.sun_path is an embedded character array of length + // UNIX_PATH_MAX, so we can always safely dereference the first 2 bytes below. + // + // The kernel also enforces that the path is always null terminated. + if (path[0] == 0) { + // Abstract socket paths are null padded to the end of the struct + // sockaddr. However, these null bytes may or may not show up in + // /proc/net/unix depending on the kernel version. Truncate after the first + // null byte (by treating path as a c-std::string). + return StrCat("@", &path[1]); + } + return std::string(path); +} + +// Returns a parsed representation of /proc/net/unix entries. 
+PosixErrorOr> ProcNetUnixEntries() { + std::string content; + RETURN_IF_ERRNO(GetContents("/proc/net/unix", &content)); + + bool skipped_header = false; + std::vector entries; + std::vector lines = absl::StrSplit(content, absl::ByAnyChar("\n")); + for (std::string line : lines) { + if (!skipped_header) { + EXPECT_EQ(line, kProcNetUnixHeader); + skipped_header = true; + continue; + } + if (line.empty()) { + continue; + } + + // Abstract socket paths can have trailing null bytes in them depending on + // the linux version. Strip off everything after a null byte, including the + // null byte. + std::size_t null_pos = line.find('\0'); + if (null_pos != std::string::npos) { + line.erase(null_pos); + } + + // Parse a single entry from /proc/net/unix. + // + // Sample file: + // + // clang-format off + // + // Num RefCount Protocol Flags Type St Inode Path" + // ffffa130e7041c00: 00000002 00000000 00010000 0001 01 1299413685 /tmp/control_server/13293772586877554487 + // ffffa14f547dc400: 00000002 00000000 00010000 0001 01 3793 @remote_coredump + // + // clang-format on + // + // Note that from the second entry, the inode number can be padded using + // spaces, so we need to handle it separately during parsing. See + // net/unix/af_unix.c:unix_seq_show() for how these entries are produced. In + // particular, only the inode field is padded with spaces. + UnixEntry entry; + + // Process the first 6 fields, up to but not including "Inode". + std::vector fields = absl::StrSplit(line, absl::MaxSplits(' ', 6)); + + if (fields.size() < 7) { + return PosixError(EINVAL, StrFormat("Invalid entry: '%s'\n", line)); + } + + // AtoiBase can't handle the ':' in the "Num" field, so strip it out. 
+ std::vector addr = absl::StrSplit(fields[0], ':'); + ASSIGN_OR_RETURN_ERRNO(entry.addr, AtoiBase(addr[0], 16)); + + ASSIGN_OR_RETURN_ERRNO(entry.refs, AtoiBase(fields[1], 16)); + ASSIGN_OR_RETURN_ERRNO(entry.protocol, AtoiBase(fields[2], 16)); + ASSIGN_OR_RETURN_ERRNO(entry.flags, AtoiBase(fields[3], 16)); + ASSIGN_OR_RETURN_ERRNO(entry.type, AtoiBase(fields[4], 16)); + ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16)); + + absl::string_view rest = absl::StripAsciiWhitespace(fields[6]); + fields = absl::StrSplit(rest, absl::MaxSplits(' ', 1)); + if (fields.empty()) { + return PosixError( + EINVAL, StrFormat("Invalid entry, missing 'Inode': '%s'\n", line)); + } + ASSIGN_OR_RETURN_ERRNO(entry.inode, AtoiBase(fields[0], 10)); + + entry.path = ""; + if (fields.size() > 1) { + entry.path = fields[1]; + } + + entries.push_back(entry); + } + + return entries; +} + +// Finds the first entry in 'entries' for which 'predicate' returns true. +// Returns true on match, and sets 'match' to point to the matching entry. +bool FindBy(std::vector entries, UnixEntry* match, + std::function predicate) { + for (int i = 0; i < entries.size(); ++i) { + if (predicate(entries[i])) { + *match = entries[i]; + return true; + } + } + return false; +} + +bool FindByPath(std::vector entries, UnixEntry* match, + const std::string& path) { + return FindBy(entries, match, [path](UnixEntry e) { return e.path == path; }); +} + +TEST(ProcNetUnix, Exists) { + const std::string content = + ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/unix")); + const std::string header_line = StrCat(kProcNetUnixHeader, "\n"); + if (IsRunningOnGvisor()) { + // Should be just the header since we don't have any unix domain sockets + // yet. + EXPECT_EQ(content, header_line); + } else { + // However, on a general linux machine, we could have abitrary sockets on + // the system, so just check the header. 
+ EXPECT_THAT(content, ::testing::StartsWith(header_line)); + } +} + +TEST(ProcNetUnix, FilesystemBindAcceptConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE( + FilesystemBoundUnixDomainSocketPair(SOCK_STREAM).Create()); + + std::string path1 = ExtractPath(sockets->first_addr()); + std::string path2 = ExtractPath(sockets->second_addr()); + std::cout << StreamFormat("Server socket address: %s\n", path1); + std::cout << StreamFormat("Client socket address: %s\n", path2); + + std::vector entries = + ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries()); + if (IsRunningOnGvisor()) { + EXPECT_EQ(entries.size(), 2); + } + + // The server-side socket's path is listed in the socket entry... + UnixEntry s1; + EXPECT_TRUE(FindByPath(entries, &s1, path1)); + + // ... but the client-side socket's path is not. + UnixEntry s2; + EXPECT_FALSE(FindByPath(entries, &s2, path2)); +} + +TEST(ProcNetUnix, AbstractBindAcceptConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE( + AbstractBoundUnixDomainSocketPair(SOCK_STREAM).Create()); + + std::string path1 = ExtractPath(sockets->first_addr()); + std::string path2 = ExtractPath(sockets->second_addr()); + std::cout << StreamFormat("Server socket address: '%s'\n", path1); + std::cout << StreamFormat("Client socket address: '%s'\n", path2); + + std::vector entries = + ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries()); + if (IsRunningOnGvisor()) { + EXPECT_EQ(entries.size(), 2); + } + + // The server-side socket's path is listed in the socket entry... + UnixEntry s1; + EXPECT_TRUE(FindByPath(entries, &s1, path1)); + + // ... but the client-side socket's path is not. + UnixEntry s2; + EXPECT_FALSE(FindByPath(entries, &s2, path2)); +} + +TEST(ProcNetUnix, SocketPair) { + // Under gvisor, ensure a socketpair() syscall creates exactly 2 new + // entries. We have no way to verify this under Linux, as we have no control + // over socket creation on a general Linux machine. 
+ SKIP_IF(!IsRunningOnGvisor()); + + std::vector entries = + ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries()); + ASSERT_EQ(entries.size(), 0); + + auto sockets = + ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_STREAM).Create()); + + entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries()); + EXPECT_EQ(entries.size(), 2); +} + +} // namespace +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From 80f901b16b8bb8fe397cc44578035173f5155b24 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 7 Feb 2019 23:14:06 -0800 Subject: Plumb IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP to netstack. Also includes a few fixes for IPv4 multicast support. IPv6 support is coming in a followup CL. PiperOrigin-RevId: 233008638 Change-Id: If7dae6222fef43fda48033f0292af77832d95e82 --- pkg/abi/linux/socket.go | 21 +- pkg/sentry/socket/epsocket/epsocket.go | 46 ++- pkg/tcpip/stack/stack.go | 7 + pkg/tcpip/transport/udp/endpoint.go | 16 +- pkg/tcpip/transport/udp/endpoint_state.go | 6 + test/syscalls/BUILD | 2 + test/syscalls/linux/BUILD | 34 ++ test/syscalls/linux/ip_socket_test_util.cc | 33 +- test/syscalls/linux/ip_socket_test_util.h | 7 + test/syscalls/linux/socket_ip_udp_generic.cc | 14 + test/syscalls/linux/socket_ipv4_udp_unbound.cc | 424 +++++++++++++++++++++ test/syscalls/linux/socket_ipv4_udp_unbound.h | 29 ++ .../linux/socket_ipv4_udp_unbound_loopback.cc | 35 ++ test/syscalls/linux/socket_test_util.cc | 38 +- test/syscalls/linux/socket_test_util.h | 5 + 15 files changed, 694 insertions(+), 23 deletions(-) create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound.cc create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound.h create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index a5f78506a..906776525 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -204,15 +204,30 @@ const ( // uapi/linux/socket.h. 
const SockAddrMax = 128 -// SockAddrInt is struct sockaddr_in, from uapi/linux/in.h. +// InetAddr is struct in_addr, from uapi/linux/in.h. +type InetAddr [4]byte + +// SockAddrInet is struct sockaddr_in, from uapi/linux/in.h. type SockAddrInet struct { Family uint16 Port uint16 - Addr [4]byte + Addr InetAddr Zero [8]uint8 // pad to sizeof(struct sockaddr). } -// SockAddrInt6 is struct sockaddr_in6, from uapi/linux/in6.h. +// InetMulticastRequest is struct ip_mreq, from uapi/linux/in.h. +type InetMulticastRequest struct { + MulticastAddr InetAddr + InterfaceAddr InetAddr +} + +// InetMulticastRequestWithNIC is struct ip_mreqn, from uapi/linux/in.h. +type InetMulticastRequestWithNIC struct { + InetMulticastRequest + InterfaceIndex int32 +} + +// SockAddrInet6 is struct sockaddr_in6, from uapi/linux/in6.h. type SockAddrInet6 struct { Family uint16 Port uint16 diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index ca865b111..16720456a 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1078,6 +1078,25 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) } +var ( + inetMulticastRequestSize = int(binary.Size(linux.InetMulticastRequest{})) + inetMulticastRequestWithNICSize = int(binary.Size(linux.InetMulticastRequestWithNIC{})) +) + +func copyInMulticastRequest(optVal []byte) (linux.InetMulticastRequestWithNIC, *syserr.Error) { + if len(optVal) < inetMulticastRequestSize { + return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument + } + + var req linux.InetMulticastRequestWithNIC + if len(optVal) >= inetMulticastRequestWithNICSize { + binary.Unmarshal(optVal[:inetMulticastRequestWithNICSize], usermem.ByteOrder, &req) + } else { + binary.Unmarshal(optVal[:inetMulticastRequestSize], usermem.ByteOrder, &req.InetMulticastRequest) + } + return req, nil +} + // setSockOptIP 
implements SetSockOpt when level is SOL_IP. func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { @@ -1096,7 +1115,31 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastTTLOption(v))) - case linux.IP_ADD_MEMBERSHIP, linux.MCAST_JOIN_GROUP, linux.IP_MULTICAST_IF: + case linux.IP_ADD_MEMBERSHIP: + req, err := copyInMulticastRequest(optVal) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{ + NIC: tcpip.NICID(req.InterfaceIndex), + InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), + MulticastAddr: tcpip.Address(req.MulticastAddr[:]), + })) + + case linux.IP_DROP_MEMBERSHIP: + req, err := copyInMulticastRequest(optVal) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{ + NIC: tcpip.NICID(req.InterfaceIndex), + InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), + MulticastAddr: tcpip.Address(req.MulticastAddr[:]), + })) + + case linux.MCAST_JOIN_GROUP, linux.IP_MULTICAST_IF: // FIXME: Disallow IP-level multicast group options by // default. These will need to be supported by appropriately plumbing // the level through to the network stack (if at all). 
However, we @@ -1108,7 +1151,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_BLOCK_SOURCE, linux.IP_CHECKSUM, - linux.IP_DROP_MEMBERSHIP, linux.IP_DROP_SOURCE_MEMBERSHIP, linux.IP_FREEBIND, linux.IP_HDRINCL, diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 7aa9dbd46..854ebe1bb 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -742,6 +742,9 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n return Route{}, tcpip.ErrNoRoute } + // TODO: Route multicast packets with no specified local + // address or NIC. + for i := range s.routeTable { if (id != 0 && id != s.routeTable[i].NIC) || (len(remoteAddr) != 0 && !s.routeTable[i].Match(remoteAddr)) { continue @@ -768,6 +771,10 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n return r, nil } + if isMulticast { + return Route{}, tcpip.ErrNetworkUnreachable + } + return Route{}, tcpip.ErrNoRoute } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index b2a27a7cb..d46bf0ade 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -99,6 +99,7 @@ type endpoint struct { effectiveNetProtos []tcpip.NetworkProtocolNumber } +// +stateify savable type multicastMembership struct { nicID tcpip.NICID multicastAddr tcpip.Address @@ -412,6 +413,8 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) } if nicID == 0 { + // TODO: Allow adding memberships without + // specifing an interface. return tcpip.ErrNoRoute } @@ -766,9 +769,11 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error } } + nicid := addr.NIC if len(addr.Addr) != 0 { // A local address was specified, verify that it's valid. 
- if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 { + nicid = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) + if nicid == 0 { return tcpip.ErrBadLocalAddress } } @@ -777,21 +782,21 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error LocalPort: addr.Port, LocalAddress: addr.Addr, } - id, err = e.registerWithStack(addr.NIC, netProtos, id) + id, err = e.registerWithStack(nicid, netProtos, id) if err != nil { return err } if commit != nil { if err := commit(); err != nil { // Unregister, the commit failed. - e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, ProtocolNumber, id, e) + e.stack.UnregisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e) e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) return err } } e.id = id - e.regNICID = addr.NIC + e.regNICID = nicid e.effectiveNetProtos = netProtos // Mark endpoint as bound. @@ -815,7 +820,8 @@ func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcp return err } - e.bindNICID = addr.NIC + // Save the effective NICID generated by bindLocked. 
+ e.bindNICID = e.regNICID return nil } diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index db1e281ad..4d8210294 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -103,4 +103,10 @@ func (e *endpoint) afterLoad() { if err != nil { panic(*err) } + + for _, m := range e.multicastMemberships { + if err := e.stack.JoinGroup(e.netProto, m.nicID, m.multicastAddr); err != nil { + panic(err) + } + } } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 53da121ec..a5abf8013 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -361,6 +361,8 @@ syscall_test( test = "//test/syscalls/linux:socket_ip_udp_loopback_test", ) +syscall_test(test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_test") + syscall_test(test = "//test/syscalls/linux:socket_netdevice_test") syscall_test(test = "//test/syscalls/linux:socket_netlink_route_test") diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 590ee1659..75fa52a57 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1931,6 +1931,24 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "socket_ipv4_udp_unbound_test_cases", + testonly = 1, + srcs = [ + "socket_ipv4_udp_unbound.cc", + ], + hdrs = [ + "socket_ipv4_udp_unbound.h", + ], + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + cc_binary( name = "socket_abstract_test", testonly = 1, @@ -2124,6 +2142,22 @@ cc_binary( ], ) +cc_binary( + name = "socket_ipv4_udp_unbound_loopback_test", + testonly = 1, + srcs = [ + "socket_ipv4_udp_unbound_loopback.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_ipv4_udp_unbound_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + cc_binary( name = "socket_domain_test", testonly = 1, diff --git 
a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index 1659d3d83..f8232fc24 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -12,11 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include + #include "test/syscalls/linux/ip_socket_test_util.h" namespace gvisor { namespace testing { +PosixErrorOr InterfaceIndex(std::string name) { + // TODO: Consider using netlink. + ifreq req = {}; + memcpy(req.ifr_name, name.c_str(), name.size()); + ASSIGN_OR_RETURN_ERRNO(auto sock, Socket(AF_INET, SOCK_DGRAM, 0)); + RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(sock.get(), SIOCGIFINDEX, &req)); + return req.ifr_ifindex; +} + namespace { std::string DescribeSocketType(int type) { @@ -28,7 +41,7 @@ std::string DescribeSocketType(int type) { SocketPairKind IPv6TCPAcceptBindSocketPair(int type) { std::string description = - absl::StrCat(DescribeSocketType(type), "IPv6 TCP socket"); + absl::StrCat(DescribeSocketType(type), "connected IPv6 TCP socket"); return SocketPairKind{ description, TCPAcceptBindSocketPairCreator(AF_INET6, type | SOCK_STREAM, 0, /* dual_stack = */ false)}; @@ -36,7 +49,7 @@ SocketPairKind IPv6TCPAcceptBindSocketPair(int type) { SocketPairKind IPv4TCPAcceptBindSocketPair(int type) { std::string description = - absl::StrCat(DescribeSocketType(type), "IPv4 TCP socket"); + absl::StrCat(DescribeSocketType(type), "connected IPv4 TCP socket"); return SocketPairKind{ description, TCPAcceptBindSocketPairCreator(AF_INET, type | SOCK_STREAM, 0, /* dual_stack = */ false)}; @@ -44,7 +57,7 @@ SocketPairKind IPv4TCPAcceptBindSocketPair(int type) { SocketPairKind DualStackTCPAcceptBindSocketPair(int type) { std::string description = - absl::StrCat(DescribeSocketType(type), "dual stack TCP socket"); + absl::StrCat(DescribeSocketType(type), "connected dual stack TCP socket"); return SocketPairKind{ 
description, TCPAcceptBindSocketPairCreator(AF_INET6, type | SOCK_STREAM, 0, /* dual_stack = */ true)}; @@ -52,7 +65,7 @@ SocketPairKind DualStackTCPAcceptBindSocketPair(int type) { SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) { std::string description = - absl::StrCat(DescribeSocketType(type), "IPv6 UDP socket"); + absl::StrCat(DescribeSocketType(type), "connected IPv6 UDP socket"); return SocketPairKind{description, UDPBidirectionalBindSocketPairCreator( AF_INET6, type | SOCK_DGRAM, 0, /* dual_stack = */ false)}; @@ -60,7 +73,7 @@ SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) { SocketPairKind IPv4UDPBidirectionalBindSocketPair(int type) { std::string description = - absl::StrCat(DescribeSocketType(type), "IPv4 UDP socket"); + absl::StrCat(DescribeSocketType(type), "connected IPv4 UDP socket"); return SocketPairKind{description, UDPBidirectionalBindSocketPairCreator( AF_INET, type | SOCK_DGRAM, 0, /* dual_stack = */ false)}; @@ -68,11 +81,19 @@ SocketPairKind IPv4UDPBidirectionalBindSocketPair(int type) { SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type) { std::string description = - absl::StrCat(DescribeSocketType(type), "dual stack UDP socket"); + absl::StrCat(DescribeSocketType(type), "connected dual stack UDP socket"); return SocketPairKind{description, UDPBidirectionalBindSocketPairCreator( AF_INET6, type | SOCK_DGRAM, 0, /* dual_stack = */ true)}; } +SocketPairKind IPv4UDPUnboundSocketPair(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "IPv4 UDP socket"); + return SocketPairKind{ + description, UDPUnboundSocketPairCreator(AF_INET, type | SOCK_DGRAM, 0, + /* dual_stack = */ false)}; +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h index 1e1400ecd..a6721091a 100644 --- a/test/syscalls/linux/ip_socket_test_util.h +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -21,6 +21,9 @@ 
namespace gvisor { namespace testing { +// InterfaceIndex returns the index of the named interface. +PosixErrorOr InterfaceIndex(std::string name); + // IPv6TCPAcceptBindSocketPair returns a SocketPairKind that represents // SocketPairs created with bind() and accept() syscalls with AF_INET6 and the // given type bound to the IPv6 loopback. @@ -51,6 +54,10 @@ SocketPairKind IPv4UDPBidirectionalBindSocketPair(int type); // AF_INET6 and the given type bound to the IPv4 loopback. SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type); +// IPv4UDPUnboundSocketPair returns a SocketPairKind that represents +// SocketPairs created with AF_INET and the given type. +SocketPairKind IPv4UDPUnboundSocketPair(int type); + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index 789154fb3..58d1c846d 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -117,5 +117,19 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLAboveMax) { SyscallFailsWithErrno(EINVAL)); } +TEST_P(UDPSocketPairTest, SetEmptyIPAddMembership) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct ip_mreqn req = {}; + int ret = setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &req, + sizeof(req)); + // FIXME: gVisor returns the incorrect errno. 
+ if (IsRunningOnGvisor()) { + EXPECT_THAT(ret, SyscallFails()); + } else { + EXPECT_THAT(ret, SyscallFailsWithErrno(EINVAL)); + } +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc new file mode 100644 index 000000000..1b47139e4 --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -0,0 +1,424 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_ipv4_udp_unbound.h" + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// Check that packets are not received without a group memebership. Default send +// interface configured by bind. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNoGroup) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Bind the first FD to the loopback. This is an alternative to + // IP_MULTICAST_IF for setting the default send interface. 
+ sockaddr_in senderAddr = {}; + senderAddr.sin_family = AF_INET; + senderAddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT( + bind(sockets->first_fd(), reinterpret_cast(&senderAddr), + sizeof(senderAddr)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address. If multicast worked like unicast, + // this would ensure that we get the packet. + sockaddr_in receiverAddr = {}; + receiverAddr.sin_family = AF_INET; + receiverAddr.sin_addr.s_addr = htonl(INADDR_ANY); + EXPECT_THAT( + bind(sockets->second_fd(), reinterpret_cast(&receiverAddr), + sizeof(receiverAddr)), + SyscallSucceeds()); + socklen_t receiverAddrLen = sizeof(receiverAddr); + EXPECT_THAT( + getsockname(sockets->second_fd(), + reinterpret_cast(&receiverAddr), &receiverAddrLen), + SyscallSucceeds()); + EXPECT_EQ(receiverAddrLen, sizeof(receiverAddr)); + + // Send the multicast packet. + sockaddr_in sendAddr = {}; + sendAddr.sin_family = AF_INET; + sendAddr.sin_port = receiverAddr.sin_port; + sendAddr.sin_addr.s_addr = inet_addr("224.0.2.1"); + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT(RetryEINTR(sendto)( + sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&sendAddr), sizeof(sendAddr)), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that not setting a default send interface prevents multicast packets +// from being sent. Group membership interface configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Bind the second FD to the v4 any address to ensure that we can receive any + // unicast packet. 
+ sockaddr_in receiverAddr = {}; + receiverAddr.sin_family = AF_INET; + receiverAddr.sin_addr.s_addr = htonl(INADDR_ANY); + EXPECT_THAT( + bind(sockets->second_fd(), reinterpret_cast(&receiverAddr), + sizeof(receiverAddr)), + SyscallSucceeds()); + socklen_t receiverAddrLen = sizeof(receiverAddr); + EXPECT_THAT( + getsockname(sockets->second_fd(), + reinterpret_cast(&receiverAddr), &receiverAddrLen), + SyscallSucceeds()); + EXPECT_EQ(receiverAddrLen, sizeof(receiverAddr)); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr("224.0.2.1"); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + sockaddr_in sendAddr = {}; + sendAddr.sin_family = AF_INET; + sendAddr.sin_port = receiverAddr.sin_port; + sendAddr.sin_addr.s_addr = inet_addr("224.0.2.1"); + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT(RetryEINTR(sendto)( + sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&sendAddr), sizeof(sendAddr)), + SyscallFailsWithErrno(ENETUNREACH)); +} + +// Check that not setting a default send interface prevents multicast packets +// from being sent. Group membership interface configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Bind the second FD to the v4 any address to ensure that we can receive any + // unicast packet. 
+ sockaddr_in receiverAddr = {}; + receiverAddr.sin_family = AF_INET; + receiverAddr.sin_addr.s_addr = htonl(INADDR_ANY); + EXPECT_THAT( + bind(sockets->second_fd(), reinterpret_cast(&receiverAddr), + sizeof(receiverAddr)), + SyscallSucceeds()); + socklen_t receiverAddrLen = sizeof(receiverAddr); + EXPECT_THAT( + getsockname(sockets->second_fd(), + reinterpret_cast(&receiverAddr), &receiverAddrLen), + SyscallSucceeds()); + EXPECT_EQ(receiverAddrLen, sizeof(receiverAddr)); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr("224.0.2.1"); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + sockaddr_in sendAddr = {}; + sendAddr.sin_family = AF_INET; + sendAddr.sin_port = receiverAddr.sin_port; + sendAddr.sin_addr.s_addr = inet_addr("224.0.2.1"); + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT(RetryEINTR(sendto)( + sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&sendAddr), sizeof(sendAddr)), + SyscallFailsWithErrno(ENETUNREACH)); +} + +// Check that multicast works when the default send interface is configured by +// bind and the group membership is configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Bind the first FD to the loopback. This is an alternative to + // IP_MULTICAST_IF for setting the default send interface. + sockaddr_in senderAddr = {}; + senderAddr.sin_family = AF_INET; + senderAddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT( + bind(sockets->first_fd(), reinterpret_cast(&senderAddr), + sizeof(senderAddr)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. 
+ sockaddr_in receiverAddr = {}; + receiverAddr.sin_family = AF_INET; + receiverAddr.sin_addr.s_addr = htonl(INADDR_ANY); + EXPECT_THAT( + bind(sockets->second_fd(), reinterpret_cast(&receiverAddr), + sizeof(receiverAddr)), + SyscallSucceeds()); + socklen_t receiverAddrLen = sizeof(receiverAddr); + EXPECT_THAT( + getsockname(sockets->second_fd(), + reinterpret_cast(&receiverAddr), &receiverAddrLen), + SyscallSucceeds()); + EXPECT_EQ(receiverAddrLen, sizeof(receiverAddr)); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr("224.0.2.1"); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + sockaddr_in sendAddr = {}; + sendAddr.sin_family = AF_INET; + sendAddr.sin_port = receiverAddr.sin_port; + sendAddr.sin_addr.s_addr = inet_addr("224.0.2.1"); + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT(RetryEINTR(sendto)( + sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&sendAddr), sizeof(sendAddr)), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is confgured by +// bind and the group membership is configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Bind the first FD to the loopback. This is an alternative to + // IP_MULTICAST_IF for setting the default send interface. 
+ sockaddr_in senderAddr = {}; + senderAddr.sin_family = AF_INET; + senderAddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT( + bind(sockets->first_fd(), reinterpret_cast(&senderAddr), + sizeof(senderAddr)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + sockaddr_in receiverAddr = {}; + receiverAddr.sin_family = AF_INET; + receiverAddr.sin_addr.s_addr = htonl(INADDR_ANY); + EXPECT_THAT( + bind(sockets->second_fd(), reinterpret_cast(&receiverAddr), + sizeof(receiverAddr)), + SyscallSucceeds()); + socklen_t receiverAddrLen = sizeof(receiverAddr); + EXPECT_THAT( + getsockname(sockets->second_fd(), + reinterpret_cast(&receiverAddr), &receiverAddrLen), + SyscallSucceeds()); + EXPECT_EQ(receiverAddrLen, sizeof(receiverAddr)); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr("224.0.2.1"); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + sockaddr_in sendAddr = {}; + sendAddr.sin_family = AF_INET; + sendAddr.sin_port = receiverAddr.sin_port; + sendAddr.sin_addr.s_addr = inet_addr("224.0.2.1"); + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT(RetryEINTR(sendto)( + sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&sendAddr), sizeof(sendAddr)), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that dropping a group membership that does not exist fails. 
+TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastInvalidDrop) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Unregister from a membership that we didn't have. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr("224.0.2.1"); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP, + &group, sizeof(group)), + SyscallFailsWithErrno(EADDRNOTAVAIL)); +} + +// Check that dropping a group membership prevents multicast packets from being +// delivered. Default send address configured by bind and group membership +// interface configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Bind the first FD to the loopback. This is an alternative to + // IP_MULTICAST_IF for setting the default send interface. + sockaddr_in senderAddr = {}; + senderAddr.sin_family = AF_INET; + senderAddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT( + bind(sockets->first_fd(), reinterpret_cast(&senderAddr), + sizeof(senderAddr)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + sockaddr_in receiverAddr = {}; + receiverAddr.sin_family = AF_INET; + receiverAddr.sin_addr.s_addr = htonl(INADDR_ANY); + EXPECT_THAT( + bind(sockets->second_fd(), reinterpret_cast(&receiverAddr), + sizeof(receiverAddr)), + SyscallSucceeds()); + socklen_t receiverAddrLen = sizeof(receiverAddr); + EXPECT_THAT( + getsockname(sockets->second_fd(), + reinterpret_cast(&receiverAddr), &receiverAddrLen), + SyscallSucceeds()); + EXPECT_EQ(receiverAddrLen, sizeof(receiverAddr)); + + // Register and unregister to receive multicast packets. 
+ ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr("224.0.2.1"); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + sockaddr_in sendAddr = {}; + sendAddr.sin_family = AF_INET; + sendAddr.sin_port = receiverAddr.sin_port; + sendAddr.sin_addr.s_addr = inet_addr("224.0.2.1"); + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT(RetryEINTR(sendto)( + sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&sendAddr), sizeof(sendAddr)), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that dropping a group membership prevents multicast packets from being +// delivered. Default send address configured by bind and group membership +// interface configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Bind the first FD to the loopback. This is an alternative to + // IP_MULTICAST_IF for setting the default send interface. + sockaddr_in senderAddr = {}; + senderAddr.sin_family = AF_INET; + senderAddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT( + bind(sockets->first_fd(), reinterpret_cast(&senderAddr), + sizeof(senderAddr)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. 
+ sockaddr_in receiverAddr = {}; + receiverAddr.sin_family = AF_INET; + receiverAddr.sin_addr.s_addr = htonl(INADDR_ANY); + EXPECT_THAT( + bind(sockets->second_fd(), reinterpret_cast(&receiverAddr), + sizeof(receiverAddr)), + SyscallSucceeds()); + socklen_t receiverAddrLen = sizeof(receiverAddr); + EXPECT_THAT( + getsockname(sockets->second_fd(), + reinterpret_cast(&receiverAddr), &receiverAddrLen), + SyscallSucceeds()); + EXPECT_EQ(receiverAddrLen, sizeof(receiverAddr)); + + // Register and unregister to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr("224.0.2.1"); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_DROP_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + sockaddr_in sendAddr = {}; + sendAddr.sin_family = AF_INET; + sendAddr.sin_port = receiverAddr.sin_port; + sendAddr.sin_addr.s_addr = inet_addr("224.0.2.1"); + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT(RetryEINTR(sendto)( + sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&sendAddr), sizeof(sendAddr)), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. 
+ char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.h b/test/syscalls/linux/socket_ipv4_udp_unbound.h new file mode 100644 index 000000000..a780c0144 --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.h @@ -0,0 +1,29 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to pairs of IPv4 UDP sockets. +using IPv4UDPUnboundSocketPairTest = SocketPairTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_ diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc new file mode 100644 index 000000000..b70faa33d --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc @@ -0,0 +1,35 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_ipv4_udp_unbound.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSocketPairs() { + return ApplyVec( + IPv4UDPUnboundSocketPair, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK})); +} + +INSTANTIATE_TEST_CASE_P(IPv4UDPSockets, IPv4UDPUnboundSocketPairTest, + ::testing::ValuesIn(GetSocketPairs())); + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc index 80a59df7e..49b8c583f 100644 --- a/test/syscalls/linux/socket_test_util.cc +++ b/test/syscalls/linux/socket_test_util.cc @@ -388,24 +388,33 @@ Creator TCPAcceptBindSocketPairCreator(int domain, int type, }; } +template +PosixErrorOr> CreateUDPBoundSocketPair( + int sock1, int sock2, int type, bool dual_stack) { + ASSIGN_OR_RETURN_ERRNO(T addr1, BindIP(sock1, dual_stack)); + ASSIGN_OR_RETURN_ERRNO(T addr2, BindIP(sock2, dual_stack)); + + return absl::make_unique(sock1, sock2, addr1, addr2); +} + template PosixErrorOr> CreateUDPBidirectionalBindSocketPair(int sock1, int sock2, int type, bool dual_stack) { - ASSIGN_OR_RETURN_ERRNO(T addr1, BindIP(sock1, dual_stack)); - ASSIGN_OR_RETURN_ERRNO(T addr2, BindIP(sock2, dual_stack)); + ASSIGN_OR_RETURN_ERRNO( + auto socks, CreateUDPBoundSocketPair(sock1, sock2, type, dual_stack)); // Connect sock1 to sock2. 
- RETURN_ERROR_IF_SYSCALL_FAIL(connect( - sock1, reinterpret_cast(&addr2), sizeof(addr2))); + RETURN_ERROR_IF_SYSCALL_FAIL(connect(socks->first_fd(), socks->second_addr(), + socks->second_addr_size())); MaybeSave(); // Successful connection. // Connect sock2 to sock1. - RETURN_ERROR_IF_SYSCALL_FAIL(connect( - sock2, reinterpret_cast(&addr1), sizeof(addr1))); + RETURN_ERROR_IF_SYSCALL_FAIL(connect(socks->second_fd(), socks->first_addr(), + socks->first_addr_size())); MaybeSave(); // Successful connection. - return absl::make_unique(sock1, sock2, addr1, addr2); + return socks; } Creator UDPBidirectionalBindSocketPairCreator(int domain, int type, @@ -429,6 +438,21 @@ Creator UDPBidirectionalBindSocketPairCreator(int domain, int type, }; } +Creator UDPUnboundSocketPairCreator(int domain, int type, + int protocol, bool dual_stack) { + return [=]() -> PosixErrorOr> { + int sock1; + RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + + int sock2; + RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. + + return absl::make_unique(sock1, sock2); + }; +} + SocketPairKind Reversed(SocketPairKind const& base) { auto const& creator = base.creator; return SocketPairKind{ diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h index 6d84b3fa8..826374dc6 100644 --- a/test/syscalls/linux/socket_test_util.h +++ b/test/syscalls/linux/socket_test_util.h @@ -273,6 +273,11 @@ Creator UDPBidirectionalBindSocketPairCreator(int domain, int type, int protocol, bool dual_stack); +// UDPUnboundSocketPairCreator returns a Creator that obtains file +// descriptors by creating UDP sockets. +Creator UDPUnboundSocketPairCreator(int domain, int type, + int protocol, bool dual_stack); + // A SocketPairKind couples a human-readable description of a socket pair with // a function that creates such a socket pair. 
struct SocketPairKind { -- cgit v1.2.3 From 9c9386d2a8c041f3c1f19469b47414c419f7d534 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 8 Feb 2019 12:59:04 -0800 Subject: CopyObjectOut should allocate a byte slice the size of the encoded object. This adds an extra Reflection call to CopyObjectOut, but avoids many small slice allocations if the object is large, since without this we grow the backing slice incrementally as we encode more data. PiperOrigin-RevId: 233110960 Change-Id: I93569af55912391e5471277f779139c23f040147 --- pkg/sentry/usermem/usermem.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 1d6c0b4d6..75ac4d22d 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -181,7 +181,11 @@ func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts Addr: addr, Opts: opts, } - return w.Write(binary.Marshal(nil, ByteOrder, src)) + // Allocate a byte slice the size of the object being marshaled. This + // adds an extra reflection call, but avoids needing to grow the slice + // during encoding, which can result in many heap-allocated slices. + b := make([]byte, 0, binary.Size(src)) + return w.Write(binary.Marshal(b, ByteOrder, src)) } // CopyObjectIn copies a fixed-size value or slice of fixed-size values from -- cgit v1.2.3 From e884168e1ea5cd8be4d50c85a4ad4fbcdaca1e5c Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 8 Feb 2019 15:47:25 -0800 Subject: Encode stat to bytes manually, instead of calling CopyObjectOut. CopyObjectOut grows its destination byte slice incrementally, causing many small slice allocations on the heap. This leads to increased GC and noticeably slower stat calls. 
PiperOrigin-RevId: 233140904 Change-Id: Ieb90295dd8dd45b3e56506fef9d7f86c92e97d97 --- pkg/abi/linux/file.go | 4 +++ pkg/sentry/syscalls/linux/sys_stat.go | 61 ++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 15 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index ae33f4a4d..e5a51a9fd 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -19,6 +19,7 @@ import ( "strings" "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/binary" ) // Constants for open(2). @@ -177,6 +178,9 @@ type Stat struct { X_unused [3]int64 } +// SizeOfStat is the size of a Stat struct. +var SizeOfStat = binary.Size(Stat{}) + // FileMode represents a mode_t. type FileMode uint diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 9c433c45d..95f161aac 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -124,21 +125,51 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err mode |= linux.ModeSocket } - _, err = t.CopyOut(statAddr, linux.Stat{ - Dev: uint64(d.Inode.StableAttr.DeviceID), - Rdev: uint64(linux.MakeDeviceID(d.Inode.StableAttr.DeviceFileMajor, d.Inode.StableAttr.DeviceFileMinor)), - Ino: uint64(d.Inode.StableAttr.InodeID), - Nlink: uattr.Links, - Mode: mode | uint32(uattr.Perms.LinuxMode()), - UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()), - GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()), - Size: uattr.Size, - Blksize: d.Inode.StableAttr.BlockSize, - Blocks: uattr.Usage / 512, - ATime: uattr.AccessTime.Timespec(), - MTime: 
uattr.ModificationTime.Timespec(), - CTime: uattr.StatusChangeTime.Timespec(), - }) + // We encode the stat struct to bytes manually, as stat() is a very + // common syscall for many applications, and t.CopyObjectOut has + // noticeable performance impact due to its many slice allocations and + // use of reflection. + b := make([]byte, 0, linux.SizeOfStat) + + // Dev (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(d.Inode.StableAttr.DeviceID)) + // Ino (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(d.Inode.StableAttr.InodeID)) + // Nlink (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links) + // Mode (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, mode|uint32(uattr.Perms.LinuxMode())) + // UID (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + // GID (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow())) + // Padding (uint32) + b = binary.AppendUint32(b, usermem.ByteOrder, 0) + // Rdev (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(d.Inode.StableAttr.DeviceFileMajor, d.Inode.StableAttr.DeviceFileMinor))) + // Size (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) + // Blksize (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(d.Inode.StableAttr.BlockSize)) + // Blocks (uint64) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) + + // ATime + atime := uattr.AccessTime.Timespec() + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec)) + + // MTime + mtime := uattr.ModificationTime.Timespec() + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec)) + + // CTime + ctime := uattr.StatusChangeTime.Timespec() + b = binary.AppendUint64(b, 
usermem.ByteOrder, uint64(ctime.Sec)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) + + _, err = t.CopyOutBytes(statAddr, b) return err } -- cgit v1.2.3 From f17692d8074787d058dc33cc95587b15dba3b161 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 8 Feb 2019 15:53:16 -0800 Subject: Add fs.AsyncWithContext and call it in fs/gofer/inodeOperations.Release. fs/gofer/inodeOperations.Release does some asynchronous work. Previously it was calling fs.Async with an anonymous function, which caused the function to be allocated on the heap. Because Release is relatively hot, this results in a lot of small allocations and increased GC pressure, noticeable in perf profiles. This CL adds a new function, AsyncWithContext, which is just like Async, but passes a context to the async function. It avoids the need for an extra anonymous function in fs/gofer/inodeOperations.Release. The Async function itself still requires a single anonymous function. PiperOrigin-RevId: 233141763 Change-Id: I1dce4a883a7be9a8a5b884db01e654655f16d19c --- pkg/sentry/fs/fs.go | 12 ++++++++++++ pkg/sentry/fs/gofer/inode.go | 7 ++++--- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 0ba4b7269..36f263235 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -57,6 +57,7 @@ import ( "sync" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" ) var ( @@ -87,6 +88,17 @@ func Async(f func()) { }() } +// AsyncWithContext is just like Async, except that it calls the asynchronous +// function with the given context as argument. This function exists to avoid +// needing to allocate an extra function on the heap in a hot path. +func AsyncWithContext(ctx context.Context, f func(context.Context)) { + workMu.RLock() + go func() { // S/R-SAFE: AsyncBarrier must be called. + defer workMu.RUnlock() // Ensure RUnlock in case of panic. 
+ f(ctx) + }() +} + // AsyncErrorBarrier waits for all outstanding asynchronous work to complete, or // the first async error to arrive. Other unfinished async executions will // continue in the background. Other past and future async errors are ignored. diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 16435169a..83fff7517 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -352,9 +352,10 @@ func (i *inodeOperations) Release(ctx context.Context) { // Releasing the fileState may make RPCs to the gofer. There is // no need to wait for those to return, so we can do this // asynchronously. - fs.Async(func() { - i.fileState.Release(ctx) - }) + // + // We use AsyncWithContext to avoid needing to allocate an extra + // anonymous function on the heap. + fs.AsyncWithContext(ctx, i.fileState.Release) } // Mappable implements fs.InodeOperations.Mappable. -- cgit v1.2.3 From 7aaa6cf22594cfc7eff2070191c0077bfd58046a Mon Sep 17 00:00:00 2001 From: Googler Date: Wed, 13 Feb 2019 12:06:20 -0800 Subject: Internal change. 
PiperOrigin-RevId: 233802562 Change-Id: I40e1b13fd571daaf241b00f8df4bcedd034dc3f1 --- pkg/sentry/fs/BUILD | 1 + pkg/sentry/fs/file.go | 5 +++++ pkg/sentry/fs/gofer/file.go | 20 ++++++++++++++++++-- pkg/sentry/fs/inode.go | 4 ++++ pkg/sentry/fs/tmpfs/BUILD | 1 + pkg/sentry/fs/tmpfs/inode_file.go | 13 +++++++++++++ 6 files changed, 42 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index e58333da3..6957c1bbe 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -48,6 +48,7 @@ go_library( "//pkg/amutex", "//pkg/ilist", "//pkg/log", + "//pkg/metric", "//pkg/p9", "//pkg/refs", "//pkg/sentry/arch", diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index d6752ed1b..b66d2f265 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/amutex" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" @@ -32,6 +33,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) +var reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") + // FileMaxOffset is the maximum possible file offset. 
const FileMaxOffset = math.MaxInt64 @@ -237,6 +240,7 @@ func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) return 0, syserror.ErrInterrupted } + reads.Increment() n, err := f.FileOperations.Read(ctx, f, dst, f.offset) if n > 0 { atomic.AddInt64(&f.offset, n) @@ -255,6 +259,7 @@ func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) return 0, syserror.ErrInterrupted } + reads.Increment() n, err := f.FileOperations.Read(ctx, f, dst, offset) f.mu.Unlock() return n, err diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 2bb25daf1..7a6dabba8 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -31,7 +31,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -var openedWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") +var ( + opensWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") + opens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a 9P file was opened from a gofer.") + opensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a host file was opened from a gofer.") + reads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.") + readsHost = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.") +) // fileOperations implements fs.FileOperations for a remote file system. 
// @@ -91,10 +97,15 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF } if flags.Write { if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Execute: true}); err == nil { - openedWX.Increment() + opensWX.Increment() log.Warningf("Opened a writable executable: %q", name) } } + if handles.Host != nil { + opensHost.Increment() + } else { + opens9P.Increment() + } return fs.NewFile(ctx, dirent, flags, f) } @@ -227,6 +238,11 @@ func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IO // Not all remote file systems enforce this so this client does. return 0, syserror.EISDIR } + if f.handles.Host != nil { + readsHost.Increment() + } else { + reads9P.Increment() + } if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { return f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index d32f52d55..08b5c5902 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -17,6 +17,7 @@ package fs import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" @@ -26,6 +27,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" ) +var opens = metric.MustCreateNewUint64Metric("/fs/opens", false /* sync */, "Number of file opens.") + // Inode is a file system object that can be simultaneously referenced by different // components of the VFS (Dirent, fs.File, etc). 
// @@ -236,6 +239,7 @@ func (i *Inode) GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File, if i.overlay != nil { return overlayGetFile(ctx, i.overlay, d, flags) } + opens.Increment() return i.InodeOperations.GetFile(ctx, d, flags) } diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index bf5b68869..9570c71e5 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -15,6 +15,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/metric", "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 2505e2c69..ef5e67dda 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -18,6 +18,7 @@ import ( "io" "sync" + "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" @@ -29,6 +30,12 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +var ( + opensRO = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.") + opensW = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.") + reads = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.") +) + // fileInodeOperations implements fs.InodeOperations for a regular tmpfs file. // These files are backed by FrameRegions allocated from a platform.Memory, // and may be directly mapped. @@ -116,6 +123,11 @@ func (*fileInodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, old // GetFile implements fs.InodeOperations.GetFile. 
func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + if flags.Write { + opensW.Increment() + } else if flags.Read { + opensRO.Increment() + } flags.Pread = true flags.Pwrite = true return fs.NewFile(ctx, d, flags, ®ularFileOperations{iops: f}), nil @@ -237,6 +249,7 @@ func (*fileInodeOperations) StatFS(context.Context) (fs.Info, error) { } func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + reads.Increment() // Zero length reads for tmpfs are no-ops. if dst.NumBytes() == 0 { return 0, nil -- cgit v1.2.3 From 0e84ae72e086c77cea066000a898b7bc951ba790 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 13 Feb 2019 14:24:23 -0800 Subject: Improve safecopy sanity checks. - Fix CopyIn/CopyOut/ZeroOut range checks. - Include the faulting signal number in the panic message. PiperOrigin-RevId: 233829501 Change-Id: I8959ead12d05dbd4cd63c2b908cddeb2a27eb513 --- pkg/sentry/platform/safecopy/safecopy_unsafe.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go index df1c35b66..e78a6714e 100644 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go @@ -92,14 +92,14 @@ func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { return len(dst), nil } - if faultN, srcN := uintptr(fault), uintptr(src); faultN < srcN && faultN >= srcN+toCopy { - panic(fmt.Sprintf("CopyIn faulted at %#x, which is outside source [%#x, %#x)", faultN, srcN, srcN+toCopy)) + faultN, srcN := uintptr(fault), uintptr(src) + if faultN < srcN || faultN >= srcN+toCopy { + panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy)) } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an 
instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. - faultN, srcN := uintptr(fault), uintptr(src) var done int if faultN-srcN > maxRegisterSize { done = int(faultN - srcN - maxRegisterSize) @@ -126,14 +126,14 @@ func CopyOut(dst unsafe.Pointer, src []byte) (int, error) { return len(src), nil } - if faultN, dstN := uintptr(fault), uintptr(dst); faultN < dstN && faultN >= dstN+toCopy { - panic(fmt.Sprintf("CopyOut faulted at %#x, which is outside destination [%#x, %#x)", faultN, dstN, dstN+toCopy)) + faultN, dstN := uintptr(fault), uintptr(dst) + if faultN < dstN || faultN >= dstN+toCopy { + panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy)) } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. 
- faultN, dstN := uintptr(fault), uintptr(dst) var done int if faultN-dstN > maxRegisterSize { done = int(faultN - dstN - maxRegisterSize) @@ -173,7 +173,7 @@ func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { faultAfterDst = faultN - dstN } if faultAfterSrc >= toCopy && faultAfterDst >= toCopy { - panic(fmt.Sprintf("Copy faulted at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", faultN, srcN, srcN+toCopy, dstN, dstN+toCopy)) + panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy)) } faultedAfter := faultAfterSrc if faultedAfter > faultAfterDst { @@ -207,14 +207,14 @@ func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { return toZero, nil } - if faultN, dstN := uintptr(fault), uintptr(dst); faultN < dstN && faultN >= dstN+toZero { - panic(fmt.Sprintf("ZeroOut faulted at %#x, which is outside destination [%#x, %#x)", faultN, dstN, dstN+toZero)) + faultN, dstN := uintptr(fault), uintptr(dst) + if faultN < dstN || faultN >= dstN+toZero { + panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero)) } // memclr might have ended the write up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to write up to the fault. - faultN, dstN := uintptr(fault), uintptr(dst) var done uintptr if faultN-dstN > maxRegisterSize { done = faultN - dstN - maxRegisterSize -- cgit v1.2.3 From 0a41ea72c1f70916bdbb68d9fdfa6c438e28b5b2 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 14 Feb 2019 15:46:25 -0800 Subject: Don't allow writing or reading to TTY unless process group is in foreground. 
If a background process tries to read from a TTY, linux sends it a SIGTTIN unless the signal is blocked or ignored, or the process group is an orphan, in which case the syscall returns EIO. See drivers/tty/n_tty.c:n_tty_read()=>job_control(). If a background process tries to write a TTY, set the termios, or set the foreground process group, linux then sends a SIGTTOU. If the signal is ignored or blocked, linux allows the write. If the process group is an orphan, the syscall returns EIO. See drivers/tty/tty_io.c:tty_check_change(). PiperOrigin-RevId: 234044367 Change-Id: I009461352ac4f3f11c5d42c43ac36bb0caa580f9 --- pkg/sentry/control/proc.go | 14 ++- pkg/sentry/fs/host/tty.go | 183 ++++++++++++++++++++++++++++++----- pkg/sentry/kernel/kernel.go | 44 +++------ pkg/sentry/kernel/sessions.go | 29 ++++++ pkg/sentry/kernel/signal_handlers.go | 8 ++ runsc/boot/loader.go | 14 ++- 6 files changed, 234 insertions(+), 58 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 923399fb2..e848def14 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -222,10 +222,18 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI return nil, 0, nil, err } - if ttyFile == nil { - return tg, tid, nil, nil + var ttyFileOps *host.TTYFileOperations + if ttyFile != nil { + // Set the foreground process group on the TTY before starting + // the process. + ttyFileOps = ttyFile.FileOperations.(*host.TTYFileOperations) + ttyFileOps.InitForegroundProcessGroup(tg.ProcessGroup()) } - return tg, tid, ttyFile.FileOperations.(*host.TTYFileOperations), nil + + // Start the newly created process. + proc.Kernel.StartProcess(tg) + + return tg, tid, ttyFileOps, nil } // PsArgs is the set of arguments to ps. 
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index ac6ad1b87..21db0086e 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -37,8 +37,11 @@ type TTYFileOperations struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` - // FGProcessGroup is the foreground process group this TTY. Will be - // nil if not set or if this file has been released. + // session is the session attached to this TTYFileOperations. + session *kernel.Session + + // fgProcessGroup is the foreground process group that is currently + // connected to this TTY. fgProcessGroup *kernel.ProcessGroup } @@ -49,15 +52,58 @@ func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops }) } -// ForegroundProcessGroup returns the foreground process for the TTY. This will -// be nil if the foreground process has not been set or if the file has been -// released. +// InitForegroundProcessGroup sets the foreground process group and session for +// the TTY. This should only be called once, after the foreground process group +// has been created, but before it has started running. +func (t *TTYFileOperations) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { + t.mu.Lock() + defer t.mu.Unlock() + if t.fgProcessGroup != nil { + panic("foreground process group is already set") + } + t.fgProcessGroup = pg + t.session = pg.Session() +} + +// ForegroundProcessGroup returns the foreground process for the TTY. func (t *TTYFileOperations) ForegroundProcessGroup() *kernel.ProcessGroup { t.mu.Lock() defer t.mu.Unlock() return t.fgProcessGroup } +// Read implements fs.FileOperations.Read. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +// +// See drivers/tty/n_tty.c:n_tty_read()=>job_control(). 
+func (t *TTYFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileOperations.Read(ctx, file, dst, offset) +} + +// Write implements fs.FileOperations.Write. +func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the write? + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + return t.fileOperations.Write(ctx, file, src, offset) +} + // Release implements fs.FileOperations.Release. func (t *TTYFileOperations) Release() { t.mu.Lock() @@ -84,6 +130,13 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. return 0, err case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + t.mu.Lock() + defer t.mu.Unlock() + + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + var termios linux.Termios if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ AddressSpaceActive: true, @@ -99,20 +152,17 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. // Get the process group ID of the foreground process group on // this terminal. + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return 0, syserror.ENOTTY + } + t.mu.Lock() defer t.mu.Unlock() - if t.fgProcessGroup == nil { - // No process group has been set yet. Let's just lie - // and tell it the process group from the current task. - // The app is probably going to set it to something - // else very soon anyways. 
- t.fgProcessGroup = kernel.TaskFromContext(ctx).ThreadGroup().ProcessGroup() - } - // Map the ProcessGroup into a ProcessGroupID in the task's PID // namespace. - pgID := kernel.TaskFromContext(ctx).ThreadGroup().PIDNamespace().IDOfProcessGroup(t.fgProcessGroup) + pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ AddressSpaceActive: true, }) @@ -123,6 +173,30 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. // Equivalent to tcsetpgrp(fd, *argp). // Set the foreground process group ID of this terminal. + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Check that we are allowed to set the process group. + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from + // tty_check_change() to -ENOTTY. + if err == syserror.EIO { + return 0, syserror.ENOTTY + } + return 0, err + } + + // Check that calling task's process group is in the TTY + // session. + if task.ThreadGroup().Session() != t.session { + return 0, syserror.ENOTTY + } + var pgID kernel.ProcessGroupID if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ AddressSpaceActive: true, @@ -136,24 +210,18 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. } // Process group with pgID must exist in this PID namespace. - task := kernel.TaskFromContext(ctx) pidns := task.PIDNamespace() pg := pidns.ProcessGroupWithID(pgID) if pg == nil { return 0, syserror.ESRCH } - // Process group must be in same session as calling task's - // process group. - curSession := task.ThreadGroup().ProcessGroup().Session() - curSessionID := pidns.IDOfSession(curSession) - if pidns.IDOfSession(pg.Session()) != curSessionID { + // Check that new process group is in the TTY session. 
+ if pg.Session() != t.session { return 0, syserror.EPERM } - t.mu.Lock() t.fgProcessGroup = pg - t.mu.Unlock() return 0, nil case linux.TIOCGWINSZ: @@ -171,6 +239,10 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. case linux.TIOCSWINSZ: // Args: const struct winsize *argp // Set window size. + + // Unlike setting the termios, any process group (even + // background ones) can set the winsize. + var winsize linux.Winsize if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ AddressSpaceActive: true, @@ -213,3 +285,70 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch. return 0, syserror.ENOTTY } } + +// checkChange checks that the process group is allowed to read, write, or +// change the state of the TTY. +// +// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic +// is a bit convoluted, but documented inline. +// +// Preconditions: t.mu must be held. +func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + // No task? Linux does not have an analog for this case, but + // tty_check_change is more of a blacklist of cases than a + // whitelist, and is surprisingly permissive. Allowing the + // change seems most appropriate. + return nil + } + + tg := task.ThreadGroup() + pg := tg.ProcessGroup() + + // If the session for the task is different than the session for the + // controlling TTY, then the change is allowed. Seems like a bad idea, + // but that's exactly what linux does. + if tg.Session() != t.fgProcessGroup.Session() { + return nil + } + + // If we are the foreground process group, then the change is allowed. + if pg == t.fgProcessGroup { + return nil + } + + // We are not the foreground process group. + + // Is the provided signal blocked or ignored? 
+ if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) { + // If the signal is SIGTTIN, then we are attempting to read + // from the TTY. Don't send the signal and return EIO. + if sig == linux.SIGTTIN { + return syserror.EIO + } + + // Otherwise, we are writing or changing terminal state. This is allowed. + return nil + } + + // If the process group is an orphan, return EIO. + if pg.IsOrphan() { + return syserror.EIO + } + + // Otherwise, send the signal to the process group and return ERESTARTSYS. + // + // Note that Linux also unconditionally sets TIF_SIGPENDING on current, + // but this isn't necessary in gVisor because the rationale given in + // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't + // apply: the sentry will handle -ERESTARTSYS in + // kernel.runApp.execute() even if the kernel.Task isn't interrupted. + si := arch.SignalInfo{ + Code: arch.SignalInfoKernel, + Signo: int32(sig), + } + // Linux ignores the result of kill_pgrp(). + _ = pg.SendSignal(&si) + return kernel.ERESTARTSYS +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index e7e5ff777..c6afae2e6 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -615,8 +615,11 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { // CreateProcess creates a new task in a new thread group with the given // options. The new task has no parent and is in the root PID namespace. // -// If k.Start() has already been called, the created task will begin running -// immediately. Otherwise, it will be started when k.Start() is called. +// If k.Start() has already been called, then the created process must be +// started by calling kernel.StartProcess(tg). +// +// If k.Start() has not yet been called, then the created task will begin +// running when k.Start() is called. 
// // CreateProcess has no analogue in Linux; it is used to create the initial // application task, as well as processes started by the control server. @@ -688,22 +691,25 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, AbstractSocketNamespace: args.AbstractSocketNamespace, ContainerID: args.ContainerID, } - t, err := k.tasks.NewTask(config) - if err != nil { + if _, err := k.tasks.NewTask(config); err != nil { return nil, 0, err } // Success. tgid := k.tasks.Root.IDOfThreadGroup(tg) - if k.started { - tid := k.tasks.Root.IDOfTask(t) - t.Start(tid) - } else if k.globalInit == nil { + if k.globalInit == nil { k.globalInit = tg } return tg, tgid, nil } +// StartProcess starts running a process that was created with CreateProcess. +func (k *Kernel) StartProcess(tg *ThreadGroup) { + t := tg.Leader() + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) +} + // Start starts execution of all tasks in k. // // Preconditions: Start may be called exactly once. @@ -866,28 +872,6 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { return lastErr } -// SendProcessGroupSignal sends a signal to all processes inside the process -// group. It is analagous to kernel/signal.c:kill_pgrp. -func (k *Kernel) SendProcessGroupSignal(pg *ProcessGroup, info *arch.SignalInfo) error { - k.extMu.Lock() - defer k.extMu.Unlock() - k.tasks.mu.RLock() - defer k.tasks.mu.RUnlock() - - var lastErr error - for t := range k.tasks.Root.tids { - if t == t.tg.leader && t.tg.ProcessGroup() == pg { - t.tg.signalHandlers.mu.Lock() - defer t.tg.signalHandlers.mu.Unlock() - infoCopy := *info - if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { - lastErr = err - } - } - } - return lastErr -} - // FeatureSet returns the FeatureSet. 
func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 78a5b4063..6fd65f2b0 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -119,6 +120,13 @@ func (pg *ProcessGroup) Originator() *ThreadGroup { return pg.originator } +// IsOrphan returns true if this process group is an orphan. +func (pg *ProcessGroup) IsOrphan() bool { + pg.originator.TaskSet().mu.RLock() + defer pg.originator.TaskSet().mu.RUnlock() + return pg.ancestors == 0 +} + // incRefWithParent grabs a reference. // // This function is called when this ProcessGroup is being associated with some @@ -224,6 +232,27 @@ func (pg *ProcessGroup) Session() *Session { return pg.session } +// SendSignal sends a signal to all processes inside the process group. It is +// analagous to kernel/signal.c:kill_pgrp. +func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { + tasks := pg.originator.TaskSet() + tasks.mu.RLock() + defer tasks.mu.RUnlock() + + var lastErr error + for t := range tasks.Root.tids { + if t == t.tg.leader && t.tg.ProcessGroup() == pg { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + infoCopy := *info + if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + } + } + return lastErr +} + // CreateSession creates a new Session, with the ThreadGroup as the leader. 
// // EPERM may be returned if either the given ThreadGroup is already a Session diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 3f1ac9898..60cbe85b8 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -69,6 +69,14 @@ func (sh *SignalHandlers) CopyForExec() *SignalHandlers { return sh2 } +// IsIgnored returns true if the signal is ignored. +func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool { + sh.mu.Lock() + defer sh.mu.Unlock() + sa, ok := sh.actions[sig] + return ok && sa.Handler == arch.SignalActIgnore +} + // dequeueActionLocked returns the SignalAct that should be used to handle sig. // // Preconditions: sh.mu must be locked. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 973578484..41f456af7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -477,9 +477,9 @@ func (l *Loader) run() error { return err } - // Create the root container init task. - _, _, err := l.k.CreateProcess(l.rootProcArgs) - if err != nil { + // Create the root container init task. It will begin running + // when the kernel is started. + if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { return fmt.Errorf("creating init process: %v", err) } @@ -492,6 +492,11 @@ func (l *Loader) run() error { ttyFile := l.rootProcArgs.FDMap.GetFile(0) defer ttyFile.DecRef() ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) + + // Set the foreground process group on the TTY to the global + // init process group, since that is what we are about to + // start running. + ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup()) } // Start signal forwarding only after an init process is created. @@ -595,10 +600,13 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return fmt.Errorf("setting executable path for %+v: %v", procArgs, err) } + // Create and start the new process. 
tg, _, err := l.k.CreateProcess(procArgs) if err != nil { return fmt.Errorf("creating process: %v", err) } + l.k.StartProcess(tg) + // CreateProcess takes a reference on FDMap if successful. procArgs.FDMap.DecRef() -- cgit v1.2.3 From e34d27e8b6709809582eb0ad43c7232f2d5ab8ad Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 15 Feb 2019 08:22:26 -0800 Subject: Redirect FIXME to more appropriate bug PiperOrigin-RevId: 234147487 Change-Id: I779a6012832bb94a6b89f5bcc7d821b40ae969cc --- pkg/sentry/socket/epsocket/epsocket.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 16720456a..5e4a269c6 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1139,11 +1139,14 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s MulticastAddr: tcpip.Address(req.MulticastAddr[:]), })) - case linux.MCAST_JOIN_GROUP, linux.IP_MULTICAST_IF: + case linux.IP_MULTICAST_IF: // FIXME: Disallow IP-level multicast group options by // default. These will need to be supported by appropriately plumbing // the level through to the network stack (if at all). However, we // still allow setting TTL, and multicast-enable/disable type options. + fallthrough + case linux.MCAST_JOIN_GROUP: + // FIXME: Implement MCAST_JOIN_GROUP. t.Kernel().EmitUnimplementedEvent(t) return syserr.ErrInvalidArgument -- cgit v1.2.3 From a9cb3dcd9df373fb7a531476bf1da69fc9189e3a Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 15 Feb 2019 11:17:51 -0800 Subject: Move SO_TIMESTAMP from different transport endpoints to epsocket. SO_TIMESTAMP is reimplemented in ping and UDP sockets (and needs to be added for TCP), but can just be implemented in epsocket for simplicity. This will also make SIOCGSTAMP easier to implement. 
PiperOrigin-RevId: 234179300 Change-Id: Ib5ea0b1261dc218c1a8b15a65775de0050fe3230 --- pkg/sentry/socket/epsocket/epsocket.go | 94 +++++++++++++++++++++++----------- pkg/tcpip/tcpip.go | 10 ---- pkg/tcpip/transport/ping/endpoint.go | 30 +---------- pkg/tcpip/transport/udp/endpoint.go | 29 +---------- 4 files changed, 69 insertions(+), 94 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 5e4a269c6..3a9d1182f 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -150,11 +150,24 @@ type SocketOperations struct { Endpoint tcpip.Endpoint skType transport.SockType - // readMu protects access to readView, control, and sender. - readMu sync.Mutex `state:"nosave"` + // readMu protects access to the below fields. + readMu sync.Mutex `state:"nosave"` + // readView contains the remaining payload from the last packet. readView buffer.View - readCM tcpip.ControlMessages - sender tcpip.FullAddress + // readCM holds control message information for the last packet read + // from Endpoint. + readCM tcpip.ControlMessages + sender tcpip.FullAddress + // sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps + // of returned messages can be returned via control messages. When + // false, the same timestamp is instead stored and can be read via the + // SIOCGSTAMP ioctl. See socket(7). + sockOptTimestamp bool + // timestampValid indicates whether timestamp has been set. + timestampValid bool + // timestampNS holds the timestamp to use with SIOCGSTAMP. It is only + // valid when timestampValid is true. + timestampNS int64 } // New creates a new endpoint socket. @@ -515,6 +528,24 @@ func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. 
func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) { + // TODO: Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for epsocket.SocketOperations rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. + if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptTimestamp { + val = 1 + } + return val, nil + } + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) } @@ -680,18 +711,6 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return linux.NsecToTimeval(s.RecvTimeout()), nil - case linux.SO_TIMESTAMP: - if outLen < sizeOfInt32 { - return nil, syserr.ErrInvalidArgument - } - - var v tcpip.TimestampOption - if err := ep.GetSockOpt(&v); err != nil { - return nil, syserr.TranslateNetstackError(err) - } - - return int32(v), nil - case linux.SO_OOBINLINE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument @@ -854,6 +873,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + // TODO: Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for epsocket.SocketOperations rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. 
+ if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } @@ -962,14 +996,6 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i s.SetRecvTimeout(v.ToNsecCapped()) return nil - case linux.SO_TIMESTAMP: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument - } - - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TimestampOption(v))) - default: socket.SetSockOptEmitUnimplementedEvent(t, name) } @@ -1436,6 +1462,11 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq } } else { n, e = dst.CopyOut(ctx, s.readView) + // Set the control message, even if 0 bytes were read. + if e == nil && s.readCM.HasTimestamp && s.sockOptTimestamp { + s.timestampNS = s.readCM.Timestamp + s.timestampValid = true + } } copied += n s.readView.TrimFront(n) @@ -1499,6 +1530,11 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe } n, err := dst.CopyOut(ctx, s.readView) + // Set the control message, even if 0 bytes were read. + if err == nil && s.readCM.HasTimestamp && s.sockOptTimestamp { + s.timestampNS = s.readCM.Timestamp + s.timestampValid = true + } var addr interface{} var addrLen uint32 if isPacket && senderRequested { @@ -1508,11 +1544,11 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe if peek { if l := len(s.readView); trunc && l > n { // isPacket must be true. 
- return l, addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) + return l, addr, addrLen, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) } if isPacket || err != nil { - return int(n), addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) + return int(n), addr, addrLen, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) } // We need to peek beyond the first message. @@ -1530,7 +1566,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // We got some data, so no need to return an error. err = nil } - return int(n), nil, 0, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) + return int(n), nil, 0, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) } var msgLen int @@ -1543,10 +1579,10 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe } if trunc { - return msgLen, addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) + return msgLen, addr, addrLen, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) } - return int(n), addr, addrLen, socket.ControlMessages{IP: s.readCM}, syserr.FromError(err) + return int(n), addr, addrLen, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index fef5ba0e4..3cd431d4c 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -298,9 +298,6 @@ type Endpoint interface { // // This method does not block if there is no data pending. 
It will also // either return an error or data, never both. - // - // A timestamp (in ns) is optionally returned. A zero value indicates - // that no timestamp was available. Read(*FullAddress) (buffer.View, ControlMessages, *Error) // Write writes data to the endpoint's peer. This method does not block if @@ -326,9 +323,6 @@ type Endpoint interface { // Peek reads data without consuming it from the endpoint. // // This method does not block if there is no data pending. - // - // A timestamp (in ns) is optionally returned. A zero value indicates - // that no timestamp was available. Peek([][]byte) (uintptr, ControlMessages, *Error) // Connect connects the endpoint to its peer. Specifying a NIC is @@ -449,10 +443,6 @@ type QuickAckOption int // Only supported on Unix sockets. type PasscredOption int -// TimestampOption is used by SetSockOpt/GetSockOpt to specify whether -// SO_TIMESTAMP socket control messages are enabled. -type TimestampOption int - // TCPInfoOption is used by GetSockOpt to expose TCP statistics. // // TODO: Add and populate stat fields. diff --git a/pkg/tcpip/transport/ping/endpoint.go b/pkg/tcpip/transport/ping/endpoint.go index 29f6c543d..c8263a512 100644 --- a/pkg/tcpip/transport/ping/endpoint.go +++ b/pkg/tcpip/transport/ping/endpoint.go @@ -32,7 +32,6 @@ type pingPacket struct { senderAddress tcpip.FullAddress data buffer.VectorisedView `state:".(buffer.VectorisedView)"` timestamp int64 - hasTimestamp bool // views is used as buffer for data when its length is large // enough to store a VectorisedView. views [8]buffer.View `state:"nosave"` @@ -67,7 +66,6 @@ type endpoint struct { rcvBufSizeMax int `state:".(int)"` rcvBufSize int rcvClosed bool - rcvTimestamp bool // The following fields are protected by the mu mutex. 
mu sync.RWMutex `state:"nosave"` @@ -140,7 +138,6 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess p := e.rcvList.Front() e.rcvList.Remove(p) e.rcvBufSize -= p.data.Size() - ts := e.rcvTimestamp e.rcvMu.Unlock() @@ -148,12 +145,7 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess *addr = p.senderAddress } - if ts && !p.hasTimestamp { - // Linux uses the current time. - p.timestamp = e.stack.NowNanoseconds() - } - - return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: ts, Timestamp: p.timestamp}, nil + return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil } // prepareForWrite prepares the endpoint for sending data. In particular, it @@ -313,12 +305,6 @@ func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) // SetSockOpt sets a socket option. Currently not supported. func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { - switch v := opt.(type) { - case tcpip.TimestampOption: - e.rcvMu.Lock() - e.rcvTimestamp = v != 0 - e.rcvMu.Unlock() - } return nil } @@ -351,15 +337,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.rcvMu.Unlock() return nil - case *tcpip.TimestampOption: - e.rcvMu.Lock() - *o = 0 - if e.rcvTimestamp { - *o = 1 - } - e.rcvMu.Unlock() - return nil - case *tcpip.KeepaliveEnabledOption: *o = 0 return nil @@ -702,10 +679,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv e.rcvList.PushBack(pkt) e.rcvBufSize += vv.Size() - if e.rcvTimestamp { - pkt.timestamp = e.stack.NowNanoseconds() - pkt.hasTimestamp = true - } + pkt.timestamp = e.stack.NowNanoseconds() e.rcvMu.Unlock() diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index d46bf0ade..fa8f02e46 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -32,7 +32,6 @@ type udpPacket struct { senderAddress tcpip.FullAddress data 
buffer.VectorisedView `state:".(buffer.VectorisedView)"` timestamp int64 - hasTimestamp bool // views is used as buffer for data when its length is large // enough to store a VectorisedView. views [8]buffer.View `state:"nosave"` @@ -68,7 +67,6 @@ type endpoint struct { rcvBufSizeMax int `state:".(int)"` rcvBufSize int rcvClosed bool - rcvTimestamp bool // The following fields are protected by the mu mutex. mu sync.RWMutex `state:"nosave"` @@ -203,7 +201,6 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess p := e.rcvList.Front() e.rcvList.Remove(p) e.rcvBufSize -= p.data.Size() - ts := e.rcvTimestamp e.rcvMu.Unlock() @@ -211,12 +208,7 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess *addr = p.senderAddress } - if ts && !p.hasTimestamp { - // Linux uses the current time. - p.timestamp = e.stack.NowNanoseconds() - } - - return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: ts, Timestamp: p.timestamp}, nil + return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil } // prepareForWrite prepares the endpoint for sending data. 
In particular, it @@ -397,11 +389,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.v6only = v != 0 - case tcpip.TimestampOption: - e.rcvMu.Lock() - e.rcvTimestamp = v != 0 - e.rcvMu.Unlock() - case tcpip.MulticastTTLOption: e.mu.Lock() e.multicastTTL = uint8(v) @@ -508,15 +495,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.rcvMu.Unlock() return nil - case *tcpip.TimestampOption: - e.rcvMu.Lock() - *o = 0 - if e.rcvTimestamp { - *o = 1 - } - e.rcvMu.Unlock() - return nil - case *tcpip.MulticastTTLOption: e.mu.Lock() *o = tcpip.MulticastTTLOption(e.multicastTTL) @@ -909,10 +887,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv e.rcvList.PushBack(pkt) e.rcvBufSize += vv.Size() - if e.rcvTimestamp { - pkt.timestamp = e.stack.NowNanoseconds() - pkt.hasTimestamp = true - } + pkt.timestamp = e.stack.NowNanoseconds() e.rcvMu.Unlock() -- cgit v1.2.3 From c611dbc5a7399922588e3fd99b22bda19f684afe Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 15 Feb 2019 18:39:10 -0800 Subject: Implement IP_MULTICAST_IF. This allows setting a default send interface for IPv4 multicast. IPv6 support will come later. 
PiperOrigin-RevId: 234251379 Change-Id: I65922341cd8b8880f690fae3eeb7ddfa47c8c173 --- pkg/sentry/socket/epsocket/epsocket.go | 66 +++++-- pkg/tcpip/stack/stack.go | 11 ++ pkg/tcpip/tcpip.go | 7 + pkg/tcpip/transport/udp/endpoint.go | 115 ++++++++--- test/syscalls/linux/socket_ipv4_udp_unbound.cc | 261 +++++++++++++++++++++++++ 5 files changed, 418 insertions(+), 42 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 3a9d1182f..3392ac645 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -27,7 +27,6 @@ package epsocket import ( "bytes" "math" - "strings" "sync" "syscall" "time" @@ -191,6 +190,15 @@ func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Qu var sockAddrInetSize = int(binary.Size(linux.SockAddrInet{})) var sockAddrInet6Size = int(binary.Size(linux.SockAddrInet6{})) +// bytesToIPAddress converts an IPv4 or IPv6 address from the user to the +// netstack representation taking any addresses into account. +func bytesToIPAddress(addr []byte) tcpip.Address { + if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) { + return "" + } + return tcpip.Address(addr) +} + // GetAddress reads an sockaddr struct from the given address and converts it // to the FullAddress format. It supports AF_UNIX, AF_INET and AF_INET6 // addresses. 
@@ -231,12 +239,9 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a) out := tcpip.FullAddress{ - Addr: tcpip.Address(a.Addr[:]), + Addr: bytesToIPAddress(a.Addr[:]), Port: ntohs(a.Port), } - if out.Addr == "\x00\x00\x00\x00" { - out.Addr = "" - } return out, nil case linux.AF_INET6: @@ -247,15 +252,12 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a) out := tcpip.FullAddress{ - Addr: tcpip.Address(a.Addr[:]), + Addr: bytesToIPAddress(a.Addr[:]), Port: ntohs(a.Port), } if isLinkLocal(out.Addr) { out.NIC = tcpip.NICID(a.Scope_id) } - if out.Addr == tcpip.Address(strings.Repeat("\x00", 16)) { - out.Addr = "" - } return out, nil default: @@ -864,6 +866,30 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac return int32(v), nil + case linux.IP_MULTICAST_IF: + if outLen < inetMulticastRequestSize { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.MulticastInterfaceOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) + + rv := linux.InetMulticastRequestWithNIC{ + linux.InetMulticastRequest{ + InterfaceAddr: a.(linux.SockAddrInet).Addr, + }, + int32(v.NIC), + } + + if outLen >= inetMulticastRequestWithNICSize { + return rv, nil + } + return rv.InetMulticastRequest, nil + default: emitUnimplementedEventIP(t, name) } @@ -1148,7 +1174,9 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{ - NIC: tcpip.NICID(req.InterfaceIndex), + NIC: tcpip.NICID(req.InterfaceIndex), + // TODO: Change AddMembership to use the standard + // any address representation. 
InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), })) @@ -1160,19 +1188,29 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{ - NIC: tcpip.NICID(req.InterfaceIndex), + NIC: tcpip.NICID(req.InterfaceIndex), + // TODO: Change DropMembership to use the standard + // any address representation. InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), })) case linux.IP_MULTICAST_IF: + req, err := copyInMulticastRequest(optVal) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastInterfaceOption{ + NIC: tcpip.NICID(req.InterfaceIndex), + InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]), + })) + + case linux.MCAST_JOIN_GROUP: // FIXME: Disallow IP-level multicast group options by // default. These will need to be supported by appropriately plumbing // the level through to the network stack (if at all). However, we // still allow setting TTL, and multicast-enable/disable type options. - fallthrough - case linux.MCAST_JOIN_GROUP: - // FIXME: Implement MCAST_JOIN_GROUP. t.Kernel().EmitUnimplementedEvent(t) return syserr.ErrInvalidArgument diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 854ebe1bb..252c79317 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -565,6 +565,17 @@ func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error { return nil } +// CheckNIC checks if a NIC is usable. +func (s *Stack) CheckNIC(id tcpip.NICID) bool { + s.mu.RLock() + nic, ok := s.nics[id] + s.mu.RUnlock() + if ok { + return nic.linkEP.IsAttached() + } + return false +} + // NICSubnets returns a map of NICIDs to their associated subnets. 
func (s *Stack) NICSubnets() map[tcpip.NICID][]tcpip.Subnet { s.mu.RLock() diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 3cd431d4c..a6e47397a 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -473,6 +473,13 @@ type KeepaliveCountOption int // TTL value for multicast messages. The default is 1. type MulticastTTLOption uint8 +// MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a +// default interface for multicast. +type MulticastInterfaceOption struct { + NIC NICID + InterfaceAddr Address +} + // MembershipOption is used by SetSockOpt/GetSockOpt as an argument to // AddMembershipOption and RemoveMembershipOption. type MembershipOption struct { diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index fa8f02e46..9c3881d63 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -69,17 +69,19 @@ type endpoint struct { rcvClosed bool // The following fields are protected by the mu mutex. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - id stack.TransportEndpointID - state endpointState - bindNICID tcpip.NICID - regNICID tcpip.NICID - route stack.Route `state:"manual"` - dstPort uint16 - v6only bool - multicastTTL uint8 - reusePort bool + mu sync.RWMutex `state:"nosave"` + sndBufSize int + id stack.TransportEndpointID + state endpointState + bindNICID tcpip.NICID + regNICID tcpip.NICID + route stack.Route `state:"manual"` + dstPort uint16 + v6only bool + multicastTTL uint8 + multicastAddr tcpip.Address + multicastNICID tcpip.NICID + reusePort bool // shutdownFlags represent the current shutdown state of the endpoint. 
shutdownFlags tcpip.ShutdownFlags @@ -251,6 +253,33 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi return true, nil } +// connectRoute establishes a route to the specified interface or the +// configured multicast interface if no interface is specified and the +// specified address is a multicast address. +func (e *endpoint) connectRoute(nicid tcpip.NICID, addr tcpip.FullAddress) (stack.Route, tcpip.NICID, tcpip.NetworkProtocolNumber, *tcpip.Error) { + netProto, err := e.checkV4Mapped(&addr, false) + if err != nil { + return stack.Route{}, 0, 0, err + } + + localAddr := e.id.LocalAddress + if header.IsV4MulticastAddress(addr.Addr) || header.IsV6MulticastAddress(addr.Addr) { + if nicid == 0 { + nicid = e.multicastNICID + } + if localAddr == "" { + localAddr = e.multicastAddr + } + } + + // Find a route to the desired destination. + r, err := e.stack.FindRoute(nicid, localAddr, addr.Addr, netProto) + if err != nil { + return stack.Route{}, 0, 0, err + } + return r, nicid, netProto, nil +} + // Write writes data to the endpoint's peer. This method does not block // if the data cannot be written. func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { @@ -318,15 +347,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c nicid = e.bindNICID } - toCopy := *to - to = &toCopy - netProto, err := e.checkV4Mapped(to, false) - if err != nil { - return 0, nil, err - } - - // Find the enpoint. 
- r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, to.Addr, netProto) + r, _, _, err := e.connectRoute(nicid, *to) if err != nil { return 0, nil, err } @@ -394,6 +415,42 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.multicastTTL = uint8(v) e.mu.Unlock() + case tcpip.MulticastInterfaceOption: + e.mu.Lock() + defer e.mu.Unlock() + + fa := tcpip.FullAddress{Addr: v.InterfaceAddr} + netProto, err := e.checkV4Mapped(&fa, false) + if err != nil { + return err + } + nic := v.NIC + addr := fa.Addr + + if nic == 0 && addr == "" { + e.multicastAddr = "" + e.multicastNICID = 0 + break + } + + if nic != 0 { + if !e.stack.CheckNIC(nic) { + return tcpip.ErrBadLocalAddress + } + } else { + nic = e.stack.CheckLocalAddress(0, netProto, addr) + if nic == 0 { + return tcpip.ErrBadLocalAddress + } + } + + if e.bindNICID != 0 && e.bindNICID != nic { + return tcpip.ErrInvalidEndpointState + } + + e.multicastNICID = nic + e.multicastAddr = addr + case tcpip.AddMembershipOption: nicID := v.NIC if v.InterfaceAddr != header.IPv4Any { @@ -445,7 +502,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.mu.Lock() e.reusePort = v != 0 e.mu.Unlock() - return nil } return nil } @@ -501,6 +557,15 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.mu.Unlock() return nil + case *tcpip.MulticastInterfaceOption: + e.mu.Lock() + *o = tcpip.MulticastInterfaceOption{ + e.multicastNICID, + e.multicastAddr, + } + e.mu.Unlock() + return nil + case *tcpip.ReusePortOption: e.mu.RLock() v := e.reusePort @@ -610,13 +675,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { return tcpip.ErrInvalidEndpointState } - netProto, err := e.checkV4Mapped(&addr, false) - if err != nil { - return err - } - - // Find a route to the desired destination. 
- r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, addr.Addr, netProto) + r, nicid, netProto, err := e.connectRoute(nicid, addr) if err != nil { return err } diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index 4058324a2..2d702179e 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -278,6 +278,238 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } +// Check that multicast works when the default send interface is confgured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreq iface = {}; + iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + EXPECT_THAT(bind(sockets->second_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + EXPECT_THAT(getsockname(sockets->second_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. 
+ auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is confgured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + EXPECT_THAT(bind(sockets->second_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + EXPECT_THAT(getsockname(sockets->second_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. 
+ ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is confgured by +// IP_MULTICAST_IF, the send address is specified in connect, and the group +// membership is configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreq iface = {}; + iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. 
+ auto receiver_addr = V4Any(); + EXPECT_THAT(bind(sockets->second_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + EXPECT_THAT(getsockname(sockets->second_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto connect_addr = V4Multicast(); + reinterpret_cast(&connect_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + EXPECT_THAT( + RetryEINTR(connect)(sockets->first_fd(), + reinterpret_cast(&connect_addr.addr), + connect_addr.addr_len), + SyscallSucceeds()); + + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT( + RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is confgured by +// IP_MULTICAST_IF, the send address is specified in connect, and the group +// membership is configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. 
+ ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + EXPECT_THAT(bind(sockets->second_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + EXPECT_THAT(getsockname(sockets->second_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto connect_addr = V4Multicast(); + reinterpret_cast(&connect_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + EXPECT_THAT( + RetryEINTR(connect)(sockets->first_fd(), + reinterpret_cast(&connect_addr.addr), + connect_addr.addr_len), + SyscallSucceeds()); + + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + EXPECT_THAT( + RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + // Check that dropping a group membership that does not exist fails. 
TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastInvalidDrop) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -407,5 +639,34 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) { SyscallFailsWithErrno(EAGAIN)); } +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfZero) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn iface = {}; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidNic) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn iface = {}; + iface.imr_ifindex = -1; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallFailsWithErrno(EADDRNOTAVAIL)); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreq iface = {}; + iface.imr_interface.s_addr = inet_addr("255.255.255"); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallFailsWithErrno(EADDRNOTAVAIL)); +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 22d8b6eba1487d3f0d87a578e414e451d9aeb26d Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 19 Feb 2019 11:20:48 -0800 Subject: Break /proc/[pid]/{uid,gid}_map's dependence on seqfile. In addition to simplifying the implementation, this fixes two bugs: - seqfile.NewSeqFile unconditionally creates an inode with mode 0444, but {uid,gid}_map have mode 0644. - idMapSeqFile.Write implements fs.FileOperations.Write ... but it doesn't implement any other fs.FileOperations methods and is never used as fs.FileOperations. idMapSeqFile.GetFile() => seqfile.SeqFile.GetFile() uses seqfile.seqFileOperations instead, which rejects all writes. 
PiperOrigin-RevId: 234638212 Change-Id: I4568f741ab07929273a009d7e468c8205a8541bc --- pkg/sentry/fs/proc/uid_gid_map.go | 148 ++++++++++---------- test/syscalls/BUILD | 2 + test/syscalls/linux/BUILD | 19 +++ test/syscalls/linux/proc_pid_uid_gid_map.cc | 204 ++++++++++++++++++++++++++++ 4 files changed, 300 insertions(+), 73 deletions(-) create mode 100644 test/syscalls/linux/proc_pid_uid_gid_map.cc (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 815c40b7f..d6e278f79 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -17,67 +17,42 @@ package proc import ( "bytes" "fmt" + "io" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// An idMapSeqSource is a seqfile.SeqSource that returns UID or GID mappings -// from a task's user namespace. +// idMapInodeOperations implements fs.InodeOperations for +// /proc/[pid]/{uid,gid}_map. 
// // +stateify savable -type idMapSeqSource struct { +type idMapInodeOperations struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + fsutil.InodeSimpleExtendedAttributes + t *kernel.Task gids bool } -// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. -func (imss *idMapSeqSource) NeedsUpdate(generation int64) bool { - return true -} - -// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. -func (imss *idMapSeqSource) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { - var start int - if handle != nil { - start = handle.(*idMapSeqHandle).value - } - var entries []auth.IDMapEntry - if imss.gids { - entries = imss.t.UserNamespace().GIDMap() - } else { - entries = imss.t.UserNamespace().UIDMap() - } - var data []seqfile.SeqData - i := 1 - for _, e := range entries { - if i > start { - data = append(data, seqfile.SeqData{ - Buf: idMapLineFromEntry(e), - Handle: &idMapSeqHandle{i}, - }) - } - i++ - } - return data, 0 -} - -// TODO: Fix issue requiring idMapSeqHandle wrapping an int. -// -// +stateify savable -type idMapSeqHandle struct { - value int -} - -// +stateify savable -type idMapSeqFile struct { - seqfile.SeqFile -} +var _ fs.InodeOperations = (*idMapInodeOperations)(nil) // newUIDMap returns a new uid_map file. 
func newUIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { @@ -90,25 +65,64 @@ func newGIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { } func newIDMap(t *kernel.Task, msrc *fs.MountSource, gids bool) *fs.Inode { - imsf := &idMapSeqFile{ - *seqfile.NewSeqFile(t, &idMapSeqSource{ - t: t, - gids: gids, - }), - } - return newProcInode(imsf, msrc, fs.SpecialFile, t) + return newProcInode(&idMapInodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + t: t, + gids: gids, + }, msrc, fs.SpecialFile, t) } -func (imsf *idMapSeqFile) source() *idMapSeqSource { - return imsf.SeqFile.SeqSource.(*idMapSeqSource) +// GetFile implements fs.InodeOperations.GetFile. +func (imio *idMapInodeOperations) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &idMapFileOperations{ + iops: imio, + }), nil } +// +stateify savable +type idMapFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + + iops *idMapInodeOperations +} + +var _ fs.FileOperations = (*idMapFileOperations)(nil) + // "There is an (arbitrary) limit on the number of lines in the file. As at // Linux 3.18, the limit is five lines." - user_namespaces(7) const maxIDMapLines = 5 +// Read implements fs.FileOperations.Read. 
+func (imfo *idMapFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + var entries []auth.IDMapEntry + if imfo.iops.gids { + entries = imfo.iops.t.UserNamespace().GIDMap() + } else { + entries = imfo.iops.t.UserNamespace().UIDMap() + } + var buf bytes.Buffer + for _, e := range entries { + fmt.Fprintf(&buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) + } + if offset >= int64(buf.Len()) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, buf.Bytes()[offset:]) + return int64(n), err +} + // Write implements fs.FileOperations.Write. -func (imsf *idMapSeqFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { +func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { // "In addition, the number of bytes written to the file must be less than // the system page size, and the write must be performed at the start of // the file ..." 
- user_namespaces(7) @@ -126,33 +140,21 @@ func (imsf *idMapSeqFile) Write(ctx context.Context, _ *fs.File, src usermem.IOS } entries := make([]auth.IDMapEntry, len(lines)) for i, l := range lines { - e, err := idMapEntryFromLine(string(l)) + var e auth.IDMapEntry + _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) if err != nil { return 0, syserror.EINVAL } entries[i] = e } - t := imsf.source().t var err error - if imsf.source().gids { - err = t.UserNamespace().SetGIDMap(ctx, entries) + if imfo.iops.gids { + err = imfo.iops.t.UserNamespace().SetGIDMap(ctx, entries) } else { - err = t.UserNamespace().SetUIDMap(ctx, entries) + err = imfo.iops.t.UserNamespace().SetUIDMap(ctx, entries) } if err != nil { return 0, err } return int64(len(b)), nil } - -func idMapLineFromEntry(e auth.IDMapEntry) []byte { - var b bytes.Buffer - fmt.Fprintf(&b, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) - return b.Bytes() -} - -func idMapEntryFromLine(line string) (auth.IDMapEntry, error) { - var e auth.IDMapEntry - _, err := fmt.Sscan(line, &e.FirstID, &e.FirstParentID, &e.Length) - return e, err -} diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index ca69f3309..1be7a9bd4 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -214,6 +214,8 @@ syscall_test( test = "//test/syscalls/linux:proc_test", ) +syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test") + syscall_test( size = "medium", test = "//test/syscalls/linux:pselect_test", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 75fa52a57..3c61c48ef 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1444,6 +1444,25 @@ cc_binary( ], ) +cc_binary( + name = "proc_pid_uid_gid_map_test", + testonly = 1, + srcs = ["proc_pid_uid_gid_map.cc"], + linkstatic = 1, + deps = [ + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:logging", + "//test/util:multiprocess_util", + "//test/util:posix_error", + 
"//test/util:save_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "pselect_test", testonly = 1, diff --git a/test/syscalls/linux/proc_pid_uid_gid_map.cc b/test/syscalls/linux/proc_pid_uid_gid_map.cc new file mode 100644 index 000000000..bf0f8b2bb --- /dev/null +++ b/test/syscalls/linux/proc_pid_uid_gid_map.cc @@ -0,0 +1,204 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/logging.h" +#include "test/util/multiprocess_util.h" +#include "test/util/posix_error.h" +#include "test/util/save_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +PosixErrorOr InNewUserNamespace(const std::function& fn) { + return InForkedProcess([&] { + TEST_PCHECK(unshare(CLONE_NEWUSER) == 0); + MaybeSave(); + fn(); + }); +} + +// TEST_CHECK-fails on error, since this function is used in contexts that +// require async-signal-safety. 
+void DenySelfSetgroups() { + int fd = open("/proc/self/setgroups", O_WRONLY); + if (fd < 0 && errno == ENOENT) { + // On kernels where this file doesn't exist, writing "deny" to it isn't + // necessary to write to gid_map. + return; + } + TEST_PCHECK(fd >= 0); + MaybeSave(); + char deny[] = "deny"; + TEST_PCHECK(write(fd, deny, sizeof(deny)) == sizeof(deny)); + MaybeSave(); + TEST_PCHECK(close(fd) == 0); +} + +// Returns a valid UID/GID that isn't id. +uint32_t another_id(uint32_t id) { return (id + 1) % 65535; } + +struct TestParam { + std::string desc; + std::string map_filename; + int cap; + std::function get_current_id; +}; + +std::string DescribeTestParam(const ::testing::TestParamInfo& info) { + return info.param.desc; +} + +class ProcSelfUidGidMapTest : public ::testing::TestWithParam { + protected: + PosixErrorOr InNewUserNamespaceWithMapFD( + const std::function& fn) { + std::string map_filename = GetParam().map_filename; + return InNewUserNamespace([&] { + int fd = open(map_filename.c_str(), O_RDWR); + TEST_PCHECK(fd >= 0); + MaybeSave(); + fn(fd); + TEST_PCHECK(close(fd) == 0); + }); + } + + uint32_t CurrentID() { return GetParam().get_current_id(); } + + PosixErrorOr HaveSetIDCapability() { + return HaveCapability(GetParam().cap); + } + + // Returns true if the caller is running in a user namespace with all IDs + // mapped. This matters for tests that expect to successfully map arbitrary + // IDs into a child user namespace, since even with CAP_SET*ID this is only + // possible if those IDs are mapped into the current one. 
+ PosixErrorOr AllIDsMapped() { + ASSIGN_OR_RETURN_ERRNO(std::string id_map, GetContents(GetParam().map_filename)); + std::vector id_map_parts = + absl::StrSplit(id_map, ' ', absl::SkipEmpty()); + return id_map_parts == std::vector({"0", "0", "4294967295"}); + } +}; + +TEST_P(ProcSelfUidGidMapTest, IsInitiallyEmpty) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + EXPECT_THAT(InNewUserNamespaceWithMapFD([](int fd) { + char buf[64]; + TEST_PCHECK(read(fd, buf, sizeof(buf)) == 0); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST_P(ProcSelfUidGidMapTest, IdentityMapOwnID) { + // This is the only write permitted if the writer does not have CAP_SET*ID. + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + uint32_t id = CurrentID(); + std::string line = absl::StrCat(id, " ", id, " 1"); + EXPECT_THAT( + InNewUserNamespaceWithMapFD([&](int fd) { + DenySelfSetgroups(); + TEST_PCHECK(write(fd, line.c_str(), line.size()) == line.size()); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST_P(ProcSelfUidGidMapTest, NonIdentityMapOwnID) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveSetIDCapability())); + uint32_t id = CurrentID(); + uint32_t id2 = another_id(id); + std::string line = absl::StrCat(id2, " ", id, " 1"); + EXPECT_THAT( + InNewUserNamespaceWithMapFD([&](int fd) { + DenySelfSetgroups(); + TEST_PCHECK(write(fd, line.c_str(), line.size()) == line.size()); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST_P(ProcSelfUidGidMapTest, MapOtherIDUnprivileged) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveSetIDCapability())); + uint32_t id = CurrentID(); + uint32_t id2 = another_id(id); + std::string line = absl::StrCat(id, " ", id2, " 1"); + EXPECT_THAT(InNewUserNamespaceWithMapFD([&](int fd) { + DenySelfSetgroups(); + TEST_PCHECK(write(fd, line.c_str(), line.size()) < 0); + TEST_CHECK(errno == EPERM); + }), + 
IsPosixErrorOkAndHolds(0)); +} + +TEST_P(ProcSelfUidGidMapTest, MapOtherIDPrivileged) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveSetIDCapability())); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(AllIDsMapped())); + uint32_t id = CurrentID(); + uint32_t id2 = another_id(id); + std::string line = absl::StrCat(id, " ", id2, " 1"); + EXPECT_THAT( + InNewUserNamespaceWithMapFD([&](int fd) { + DenySelfSetgroups(); + TEST_PCHECK(write(fd, line.c_str(), line.size()) == line.size()); + }), + IsPosixErrorOkAndHolds(0)); +} + +TEST_P(ProcSelfUidGidMapTest, MapAnyIDsPrivileged) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveSetIDCapability())); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(AllIDsMapped())); + // Test all of: + // + // - Mapping ranges of length > 1 + // + // - Mapping multiple ranges + // + // - Non-identity mappings + char entries[] = "2 0 2\n4 6 2"; + EXPECT_THAT( + InNewUserNamespaceWithMapFD([&](int fd) { + DenySelfSetgroups(); + TEST_PCHECK(write(fd, entries, sizeof(entries)) == sizeof(entries)); + }), + IsPosixErrorOkAndHolds(0)); +} + +INSTANTIATE_TEST_CASE_P( + All, ProcSelfUidGidMapTest, + ::testing::Values(TestParam{"UID", "/proc/self/uid_map", CAP_SETUID, + []() -> uint32_t { return getuid(); }}, + TestParam{"GID", "/proc/self/gid_map", CAP_SETGID, + []() -> uint32_t { return getgid(); }}), + DescribeTestParam); + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From bb47d8a545f82849f637c480459109e16be336cf Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 19 Feb 2019 14:19:07 -0800 Subject: Fix clone(CLONE_NEWUSER). - Use new user namespace for namespace creation checks. - Ensure userns is never nil since it's used by other namespaces. 
PiperOrigin-RevId: 234673175 Change-Id: I4b9d9d1e63ce4e24362089793961a996f7540cd9 --- pkg/sentry/kernel/task_clone.go | 7 +++---- test/syscalls/linux/BUILD | 2 ++ test/syscalls/linux/fork.cc | 31 +++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index b66fa34a9..114e7f858 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -17,7 +17,6 @@ package kernel import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/bpf" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -166,7 +165,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // privileges over the remaining namespaces created by the call." - // user_namespaces(7) creds := t.Credentials() - var userns *auth.UserNamespace + userns := creds.UserNamespace if opts.NewUserNamespace { var err error // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and @@ -182,7 +181,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { return 0, nil, err } } - if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapability(linux.CAP_SYS_ADMIN) { + if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { return 0, nil, syserror.EPERM } @@ -287,7 +286,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { nt.SetSignalStack(t.SignalStack()) } - if userns != nil { + if userns != creds.UserNamespace { if err := nt.SetUserNamespace(userns); err != nil { // This shouldn't be possible: userns was created from nt.creds, so // nt should have CAP_SYS_ADMIN in userns. 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 3c61c48ef..e7f5ea998 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -732,7 +732,9 @@ cc_binary( srcs = ["fork.cc"], linkstatic = 1, deps = [ + "//test/util:capability_util", "//test/util:logging", + "//test/util:memory_util", "//test/util:test_main", "//test/util:test_util", "//test/util:thread_util", diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc index 1bff5e50f..73ac885b5 100644 --- a/test/syscalls/linux/fork.cc +++ b/test/syscalls/linux/fork.cc @@ -21,11 +21,14 @@ #include #include #include +#include #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" +#include "test/util/capability_util.h" #include "test/util/logging.h" +#include "test/util/memory_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -393,6 +396,34 @@ TEST_F(ForkTest, Affinity) { EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0)); } +TEST(CloneTest, NewUserNamespacePermitsAllOtherNamespaces) { + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a + // single clone(2) or unshare(2) call, the user namespace is guaranteed to be + // created first, giving the child (clone(2)) or caller (unshare(2)) + // privileges over the remaining namespaces created by the call. Thus, it is + // possible for an unprivileged caller to specify this combination of flags." + // - user_namespaces(7) + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + Mapping child_stack = ASSERT_NO_ERRNO_AND_VALUE( + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + int child_pid; + // We only test with CLONE_NEWIPC, CLONE_NEWNET, and CLONE_NEWUTS since these + // namespaces were implemented in Linux before user namespaces. 
+ ASSERT_THAT( + child_pid = clone( + +[](void*) { return 0; }, + reinterpret_cast(child_stack.addr() + kPageSize), + CLONE_NEWUSER | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUTS | SIGCHLD, + /* arg = */ nullptr), + SyscallSucceeds()); + + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status = " << status; +} + #ifdef __x86_64__ // Clone with CLONE_SETTLS and a non-canonical TLS address is rejected. TEST(CloneTest, NonCanonicalTLS) { -- cgit v1.2.3 From bed6f8534b1bedaad031682fe052b5a46d9cb3ee Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 19 Feb 2019 15:48:39 -0800 Subject: Set rax to syscall number on SECCOMP_RET_TRAP. PiperOrigin-RevId: 234690475 Change-Id: I1cbfb5aecd4697a4a26ec8524354aa8656cc3ba1 --- pkg/sentry/kernel/seccomp.go | 3 +++ test/syscalls/linux/seccomp.cc | 53 ++++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 20 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index cec179246..4bed4d373 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -75,6 +75,9 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u // portion of the return value will be passed as si_errno." - // Documentation/prctl/seccomp_filter.txt t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) + // "The return value register will contain an arch-dependent value." In + // practice, it's ~always the syscall number. 
+ t.Arch().SetReturn(uintptr(sysno)) case linux.SECCOMP_RET_ERRNO: // "Results in the lower 16-bits of the return value being passed to diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc index ac416b75f..27740d7ef 100644 --- a/test/syscalls/linux/seccomp.cc +++ b/test/syscalls/linux/seccomp.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -161,16 +162,21 @@ TEST(SeccompTest, RetTrapCausesSIGSYS) { pid_t const pid = fork(); if (pid == 0) { constexpr uint16_t kTrapValue = 0xdead; - RegisterSignalHandler(SIGSYS, +[](int signo, siginfo_t* info, void*) { - // This is a signal handler, so we must stay async-signal-safe. - TEST_CHECK(info->si_signo == SIGSYS); - TEST_CHECK(info->si_code == SYS_SECCOMP); - TEST_CHECK(info->si_errno == kTrapValue); - TEST_CHECK(info->si_call_addr != nullptr); - TEST_CHECK(info->si_syscall == kFilteredSyscall); - TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64); - _exit(0); - }); + RegisterSignalHandler( + SIGSYS, +[](int signo, siginfo_t* info, void* ucv) { + ucontext_t* uc = static_cast(ucv); + // This is a signal handler, so we must stay async-signal-safe. 
+ TEST_CHECK(info->si_signo == SIGSYS); + TEST_CHECK(info->si_code == SYS_SECCOMP); + TEST_CHECK(info->si_errno == kTrapValue); + TEST_CHECK(info->si_call_addr != nullptr); + TEST_CHECK(info->si_syscall == kFilteredSyscall); +#ifdef __x86_64__ + TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64); + TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == kFilteredSyscall); +#endif // defined(__x86_64__) + _exit(0); + }); ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRAP | kTrapValue); syscall(kFilteredSyscall); TEST_CHECK_MSG(false, "Survived invocation of test syscall"); @@ -182,6 +188,8 @@ TEST(SeccompTest, RetTrapCausesSIGSYS) { << "status " << status; } +#ifdef __x86_64__ + constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400; time_t vsyscall_time(time_t* t) { @@ -194,16 +202,19 @@ TEST(SeccompTest, SeccompAppliesToVsyscall) { pid_t const pid = fork(); if (pid == 0) { constexpr uint16_t kTrapValue = 0xdead; - RegisterSignalHandler(SIGSYS, +[](int signo, siginfo_t* info, void*) { - // This is a signal handler, so we must stay async-signal-safe. - TEST_CHECK(info->si_signo == SIGSYS); - TEST_CHECK(info->si_code == SYS_SECCOMP); - TEST_CHECK(info->si_errno == kTrapValue); - TEST_CHECK(info->si_call_addr != nullptr); - TEST_CHECK(info->si_syscall == SYS_time); - TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64); - _exit(0); - }); + RegisterSignalHandler( + SIGSYS, +[](int signo, siginfo_t* info, void* ucv) { + ucontext_t* uc = static_cast(ucv); + // This is a signal handler, so we must stay async-signal-safe. + TEST_CHECK(info->si_signo == SIGSYS); + TEST_CHECK(info->si_code == SYS_SECCOMP); + TEST_CHECK(info->si_errno == kTrapValue); + TEST_CHECK(info->si_call_addr != nullptr); + TEST_CHECK(info->si_syscall == SYS_time); + TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64); + TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == SYS_time); + _exit(0); + }); ApplySeccompFilter(SYS_time, SECCOMP_RET_TRAP | kTrapValue); vsyscall_time(nullptr); // Should result in death. 
TEST_CHECK_MSG(false, "Survived invocation of test syscall"); @@ -234,6 +245,8 @@ TEST(SeccompTest, RetKillVsyscallCausesDeathBySIGSYS) { << "status " << status; } +#endif // defined(__x86_64__) + TEST(SeccompTest, RetTraceWithoutPtracerReturnsENOSYS) { pid_t const pid = fork(); if (pid == 0) { -- cgit v1.2.3 From ec2460b1890aa1dbf8bd84f11dbdb3758e2443b2 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 19 Feb 2019 16:40:31 -0800 Subject: netstack: Add SIOCGSTAMP support. Ping sometimes uses this instead of SO_TIMESTAMP. PiperOrigin-RevId: 234699590 Change-Id: Ibec9c34fa0d443a931557a2b1b1ecd83effe7765 --- pkg/sentry/socket/epsocket/epsocket.go | 67 +++++++++++++++++++------ test/syscalls/linux/BUILD | 1 + test/syscalls/linux/udp_socket.cc | 89 +++++++++++++++++++++++++++++++++- 3 files changed, 140 insertions(+), 17 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 3392ac645..a97db5348 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -157,15 +157,17 @@ type SocketOperations struct { // from Endpoint. readCM tcpip.ControlMessages sender tcpip.FullAddress + // sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps // of returned messages can be returned via control messages. When // false, the same timestamp is instead stored and can be read via the - // SIOCGSTAMP ioctl. See socket(7). + // SIOCGSTAMP ioctl. It is protected by readMu. See socket(7). sockOptTimestamp bool - // timestampValid indicates whether timestamp has been set. + // timestampValid indicates whether timestamp for SIOCGSTAMP has been + // set. It is protected by readMu. timestampValid bool - // timestampNS holds the timestamp to use with SIOCGSTAMP. It is only - // valid when timestampValid is true. + // timestampNS holds the timestamp to use with SIOCTSTAMP. It is only + // valid when timestampValid is true. It is protected by readMu. 
timestampNS int64 } @@ -266,7 +268,7 @@ func GetAddress(sfamily int, addr []byte) (tcpip.FullAddress, *syserr.Error) { } func (s *SocketOperations) isPacketBased() bool { - return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM + return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW } // fetchReadView updates the readView field of the socket if it's currently @@ -1480,6 +1482,8 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *sy // coalescingRead is the fast path for non-blocking, non-peek, stream-based // case. It coalesces as many packets as possible before returning to the // caller. +// +// Precondition: s.readMu must be locked. func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) { var err *syserr.Error var copied int @@ -1501,9 +1505,8 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq } else { n, e = dst.CopyOut(ctx, s.readView) // Set the control message, even if 0 bytes were read. - if e == nil && s.readCM.HasTimestamp && s.sockOptTimestamp { - s.timestampNS = s.readCM.Timestamp - s.timestampValid = true + if e == nil { + s.updateTimestamp() } } copied += n @@ -1569,9 +1572,8 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe n, err := dst.CopyOut(ctx, s.readView) // Set the control message, even if 0 bytes were read. - if err == nil && s.readCM.HasTimestamp && s.sockOptTimestamp { - s.timestampNS = s.readCM.Timestamp - s.timestampValid = true + if err == nil { + s.updateTimestamp() } var addr interface{} var addrLen uint32 @@ -1582,11 +1584,11 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe if peek { if l := len(s.readView); trunc && l > n { // isPacket must be true. 
- return l, addr, addrLen, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) + return l, addr, addrLen, s.controlMessages(), syserr.FromError(err) } if isPacket || err != nil { - return int(n), addr, addrLen, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) + return int(n), addr, addrLen, s.controlMessages(), syserr.FromError(err) } // We need to peek beyond the first message. @@ -1604,7 +1606,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // We got some data, so no need to return an error. err = nil } - return int(n), nil, 0, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) + return int(n), nil, 0, s.controlMessages(), syserr.FromError(err) } var msgLen int @@ -1617,10 +1619,26 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe } if trunc { - return msgLen, addr, addrLen, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) + return msgLen, addr, addrLen, s.controlMessages(), syserr.FromError(err) } - return int(n), addr, addrLen, socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.timestampValid, Timestamp: s.timestampNS}}, syserr.FromError(err) + return int(n), addr, addrLen, s.controlMessages(), syserr.FromError(err) +} + +func (s *SocketOperations) controlMessages() socket.ControlMessages { + return socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: s.readCM.Timestamp}} +} + +// updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after +// successfully writing packet data out to userspace. +// +// Precondition: s.readMu must be locked. 
+func (s *SocketOperations) updateTimestamp() { + // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. + if !s.sockOptTimestamp { + s.timestampValid = true + s.timestampNS = s.readCM.Timestamp + } } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by @@ -1771,6 +1789,23 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] // Ioctl implements fs.FileOperations.Ioctl. func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // SIOCGSTAMP is implemented by epsocket rather than all commonEndpoint + // sockets. + // TODO: Add a commonEndpoint method to support SIOCGSTAMP. + if int(args[1].Int()) == syscall.SIOCGSTAMP { + s.readMu.Lock() + defer s.readMu.Unlock() + if !s.timestampValid { + return 0, syserror.ENOENT + } + + tv := linux.NsecToTimeval(s.timestampNS) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tv, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + } + return Ioctl(ctx, s.Endpoint, io, args) } diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index e7f5ea998..9da5204c1 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -2907,6 +2907,7 @@ cc_binary( linkstatic = 1, deps = [ ":socket_test_util", + ":unix_domain_socket_test_util", "//test/util:test_main", "//test/util:test_util", "//test/util:thread_util", diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index a02b418a3..38dfd0ad0 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -24,6 +24,7 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -892,12 +893,21 @@ TEST_P(UdpSocketTest, ErrorQueue) { SyscallFailsWithErrno(EAGAIN)); } +TEST_P(UdpSocketTest, 
SoTimestampOffByDefault) { + int v = -1; + socklen_t optlen = sizeof(v); + ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen), + SyscallSucceeds()); + ASSERT_EQ(v, kSockOptOff); + ASSERT_EQ(optlen, sizeof(v)); +} + TEST_P(UdpSocketTest, SoTimestamp) { ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); int v = 1; - EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)), + ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)), SyscallSucceeds()); char buf[3]; @@ -926,12 +936,89 @@ TEST_P(UdpSocketTest, SoTimestamp) { memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval)); ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); + + // There should be nothing to get via ioctl. + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT)); } TEST_P(UdpSocketTest, WriteShutdownNotConnected) { EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); } +TEST_P(UdpSocketTest, TimestampIoctl) { + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + char buf[3]; + // Send packet from t_ to s_. + ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + // There should be no control messages. + char recv_buf[sizeof(buf)]; + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf))); + + // A nonzero timeval should be available via ioctl. 
+ struct timeval tv = {}; + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds()); + ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); +} + +TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) { + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + struct timeval tv = {}; + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT)); +} + +// Test that the timestamp accessed via SIOCGSTAMP is still accessible after +// SO_TIMESTAMP is enabled and used to retrieve a timestamp. +TEST_P(UdpSocketTest, TimestampIoctlPersistence) { + ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds()); + ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds()); + + char buf[3]; + // Send packet from t_ to s_. + ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + + // There should be no control messages. + char recv_buf[sizeof(buf)]; + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf))); + + // A nonzero timeval should be available via ioctl. + struct timeval tv = {}; + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds()); + ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0); + + // Enable SO_TIMESTAMP and send a message. + int v = 1; + EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)), + SyscallSucceeds()); + ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0)); + + // There should be a message for SO_TIMESTAMP. 
+ char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))]; + msghdr msg = {}; + iovec iov = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsgbuf; + msg.msg_controllen = sizeof(cmsgbuf); + ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0)); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, nullptr); + + // The ioctl should return the exact same values as before. + struct timeval tv2 = {}; + ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds()); + ASSERT_EQ(tv.tv_sec, tv2.tv_sec); + ASSERT_EQ(tv.tv_usec, tv2.tv_usec); +} + INSTANTIATE_TEST_CASE_P(AllInetTests, UdpSocketTest, ::testing::Values(AF_INET, AF_INET6)); -- cgit v1.2.3 From ea070b9d5f4be0b25b028e90ab4518ef2e4df16b Mon Sep 17 00:00:00 2001 From: Amanda Tait Date: Wed, 20 Feb 2019 12:53:07 -0800 Subject: Implement Broadcast support This change adds support for the SO_BROADCAST socket option in gVisor Netstack. This support includes getsockopt()/setsockopt() functionality for both UDP and TCP endpoints (the latter being a NOOP), dispatching broadcast messages up and down the stack, and route finding/creation for broadcast packets. Finally, a suite of tests have been implemented, exercising this functionality through the Linux syscall API. 
PiperOrigin-RevId: 234850781 Change-Id: If3e666666917d39f55083741c78314a06defb26c --- pkg/dhcp/client.go | 3 + pkg/dhcp/dhcp_test.go | 3 + pkg/dhcp/server.go | 3 + pkg/sentry/socket/epsocket/epsocket.go | 21 ++ pkg/syserr/netstack.go | 2 + pkg/tcpip/stack/nic.go | 15 ++ pkg/tcpip/stack/transport_demuxer.go | 70 ++++++- pkg/tcpip/tcpip.go | 11 + pkg/tcpip/transport/tcp/endpoint.go | 20 ++ pkg/tcpip/transport/tcp/endpoint_state.go | 1 + pkg/tcpip/transport/udp/endpoint.go | 23 ++ test/syscalls/linux/BUILD | 69 ++++++ test/syscalls/linux/ip_socket_test_util.cc | 16 ++ test/syscalls/linux/ip_socket_test_util.h | 8 + .../socket_ipv4_tcp_unbound_external_networking.cc | 66 ++++++ .../socket_ipv4_tcp_unbound_external_networking.h | 30 +++ ...et_ipv4_tcp_unbound_external_networking_test.cc | 35 ++++ .../socket_ipv4_udp_unbound_external_networking.cc | 231 +++++++++++++++++++++ .../socket_ipv4_udp_unbound_external_networking.h | 30 +++ ...et_ipv4_udp_unbound_external_networking_test.cc | 35 ++++ test/syscalls/linux/socket_test_util.cc | 12 ++ test/syscalls/linux/socket_test_util.h | 5 + 22 files changed, 698 insertions(+), 11 deletions(-) create mode 100644 test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc create mode 100644 test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h create mode 100644 test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h create mode 100644 test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc (limited to 'pkg/sentry') diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 3330c4998..6d48eec7e 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -141,6 +141,9 @@ func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) (cfg }, nil); err != nil { return Config{}, fmt.Errorf("dhcp: connect 
failed: %v", err) } + if err := ep.SetSockOpt(tcpip.BroadcastOption(1)); err != nil { + return Config{}, fmt.Errorf("dhcp: setsockopt SO_BROADCAST: %v", err) + } epin, err := c.stack.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq) if err != nil { diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index a21dce6bc..026064394 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -287,6 +287,9 @@ func TestTwoServers(t *testing.T) { if err = ep.Bind(tcpip.FullAddress{Port: ServerPort}, nil); err != nil { t.Fatalf("dhcp: server bind: %v", err) } + if err = ep.SetSockOpt(tcpip.BroadcastOption(1)); err != nil { + t.Fatalf("dhcp: setsockopt: %v", err) + } serverCtx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index 3e06ab4c7..c72c3b70d 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -123,6 +123,9 @@ func newEPConnServer(ctx context.Context, stack *stack.Stack, addrs []tcpip.Addr if err := ep.Bind(tcpip.FullAddress{Port: ServerPort}, nil); err != nil { return nil, fmt.Errorf("dhcp: server bind: %v", err) } + if err := ep.SetSockOpt(tcpip.BroadcastOption(1)); err != nil { + return nil, fmt.Errorf("dhcp: server setsockopt: %v", err) + } c := newEPConn(ctx, wq, ep) return NewServer(ctx, c, addrs, cfg) } diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index a97db5348..e24e58aed 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -582,6 +582,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) { + // TODO: Stop rejecting short optLen values in getsockopt. 
switch name { case linux.SO_TYPE: if outLen < sizeOfInt32 { @@ -681,6 +682,18 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return int32(v), nil + case linux.SO_BROADCAST: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.BroadcastOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + case linux.SO_KEEPALIVE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument @@ -982,6 +995,14 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v))) + case linux.SO_BROADCAST: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BroadcastOption(v))) + case linux.SO_PASSCRED: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 20e756edb..05ca475d1 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -43,6 +43,7 @@ var ( ErrQueueSizeNotSupported = New(tcpip.ErrQueueSizeNotSupported.String(), linux.ENOTTY) ErrNoSuchFile = New(tcpip.ErrNoSuchFile.String(), linux.ENOENT) ErrInvalidOptionValue = New(tcpip.ErrInvalidOptionValue.String(), linux.EINVAL) + ErrBroadcastDisabled = New(tcpip.ErrBroadcastDisabled.String(), linux.EACCES) ) var netstackErrorTranslations = map[*tcpip.Error]*Error{ @@ -80,6 +81,7 @@ var netstackErrorTranslations = map[*tcpip.Error]*Error{ tcpip.ErrNetworkUnreachable: ErrNetworkUnreachable, tcpip.ErrMessageTooLong: ErrMessageTooLong, tcpip.ErrNoBufferSpace: ErrNoBufferSpace, + tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled, } // TranslateNetstackError converts an error from the tcpip package to a sentry diff --git a/pkg/tcpip/stack/nic.go 
b/pkg/tcpip/stack/nic.go index 586ca873e..43d7c2ec4 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -399,6 +399,21 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr src, dst := netProto.ParseAddresses(vv.First()) + // If the packet is destined to the IPv4 Broadcast address, then make a + // route to each IPv4 network endpoint and let each endpoint handle the + // packet. + if dst == header.IPv4Broadcast { + for _, ref := range n.endpoints { + if ref.protocol == header.IPv4ProtocolNumber && ref.tryIncRef() { + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) + r.RemoteLinkAddress = remote + ref.ep.HandlePacket(&r, vv) + ref.decRef() + } + } + return + } + if ref := n.getRef(protocol, dst); ref != nil { r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) r.RemoteLinkAddress = remote diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index a5ff2159a..c18208dc0 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -132,7 +132,22 @@ func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID) TransportEnd // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. func (ep *multiPortEndpoint) HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) { - ep.selectEndpoint(id).HandlePacket(r, id, vv) + // If this is a broadcast datagram, deliver the datagram to all endpoints + // managed by ep. + if id.LocalAddress == header.IPv4Broadcast { + for i, endpoint := range ep.endpointsArr { + // HandlePacket modifies vv, so each endpoint needs its own copy. 
+ if i == len(ep.endpointsArr)-1 { + endpoint.HandlePacket(r, id, vv) + break + } + vvCopy := buffer.NewView(vv.Size()) + copy(vvCopy, vv.ToView()) + endpoint.HandlePacket(r, id, vvCopy.ToVectorisedView()) + } + } else { + ep.selectEndpoint(id).HandlePacket(r, id, vv) + } } // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. @@ -224,20 +239,47 @@ func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolN } } -// deliverPacket attempts to deliver the given packet. Returns true if it found -// an endpoint, false otherwise. +var loopbackSubnet = func() tcpip.Subnet { + sn, err := tcpip.NewSubnet("\x7f\x00\x00\x00", "\xff\x00\x00\x00") + if err != nil { + panic(err) + } + return sn +}() + +// deliverPacket attempts to find one or more matching transport endpoints, and +// then, if matches are found, delivers the packet to them. Returns true if it +// found one or more endpoints, false otherwise. func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView, id TransportEndpointID) bool { eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}] if !ok { return false } + // If a sender bound to the Loopback interface sends a broadcast, + // that broadcast must not be delivered to the sender. + if loopbackSubnet.Contains(r.RemoteAddress) && r.LocalAddress == header.IPv4Broadcast && id.LocalPort == id.RemotePort { + return false + } + + // If the packet is a broadcast, then find all matching transport endpoints. + // Otherwise, try to find a single matching transport endpoint. 
+ destEps := make([]TransportEndpoint, 0, 1) eps.mu.RLock() - ep := d.findEndpointLocked(eps, vv, id) + + if protocol == header.UDPProtocolNumber && id.LocalAddress == header.IPv4Broadcast { + for epID, endpoint := range eps.endpoints { + if epID.LocalPort == id.LocalPort { + destEps = append(destEps, endpoint) + } + } + } else if ep := d.findEndpointLocked(eps, vv, id); ep != nil { + destEps = append(destEps, ep) + } eps.mu.RUnlock() - // Fail if we didn't find one. - if ep == nil { + // Fail if we didn't find at least one matching transport endpoint. + if len(destEps) == 0 { // UDP packet could not be delivered to an unknown destination port. if protocol == header.UDPProtocolNumber { r.Stats().UDP.UnknownPortErrors.Increment() @@ -246,7 +288,9 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto } // Deliver the packet. - ep.HandlePacket(r, id, vv) + for _, ep := range destEps { + ep.HandlePacket(r, id, vv) + } return true } @@ -277,7 +321,7 @@ func (d *transportDemuxer) deliverControlPacket(net tcpip.NetworkProtocolNumber, func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) TransportEndpoint { // Try to find a match with the id as provided. - if ep := eps.endpoints[id]; ep != nil { + if ep, ok := eps.endpoints[id]; ok { return ep } @@ -285,7 +329,7 @@ func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer nid := id nid.LocalAddress = "" - if ep := eps.endpoints[nid]; ep != nil { + if ep, ok := eps.endpoints[nid]; ok { return ep } @@ -293,11 +337,15 @@ func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer nid.LocalAddress = id.LocalAddress nid.RemoteAddress = "" nid.RemotePort = 0 - if ep := eps.endpoints[nid]; ep != nil { + if ep, ok := eps.endpoints[nid]; ok { return ep } // Try to find a match with only the local port. 
nid.LocalAddress = "" - return eps.endpoints[nid] + if ep, ok := eps.endpoints[nid]; ok { + return ep + } + + return nil } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index a6e47397a..89e9d6741 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -100,6 +100,7 @@ var ( ErrNetworkUnreachable = &Error{msg: "network is unreachable"} ErrMessageTooLong = &Error{msg: "message too long"} ErrNoBufferSpace = &Error{msg: "no buffer space available"} + ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"} ) // Errors related to Subnet @@ -502,6 +503,10 @@ type RemoveMembershipOption MembershipOption // TCP out-of-band data is delivered along with the normal in-band data. type OutOfBandInlineOption int +// BroadcastOption is used by SetSockOpt/GetSockOpt to specify whether +// datagram sockets are allowed to send packets to a broadcast address. +type BroadcastOption int + // Route is a row in the routing table. It specifies through which NIC (and // gateway) sets of packets should be routed. A row is considered viable if the // masked target address matches the destination adddress in the row. @@ -527,6 +532,12 @@ func (r *Route) Match(addr Address) bool { return false } + // Using header.Ipv4Broadcast would introduce an import cycle, so + // we'll use a literal instead. + if addr == "\xff\xff\xff\xff" { + return true + } + for i := 0; i < len(r.Destination); i++ { if (addr[i] & r.Mask[i]) != r.Destination[i] { return false diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 1ee9f8d25..aa31a78af 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -116,6 +116,9 @@ type endpoint struct { route stack.Route `state:"manual"` v6only bool isConnectNotified bool + // TCP should never broadcast but Linux nevertheless supports enabling/ + // disabling SO_BROADCAST, albeit as a NOOP. + broadcast bool // effectiveNetProtos contains the network protocols actually in use. 
In // most cases it will only contain "netProto", but in cases like IPv6 @@ -813,6 +816,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.notifyProtocolGoroutine(notifyKeepaliveChanged) return nil + case tcpip.BroadcastOption: + e.mu.Lock() + e.broadcast = v != 0 + e.mu.Unlock() + return nil + default: return nil } @@ -971,6 +980,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = 1 return nil + case *tcpip.BroadcastOption: + e.mu.Lock() + v := e.broadcast + e.mu.Unlock() + + *o = 0 + if v { + *o = 1 + } + return nil + default: return tcpip.ErrUnknownProtocolOption } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 4891c7941..a07cd9011 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -336,6 +336,7 @@ func loadError(s string) *tcpip.Error { tcpip.ErrNetworkUnreachable, tcpip.ErrMessageTooLong, tcpip.ErrNoBufferSpace, + tcpip.ErrBroadcastDisabled, } messageToError = make(map[string]*tcpip.Error) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 9c3881d63..05d35e526 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -82,6 +82,7 @@ type endpoint struct { multicastAddr tcpip.Address multicastNICID tcpip.NICID reusePort bool + broadcast bool // shutdownFlags represent the current shutdown state of the endpoint. 
shutdownFlags tcpip.ShutdownFlags @@ -347,6 +348,10 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c nicid = e.bindNICID } + if to.Addr == header.IPv4Broadcast && !e.broadcast { + return 0, nil, tcpip.ErrBroadcastDisabled + } + r, _, _, err := e.connectRoute(nicid, *to) if err != nil { return 0, nil, err @@ -502,6 +507,13 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.mu.Lock() e.reusePort = v != 0 e.mu.Unlock() + + case tcpip.BroadcastOption: + e.mu.Lock() + e.broadcast = v != 0 + e.mu.Unlock() + + return nil } return nil } @@ -581,6 +593,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = 0 return nil + case *tcpip.BroadcastOption: + e.mu.RLock() + v := e.broadcast + e.mu.RUnlock() + + *o = 0 + if v { + *o = 1 + } + return nil + default: return tcpip.ErrUnknownProtocolOption } diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 9da5204c1..beece8930 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -148,6 +148,7 @@ cc_library( hdrs = ["ip_socket_test_util.h"], deps = [ ":socket_test_util", + "@com_google_absl//absl/strings", ], ) @@ -1970,6 +1971,42 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "socket_ipv4_udp_unbound_external_networking_test_cases", + testonly = 1, + srcs = [ + "socket_ipv4_udp_unbound_external_networking.cc", + ], + hdrs = [ + "socket_ipv4_udp_unbound_external_networking.h", + ], + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + +cc_library( + name = "socket_ipv4_tcp_unbound_external_networking_test_cases", + testonly = 1, + srcs = [ + "socket_ipv4_tcp_unbound_external_networking.cc", + ], + hdrs = [ + "socket_ipv4_tcp_unbound_external_networking.h", + ], + deps = [ + ":ip_socket_test_util", + ":socket_test_util", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], + alwayslink = 1, +) + cc_binary( 
name = "socket_abstract_test", testonly = 1, @@ -2147,6 +2184,38 @@ cc_binary( ], ) +cc_binary( + name = "socket_ipv4_udp_unbound_external_networking_test", + testonly = 1, + srcs = [ + "socket_ipv4_udp_unbound_external_networking_test.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_ipv4_udp_unbound_external_networking_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + +cc_binary( + name = "socket_ipv4_tcp_unbound_external_networking_test", + testonly = 1, + srcs = [ + "socket_ipv4_tcp_unbound_external_networking_test.cc", + ], + linkstatic = 1, + deps = [ + ":ip_socket_test_util", + ":socket_ipv4_tcp_unbound_external_networking_test_cases", + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + ], +) + cc_binary( name = "socket_ip_udp_loopback_non_blocking_test", testonly = 1, diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index f8232fc24..4ad787cc0 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include +#include #include +#include #include #include "test/syscalls/linux/ip_socket_test_util.h" @@ -95,5 +97,19 @@ SocketPairKind IPv4UDPUnboundSocketPair(int type) { /* dual_stack = */ false)}; } +SocketKind IPv4UDPUnboundSocket(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "IPv4 UDP socket"); + return SocketKind{description, UnboundSocketCreator( + AF_INET, type | SOCK_DGRAM, IPPROTO_UDP)}; +} + +SocketKind IPv4TCPUnboundSocket(int type) { + std::string description = + absl::StrCat(DescribeSocketType(type), "IPv4 TCP socket"); + return SocketKind{description, UnboundSocketCreator( + AF_INET, type | SOCK_STREAM, IPPROTO_TCP)}; +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h index a6721091a..cac790e64 100644 --- a/test/syscalls/linux/ip_socket_test_util.h +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -58,6 +58,14 @@ SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type); // SocketPairs created with AF_INET and the given type. SocketPairKind IPv4UDPUnboundSocketPair(int type); +// IPv4UDPUnboundSocketPair returns a SocketKind that represents +// a SimpleSocket created with AF_INET, SOCK_DGRAM, and the given type. +SocketKind IPv4UDPUnboundSocket(int type); + +// IPv4TCPUnboundSocketPair returns a SocketKind that represents +// a SimpleSocket created with AF_INET, SOCK_STREAM and the given type. 
+SocketKind IPv4TCPUnboundSocket(int type); + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc new file mode 100644 index 000000000..8e1c13ff4 --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc @@ -0,0 +1,66 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h" + +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// Verifies that a newly instantiated TCP socket does not have the +// broadcast socket option enabled. +TEST_P(IPv4TCPUnboundExternalNetworkingSocketTest, TCPBroadcastDefault) { + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + int get = -1; + socklen_t get_sz = sizeof(get); + EXPECT_THAT( + getsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &get, &get_sz), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get, kSockOptOff); + EXPECT_EQ(get_sz, sizeof(get)); +} + +// Verifies that a newly instantiated TCP socket returns true after enabling +// the broadcast socket option. 
+TEST_P(IPv4TCPUnboundExternalNetworkingSocketTest, SetTCPBroadcast) { + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + EXPECT_THAT(setsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceedsWithValue(0)); + + int get = -1; + socklen_t get_sz = sizeof(get); + EXPECT_THAT( + getsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &get, &get_sz), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get, kSockOptOn); + EXPECT_EQ(get_sz, sizeof(get)); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h new file mode 100644 index 000000000..b23de08d1 --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h @@ -0,0 +1,30 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_TCP_UNBOUND_EXTERNAL_NETWORKING_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_TCP_UNBOUND_EXTERNAL_NETWORKING_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to unbound IPv4 TCP sockets in a sandbox +// with external networking support. 
+using IPv4TCPUnboundExternalNetworkingSocketTest = SimpleSocketTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_TCP_UNBOUND_EXTERNAL_NETWORKING_H_ diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc new file mode 100644 index 000000000..c6fb42641 --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc @@ -0,0 +1,35 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSockets() { + return ApplyVec( + IPv4TCPUnboundSocket, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK})); +} + +INSTANTIATE_TEST_CASE_P(IPv4TCPSockets, + IPv4TCPUnboundExternalNetworkingSocketTest, + ::testing::ValuesIn(GetSockets())); +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc new file mode 100644 index 000000000..7d561b991 --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc @@ -0,0 +1,231 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h" + +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +// Verifies that a newly instantiated UDP socket does not have the +// broadcast socket option enabled. 
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, UDPBroadcastDefault) { + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + int get = -1; + socklen_t get_sz = sizeof(get); + EXPECT_THAT( + getsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &get, &get_sz), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get, kSockOptOff); + EXPECT_EQ(get_sz, sizeof(get)); +} + +// Verifies that a newly instantiated UDP socket returns true after enabling +// the broadcast socket option. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, SetUDPBroadcast) { + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + EXPECT_THAT(setsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceedsWithValue(0)); + + int get = -1; + socklen_t get_sz = sizeof(get); + EXPECT_THAT( + getsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &get, &get_sz), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get, kSockOptOn); + EXPECT_EQ(get_sz, sizeof(get)); +} + +// Verifies that a broadcast UDP packet will arrive at all UDP sockets with +// the destination port number. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + UDPBroadcastReceivedOnAllExpectedEndpoints) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto rcvr1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto rcvr2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto norcv = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Enable SO_BROADCAST on the sending socket. + ASSERT_THAT(setsockopt(sender->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceedsWithValue(0)); + + // Enable SO_REUSEPORT on the receiving sockets so that they may both be bound + // to the broadcast messages destination port. 
+ ASSERT_THAT(setsockopt(rcvr1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(setsockopt(rcvr2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceedsWithValue(0)); + + sockaddr_in rcv_addr = {}; + socklen_t rcv_addr_sz = sizeof(rcv_addr); + rcv_addr.sin_family = AF_INET; + rcv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + ASSERT_THAT(bind(rcvr1->get(), reinterpret_cast(&rcv_addr), + rcv_addr_sz), + SyscallSucceedsWithValue(0)); + // Retrieve port number from first socket so that it can be bound to the + // second socket. + rcv_addr = {}; + ASSERT_THAT( + getsockname(rcvr1->get(), reinterpret_cast(&rcv_addr), + &rcv_addr_sz), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(bind(rcvr2->get(), reinterpret_cast(&rcv_addr), + rcv_addr_sz), + SyscallSucceedsWithValue(0)); + + // Bind the non-receiving socket to an ephemeral port. + sockaddr_in norcv_addr = {}; + norcv_addr.sin_family = AF_INET; + norcv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + ASSERT_THAT( + bind(norcv->get(), reinterpret_cast(&norcv_addr), + sizeof(norcv_addr)), + SyscallSucceedsWithValue(0)); + + // Broadcast a test message. + sockaddr_in dst_addr = {}; + dst_addr.sin_family = AF_INET; + dst_addr.sin_addr.s_addr = htonl(INADDR_BROADCAST); + dst_addr.sin_port = rcv_addr.sin_port; + constexpr char kTestMsg[] = "hello, world"; + EXPECT_THAT( + sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0, + reinterpret_cast(&dst_addr), sizeof(dst_addr)), + SyscallSucceedsWithValue(sizeof(kTestMsg))); + + // Verify that the receiving sockets received the test message. 
+ char buf[sizeof(kTestMsg)] = {}; + EXPECT_THAT(read(rcvr1->get(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(kTestMsg))); + EXPECT_EQ(0, memcmp(buf, kTestMsg, sizeof(kTestMsg))); + memset(buf, 0, sizeof(buf)); + EXPECT_THAT(read(rcvr2->get(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(kTestMsg))); + EXPECT_EQ(0, memcmp(buf, kTestMsg, sizeof(kTestMsg))); + + // Verify that the non-receiving socket did not receive the test message. + memset(buf, 0, sizeof(buf)); + EXPECT_THAT(RetryEINTR(recv)(norcv->get(), buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Verifies that a UDP broadcast sent via the loopback interface is not received +// by the sender. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + UDPBroadcastViaLoopbackFails) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Enable SO_BROADCAST. + ASSERT_THAT(setsockopt(sender->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn, + sizeof(kSockOptOn)), + SyscallSucceedsWithValue(0)); + + // Bind the sender to the loopback interface. + sockaddr_in src = {}; + socklen_t src_sz = sizeof(src); + src.sin_family = AF_INET; + src.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT( + bind(sender->get(), reinterpret_cast(&src), src_sz), + SyscallSucceedsWithValue(0)); + ASSERT_THAT(getsockname(sender->get(), + reinterpret_cast(&src), &src_sz), + SyscallSucceedsWithValue(0)); + ASSERT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK)); + + // Send the message. + sockaddr_in dst = {}; + dst.sin_family = AF_INET; + dst.sin_addr.s_addr = htonl(INADDR_BROADCAST); + dst.sin_port = src.sin_port; + constexpr char kTestMsg[] = "hello, world"; + EXPECT_THAT(sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0, + reinterpret_cast(&dst), sizeof(dst)), + SyscallSucceedsWithValue(sizeof(kTestMsg))); + + // Verify that the message was not received by the sender (loopback). 
+ char buf[sizeof(kTestMsg)] = {}; + EXPECT_THAT(RetryEINTR(recv)(sender->get(), buf, sizeof(buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Verifies that a UDP broadcast fails to send on a socket with SO_BROADCAST +// disabled. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendBroadcast) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Broadcast a test message without having enabled SO_BROADCAST on the sending + // socket. + sockaddr_in addr = {}; + socklen_t addr_sz = sizeof(addr); + addr.sin_family = AF_INET; + addr.sin_port = htons(12345); + addr.sin_addr.s_addr = htonl(INADDR_BROADCAST); + constexpr char kTestMsg[] = "hello, world"; + + EXPECT_THAT(sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0, + reinterpret_cast(&addr), addr_sz), + SyscallFailsWithErrno(EACCES)); +} + +// Verifies that a UDP unicast on an unbound socket reaches its destination. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendUnicastOnUnbound) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto rcvr = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the receiver and retrieve its address and port number. + sockaddr_in addr = {}; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = htons(0); + ASSERT_THAT(bind(rcvr->get(), reinterpret_cast(&addr), + sizeof(addr)), + SyscallSucceedsWithValue(0)); + memset(&addr, 0, sizeof(addr)); + socklen_t addr_sz = sizeof(addr); + ASSERT_THAT(getsockname(rcvr->get(), + reinterpret_cast(&addr), &addr_sz), + SyscallSucceedsWithValue(0)); + + // Send a test message to the receiver. 
+ constexpr char kTestMsg[] = "hello, world"; + ASSERT_THAT(sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0, + reinterpret_cast(&addr), addr_sz), + SyscallSucceedsWithValue(sizeof(kTestMsg))); + char buf[sizeof(kTestMsg)] = {}; + ASSERT_THAT(read(rcvr->get(), buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(kTestMsg))); +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h new file mode 100644 index 000000000..5cf9fa8eb --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h @@ -0,0 +1,30 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_EXTERNAL_NETWORKING_H_ +#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_EXTERNAL_NETWORKING_H_ + +#include "test/syscalls/linux/socket_test_util.h" + +namespace gvisor { +namespace testing { + +// Test fixture for tests that apply to unbound IPv4 UDP sockets in a sandbox +// with external networking support. 
+using IPv4UDPUnboundExternalNetworkingSocketTest = SimpleSocketTest; + +} // namespace testing +} // namespace gvisor + +#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_EXTERNAL_NETWORKING_H_ diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc new file mode 100644 index 000000000..e07385134 --- /dev/null +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc @@ -0,0 +1,35 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "test/syscalls/linux/ip_socket_test_util.h" +#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +std::vector GetSockets() { + return ApplyVec( + IPv4UDPUnboundSocket, + AllBitwiseCombinations(List{0, SOCK_NONBLOCK})); +} + +INSTANTIATE_TEST_CASE_P(IPv4UDPSockets, + IPv4UDPUnboundExternalNetworkingSocketTest, + ::testing::ValuesIn(GetSockets())); +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc index 8d19f79ac..035087566 100644 --- a/test/syscalls/linux/socket_test_util.cc +++ b/test/syscalls/linux/socket_test_util.cc @@ -22,6 +22,7 @@ #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/time/clock.h" +#include "test/util/file_descriptor.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/thread_util.h" @@ -463,6 +464,17 @@ SocketPairKind Reversed(SocketPairKind const& base) { }}; } +Creator UnboundSocketCreator(int domain, int type, + int protocol) { + return [=]() -> PosixErrorOr> { + int sock; + RETURN_ERROR_IF_SYSCALL_FAIL(sock = socket(domain, type, protocol)); + MaybeSave(); // Successful socket creation. 
+ + return absl::make_unique(sock); + }; +} + std::vector IncludeReversals(std::vector vec) { return ApplyVecToVec(std::vector{NoOp, Reversed}, vec); diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h index 906b3e929..dfabdf179 100644 --- a/test/syscalls/linux/socket_test_util.h +++ b/test/syscalls/linux/socket_test_util.h @@ -278,6 +278,11 @@ Creator UDPBidirectionalBindSocketPairCreator(int domain, int type, Creator UDPUnboundSocketPairCreator(int domain, int type, int protocol, bool dual_stack); +// UnboundSocketCreator returns a Creator that obtains a file +// descriptor by creating a socket. +Creator UnboundSocketCreator(int domain, int type, + int protocol); + // A SocketPairKind couples a human-readable description of a socket pair with // a function that creates such a socket pair. struct SocketPairKind { -- cgit v1.2.3 From 15d3189884c2e8050992381ff2a1f0521eae0ba2 Mon Sep 17 00:00:00 2001 From: Haibo Xu Date: Wed, 20 Feb 2019 15:09:50 -0800 Subject: Make some ptrace commands x86-only Signed-off-by: Haibo Xu Change-Id: I9751f859332d433ca772d6b9733f5a5a64398ec7 PiperOrigin-RevId: 234877624 --- pkg/sentry/kernel/BUILD | 2 + pkg/sentry/kernel/ptrace.go | 63 ++------------------------- pkg/sentry/kernel/ptrace_amd64.go | 89 +++++++++++++++++++++++++++++++++++++++ pkg/sentry/kernel/ptrace_arm64.go | 28 ++++++++++++ 4 files changed, 122 insertions(+), 60 deletions(-) create mode 100644 pkg/sentry/kernel/ptrace_amd64.go create mode 100644 pkg/sentry/kernel/ptrace_arm64.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 773cb8c91..d9bbfb556 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -95,6 +95,8 @@ go_library( "posixtimer.go", "process_group_list.go", "ptrace.go", + "ptrace_amd64.go", + "ptrace_arm64.go", "rseq.go", "seccomp.go", "seqatomic_taskgoroutineschedinfo.go", diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 
20bac2b70..fa7a0d141 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -863,42 +863,6 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { }) return err - case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER - n, err := target.Arch().PtracePeekUser(uintptr(addr)) - if err != nil { - return err - } - _, err = t.CopyOut(data, n) - return err - - case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER - return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) - - case linux.PTRACE_GETREGS: - // "Copy the tracee's general-purpose ... registers ... to the address - // data in the tracer. ... (addr is ignored.) Note that SPARC systems - // have the meaning of data and addr reversed ..." - _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ - Ctx: t, - IO: t.MemoryManager(), - Addr: data, - Opts: usermem.IOOpts{ - AddressSpaceActive: true, - }, - }) - return err - - case linux.PTRACE_GETFPREGS: - _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ - Ctx: t, - IO: t.MemoryManager(), - Addr: data, - Opts: usermem.IOOpts{ - AddressSpaceActive: true, - }, - }) - return err - case linux.PTRACE_GETREGSET: // "Read the tracee's registers. addr specifies, in an // architecture-dependent way, the type of registers to be read. ... 
@@ -930,28 +894,6 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { ar.End = end return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) - case linux.PTRACE_SETREGS: - _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ - Ctx: t, - IO: t.MemoryManager(), - Addr: data, - Opts: usermem.IOOpts{ - AddressSpaceActive: true, - }, - }) - return err - - case linux.PTRACE_SETFPREGS: - _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ - Ctx: t, - IO: t.MemoryManager(), - Addr: data, - Opts: usermem.IOOpts{ - AddressSpaceActive: true, - }, - }) - return err - case linux.PTRACE_SETREGSET: ars, err := t.CopyInIovecs(data, 1) if err != nil { @@ -1047,8 +989,9 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) return err + // PEEKSIGINFO is unimplemented but seems to have no users anywhere. + default: - // PEEKSIGINFO is unimplemented but seems to have no users anywhere. - return syserror.EIO + return t.ptraceArch(target, req, addr, data) } } diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go new file mode 100644 index 000000000..1f88efca3 --- /dev/null +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -0,0 +1,89 @@ +// Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// +build amd64 + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptraceArch implements arch-specific ptrace commands. +func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { + switch req { + case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER + n, err := target.Arch().PtracePeekUser(uintptr(addr)) + if err != nil { + return err + } + _, err = t.CopyOut(data, n) + return err + + case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER + return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) + + case linux.PTRACE_GETREGS: + // "Copy the tracee's general-purpose ... registers ... to the address + // data in the tracer. ... (addr is ignored.) Note that SPARC systems + // have the meaning of data and addr reversed ..." + _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_GETFPREGS: + _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_SETREGS: + _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_SETFPREGS: + _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + default: + return syserror.EIO + } +} diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go new file mode 100644 index 000000000..4636405e6 --- /dev/null +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -0,0 +1,28 @@ +// 
Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptraceArch implements arch-specific ptrace commands. +func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { + return syserror.EIO +} -- cgit v1.2.3 From 532f4b2fbaf66382a3d9e118b5a7a3ee272c8edc Mon Sep 17 00:00:00 2001 From: Googler Date: Thu, 21 Feb 2019 13:07:25 -0800 Subject: Internal change. 
PiperOrigin-RevId: 235053594 Change-Id: Ie3d7b11843d0710184a2463886c7034e8f5305d1 --- pkg/sentry/fs/file.go | 30 ++++++++++++++++++++++++++- pkg/sentry/fs/gofer/file.go | 43 ++++++++++++++++++++++++++++----------- pkg/sentry/fs/tmpfs/inode_file.go | 15 +++++++++++--- 3 files changed, 72 insertions(+), 16 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index b66d2f265..d66813103 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -18,6 +18,7 @@ import ( "math" "sync" "sync/atomic" + "time" "gvisor.googlesource.com/gvisor/pkg/amutex" "gvisor.googlesource.com/gvisor/pkg/log" @@ -33,7 +34,22 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -var reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") +var ( + // RecordWaitTime controls writing metrics for filesystem reads. Enabling this comes at a small + // CPU cost due to performing two monotonic clock reads per read call. + RecordWaitTime = false + + reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") + readWait = metric.MustCreateNewUint64Metric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.") +) + +// IncrementWait increments the given wait time metric, if enabled. +func IncrementWait(m *metric.Uint64Metric, start time.Time) { + if !RecordWaitTime { + return + } + m.IncrementBy(uint64(time.Since(start))) +} // FileMaxOffset is the maximum possible file offset. const FileMaxOffset = math.MaxInt64 @@ -236,7 +252,12 @@ func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { // // Returns syserror.ErrInterrupted if reading was interrupted. 
func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) { + var start time.Time + if RecordWaitTime { + start = time.Now() + } if !f.mu.Lock(ctx) { + IncrementWait(readWait, start) return 0, syserror.ErrInterrupted } @@ -246,6 +267,7 @@ func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) atomic.AddInt64(&f.offset, n) } f.mu.Unlock() + IncrementWait(readWait, start) return n, err } @@ -255,13 +277,19 @@ func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) // // Otherwise same as Readv. func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var start time.Time + if RecordWaitTime { + start = time.Now() + } if !f.mu.Lock(ctx) { + IncrementWait(readWait, start) return 0, syserror.ErrInterrupted } reads.Increment() n, err := f.FileOperations.Read(ctx, f, dst, offset) f.mu.Unlock() + IncrementWait(readWait, start) return n, err } diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 7a6dabba8..631cc80ae 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -17,6 +17,7 @@ package gofer import ( "fmt" "syscall" + "time" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/metric" @@ -32,11 +33,13 @@ import ( ) var ( - opensWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") - opens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a 9P file was opened from a gofer.") - opensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a host file was opened from a gofer.") - reads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.") - readsHost = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host 
file reads from a gofer.") + opensWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") + opens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a 9P file was opened from a gofer.") + opensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a host file was opened from a gofer.") + reads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.") + readWait9P = metric.MustCreateNewUint64Metric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.") + readsHost = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.") + readWaitHost = metric.MustCreateNewUint64Metric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.") ) // fileOperations implements fs.FileOperations for a remote file system. @@ -232,22 +235,38 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) } +// incrementReadCounters increments the read counters for the read starting at the given time. We +// use this function rather than using a defer in Read() to avoid the performance hit of defer. +func (f *fileOperations) incrementReadCounters(start time.Time) { + if f.handles.Host != nil { + readsHost.Increment() + fs.IncrementWait(readWaitHost, start) + } else { + reads9P.Increment() + fs.IncrementWait(readWait9P, start) + } +} + // Read implements fs.FileOperations.Read. 
func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + var start time.Time + if fs.RecordWaitTime { + start = time.Now() + } if fs.IsDir(file.Dirent.Inode.StableAttr) { // Not all remote file systems enforce this so this client does. + f.incrementReadCounters(start) return 0, syserror.EISDIR } - if f.handles.Host != nil { - readsHost.Increment() - } else { - reads9P.Increment() - } if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { - return f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) + n, err := f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) + f.incrementReadCounters(start) + return n, err } - return dst.CopyOutFrom(ctx, f.handles.readWriterAt(ctx, offset)) + n, err := dst.CopyOutFrom(ctx, f.handles.readWriterAt(ctx, offset)) + f.incrementReadCounters(start) + return n, err } // Fsync implements fs.FileOperations.Fsync. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index ef5e67dda..1cc972afa 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -17,6 +17,7 @@ package tmpfs import ( "io" "sync" + "time" "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -31,9 +32,10 @@ import ( ) var ( - opensRO = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.") - opensW = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.") - reads = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.") + opensRO = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.") + opensW = 
metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.") + reads = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.") + readWait = metric.MustCreateNewUint64Metric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.") ) // fileInodeOperations implements fs.InodeOperations for a regular tmpfs file. @@ -249,9 +251,14 @@ func (*fileInodeOperations) StatFS(context.Context) (fs.Info, error) { } func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var start time.Time + if fs.RecordWaitTime { + start = time.Now() + } reads.Increment() // Zero length reads for tmpfs are no-ops. if dst.NumBytes() == 0 { + fs.IncrementWait(readWait, start) return 0, nil } @@ -268,6 +275,7 @@ func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, size := f.attr.Size f.dataMu.RUnlock() if offset >= size { + fs.IncrementWait(readWait, start) return 0, io.EOF } @@ -276,6 +284,7 @@ func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, f.attrMu.Lock() f.attr.AccessTime = ktime.NowFromContext(ctx) f.attrMu.Unlock() + fs.IncrementWait(readWait, start) return n, err } -- cgit v1.2.3 From 10426e0f31e427e90e69fee83f199ea521b8fe3d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 25 Feb 2019 12:16:44 -0800 Subject: Handle invalid offset in sendfile(2) PiperOrigin-RevId: 235578698 Change-Id: I608ff5e25eac97f6e1bda058511c1f82b0e3b736 --- pkg/sentry/syscalls/linux/sys_file.go | 9 ++++++--- test/syscalls/linux/sendfile.cc | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 7ad0c9517..cf6fdc190 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ 
b/pkg/sentry/syscalls/linux/sys_file.go @@ -2022,7 +2022,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Setup for sending data. - var offset uint64 var n int64 var err error w := &fs.FileWriter{t, outFile} @@ -2034,14 +2033,18 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.ESPIPE } // Copy in the offset. + var offset int64 if _, err := t.CopyIn(offsetAddr, &offset); err != nil { return 0, nil, err } + if offset < 0 { + return 0, nil, syserror.EINVAL + } // Send data using Preadv. - r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), count) + r := io.NewSectionReader(&fs.FileReader{t, inFile}, offset, count) n, err = io.Copy(w, r) // Copy out the new offset. - if _, err := t.CopyOut(offsetAddr, n+int64(offset)); err != nil { + if _, err := t.CopyOut(offsetAddr, n+offset); err != nil { return 0, nil, err } // If we don't have a provided offset. diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc index 92b7b9478..15fd01ff0 100644 --- a/test/syscalls/linux/sendfile.cc +++ b/test/syscalls/linux/sendfile.cc @@ -46,6 +46,25 @@ TEST(SendFileTest, SendZeroBytes) { SyscallSucceedsWithValue(0)); } +TEST(SendFileTest, InvalidOffset) { + // Create temp files. + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Send data and verify that sendfile returns the correct value. + off_t offset = -1; + EXPECT_THAT(sendfile(outf.get(), inf.get(), &offset, 0), + SyscallFailsWithErrno(EINVAL)); +} + TEST(SendFileTest, SendTrivially) { // Create temp files. 
constexpr char kData[] = "To be, or not to be, that is the question:"; -- cgit v1.2.3 From 23fe059761a470d7724b462ad8ead09356ec21b7 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 26 Feb 2019 09:32:20 -0800 Subject: Lazily allocate inotify map on inode PiperOrigin-RevId: 235735865 Change-Id: I84223eb18eb51da1fa9768feaae80387ff6bfed0 --- pkg/sentry/fs/inode_inotify.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index e213df924..d2b653bc7 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -39,9 +39,7 @@ type Watches struct { } func newWatches() *Watches { - return &Watches{ - ws: make(map[uint64]*Watch), - } + return &Watches{} } // MarkUnlinked indicates the target for this set of watches to be unlinked. @@ -78,6 +76,9 @@ func (w *Watches) Add(watch *Watch) { if _, exists := w.ws[watch.ID()]; exists { panic(fmt.Sprintf("Watch collision with ID %+v", watch.ID())) } + if w.ws == nil { + w.ws = make(map[uint64]*Watch) + } w.ws[watch.ID()] = watch } -- cgit v1.2.3 From a2b794b30dd952793f4d99a9423cef7efdc7843f Mon Sep 17 00:00:00 2001 From: Ruidong Cao Date: Tue, 26 Feb 2019 11:47:42 -0800 Subject: FPE_INTOVF (integer overflow) should be 2 refer to Linux. 
Signed-off-by: Ruidong Cao Change-Id: I03f8ab25cf29257b31f145cf43304525a93f3300 PiperOrigin-RevId: 235763203 --- pkg/sentry/platform/kvm/machine_amd64.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 5ad805b8b..ccfe837b5 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -290,7 +290,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) case ring0.Overflow: *info = arch.SignalInfo{ Signo: int32(syscall.SIGFPE), - Code: 1, // FPE_INTOVF (integer overflow). + Code: 2, // FPE_INTOVF (integer overflow). } info.SetAddr(switchOpts.Registers.Rip) // Include address. return usermem.AccessType{}, platform.ErrContextSignal -- cgit v1.2.3 From cff2c57192ccd5ccf4cec6280afcd724dc1135d1 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 26 Feb 2019 16:41:15 -0800 Subject: Fix bad merge PiperOrigin-RevId: 235818534 Change-Id: I99f7e3fd1dc808b35f7a08b96b7c3226603ab808 --- pkg/sentry/socket/epsocket/epsocket.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index e24e58aed..8aec97e72 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1230,10 +1230,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s })) case linux.MCAST_JOIN_GROUP: - // FIXME: Disallow IP-level multicast group options by - // default. These will need to be supported by appropriately plumbing - // the level through to the network stack (if at all). However, we - // still allow setting TTL, and multicast-enable/disable type options. + // FIXME: Implement MCAST_JOIN_GROUP. 
 t.Kernel().EmitUnimplementedEvent(t) return syserr.ErrInvalidArgument -- cgit v1.2.3 From d516ee3312411c60630305cfaac6c5a0e21537e8 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 27 Feb 2019 09:44:45 -0800 Subject: Allow overlay to merge Directories and SpecialDirectories. Needed to mount inside /proc or /sys. PiperOrigin-RevId: 235936529 Change-Id: Iee6f2671721b1b9b58a3989705ea901322ec9206 --- pkg/sentry/fs/inode_overlay.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 77a2623ef..b11e2bd13 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -131,11 +131,20 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } if child != nil { if !child.IsNegative() { - // Did we find something in the upper filesystem? We can - // only use it if the types match. - if upperInode == nil || upperInode.StableAttr.Type == child.Inode.StableAttr.Type { + if upperInode == nil { + // If nothing was in the upper, use what we found in the lower. lowerInode = child.Inode lowerInode.IncRef() + } else { + // If we have something from the upper, we can only use it if the types + // match. + // NOTE: Allow SpecialDirectories and Directories to merge. + // This is needed to allow submounts in /proc and /sys. + if upperInode.StableAttr.Type == child.Inode.StableAttr.Type || + (IsDir(upperInode.StableAttr) && IsDir(child.Inode.StableAttr)) { + lowerInode = child.Inode + lowerInode.IncRef() + } } } child.DecRef() -- cgit v1.2.3 From 121db29a93c651b8b62e8701bb0f16c231b08257 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 27 Feb 2019 14:30:20 -0800 Subject: Ping support via IPv4 raw sockets. 
* Passes the network-layer (IP) header up the stack to the transport endpoint, which can pass it up to the socket layer. This allows a raw socket to return the entire IP packet to users. * Adds functions to stack.TransportProtocol, stack.Stack, stack.transportDemuxer that enable incoming packets to be delivered to raw endpoints. New raw sockets of other protocols (not ICMP) just need to register with the stack. * Enables ping.endpoint to return IP headers when created via SOCK_RAW. PiperOrigin-RevId: 235993280 Change-Id: I60ed994f5ff18b2cbd79f063a7fdf15d093d845a --- pkg/sentry/socket/epsocket/BUILD | 1 + pkg/sentry/socket/epsocket/provider.go | 28 +++- pkg/tcpip/network/ip_test.go | 2 +- pkg/tcpip/network/ipv4/icmp.go | 11 +- pkg/tcpip/network/ipv4/ipv4.go | 8 +- pkg/tcpip/network/ipv6/icmp.go | 4 +- pkg/tcpip/network/ipv6/ipv6.go | 7 +- pkg/tcpip/stack/nic.go | 8 +- pkg/tcpip/stack/registration.go | 10 +- pkg/tcpip/stack/stack.go | 52 ++++++- pkg/tcpip/stack/stack_test.go | 2 +- pkg/tcpip/stack/transport_demuxer.go | 78 +++++++++-- pkg/tcpip/stack/transport_test.go | 6 +- pkg/tcpip/transport/icmp/endpoint.go | 81 ++++++++--- pkg/tcpip/transport/icmp/protocol.go | 15 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/protocol.go | 6 + pkg/tcpip/transport/udp/endpoint.go | 2 +- pkg/tcpip/transport/udp/protocol.go | 6 + test/syscalls/linux/BUILD | 16 +++ test/syscalls/linux/raw_socket_ipv4.cc | 245 +++++++++++++++++++++++++++++++++ 22 files changed, 533 insertions(+), 59 deletions(-) create mode 100644 test/syscalls/linux/raw_socket_ipv4.cc (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 45e418db3..44bb97b5b 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", 
"//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 0184d8e3e..0d9c2df24 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -18,8 +18,10 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserr" @@ -38,9 +40,9 @@ type provider struct { netProto tcpip.NetworkProtocolNumber } -// GetTransportProtocol figures out transport protocol. Currently only TCP, +// getTransportProtocol figures out transport protocol. Currently only TCP, // UDP, and ICMP are supported. -func GetTransportProtocol(stype transport.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { +func getTransportProtocol(ctx context.Context, stype transport.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { switch stype { case linux.SOCK_STREAM: if protocol != 0 && protocol != syscall.IPPROTO_TCP { @@ -57,6 +59,18 @@ func GetTransportProtocol(stype transport.SockType, protocol int) (tcpip.Transpo case syscall.IPPROTO_ICMPV6: return header.ICMPv6ProtocolNumber, nil } + + case linux.SOCK_RAW: + // Raw sockets require CAP_NET_RAW. 
+ creds := auth.CredentialsFromContext(ctx) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return 0, syserr.ErrPermissionDenied + } + + switch protocol { + case syscall.IPPROTO_ICMP: + return header.ICMPv4ProtocolNumber, nil + } } return 0, syserr.ErrInvalidArgument } @@ -76,14 +90,20 @@ func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int } // Figure out the transport protocol. - transProto, err := GetTransportProtocol(stype, protocol) + transProto, err := getTransportProtocol(t, stype, protocol) if err != nil { return nil, err } // Create the endpoint. + var ep tcpip.Endpoint + var e *tcpip.Error wq := &waiter.Queue{} - ep, e := eps.Stack.NewEndpoint(transProto, p.netProto, wq) + if stype == linux.SOCK_RAW { + ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq) + } else { + ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) + } if e != nil { return nil, syserr.TranslateNetstackError(e) } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 5c1e88e56..97a43aece 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -94,7 +94,7 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff // DeliverTransportPacket is called by network endpoints after parsing incoming // packets. This is used by the test object to verify that the results of the // parsing are expected. 
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView) { +func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) { t.checkValues(protocol, vv, r.RemoteAddress, r.LocalAddress) t.dataCalls++ } diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index f82dc098f..ea8392c98 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -55,7 +55,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer. e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv) } -func (e *endpoint) handleICMP(r *stack.Route, vv buffer.VectorisedView) { +func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) { v := vv.First() if len(v) < header.ICMPv4MinimumSize { return @@ -67,19 +67,22 @@ func (e *endpoint) handleICMP(r *stack.Route, vv buffer.VectorisedView) { if len(v) < header.ICMPv4EchoMinimumSize { return } - vv.TrimFront(header.ICMPv4MinimumSize) - req := echoRequest{r: r.Clone(), v: vv.ToView()} + echoPayload := vv.ToView() + echoPayload.TrimFront(header.ICMPv4MinimumSize) + req := echoRequest{r: r.Clone(), v: echoPayload} select { case e.echoRequests <- req: default: req.r.Release() } + // It's possible that a raw socket expects to receive this. 
+ e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv) case header.ICMPv4EchoReply: if len(v) < header.ICMPv4EchoMinimumSize { return } - e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, vv) + e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv) case header.ICMPv4DstUnreachable: if len(v) < header.ICMPv4DstUnreachableMinimumSize { diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 0c41519df..bfc3c08fa 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -131,7 +131,8 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload b // HandlePacket is called by the link layer when new ipv4 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { - h := header.IPv4(vv.First()) + headerView := vv.First() + h := header.IPv4(headerView) if !h.IsValid(vv.Size()) { return } @@ -153,11 +154,12 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { } p := h.TransportProtocol() if p == header.ICMPv4ProtocolNumber { - e.handleICMP(r, vv) + headerView.CapLength(hlen) + e.handleICMP(r, headerView, vv) return } r.Stats().IP.PacketsDelivered.Increment() - e.dispatcher.DeliverTransportPacket(r, p, vv) + e.dispatcher.DeliverTransportPacket(r, p, headerView, vv) } // Close cleans up resources associated with the endpoint. diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 14107443b..5a3c17768 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -62,7 +62,7 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer. 
e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv) } -func (e *endpoint) handleICMP(r *stack.Route, vv buffer.VectorisedView) { +func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) { v := vv.First() if len(v) < header.ICMPv6MinimumSize { return @@ -148,7 +148,7 @@ func (e *endpoint) handleICMP(r *stack.Route, vv buffer.VectorisedView) { if len(v) < header.ICMPv6EchoMinimumSize { return } - e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, vv) + e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, netHeader, vv) } } diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 4d0b6ee9c..5f68ef7d5 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -102,7 +102,8 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload b // HandlePacket is called by the link layer when new ipv6 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { - h := header.IPv6(vv.First()) + headerView := vv.First() + h := header.IPv6(headerView) if !h.IsValid(vv.Size()) { return } @@ -112,12 +113,12 @@ func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { p := h.TransportProtocol() if p == header.ICMPv6ProtocolNumber { - e.handleICMP(r, vv) + e.handleICMP(r, headerView, vv) return } r.Stats().IP.PacketsDelivered.Increment() - e.dispatcher.DeliverTransportPacket(r, p, vv) + e.dispatcher.DeliverTransportPacket(r, p, headerView, vv) } // Close cleans up resources associated with the endpoint. 
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 2278fbf65..79f845225 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -505,7 +505,7 @@ func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *r // DeliverTransportPacket delivers the packets to the appropriate transport // protocol endpoint. -func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView) { +func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) { state, ok := n.stack.transportProtocols[protocol] if !ok { n.stack.stats.UnknownProtocolRcvdPackets.Increment() @@ -525,16 +525,16 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN } id := TransportEndpointID{dstPort, r.LocalAddress, srcPort, r.RemoteAddress} - if n.demux.deliverPacket(r, protocol, vv, id) { + if n.demux.deliverPacket(r, protocol, netHeader, vv, id) { return } - if n.stack.demux.deliverPacket(r, protocol, vv, id) { + if n.stack.demux.deliverPacket(r, protocol, netHeader, vv, id) { return } // Try to deliver to per-stack default handler. if state.defaultHandler != nil { - if state.defaultHandler(r, id, vv) { + if state.defaultHandler(r, id, netHeader, vv) { return } } diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 5accffa1b..62acd5919 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -64,7 +64,7 @@ const ( type TransportEndpoint interface { // HandlePacket is called by the stack when new packets arrive to // this transport endpoint. - HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) + HandlePacket(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) // HandleControlPacket is called by the stack when new control (e.g., // ICMP) packets arrive to this transport endpoint. 
@@ -80,6 +80,9 @@ type TransportProtocol interface { // NewEndpoint creates a new endpoint of the transport protocol. NewEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) + // NewRawEndpoint creates a new raw endpoint of the transport protocol. + NewRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) + // MinimumPacketSize returns the minimum valid packet size of this // transport protocol. The stack automatically drops any packets smaller // than this targeted at this protocol. @@ -113,8 +116,9 @@ type TransportProtocol interface { // the network layer. type TransportDispatcher interface { // DeliverTransportPacket delivers packets to the appropriate - // transport protocol endpoint. - DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView) + // transport protocol endpoint. It also returns the network layer + // header for the enpoint to inspect or pass up the stack. + DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) // DeliverTransportControlPacket delivers control packets to the // appropriate transport protocol endpoint. diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 252c79317..797489ad9 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -48,7 +48,7 @@ const ( type transportProtocolState struct { proto TransportProtocol - defaultHandler func(*Route, TransportEndpointID, buffer.VectorisedView) bool + defaultHandler func(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool } // TCPProbeFunc is the expected function type for a TCP probe function to be @@ -437,7 +437,7 @@ func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, // // It must be called only during initialization of the stack. 
Changing it as the // stack is operating is not supported. -func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, buffer.VectorisedView) bool) { +func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, buffer.View, buffer.VectorisedView) bool) { state := s.transportProtocols[p] if state != nil { state.defaultHandler = h @@ -499,6 +499,18 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp return t.proto.NewEndpoint(s, network, waiterQueue) } +// NewRawEndpoint creates a new raw transport layer endpoint of the given +// protocol. Raw endpoints receive all traffic for a given protocol regardless +// of address. +func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + t, ok := s.transportProtocols[transport] + if !ok { + return nil, tcpip.ErrUnknownProtocol + } + + return t.proto.NewRawEndpoint(s, network, waiterQueue) +} + // createNIC creates a NIC with the provided id and link-layer endpoint, and // optionally enable it. func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, enabled bool) *tcpip.Error { @@ -934,6 +946,42 @@ func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip } } +// RegisterRawTransportEndpoint registers the given endpoint with the stack +// transport dispatcher. Received packets that match the provided protocol will +// be delivered to the given endpoint. 
+func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint, reusePort bool) *tcpip.Error { + if nicID == 0 { + return s.demux.registerRawEndpoint(netProtos, protocol, ep, reusePort) + } + + s.mu.RLock() + defer s.mu.RUnlock() + + nic := s.nics[nicID] + if nic == nil { + return tcpip.ErrUnknownNICID + } + + return nic.demux.registerRawEndpoint(netProtos, protocol, ep, reusePort) +} + +// UnregisterRawTransportEndpoint removes the endpoint for the protocol from +// the stack transport dispatcher. +func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint) { + if nicID == 0 { + s.demux.unregisterRawEndpoint(netProtos, protocol, ep) + return + } + + s.mu.RLock() + defer s.mu.RUnlock() + + nic := s.nics[nicID] + if nic != nil { + nic.demux.unregisterRawEndpoint(netProtos, protocol, ep) + } +} + // NetworkProtocolInstance returns the protocol instance in the stack for the // specified network protocol. This method is public for protocol implementers // and tests to use. diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 163fadded..28743f3d5 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -97,7 +97,7 @@ func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedV } // Dispatch the packet to the transport protocol. 
- f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), vv) + f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), buffer.View([]byte{}), vv) } func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 { diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index c18208dc0..9ab314188 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -32,8 +32,12 @@ type protocolIDs struct { // transportEndpoints manages all endpoints of a given protocol. It has its own // mutex so as to reduce interference between protocols. type transportEndpoints struct { + // mu protects all fields of the transportEndpoints. mu sync.RWMutex endpoints map[TransportEndpointID]TransportEndpoint + // rawEndpoints contains endpoints for raw sockets, which receive all + // traffic of a given protocol regardless of port. + rawEndpoints []TransportEndpoint } // unregisterEndpoint unregisters the endpoint with the given id such that it @@ -67,7 +71,9 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer { // Add each network and transport pair to the demuxer. for netProto := range stack.networkProtocols { for proto := range stack.transportProtocols { - d.protocol[protocolIDs{netProto, proto}] = &transportEndpoints{endpoints: make(map[TransportEndpointID]TransportEndpoint)} + d.protocol[protocolIDs{netProto, proto}] = &transportEndpoints{ + endpoints: make(map[TransportEndpointID]TransportEndpoint), + } } } @@ -131,22 +137,22 @@ func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID) TransportEnd // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. 
-func (ep *multiPortEndpoint) HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) { +func (ep *multiPortEndpoint) HandlePacket(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) { // If this is a broadcast datagram, deliver the datagram to all endpoints // managed by ep. if id.LocalAddress == header.IPv4Broadcast { for i, endpoint := range ep.endpointsArr { // HandlePacket modifies vv, so each endpoint needs its own copy. if i == len(ep.endpointsArr)-1 { - endpoint.HandlePacket(r, id, vv) + endpoint.HandlePacket(r, id, netHeader, vv) break } vvCopy := buffer.NewView(vv.Size()) copy(vvCopy, vv.ToView()) - endpoint.HandlePacket(r, id, vvCopy.ToVectorisedView()) + endpoint.HandlePacket(r, id, buffer.NewViewFromBytes(netHeader), vvCopy.ToVectorisedView()) } } else { - ep.selectEndpoint(id).HandlePacket(r, id, vv) + ep.selectEndpoint(id).HandlePacket(r, id, netHeader, vv) } } @@ -250,7 +256,7 @@ var loopbackSubnet = func() tcpip.Subnet { // deliverPacket attempts to find one or more matching transport endpoints, and // then, if matches are found, delivers the packet to them. Returns true if it // found one or more endpoints, false otherwise. -func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView, id TransportEndpointID) bool { +func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView, id TransportEndpointID) bool { eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}] if !ok { return false @@ -276,10 +282,21 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto } else if ep := d.findEndpointLocked(eps, vv, id); ep != nil { destEps = append(destEps, ep) } + + // As in net/ipv4/ip_input.c:ip_local_deliver, attempt to deliver via + // raw endpoint first. If there are multipe raw endpoints, they all + // receive the packet. 
+ found := false + for _, rawEP := range eps.rawEndpoints { + // Each endpoint gets its own copy of the packet for the sake + // of save/restore. + rawEP.HandlePacket(r, id, buffer.NewViewFromBytes(netHeader), vv.ToView().ToVectorisedView()) + found = true + } eps.mu.RUnlock() // Fail if we didn't find at least one matching transport endpoint. - if len(destEps) == 0 { + if len(destEps) == 0 && !found { // UDP packet could not be delivered to an unknown destination port. if protocol == header.UDPProtocolNumber { r.Stats().UDP.UnknownPortErrors.Increment() @@ -289,7 +306,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto // Deliver the packet. for _, ep := range destEps { - ep.HandlePacket(r, id, vv) + ep.HandlePacket(r, id, netHeader, vv) } return true @@ -349,3 +366,48 @@ func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer return nil } + +// registerRawEndpoint registers the given endpoint with the dispatcher such +// that packets of the appropriate protocol are delivered to it. A single +// packet can be sent to one or more raw endpoints along with a non-raw +// endpoint. 
+func (d *transportDemuxer) registerRawEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint, reusePort bool) *tcpip.Error { + for i, n := range netProtos { + if err := d.singleRegisterRawEndpoint(n, protocol, ep); err != nil { + d.unregisterRawEndpoint(netProtos[:i], protocol, ep) + return err + } + } + + return nil +} + +func (d *transportDemuxer) singleRegisterRawEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint) *tcpip.Error { + eps, ok := d.protocol[protocolIDs{netProto, protocol}] + if !ok { + return nil + } + + eps.mu.Lock() + defer eps.mu.Unlock() + eps.rawEndpoints = append(eps.rawEndpoints, ep) + + return nil +} + +// unregisterRawEndpoint unregisters the raw endpoint for the given protocol +// such that it won't receive any more packets. +func (d *transportDemuxer) unregisterRawEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, ep TransportEndpoint) { + for _, n := range netProtos { + if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok { + eps.mu.Lock() + defer eps.mu.Unlock() + for i, rawEP := range eps.rawEndpoints { + if rawEP == ep { + eps.rawEndpoints = append(eps.rawEndpoints[:i], eps.rawEndpoints[i+1:]...) + return + } + } + } + } +} diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index da460db77..3347b5599 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -168,7 +168,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro return tcpip.FullAddress{}, nil } -func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ buffer.VectorisedView) { +func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ buffer.View, _ buffer.VectorisedView) { // Increment the number of received packets. 
f.proto.packetCount++ if f.acceptQueue != nil { @@ -214,6 +214,10 @@ func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.N return newFakeTransportEndpoint(stack, f, netProto), nil } +func (f *fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return nil, tcpip.ErrUnknownProtocol +} + func (*fakeTransportProtocol) MinimumPacketSize() int { return fakeTransHeaderLen } diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index d87bfe048..b3b7a1d0e 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -46,17 +46,23 @@ const ( stateClosed ) -// endpoint represents an ICMP (ping) endpoint. This struct serves as the -// interface between users of the endpoint and the protocol implementation; it -// is legal to have concurrent goroutines make calls into the endpoint, they -// are properly synchronized. +// endpoint represents an ICMP endpoint. This struct serves as the interface +// between users of the endpoint and the protocol implementation; it is legal to +// have concurrent goroutines make calls into the endpoint, they are properly +// synchronized. +// +// +stateify savable type endpoint struct { - // The following fields are initialized at creation time and do not - // change throughout the lifetime of the endpoint. + // The following fields are initialized at creation time and are + // immutable. stack *stack.Stack `state:"manual"` netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber waiterQueue *waiter.Queue + // raw indicates whether the endpoint is intended for use by a raw + // socket, which returns the network layer header along with the + // payload. It is immutable. + raw bool // The following fields are used to manage the receive queue, and are // protected by rcvMu. 
@@ -80,15 +86,26 @@ type endpoint struct { route stack.Route `state:"manual"` } -func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) *endpoint { - return &endpoint{ +func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, raw bool) (*endpoint, *tcpip.Error) { + e := &endpoint{ stack: stack, netProto: netProto, transProto: transProto, waiterQueue: waiterQueue, rcvBufSizeMax: 32 * 1024, sndBufSize: 32 * 1024, + raw: raw, + } + + // Raw endpoints must be immediately bound because they receive all + // ICMP traffic starting from when they're created via socket(). + if raw { + if err := e.bindLocked(tcpip.FullAddress{}, nil); err != nil { + return nil, err + } } + + return e, nil } // Close puts the endpoint in a closed state and frees all resources @@ -98,7 +115,11 @@ func (e *endpoint) Close() { e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite switch e.state { case stateBound, stateConnected: - e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e) + if e.raw { + e.stack.UnregisterRawTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e) + } else { + e.stack.UnregisterTransportEndpoint(e.regNICID, []tcpip.NetworkProtocolNumber{e.netProto}, e.transProto, e.id, e) + } } // Close the receive list and drain it. 
@@ -285,10 +306,10 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c switch e.netProto { case header.IPv4ProtocolNumber: - err = sendPing4(route, e.id.LocalPort, v) + err = e.send4(route, v) case header.IPv6ProtocolNumber: - err = sendPing6(route, e.id.LocalPort, v) + err = send6(route, e.id.LocalPort, v) } if err != nil { @@ -346,13 +367,19 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { } } -func sendPing4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { +func (e *endpoint) send4(r *stack.Route, data buffer.View) *tcpip.Error { + if e.raw { + hdr := buffer.NewPrependable(len(data) + int(r.MaxHeaderLength())) + return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) + } + if len(data) < header.ICMPv4EchoMinimumSize { return tcpip.ErrInvalidEndpointState } - // Set the ident. Sequence number is provided by the user. - binary.BigEndian.PutUint16(data[header.ICMPv4MinimumSize:], ident) + // Set the ident to the user-specified port. Sequence number should + // already be set by the user. + binary.BigEndian.PutUint16(data[header.ICMPv4MinimumSize:], e.id.LocalPort) hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) @@ -371,7 +398,7 @@ func sendPing4(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) } -func sendPing6(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { +func send6(r *stack.Route, ident uint16, data buffer.View) *tcpip.Error { if len(data) < header.ICMPv6EchoMinimumSize { return tcpip.ErrInvalidEndpointState } @@ -412,6 +439,11 @@ func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (t // Connect connects the endpoint to its peer. Specifying a NIC is optional. 
func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { + // TODO: We don't yet support connect on a raw socket. + if e.raw { + return tcpip.ErrNotSupported + } + e.mu.Lock() defer e.mu.Unlock() @@ -515,6 +547,11 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { } func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { + if e.raw { + err := e.stack.RegisterRawTransportEndpoint(nicid, netProtos, e.transProto, e, false) + return stack.TransportEndpointID{}, err + } + if id.LocalPort != 0 { // The endpoint already has a local port, just attempt to // register it. @@ -657,7 +694,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) { e.rcvMu.Lock() // Drop the packet if our buffer is currently full. 
@@ -675,9 +712,17 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv Addr: id.RemoteAddress, }, } - pkt.data = vv.Clone(pkt.views[:]) + + if e.raw { + combinedVV := netHeader.ToVectorisedView() + combinedVV.Append(vv) + pkt.data = combinedVV.Clone(pkt.views[:]) + } else { + pkt.data = vv.Clone(pkt.views[:]) + } + e.rcvList.PushBack(pkt) - e.rcvBufSize += vv.Size() + e.rcvBufSize += pkt.data.Size() pkt.timestamp = e.stack.NowNanoseconds() diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index 9f0a2bf71..36b70988a 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -47,6 +47,7 @@ const ( ProtocolNumber6 = header.ICMPv6ProtocolNumber ) +// protocol implements stack.TransportProtocol. type protocol struct { number tcpip.TransportProtocolNumber } @@ -66,12 +67,22 @@ func (p *protocol) netProto() tcpip.NetworkProtocolNumber { panic(fmt.Sprint("unknown protocol number: ", p.number)) } -// NewEndpoint creates a new icmp endpoint. +// NewEndpoint creates a new icmp endpoint. It implements +// stack.TransportProtocol.NewEndpoint. func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { if netProto != p.netProto() { return nil, tcpip.ErrUnknownProtocol } - return newEndpoint(stack, netProto, p.number, waiterQueue), nil + return newEndpoint(stack, netProto, p.number, waiterQueue, false) +} + +// NewRawEndpoint creates a new raw icmp endpoint. It implements +// stack.TransportProtocol.NewRawEndpoint. +func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if netProto != p.netProto() { + return nil, tcpip.ErrUnknownProtocol + } + return newEndpoint(stack, netProto, p.number, waiterQueue, true) } // MinimumPacketSize returns the minimum valid icmp packet size. 
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index a8618bb4a..c48a27d8f 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1441,7 +1441,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) { s := newSegment(r, id, vv) if !s.parse() { e.stack.Stats().MalformedRcvdPackets.Increment() diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 2f90839e9..ca53a076f 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -63,7 +63,7 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward // // This function is expected to be passed as an argument to the // stack.SetTransportProtocolHandler function. -func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) bool { +func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool { s := newSegment(r, id, vv) defer s.decRef() diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 753e1419e..639ad3fae 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -101,6 +101,12 @@ func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolN return newEndpoint(stack, netProto, waiterQueue), nil } +// NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently +// unsupported. It implements stack.TransportProtocol.NewRawEndpoint. 
+func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return nil, tcpip.ErrUnknownProtocol +} + // MinimumPacketSize returns the minimum valid tcp packet size. func (*protocol) MinimumPacketSize() int { return header.TCPMinimumSize diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 05d35e526..44b9cdf6a 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -934,7 +934,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) { // Get the header then trim it from the view. hdr := header.UDP(vv.First()) if int(hdr.Length()) > vv.Size() { diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index b3fbed6e4..616a9f388 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -48,6 +48,12 @@ func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolN return newEndpoint(stack, netProto, waiterQueue), nil } +// NewRawEndpoint creates a new raw UDP endpoint. Raw UDP sockets are currently +// unsupported. It implements stack.TransportProtocol.NewRawEndpoint. +func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return nil, tcpip.ErrUnknownProtocol +} + // MinimumPacketSize returns the minimum valid udp packet size. 
func (*protocol) MinimumPacketSize() int { return header.UDPMinimumSize diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index beece8930..4c818238b 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1529,6 +1529,22 @@ cc_binary( ], ) +cc_binary( + name = "raw_socket_ipv4_test", + testonly = 1, + srcs = ["raw_socket_ipv4.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/base:core_headers", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "read_test", testonly = 1, diff --git a/test/syscalls/linux/raw_socket_ipv4.cc b/test/syscalls/linux/raw_socket_ipv4.cc new file mode 100644 index 000000000..c6749321c --- /dev/null +++ b/test/syscalls/linux/raw_socket_ipv4.cc @@ -0,0 +1,245 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Fixture for tests parameterized by address family (currently only AF_INET). +class RawSocketTest : public ::testing::Test { + protected: + // Creates a socket to be used in tests. 
+ void SetUp() override; + + // Closes the socket created by SetUp(). + void TearDown() override; + + // The socket used for both reading and writing. + int s_; + + // The loopback address. + struct sockaddr_in addr_; + + void sendEmptyICMP(struct icmphdr *icmp); + + void sendEmptyICMPTo(int sock, struct sockaddr_in *addr, + struct icmphdr *icmp); + + void receiveICMP(char *recv_buf, size_t recv_buf_len, size_t expected_size, + struct sockaddr_in *src); + + void receiveICMPFrom(char *recv_buf, size_t recv_buf_len, + size_t expected_size, struct sockaddr_in *src, int sock); +}; + +void RawSocketTest::SetUp() { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds()); + + addr_ = {}; + + // We don't set ports because raw sockets don't have a notion of ports. + addr_.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr_.sin_family = AF_INET; +} + +void RawSocketTest::TearDown() { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + EXPECT_THAT(close(s_), SyscallSucceeds()); +} + +// We should be able to create multiple raw sockets for the same protocol. +// BasicRawSocket::Setup creates the first one, so we only have to create one +// more here. +TEST_F(RawSocketTest, MultipleCreation) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int s2; + ASSERT_THAT(s2 = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds()); + + ASSERT_THAT(close(s2), SyscallSucceeds()); +} + +// Send and receive an ICMP packet. +TEST_F(RawSocketTest, SendAndReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. 
+ struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = *(unsigned short *)&icmp.checksum; + icmp.un.echo.sequence = *(unsigned short *)&icmp.un.echo.sequence; + icmp.un.echo.id = *(unsigned short *)&icmp.un.echo.id; + ASSERT_NO_FATAL_FAILURE(sendEmptyICMP(&icmp)); + + // Receive the packet and make sure it's identical. + char recv_buf[512]; + struct sockaddr_in src; + ASSERT_NO_FATAL_FAILURE(receiveICMP(recv_buf, ABSL_ARRAYSIZE(recv_buf), + sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(sockaddr_in)), 0); + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)), 0); + + // We should also receive the automatically generated echo reply. + ASSERT_NO_FATAL_FAILURE(receiveICMP(recv_buf, ABSL_ARRAYSIZE(recv_buf), + sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(sockaddr_in)), 0); + struct icmphdr *reply_icmp = + (struct icmphdr *)(recv_buf + sizeof(struct iphdr)); + // Most fields should be the same. + EXPECT_EQ(reply_icmp->code, icmp.code); + EXPECT_EQ(reply_icmp->un.echo.sequence, icmp.un.echo.sequence); + EXPECT_EQ(reply_icmp->un.echo.id, icmp.un.echo.id); + // A couple are different. + EXPECT_EQ(reply_icmp->type, ICMP_ECHOREPLY); + // The checksum is computed in such a way that it is guaranteed to have + // changed. + EXPECT_NE(reply_icmp->checksum, icmp.checksum); +} + +// We should be able to create multiple raw sockets for the same protocol and +// receive the same packet on both. +TEST_F(RawSocketTest, MultipleSocketReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + FileDescriptor s2 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_ICMP)); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. 
+ struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = *(unsigned short *)&icmp.checksum; + icmp.un.echo.sequence = *(unsigned short *)&icmp.un.echo.sequence; + icmp.un.echo.id = *(unsigned short *)&icmp.un.echo.id; + ASSERT_NO_FATAL_FAILURE(sendEmptyICMP(&icmp)); + + // Receive it on socket 1. + char recv_buf1[512]; + struct sockaddr_in src; + ASSERT_NO_FATAL_FAILURE(receiveICMP(recv_buf1, ABSL_ARRAYSIZE(recv_buf1), + sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(sockaddr_in)), 0); + + // Receive it on socket 2. + char recv_buf2[512]; + ASSERT_NO_FATAL_FAILURE(receiveICMPFrom(recv_buf2, ABSL_ARRAYSIZE(recv_buf2), + sizeof(struct icmphdr), &src, + s2.get())); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(sockaddr_in)), 0); + + EXPECT_EQ(memcmp(recv_buf1 + sizeof(struct iphdr), + recv_buf2 + sizeof(struct iphdr), sizeof(icmp)), + 0); +} + +// A raw ICMP socket and ping socket should both receive the ICMP packets +// indended for the ping socket. +TEST_F(RawSocketTest, RawAndPingSockets) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + FileDescriptor ping_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP)); + + // Ping sockets take care of the ICMP ID and checksum. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.un.echo.sequence = *(unsigned short *)&icmp.un.echo.sequence; + ASSERT_THAT(RetryEINTR(sendto)(ping_sock.get(), &icmp, sizeof(icmp), 0, + (struct sockaddr *)&addr_, sizeof(addr_)), + SyscallSucceedsWithValue(sizeof(icmp))); + + // Receive the packet via raw socket. + char recv_buf[512]; + struct sockaddr_in src; + ASSERT_NO_FATAL_FAILURE(receiveICMP(recv_buf, ABSL_ARRAYSIZE(recv_buf), + sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(sockaddr_in)), 0); + + // Receive the packet via ping socket. 
+ struct icmphdr ping_header; + ASSERT_THAT( + RetryEINTR(recv)(ping_sock.get(), &ping_header, sizeof(ping_header), 0), + SyscallSucceedsWithValue(sizeof(ping_header))); + + // Packets should be the same. + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &ping_header, + sizeof(struct icmphdr)), + 0); +} + +void RawSocketTest::sendEmptyICMP(struct icmphdr *icmp) { + ASSERT_NO_FATAL_FAILURE(sendEmptyICMPTo(s_, &addr_, icmp)); +} + +void RawSocketTest::sendEmptyICMPTo(int sock, struct sockaddr_in *addr, + struct icmphdr *icmp) { + struct iovec iov = {.iov_base = icmp, .iov_len = sizeof(*icmp)}; + struct msghdr msg { + .msg_name = addr, .msg_namelen = sizeof(*addr), .msg_iov = &iov, + .msg_iovlen = 1, .msg_control = NULL, .msg_controllen = 0, .msg_flags = 0, + }; + ASSERT_THAT(sendmsg(sock, &msg, 0), SyscallSucceedsWithValue(sizeof(*icmp))); +} + +void RawSocketTest::receiveICMP(char *recv_buf, size_t recv_buf_len, + size_t expected_size, struct sockaddr_in *src) { + ASSERT_NO_FATAL_FAILURE( + receiveICMPFrom(recv_buf, recv_buf_len, expected_size, src, s_)); +} + +void RawSocketTest::receiveICMPFrom(char *recv_buf, size_t recv_buf_len, + size_t expected_size, + struct sockaddr_in *src, int sock) { + struct iovec iov = {.iov_base = recv_buf, .iov_len = recv_buf_len}; + struct msghdr msg = { + .msg_name = src, + .msg_namelen = sizeof(*src), + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0, + }; + // We should receive the ICMP packet plus 20 bytes of IP header. + ASSERT_THAT(recvmsg(sock, &msg, 0), + SyscallSucceedsWithValue(expected_size + sizeof(struct iphdr))); +} + +} // namespace + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From 05d721f9eec3ad0a430906b968a2876bf37c44a7 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 28 Feb 2019 13:13:38 -0800 Subject: Hold dataMu for writing in CachingInodeOperations.WriteOut. fsutil.SyncDirtyAll mutates the DirtySet. 
PiperOrigin-RevId: 236183349 Change-Id: I7e809d5b406ac843407e61eff17d81259a819b4f --- pkg/sentry/fs/fsutil/inode_cached.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 5e7e861d2..e3b52e943 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -322,9 +322,9 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) c.attrMu.Lock() // Write dirty pages back. - c.dataMu.RLock() + c.dataMu.Lock() err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt) - c.dataMu.RUnlock() + c.dataMu.Unlock() if err != nil { c.attrMu.Unlock() return err -- cgit v1.2.3 From f7df9d72cf1d10922ff5a55ef5664b4325439ef5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 28 Feb 2019 16:25:14 -0800 Subject: Upgrade to Go 1.12 PiperOrigin-RevId: 236218980 Change-Id: I82cb4aeb2a56524ee1324bfea2ad41dce26db354 --- WORKSPACE | 10 +++++----- pkg/sentry/platform/procid/procid_amd64.s | 2 +- pkg/sentry/platform/procid/procid_arm64.s | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/WORKSPACE b/WORKSPACE index 47f3e85ba..dee249a79 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -2,18 +2,18 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( name = "io_bazel_rules_go", - urls = ["https://github.com/bazelbuild/rules_go/releases/download/0.17.0/rules_go-0.17.0.tar.gz"], - sha256 = "492c3ac68ed9dcf527a07e6a1b2dcbf199c6bf8b35517951467ac32e421c06c1", + urls = ["https://github.com/bazelbuild/rules_go/releases/download/0.17.1/rules_go-0.17.1.tar.gz"], + sha256 = "6776d68ebb897625dead17ae510eac3d5f6342367327875210df44dbe2aeeb19", ) http_archive( name = "bazel_gazelle", - url = "https://github.com/bazelbuild/bazel-gazelle/releases/download/0.16.0/bazel-gazelle-0.16.0.tar.gz", - 
sha256 = "7949fc6cc17b5b191103e97481cf8889217263acf52e00b560683413af204fcb", + url = "https://github.com/bazelbuild/bazel-gazelle/releases/download/0.17.0/bazel-gazelle-0.17.0.tar.gz", + sha256 = "3c681998538231a2d24d0c07ed5a7658cb72bfb5fd4bf9911157c0e9ac6a2687", ) load("@io_bazel_rules_go//go:deps.bzl", "go_rules_dependencies", "go_register_toolchains") go_rules_dependencies() -go_register_toolchains(go_version="1.11.5") +go_register_toolchains(go_version="1.12") load("@bazel_gazelle//:deps.bzl", "gazelle_dependencies", "go_repository") gazelle_dependencies() diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s index fd88ce82e..ef3439c03 100644 --- a/pkg/sentry/platform/procid/procid_amd64.s +++ b/pkg/sentry/platform/procid/procid_amd64.s @@ -14,7 +14,7 @@ // +build amd64 // +build go1.8 -// +build !go1.12 +// +build !go1.13 #include "textflag.h" diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/sentry/platform/procid/procid_arm64.s index be65d0db0..02e907b6b 100644 --- a/pkg/sentry/platform/procid/procid_arm64.s +++ b/pkg/sentry/platform/procid/procid_arm64.s @@ -14,7 +14,7 @@ // +build arm64 // +build go1.8 -// +build !go1.12 +// +build !go1.13 #include "textflag.h" -- cgit v1.2.3 From 3851705a73235baa6d153970c95921d17a39d77a Mon Sep 17 00:00:00 2001 From: Ruidong Cao Date: Thu, 28 Feb 2019 16:43:59 -0800 Subject: Fix procfs bugs Current procfs has some bugs. After executing ls twice, many dirs come out with same name like "1" or ".". Files like "cpuinfo" disappear. Here variable names is a slice with cap() > len(). Sort after appending to it will not alloc a new space and impact orignal slice. Same to m. 
Signed-off-by: Ruidong Cao Change-Id: I83e5cd1c7968c6fe28c35ea4fee497488d4f9eef PiperOrigin-RevId: 236222270 --- pkg/sentry/fs/dentry.go | 3 ++- pkg/sentry/fs/ramfs/dir.go | 13 ++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index ef6d1a870..4879df4d6 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -185,7 +185,8 @@ func NewSortedDentryMap(entries map[string]DentAttr) *SortedDentryMap { return s } -// GetAll returns all names and entries in s. +// GetAll returns all names and entries in s. Callers should not modify the +// returned values. func (s *SortedDentryMap) GetAll() ([]string, map[string]DentAttr) { return s.names, s.entries } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 696825eb5..4da876ebd 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -148,7 +148,18 @@ func (d *Dir) FindChild(name string) (*fs.Inode, bool) { func (d *Dir) Children() ([]string, map[string]fs.DentAttr) { d.mu.Lock() defer d.mu.Unlock() - return d.dentryMap.GetAll() + + // Return a copy to prevent callers from modifying our children. + names, entries := d.dentryMap.GetAll() + namesCopy := make([]string, len(names)) + copy(namesCopy, names) + + entriesCopy := make(map[string]fs.DentAttr) + for k, v := range entries { + entriesCopy[k] = v + } + + return namesCopy, entriesCopy } // removeChildLocked attempts to remove an entry from this directory. -- cgit v1.2.3 From 3b44377eda93137212e6e437b62dcb216566b858 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 28 Feb 2019 18:37:34 -0800 Subject: Fix "-c dbg" build break Remove allocation from vCPU.die() to save stack space. 
Closes #131 PiperOrigin-RevId: 236238102 Change-Id: Iafca27a1a3a472d4cb11dcda9a2060e585139d11 --- kokoro/run_tests.sh | 9 +++++++-- pkg/sentry/platform/kvm/bluepill.go | 9 ++++----- pkg/sentry/platform/kvm/machine.go | 12 ++++++++++-- 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'pkg/sentry') diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 648e72a90..4fcaed238 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -71,11 +71,13 @@ BAZEL_BUILD_RBE_FLAGS=( #################### build_everything() { + FLAVOR="${1}" + cd ${WORKSPACE_DIR} bazel \ "${BAZEL_RBE_FLAGS[@]}" \ build \ - "${BAZEL_BUILD_RBE_FLAGS[@]}" \ + -c "${FLAVOR}" "${BAZEL_BUILD_RBE_FLAGS[@]}" \ "${BUILD_PACKAGES[@]}" } @@ -217,7 +219,7 @@ main() { trap finish EXIT # Build and run the simple tests. - build_everything + build_everything opt run_simple_tests # So far so good. Install more deps and run the integration tests. @@ -228,6 +230,9 @@ main() { run_syscall_tests + # Build other flavors too. + build_everything dbg + # No need to call "finish" here, it will happen at exit. } diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index d98ec8377..f24f1c662 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -49,7 +49,7 @@ var ( // //go:nosplit func dieHandler(c *vCPU) { - throw(c.dieMessage) + throw(c.dieState.message) } // die is called to set the vCPU up to panic. @@ -59,17 +59,16 @@ func dieHandler(c *vCPU) { //go:nosplit func (c *vCPU) die(context *arch.SignalContext64, msg string) { // Save the death message, which will be thrown. - c.dieMessage = msg + c.dieState.message = msg // Reload all registers to have an accurate stack trace when we return // to host mode. This means that the stack should be unwound correctly. 
- var guestRegs userRegs - if errno := c.getUserRegisters(&guestRegs); errno != 0 { + if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 { throw(msg) } // Setup the trampoline. - dieArchSetup(c, context, &guestRegs) + dieArchSetup(c, context, &c.dieState.guestRegs) } func init() { diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index deead1b5f..b8b3c9a4a 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -121,8 +121,16 @@ type vCPU struct { // vCPUArchState is the architecture-specific state. vCPUArchState - // dieMessage is thrown from die. - dieMessage string + dieState dieState +} + +type dieState struct { + // message is thrown from die. + message string + + // guestRegs is used to store register state during vCPU.die() to prevent + // allocation inside nosplit function. + guestRegs userRegs } // newVCPU creates a returns a new vCPU. -- cgit v1.2.3 From 7693b7469f7464a88c1eb62f9479b73d5cee3921 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 1 Mar 2019 10:45:04 -0800 Subject: Format capget/capset arguments I0225 15:32:10.795034 4166 x:0] [ 6] E capget(0x7f477fdff8c8 {Version: 3, Pid: 0}, 0x7f477fdff8b0) I0225 15:32:10.795059 4166 x:0] [ 6] X capget(0x7f477fdff8c8 {Version: 3, Pid: 0}, 0x7f477fdff8b0 {Permitted: CAP_CHOWN|CAP_DAC_OVERRIDE|CAP_DAC_READ_SEARCH|CAP_FOWNER|CAP_FSETID|CAP_KILL|CAP_SETGID|CAP_SETUID|CAP_SETPCAP|CAP_LINUX_IMMUTABLE|CAP_NET_BIND_SERVICE|CAP_NET_BROADCAST|CAP_NET_ADMIN|CAP_NET_RAW|CAP_IPC_LOCK|CAP_IPC_OWNER|CAP_SYS_MODULE|CAP_SYS_RAWIO|CAP_SYS_CHROOT|CAP_SYS_PTRACE|CAP_SYS_PACCT|CAP_SYS_ADMIN|CAP_SYS_BOOT|CAP_SYS_NICE|CAP_SYS_RESOURCE|CAP_SYS_TIME|CAP_SYS_TTY_CONFIG|CAP_MKNOD|CAP_LEASE|CAP_AUDIT_WRITE|CAP_AUDIT_CONTROL|CAP_SETFCAP|CAP_MAC_OVERRIDE|CAP_MAC_ADMIN|CAP_SYSLOG|CAP_WAKE_ALARM|CAP_BLOCK_SUSPEND|CAP_AUDIT_READ, Inheritable: 
CAP_CHOWN|CAP_DAC_OVERRIDE|CAP_DAC_READ_SEARCH|CAP_FOWNER|CAP_FSETID|CAP_KILL|CAP_SETGID|CAP_SETUID|CAP_SETPCAP|CAP_LINUX_IMMUTABLE|CAP_NET_BIND_SERVICE|CAP_NET_BROADCAST|CAP_NET_ADMIN|CAP_NET_RAW|CAP_IPC_LOCK|CAP_IPC_OWNER|CAP_SYS_MODULE|CAP_SYS_RAWIO|CAP_SYS_CHROOT|CAP_SYS_PTRACE|CAP_SYS_PACCT|CAP_SYS_ADMIN|CAP_SYS_BOOT|CAP_SYS_NICE|CAP_SYS_RESOURCE|CAP_SYS_TIME|CAP_SYS_TTY_CONFIG|CAP_MKNOD|CAP_LEASE|CAP_AUDIT_WRITE|CAP_AUDIT_CONTROL|CAP_SETFCAP|CAP_MAC_OVERRIDE|CAP_MAC_ADMIN|CAP_SYSLOG|CAP_WAKE_ALARM|CAP_BLOCK_SUSPEND|CAP_AUDIT_READ, Effective: 0x0}) = 0x0 (3.399?s) I0225 15:32:10.795114 4166 x:0] [ 6] E capset(0x7f477fdff8c8 {Version: 3, Pid: 0}, 0x7f477fdff8b0 {Permitted: CAP_CHOWN|CAP_DAC_OVERRIDE|CAP_DAC_READ_SEARCH|CAP_FOWNER|CAP_FSETID|CAP_KILL|CAP_SETGID|CAP_SETUID|CAP_SETPCAP|CAP_LINUX_IMMUTABLE|CAP_NET_BIND_SERVICE|CAP_NET_BROADCAST|CAP_NET_ADMIN|CAP_NET_RAW|CAP_IPC_LOCK|CAP_IPC_OWNER|CAP_SYS_MODULE|CAP_SYS_RAWIO|CAP_SYS_CHROOT|CAP_SYS_PTRACE|CAP_SYS_PACCT|CAP_SYS_ADMIN|CAP_SYS_BOOT|CAP_SYS_NICE|CAP_SYS_RESOURCE|CAP_SYS_TIME|CAP_SYS_TTY_CONFIG|CAP_MKNOD|CAP_LEASE|CAP_AUDIT_WRITE|CAP_AUDIT_CONTROL|CAP_SETFCAP|CAP_MAC_OVERRIDE|CAP_MAC_ADMIN|CAP_SYSLOG|CAP_WAKE_ALARM|CAP_BLOCK_SUSPEND|CAP_AUDIT_READ, Inheritable: CAP_CHOWN|CAP_DAC_OVERRIDE|CAP_DAC_READ_SEARCH|CAP_FOWNER|CAP_FSETID|CAP_KILL|CAP_SETGID|CAP_SETUID|CAP_SETPCAP|CAP_LINUX_IMMUTABLE|CAP_NET_BIND_SERVICE|CAP_NET_BROADCAST|CAP_NET_ADMIN|CAP_NET_RAW|CAP_IPC_LOCK|CAP_IPC_OWNER|CAP_SYS_MODULE|CAP_SYS_RAWIO|CAP_SYS_CHROOT|CAP_SYS_PTRACE|CAP_SYS_PACCT|CAP_SYS_ADMIN|CAP_SYS_BOOT|CAP_SYS_NICE|CAP_SYS_RESOURCE|CAP_SYS_TIME|CAP_SYS_TTY_CONFIG|CAP_MKNOD|CAP_LEASE|CAP_AUDIT_WRITE|CAP_AUDIT_CONTROL|CAP_SETFCAP|CAP_MAC_OVERRIDE|CAP_MAC_ADMIN|CAP_SYSLOG|CAP_WAKE_ALARM|CAP_BLOCK_SUSPEND|CAP_AUDIT_READ, Effective: CAP_FOWNER}) I0225 15:32:10.795127 4166 x:0] [ 6] X capset(0x7f477fdff8c8 {Version: 3, Pid: 0}, 0x7f477fdff8b0 {Permitted: 
CAP_CHOWN|CAP_DAC_OVERRIDE|CAP_DAC_READ_SEARCH|CAP_FOWNER|CAP_FSETID|CAP_KILL|CAP_SETGID|CAP_SETUID|CAP_SETPCAP|CAP_LINUX_IMMUTABLE|CAP_NET_BIND_SERVICE|CAP_NET_BROADCAST|CAP_NET_ADMIN|CAP_NET_RAW|CAP_IPC_LOCK|CAP_IPC_OWNER|CAP_SYS_MODULE|CAP_SYS_RAWIO|CAP_SYS_CHROOT|CAP_SYS_PTRACE|CAP_SYS_PACCT|CAP_SYS_ADMIN|CAP_SYS_BOOT|CAP_SYS_NICE|CAP_SYS_RESOURCE|CAP_SYS_TIME|CAP_SYS_TTY_CONFIG|CAP_MKNOD|CAP_LEASE|CAP_AUDIT_WRITE|CAP_AUDIT_CONTROL|CAP_SETFCAP|CAP_MAC_OVERRIDE|CAP_MAC_ADMIN|CAP_SYSLOG|CAP_WAKE_ALARM|CAP_BLOCK_SUSPEND|CAP_AUDIT_READ, Inheritable: CAP_CHOWN|CAP_DAC_OVERRIDE|CAP_DAC_READ_SEARCH|CAP_FOWNER|CAP_FSETID|CAP_KILL|CAP_SETGID|CAP_SETUID|CAP_SETPCAP|CAP_LINUX_IMMUTABLE|CAP_NET_BIND_SERVICE|CAP_NET_BROADCAST|CAP_NET_ADMIN|CAP_NET_RAW|CAP_IPC_LOCK|CAP_IPC_OWNER|CAP_SYS_MODULE|CAP_SYS_RAWIO|CAP_SYS_CHROOT|CAP_SYS_PTRACE|CAP_SYS_PACCT|CAP_SYS_ADMIN|CAP_SYS_BOOT|CAP_SYS_NICE|CAP_SYS_RESOURCE|CAP_SYS_TIME|CAP_SYS_TTY_CONFIG|CAP_MKNOD|CAP_LEASE|CAP_AUDIT_WRITE|CAP_AUDIT_CONTROL|CAP_SETFCAP|CAP_MAC_OVERRIDE|CAP_MAC_ADMIN|CAP_SYSLOG|CAP_WAKE_ALARM|CAP_BLOCK_SUSPEND|CAP_AUDIT_READ, Effective: CAP_FOWNER}) = 0x0 (3.062?s) Not the most readable, but better than just a pointer. 
PiperOrigin-RevId: 236338875 Change-Id: I4b83f778122ab98de3874e16f4258dae18da916b --- pkg/sentry/strace/BUILD | 1 + pkg/sentry/strace/capability.go | 176 ++++++++++++++++++++++++++++++++++++++++ pkg/sentry/strace/linux64.go | 4 +- pkg/sentry/strace/strace.go | 67 +++++++++++++++ pkg/sentry/strace/syscalls.go | 11 +++ 5 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 pkg/sentry/strace/capability.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 552e79686..73f1e9814 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -6,6 +6,7 @@ package(licenses = ["notice"]) go_library( name = "strace", srcs = [ + "capability.go", "clone.go", "futex.go", "linux64.go", diff --git a/pkg/sentry/strace/capability.go b/pkg/sentry/strace/capability.go new file mode 100644 index 000000000..9001181e7 --- /dev/null +++ b/pkg/sentry/strace/capability.go @@ -0,0 +1,176 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// CapabilityBitset is the set of capabilties in a bitset. 
+var CapabilityBitset = abi.FlagSet{ + { + Flag: 1 << uint32(linux.CAP_CHOWN), + Name: "CAP_CHOWN", + }, + { + Flag: 1 << uint32(linux.CAP_DAC_OVERRIDE), + Name: "CAP_DAC_OVERRIDE", + }, + { + Flag: 1 << uint32(linux.CAP_DAC_READ_SEARCH), + Name: "CAP_DAC_READ_SEARCH", + }, + { + Flag: 1 << uint32(linux.CAP_FOWNER), + Name: "CAP_FOWNER", + }, + { + Flag: 1 << uint32(linux.CAP_FSETID), + Name: "CAP_FSETID", + }, + { + Flag: 1 << uint32(linux.CAP_KILL), + Name: "CAP_KILL", + }, + { + Flag: 1 << uint32(linux.CAP_SETGID), + Name: "CAP_SETGID", + }, + { + Flag: 1 << uint32(linux.CAP_SETUID), + Name: "CAP_SETUID", + }, + { + Flag: 1 << uint32(linux.CAP_SETPCAP), + Name: "CAP_SETPCAP", + }, + { + Flag: 1 << uint32(linux.CAP_LINUX_IMMUTABLE), + Name: "CAP_LINUX_IMMUTABLE", + }, + { + Flag: 1 << uint32(linux.CAP_NET_BIND_SERVICE), + Name: "CAP_NET_BIND_SERVICE", + }, + { + Flag: 1 << uint32(linux.CAP_NET_BROADCAST), + Name: "CAP_NET_BROADCAST", + }, + { + Flag: 1 << uint32(linux.CAP_NET_ADMIN), + Name: "CAP_NET_ADMIN", + }, + { + Flag: 1 << uint32(linux.CAP_NET_RAW), + Name: "CAP_NET_RAW", + }, + { + Flag: 1 << uint32(linux.CAP_IPC_LOCK), + Name: "CAP_IPC_LOCK", + }, + { + Flag: 1 << uint32(linux.CAP_IPC_OWNER), + Name: "CAP_IPC_OWNER", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_MODULE), + Name: "CAP_SYS_MODULE", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_RAWIO), + Name: "CAP_SYS_RAWIO", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_CHROOT), + Name: "CAP_SYS_CHROOT", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_PTRACE), + Name: "CAP_SYS_PTRACE", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_PACCT), + Name: "CAP_SYS_PACCT", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_ADMIN), + Name: "CAP_SYS_ADMIN", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_BOOT), + Name: "CAP_SYS_BOOT", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_NICE), + Name: "CAP_SYS_NICE", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_RESOURCE), + Name: "CAP_SYS_RESOURCE", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_TIME), + Name: 
"CAP_SYS_TIME", + }, + { + Flag: 1 << uint32(linux.CAP_SYS_TTY_CONFIG), + Name: "CAP_SYS_TTY_CONFIG", + }, + { + Flag: 1 << uint32(linux.CAP_MKNOD), + Name: "CAP_MKNOD", + }, + { + Flag: 1 << uint32(linux.CAP_LEASE), + Name: "CAP_LEASE", + }, + { + Flag: 1 << uint32(linux.CAP_AUDIT_WRITE), + Name: "CAP_AUDIT_WRITE", + }, + { + Flag: 1 << uint32(linux.CAP_AUDIT_CONTROL), + Name: "CAP_AUDIT_CONTROL", + }, + { + Flag: 1 << uint32(linux.CAP_SETFCAP), + Name: "CAP_SETFCAP", + }, + { + Flag: 1 << uint32(linux.CAP_MAC_OVERRIDE), + Name: "CAP_MAC_OVERRIDE", + }, + { + Flag: 1 << uint32(linux.CAP_MAC_ADMIN), + Name: "CAP_MAC_ADMIN", + }, + { + Flag: 1 << uint32(linux.CAP_SYSLOG), + Name: "CAP_SYSLOG", + }, + { + Flag: 1 << uint32(linux.CAP_WAKE_ALARM), + Name: "CAP_WAKE_ALARM", + }, + { + Flag: 1 << uint32(linux.CAP_BLOCK_SUSPEND), + Name: "CAP_BLOCK_SUSPEND", + }, + { + Flag: 1 << uint32(linux.CAP_AUDIT_READ), + Name: "CAP_AUDIT_READ", + }, +} diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index de2da9369..ca695e80f 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -142,8 +142,8 @@ var linuxAMD64 = SyscallMap{ 122: makeSyscallInfo("setfsuid", Hex), 123: makeSyscallInfo("setfsgid", Hex), 124: makeSyscallInfo("getsid", Hex), - 125: makeSyscallInfo("capget", Hex, Hex), - 126: makeSyscallInfo("capset", Hex, Hex), + 125: makeSyscallInfo("capget", CapHeader, PostCapData), + 126: makeSyscallInfo("capset", CapHeader, CapData), 127: makeSyscallInfo("rt_sigpending", Hex), 128: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex), 129: makeSyscallInfo("rt_sigqueueinfo", Hex, Signal, Hex), diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index da27a2ae8..6c93d7de7 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -271,6 +271,67 @@ func rusage(t *kernel.Task, addr usermem.Addr) string { return fmt.Sprintf("%#x %+v", addr, ru) } +func capHeader(t *kernel.Task, addr 
usermem.Addr) string { + if addr == 0 { + return "null" + } + + var hdr linux.CapUserHeader + if _, err := t.CopyIn(addr, &hdr); err != nil { + return fmt.Sprintf("%#x (error decoding header: %s)", addr, err) + } + + var version string + switch hdr.Version { + case linux.LINUX_CAPABILITY_VERSION_1: + version = "1" + case linux.LINUX_CAPABILITY_VERSION_2: + version = "2" + case linux.LINUX_CAPABILITY_VERSION_3: + version = "3" + default: + version = strconv.FormatUint(uint64(hdr.Version), 16) + } + + return fmt.Sprintf("%#x {Version: %s, Pid: %d}", addr, version, hdr.Pid) +} + +func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string { + if dataAddr == 0 { + return "null" + } + + var hdr linux.CapUserHeader + if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + return fmt.Sprintf("%#x (error decoding header: %v)", dataAddr, err) + } + + var p, i, e uint64 + + switch hdr.Version { + case linux.LINUX_CAPABILITY_VERSION_1: + var data linux.CapUserData + if _, err := t.CopyIn(dataAddr, &data); err != nil { + return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err) + } + p = uint64(data.Permitted) + i = uint64(data.Inheritable) + e = uint64(data.Effective) + case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: + var data [2]linux.CapUserData + if _, err := t.CopyIn(dataAddr, &data); err != nil { + return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err) + } + p = uint64(data[0].Permitted) | (uint64(data[1].Permitted) << 32) + i = uint64(data[0].Inheritable) | (uint64(data[1].Inheritable) << 32) + e = uint64(data[0].Effective) | (uint64(data[1].Effective) << 32) + default: + return fmt.Sprintf("%#x (unknown version %d)", dataAddr, hdr.Version) + } + + return fmt.Sprintf("%#x {Permitted: %s, Inheritable: %s, Effective: %s}", dataAddr, CapabilityBitset.Parse(p), CapabilityBitset.Parse(i), CapabilityBitset.Parse(e)) +} + // pre fills in the pre-execution arguments for a system call. 
If an argument // cannot be interpreted before the system call is executed, then a hex value // will be used. Note that a full output slice will always be provided, that is @@ -341,6 +402,10 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, sigSet(t, args[arg].Pointer())) case SigAction: output = append(output, sigAction(t, args[arg].Pointer())) + case CapHeader: + output = append(output, capHeader(t, args[arg].Pointer())) + case CapData: + output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer())) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: @@ -403,6 +468,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint output[arg] = sigSet(t, args[arg].Pointer()) case PostSigAction: output[arg] = sigAction(t, args[arg].Pointer()) + case PostCapData: + output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer()) } } } diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 1ae982354..b2715856e 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -188,6 +188,17 @@ const ( // PostSigAction is a struct sigaction, formatted after syscall execution. PostSigAction + + // CapHeader is a cap_user_header_t. + CapHeader + + // CapData is the data argument to capget(2)/capset(2). The previous + // argument must be CapHeader. + CapData + + // PostCapData is the data argument to capget(2)/capset(2), formatted + // after syscall execution. The previous argument must be CapHeader. + PostCapData ) // defaultFormat is the syscall argument format to use if the actual format is -- cgit v1.2.3 From 3dbd4a16f8ae4da967f69fd93870462d1b3554f5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 1 Mar 2019 10:55:22 -0800 Subject: Add semctl(GETPID) syscall Also added unimplemented notification for semctl(2) commands. 
PiperOrigin-RevId: 236340672 Change-Id: I0795e3bd2e6d41d7936fabb731884df426a42478 --- pkg/abi/linux/sem.go | 5 ++-- pkg/sentry/kernel/semaphore/semaphore.go | 34 +++++++++++++++++---- pkg/sentry/kernel/semaphore/semaphore_test.go | 6 ++-- pkg/sentry/syscalls/linux/sys_sem.go | 43 +++++++++++++++++++++++++-- runsc/boot/compat.go | 4 +++ test/syscalls/linux/semaphore.cc | 29 ++++++++++++++++++ 6 files changed, 107 insertions(+), 14 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go index d1a0bdb32..b80c93daf 100644 --- a/pkg/abi/linux/sem.go +++ b/pkg/abi/linux/sem.go @@ -27,8 +27,9 @@ const ( // ipcs ctl cmds. Source: include/uapi/linux/sem.h const ( - SEM_STAT = 18 - SEM_INFO = 19 + SEM_STAT = 18 + SEM_INFO = 19 + SEM_STAT_ANY = 20 ) const SEM_UNDO = 0x1000 diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index c134931cd..29a2eb804 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -92,6 +92,7 @@ type Set struct { type sem struct { value int16 waiters waiterList `state:"zerovalue"` + pid int32 } // waiter represents a caller that is waiting for the semaphore value to @@ -283,7 +284,7 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File } // SetVal overrides a semaphore value, waking up waiters as needed. -func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials) error { +func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { if val < 0 || val > valueMax { return syserror.ERANGE } @@ -303,15 +304,17 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred // TODO: Clear undo entries in all processes sem.value = val + sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) sem.wakeWaiters() return nil } -// SetValAll overrides all semaphores values, waking up waiters as needed. 
+// SetValAll overrides all semaphores values, waking up waiters as needed. It also +// sets semaphore's PID which was fixed in Linux 4.6. // // 'len(vals)' must be equal to 's.Size()'. -func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials) error { +func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error { if len(vals) != s.Size() { panic(fmt.Sprintf("vals length (%d) different that Set.Size() (%d)", len(vals), s.Size())) } @@ -335,6 +338,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti // TODO: Clear undo entries in all processes sem.value = int16(val) + sem.pid = pid sem.wakeWaiters() } s.changeTime = ktime.NowFromContext(ctx) @@ -375,12 +379,29 @@ func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { return vals, nil } +// GetPID returns the PID set when performing operations in the semaphore. +func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.pid, nil +} + // ExecuteOps attempts to execute a list of operations to the set. It only // succeeds when all operations can be applied. No changes are made if it fails. // // On failure, it may return an error (retries are hopeless) or it may return // a channel that can be waited on before attempting again. 
-func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials) (chan struct{}, int32, error) { +func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) { s.mu.Lock() defer s.mu.Unlock() @@ -404,14 +425,14 @@ func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Cr return nil, 0, syserror.EACCES } - ch, num, err := s.executeOps(ctx, ops) + ch, num, err := s.executeOps(ctx, ops, pid) if err != nil { return nil, 0, err } return ch, num, nil } -func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf) (chan struct{}, int32, error) { +func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) { // Changes to semaphores go to this slice temporarily until they all succeed. tmpVals := make([]int16, len(s.sems)) for i := range s.sems { @@ -464,6 +485,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf) (chan struct{} for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() + s.sems[i].pid = pid } s.opTime = ktime.NowFromContext(ctx) return nil, 0, nil diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index 5f886bf31..2e51e6ee5 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -25,7 +25,7 @@ import ( ) func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { - ch, _, err := set.executeOps(ctx, ops) + ch, _, err := set.executeOps(ctx, ops, 123) if err != nil { t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops) } @@ -123,13 +123,13 @@ func TestNoWait(t *testing.T) { ops[0].SemOp = -2 ops[0].SemFlg = linux.IPC_NOWAIT - if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { 
t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) } ops[0].SemOp = 0 ops[0].SemFlg = linux.IPC_NOWAIT - if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) } } diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index 6775725ca..86f850ef1 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -71,8 +71,9 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } creds := auth.CredentialsFromContext(t) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) for { - ch, num, err := set.ExecuteOps(t, ops, creds) + ch, num, err := set.ExecuteOps(t, ops, creds, int32(pid)) if ch == nil || err != nil { // We're done (either on success or a failure). 
return 0, nil, err @@ -123,6 +124,21 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777)) return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms) + case linux.GETPID: + v, err := getPID(t, id, num) + return uintptr(v), nil, err + + case linux.IPC_INFO, + linux.SEM_INFO, + linux.IPC_STAT, + linux.SEM_STAT, + linux.SEM_STAT_ANY, + linux.GETNCNT, + linux.GETZCNT: + + t.Kernel().EmitUnimplementedEvent(t) + fallthrough + default: return 0, nil, syserror.EINVAL } @@ -161,7 +177,8 @@ func setVal(t *kernel.Task, id int32, num int32, val int16) error { return syserror.EINVAL } creds := auth.CredentialsFromContext(t) - return set.SetVal(t, num, val, creds) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) + return set.SetVal(t, num, val, creds, int32(pid)) } func setValAll(t *kernel.Task, id int32, array usermem.Addr) error { @@ -175,7 +192,8 @@ func setValAll(t *kernel.Task, id int32, array usermem.Addr) error { return err } creds := auth.CredentialsFromContext(t) - return set.SetValAll(t, vals, creds) + pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) + return set.SetValAll(t, vals, creds, int32(pid)) } func getVal(t *kernel.Task, id int32, num int32) (int16, error) { @@ -202,3 +220,22 @@ func getValAll(t *kernel.Task, id int32, array usermem.Addr) error { _, err = t.CopyOut(array, vals) return err } + +func getPID(t *kernel.Task, id int32, num int32) (int32, error) { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + gpid, err := set.GetPID(num, creds) + if err != nil { + return 0, err + } + // Convert pid from init namespace to the caller's namespace. 
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(gpid)) + if tg == nil { + return 0, nil + } + return int32(tg.ID()), nil +} diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index c2a77ebf5..572b5b472 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -100,6 +100,10 @@ func (c *compatEmitter) Emit(msg proto.Message) (hangup bool, err error) { // args: fd, level, name, ... tr = newArgsTracker(1, 2) + case syscall.SYS_SEMCTL: + // args: semid, semnum, cmd, ... + tr = newArgsTracker(2) + default: tr = &onceTracker{} } diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc index da3d2c6fe..1c47b6851 100644 --- a/test/syscalls/linux/semaphore.cc +++ b/test/syscalls/linux/semaphore.cc @@ -431,6 +431,35 @@ TEST(SemaphoreTest, SemCtlValAll) { SyscallFailsWithErrno(EFAULT)); } +TEST(SemaphoreTest, SemCtlGetPid) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 1), SyscallSucceeds()); + EXPECT_THAT(semctl(sem.get(), 0, GETPID), SyscallSucceedsWithValue(getpid())); +} + +TEST(SemaphoreTest, SemCtlGetPidFork) { + AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)); + ASSERT_THAT(sem.get(), SyscallSucceeds()); + + const pid_t child_pid = fork(); + if (child_pid == 0) { + ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 1), SyscallSucceeds()); + ASSERT_THAT(semctl(sem.get(), 0, GETPID), + SyscallSucceedsWithValue(getpid())); + + _exit(0); + } + ASSERT_THAT(child_pid, SyscallSucceeds()); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + TEST(SemaphoreTest, SemIpcSet) { // Drop CAP_IPC_OWNER which allows us to bypass semaphore permissions. 
ASSERT_NO_ERRNO(SetCapability(CAP_IPC_OWNER, false)); -- cgit v1.2.3 From 9177bcd0ba7f68bd5e28123c95fe0d69f822703e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 1 Mar 2019 11:57:52 -0800 Subject: DecRef replaced dirent in inode_overlay. PiperOrigin-RevId: 236352158 Change-Id: Ide5104620999eaef6820917505e7299c7b0c5a03 --- pkg/sentry/fs/inode_overlay.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index b11e2bd13..92a77917a 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -336,18 +336,26 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena if err != nil && err != syserror.ENOENT { return err } - if err == nil && !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { - children, err := readdirOne(ctx, replaced) - if err != nil { - return err - } + if err == nil { + // NOTE: We must drop the reference on replaced before we make + // the rename call. For that reason we can't use defer. + if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { + children, err := readdirOne(ctx, replaced) + if err != nil { + replaced.DecRef() + return err + } - // readdirOne ensures that "." and ".." are not - // included among the returned children, so we don't - // need to bother checking for them. - if len(children) > 0 { - return syserror.ENOTEMPTY + // readdirOne ensures that "." and ".." are not + // included among the returned children, so we don't + // need to bother checking for them. 
+ if len(children) > 0 { + replaced.DecRef() + return syserror.ENOTEMPTY + } } + + replaced.DecRef() } if err := copyUpLockedForRename(ctx, renamed); err != nil { return err -- cgit v1.2.3 From d811c1016d090ea88a687bd9bef4951dc08b391d Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 1 Mar 2019 15:04:15 -0800 Subject: ptrace: drop old FIXME The globalPool uses a sync.Once mechanism for initialization, and no cleanup is strictly required. It's not really feasible to have the platform implement a full creation -> destruction cycle (due to the way filters are assumed to be installed), so drop the FIXME. PiperOrigin-RevId: 236385278 Change-Id: I98ac660ed58cc688d8a07147d16074a3e8181314 --- pkg/sentry/platform/ptrace/ptrace.go | 2 -- 1 file changed, 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 00d92b092..8d3f6ac9a 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -38,8 +38,6 @@ // The requested operation is performed in the traced subprocess thread // (e.g. set registers, execute, return). // -// FIXME: This package is currently sloppy with cleanup. -// // Lock order: // // subprocess.mu -- cgit v1.2.3 From 0d683c9961a6d39d06896a230b8d52edfcf6e0cc Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 4 Mar 2019 16:56:11 -0800 Subject: Make tmpfs respect MountNoATime now that fs.Handle is gone. 
PiperOrigin-RevId: 236752802 Change-Id: I9e50600b2ae25d5f2ac632c4405a7a185bdc3c92 --- pkg/sentry/fs/tmpfs/file_regular.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 12 +++++++----- test/syscalls/linux/mount.cc | 4 +--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 2c1eb0fd2..be6298130 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -44,7 +44,7 @@ type regularFileOperations struct { // Read implements fs.FileOperations.Read. func (r *regularFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - return r.iops.read(ctx, dst, offset) + return r.iops.read(ctx, file, dst, offset) } // Write implements fs.FileOperations.Write. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 1cc972afa..5648ff8f4 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -250,7 +250,7 @@ func (*fileInodeOperations) StatFS(context.Context) (fs.Info, error) { return fsInfo, nil } -func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { var start time.Time if fs.RecordWaitTime { start = time.Now() @@ -280,10 +280,12 @@ func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, } n, err := dst.CopyOutFrom(ctx, &fileReadWriter{f, offset}) - // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). - f.attrMu.Lock() - f.attr.AccessTime = ktime.NowFromContext(ctx) - f.attrMu.Unlock() + if !file.Dirent.Inode.MountSource.Flags.NoAtime { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). 
+ f.attrMu.Lock() + f.attr.AccessTime = ktime.NowFromContext(ctx) + f.attrMu.Unlock() + } fs.IncrementWait(readWait, start) return n, err } diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index 76da8b75a..6bb4287a3 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -250,9 +250,7 @@ PosixErrorOr ATime(absl::string_view file) { return absl::TimeFromTimespec(s.st_atim); } -// FIXME: Disabled until tmpfs stops using Handle, as only the gofer -// and host file system respect the MS_NOATIME flag. -TEST(MountTest, DISABLED_MountNoAtime) { +TEST(MountTest, MountNoAtime) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); -- cgit v1.2.3 From 23e66ee96d159a774ecac9f89fab8cff463174a4 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Tue, 5 Mar 2019 14:52:35 -0800 Subject: Remove unused commit() function argument to Bind. PiperOrigin-RevId: 236926132 Change-Id: I5cf103f22766e6e65a581de780c7bb9ca0fa3181 --- pkg/dhcp/client.go | 2 +- pkg/dhcp/dhcp_test.go | 2 +- pkg/dhcp/server.go | 2 +- pkg/sentry/socket/epsocket/epsocket.go | 2 +- pkg/tcpip/adapters/gonet/gonet.go | 4 +-- pkg/tcpip/network/ipv4/ipv4_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/tcpip/stack/transport_test.go | 6 ++-- pkg/tcpip/tcpip.go | 6 +--- pkg/tcpip/transport/icmp/endpoint.go | 17 ++++-------- pkg/tcpip/transport/tcp/dual_stack_test.go | 24 ++++++++-------- pkg/tcpip/transport/tcp/endpoint.go | 10 +------ pkg/tcpip/transport/tcp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 32 +++++++++++----------- pkg/tcpip/transport/tcp/testing/context/context.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 16 +++-------- pkg/tcpip/transport/udp/udp_test.go | 28 +++++++++---------- 18 files changed, 67 insertions(+), 94 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/dhcp/client.go 
b/pkg/dhcp/client.go index fb3ae5b49..354205e63 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -140,7 +140,7 @@ func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) (cfg Addr: tcpipHeader.IPv4Any, Port: ClientPort, NIC: c.nicid, - }, nil); err != nil { + }); err != nil { return Config{}, fmt.Errorf("dhcp: Bind(): %s", err) } diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index 026064394..e1d8ef603 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -284,7 +284,7 @@ func TestTwoServers(t *testing.T) { if err != nil { t.Fatalf("dhcp: server endpoint: %v", err) } - if err = ep.Bind(tcpip.FullAddress{Port: ServerPort}, nil); err != nil { + if err = ep.Bind(tcpip.FullAddress{Port: ServerPort}); err != nil { t.Fatalf("dhcp: server bind: %v", err) } if err = ep.SetSockOpt(tcpip.BroadcastOption(1)); err != nil { diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index c72c3b70d..9549ff705 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -120,7 +120,7 @@ func newEPConnServer(ctx context.Context, stack *stack.Stack, addrs []tcpip.Addr if err != nil { return nil, fmt.Errorf("dhcp: server endpoint: %v", err) } - if err := ep.Bind(tcpip.FullAddress{Port: ServerPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: ServerPort}); err != nil { return nil, fmt.Errorf("dhcp: server bind: %v", err) } if err := ep.SetSockOpt(tcpip.BroadcastOption(1)); err != nil { diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 8aec97e72..8fa108bea 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -423,7 +423,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Issue the bind request to the endpoint. 
- return syserr.TranslateNetstackError(s.Endpoint.Bind(addr, nil)) + return syserr.TranslateNetstackError(s.Endpoint.Bind(addr)) } // Listen implements the linux syscall listen(2) for sockets backed by diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index 8b077156c..560b8ac4b 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -60,7 +60,7 @@ func NewListener(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkPr return nil, errors.New(err.String()) } - if err := ep.Bind(addr, nil); err != nil { + if err := ep.Bind(addr); err != nil { ep.Close() return nil, &net.OpError{ Op: "bind", @@ -524,7 +524,7 @@ func NewPacketConn(s *stack.Stack, addr tcpip.FullAddress, network tcpip.Network return nil, errors.New(err.String()) } - if err := ep.Bind(addr, nil); err != nil { + if err := ep.Bind(addr); err != nil { ep.Close() return nil, &net.OpError{ Op: "bind", diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 190d548eb..42e85564e 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -69,7 +69,7 @@ func TestExcludeBroadcast(t *testing.T) { } // However, we can bind to a broadcast address to listen. - if err := ep.Bind(tcpip.FullAddress{Addr: header.IPv4Broadcast, Port: 53, NIC: 1}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Addr: header.IPv4Broadcast, Port: 53, NIC: 1}); err != nil { t.Errorf("Bind failed: %v", err) } }) diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index 67e8f0b9e..327a79f48 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -165,7 +165,7 @@ func main() { // Bind if a port is specified. 
if localPort != 0 { - if err := ep.Bind(tcpip.FullAddress{0, "", localPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{0, "", localPort}); err != nil { log.Fatal("Bind failed: ", err) } } diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index ab40e9e0b..b23dc13e7 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -165,7 +165,7 @@ func main() { defer ep.Close() - if err := ep.Bind(tcpip.FullAddress{0, "", uint16(localPort)}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{0, "", uint16(localPort)}); err != nil { log.Fatal("Bind failed: ", err) } diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 3347b5599..a9e844e3d 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -145,7 +145,7 @@ func (f *fakeTransportEndpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip. return &a, nil, nil } -func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { +func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error { if err := f.stack.RegisterTransportEndpoint( a.NIC, []tcpip.NetworkProtocolNumber{fakeNetNumber}, @@ -157,7 +157,7 @@ func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress, commit func() *tcpip.E return err } f.acceptQueue = []fakeTransportEndpoint{} - return commit() + return nil } func (*fakeTransportEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { @@ -483,7 +483,7 @@ func TestTransportForwarding(t *testing.T) { t.Fatalf("NewEndpoint failed: %v", err) } - if err := ep.Bind(tcpip.FullAddress{Addr: "\x01", NIC: 1}, func() *tcpip.Error { return nil }); err != nil { + if err := ep.Bind(tcpip.FullAddress{Addr: "\x01", NIC: 1}); err != nil { t.Fatalf("Bind failed: %v", err) } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 89e9d6741..49cc8705a 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ 
-356,11 +356,7 @@ type Endpoint interface { // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. - // - // An optional commit function will be executed atomically with respect - // to binding the endpoint. If this returns an error, the bind will not - // occur and the error will be propagated back to the caller. - Bind(address FullAddress, commit func() *Error) *Error + Bind(address FullAddress) *Error // GetLocalAddress returns the address to which the endpoint is bound. GetLocalAddress() (FullAddress, *Error) diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index b3b7a1d0e..05c4b532a 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -100,7 +100,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, trans // Raw endpoints must be immediately bound because they receive all // ICMP traffic starting from when they're created via socket(). if raw { - if err := e.bindLocked(tcpip.FullAddress{}, nil); err != nil { + if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return nil, err } } @@ -202,7 +202,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi } // The state is still 'initial', so try to bind the endpoint. - if err := e.bindLocked(tcpip.FullAddress{}, nil); err != nil { + if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return false, err } @@ -576,7 +576,7 @@ func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.Networ return id, err } -func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { +func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. 
if e.state != stateInitial { @@ -608,13 +608,6 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error if err != nil { return err } - if commit != nil { - if err := commit(); err != nil { - // Unregister, the commit failed. - e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, e.transProto, id, e) - return err - } - } e.id = id e.regNICID = addr.NIC @@ -631,11 +624,11 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. -func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { +func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - err := e.bindLocked(addr, commit) + err := e.bindLocked(addr) if err != nil { return err } diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index d3120c1d8..52f20bef1 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -113,7 +113,7 @@ func TestV4ConnectWhenBoundToWildcard(t *testing.T) { c.CreateV6Endpoint(false) // Bind to wildcard. - if err := c.EP.Bind(tcpip.FullAddress{}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -128,7 +128,7 @@ func TestV4ConnectWhenBoundToV4MappedWildcard(t *testing.T) { c.CreateV6Endpoint(false) // Bind to v4 mapped wildcard. - if err := c.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -143,7 +143,7 @@ func TestV4ConnectWhenBoundToV4Mapped(t *testing.T) { c.CreateV6Endpoint(false) // Bind to v4 mapped address. 
- if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackV4MappedAddr}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackV4MappedAddr}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -233,7 +233,7 @@ func TestV6ConnectWhenBoundToWildcard(t *testing.T) { c.CreateV6Endpoint(false) // Bind to wildcard. - if err := c.EP.Bind(tcpip.FullAddress{}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -248,7 +248,7 @@ func TestV6ConnectWhenBoundToLocalAddress(t *testing.T) { c.CreateV6Endpoint(false) // Bind to local address. - if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackV6Addr}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackV6Addr}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -263,7 +263,7 @@ func TestV4RefuseOnV6Only(t *testing.T) { c.CreateV6Endpoint(true) // Bind to wildcard. - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -300,7 +300,7 @@ func TestV6RefuseOnBoundToV4Mapped(t *testing.T) { c.CreateV6Endpoint(false) // Bind and listen. - if err := c.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr, Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr, Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -415,7 +415,7 @@ func TestV4AcceptOnV6(t *testing.T) { c.CreateV6Endpoint(false) // Bind to wildcard. - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -430,7 +430,7 @@ func TestV4AcceptOnBoundToV4MappedWildcard(t *testing.T) { c.CreateV6Endpoint(false) // Bind to v4 mapped wildcard. 
- if err := c.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr, Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr, Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -445,7 +445,7 @@ func TestV4AcceptOnBoundToV4Mapped(t *testing.T) { c.CreateV6Endpoint(false) // Bind and listen. - if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackV4MappedAddr, Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackV4MappedAddr, Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -460,7 +460,7 @@ func TestV6AcceptOnV6(t *testing.T) { c.CreateV6Endpoint(false) // Bind and listen. - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -551,7 +551,7 @@ func TestV4AcceptOnV4(t *testing.T) { } // Bind to wildcard. - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index c48a27d8f..ae99f0f8e 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1336,7 +1336,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { } // Bind binds the endpoint to a specific local port and optionally address. -func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) (err *tcpip.Error) { +func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) { e.mu.Lock() defer e.mu.Unlock() @@ -1397,14 +1397,6 @@ func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) (err e.id.LocalAddress = addr.Addr } - // Check the commit function. 
- if commit != nil { - if err := commit(); err != nil { - // The defer takes care of unwind. - return err - } - } - // Mark endpoint as bound. e.state = stateBound diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index ca7852d04..87e988afa 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -185,7 +185,7 @@ func (e *endpoint) afterLoad() { if len(e.bindAddress) == 0 { e.bindAddress = e.id.LocalAddress } - if err := e.Bind(tcpip.FullAddress{Addr: e.bindAddress, Port: e.id.LocalPort}, nil); err != nil { + if err := e.Bind(tcpip.FullAddress{Addr: e.bindAddress, Port: e.id.LocalPort}); err != nil { panic("endpoint binding failed: " + err.String()) } } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 557cc258d..2011189b7 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -135,7 +135,7 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) { t.Fatalf("NewEndpoint failed: %v", err) } - if err := ep.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } if err := ep.Listen(1); err != nil { @@ -193,7 +193,7 @@ func TestTCPResetsSentIncrement(t *testing.T) { } want := stats.TCP.SegmentsSent.Value() + 1 - if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -1042,7 +1042,7 @@ func TestScaledWindowAccept(t *testing.T) { t.Fatalf("SetSockOpt failed failed: %v", err) } - if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -1115,7 
+1115,7 @@ func TestNonScaledWindowAccept(t *testing.T) { t.Fatalf("SetSockOpt failed failed: %v", err) } - if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -1618,7 +1618,7 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) { t.Fatalf("SetSockOpt failed failed: %v", err) } - if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -1675,7 +1675,7 @@ func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) { } defer ep.Close() - if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -1840,7 +1840,7 @@ func TestCloseListener(t *testing.T) { t.Fatalf("NewEndpoint failed: %v", err) } - if err := ep.Bind(tcpip.FullAddress{}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -2824,7 +2824,7 @@ func TestUpdateListenBacklog(t *testing.T) { t.Fatalf("NewEndpoint failed: %v", err) } - if err := ep.Bind(tcpip.FullAddress{}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -3096,7 +3096,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %v", err) } - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -3105,7 +3105,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %v", err) } - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: 
context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } c.EP.Close() @@ -3115,7 +3115,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %v", err) } - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted { @@ -3127,7 +3127,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %v", err) } - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } c.EP.Close() @@ -3137,7 +3137,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %v", err) } - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } if err := c.EP.Listen(10); err != nil { @@ -3149,7 +3149,7 @@ func TestReusePort(t *testing.T) { if err != nil { t.Fatalf("NewEndpoint failed; %v", err) } - if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } if err := c.EP.Listen(10); err != nil { @@ -3337,7 +3337,7 @@ func TestSelfConnect(t *testing.T) { } defer ep.Close() - if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { t.Fatalf("Bind failed: %v", err) } @@ -3508,7 +3508,7 @@ func TestConnectAvoidsBoundPorts(t *testing.T) { } for i := ports.FirstEphemeral; i <= math.MaxUint16; i++ { - if 
makeEP(exhaustedNetwork).Bind(tcpip.FullAddress{Addr: address(t, exhaustedAddressType, isAny), Port: uint16(i)}, nil); err != nil { + if makeEP(exhaustedNetwork).Bind(tcpip.FullAddress{Addr: address(t, exhaustedAddressType, isAny), Port: uint16(i)}); err != nil { t.Fatalf("Bind(%d) failed: %v", i, err) } } diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 0695e8150..fb4ae4a1b 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -796,7 +796,7 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption } defer ep.Close() - if err := ep.Bind(tcpip.FullAddress{Port: StackPort}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: StackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 44b9cdf6a..4108cb09c 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -247,7 +247,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi } // The state is still 'initial', so try to bind the endpoint. - if err := e.bindLocked(tcpip.FullAddress{}, nil); err != nil { + if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return false, err } @@ -806,7 +806,7 @@ func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.Networ return id, err } -func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { +func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. 
if e.state != stateInitial { @@ -846,14 +846,6 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error if err != nil { return err } - if commit != nil { - if err := commit(); err != nil { - // Unregister, the commit failed. - e.stack.UnregisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e) - e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) - return err - } - } e.id = id e.regNICID = nicid @@ -871,11 +863,11 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. -func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { +func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - err := e.bindLocked(addr, commit) + err := e.bindLocked(addr) if err != nil { return err } diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 2a9cf4b57..884a76b04 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -289,7 +289,7 @@ func TestBindPortReuse(t *testing.T) { if err := eps[i].SetSockOpt(reusePortOpt); err != nil { c.t.Fatalf("SetSockOpt failed failed: %v", err) } - if err := eps[i].Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}, nil); err != nil { + if err := eps[i].Bind(tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort}); err != nil { t.Fatalf("ep.Bind(...) failed: %v", err) } } @@ -385,7 +385,7 @@ func TestBindEphemeralPort(t *testing.T) { c.createV6Endpoint(false) - if err := c.ep.Bind(tcpip.FullAddress{}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{}); err != nil { t.Fatalf("ep.Bind(...) 
failed: %v", err) } } @@ -412,7 +412,7 @@ func TestBindReservedPort(t *testing.T) { t.Fatalf("NewEndpoint failed: %v", err) } defer ep.Close() - if got, want := ep.Bind(addr, nil), tcpip.ErrPortInUse; got != want { + if got, want := ep.Bind(addr), tcpip.ErrPortInUse; got != want { t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want) } } @@ -425,11 +425,11 @@ func TestBindReservedPort(t *testing.T) { defer ep.Close() // We can't bind ipv4-any on the port reserved by the connected endpoint // above, since the endpoint is dual-stack. - if got, want := ep.Bind(tcpip.FullAddress{Port: addr.Port}, nil), tcpip.ErrPortInUse; got != want { + if got, want := ep.Bind(tcpip.FullAddress{Port: addr.Port}), tcpip.ErrPortInUse; got != want { t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want) } // We can bind an ipv4 address on this port, though. - if err := ep.Bind(tcpip.FullAddress{Addr: stackAddr, Port: addr.Port}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Addr: stackAddr, Port: addr.Port}); err != nil { t.Fatalf("ep.Bind(...) failed: %v", err) } }() @@ -443,7 +443,7 @@ func TestBindReservedPort(t *testing.T) { t.Fatalf("NewEndpoint failed: %v", err) } defer ep.Close() - if err := ep.Bind(tcpip.FullAddress{Port: addr.Port}, nil); err != nil { + if err := ep.Bind(tcpip.FullAddress{Port: addr.Port}); err != nil { t.Fatalf("ep.Bind(...) failed: %v", err) } }() @@ -456,7 +456,7 @@ func TestV4ReadOnV6(t *testing.T) { c.createV6Endpoint(false) // Bind to wildcard. - if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } @@ -471,7 +471,7 @@ func TestV4ReadOnBoundToV4MappedWildcard(t *testing.T) { c.createV6Endpoint(false) // Bind to v4 mapped wildcard. 
- if err := c.ep.Bind(tcpip.FullAddress{Addr: V4MappedWildcardAddr, Port: stackPort}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{Addr: V4MappedWildcardAddr, Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } @@ -486,7 +486,7 @@ func TestV4ReadOnBoundToV4Mapped(t *testing.T) { c.createV6Endpoint(false) // Bind to local address. - if err := c.ep.Bind(tcpip.FullAddress{Addr: stackV4MappedAddr, Port: stackPort}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{Addr: stackV4MappedAddr, Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } @@ -501,7 +501,7 @@ func TestV6ReadOnV6(t *testing.T) { c.createV6Endpoint(false) // Bind to wildcard. - if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } @@ -556,7 +556,7 @@ func TestV4ReadOnV4(t *testing.T) { } // Bind to wildcard. - if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } @@ -650,7 +650,7 @@ func TestDualWriteBoundToWildcard(t *testing.T) { c.createV6Endpoint(false) // Bind to wildcard. - if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } @@ -729,7 +729,7 @@ func TestV6WriteOnBoundToV4Mapped(t *testing.T) { c.createV6Endpoint(false) // Bind to v4 mapped address. - if err := c.ep.Bind(tcpip.FullAddress{Addr: stackV4MappedAddr, Port: stackPort}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{Addr: stackV4MappedAddr, Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } @@ -827,7 +827,7 @@ func TestReadIncrementsPacketsReceived(t *testing.T) { } // Bind to wildcard. 
- if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}, nil); err != nil { + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { c.t.Fatalf("Bind failed: %v", err) } -- cgit v1.2.3 From 1718fdd1a8e16f36433b069a0f5d88ea7bdb65f5 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Tue, 5 Mar 2019 16:40:40 -0800 Subject: Add new retransmissions and recovery related metrics. PiperOrigin-RevId: 236945145 Change-Id: I051760d95154ea5574c8bb6aea526f488af5e07b --- pkg/sentry/socket/epsocket/epsocket.go | 6 ++++++ pkg/tcpip/tcpip.go | 22 ++++++++++++++++++++++ pkg/tcpip/transport/tcp/segment.go | 3 +++ pkg/tcpip/transport/tcp/segment_state.go | 10 ++++++++++ pkg/tcpip/transport/tcp/snd.go | 15 ++++++++++++++- pkg/tcpip/transport/tcp/tcp_test.go | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 8fa108bea..4e547ea33 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -82,6 +82,12 @@ var Metrics = tcpip.Stats{ SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), + Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), + FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), + SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), + SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), + FastRetransmit: 
mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), + Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 49cc8705a..7010d1b68 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -628,6 +628,28 @@ type TCPStats struct { // ResetsReceived is the number of TCP resets received. ResetsReceived *StatCounter + + // Retransmits is the number of TCP segments retransmitted. + Retransmits *StatCounter + + // FastRecovery is the number of times Fast Recovery was used to + // recover from packet loss. + FastRecovery *StatCounter + + // SACKRecovery is the number of times SACK Recovery was used to + // recover from packet loss. + SACKRecovery *StatCounter + + // SlowStartRetransmits is the number of segments retransmitted in slow + // start. + SlowStartRetransmits *StatCounter + + // FastRetransmit is the number of segments retransmitted in fast + // recovery. + FastRetransmit *StatCounter + + // Timeouts is the number of times the RTO expired. + Timeouts *StatCounter } // UDPStats collects UDP-specific stats. diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index bd8017f64..a4c4a115c 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -61,6 +61,9 @@ type segment struct { options []byte `state:".([]byte)"` hasNewSACKInfo bool rcvdTime time.Time `state:".(unixTime)"` + // xmitTime is the last transmit time of this segment. A zero value + // indicates that the segment has yet to be transmitted. 
+ xmitTime time.Time `state:".(unixTime)"` } func newSegment(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) *segment { diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go index 7b98a3ec8..68b049f06 100644 --- a/pkg/tcpip/transport/tcp/segment_state.go +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -70,3 +70,13 @@ func (s *segment) saveRcvdTime() unixTime { func (s *segment) loadRcvdTime(unix unixTime) { s.rcvdTime = time.Unix(unix.second, unix.nano) } + +// saveXmitTime is invoked by stateify. +func (s *segment) saveXmitTime() unixTime { + return unixTime{s.rcvdTime.Unix(), s.rcvdTime.UnixNano()} +} + +// loadXmitTime is invoked by stateify. +func (s *segment) loadXmitTime(unix unixTime) { + s.rcvdTime = time.Unix(unix.second, unix.nano) +} diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 8312ae077..2cf12f4a6 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -338,6 +338,8 @@ func (s *sender) resendSegment() { // Resend the segment. if seg := s.writeList.Front(); seg != nil { s.sendSegment(seg.data, seg.flags, seg.sequenceNumber) + s.ep.stack.Stats().TCP.FastRetransmit.Increment() + s.ep.stack.Stats().TCP.Retransmits.Increment() } } @@ -352,6 +354,8 @@ func (s *sender) retransmitTimerExpired() bool { return true } + s.ep.stack.Stats().TCP.Timeouts.Increment() + // Give up if we've waited more than a minute since the last resend. if s.rto >= 60*time.Second { return false @@ -422,7 +426,6 @@ func (s *sender) sendData() { end := s.sndUna.Add(s.sndWnd) var dataSent bool for ; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { - // We abuse the flags field to determine if we have already // assigned a sequence number to this segment. if seg.flags == 0 { @@ -524,6 +527,15 @@ func (s *sender) sendData() { // ensure that no keepalives are sent while there is pending data. 
s.ep.disableKeepaliveTimer() } + + if !seg.xmitTime.IsZero() { + s.ep.stack.Stats().TCP.Retransmits.Increment() + if s.sndCwnd < s.sndSsthresh { + s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() + } + } + + seg.xmitTime = time.Now() s.sendSegment(seg.data, seg.flags, seg.sequenceNumber) // Update sndNxt if we actually sent new data (as opposed to @@ -556,6 +568,7 @@ func (s *sender) enterFastRecovery() { s.fr.first = s.sndUna s.fr.last = s.sndNxt - 1 s.fr.maxCwnd = s.sndCwnd + s.outstanding + s.ep.stack.Stats().TCP.FastRecovery.Increment() } func (s *sender) leaveFastRecovery() { diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 2011189b7..7f2615ca9 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -2669,6 +2669,18 @@ func TestFastRecovery(t *testing.T) { // Receive the retransmitted packet. c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload) + if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want { + t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want) + } + + if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want { + t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want) + } + + if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want { + t.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want) + } + // Now send 7 mode duplicate acks. Each of these should cause a window // inflation by 1 and cause the sender to send an extra packet. for i := 0; i < 7; i++ { @@ -2688,6 +2700,14 @@ func TestFastRecovery(t *testing.T) { // Receive the retransmit due to partial ack. 
c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload) + if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want { + t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want) + } + + if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want { + t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want) + } + // Receive the 10 extra packets that should have been released due to // the congestion window inflation in recovery. for i := 0; i < 10; i++ { @@ -2799,6 +2819,18 @@ func TestRetransmit(t *testing.T) { rtxOffset := bytesRead - maxPayload*expected c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload) + if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want { + t.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want) + } + + if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want { + t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want) + } + + if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want { + t.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want) + } + // Acknowledge half of the pending data. rtxOffset = bytesRead - expected*maxPayload/2 c.SendAck(790, rtxOffset) -- cgit v1.2.3 From 0b76887147820a809beaa497ede8dc4f7b7b120a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 5 Mar 2019 23:39:14 -0800 Subject: Priority-inheritance futex implementation It is Implemented without the priority inheritance part given that gVisor defers scheduling decisions to Go runtime and doesn't have control over it. 
PiperOrigin-RevId: 236989545 Change-Id: I714c8ca0798743ecf3167b14ffeb5cd834302560 --- pkg/abi/linux/futex.go | 6 + pkg/sentry/kernel/futex/BUILD | 2 + pkg/sentry/kernel/futex/futex.go | 215 +++++++++++++++++++++--- pkg/sentry/kernel/futex/futex_test.go | 4 + pkg/sentry/kernel/task_futex.go | 7 + pkg/sentry/mm/io.go | 45 +++++ pkg/sentry/platform/platform.go | 10 ++ pkg/sentry/platform/safecopy/BUILD | 4 +- pkg/sentry/platform/safecopy/atomic_amd64.s | 28 +++ pkg/sentry/platform/safecopy/atomic_arm64.s | 28 +++ pkg/sentry/platform/safecopy/safecopy.go | 4 + pkg/sentry/platform/safecopy/safecopy_unsafe.go | 20 +++ pkg/sentry/platform/safecopy/sighandler_amd64.s | 9 + pkg/sentry/platform/safecopy/sighandler_arm64.s | 11 ++ pkg/sentry/safemem/block_unsafe.go | 10 ++ pkg/sentry/syscalls/linux/sys_futex.go | 68 +++++++- pkg/sentry/usermem/bytes_io_unsafe.go | 8 + pkg/sentry/usermem/usermem.go | 7 + pkg/syserror/syserror.go | 1 + runsc/boot/compat.go | 4 +- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/futex.cc | 115 ++++++++++++- 22 files changed, 578 insertions(+), 29 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index 5dff01fba..afdf4123b 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -54,3 +54,9 @@ const ( // FUTEX_TID_MASK is the TID portion of a PI futex word. const FUTEX_TID_MASK = 0x3fffffff + +// Constants used for priority-inheritance futexes. 
+const ( + FUTEX_WAITERS = 0x80000000 + FUTEX_OWNER_DIED = 0x40000000 +) diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 91feeb5ed..b6af5b20b 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -37,6 +37,8 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", "//pkg/sentry/memmap", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index b3e628fd4..cd7d51621 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -95,12 +95,15 @@ func (k *Key) matches(k2 *Key) bool { // Target abstracts memory accesses and keys. type Target interface { - // SwapUint32 gives access to usermem.SwapUint32. + // SwapUint32 gives access to usermem.IO.SwapUint32. SwapUint32(addr usermem.Addr, new uint32) (uint32, error) - // CompareAndSwap gives access to usermem.CompareAndSwapUint32. + // CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32. CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + // LoadUint32 gives access to usermem.IO.LoadUint32. + LoadUint32(addr usermem.Addr) (uint32, error) + // GetSharedKey returns a Key with kind KindSharedPrivate or // KindSharedMappable corresponding to the memory mapped at address addr. // @@ -112,11 +115,11 @@ type Target interface { // check performs a basic equality check on the given address. 
func check(t Target, addr usermem.Addr, val uint32) error { - prev, err := t.CompareAndSwapUint32(addr, val, val) + cur, err := t.LoadUint32(addr) if err != nil { return err } - if prev != val { + if cur != val { return syserror.EAGAIN } return nil @@ -140,11 +143,14 @@ func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { ) if opType == linux.FUTEX_OP_SET { oldVal, err = t.SwapUint32(addr, opArg) + if err != nil { + return false, err + } } else { for { - oldVal, err = t.CompareAndSwapUint32(addr, 0, 0) + oldVal, err = t.LoadUint32(addr) if err != nil { - break + return false, err } var newVal uint32 switch opType { @@ -161,7 +167,7 @@ func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { } prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) if err != nil { - break + return false, err } if prev == oldVal { break // Success. @@ -222,6 +228,9 @@ type Waiter struct { // The bitmask we're waiting on. // This is used the case of a FUTEX_WAKE_BITSET. bitmask uint32 + + // tid is the thread ID for the waiter in case this is a PI mutex. + tid uint32 } // NewWaiter returns a new unqueued Waiter. @@ -262,23 +271,28 @@ func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int { // Remove from the bucket and wake the waiter. woke := w w = w.Next() // Next iteration. - b.waiters.Remove(woke) - woke.C <- struct{}{} - - // NOTE: The above channel write establishes a write barrier according - // to the memory model, so nothing may be ordered around it. Since - // we've dequeued woke and will never touch it again, we can safely - // store nil to woke.bucket here and allow the WaitComplete() to - // short-circuit grabbing the bucket lock. If they somehow miss the - // store, we are still holding the lock, so we can know that they won't - // dequeue woke, assume it's free and have the below operation - // afterwards. 
- woke.bucket.Store(nil) + b.wakeWaiterLocked(woke) done++ } return done } +func (b *bucket) wakeWaiterLocked(w *Waiter) { + // Remove from the bucket and wake the waiter. + b.waiters.Remove(w) + w.C <- struct{}{} + + // NOTE: The above channel write establishes a write barrier according + // to the memory model, so nothing may be ordered around it. Since + // we've dequeued w and will never touch it again, we can safely + // store nil to w.bucket here and allow the WaitComplete() to + // short-circuit grabbing the bucket lock. If they somehow miss the + // store, we are still holding the lock, so we can know that they won't + // dequeue w, assume it's free and have the below operation + // afterwards. + w.bucket.Store(nil) +} + // requeueLocked takes n waiters from the bucket and moves them to naddr on the // bucket "to". // @@ -596,7 +610,7 @@ func (m *Manager) WaitComplete(w *Waiter) { continue } - // Remove w from b. + // Remove waiter from bucket. b.waiters.Remove(w) w.bucket.Store(nil) b.mu.Unlock() @@ -606,3 +620,164 @@ func (m *Manager) WaitComplete(w *Waiter) { // Release references held by the waiter. w.key.release() } + +// LockPI attempts to lock the futex following the Priority-inheritance futex +// rules. The lock is acquired only when 'addr' points to 0. The TID of the +// calling task is set to 'addr' to indicate the futex is owned. It returns true +// if the futex was successfully acquired. +// +// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see +// exit_robust_list()). Given we don't support robust lists, although handled +// below, it's never set. +func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) { + k, err := getKey(t, addr, private) + if err != nil { + return false, err + } + // Ownership of k is transferred to w below. + + // Prepare the Waiter before taking the bucket lock. 
+ select { + case <-w.C: + default: + } + w.key = k + w.tid = tid + + b := m.lockBucket(&k) + // Hot function: avoid defers. + + success, err := m.lockPILocked(w, t, addr, tid, b, try) + if err != nil { + w.key.release() + b.mu.Unlock() + return false, err + } + if success || try { + // Release waiter if it's not going to be a wait. + w.key.release() + } + b.mu.Unlock() + return success, nil +} + +func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) { + for { + cur, err := t.LoadUint32(addr) + if err != nil { + return false, err + } + if (cur & linux.FUTEX_TID_MASK) == tid { + return false, syserror.EDEADLK + } + + if (cur & linux.FUTEX_TID_MASK) == 0 { + // No owner and no waiters, try to acquire the futex. + + // Set TID and preserve owner died status. + val := tid + val |= cur & linux.FUTEX_OWNER_DIED + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + // Linux reacquires the bucket lock on retries, which will re-lookup the + // mapping at the futex address. However, retrying while holding the + // lock is more efficient and reduces the chance of another conflict. + continue + } + // Futex acquired. + return true, nil + } + + // Futex is already owned, prepare to wait. + + if try { + // Caller doesn't want to wait. + return false, nil + } + + // Set waiters bit if not set yet. + if cur&linux.FUTEX_WAITERS == 0 { + prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + continue + } + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + w.bucket.Store(b) + return false, nil + } +} + +// UnlockPI unlock the futex following the Priority-inheritance futex +// rules. The address provided must contain the caller's TID. 
If there are +// waiters, TID of the next waiter (FIFO) is set to the given address, and the +// waiter woken up. If there are no waiters, 0 is set to the address. +func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { + k, err := getKey(t, addr, private) + if err != nil { + return err + } + b := m.lockBucket(&k) + + err = m.unlockPILocked(t, addr, tid, b) + + k.release() + b.mu.Unlock() + return err +} + +func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error { + cur, err := t.LoadUint32(addr) + if err != nil { + return err + } + + if (cur & linux.FUTEX_TID_MASK) != tid { + return syserror.EPERM + } + + if b.waiters.Empty() { + // It's safe to set 0 because there are no waiters, no new owner, and the + // executing task is the current owner (no owner died bit). + prev, err := t.CompareAndSwapUint32(addr, cur, 0) + if err != nil { + return err + } + if prev != cur { + // Let user mode handle CAS races. This is different than lock, which + // retries when CAS fails. + return syserror.EAGAIN + } + return nil + } + + next := b.waiters.Front() + + // Set next owner's TID, waiters if there are any. Resets owner died bit, if + // set, because the executing task takes over as the owner. 
+ val := next.tid + if next.Next() != nil { + val |= linux.FUTEX_WAITERS + } + + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return err + } + if prev != cur { + return syserror.EINVAL + } + + b.wakeWaiterLocked(next) + return nil +} diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index a7ab9f229..9d44ee8e5 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -49,6 +49,10 @@ func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil } +func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil +} + func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { return Key{ Kind: KindSharedMappable, diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 921f7bdbc..351cf47d7 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -41,6 +41,13 @@ func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, }) } +// LoadUint32 implemets futex.Target.LoadUint32. +func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { + return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + // GetSharedKey implements futex.Target.GetSharedKey. func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index 6600ddd78..e0cebef84 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -346,6 +346,7 @@ func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new if err != nil { return 0, translateIOError(ctx, err) } + // Return the number of bytes read. 
return 4, nil }) return old, err @@ -388,11 +389,55 @@ func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem. if err != nil { return 0, translateIOError(ctx, err) } + // Return the number of bytes read. return 4, nil }) return prev, err } +// LoadUint32 implements usermem.IO.LoadUint32. +func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.CheckIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + val, err := mm.as.LoadUint32(addr) + if err == nil { + return val, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var val uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + val, err = safemem.LoadUint32(im) + if err != nil { + return 0, translateIOError(ctx, err) + } + // Return the number of bytes read. + return 4, nil + }) + return val, err +} + // handleASIOFault handles a page fault at address addr for an AddressSpaceIO // operation spanning ioar. // diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index f16588e6e..a9e76bd45 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -254,6 +254,11 @@ type AddressSpaceIO interface { // // Preconditions: addr must be aligned to a 4-byte boundary. 
CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + + // LoadUint32 atomically loads the uint32 value at addr and returns it. + // + // Preconditions: addr must be aligned to a 4-byte boundary. + LoadUint32(addr usermem.Addr) (uint32, error) } // NoAddressSpaceIO implements AddressSpaceIO methods by panicing. @@ -284,6 +289,11 @@ func (NoAddressSpaceIO) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) panic("This platform does not support AddressSpaceIO") } +// LoadUint32 implements AddressSpaceIO.LoadUint32. +func (NoAddressSpaceIO) LoadUint32(addr usermem.Addr) (uint32, error) { + panic("This platform does not support AddressSpaceIO") +} + // SegmentationFault is an error returned by AddressSpaceIO methods when IO // fails due to access of an unmapped page, or a mapped page with insufficient // permissions. diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 05a6a61ae..d97a40297 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -18,9 +18,7 @@ go_library( ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy", visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/syserror", - ], + deps = ["//pkg/syserror"], ) go_test( diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s index 873ffa046..f90b4bfd1 100644 --- a/pkg/sentry/platform/safecopy/atomic_amd64.s +++ b/pkg/sentry/platform/safecopy/atomic_amd64.s @@ -106,3 +106,31 @@ TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 CMPXCHGL DX, 0(DI) MOVL AX, prev+16(FP) RET + +// handleLoadUint32Fault returns the value stored in DI. Control is transferred +// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as loadUint32 so that it can undo +// any potential call frame set up by the assembler. 
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 + MOVL DI, sig+12(FP) + RET + +// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS +// signal is received, the value returned is unspecified, and sig is the number +// of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) +TEXT ·loadUint32(SB), NOSPLIT, $0-16 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleLoadUint32Fault will store a different value in this address. + MOVL $0, sig+12(FP) + + MOVQ addr+0(FP), AX + MOVL (AX), BX + MOVL BX, val+8(FP) + RET diff --git a/pkg/sentry/platform/safecopy/atomic_arm64.s b/pkg/sentry/platform/safecopy/atomic_arm64.s index 554a5c1e1..d58ed71f7 100644 --- a/pkg/sentry/platform/safecopy/atomic_arm64.s +++ b/pkg/sentry/platform/safecopy/atomic_arm64.s @@ -96,3 +96,31 @@ again: done: MOVW R3, prev+16(FP) RET + +// handleLoadUint32Fault returns the value stored in DI. Control is transferred +// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as loadUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 + MOVW R1, sig+12(FP) + RET + +// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS +// signal is received, the value returned is unspecified, and sig is the number +// of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) +TEXT ·loadUint32(SB), NOSPLIT, $0-16 + // Store 0 as the returned signal number. 
If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleLoadUint32Fault will store a different value in this address. + MOVW $0, sig+12(FP) + + MOVD addr+0(FP), R0 + LDARW (R0), R1 + MOVW R1, val+8(FP) + RET diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go index c60f73103..69c66a3b7 100644 --- a/pkg/sentry/platform/safecopy/safecopy.go +++ b/pkg/sentry/platform/safecopy/safecopy.go @@ -75,6 +75,8 @@ var ( swapUint64End uintptr compareAndSwapUint32Begin uintptr compareAndSwapUint32End uintptr + loadUint32Begin uintptr + loadUint32End uintptr // savedSigSegVHandler is a pointer to the SIGSEGV handler that was // configured before we replaced it with our own. We still call into it @@ -119,6 +121,8 @@ func initializeAddresses() { swapUint64End = FindEndAddress(swapUint64Begin) compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer() compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin) + loadUint32Begin = reflect.ValueOf(loadUint32).Pointer() + loadUint32End = FindEndAddress(loadUint32Begin) } func init() { diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go index e78a6714e..f84527484 100644 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go @@ -79,6 +79,14 @@ func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) //go:noescape func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) +// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It +// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) + // CopyIn copies len(dst) bytes from src to dst. 
It returns the number of bytes // copied and an error if SIGSEGV or SIGBUS is received while reading from src. func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { @@ -260,6 +268,18 @@ func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { return prev, errorFromFaultSignal(ptr, sig) } +// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It +// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +func LoadUint32(ptr unsafe.Pointer) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + val, sig := loadUint32(ptr) + return val, errorFromFaultSignal(ptr, sig) +} + func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { switch sig { case 0: diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s index 06614f1b4..db7701a29 100644 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s @@ -101,6 +101,15 @@ not_swapuint64: JMP handle_fault not_casuint32: + CMPQ CX, ·loadUint32Begin(SB) + JB not_loaduint32 + CMPQ CX, ·loadUint32End(SB) + JAE not_loaduint32 + + LEAQ handleLoadUint32Fault(SB), CX + JMP handle_fault + +not_loaduint32: original_handler: // Jump to the previous signal handler, which is likely the golang one. 
XORQ CX, CX diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s index 5e8e193e7..cdfca8207 100644 --- a/pkg/sentry/platform/safecopy/sighandler_arm64.s +++ b/pkg/sentry/platform/safecopy/sighandler_arm64.s @@ -110,6 +110,17 @@ not_swapuint64: B handle_fault not_casuint32: + MOVD ·loadUint32Begin(SB), R8 + CMP R8, R7 + BLO not_loaduint32 + MOVD ·loadUint32End(SB), R8 + CMP R8, R7 + BHS not_loaduint32 + + MOVD $handleLoadUint32Fault(SB), R7 + B handle_fault + +not_loaduint32: original_handler: // Jump to the previous signal handler, which is likely the golang one. MOVD ·savedSigBusHandler(SB), R7 diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go index e91ff66ae..c3a9780d2 100644 --- a/pkg/sentry/safemem/block_unsafe.go +++ b/pkg/sentry/safemem/block_unsafe.go @@ -267,3 +267,13 @@ func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { } return safecopy.CompareAndSwapUint32(b.start, old, new) } + +// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b. +// +// Preconditions: b.Len() >= 4. +func LoadUint32(b Block) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.LoadUint32(b.start) +} diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index 7a1d396ec..f0c89cba4 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -124,6 +124,46 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add return 0, kernel.ERESTART_RESTARTBLOCK } +func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error { + w := t.FutexWaiter() + locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false) + if err != nil { + return err + } + if locked { + // Futex acquired, we're done! 
+ return nil + } + + if forever { + err = t.Block(w.C) + } else { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) + timer.Swap(ktime.Setting{ + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + err = t.BlockWithTimer(w.C, tchan) + timer.Destroy() + } + + t.Futex().WaitComplete(w) + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error { + w := t.FutexWaiter() + locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true) + if err != nil { + return err + } + if !locked { + return syserror.EWOULDBLOCK + } + return nil +} + // Futex implements linux syscall futex(2). // It provides a method for a program to wait for a value at a given address to // change, and a method to wake up anyone waiting on a particular address. @@ -144,7 +184,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: // WAIT{_BITSET} wait forever if the timeout isn't passed. - forever := timeout == 0 + forever := (timeout == 0) var timespec linux.Timespec if !forever { @@ -205,8 +245,30 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) return uintptr(n), nil, err - case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: - // We don't support any priority inversion futexes. 
+ case linux.FUTEX_LOCK_PI: + forever := (timeout == 0) + + var timespec linux.Timespec + if !forever { + var err error + timespec, err = copyTimespecIn(t, timeout) + if err != nil { + return 0, nil, err + } + } + err := futexLockPI(t, timespec, forever, addr, private) + return 0, nil, err + + case linux.FUTEX_TRYLOCK_PI: + err := tryLockPI(t, addr, private) + return 0, nil, err + + case linux.FUTEX_UNLOCK_PI: + err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private) + return 0, nil, err + + case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: + t.Kernel().EmitUnimplementedEvent(t) return 0, nil, syserror.ENOSYS default: diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go index 8bdf3a508..7add8bc82 100644 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ b/pkg/sentry/usermem/bytes_io_unsafe.go @@ -37,3 +37,11 @@ func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new } return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil } + +// LoadUint32 implements IO.LoadUint32. +func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) { + if _, err := b.rangeCheck(addr, 4); err != nil { + return 0, err + } + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil +} diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 75ac4d22d..c3c9c153b 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -103,6 +103,13 @@ type IO interface { // any following locks in the lock order. addr must be aligned to a 4-byte // boundary. CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) + + // LoadUint32 atomically loads the uint32 value at addr and returns it. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. 
addr must be aligned to a 4-byte + // boundary. + LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) } // IOOpts contains options applicable to all IO methods. diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 4228707f4..5558cccff 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -33,6 +33,7 @@ var ( ECHILD = error(syscall.ECHILD) ECONNREFUSED = error(syscall.ECONNREFUSED) ECONNRESET = error(syscall.ECONNRESET) + EDEADLK = error(syscall.EDEADLK) EEXIST = error(syscall.EEXIST) EFAULT = error(syscall.EFAULT) EFBIG = error(syscall.EFBIG) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 37d0c31fd..b3499bcde 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -99,8 +99,8 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { // args: cmd, ... tr = newArgsTracker(0) - case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL: - // args: fd, cmd, ... + case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX: + // args: fd/addr, cmd, ... 
tr = newArgsTracker(1) case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT: diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 4c818238b..2c214925e 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -808,6 +808,7 @@ cc_binary( "//test/util:cleanup", "//test/util:file_descriptor", "//test/util:memory_util", + "//test/util:save_util", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc index 6fa284013..35933b660 100644 --- a/test/syscalls/linux/futex.cc +++ b/test/syscalls/linux/futex.cc @@ -32,6 +32,7 @@ #include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/memory_util.h" +#include "test/util/save_util.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -118,6 +119,30 @@ int futex_wake_op(bool priv, std::atomic* uaddr1, std::atomic* uaddr2, return syscall(SYS_futex, uaddr1, op, nwake1, nwake2, uaddr2, sub_op); } +int futex_lock_pi(bool priv, std::atomic* uaddr) { + int op = FUTEX_LOCK_PI; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr); +} + +int futex_trylock_pi(bool priv, std::atomic* uaddr) { + int op = FUTEX_TRYLOCK_PI; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr); +} + +int futex_unlock_pi(bool priv, std::atomic* uaddr) { + int op = FUTEX_UNLOCK_PI; + if (priv) { + op |= FUTEX_PRIVATE_FLAG; + } + return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr); +} + // Fixture for futex tests parameterized by whether to use private or shared // futexes. 
class PrivateAndSharedFutexTest : public ::testing::TestWithParam { @@ -589,7 +614,95 @@ TEST(SharedFutexTest, WakeInterprocessFile_NoRandomSave) { << " status " << status; } -} // namespace +TEST_P(PrivateAndSharedFutexTest, PIBasic) { + std::atomic a = ATOMIC_VAR_INIT(0); + + ASSERT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallSucceeds()); + EXPECT_EQ(a.load(), gettid()); + EXPECT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EDEADLK)); + ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds()); + EXPECT_EQ(a.load(), 0); + EXPECT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EPERM)); +} + +TEST_P(PrivateAndSharedFutexTest, PIConcurrency_NoRandomSave) { + DisableSave ds; // Too many syscalls. + + std::atomic a = ATOMIC_VAR_INIT(0); + const bool is_priv = IsPrivate(); + + std::unique_ptr threads[100]; + for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) { + threads[i] = absl::make_unique([is_priv, &a] { + for (size_t j = 0; j < 10; ++j) { + ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds()); + EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid()); + SleepSafe(absl::Milliseconds(5)); + ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds()); + } + }); + } +} + +TEST_P(PrivateAndSharedFutexTest, PIWaiters) { + std::atomic a = ATOMIC_VAR_INIT(0); + const bool is_priv = IsPrivate(); + + ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds()); + EXPECT_EQ(a.load(), gettid()); + + ScopedThread th([is_priv, &a] { + ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds()); + ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds()); + }); + + // Wait until the thread blocks on the futex, setting the waiters bit. 
+ auto start = absl::Now(); + while (a.load() != (FUTEX_WAITERS | gettid())) { + ASSERT_LT(absl::Now() - start, absl::Seconds(5)); + absl::SleepFor(absl::Milliseconds(100)); + } + ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds()); +} + +TEST_P(PrivateAndSharedFutexTest, PITryLock) { + std::atomic a = ATOMIC_VAR_INIT(0); + const bool is_priv = IsPrivate(); + + ASSERT_THAT(futex_trylock_pi(IsPrivate(), &a), SyscallSucceeds()); + EXPECT_EQ(a.load(), gettid()); + + EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EDEADLK)); + ScopedThread th([is_priv, &a] { + EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EAGAIN)); + }); + th.Join(); + + ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds()); +} + +TEST_P(PrivateAndSharedFutexTest, PITryLockConcurrency_NoRandomSave) { + DisableSave ds; // Too many syscalls. + + std::atomic a = ATOMIC_VAR_INIT(0); + const bool is_priv = IsPrivate(); + + std::unique_ptr threads[100]; + for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) { + threads[i] = absl::make_unique([is_priv, &a] { + for (size_t j = 0; j < 10;) { + if (futex_trylock_pi(is_priv, &a) >= 0) { + ++j; + EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid()); + SleepSafe(absl::Milliseconds(5)); + ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds()); + } + } + }); + } +} + +} // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From fbacb350391667fa9ffb78a84ae51a37d477aa02 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 6 Mar 2019 15:05:43 -0800 Subject: No need to check for negative uintptr. 
Fixes #134 PiperOrigin-RevId: 237128306 Change-Id: I396e808484c18931fc5775970ec1f5ae231e1cb9 --- pkg/sentry/fs/host/util_unsafe.go | 3 --- 1 file changed, 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go index d00da89d6..a8721d197 100644 --- a/pkg/sentry/fs/host/util_unsafe.go +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -57,9 +57,6 @@ func readLink(fd int) (string, error) { uintptr(unsafe.Pointer(&b[0])), uintptr(l), 0, 0) - if n < 0 { - n = 0 - } if errno != 0 { return "", errno } -- cgit v1.2.3 From 56a61282953b46c8f8b707d5948a2d3958dced0c Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 8 Mar 2019 15:48:16 -0800 Subject: Implement IP_MULTICAST_LOOP. IP_MULTICAST_LOOP controls whether or not multicast packets sent on the default route are looped back. In order to implement this switch, support for sending and looping back multicast packets on the default route had to be implemented. For now we only support IPv4 multicast. 
PiperOrigin-RevId: 237534603 Change-Id: I490ac7ff8e8ebef417c7eb049a919c29d156ac1c --- pkg/sentry/socket/epsocket/epsocket.go | 37 +- pkg/syserr/netstack.go | 2 + pkg/tcpip/network/arp/arp.go | 2 +- pkg/tcpip/network/ip_test.go | 8 +- pkg/tcpip/network/ipv4/ipv4.go | 15 +- pkg/tcpip/network/ipv6/icmp_test.go | 2 +- pkg/tcpip/network/ipv6/ipv6.go | 15 +- pkg/tcpip/stack/nic.go | 18 +- pkg/tcpip/stack/registration.go | 14 +- pkg/tcpip/stack/route.go | 12 +- pkg/tcpip/stack/stack.go | 24 +- pkg/tcpip/stack/stack_test.go | 40 +- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 5 + pkg/tcpip/transport/icmp/endpoint.go | 4 +- pkg/tcpip/transport/icmp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/endpoint_state.go | 1 + pkg/tcpip/transport/udp/BUILD | 1 + pkg/tcpip/transport/udp/endpoint.go | 43 +- pkg/tcpip/transport/udp/endpoint_state.go | 2 +- runsc/boot/network.go | 16 +- test/syscalls/linux/socket_ipv4_udp_unbound.cc | 451 +++++++++++++++++++-- .../socket_ipv4_udp_unbound_external_networking.cc | 332 +++++++++++++++ 24 files changed, 946 insertions(+), 104 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 4e547ea33..f7636e056 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -911,6 +911,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac } return rv.InetMulticastRequest, nil + case linux.IP_MULTICAST_LOOP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.MulticastLoopOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if v { + return int32(1), nil + } + return int32(0), nil + default: emitUnimplementedEventIP(t, name) } @@ -1178,6 +1193,15 @@ func copyInMulticastRequest(optVal []byte) (linux.InetMulticastRequestWithNIC, * return req, nil } +// 
reduceToByte ORs all of the bytes in the input. +func reduceToByte(buf []byte) byte { + var out byte + for _, b := range buf { + out |= b + } + return out +} + // setSockOptIP implements SetSockOpt when level is SOL_IP. func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { @@ -1235,6 +1259,18 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]), })) + case linux.IP_MULTICAST_LOOP: + if len(optVal) < 1 { + return syserr.ErrInvalidArgument + } + if len(optVal) > sizeOfInt32 { + optVal = optVal[:sizeOfInt32] + } + + return syserr.TranslateNetstackError(ep.SetSockOpt( + tcpip.MulticastLoopOption(reduceToByte(optVal) != 0), + )) + case linux.MCAST_JOIN_GROUP: // FIXME: Implement MCAST_JOIN_GROUP. t.Kernel().EmitUnimplementedEvent(t) @@ -1252,7 +1288,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s linux.IP_MSFILTER, linux.IP_MTU_DISCOVER, linux.IP_MULTICAST_ALL, - linux.IP_MULTICAST_LOOP, linux.IP_NODEFRAG, linux.IP_OPTIONS, linux.IP_PASSSEC, diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 05ca475d1..c5a628c7d 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -23,6 +23,7 @@ import ( var ( ErrUnknownProtocol = New(tcpip.ErrUnknownProtocol.String(), linux.EINVAL) ErrUnknownNICID = New(tcpip.ErrUnknownNICID.String(), linux.EINVAL) + ErrUnknownDevice = New(tcpip.ErrUnknownDevice.String(), linux.ENODEV) ErrUnknownProtocolOption = New(tcpip.ErrUnknownProtocolOption.String(), linux.ENOPROTOOPT) ErrDuplicateNICID = New(tcpip.ErrDuplicateNICID.String(), linux.EEXIST) ErrDuplicateAddress = New(tcpip.ErrDuplicateAddress.String(), linux.EEXIST) @@ -49,6 +50,7 @@ var ( var netstackErrorTranslations = map[*tcpip.Error]*Error{ tcpip.ErrUnknownProtocol: ErrUnknownProtocol, tcpip.ErrUnknownNICID: ErrUnknownNICID, + tcpip.ErrUnknownDevice: ErrUnknownDevice, 
tcpip.ErrUnknownProtocolOption: ErrUnknownProtocolOption, tcpip.ErrDuplicateNICID: ErrDuplicateNICID, tcpip.ErrDuplicateAddress: ErrDuplicateAddress, diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index ed39640c1..5ab542f2c 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -79,7 +79,7 @@ func (e *endpoint) MaxHeaderLength() uint16 { func (e *endpoint) Close() {} -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { +func (e *endpoint) WritePacket(*stack.Route, buffer.Prependable, buffer.VectorisedView, tcpip.TransportProtocolNumber, uint8, stack.PacketLooping) *tcpip.Error { return tcpip.ErrNotSupported } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 97a43aece..7eb0e697d 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -177,7 +177,7 @@ func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { NIC: 1, }}) - return s.FindRoute(1, local, remote, ipv4.ProtocolNumber) + return s.FindRoute(1, local, remote, ipv4.ProtocolNumber, false /* multicastLoop */) } func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { @@ -191,7 +191,7 @@ func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { NIC: 1, }}) - return s.FindRoute(1, local, remote, ipv6.ProtocolNumber) + return s.FindRoute(1, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */) } func TestIPv4Send(t *testing.T) { @@ -221,7 +221,7 @@ func TestIPv4Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, hdr, payload.ToVectorisedView(), 123, 123); err != nil { + if err := ep.WritePacket(&r, hdr, payload.ToVectorisedView(), 123, 123, stack.PacketOut); err != nil { t.Fatalf("WritePacket failed: %v", err) } } @@ -450,7 +450,7 @@ func TestIPv6Send(t *testing.T) { 
if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, hdr, payload.ToVectorisedView(), 123, 123); err != nil { + if err := ep.WritePacket(&r, hdr, payload.ToVectorisedView(), 123, 123, stack.PacketOut); err != nil { t.Fatalf("WritePacket failed: %v", err) } } diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index bfc3c08fa..545684032 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -104,7 +104,7 @@ func (e *endpoint) MaxHeaderLength() uint16 { } // WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { +func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop stack.PacketLooping) *tcpip.Error { ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) length := uint16(hdr.UsedLength() + payload.Size()) id := uint32(0) @@ -123,8 +123,19 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload b DstAddr: r.RemoteAddress, }) ip.SetChecksum(^ip.CalculateChecksum()) - r.Stats().IP.PacketsSent.Increment() + if loop&stack.PacketLoop != 0 { + views := make([]buffer.View, 1, 1+len(payload.Views())) + views[0] = hdr.View() + views = append(views, payload.Views()...) 
+ vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) + e.HandlePacket(r, vv) + } + if loop&stack.PacketOut == 0 { + return nil + } + + r.Stats().IP.PacketsSent.Increment() return e.linkEP.WritePacket(r, hdr, payload, ProtocolNumber) } diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 797176243..15574bab1 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -161,7 +161,7 @@ func (c *testContext) cleanup() { func TestLinkResolution(t *testing.T) { c := newTestContext(t) defer c.cleanup() - r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber) + r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) if err != nil { t.Fatal(err) } diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 5f68ef7d5..df3b64c98 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -84,7 +84,7 @@ func (e *endpoint) MaxHeaderLength() uint16 { } // WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { +func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop stack.PacketLooping) *tcpip.Error { length := uint16(hdr.UsedLength() + payload.Size()) ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ @@ -94,8 +94,19 @@ func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload b SrcAddr: r.LocalAddress, DstAddr: r.RemoteAddress, }) - r.Stats().IP.PacketsSent.Increment() + if loop&stack.PacketLoop != 0 { + views := make([]buffer.View, 1, 1+len(payload.Views())) + views[0] = hdr.View() + views = append(views, payload.Views()...) 
+ vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) + e.HandlePacket(r, vv) + } + if loop&stack.PacketOut == 0 { + return nil + } + + r.Stats().IP.PacketsSent.Increment() return e.linkEP.WritePacket(r, hdr, payload, ProtocolNumber) } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 79f845225..14267bb48 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -28,10 +28,11 @@ import ( // NIC represents a "network interface card" to which the networking stack is // attached. type NIC struct { - stack *Stack - id tcpip.NICID - name string - linkEP LinkEndpoint + stack *Stack + id tcpip.NICID + name string + linkEP LinkEndpoint + loopback bool demux *transportDemuxer @@ -62,12 +63,13 @@ const ( NeverPrimaryEndpoint ) -func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint) *NIC { +func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC { return &NIC{ stack: stack, id: id, name: name, linkEP: ep, + loopback: loopback, demux: newTransportDemuxer(stack), primary: make(map[tcpip.NetworkProtocolNumber]*ilist.List), endpoints: make(map[NetworkEndpointID]*referencedNetworkEndpoint), @@ -407,7 +409,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr n.mu.RLock() for _, ref := range n.endpoints { if ref.protocol == header.IPv4ProtocolNumber && ref.tryIncRef() { - r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* multicastLoop */) r.RemoteLinkAddress = remote ref.ep.HandlePacket(&r, vv) ref.decRef() @@ -418,7 +420,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr } if ref := n.getRef(protocol, dst); ref != nil { - r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* multicastLoop */) r.RemoteLinkAddress = remote ref.ep.HandlePacket(&r, vv) 
ref.decRef() @@ -430,7 +432,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr // // TODO: Should we be forwarding the packet even if promiscuous? if n.stack.Forwarding() { - r, err := n.stack.FindRoute(0, "", dst, protocol) + r, err := n.stack.FindRoute(0, "", dst, protocol, false /* multicastLoop */) if err != nil { n.stack.stats.IP.InvalidAddressesReceived.Increment() return diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 62acd5919..cf4d52fe9 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -125,6 +125,18 @@ type TransportDispatcher interface { DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) } +// PacketLooping specifies where an outbound packet should be sent. +type PacketLooping byte + +const ( + // PacketOut indicates that the packet should be passed to the link + // endpoint. + PacketOut PacketLooping = 1 << iota + + // PacketLoop indicates that the packet should be handled locally. + PacketLoop +) + // NetworkEndpoint is the interface that needs to be implemented by endpoints // of network layer protocols (e.g., ipv4, ipv6). type NetworkEndpoint interface { @@ -149,7 +161,7 @@ type NetworkEndpoint interface { // WritePacket writes a packet to the given destination address and // protocol. - WritePacket(r *Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error + WritePacket(r *Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop PacketLooping) *tcpip.Error // ID returns the network protocol endpoint ID. 
ID() *NetworkEndpointID diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 2b4185014..c9603ad5e 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -46,17 +46,20 @@ type Route struct { // ref a reference to the network endpoint through which the route // starts. ref *referencedNetworkEndpoint + + multicastLoop bool } // makeRoute initializes a new route. It takes ownership of the provided // reference to a network endpoint. -func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint) Route { +func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, multicastLoop bool) Route { return Route{ NetProto: netProto, LocalAddress: localAddr, LocalLinkAddress: localLinkAddr, RemoteAddress: remoteAddr, ref: ref, + multicastLoop: multicastLoop, } } @@ -134,7 +137,12 @@ func (r *Route) IsResolutionRequired() bool { // WritePacket writes the packet through the given route. func (r *Route) WritePacket(hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { - err := r.ref.ep.WritePacket(r, hdr, payload, protocol, ttl) + loop := PacketOut + if r.multicastLoop && (header.IsV4MulticastAddress(r.RemoteAddress) || header.IsV6MulticastAddress(r.RemoteAddress)) { + loop |= PacketLoop + } + + err := r.ref.ep.WritePacket(r, hdr, payload, protocol, ttl, loop) if err == tcpip.ErrNoRoute { r.Stats().IP.OutgoingPacketErrors.Increment() } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index cfda7ec3c..047b704e0 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -513,7 +513,7 @@ func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network // createNIC creates a NIC with the provided id and link-layer endpoint, and // optionally enable it. 
-func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, enabled bool) *tcpip.Error { +func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, enabled, loopback bool) *tcpip.Error { ep := FindLinkEndpoint(linkEP) if ep == nil { return tcpip.ErrBadLinkEndpoint @@ -527,7 +527,7 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpoint return tcpip.ErrDuplicateNICID } - n := newNIC(s, id, name, ep) + n := newNIC(s, id, name, ep, loopback) s.nics[id] = n if enabled { @@ -539,26 +539,32 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpoint // CreateNIC creates a NIC with the provided id and link-layer endpoint. func (s *Stack) CreateNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, "", linkEP, true) + return s.createNIC(id, "", linkEP, true, false) } // CreateNamedNIC creates a NIC with the provided id and link-layer endpoint, // and a human-readable name. func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, name, linkEP, true) + return s.createNIC(id, name, linkEP, true, false) +} + +// CreateNamedLoopbackNIC creates a NIC with the provided id and link-layer +// endpoint, and a human-readable name. +func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { + return s.createNIC(id, name, linkEP, true, true) } // CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint, // but leave it disable. Stack.EnableNIC must be called before the link-layer // endpoint starts delivering packets to it. func (s *Stack) CreateDisabledNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, "", linkEP, false) + return s.createNIC(id, "", linkEP, false, false) } // CreateDisabledNamedNIC is a combination of CreateNamedNIC and // CreateDisabledNIC. 
func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, name, linkEP, false) + return s.createNIC(id, name, linkEP, false, false) } // EnableNIC enables the given NIC so that the link-layer endpoint can start @@ -748,7 +754,7 @@ func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.Netwo // FindRoute creates a route to the given destination address, leaving through // the given nic and local address (if provided). -func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (Route, *tcpip.Error) { +func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() @@ -758,7 +764,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n if id != 0 && !needRoute { if nic, ok := s.nics[id]; ok { if ref := s.getRefEP(nic, localAddr, netProto); ref != nil { - return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref), nil + return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, multicastLoop && !nic.loopback), nil } } } else { @@ -774,7 +780,7 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n remoteAddr = ref.ep.ID().LocalAddress } - r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref) + r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, multicastLoop && !nic.loopback) if needRoute { r.NextHop = route.Gateway } diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index aba1e984c..b366de21d 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -112,7 +112,7 @@ func (f *fakeNetworkEndpoint) Capabilities() 
stack.LinkEndpointCapabilities { return f.linkEP.Capabilities() } -func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, _ uint8) *tcpip.Error { +func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, _ uint8, loop stack.PacketLooping) *tcpip.Error { // Increment the sent packet count in the protocol descriptor. f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++ @@ -122,6 +122,18 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, hdr buffer.Prependable b[0] = r.RemoteAddress[0] b[1] = f.id.LocalAddress[0] b[2] = byte(protocol) + + if loop&stack.PacketLoop != 0 { + views := make([]buffer.View, 1, 1+len(payload.Views())) + views[0] = hdr.View() + views = append(views, payload.Views()...) + vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) + f.HandlePacket(r, vv) + } + if loop&stack.PacketOut == 0 { + return nil + } + return f.linkEP.WritePacket(r, hdr, payload, fakeNetNumber) } @@ -262,7 +274,7 @@ func TestNetworkReceive(t *testing.T) { } func sendTo(t *testing.T, s *stack.Stack, addr tcpip.Address) { - r, err := s.FindRoute(0, "", addr, fakeNetNumber) + r, err := s.FindRoute(0, "", addr, fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute failed: %v", err) } @@ -354,7 +366,7 @@ func TestNetworkSendMultiRoute(t *testing.T) { } func testRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr, expectedSrcAddr tcpip.Address) { - r, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber) + r, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute failed: %v", err) } @@ -371,7 +383,7 @@ func testRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr, } func testNoRoute(t *testing.T, s *stack.Stack, nic 
tcpip.NICID, srcAddr, dstAddr tcpip.Address) { - _, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber) + _, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */) if err != tcpip.ErrNoRoute { t.Fatalf("FindRoute returned unexpected error, expected tcpip.ErrNoRoute, got %v", err) } @@ -514,7 +526,7 @@ func TestDelayedRemovalDueToRoute(t *testing.T) { } // Get a route, check that packet is still deliverable. - r, err := s.FindRoute(0, "", "\x02", fakeNetNumber) + r, err := s.FindRoute(0, "", "\x02", fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute failed: %v", err) } @@ -584,7 +596,7 @@ func TestPromiscuousMode(t *testing.T) { } // Check that we can't get a route as there is no local address. - _, err := s.FindRoute(0, "", "\x02", fakeNetNumber) + _, err := s.FindRoute(0, "", "\x02", fakeNetNumber, false /* multicastLoop */) if err != tcpip.ErrNoRoute { t.Fatalf("FindRoute returned unexpected status: expected %v, got %v", tcpip.ErrNoRoute, err) } @@ -622,7 +634,7 @@ func TestAddressSpoofing(t *testing.T) { // With address spoofing disabled, FindRoute does not permit an address // that was not added to the NIC to be used as the source. - r, err := s.FindRoute(0, srcAddr, dstAddr, fakeNetNumber) + r, err := s.FindRoute(0, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */) if err == nil { t.Errorf("FindRoute succeeded with route %+v when it should have failed", r) } @@ -632,7 +644,7 @@ func TestAddressSpoofing(t *testing.T) { if err := s.SetSpoofing(1, true); err != nil { t.Fatalf("SetSpoofing failed: %v", err) } - r, err = s.FindRoute(0, srcAddr, dstAddr, fakeNetNumber) + r, err = s.FindRoute(0, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute failed: %v", err) } @@ -654,14 +666,14 @@ func TestBroadcastNeedsNoRoute(t *testing.T) { s.SetRouteTable([]tcpip.Route{}) // If there is no endpoint, it won't work. 
- if _, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber); err != tcpip.ErrNetworkUnreachable { + if _, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable { t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable) } if err := s.AddAddress(1, fakeNetNumber, header.IPv4Any); err != nil { t.Fatalf("AddAddress(%v, %v) failed: %v", fakeNetNumber, header.IPv4Any, err) } - r, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber) + r, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */) if err != nil { t.Fatalf("FindRoute(1, %v, %v, %v) failed: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err) } @@ -675,7 +687,7 @@ func TestBroadcastNeedsNoRoute(t *testing.T) { } // If the NIC doesn't exist, it won't work. - if _, err := s.FindRoute(2, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber); err != tcpip.ErrNetworkUnreachable { + if _, err := s.FindRoute(2, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable { t.Fatalf("got FindRoute(2, %v, %v, %v) = %v want = %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable) } } @@ -738,7 +750,7 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { } // If there is no endpoint, it won't work. 
- if _, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber); err != want { + if _, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); err != want { t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", anyAddr, tc.address, fakeNetNumber, err, want) } @@ -746,7 +758,7 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { t.Fatalf("AddAddress(%v, %v) failed: %v", fakeNetNumber, anyAddr, err) } - if r, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber); tc.routeNeeded { + if r, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); tc.routeNeeded { // Route table is empty but we need a route, this should cause an error. if err != tcpip.ErrNoRoute { t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", anyAddr, tc.address, fakeNetNumber, err, tcpip.ErrNoRoute) @@ -763,7 +775,7 @@ func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) { } } // If the NIC doesn't exist, it won't work. - if _, err := s.FindRoute(2, anyAddr, tc.address, fakeNetNumber); err != want { + if _, err := s.FindRoute(2, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); err != want { t.Fatalf("got FindRoute(2, %v, %v, %v) = %v want = %v", anyAddr, tc.address, fakeNetNumber, err, want) } }) diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index a9e844e3d..279ab3c56 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -103,7 +103,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { f.peerAddr = addr.Addr // Find the route. 
- r, err := f.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber) + r, err := f.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber, false /* multicastLoop */) if err != nil { return tcpip.ErrNoRoute } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 7010d1b68..825854148 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -68,6 +68,7 @@ func (e *Error) IgnoreStats() bool { var ( ErrUnknownProtocol = &Error{msg: "unknown protocol"} ErrUnknownNICID = &Error{msg: "unknown nic id"} + ErrUnknownDevice = &Error{msg: "unknown device"} ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"} ErrDuplicateNICID = &Error{msg: "duplicate nic id"} ErrDuplicateAddress = &Error{msg: "duplicate address"} @@ -477,6 +478,10 @@ type MulticastInterfaceOption struct { InterfaceAddr Address } +// MulticastLoopOption is used by SetSockOpt/GetSockOpt to specify whether +// multicast packets sent over a non-loopback interface will be looped back. +type MulticastLoopOption bool + // MembershipOption is used by SetSockOpt/GetSockOpt as an argument to // AddMembershipOption and RemoveMembershipOption. type MembershipOption struct { diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 05c4b532a..d876005fe 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -277,7 +277,7 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c } // Find the enpoint. - r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto) + r, err := e.stack.FindRoute(nicid, e.bindAddr, to.Addr, netProto, false /* multicastLoop */) if err != nil { return 0, nil, err } @@ -471,7 +471,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { } // Find a route to the desired destination. 
- r, err := e.stack.FindRoute(nicid, e.bindAddr, addr.Addr, netProto) + r, err := e.stack.FindRoute(nicid, e.bindAddr, addr.Addr, netProto, false /* multicastLoop */) if err != nil { return err } diff --git a/pkg/tcpip/transport/icmp/endpoint_state.go b/pkg/tcpip/transport/icmp/endpoint_state.go index 21008d089..8a7909246 100644 --- a/pkg/tcpip/transport/icmp/endpoint_state.go +++ b/pkg/tcpip/transport/icmp/endpoint_state.go @@ -71,7 +71,7 @@ func (e *endpoint) afterLoad() { var err *tcpip.Error if e.state == stateConnected { - e.route, err = e.stack.FindRoute(e.regNICID, e.bindAddr, e.id.RemoteAddress, e.netProto) + e.route, err = e.stack.FindRoute(e.regNICID, e.bindAddr, e.id.RemoteAddress, e.netProto, false /* multicastLoop */) if err != nil { panic(*err) } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index ae99f0f8e..fc4f82402 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1091,7 +1091,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er } // Find a route to the desired destination. 
- r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, addr.Addr, netProto) + r, err := e.stack.FindRoute(nicid, e.id.LocalAddress, addr.Addr, netProto, false /* multicastLoop */) if err != nil { return err } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 87e988afa..a42e09b8c 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -307,6 +307,7 @@ func loadError(s string) *tcpip.Error { var errors = []*tcpip.Error{ tcpip.ErrUnknownProtocol, tcpip.ErrUnknownNICID, + tcpip.ErrUnknownDevice, tcpip.ErrUnknownProtocolOption, tcpip.ErrDuplicateNICID, tcpip.ErrDuplicateAddress, diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 8ccb79c48..d271490c1 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -27,6 +27,7 @@ go_library( imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ + "//pkg/log", "//pkg/sleep", "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 4108cb09c..3693abae5 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -81,6 +81,7 @@ type endpoint struct { multicastTTL uint8 multicastAddr tcpip.Address multicastNICID tcpip.NICID + multicastLoop bool reusePort bool broadcast bool @@ -124,6 +125,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite // // Linux defaults to TTL=1. multicastTTL: 1, + multicastLoop: true, rcvBufSizeMax: 32 * 1024, sndBufSize: 32 * 1024, } @@ -274,7 +276,7 @@ func (e *endpoint) connectRoute(nicid tcpip.NICID, addr tcpip.FullAddress) (stac } // Find a route to the desired destination. 
- r, err := e.stack.FindRoute(nicid, localAddr, addr.Addr, netProto) + r, err := e.stack.FindRoute(nicid, localAddr, addr.Addr, netProto, e.multicastLoop) if err != nil { return stack.Route{}, 0, 0, err } @@ -458,13 +460,19 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { case tcpip.AddMembershipOption: nicID := v.NIC - if v.InterfaceAddr != header.IPv4Any { + if v.InterfaceAddr == header.IPv4Any { + if nicID == 0 { + r, err := e.stack.FindRoute(0, "", v.MulticastAddr, header.IPv4ProtocolNumber, false /* multicastLoop */) + if err == nil { + nicID = r.NICID() + r.Release() + } + } + } else { nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) } if nicID == 0 { - // TODO: Allow adding memberships without - // specifing an interface. - return tcpip.ErrNoRoute + return tcpip.ErrUnknownDevice } // TODO: check that v.MulticastAddr is a multicast address. @@ -479,11 +487,19 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { case tcpip.RemoveMembershipOption: nicID := v.NIC - if v.InterfaceAddr != header.IPv4Any { + if v.InterfaceAddr == header.IPv4Any { + if nicID == 0 { + r, err := e.stack.FindRoute(0, "", v.MulticastAddr, header.IPv4ProtocolNumber, false /* multicastLoop */) + if err == nil { + nicID = r.NICID() + r.Release() + } + } + } else { nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) } if nicID == 0 { - return tcpip.ErrNoRoute + return tcpip.ErrUnknownDevice } // TODO: check that v.MulticastAddr is a multicast address. 
@@ -503,6 +519,11 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { } } + case tcpip.MulticastLoopOption: + e.mu.Lock() + e.multicastLoop = bool(v) + e.mu.Unlock() + case tcpip.ReusePortOption: e.mu.Lock() e.reusePort = v != 0 @@ -578,6 +599,14 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { e.mu.Unlock() return nil + case *tcpip.MulticastLoopOption: + e.mu.RLock() + v := e.multicastLoop + e.mu.RUnlock() + + *o = tcpip.MulticastLoopOption(v) + return nil + case *tcpip.ReusePortOption: e.mu.RLock() v := e.reusePort diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index 4d8210294..b2daaf751 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -82,7 +82,7 @@ func (e *endpoint) afterLoad() { var err *tcpip.Error if e.state == stateConnected { - e.route, err = e.stack.FindRoute(e.regNICID, e.id.LocalAddress, e.id.RemoteAddress, netProto) + e.route, err = e.stack.FindRoute(e.regNICID, e.id.LocalAddress, e.id.RemoteAddress, netProto, e.multicastLoop) if err != nil { panic(*err) } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0cadf48d6..40bc147ca 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -112,7 +112,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct linkEP := loopback.New() log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) - if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil { return err } @@ -144,7 +144,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) - if err := n.createNICWithAddrs(nicID, link.Name, linkEP, 
link.Addresses); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { return err } @@ -169,9 +169,15 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. -func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP) error { - if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil { - return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP, loopback bool) error { + if loopback { + if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(linkEP)); err != nil { + return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) + } + } else { + if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil { + return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) + } } // Always start with an arp address for the NIC. 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index 2d702179e..38bc85ce9 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -61,7 +61,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNoGroup) { receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -99,7 +99,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddrNoDefaultSendIf) { receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -134,12 +134,12 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNicNoDefaultSendIf) { // Bind the second FD to the v4 any address to ensure that we can receive any // unicast packet. auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -174,7 +174,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { // Bind the first FD to the loopback. This is an alternative to // IP_MULTICAST_IF for setting the default send interface. 
auto sender_addr = V4Loopback(); - EXPECT_THAT( + ASSERT_THAT( bind(sockets->first_fd(), reinterpret_cast(&sender_addr.addr), sender_addr.addr_len), SyscallSucceeds()); @@ -182,12 +182,12 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -197,7 +197,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { ip_mreq group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -207,7 +207,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { reinterpret_cast(&receiver_addr.addr)->sin_port; char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, reinterpret_cast(&send_addr.addr), send_addr.addr_len), @@ -222,7 +222,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackAddr) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // bind and the group membership is configured by NIC ID. 
TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -230,7 +230,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { // Bind the first FD to the loopback. This is an alternative to // IP_MULTICAST_IF for setting the default send interface. auto sender_addr = V4Loopback(); - EXPECT_THAT( + ASSERT_THAT( bind(sockets->first_fd(), reinterpret_cast(&sender_addr.addr), sender_addr.addr_len), SyscallSucceeds()); @@ -238,12 +238,12 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -253,7 +253,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -263,7 +263,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { reinterpret_cast(&receiver_addr.addr)->sin_port; char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, reinterpret_cast(&send_addr.addr), send_addr.addr_len), @@ -278,7 +278,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackNic) { EXPECT_EQ(0, 
memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // IP_MULTICAST_IF, the send address is specified in sendto, and the group // membership is configured by address. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { @@ -287,19 +287,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { // Set the default send interface. ip_mreq iface = {}; iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -309,7 +309,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { ip_mreq group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -319,7 +319,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { reinterpret_cast(&receiver_addr.addr)->sin_port; char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(sendto)(sockets->first_fd(), 
send_buf, sizeof(send_buf), 0, reinterpret_cast(&send_addr.addr), send_addr.addr_len), @@ -334,7 +334,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddr) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // IP_MULTICAST_IF, the send address is specified in sendto, and the group // membership is configured by NIC ID. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { @@ -343,19 +343,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { // Set the default send interface. ip_mreqn iface = {}; iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. 
auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -365,7 +365,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -375,7 +375,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { reinterpret_cast(&receiver_addr.addr)->sin_port; char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, reinterpret_cast(&send_addr.addr), send_addr.addr_len), @@ -390,7 +390,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNic) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // IP_MULTICAST_IF, the send address is specified in connect, and the group // membership is configured by address. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { @@ -399,19 +399,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { // Set the default send interface. 
ip_mreq iface = {}; iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -421,7 +421,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { ip_mreq group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -429,7 +429,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { auto connect_addr = V4Multicast(); reinterpret_cast(&connect_addr.addr)->sin_port = reinterpret_cast(&receiver_addr.addr)->sin_port; - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(connect)(sockets->first_fd(), reinterpret_cast(&connect_addr.addr), connect_addr.addr_len), @@ -437,7 +437,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), SyscallSucceedsWithValue(sizeof(send_buf))); @@ -450,7 +450,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrConnect) { EXPECT_EQ(0, 
memcmp(send_buf, recv_buf, sizeof(send_buf))); } -// Check that multicast works when the default send interface is confgured by +// Check that multicast works when the default send interface is configured by // IP_MULTICAST_IF, the send address is specified in connect, and the group // membership is configured by NIC ID. TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { @@ -459,19 +459,19 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { // Set the default send interface. ip_mreqn iface = {}; iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface)), SyscallSucceeds()); // Bind the second FD to the v4 any address to ensure that we can receive the // multicast packet. auto receiver_addr = V4Any(); - EXPECT_THAT(bind(sockets->second_fd(), + ASSERT_THAT(bind(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -481,7 +481,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { ip_mreqn group = {}; group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); - EXPECT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)), SyscallSucceeds()); @@ -489,7 +489,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { auto connect_addr = V4Multicast(); reinterpret_cast(&connect_addr.addr)->sin_port = reinterpret_cast(&receiver_addr.addr)->sin_port; 
- EXPECT_THAT( + ASSERT_THAT( RetryEINTR(connect)(sockets->first_fd(), reinterpret_cast(&connect_addr.addr), connect_addr.addr_len), @@ -497,7 +497,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { char send_buf[200]; RandomizeBuffer(send_buf, sizeof(send_buf)); - EXPECT_THAT( + ASSERT_THAT( RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), SyscallSucceedsWithValue(sizeof(send_buf))); @@ -510,6 +510,354 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicConnect) { EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); } +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreq iface = {}; + iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. 
+ ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. 
+ auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in connect, and the group +// membership is configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. 
+ ip_mreq iface = {}; + iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto connect_addr = V4Multicast(); + reinterpret_cast(&connect_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + EXPECT_THAT( + RetryEINTR(connect)(sockets->first_fd(), + reinterpret_cast(&connect_addr.addr), + connect_addr.addr_len), + SyscallSucceeds()); + + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in connect, and the group +// membership is configured by NIC ID. 
+TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfConnect) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto connect_addr = V4Multicast(); + reinterpret_cast(&connect_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + ASSERT_THAT( + RetryEINTR(connect)(sockets->first_fd(), + reinterpret_cast(&connect_addr.addr), + connect_addr.addr_len), + SyscallSucceeds()); + + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), send_buf, sizeof(send_buf), 0), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. 
+ char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by address. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfAddrSelfNoLoop) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreq iface = {}; + iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Bind the first FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. 
+ auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast works when the default send interface is configured by +// IP_MULTICAST_IF, the send address is specified in sendto, and the group +// membership is configured by NIC ID. +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastLoopbackIfNicSelfNoLoop) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Set the default send interface. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, + &iface, sizeof(iface)), + SyscallSucceeds()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT(bind(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(sockets->first_fd(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. 
+ ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT( + RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(sockets->first_fd(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + // Check that dropping a group membership that does not exist fails. 
TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastInvalidDrop) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -668,5 +1016,26 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidAddr) { SyscallFailsWithErrno(EADDRNOTAVAIL)); } +TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupNoIf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallFailsWithErrno(ENODEV)); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupInvalidIf) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn group = {}; + group.imr_address.s_addr = inet_addr("255.255.255"); + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP, + &group, sizeof(group)), + SyscallFailsWithErrno(ENODEV)); +} + } // namespace testing } // namespace gvisor diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc index 7d561b991..8b4fc57b6 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc @@ -14,6 +14,7 @@ #include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h" +#include #include #include #include @@ -24,6 +25,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "gtest/gtest.h" +#include "test/syscalls/linux/ip_socket_test_util.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/util/test_util.h" @@ -227,5 +229,335 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendUnicastOnUnbound) { SyscallSucceedsWithValue(sizeof(kTestMsg))); } +constexpr char kMulticastAddress[] = "224.0.2.1"; + +TestAddress V4Multicast() { + TestAddress 
t("V4Multicast"); + t.addr.ss_family = AF_INET; + t.addr_len = sizeof(sockaddr_in); + reinterpret_cast(&t.addr)->sin_addr.s_addr = + inet_addr(kMulticastAddress); + return t; +} + +// Check that multicast packets won't be delivered to the sending socket with no +// set interface or group membership. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + TestSendMulticastSelfNoGroup) { + // FIXME: A group membership is not required for external + // multicast on gVisor. + SKIP_IF(IsRunningOnGvisor()); + + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + auto bind_addr = V4Any(); + ASSERT_THAT(bind(socket->get(), reinterpret_cast(&bind_addr.addr), + bind_addr.addr_len), + SyscallSucceeds()); + socklen_t bind_addr_len = bind_addr.addr_len; + ASSERT_THAT( + getsockname(socket->get(), reinterpret_cast(&bind_addr.addr), + &bind_addr_len), + SyscallSucceeds()); + EXPECT_EQ(bind_addr_len, bind_addr.addr_len); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&bind_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT( + RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast packets will be delivered to the sending socket without +// setting an interface. 
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelf) { + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + auto bind_addr = V4Any(); + ASSERT_THAT(bind(socket->get(), reinterpret_cast(&bind_addr.addr), + bind_addr.addr_len), + SyscallSucceeds()); + socklen_t bind_addr_len = bind_addr.addr_len; + ASSERT_THAT( + getsockname(socket->get(), reinterpret_cast(&bind_addr.addr), + &bind_addr_len), + SyscallSucceeds()); + EXPECT_EQ(bind_addr_len, bind_addr.addr_len); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + ASSERT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&bind_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast packets won't be delivered to the sending socket with no +// set interface and IP_MULTICAST_LOOP disabled. 
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + TestSendMulticastSelfLoopOff) { + auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + auto bind_addr = V4Any(); + ASSERT_THAT(bind(socket->get(), reinterpret_cast(&bind_addr.addr), + bind_addr.addr_len), + SyscallSucceeds()); + socklen_t bind_addr_len = bind_addr.addr_len; + ASSERT_THAT( + getsockname(socket->get(), reinterpret_cast(&bind_addr.addr), + &bind_addr_len), + SyscallSucceeds()); + EXPECT_EQ(bind_addr_len, bind_addr.addr_len); + + // Disable multicast looping. + EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Register to receive multicast packets. + ip_mreq group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&bind_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + EXPECT_THAT( + RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast packets won't be delivered to another socket with no +// set interface or group membership. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastNoGroup) { + // FIXME: A group membership is not required for external + // multicast on gVisor. 
+ SKIP_IF(IsRunningOnGvisor()); + + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. + auto receiver_addr = V4Any(); + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast packets will be delivered to another socket without +// setting an interface. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticast) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. 
+ auto receiver_addr = V4Any(); + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + +// Check that multicast packets won't be delivered to another socket with no +// set interface and IP_MULTICAST_LOOP disabled on the sending socket. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + TestSendMulticastSenderNoLoop) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. 
+ auto receiver_addr = V4Any(); + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Disable multicast looping on the sender. + EXPECT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + EXPECT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we did not receive the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), + MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} + +// Check that multicast packets will be delivered to the sending socket without +// setting an interface and IP_MULTICAST_LOOP disabled on the receiving socket. +TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, + TestSendMulticastReceiverNoLoop) { + auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket()); + + // Bind the second FD to the v4 any address to ensure that we can receive the + // multicast packet. 
+ auto receiver_addr = V4Any(); + ASSERT_THAT( + bind(receiver->get(), reinterpret_cast(&receiver_addr.addr), + receiver_addr.addr_len), + SyscallSucceeds()); + socklen_t receiver_addr_len = receiver_addr.addr_len; + ASSERT_THAT(getsockname(receiver->get(), + reinterpret_cast(&receiver_addr.addr), + &receiver_addr_len), + SyscallSucceeds()); + EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len); + + // Disable multicast looping on the receiver. + ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + // Register to receive multicast packets. + ip_mreqn group = {}; + group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress); + ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, + sizeof(group)), + SyscallSucceeds()); + + // Send a multicast packet. + auto send_addr = V4Multicast(); + reinterpret_cast(&send_addr.addr)->sin_port = + reinterpret_cast(&receiver_addr.addr)->sin_port; + char send_buf[200]; + RandomizeBuffer(send_buf, sizeof(send_buf)); + ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0, + reinterpret_cast(&send_addr.addr), + send_addr.addr_len), + SyscallSucceedsWithValue(sizeof(send_buf))); + + // Check that we received the multicast packet. + char recv_buf[sizeof(send_buf)] = {}; + ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0), + SyscallSucceedsWithValue(sizeof(recv_buf))); + + EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf))); +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 281092e842445cfb9ff474aae81c169954b469cb Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 8 Mar 2019 20:26:55 -0800 Subject: Make IP_MULTICAST_LOOP and IP_MULTICAST_TTL allow setting int or char. This is the correct Linux behavior, and at least PHP depends on it. 
PiperOrigin-RevId: 237565639 Change-Id: I931af09c8ed99a842cf70d22bfe0b65e330c4137 --- pkg/sentry/socket/epsocket/epsocket.go | 34 ++++++----- test/syscalls/linux/socket_ip_udp_generic.cc | 84 ++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 15 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index f7636e056..6e95fd448 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1193,24 +1193,30 @@ func copyInMulticastRequest(optVal []byte) (linux.InetMulticastRequestWithNIC, * return req, nil } -// reduceToByte ORs all of the bytes in the input. -func reduceToByte(buf []byte) byte { - var out byte - for _, b := range buf { - out |= b +// parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf. +// +// net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options. +func parseIntOrChar(buf []byte) (int32, *syserr.Error) { + if len(buf) == 0 { + return 0, syserr.ErrInvalidArgument + } + + if len(buf) >= sizeOfInt32 { + return int32(usermem.ByteOrder.Uint32(buf)), nil } - return out + + return int32(buf[0]), nil } // setSockOptIP implements SetSockOpt when level is SOL_IP. func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.IP_MULTICAST_TTL: - if len(optVal) < sizeOfInt32 { - return syserr.ErrInvalidArgument + v, err := parseIntOrChar(optVal) + if err != nil { + return err } - v := int32(usermem.ByteOrder.Uint32(optVal)) if v == -1 { // Linux translates -1 to 1. 
v = 1 @@ -1260,15 +1266,13 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s })) case linux.IP_MULTICAST_LOOP: - if len(optVal) < 1 { - return syserr.ErrInvalidArgument - } - if len(optVal) > sizeOfInt32 { - optVal = optVal[:sizeOfInt32] + v, err := parseIntOrChar(optVal) + if err != nil { + return err } return syserr.TranslateNetstackError(ep.SetSockOpt( - tcpip.MulticastLoopOption(reduceToByte(optVal) != 0), + tcpip.MulticastLoopOption(v != 0), )) case linux.MCAST_JOIN_GROUP: diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index 197783e55..432017b12 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -117,6 +117,23 @@ TEST_P(UDPSocketPairTest, SetUDPMulticastTTLAboveMax) { SyscallFailsWithErrno(EINVAL)); } +TEST_P(UDPSocketPairTest, SetUDPMulticastTTLChar) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr char kArbitrary = 6; + EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &kArbitrary, sizeof(kArbitrary)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kArbitrary); +} + TEST_P(UDPSocketPairTest, SetEmptyIPAddMembership) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -126,5 +143,72 @@ TEST_P(UDPSocketPairTest, SetEmptyIPAddMembership) { SyscallFailsWithErrno(EINVAL)); } +TEST_P(UDPSocketPairTest, MulticastLoopDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); +} + 
+TEST_P(UDPSocketPairTest, SetMulticastLoop) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOff, sizeof(kSockOptOff)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOn, sizeof(kSockOptOn)), + SyscallSucceeds()); + + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); +} + +TEST_P(UDPSocketPairTest, SetMulticastLoopChar) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + constexpr char kSockOptOnChar = kSockOptOn; + constexpr char kSockOptOffChar = kSockOptOff; + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOffChar, sizeof(kSockOptOffChar)), + SyscallSucceeds()); + + int get = -1; + socklen_t get_len = sizeof(get); + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOff); + + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &kSockOptOnChar, sizeof(kSockOptOnChar)), + SyscallSucceeds()); + + EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP, + &get, &get_len), + SyscallSucceedsWithValue(0)); + EXPECT_EQ(get_len, sizeof(get)); + EXPECT_EQ(get, kSockOptOn); +} + } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 71d53382bfb3a6f05e90e31df8f39d22c0131040 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Sat, 9 Mar 2019 11:39:41 -0800 Subject: Fix 
getsockopt(IP_MULTICAST_IF). getsockopt(IP_MULTICAST_IF) only supports struct in_addr. Also adds support for setsockopt(IP_MULTICAST_IF) with struct in_addr. PiperOrigin-RevId: 237620230 Change-Id: I75e7b5b3e08972164eb1906f43ddd67aedffc27c --- pkg/sentry/socket/epsocket/epsocket.go | 47 ++++--- test/syscalls/linux/socket_ipv4_udp_unbound.cc | 174 ++++++++++++++++++++++++- 2 files changed, 199 insertions(+), 22 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 6e95fd448..468e65373 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -888,7 +888,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac return int32(v), nil case linux.IP_MULTICAST_IF: - if outLen < inetMulticastRequestSize { + if outLen < len(linux.InetAddr{}) { return nil, syserr.ErrInvalidArgument } @@ -899,17 +899,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) - rv := linux.InetMulticastRequestWithNIC{ - linux.InetMulticastRequest{ - InterfaceAddr: a.(linux.SockAddrInet).Addr, - }, - int32(v.NIC), - } - - if outLen >= inetMulticastRequestWithNICSize { - return rv, nil - } - return rv.InetMulticastRequest, nil + return a.(linux.SockAddrInet).Addr, nil case linux.IP_MULTICAST_LOOP: if outLen < sizeOfInt32 { @@ -1179,17 +1169,34 @@ var ( inetMulticastRequestWithNICSize = int(binary.Size(linux.InetMulticastRequestWithNIC{})) ) -func copyInMulticastRequest(optVal []byte) (linux.InetMulticastRequestWithNIC, *syserr.Error) { - if len(optVal) < inetMulticastRequestSize { +// copyInMulticastRequest copies in a variable-size multicast request. The +// kernel determines which structure was passed by its length. 
IP_MULTICAST_IF +// supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and +// IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this, +// allowAddr controls whether in_addr is accepted or rejected. +func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) { + if len(optVal) < len(linux.InetAddr{}) { return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument } - var req linux.InetMulticastRequestWithNIC + if len(optVal) < inetMulticastRequestSize { + if !allowAddr { + return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument + } + + var req linux.InetMulticastRequestWithNIC + copy(req.InterfaceAddr[:], optVal) + return req, nil + } + if len(optVal) >= inetMulticastRequestWithNICSize { + var req linux.InetMulticastRequestWithNIC binary.Unmarshal(optVal[:inetMulticastRequestWithNICSize], usermem.ByteOrder, &req) - } else { - binary.Unmarshal(optVal[:inetMulticastRequestSize], usermem.ByteOrder, &req.InetMulticastRequest) + return req, nil } + + var req linux.InetMulticastRequestWithNIC + binary.Unmarshal(optVal[:inetMulticastRequestSize], usermem.ByteOrder, &req.InetMulticastRequest) return req, nil } @@ -1227,7 +1234,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastTTLOption(v))) case linux.IP_ADD_MEMBERSHIP: - req, err := copyInMulticastRequest(optVal) + req, err := copyInMulticastRequest(optVal, false /* allowAddr */) if err != nil { return err } @@ -1241,7 +1248,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s })) case linux.IP_DROP_MEMBERSHIP: - req, err := copyInMulticastRequest(optVal) + req, err := copyInMulticastRequest(optVal, false /* allowAddr */) if err != nil { return err } @@ -1255,7 +1262,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s })) case linux.IP_MULTICAST_IF: - req, 
err := copyInMulticastRequest(optVal) + req, err := copyInMulticastRequest(optVal, true /* allowAddr */) if err != nil { return err } diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index 38bc85ce9..c99958ed5 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -893,7 +893,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropAddr) { receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -951,7 +951,7 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastDropNic) { receiver_addr.addr_len), SyscallSucceeds()); socklen_t receiver_addr_len = receiver_addr.addr_len; - EXPECT_THAT(getsockname(sockets->second_fd(), + ASSERT_THAT(getsockname(sockets->second_fd(), reinterpret_cast(&receiver_addr.addr), &receiver_addr_len), SyscallSucceeds()); @@ -1016,6 +1016,176 @@ TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfInvalidAddr) { SyscallFailsWithErrno(EADDRNOTAVAIL)); } +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetShort) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + // Create a valid full-sized request. + ip_mreqn iface = {}; + iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + + // Send an optlen of 1 to check that optlen is enforced. 
+ EXPECT_THAT( + setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &iface, 1), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfDefault) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + in_addr get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + EXPECT_EQ(size, sizeof(get)); + EXPECT_EQ(get.s_addr, 0); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfDefaultReqn) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the + // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. + // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. + EXPECT_EQ(size, sizeof(in_addr)); + + // getsockopt(IP_MULTICAST_IF) will only return the interface address which + // hasn't been set. + EXPECT_EQ(get.imr_multiaddr.s_addr, 0); + EXPECT_EQ(get.imr_address.s_addr, 0); + EXPECT_EQ(get.imr_ifindex, 0); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetAddrGetReqn) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + in_addr set = {}; + set.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + ip_mreqn get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the + // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. + // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. 
+ EXPECT_EQ(size, sizeof(in_addr)); + EXPECT_EQ(get.imr_multiaddr.s_addr, set.s_addr); + EXPECT_EQ(get.imr_address.s_addr, 0); + EXPECT_EQ(get.imr_ifindex, 0); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetReqAddrGetReqn) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreq set = {}; + set.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + ip_mreqn get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the + // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr. + // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr. + EXPECT_EQ(size, sizeof(in_addr)); + EXPECT_EQ(get.imr_multiaddr.s_addr, set.imr_interface.s_addr); + EXPECT_EQ(get.imr_address.s_addr, 0); + EXPECT_EQ(get.imr_ifindex, 0); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetNicGetReqn) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn set = {}; + set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + ip_mreqn get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + EXPECT_EQ(size, sizeof(in_addr)); + EXPECT_EQ(get.imr_multiaddr.s_addr, 0); + EXPECT_EQ(get.imr_address.s_addr, 0); + EXPECT_EQ(get.imr_ifindex, 0); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + in_addr set = {}; + set.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, 
&set, + sizeof(set)), + SyscallSucceeds()); + + in_addr get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + EXPECT_EQ(size, sizeof(get)); + EXPECT_EQ(get.s_addr, set.s_addr); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetReqAddr) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreq set = {}; + set.imr_interface.s_addr = htonl(INADDR_LOOPBACK); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + in_addr get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + + EXPECT_EQ(size, sizeof(get)); + EXPECT_EQ(get.s_addr, set.imr_interface.s_addr); +} + +TEST_P(IPv4UDPUnboundSocketPairTest, IpMulticastIfSetNic) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + ip_mreqn set = {}; + set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo")); + ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &set, + sizeof(set)), + SyscallSucceeds()); + + in_addr get = {}; + socklen_t size = sizeof(get); + ASSERT_THAT( + getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size), + SyscallSucceeds()); + EXPECT_EQ(size, sizeof(get)); + EXPECT_EQ(get.s_addr, 0); +} + TEST_P(IPv4UDPUnboundSocketPairTest, TestJoinGroupNoIf) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); -- cgit v1.2.3 From bc9b979b9412ad5852872c1a9bee462f73d2455e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 11 Mar 2019 11:46:18 -0700 Subject: Add profiling commands to runsc Example: runsc debug --root= \ --profile-heap=/tmp/heap.prof \ --profile-cpu=/tmp/cpu.prod --profile-delay=30 \ PiperOrigin-RevId: 237848456 Change-Id: Icff3f20c1b157a84d0922599eaea327320dad773 --- pkg/seccomp/seccomp.go | 2 +- pkg/sentry/control/BUILD | 2 + 
pkg/sentry/control/pprof.go | 124 ++++++++++++++++++++++++++++++++++++++++++++ runsc/boot/config.go | 4 ++ runsc/boot/controller.go | 8 +++ runsc/boot/filter/config.go | 13 +++++ runsc/boot/filter/filter.go | 11 ++-- runsc/boot/loader.go | 7 +-- runsc/cmd/debug.go | 44 ++++++++++++++-- runsc/main.go | 2 + runsc/sandbox/sandbox.go | 55 ++++++++++++++++++++ 11 files changed, 262 insertions(+), 10 deletions(-) create mode 100644 pkg/sentry/control/pprof.go (limited to 'pkg/sentry') diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index ba2955752..e113f3574 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -55,7 +55,7 @@ func Install(rules SyscallRules) error { } // Uncomment to get stack trace when there is a violation. - // defaultAction = uint32(linux.SECCOMP_RET_TRAP) + // defaultAction = linux.BPFAction(linux.SECCOMP_RET_TRAP) log.Infof("Installing seccomp filters for %d syscalls (action=%v)", len(rules), defaultAction) diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index f54e01ee8..5052bcc0d 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -6,6 +6,7 @@ go_library( name = "control", srcs = [ "control.go", + "pprof.go", "proc.go", "state.go", ], @@ -15,6 +16,7 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/fd", "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/fs/host", diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go new file mode 100644 index 000000000..1af092af3 --- /dev/null +++ b/pkg/sentry/control/pprof.go @@ -0,0 +1,124 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "errors" + "runtime" + "runtime/pprof" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +var errNoOutput = errors.New("no output writer provided") + +// ProfileOpts contains options for the StartCPUProfile/Goroutine RPC call. +type ProfileOpts struct { + // File is the filesystem path for the profile. + File string `json:"path"` + + // FilePayload is the destination for the profiling output. + urpc.FilePayload +} + +// Profile includes profile-related RPC stubs. It provides a way to +// control the built-in pprof facility in sentry via sentryctl. +// +// The following options to sentryctl are added: +// +// - collect CPU profile on-demand. +// sentryctl -pid pprof-cpu-start +// sentryctl -pid pprof-cpu-stop +// +// - dump out the stack trace of current go routines. +// sentryctl -pid pprof-goroutine +type Profile struct { + // mu protects the fields below. + mu sync.Mutex + + // cpuFile is the current CPU profile output file. + cpuFile *fd.FD +} + +// StartCPUProfile is an RPC stub which starts recording the CPU profile in a +// file. +func (p *Profile) StartCPUProfile(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + + output, err := fd.NewFromFile(o.FilePayload.Files[0]) + if err != nil { + return err + } + + p.mu.Lock() + defer p.mu.Unlock() + + // Returns an error if profiling is already started. 
+ if err := pprof.StartCPUProfile(output); err != nil { + output.Close() + return err + } + + p.cpuFile = output + return nil +} + +// StopCPUProfile is an RPC stub which stops the CPU profiling and flush out the +// profile data. It takes no argument. +func (p *Profile) StopCPUProfile(_, _ *struct{}) error { + p.mu.Lock() + defer p.mu.Unlock() + + if p.cpuFile == nil { + return errors.New("CPU profiling not started") + } + + pprof.StopCPUProfile() + p.cpuFile.Close() + p.cpuFile = nil + return nil +} + +// HeapProfile generates a heap profile for the sentry. +func (p *Profile) HeapProfile(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + output := o.FilePayload.Files[0] + defer output.Close() + runtime.GC() // Get up-to-date statistics. + if err := pprof.WriteHeapProfile(output); err != nil { + return err + } + return nil +} + +// Goroutine is an RPC stub which dumps out the stack trace for all running +// goroutines. +func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + output := o.FilePayload.Files[0] + defer output.Close() + if err := pprof.Lookup("goroutine").WriteTo(output, 2); err != nil { + return err + } + return nil +} diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 400203c99..626fcabdd 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -202,6 +202,9 @@ type Config struct { // SIGUSR2(12) to troubleshoot hangs. -1 disables it. PanicSignal int + // ProfileEnable is set to prepare the sandbox to be profiled. + ProfileEnable bool + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. 
This can be @@ -228,6 +231,7 @@ func (c *Config) ToFlags() []string { "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), "--watchdog-action=" + c.WatchdogAction.String(), "--panic-signal=" + strconv.Itoa(c.PanicSignal), + "--profile=" + strconv.FormatBool(c.ProfileEnable), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 23d476f7f..a864be720 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -95,6 +95,11 @@ const ( // SandboxStacks collects sandbox stacks for debugging. SandboxStacks = "debug.Stacks" + + // Profiling related commands (see pprof.go for more details). + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" ) // ControlSocketAddr generates an abstract unix socket name for the given ID. @@ -135,6 +140,9 @@ func newController(fd int, l *Loader) (*controller, error) { } srv.Register(&debug{}) + if l.conf.ProfileEnable { + srv.Register(&control.Profile{}) + } return &controller{ srv: srv, diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index bde749861..1ba5b7257 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -470,3 +470,16 @@ func controlServerFilters(fd int) seccomp.SyscallRules { }, } } + +// profileFilters returns extra syscalls made by runtime/pprof package. +func profileFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_OPENAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + }, + }, + } +} diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index d69a6a2cc..fb197f9b1 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -29,9 +29,10 @@ import ( // Options are seccomp filter related options. 
type Options struct { - Platform platform.Platform - HostNetwork bool - ControllerFD int + Platform platform.Platform + HostNetwork bool + ProfileEnable bool + ControllerFD int } // Install installs seccomp filters for based on the given platform. @@ -47,6 +48,10 @@ func Install(opt Options) error { Report("host networking enabled: syscall filters less restrictive!") s.Merge(hostInetFilters()) } + if opt.ProfileEnable { + Report("profile enabled: syscall filters less restrictive!") + s.Merge(profileFilters()) + } switch p := opt.Platform.(type) { case *ptrace.PTrace: diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 42fe6f312..4c7e6abfc 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -445,9 +445,10 @@ func (l *Loader) run() error { filter.Report("syscall filter is DISABLED. Running in less secure mode.") } else { opts := filter.Options{ - Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ControllerFD: l.ctrl.srv.FD(), + Platform: l.k.Platform, + HostNetwork: l.conf.Network == NetworkHost, + ProfileEnable: l.conf.ProfileEnable, + ControllerFD: l.ctrl.srv.FD(), } if err := filter.Install(opts); err != nil { return fmt.Errorf("installing seccomp filters: %v", err) diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index e10326754..3ee9a9b49 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -16,7 +16,9 @@ package cmd import ( "context" + "os" "syscall" + "time" "flag" "github.com/google/subcommands" @@ -27,9 +29,12 @@ import ( // Debug implements subcommands.Command for the "debug" command. type Debug struct { - pid int - stacks bool - signal int + pid int + stacks bool + signal int + profileHeap string + profileCPU string + profileDelay int } // Name implements subcommands.Command. @@ -51,6 +56,9 @@ func (*Debug) Usage() string { func (d *Debug) SetFlags(f *flag.FlagSet) { f.IntVar(&d.pid, "pid", 0, "sandbox process ID. 
Container ID is not necessary if this is set") f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") + f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") + f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") + f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") } @@ -114,5 +122,35 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof(" *** Stack dump ***\n%s", stacks) } + if d.profileCPU != "" { + f, err := os.Create(d.profileCPU) + if err != nil { + Fatalf(err.Error()) + } + defer f.Close() + + if err := c.Sandbox.StartCPUProfile(f); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) + time.Sleep(time.Duration(d.profileDelay) * time.Second) + + if err := c.Sandbox.StopCPUProfile(); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile written to %q", d.profileCPU) + } + if d.profileHeap != "" { + f, err := os.Create(d.profileHeap) + if err != nil { + Fatalf(err.Error()) + } + defer f.Close() + + if err := c.Sandbox.HeapProfile(f); err != nil { + Fatalf(err.Error()) + } + log.Infof("Heap profile written to %q", d.profileHeap) + } return subcommands.ExitSuccess } diff --git a/runsc/main.go b/runsc/main.go index 4f89312b3..82c37ec11 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -63,6 +63,7 @@ var ( overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. 
-1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") ) @@ -146,6 +147,7 @@ func main() { StraceLogSize: *straceLogSize, WatchdogAction: wa, PanicSignal: *panicSignal, + ProfileEnable: *profile, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } if len(*straceSyscalls) != 0 { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ce8c21681..2698e3f86 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -825,6 +825,61 @@ func (s *Sandbox) Stacks() (string, error) { return stacks, nil } +// HeapProfile writes a heap profile to the given file. +func (s *Sandbox) HeapProfile(f *os.File) error { + log.Debugf("Heap profile %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.HeapProfile, &opts, nil); err != nil { + return fmt.Errorf("getting sandbox %q heap profile: %v", s.ID, err) + } + return nil +} + +// StartCPUProfile start CPU profile writing to the given file. +func (s *Sandbox) StartCPUProfile(f *os.File) error { + log.Debugf("CPU profile start %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.StartCPUProfile, &opts, nil); err != nil { + return fmt.Errorf("starting sandbox %q CPU profile: %v", s.ID, err) + } + return nil +} + +// StopCPUProfile stops a previously started CPU profile. 
+func (s *Sandbox) StopCPUProfile() error { + log.Debugf("CPU profile stop %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.StopCPUProfile, nil, nil); err != nil { + return fmt.Errorf("stopping sandbox %q CPU profile: %v", s.ID, err) + } + return nil +} + // DestroyContainer destroys the given container. If it is the root container, // then the entire sandbox is destroyed. func (s *Sandbox) DestroyContainer(cid string) error { -- cgit v1.2.3 From 6e6dbf0e566270ae96a4db81d9d04275d0fffb00 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 11 Mar 2019 18:18:41 -0700 Subject: kvm: minimum guest/host timekeeping delta. PiperOrigin-RevId: 237927368 Change-Id: I359badd1967bb118fe74eab3282c946c18937edc --- pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 47 +++++++++++++++++++------ 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 8ebd4ab71..69ba67ced 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -87,23 +87,50 @@ func (c *vCPU) setCPUID() error { // setSystemTime sets the TSC for the vCPU. // -// FIXME: This introduces a slight TSC offset between host and -// guest, which may vary per vCPU. +// This has to make the call many times in order to minimize the intrinstic +// error in the offset. Unfortunately KVM does not expose a relative offset via +// the API, so this is an approximation. We do this via an iterative algorithm. +// This has the advantage that it can generally deal with highly variable +// system call times and should converge on the correct offset. 
func (c *vCPU) setSystemTime() error { - const _MSR_IA32_TSC = 0x00000010 + const ( + _MSR_IA32_TSC = 0x00000010 + calibrateTries = 10 + ) registers := modelControlRegisters{ nmsrs: 1, } registers.entries[0] = modelControlRegister{ index: _MSR_IA32_TSC, - data: uint64(time.Rdtsc()), } - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_MSRS, - uintptr(unsafe.Pointer(®isters))); errno != 0 { - return fmt.Errorf("error setting system time: %v", errno) + target := uint64(^uint32(0)) + for done := 0; done < calibrateTries; { + start := uint64(time.Rdtsc()) + registers.entries[0].data = start + target + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_MSRS, + uintptr(unsafe.Pointer(®isters))); errno != 0 { + return fmt.Errorf("error setting system time: %v", errno) + } + // See if this is our new minimum call time. Note that this + // serves two functions: one, we make sure that we are + // accurately predicting the offset we need to set. Second, we + // don't want to do the final set on a slow call, which could + // produce a really bad result. So we only count attempts + // within +/- 6.25% of our minimum as an attempt. + end := uint64(time.Rdtsc()) + if end < start { + continue // Totally bogus. + } + half := (end - start) / 2 + if half < target { + target = half + } + if (half - target) < target/8 { + done++ + } } return nil } -- cgit v1.2.3 From 8930e79ebf72a0cc69e9b81af37bcbb57b115543 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 12 Mar 2019 10:28:23 -0700 Subject: Clarify the platform.File interface. - Redefine some memmap.Mappable, platform.File, and platform.Memory semantics in terms of File reference counts (no functional change). - Make AddressSpace.MapFile take a platform.File instead of a raw FD, and replace platform.File.MapInto with platform.File.FD. 
This allows kvm.AddressSpace.MapFile to always use platform.File.MapInternal instead of maintaining its own (redundant) cache of file mappings in the sentry address space. PiperOrigin-RevId: 238044504 Change-Id: Ib73a11e4275c0da0126d0194aa6c6017a9cef64f --- pkg/sentry/fs/fsutil/README.md | 17 ++- pkg/sentry/fs/fsutil/host_mappable.go | 10 +- pkg/sentry/fs/fsutil/inode_cached.go | 28 ++--- pkg/sentry/fs/tmpfs/inode_file.go | 16 +-- pkg/sentry/memmap/memmap.go | 14 +-- pkg/sentry/mm/address_space.go | 2 +- pkg/sentry/mm/mm.go | 3 +- pkg/sentry/platform/filemem/filemem.go | 65 +++++------ pkg/sentry/platform/kvm/BUILD | 24 ---- pkg/sentry/platform/kvm/address_space.go | 104 +++++------------ pkg/sentry/platform/kvm/host_map.go | 184 ------------------------------- pkg/sentry/platform/platform.go | 95 +++++++--------- pkg/sentry/platform/ptrace/subprocess.go | 4 +- 13 files changed, 139 insertions(+), 427 deletions(-) delete mode 100644 pkg/sentry/platform/kvm/host_map.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/README.md b/pkg/sentry/fs/fsutil/README.md index d3780e9fa..6e677890c 100644 --- a/pkg/sentry/fs/fsutil/README.md +++ b/pkg/sentry/fs/fsutil/README.md @@ -108,9 +108,9 @@ The host then sends a `SIGSEGV` to the sentry because the address range [`A`, `A`+8) is not mapped on the host. The `SIGSEGV` indicates that the memory was accessed writable. The sentry looks up the vma associated with [`A`, `A`+8), finds the file that was mapped and its `CachingInodeOperations`. It then calls -`CachingInodeOperations.MapInto` which allocates memory to back [`A`, `A`+8). It -may choose to allocate more memory (i.e. do "readahead") to minimize subsequent -faults. +`CachingInodeOperations.Translate` which allocates memory to back [`A`, `A`+8). +It may choose to allocate more memory (i.e. do "readahead") to minimize +subsequent faults. Memory that is allocated comes from a host tmpfs file (see `filemem.FileMem`). 
The host tmpfs file memory is brought up to date with the contents of the mapped @@ -138,12 +138,11 @@ memcpy(A, buffer, 4); ``` Since the first process has already mapped and accessed the same region of the -file writable, `CachingInodeOperations.MapInto` is called but re-maps the memory -that has already been allocated (because the host mapping can be invalidated at -any time) rather than allocating new memory. The address range [`A`, `A`+0x1000) -reflects the same cached view of the file as the first process sees. For -example, reading 8 bytes from the file from either process via read(2) starting -at offset 0 returns a consistent "bbbbaaaa". +file writable, `CachingInodeOperations.Translate` is called but returns the +memory that has already been allocated rather than allocating new memory. The +address range [`A`, `A`+0x1000) reflects the same cached view of the file as the +first process sees. For example, reading 8 bytes from the file from either +process via read(2) starting at offset 0 returns a consistent "bbbbaaaa". When this process no longer needs the shared memory, it may do: diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 340f8d288..1bb5c6b6e 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -106,16 +106,16 @@ func (h *HostMappable) InvalidateUnsavable(ctx context.Context) error { return nil } -// MapInto implements platform.File.MapInto. -func (h *HostMappable) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { - return as.MapFile(addr, h.backingFile.FD(), fr, at, precommit) -} - // MapInternal implements platform.File.MapInternal. func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write) } +// FD implements platform.File.FD. 
+func (h *HostMappable) FD() int { + return h.backingFile.FD() +} + // IncRef implements platform.File.IncRef. func (h *HostMappable) IncRef(fr platform.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index e3b52e943..ef11676b8 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -835,20 +835,6 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error return nil } -// MapInto implements platform.File.MapInto. This is used when we directly map -// an underlying host fd and CachingInodeOperations is used as the platform.File -// during translation. -func (c *CachingInodeOperations) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { - return as.MapFile(addr, c.backingFile.FD(), fr, at, precommit) -} - -// MapInternal implements platform.File.MapInternal. This is used when we -// directly map an underlying host fd and CachingInodeOperations is used as the -// platform.File during translation. -func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { - return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) -} - // IncRef implements platform.File.IncRef. This is used when we directly map an // underlying host fd and CachingInodeOperations is used as the platform.File // during translation. @@ -900,3 +886,17 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { c.dataMu.Unlock() } + +// MapInternal implements platform.File.MapInternal. This is used when we +// directly map an underlying host fd and CachingInodeOperations is used as the +// platform.File during translation. 
+func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) +} + +// FD implements platform.File.FD. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) FD() int { + return c.backingFile.FD() +} diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 5648ff8f4..13d06684d 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -39,20 +39,8 @@ var ( ) // fileInodeOperations implements fs.InodeOperations for a regular tmpfs file. -// These files are backed by FrameRegions allocated from a platform.Memory, -// and may be directly mapped. -// -// The tmpfs file memory is backed by FrameRegions, each of which is reference -// counted. frames maintains a single reference on each of the FrameRegions. -// Since these contain the contents of the file, the reference may only be -// decremented once this file is both deleted and all handles to the file have -// been closed. -// -// Mappable users may also call IncRefOn/DecRefOn, generally to indicate that -// they plan to use MapInto to map the file into an AddressSpace. These calls -// include an InvalidatorRegion associated with that reference. When the -// referenced portion of the file is removed (with Truncate), the associated -// InvalidatorRegion is invalidated. +// These files are backed by pages allocated from a platform.Memory, and may be +// directly mapped. 
// // +stateify savable type fileInodeOperations struct { diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index cf20b11e3..70cdf428b 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -70,11 +70,13 @@ type Mappable interface { // of offsets specified by required, and at most the range of offsets // specified by optional. at is the set of access types that may be // performed using the returned Translations. If not all required offsets - // are translated, it returns a non-nil error explaining why. Returned - // translations, and any mappings returned by platform.File.MapInternal for - // translated platform.Files, are valid until invalidated by a call back to + // are translated, it returns a non-nil error explaining why. + // + // Translations are valid until invalidated by a callback to // MappingSpace.Invalidate or until the caller removes its mapping of the - // translated range. + // translated range. Mappable implementations must ensure that at least one + // reference is held on all pages in a platform.File that may be the result + // of a valid Translation. // // Preconditions: required.Length() > 0. optional.IsSupersetOf(required). // required and optional must be page-aligned. The caller must have @@ -98,9 +100,7 @@ type Translation struct { // Source is the translated range in the Mappable. Source MappableRange - // File is the mapped file. When the Translation is invalidated, pages - // mapped by File.MapInto must be unmapped, and pages mapped by - // File.MapInternal become invalid. + // File is the mapped file. File platform.File // Offset is the offset into File at which this Translation begins. 
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index e7aa24c69..90cfef746 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -183,7 +183,7 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if pma.needCOW { perms.Write = false } - if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { + if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } pseg = pseg.NextSegment() diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index a69b8c7be..e2c636f38 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -24,7 +24,8 @@ // mm.MemoryManager.activeMu // Locks taken by memmap.Mappable.Translate // mm.privateRefs.mu -// platform.File locks +// platform.AddressSpace locks +// platform.File locks // mm.aioManager.mu // mm.AIOContext.mu // diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go index 97da31e70..f41c70ba5 100644 --- a/pkg/sentry/platform/filemem/filemem.go +++ b/pkg/sentry/platform/filemem/filemem.go @@ -32,7 +32,6 @@ import ( "time" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" @@ -504,39 +503,6 @@ func (f *FileMem) markReclaimed(fr platform.FileRange) { } } -// MapInto implements platform.File.MapInto. 
-func (f *FileMem) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { - if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - return as.MapFile(addr, int(f.file.Fd()), fr, at, precommit) -} - -// MapInternal implements platform.File.MapInternal. -func (f *FileMem) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { - if !fr.WellFormed() || fr.Length() == 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - if at.Execute { - return safemem.BlockSeq{}, syserror.EACCES - } - - chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) - if chunks == 1 { - // Avoid an unnecessary slice allocation. - var seq safemem.BlockSeq - err := f.forEachMappingSlice(fr, func(bs []byte) { - seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) - }) - return seq, err - } - blocks := make([]safemem.Block, 0, chunks) - err := f.forEachMappingSlice(fr, func(bs []byte) { - blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) - }) - return safemem.BlockSeqFromSlice(blocks), err -} - // IncRef implements platform.File.IncRef. func (f *FileMem) IncRef(fr platform.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { @@ -596,9 +562,29 @@ func (f *FileMem) DecRef(fr platform.FileRange) { } } -// Flush implements platform.Mappable.Flush. -func (f *FileMem) Flush(ctx context.Context) error { - return nil +// MapInternal implements platform.File.MapInternal. 
+func (f *FileMem) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + if !fr.WellFormed() || fr.Length() == 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + if at.Execute { + return safemem.BlockSeq{}, syserror.EACCES + } + + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + if chunks == 1 { + // Avoid an unnecessary slice allocation. + var seq safemem.BlockSeq + err := f.forEachMappingSlice(fr, func(bs []byte) { + seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingSlice(fr, func(bs []byte) { + blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) + }) + return safemem.BlockSeqFromSlice(blocks), err } // forEachMappingSlice invokes fn on a sequence of byte slices that @@ -653,6 +639,11 @@ func (f *FileMem) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { return mappings, m, nil } +// FD implements platform.File.FD. +func (f *FileMem) FD() int { + return int(f.file.Fd()) +} + // UpdateUsage implements platform.Memory.UpdateUsage. 
func (f *FileMem) UpdateUsage() error { f.mu.Lock() diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 6e40b3177..b7bf88249 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -2,28 +2,6 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template_instance") - -go_template_instance( - name = "host_map_set", - out = "host_map_set.go", - consts = { - "minDegree": "15", - }, - imports = { - "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", - }, - package = "kvm", - prefix = "hostMap", - template = "//pkg/segment:generic_set", - types = { - "Key": "usermem.Addr", - "Range": "usermem.AddrRange", - "Value": "uintptr", - "Functions": "hostMapSetFunctions", - }, -) - go_library( name = "kvm", srcs = [ @@ -36,8 +14,6 @@ go_library( "bluepill_fault.go", "bluepill_unsafe.go", "context.go", - "host_map.go", - "host_map_set.go", "kvm.go", "kvm_amd64.go", "kvm_amd64_unsafe.go", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 72e897a9a..6d8d8e65b 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -15,7 +15,6 @@ package kvm import ( - "reflect" "sync" "sync/atomic" @@ -88,11 +87,6 @@ type addressSpace struct { // dirtySet is the set of dirty vCPUs. dirtySet *dirtySet - - // files contains files mapped in the host address space. - // - // See host_map.go for more information. - files hostMap } // invalidate is the implementation for Invalidate. 
@@ -118,6 +112,11 @@ func (as *addressSpace) Touch(c *vCPU) bool { return as.dirtySet.mark(c) } +type hostMapEntry struct { + addr uintptr + length uintptr +} + func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { for m.length > 0 { physical, length, ok := translateToPhysical(m.addr) @@ -158,100 +157,57 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac return inv } -func (as *addressSpace) mapHostFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType) error { - // Create custom host mappings. - ms, err := as.files.CreateMappings(usermem.AddrRange{ - Start: addr, - End: addr + usermem.Addr(fr.End-fr.Start), - }, at, fd, fr.Start) - if err != nil { - return err - } - - inv := false - for _, m := range ms { - // The host mapped slices are guaranteed to be aligned. - prev := as.mapHost(addr, m, at) - inv = inv || prev - addr += usermem.Addr(m.length) - } - if inv { - as.invalidate() - } - - return nil -} +// MapFile implements platform.AddressSpace.MapFile. +func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + as.mu.Lock() + defer as.mu.Unlock() -func (as *addressSpace) mapFilemem(addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { - // TODO: Lock order at the platform level is not sufficiently - // well-defined to guarantee that the caller (FileMem.MapInto) is not - // holding any locks that FileMem.MapInternal may take. - - // Retrieve mappings for the underlying filemem. Note that the - // permissions here are largely irrelevant, since it corresponds to - // physical memory for the guest. We enforce the given access type - // below, in the guest page tables. 
- bs, err := as.filemem.MapInternal(fr, usermem.AccessType{ - Read: true, - Write: true, + // Get mappings in the sentry's address space, which are guaranteed to be + // valid as long as a reference is held on the mapped pages (which is in + // turn required by AddressSpace.MapFile precondition). + // + // If precommit is true, we will touch mappings to commit them, so ensure + // that mappings are readable from sentry context. + // + // We don't execute from application file-mapped memory, and guest page + // tables don't care if we have execute permission (but they do need pages + // to be readable). + bs, err := f.MapInternal(fr, usermem.AccessType{ + Read: at.Read || at.Execute || precommit, + Write: at.Write, }) if err != nil { return err } - // Save the original range for invalidation. - orig := usermem.AddrRange{ - Start: addr, - End: addr + usermem.Addr(fr.End-fr.Start), - } - + // Map the mappings in the sentry's address space (guest physical memory) + // into the application's address space (guest virtual memory). inv := false for !bs.IsEmpty() { b := bs.Head() bs = bs.Tail() // Since fr was page-aligned, b should also be page-aligned. We do the // lookup in our host page tables for this translation. - s := b.ToSlice() if precommit { + s := b.ToSlice() for i := 0; i < len(s); i += usermem.PageSize { _ = s[i] // Touch to commit. } } prev := as.mapHost(addr, hostMapEntry{ - addr: reflect.ValueOf(&s[0]).Pointer(), - length: uintptr(len(s)), + addr: b.Addr(), + length: uintptr(b.Len()), }, at) inv = inv || prev - addr += usermem.Addr(len(s)) + addr += usermem.Addr(b.Len()) } if inv { as.invalidate() - as.files.DeleteMapping(orig) } return nil } -// MapFile implements platform.AddressSpace.MapFile. -func (as *addressSpace) MapFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType, precommit bool) error { - as.mu.Lock() - defer as.mu.Unlock() - - // Create an appropriate mapping. 
If this is filemem, we don't create - // custom mappings for each in-application mapping. For files however, - // we create distinct mappings for each address space. Unfortunately, - // there's not a better way to manage this here. The file underlying - // this fd can change at any time, so we can't actually index the file - // and share between address space. Oh well. It's all referring to the - // same physical pages, hopefully we don't run out of address space. - if fd != int(as.filemem.File().Fd()) { - // N.B. precommit is ignored for host files. - return as.mapHostFile(addr, fd, fr, at) - } - - return as.mapFilemem(addr, fr, at, precommit) -} - // Unmap unmaps the given range by calling pagetables.PageTables.Unmap. func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { as.mu.Lock() @@ -264,10 +220,6 @@ func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { }) if prev { as.invalidate() - as.files.DeleteMapping(usermem.AddrRange{ - Start: addr, - End: addr + usermem.Addr(length), - }) // Recycle any freed intermediate pages. as.pageTables.Allocator.Recycle() diff --git a/pkg/sentry/platform/kvm/host_map.go b/pkg/sentry/platform/kvm/host_map.go deleted file mode 100644 index ee6a1a42d..000000000 --- a/pkg/sentry/platform/kvm/host_map.go +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package kvm - -import ( - "fmt" - "sync" - "syscall" - - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -type hostMap struct { - // mu protects below. - mu sync.RWMutex - - // set contains host mappings. - set hostMapSet -} - -type hostMapEntry struct { - addr uintptr - length uintptr -} - -// forEach iterates over all mappings in the given range. -// -// Precondition: segFn and gapFn must be non-nil. -func (hm *hostMap) forEach( - r usermem.AddrRange, - segFn func(offset uint64, m hostMapEntry), - gapFn func(offset uint64, length uintptr) (uintptr, bool)) { - - seg, gap := hm.set.Find(r.Start) - for { - if seg.Ok() && seg.Start() < r.End { - // A valid segment: pass information. - overlap := seg.Range().Intersect(r) - segOffset := uintptr(overlap.Start - seg.Start()) - mapOffset := uint64(overlap.Start - r.Start) - segFn(mapOffset, hostMapEntry{ - addr: seg.Value() + segOffset, - length: uintptr(overlap.Length()), - }) - seg, gap = seg.NextNonEmpty() - } else if gap.Ok() && gap.Start() < r.End { - // A gap: pass gap information. - overlap := gap.Range().Intersect(r) - mapOffset := uint64(overlap.Start - r.Start) - addr, ok := gapFn(mapOffset, uintptr(overlap.Length())) - if ok { - seg = hm.set.Insert(gap, overlap, addr) - seg, gap = seg.NextNonEmpty() - } else { - seg = gap.NextSegment() - gap = hostMapGapIterator{} // Invalid. - } - } else { - // Terminal. - break - } - } -} - -func (hm *hostMap) createMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) { - hm.forEach(r, func(mapOffset uint64, m hostMapEntry) { - // Replace any existing mappings. - _, _, errno := syscall.RawSyscall6( - syscall.SYS_MMAP, - m.addr, - m.length, - uintptr(at.Prot()), - syscall.MAP_FIXED|syscall.MAP_SHARED, - uintptr(fd), - uintptr(offset+mapOffset)) - if errno != 0 && err == nil { - err = errno - } - }, func(mapOffset uint64, length uintptr) (uintptr, bool) { - // Create a new mapping. 
- addr, _, errno := syscall.RawSyscall6( - syscall.SYS_MMAP, - 0, - length, - uintptr(at.Prot()), - syscall.MAP_SHARED, - uintptr(fd), - uintptr(offset+mapOffset)) - if errno != 0 { - err = errno - return 0, false - } - return addr, true - }) - if err != nil { - return nil, err - } - - // Collect all entries. - // - // We do this after the first iteration because some segments may have - // been merged in the above, and we'll return the simplest form. This - // also provides a basic sanity check in the form of no gaps. - hm.forEach(r, func(_ uint64, m hostMapEntry) { - ms = append(ms, m) - }, func(uint64, uintptr) (uintptr, bool) { - // Should not happen: we just mapped this above. - panic("unexpected gap") - }) - - return ms, nil -} - -// CreateMappings creates a new set of host mapping entries. -func (hm *hostMap) CreateMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) { - hm.mu.Lock() - ms, err = hm.createMappings(r, at, fd, offset) - hm.mu.Unlock() - return -} - -func (hm *hostMap) deleteMapping(r usermem.AddrRange) { - // Remove all the existing mappings. - hm.forEach(r, func(_ uint64, m hostMapEntry) { - _, _, errno := syscall.RawSyscall( - syscall.SYS_MUNMAP, - m.addr, - m.length, - 0) - if errno != 0 { - // Should never happen. - panic(fmt.Sprintf("unmap error: %v", errno)) - } - }, func(uint64, uintptr) (uintptr, bool) { - // Sometimes deleteMapping will be called on a larger range - // than physical mappings are defined. That's okay. - return 0, false - }) - - // Knock the entire range out. - hm.set.RemoveRange(r) -} - -// DeleteMapping deletes the given range. -func (hm *hostMap) DeleteMapping(r usermem.AddrRange) { - hm.mu.Lock() - hm.deleteMapping(r) - hm.mu.Unlock() -} - -// hostMapSetFunctions is used in the implementation of mapSet. 
-type hostMapSetFunctions struct{} - -func (hostMapSetFunctions) MinKey() usermem.Addr { return 0 } -func (hostMapSetFunctions) MaxKey() usermem.Addr { return ^usermem.Addr(0) } -func (hostMapSetFunctions) ClearValue(val *uintptr) { *val = 0 } - -func (hostMapSetFunctions) Merge(r1 usermem.AddrRange, addr1 uintptr, r2 usermem.AddrRange, addr2 uintptr) (uintptr, bool) { - if addr1+uintptr(r1.Length()) != addr2 { - return 0, false - } - - // Since the two regions are contiguous in both the key space and the - // value space, we can just store a single segment with the first host - // virtual address; the logic above operates based on the size of the - // segments. - return addr1, true -} - -func (hostMapSetFunctions) Split(r usermem.AddrRange, hostAddr uintptr, split usermem.Addr) (uintptr, uintptr) { - return hostAddr, hostAddr + uintptr(split-r.Start) -} diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index a9e76bd45..b2ce851da 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -52,11 +52,11 @@ type Platform interface { DetectsCPUPreemption() bool // MapUnit returns the alignment used for optional mappings into this - // platform's AddressSpaces. Higher values indicate lower per-page - // costs for AddressSpace.MapInto. As a special case, a MapUnit of 0 - // indicates that the cost of AddressSpace.MapInto is effectively - // independent of the number of pages mapped. If MapUnit is non-zero, - // it must be a power-of-2 multiple of usermem.PageSize. + // platform's AddressSpaces. Higher values indicate lower per-page costs + // for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates + // that the cost of AddressSpace.MapFile is effectively independent of the + // number of pages mapped. If MapUnit is non-zero, it must be a power-of-2 + // multiple of usermem.PageSize. 
MapUnit() uint64 // MinUserAddress returns the minimum mappable address on this @@ -194,17 +194,17 @@ const SignalInterrupt = linux.SIGCHLD // AddressSpace represents a virtual address space in which a Context can // execute. type AddressSpace interface { - // MapFile creates a shared mapping of offsets in fr, from the file - // with file descriptor fd, at address addr. Any existing overlapping - // mappings are silently replaced. + // MapFile creates a shared mapping of offsets fr from f at address addr. + // Any existing overlapping mappings are silently replaced. // - // If precommit is true, host memory should be committed to the mapping - // when MapFile returns when possible. The precommit flag is advisory - // and implementations may choose to ignore it. + // If precommit is true, the platform should eagerly commit resources (e.g. + // physical memory) to the mapping. The precommit flag is advisory and + // implementations may choose to ignore it. // - // Preconditions: addr and fr must be page-aligned. length > 0. - // at.Any() == true. - MapFile(addr usermem.Addr, fd int, fr FileRange, at usermem.AccessType, precommit bool) error + // Preconditions: addr and fr must be page-aligned. fr.Length() > 0. + // at.Any() == true. At least one reference must be held on all pages in + // fr, and must continue to be held as long as pages are mapped. + MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error // Unmap unmaps the given range. // @@ -309,44 +309,39 @@ func (f SegmentationFault) Error() string { // File represents a host file that may be mapped into an AddressSpace. type File interface { - // MapInto maps fr into as, starting at addr, for accesses of type at. - // - // If precommit is true, the platform should eagerly commit resources (e.g. - // physical memory) to the mapping. The precommit flag is advisory and - // implementations may choose to ignore it. + // All pages in a File are reference-counted. 
+ + // IncRef increments the reference count on all pages in fr. // - // Note that there is no File.Unmap; clients should use as.Unmap directly. + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. At least one reference must be held on all pages in fr. (The File + // interface does not provide a way to acquire an initial reference; + // implementors may define mechanisms for doing so.) + IncRef(fr FileRange) + + // DecRef decrements the reference count on all pages in fr. // - // Preconditions: fr.Start and fr.End must be page-aligned. - // fr.Length() > 0. at.Any() == true. Implementors may define - // additional requirements. - MapInto(as AddressSpace, addr usermem.Addr, fr FileRange, at usermem.AccessType, precommit bool) error + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. At least one reference must be held on all pages in fr. + DecRef(fr FileRange) // MapInternal returns a mapping of the given file offsets in the invoking - // process' address space for reading and writing. The returned mapping is - // valid as long as a reference is held on the mapped range. + // process' address space for reading and writing. // // Note that fr.Start and fr.End need not be page-aligned. // - // Preconditions: fr.Length() > 0. Implementors may define additional - // requirements. - MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) - - // IncRef signals that a region in the file is actively referenced through a - // memory map. Implementors must ensure that the contents of a referenced - // region remain consistent. Specifically, mappings returned by MapInternal - // must refer to the same underlying contents. If the implementor also - // implements the Memory interface, the file range must not be reused in a - // different allocation while it has active references. + // Preconditions: fr.Length() > 0. At least one reference must be held on + // all pages in fr. 
// - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > 0. - IncRef(fr FileRange) + // Postconditions: The returned mapping is valid as long as at least one + // reference is held on the mapped pages. + MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) - // DecRef reduces the frame ref count on the range specified by fr. + // FD returns the file descriptor represented by the File. // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. DecRef()s on a region must match earlier IncRef()s. - DecRef(fr FileRange) + // The only permitted operation on the returned file descriptor is to map + // pages from it consistent with the requirements of AddressSpace.MapFile. + FD() int } // FileRange represents a range of uint64 offsets into a File. @@ -361,19 +356,13 @@ func (fr FileRange) String() string { // Memory represents an allocatable File that may be mapped into any // AddressSpace associated with the same Platform. type Memory interface { - // Memory implements File methods with the following properties: - // - // - Pages mapped by MapInto must be allocated, and must be unmapped from - // all AddressSpaces before they are freed. - // - // - Pages mapped by MapInternal must be allocated. Returned mappings are - // guaranteed to be valid until the mapped pages are freed. File - // Allocate returns a range of pages of the given length, owned by the - // caller and with the given accounting kind. Allocated memory initially has - // a single reference and will automatically be freed when no references to - // them remain. See File.IncRef and File.DecRef. + // Allocate returns a range of initially-zeroed pages of the given length + // with the given accounting kind and a single reference held by the + // caller. When the last reference on an allocated page is released, + // ownership of the page is returned to the Memory, allowing it to be + // returned by a future call to Allocate. 
// // Preconditions: length must be page-aligned and non-zero. Allocate(length uint64, kind usage.MemoryKind) (FileRange, error) diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index a9d083f5a..82f125073 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -563,7 +563,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp } // MapFile implements platform.AddressSpace.MapFile. -func (s *subprocess) MapFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { var flags int if precommit { flags |= syscall.MAP_POPULATE @@ -574,7 +574,7 @@ func (s *subprocess) MapFile(addr usermem.Addr, fd int, fr platform.FileRange, a arch.SyscallArgument{Value: uintptr(fr.Length())}, arch.SyscallArgument{Value: uintptr(at.Prot())}, arch.SyscallArgument{Value: uintptr(flags | syscall.MAP_SHARED | syscall.MAP_FIXED)}, - arch.SyscallArgument{Value: uintptr(fd)}, + arch.SyscallArgument{Value: uintptr(f.FD())}, arch.SyscallArgument{Value: uintptr(fr.Start)}) return err } -- cgit v1.2.3 From 2512cc561778b096459182b531eae4e0797e4ec5 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 13 Mar 2019 19:23:02 -0700 Subject: Allow filesystem.Mount to take an optional interface argument. 
PiperOrigin-RevId: 238360231 Change-Id: I5eaf8d26f8892f77d71c7fbd6c5225ef471cedf1 --- pkg/sentry/fs/copy_up_test.go | 4 ++-- pkg/sentry/fs/dev/fs.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/gofer/fs.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/host/fs.go | 2 +- pkg/sentry/fs/host/fs_test.go | 2 +- pkg/sentry/fs/mount_overlay.go | 2 +- pkg/sentry/fs/proc/fs.go | 2 +- pkg/sentry/fs/restore.go | 7 +++++-- pkg/sentry/fs/sys/fs.go | 2 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- runsc/boot/fs.go | 18 +++++++++--------- runsc/boot/loader_test.go | 28 ++++++++++++++-------------- 16 files changed, 42 insertions(+), 39 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 2b2f4bb8f..98a0b7638 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -98,7 +98,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { // Create a lower tmpfs mount. fsys, _ := fs.FindFilesystem("tmpfs") - lower, err := fsys.Mount(contexttest.Context(t), "", fs.MountSourceFlags{}, "") + lower, err := fsys.Mount(contexttest.Context(t), "", fs.MountSourceFlags{}, "", nil) if err != nil { t.Fatalf("failed to mount tmpfs: %v", err) } @@ -147,7 +147,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { } // Create an empty upper tmpfs mount which we will copy up into. - upper, err := fsys.Mount(ctx, "", fs.MountSourceFlags{}, "") + upper, err := fsys.Mount(ctx, "", fs.MountSourceFlags{}, "", nil) if err != nil { t.Fatalf("failed to mount tmpfs: %v", err) } diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index abfe689f0..cf4e7d00f 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -66,7 +66,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns a devtmpfs root that can be positioned in the vfs. 
-func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // devtmpfs backed by ramfs ignores bad options. See fs/ramfs/inode.c:ramfs_parse_options. // -> we should consider parsing the mode and backing devtmpfs by this. diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index ba8be85e4..aa664b973 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -50,7 +50,7 @@ type Filesystem interface { // data options. // // Mount may return arbitrary errors. They do not need syserr translations. - Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) + Mount(ctx context.Context, device string, flags MountSourceFlags, data string, dataObj interface{}) (*Inode, error) // AllowUserMount determines whether mount(2) is allowed to mount a // file system of this type. diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 2dc000c6f..adff0abac 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -120,7 +120,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns an attached 9p client that can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // Parse and validate the mount options. 
o, err := options(data) if err != nil { diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index d9fd7a221..0ad5d63b5 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -57,7 +57,7 @@ func (s *session) afterLoad() { } // Validate the mount flags and options. - opts, err := options(args.Data) + opts, err := options(args.DataString) if err != nil { panic("failed to parse mount options: " + err.Error()) } diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index d2ba38449..800649211 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -82,7 +82,7 @@ func (*Filesystem) Flags() fs.FilesystemFlags { // Mount returns an fs.Inode exposing the host file system. It is intended to be locked // down in PreExec below. -func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // Parse generic comma-separated key=value options. 
options := fs.GenericMountSourceOptions(data) diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index 44db61ecd..c83b29a16 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -321,7 +321,7 @@ func TestRootPath(t *testing.T) { hostFS := &Filesystem{} ctx := contexttest.Context(t) data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name()) - inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data) + inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data, nil) if err != nil { t.Fatalf("Mount failed: %v", err) } diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index fb91635bc..4c89673b5 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -122,6 +122,6 @@ func (*overlayFilesystem) AllowUserList() bool { } // Mount implements Filesystem.Mount. -func (ofs *overlayFilesystem) Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) { +func (ofs *overlayFilesystem) Mount(ctx context.Context, device string, flags MountSourceFlags, data string, _ interface{}) (*Inode, error) { panic("overlayFilesystem.Mount should not be called!") } diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 63f737ff4..666a2d054 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -57,7 +57,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns the root of a procfs that can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // Parse generic comma-separated key=value options, this file system expects them. 
diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go index da2df7e1d..a6645b41e 100644 --- a/pkg/sentry/fs/restore.go +++ b/pkg/sentry/fs/restore.go @@ -41,8 +41,11 @@ type MountArgs struct { // Flags corresponds to the flags argument of Mount. Flags MountSourceFlags - // Data corresponds to the data argument of Mount. - Data string + // DataString corresponds to the data argument of Mount. + DataString string + + // DataObj corresponds to the data interface argument of Mount. + DataObj interface{} } // restoreEnv holds the fs package global RestoreEnvironment. diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 301fef038..44ae43754 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -57,7 +57,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns a sysfs root which can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // sysfs ignores data, see fs/sysfs/mount.c:sysfs_mount. diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index d495430e9..d0c93028f 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -82,7 +82,7 @@ func (*Filesystem) Flags() fs.FilesystemFlags { } // Mount returns a tmpfs root that can be positioned in the vfs. -func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // Parse generic comma-separated key=value options, this file system expects them. 
diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 0c412eb21..43e0e2a04 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -59,7 +59,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // MountSource returns a devpts root that can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { // device is always ignored. // No options are supported. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index bf0df7302..6b8d75d24 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -101,7 +101,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall superFlags.ReadOnly = true } - rootInode, err := rsys.Mount(t, sourcePath, superFlags, data) + rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil) if err != nil { return 0, nil, syserror.EINVAL } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index ada292c9e..25e23c09b 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -181,7 +181,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f log.Infof("Mounting root over 9P, ioFD: %d", fd) p9FS := mustFindFilesystem("9p") opts := p9MountOptions(fd, conf.FileAccess) - rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ",")) + rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) if err != nil { return nil, fmt.Errorf("creating root mount point: %v", err) } @@ -220,7 +220,7 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, } // Create overlay on top of mount dir. 
- upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") + upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil) if err != nil { return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } @@ -309,7 +309,7 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, ro mf.ReadOnly = true } - inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ",")) + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) if err != nil { return fmt.Errorf("creating mount with source %q: %v", m.Source, err) } @@ -415,9 +415,9 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f } newMount := fs.MountArgs{ - Dev: mountDevice(m), - Flags: mountFlags(m.Options), - Data: strings.Join(opts, ","), + Dev: mountDevice(m), + Flags: mountFlags(m.Options), + DataString: strings.Join(opts, ","), } renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) log.Infof("Added mount at %q: %+v", fsName, newMount) @@ -441,9 +441,9 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) } rootMount := fs.MountArgs{ - Dev: rootDevice, - Flags: mf, - Data: strings.Join(opts, ","), + Dev: rootDevice, + Flags: mf, + DataString: strings.Join(opts, ","), } renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 4fcc0faea..01578cfc5 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -456,9 +456,9 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "9pfs-/", - Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -510,13 +510,13 @@ 
func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "9pfs-/", - Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, { - Dev: "9pfs-/dev/fd-foo", - Data: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating", + Dev: "9pfs-/dev/fd-foo", + DataString: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { @@ -568,16 +568,16 @@ func TestRestoreEnvironment(t *testing.T) { MountSources: map[string][]fs.MountArgs{ "9p": { { - Dev: "9pfs-/", - Flags: fs.MountSourceFlags{ReadOnly: true}, - Data: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", }, }, "tmpfs": { { - Dev: "none", - Flags: fs.MountSourceFlags{NoAtime: true}, - Data: "uid=1022", + Dev: "none", + Flags: fs.MountSourceFlags{NoAtime: true}, + DataString: "uid=1022", }, { Dev: "none", -- cgit v1.2.3 From fb9919881c7dc98eaf97cad2a70d187bd78f1566 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 14 Mar 2019 07:42:13 -0700 Subject: Use WalkGetAttr in gofer.inodeOperations.Create. p9.Twalk.handle() with a non-empty path also stats the walked-to path anyway, so the preceding GetAttr is completely wasted. 
PiperOrigin-RevId: 238440645 Change-Id: I7fbc7536f46b8157639d0d1f491e6aaa9ab688a3 --- pkg/sentry/fs/gofer/path.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index faedfb81c..43f990d16 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -18,6 +18,7 @@ import ( "fmt" "syscall" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/device" @@ -101,20 +102,20 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string i.touchModificationTime(ctx, dir) - // Get the attributes of the file. - qid, mask, p9attr, err := getattr(ctx, newFile) + // Get an unopened p9.File for the file we created so that it can be cloned + // and re-opened multiple times after creation, while also getting its + // attributes. Both are required for inodeOperations. + qids, unopened, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { newFile.close(ctx) return nil, err } - - // Get an unopened p9.File for the file we created so that it can be - // cloned and re-opened multiple times after creation. - _, unopened, err := i.fileState.file.walk(ctx, []string{name}) - if err != nil { + if len(qids) != 1 { + log.Warningf("WalkGetAttr(%s) succeeded, but returned %d QIDs (%v), wanted 1", name, len(qids), qids) newFile.close(ctx) - return nil, err + return nil, syserror.EIO } + qid := qids[0] // Construct the InodeOperations. sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr, false) -- cgit v1.2.3 From 8f4634997bd97810a85a70b71f000378d9db2e55 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 14 Mar 2019 08:11:36 -0700 Subject: Decouple filemem from platform and move it to pgalloc.MemoryFile. 
This is in preparation for improved page cache reclaim, which requires greater integration between the page cache and page allocator. PiperOrigin-RevId: 238444706 Change-Id: Id24141b3678d96c7d7dc24baddd9be555bffafe4 --- pkg/sentry/context/contexttest/BUILD | 2 + pkg/sentry/context/contexttest/contexttest.go | 25 + pkg/sentry/fs/ashmem/BUILD | 1 - pkg/sentry/fs/binder/BUILD | 1 + pkg/sentry/fs/binder/binder.go | 21 +- pkg/sentry/fs/dev/BUILD | 2 +- pkg/sentry/fs/dev/null.go | 4 +- pkg/sentry/fs/fsutil/BUILD | 1 + pkg/sentry/fs/fsutil/README.md | 11 +- pkg/sentry/fs/fsutil/file_range_set.go | 23 +- pkg/sentry/fs/fsutil/inode_cached.go | 42 +- pkg/sentry/fs/proc/meminfo.go | 6 +- pkg/sentry/fs/tmpfs/inode_file.go | 24 +- pkg/sentry/fs/tmpfs/tmpfs.go | 2 +- pkg/sentry/kernel/BUILD | 3 +- pkg/sentry/kernel/contexttest/BUILD | 1 + pkg/sentry/kernel/contexttest/contexttest.go | 2 + pkg/sentry/kernel/kernel.go | 57 +- pkg/sentry/kernel/memevent/memory_events.go | 2 +- pkg/sentry/kernel/shm/BUILD | 1 + pkg/sentry/kernel/shm/shm.go | 19 +- pkg/sentry/kernel/task.go | 5 + pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/timekeeper.go | 5 +- pkg/sentry/kernel/timekeeper_test.go | 8 +- pkg/sentry/kernel/vdso.go | 17 +- pkg/sentry/loader/BUILD | 2 +- pkg/sentry/loader/vdso.go | 21 +- pkg/sentry/memutil/memutil_unsafe.go | 14 +- pkg/sentry/mm/BUILD | 2 + pkg/sentry/mm/README.md | 4 +- pkg/sentry/mm/aio_context.go | 17 +- pkg/sentry/mm/lifecycle.go | 5 +- pkg/sentry/mm/mm.go | 20 +- pkg/sentry/mm/mm_test.go | 4 +- pkg/sentry/mm/pma.go | 20 +- pkg/sentry/mm/save_restore.go | 10 +- pkg/sentry/mm/special_mappable.go | 36 +- pkg/sentry/mm/syscalls.go | 8 +- pkg/sentry/pgalloc/BUILD | 57 ++ pkg/sentry/pgalloc/context.go | 48 ++ pkg/sentry/pgalloc/pgalloc.go | 922 ++++++++++++++++++++++++++ pkg/sentry/pgalloc/pgalloc_test.go | 168 +++++ pkg/sentry/pgalloc/pgalloc_unsafe.go | 40 ++ pkg/sentry/pgalloc/save_restore.go | 205 ++++++ pkg/sentry/platform/filemem/BUILD | 56 -- 
pkg/sentry/platform/filemem/filemem.go | 879 ------------------------ pkg/sentry/platform/filemem/filemem_state.go | 194 ------ pkg/sentry/platform/filemem/filemem_test.go | 168 ----- pkg/sentry/platform/filemem/filemem_unsafe.go | 40 -- pkg/sentry/platform/kvm/BUILD | 1 - pkg/sentry/platform/kvm/address_space.go | 4 - pkg/sentry/platform/kvm/kvm.go | 17 - pkg/sentry/platform/kvm/kvm_test.go | 1 - pkg/sentry/platform/platform.go | 90 +-- pkg/sentry/platform/ptrace/BUILD | 1 - pkg/sentry/platform/ptrace/ptrace.go | 14 +- pkg/sentry/state/BUILD | 1 - pkg/sentry/state/state.go | 5 +- pkg/sentry/syscalls/linux/sys_sysinfo.go | 6 +- pkg/sentry/usage/memory.go | 3 - runsc/boot/BUILD | 2 + runsc/boot/controller.go | 9 +- runsc/boot/events.go | 2 +- runsc/boot/loader.go | 26 +- 65 files changed, 1743 insertions(+), 1666 deletions(-) create mode 100644 pkg/sentry/pgalloc/BUILD create mode 100644 pkg/sentry/pgalloc/context.go create mode 100644 pkg/sentry/pgalloc/pgalloc.go create mode 100644 pkg/sentry/pgalloc/pgalloc_test.go create mode 100644 pkg/sentry/pgalloc/pgalloc_unsafe.go create mode 100644 pkg/sentry/pgalloc/save_restore.go delete mode 100644 pkg/sentry/platform/filemem/BUILD delete mode 100644 pkg/sentry/platform/filemem/filemem.go delete mode 100644 pkg/sentry/platform/filemem/filemem_state.go delete mode 100644 pkg/sentry/platform/filemem/filemem_test.go delete mode 100644 pkg/sentry/platform/filemem/filemem_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index bed156b70..ce4f1e42c 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -13,6 +13,8 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", + "//pkg/sentry/memutil", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/ptrace", "//pkg/sentry/uniqueid", diff --git a/pkg/sentry/context/contexttest/contexttest.go 
b/pkg/sentry/context/contexttest/contexttest.go index d5fd9f165..a29087775 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -16,6 +16,7 @@ package contexttest import ( + "os" "sync/atomic" "testing" "time" @@ -24,6 +25,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" @@ -35,6 +38,17 @@ import ( // Note that some filesystems may require a minimal kernel for testing, which // this test context does not provide. For such tests, see kernel/contexttest. func Context(tb testing.TB) context.Context { + const memfileName = "contexttest-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + tb.Fatalf("error creating application memory file: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile) + if err != nil { + memfile.Close() + tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) + } p, err := ptrace.New() if err != nil { tb.Fatal(err) @@ -43,6 +57,7 @@ func Context(tb testing.TB) context.Context { return &TestContext{ Context: context.Background(), l: limits.NewLimitSet(), + mf: mf, platform: p, otherValues: make(map[interface{}]interface{}), } @@ -53,6 +68,7 @@ func Context(tb testing.TB) context.Context { type TestContext struct { context.Context l *limits.LimitSet + mf *pgalloc.MemoryFile platform platform.Platform otherValues map[interface{}]interface{} } @@ -94,6 +110,10 @@ func (t *TestContext) Value(key interface{}) interface{} { switch key { case limits.CtxLimits: return t.l + case pgalloc.CtxMemoryFile: + return 
t.mf + case pgalloc.CtxMemoryFileProvider: + return t case platform.CtxPlatform: return t.platform case uniqueid.CtxGlobalUniqueID: @@ -112,6 +132,11 @@ func (t *TestContext) Value(key interface{}) interface{} { } } +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (t *TestContext) MemoryFile() *pgalloc.MemoryFile { + return t.mf +} + // RootContext returns a Context that may be used in tests that need root // credentials. Uses ptrace as the platform.Platform. func RootContext(tb testing.TB) context.Context { diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD index dcf620dca..ef1c31a3e 100644 --- a/pkg/sentry/fs/ashmem/BUILD +++ b/pkg/sentry/fs/ashmem/BUILD @@ -23,7 +23,6 @@ go_library( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD index 8a448175f..3710664d3 100644 --- a/pkg/sentry/fs/binder/BUILD +++ b/pkg/sentry/fs/binder/BUILD @@ -17,6 +17,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 19cd55e65..16fb4806f 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -74,9 +75,9 @@ func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) * // ioctl. 
func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { return fs.NewFile(ctx, d, flags, &Proc{ - bd: bd, - task: kernel.TaskFromContext(ctx), - platform: platform.FromContext(ctx), + bd: bd, + task: kernel.TaskFromContext(ctx), + mfp: pgalloc.MemoryFileProviderFromContext(ctx), }), nil } @@ -88,14 +89,14 @@ type Proc struct { fsutil.FileNoFsync `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - bd *Device - task *kernel.Task - platform platform.Platform + bd *Device + task *kernel.Task + mfp pgalloc.MemoryFileProvider // mu protects fr. mu sync.Mutex `state:"nosave"` - // mapped is memory allocated from platform.Memory() by AddMapping. + // mapped is memory allocated from mfp.MemoryFile() by AddMapping. mapped platform.FileRange } @@ -104,7 +105,7 @@ func (bp *Proc) Release() { bp.mu.Lock() defer bp.mu.Unlock() if bp.mapped.Length() != 0 { - bp.platform.Memory().DecRef(bp.mapped) + bp.mfp.MemoryFile().DecRef(bp.mapped) } } @@ -204,7 +205,7 @@ func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar userm } // Binder only allocates and maps a single page up-front // (drivers/android/binder.c:binder_mmap() => binder_update_page_range()). 
- fr, err := bp.platform.Memory().Allocate(usermem.PageSize, usage.Anonymous) + fr, err := bp.mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) if err != nil { return err } @@ -241,7 +242,7 @@ func (bp *Proc) Translate(ctx context.Context, required, optional memmap.Mappabl return []memmap.Translation{ { Source: memmap.MappableRange{0, usermem.PageSize}, - File: bp.platform.Memory(), + File: bp.mfp.MemoryFile(), Offset: bp.mapped.Start, }, }, err diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index e5b962c8c..6c4fdaba9 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -27,7 +27,7 @@ go_library( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/memmap", "//pkg/sentry/mm", - "//pkg/sentry/platform", + "//pkg/sentry/pgalloc", "//pkg/sentry/safemem", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 73fd09058..83f43c203 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -21,7 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -115,7 +115,7 @@ var _ fs.FileOperations = (*zeroFileOperations)(nil) // ConfigureMMap implements fs.FileOperations.ConfigureMMap. 
func (*zeroFileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { - m, err := mm.NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) + m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) if err != nil { return err } diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index d41fc17cc..01098675d 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -85,6 +85,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fs/fsutil/README.md b/pkg/sentry/fs/fsutil/README.md index 6e677890c..8be367334 100644 --- a/pkg/sentry/fs/fsutil/README.md +++ b/pkg/sentry/fs/fsutil/README.md @@ -112,11 +112,12 @@ finds the file that was mapped and its `CachingInodeOperations`. It then calls It may choose to allocate more memory (i.e. do "readahead") to minimize subsequent faults. -Memory that is allocated comes from a host tmpfs file (see `filemem.FileMem`). -The host tmpfs file memory is brought up to date with the contents of the mapped -file on its filesystem. The region of the host tmpfs file that reflects the -mapped file is then mapped into the host address space of the application so -that subsequent memory accesses do not repeatedly generate a `SIGSEGV`. +Memory that is allocated comes from a host tmpfs file (see +`pgalloc.MemoryFile`). The host tmpfs file memory is brought up to date with the +contents of the mapped file on its filesystem. The region of the host tmpfs file +that reflects the mapped file is then mapped into the host address space of the +application so that subsequent memory accesses do not repeatedly generate a +`SIGSEGV`. The range that was allocated, including any extra memory allocation to minimize faults, is marked dirty due to the write fault. 
This overcounts dirty memory if diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index dd7ab4b4a..32ebf64ff 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -77,7 +78,7 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileR } // Fill attempts to ensure that all memmap.Mappable offsets in required are -// mapped to a platform.File offset, by allocating from mem with the given +// mapped to a platform.File offset, by allocating from mf with the given // memory usage kind and invoking readAt to store data into memory. (If readAt // returns a successful partial read, Fill will call it repeatedly until all // bytes have been read.) EOF is handled consistently with the requirements of @@ -90,7 +91,7 @@ func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileR // // Preconditions: required.Length() > 0. optional.IsSupersetOf(required). // required and optional must be page-aligned. 
-func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mem platform.Memory, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { +func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { gap := frs.LowerBoundGap(required.Start) for gap.Ok() && gap.Start() < required.End { if gap.Range().Length() == 0 { @@ -100,7 +101,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map gr := gap.Range().Intersect(optional) // Read data into the gap. - fr, err := platform.AllocateAndFill(mem, gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { n, err := readAt(ctx, dsts, gr.Start+done) @@ -108,7 +109,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map dsts = dsts.DropFirst64(n) if err != nil { if err == io.EOF { - // platform.AllocateAndFill truncates down to a page + // MemoryFile.AllocateAndFill truncates down to a page // boundary, but FileRangeSet.Fill is supposed to // zero-fill to the end of the page in this case. donepgaddr, ok := usermem.Addr(done).RoundUp() @@ -143,20 +144,20 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map // corresponding platform.FileRanges. // // Preconditions: mr must be page-aligned. 
-func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mem platform.Memory) { +func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { seg := frs.LowerBoundSegment(mr.Start) for seg.Ok() && seg.Start() < mr.End { seg = frs.Isolate(seg, mr) - mem.DecRef(seg.FileRange()) + mf.DecRef(seg.FileRange()) seg = frs.Remove(seg).NextSegment() } } // DropAll removes all segments in mr, freeing the corresponding // platform.FileRanges. -func (frs *FileRangeSet) DropAll(mem platform.Memory) { +func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) { for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - mem.DecRef(seg.FileRange()) + mf.DecRef(seg.FileRange()) } frs.RemoveAll() } @@ -164,7 +165,7 @@ func (frs *FileRangeSet) DropAll(mem platform.Memory) { // Truncate updates frs to reflect Mappable truncation to the given length: // bytes after the new EOF on the same page are zeroed, and pages after the new // EOF are freed. -func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) { +func (frs *FileRangeSet) Truncate(end uint64, mf *pgalloc.MemoryFile) { pgendaddr, ok := usermem.Addr(end).RoundUp() if ok { pgend := uint64(pgendaddr) @@ -173,7 +174,7 @@ func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) { frs.SplitAt(pgend) seg := frs.LowerBoundSegment(pgend) for seg.Ok() { - mem.DecRef(seg.FileRange()) + mf.DecRef(seg.FileRange()) seg = frs.Remove(seg).NextSegment() } @@ -189,7 +190,7 @@ func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) { if seg.Ok() { fr := seg.FileRange() fr.Start += end - seg.Start() - ims, err := mem.MapInternal(fr, usermem.Write) + ims, err := mf.MapInternal(fr, usermem.Write) if err != nil { // There's no good recourse from here. 
This means // that we can't keep cached memory consistent with diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index ef11676b8..9bd923678 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -62,8 +63,8 @@ type CachingInodeOperations struct { // backingFile is a handle to a cached file object. backingFile CachedFileObject - // platform is used to allocate memory that caches backingFile's contents. - platform platform.Platform + // mfp is used to allocate memory that caches backingFile's contents. + mfp pgalloc.MemoryFileProvider // forcePageCache indicates the sentry page cache should be used regardless // of whether the platform supports host mapped I/O or not. This must not be @@ -96,7 +97,7 @@ type CachingInodeOperations struct { dataMu sync.RWMutex `state:"nosave"` // cache maps offsets into the cached file to offsets into - // platform.Memory() that store the file's data. + // mfp.MemoryFile() that store the file's data. // // cache is protected by dataMu. cache FileRangeSet @@ -148,13 +149,13 @@ type CachedFileObject interface { // NewCachingInodeOperations returns a new CachingInodeOperations backed by // a CachedFileObject and its initial unstable attributes. 
func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations { - p := platform.FromContext(ctx) - if p == nil { - panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) } return &CachingInodeOperations{ backingFile: backingFile, - platform: p, + mfp: mfp, forcePageCache: forcePageCache, attr: uattr, hostFileMapper: NewHostFileMapper(), @@ -311,7 +312,7 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, // written back. c.dataMu.Lock() defer c.dataMu.Unlock() - c.cache.Truncate(uint64(size), c.platform.Memory()) + c.cache.Truncate(uint64(size), c.mfp.MemoryFile()) c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend}) return nil @@ -323,7 +324,7 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) // Write dirty pages back. 
c.dataMu.Lock() - err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt) + err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.mfp.MemoryFile(), c.backingFile.WriteFromBlocksAt) c.dataMu.Unlock() if err != nil { c.attrMu.Unlock() @@ -527,7 +528,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return 0, nil } - mem := rw.c.platform.Memory() + mem := rw.c.mfp.MemoryFile() var done uint64 seg, gap := rw.c.cache.Find(uint64(rw.offset)) for rw.offset < end { @@ -613,7 +614,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error return 0, nil } - mem := rw.c.platform.Memory() + mf := rw.c.mfp.MemoryFile() var done uint64 seg, gap := rw.c.cache.Find(uint64(rw.offset)) for rw.offset < end { @@ -622,7 +623,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error case seg.Ok() && seg.Start() < mr.End: // Get internal mappings from the cache. segMR := seg.Range().Intersect(mr) - ims, err := mem.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write) if err != nil { rw.maybeGrowFile() rw.c.dataMu.Unlock() @@ -711,13 +712,13 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma // Writeback dirty mapped memory now that there are no longer any // mappings that reference it. This is our naive memory eviction // strategy. 
- mem := c.platform.Memory() + mf := c.mfp.MemoryFile() c.dataMu.Lock() for _, r := range unmapped { - if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", r, err) } - c.cache.Drop(r, mem) + c.cache.Drop(r, mf) c.dirty.KeepClean(r) } c.dataMu.Unlock() @@ -760,8 +761,8 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option optional.End = pgend } - mem := c.platform.Memory() - cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mem, usage.PageCache, c.backingFile.ReadToBlocksAt) + mf := c.mfp.MemoryFile() + cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 @@ -769,7 +770,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option segMR := seg.Range().Intersect(optional) ts = append(ts, memmap.Translation{ Source: segMR, - File: mem, + File: mf, Offset: seg.FileRangeOf(segMR).Start, }) if at.Write { @@ -820,16 +821,17 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error // Sync the cache's contents so that if we have a host fd after restore, // the remote file's contents are coherent. + mf := c.mfp.MemoryFile() c.dataMu.Lock() defer c.dataMu.Unlock() - if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { return err } // Discard the cache so that it's not stored in saved state. 
This is safe // because per InvalidateUnsavable invariants, no new translations can have // been returned after we invalidated all existing translations above. - c.cache.DropAll(c.platform.Memory()) + c.cache.DropAll(mf) c.dirty.RemoveAll() return nil diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index b31258eed..620e93ce3 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -44,10 +44,10 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) return nil, 0 } - mem := d.k.Platform.Memory() - mem.UpdateUsage() + mf := d.k.MemoryFile() + mf.UpdateUsage() snapshot, totalUsage := usage.MemoryAccounting.Copy() - totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage) + totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) anon := snapshot.Anonymous + snapshot.Tmpfs file := snapshot.PageCache + snapshot.Mapped // We don't actually have active/inactive LRUs, so just make up numbers. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 13d06684d..a98fbf0f1 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -52,7 +52,7 @@ type fileInodeOperations struct { fsutil.InodeSimpleExtendedAttributes - // kernel is used to allocate platform memory that stores the file's contents. + // kernel is used to allocate memory that stores the file's contents. kernel *kernel.Kernel // memUsage is the default memory usage that will be reported by this file. @@ -85,7 +85,7 @@ type fileInodeOperations struct { var _ fs.InodeOperations = (*fileInodeOperations)(nil) -// NewInMemoryFile returns a new file backed by p.Memory(). +// NewInMemoryFile returns a new file backed by Kernel.MemoryFile(). 
func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr) fs.InodeOperations { return &fileInodeOperations{ attr: uattr, @@ -98,7 +98,7 @@ func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.Unsta func (f *fileInodeOperations) Release(context.Context) { f.dataMu.Lock() defer f.dataMu.Unlock() - f.data.DropAll(f.kernel.Platform.Memory()) + f.data.DropAll(f.kernel.MemoryFile()) } // Mappable implements fs.InodeOperations.Mappable. @@ -202,7 +202,7 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in // and can remove them. f.dataMu.Lock() defer f.dataMu.Unlock() - f.data.Truncate(uint64(size), f.kernel.Platform.Memory()) + f.data.Truncate(uint64(size), f.kernel.MemoryFile()) return nil } @@ -312,7 +312,7 @@ func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return 0, nil } - mem := rw.f.kernel.Platform.Memory() + mf := rw.f.kernel.MemoryFile() var done uint64 seg, gap := rw.f.data.Find(uint64(rw.offset)) for rw.offset < end { @@ -320,7 +320,7 @@ func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { switch { case seg.Ok(): // Get internal mappings. - ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) if err != nil { return done, err } @@ -378,7 +378,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) } }() - mem := rw.f.kernel.Platform.Memory() + mf := rw.f.kernel.MemoryFile() // Page-aligned mr for when we need to allocate memory. RoundUp can't // overflow since end is an int64. pgstartaddr := usermem.Addr(rw.offset).RoundDown() @@ -392,7 +392,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) switch { case seg.Ok(): // Get internal mappings. 
- ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) + ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) if err != nil { return done, err } @@ -412,7 +412,7 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) case gap.Ok(): // Allocate memory for the write. gapMR := gap.Range().Intersect(pgMR) - fr, err := mem.Allocate(gapMR.Length(), rw.f.memUsage) + fr, err := mf.Allocate(gapMR.Length(), rw.f.memUsage) if err != nil { return done, err } @@ -467,8 +467,8 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional optional.End = pgend } - mem := f.kernel.Platform.Memory() - cerr := f.data.Fill(ctx, required, optional, mem, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + mf := f.kernel.MemoryFile() + cerr := f.data.Fill(ctx, required, optional, mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) @@ -479,7 +479,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional segMR := seg.Range().Intersect(optional) ts = append(ts, memmap.Translation{ Source: segMR, - File: mem, + File: mf, Offset: seg.FileRangeOf(segMR).Start, }) translatedEnd = segMR.End diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 4b1762ce4..1a9d12c0b 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -74,7 +74,7 @@ type Dir struct { // InodeOperation methods to it. ramfsDir *ramfs.Dir - // kernel is used to allocate platform memory as storage for tmpfs Files. + // kernel is used to allocate memory as storage for tmpfs Files. 
kernel *kernel.Kernel } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index d9bbfb556..4d34bc733 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -173,6 +173,7 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/memmap", "//pkg/sentry/mm", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/safemem", "//pkg/sentry/socket/netlink/port", @@ -212,7 +213,7 @@ go_test( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/sched", "//pkg/sentry/limits", - "//pkg/sentry/platform", + "//pkg/sentry/pgalloc", "//pkg/sentry/time", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index 5769a3b28..bfb2a0b73 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -12,6 +12,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/kernel", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", ], ) diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go index 9eb18e7e8..eb56a6a07 100644 --- a/pkg/sentry/kernel/contexttest/contexttest.go +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" ) @@ -33,6 +34,7 @@ func Context(tb testing.TB) context.Context { k := &kernel.Kernel{ Platform: platform.FromContext(ctx), } + k.SetMemoryFile(pgalloc.MemoryFileFromContext(ctx)) ctx.(*contexttest.TestContext).RegisterValue(kernel.CtxKernel, k) return ctx } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c6afae2e6..3533fd8f7 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -58,6 +58,7 @@ import ( 
"gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/loader" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" @@ -89,12 +90,14 @@ type Kernel struct { // All of the following fields are immutable unless otherwise specified. - // Platform is the platform that is used to execute tasks in the - // created Kernel. It is embedded so that Kernel can directly serve as - // Platform in mm logic and also serve as platform.MemoryProvider in - // filemem S/R logic. + // Platform is the platform that is used to execute tasks in the created + // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is + // embedded anonymously (the same issue applies). platform.Platform `state:"nosave"` + // mf provides application memory. + mf *pgalloc.MemoryFile `state:"nosave"` + // See InitKernelArgs for the meaning of these fields. featureSet *cpuid.FeatureSet timekeeper *Timekeeper @@ -229,7 +232,8 @@ type InitKernelArgs struct { // Init initialize the Kernel with no tasks. // -// Callers must manually set Kernel.Platform before caling Init. +// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile +// before calling Init. func (k *Kernel) Init(args InitKernelArgs) error { if args.FeatureSet == nil { return fmt.Errorf("FeatureSet is nil") @@ -332,15 +336,9 @@ func (k *Kernel) SaveTo(w io.Writer) error { log.Infof("Kernel save stats: %s", &stats) log.Infof("Kernel save took [%s].", time.Since(kernelStart)) - // Save the memory state. - // - // FIXME: In the future, this should not be dispatched via - // an abstract memory type. This should be dispatched to a single - // memory implementation that belongs to the kernel. 
(There is - // currently a single implementation anyways, it just needs to be - // "unabstracted" and reparented appropriately.) + // Save the memory file's state. memoryStart := time.Now() - if err := k.Platform.Memory().SaveTo(w); err != nil { + if err := k.mf.SaveTo(w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) @@ -418,13 +416,9 @@ func (ts *TaskSet) unregisterEpollWaiters() { } // LoadFrom returns a new Kernel loaded from args. -func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error { +func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { loadStart := time.Now() - if p == nil { - return fmt.Errorf("Platform is nil") - } - k.Platform = p k.networkStack = net initAppCores := k.applicationCores @@ -438,11 +432,9 @@ func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) erro log.Infof("Kernel load stats: %s", &stats) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) - // Load the memory state. - // - // See the note in SaveTo. + // Load the memory file's state. memoryStart := time.Now() - if err := k.Platform.Memory().LoadFrom(r); err != nil { + if err := k.mf.LoadFrom(r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) @@ -597,6 +589,10 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k.RealtimeClock() case limits.CtxLimits: return ctx.args.Limits + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k case platform.CtxPlatform: return ctx.k case uniqueid.CtxGlobalUniqueID: @@ -1018,6 +1014,17 @@ func (k *Kernel) NowMonotonic() int64 { return now } +// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or +// LoadFrom. +func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { + k.mf = mf +} + +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. 
+func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { + return k.mf +} + // SupervisorContext returns a Context with maximum privileges in k. It should // only be used by goroutines outside the control of the emulated kernel // defined by e. @@ -1083,7 +1090,7 @@ func (k *Kernel) ListSockets(family int) []*refs.WeakRef { socks := []*refs.WeakRef{} if table, ok := k.socketTable[family]; ok { socks = make([]*refs.WeakRef, 0, len(table)) - for s, _ := range table { + for s := range table { socks = append(socks, s) } } @@ -1123,6 +1130,10 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { case limits.CtxLimits: // No limits apply. return limits.NewLimitSet() + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k case platform.CtxPlatform: return ctx.k case uniqueid.CtxGlobalUniqueID: diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index b6283c5d1..d09d6debf 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -95,7 +95,7 @@ func (m *MemoryEvents) run() { } func (m *MemoryEvents) emit() { - totalPlatform, err := m.k.Platform.Memory().TotalUsage() + totalPlatform, err := m.k.MemoryFile().TotalUsage() if err != nil { log.Warningf("Failed to fetch memory usage for memory events: %v", err) return diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index f45770eef..bc2089872 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -20,6 +20,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 96414d060..4525aabf4 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -45,6 +45,7 @@ import ( 
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -199,19 +200,19 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui // // Precondition: Caller must hold r.mu. func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { - p := platform.FromContext(ctx) - if p == nil { - panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) } effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) - fr, err := p.Memory().Allocate(effectiveSize, usage.Anonymous) + fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) if err != nil { return nil, err } shm := &Shm{ - p: p, + mfp: mfp, registry: r, creator: creator, size: size, @@ -312,7 +313,7 @@ type Shm struct { // destruction. refs.AtomicRefCount - p platform.Platform + mfp pgalloc.MemoryFileProvider // registry points to the shm registry containing this segment. Immutable. registry *Registry @@ -333,7 +334,7 @@ type Shm struct { // Invariant: effectiveSize must be a multiple of usermem.PageSize. effectiveSize uint64 - // fr is the offset into platform.Memory() that backs this contents of this + // fr is the offset into mfp.MemoryFile() that backs this contents of this // segment. Immutable. 
fr platform.FileRange @@ -452,7 +453,7 @@ func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableR return []memmap.Translation{ { Source: source, - File: s.p.Memory(), + File: s.mfp.MemoryFile(), Offset: s.fr.Start + source.Start, }, }, err @@ -599,7 +600,7 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { } func (s *Shm) destroy() { - s.p.Memory().DecRef(s.fr) + s.mfp.MemoryFile().DecRef(s.fr) s.registry.remove(s) } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 702e40cce..e9f133c0b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" @@ -587,6 +588,10 @@ func (t *Task) Value(key interface{}) interface{} { return t.k.RealtimeClock() case limits.CtxLimits: return t.tg.limits + case pgalloc.CtxMemoryFile: + return t.k.mf + case pgalloc.CtxMemoryFileProvider: + return t.k case platform.CtxPlatform: return t.k case uniqueid.CtxGlobalUniqueID: diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index ee3e49d17..d1c82f2aa 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -144,7 +144,7 @@ func (t *Task) Stack() *arch.Stack { // * fs: Binary FeatureSet func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) { // Prepare a new user address space to load into. 
- m := mm.NewMemoryManager(k) + m := mm.NewMemoryManager(k, k) defer m.DecUsers(ctx) os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso) diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 6bff80f13..d7bd85e78 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" ) @@ -85,9 +86,9 @@ type Timekeeper struct { // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. -func NewTimekeeper(platform platform.Platform, paramPage platform.FileRange) (*Timekeeper, error) { +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { return &Timekeeper{ - params: NewVDSOParamPage(platform, paramPage), + params: NewVDSOParamPage(mfp, paramPage), }, nil } diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 71674c21c..6084bcb18 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -18,7 +18,7 @@ import ( "testing" "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -53,13 +53,13 @@ func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { // SetClocks called. 
func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper { ctx := contexttest.Context(tb) - p := platform.FromContext(ctx) - fr, err := p.Memory().Allocate(usermem.PageSize, usage.Anonymous) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + fr, err := mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) if err != nil { tb.Fatalf("failed to allocate memory: %v", err) } return &Timekeeper{ - params: NewVDSOParamPage(p, fr), + params: NewVDSOParamPage(mfp, fr), } } diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 0ec858a4a..3a35f1d00 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -55,9 +56,9 @@ type vdsoParams struct { // // +stateify savable type VDSOParamPage struct { - // The parameter page is fr, allocated from platform.Memory(). - platform platform.Platform - fr platform.FileRange + // The parameter page is fr, allocated from mfp.MemoryFile(). + mfp pgalloc.MemoryFileProvider + fr platform.FileRange // seq is the current sequence count written to the page. // @@ -73,20 +74,20 @@ type VDSOParamPage struct { // // Preconditions: // -// * fr is a single page allocated from platform.Memory(). VDSOParamPage does +// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does // not take ownership of fr; it must remain allocated for the lifetime of the // VDSOParamPage. // // * VDSOParamPage must be the only writer to fr. // -// * platform.Memory().MapInternal(fr) must return a single safemem.Block. 
-func NewVDSOParamPage(platform platform.Platform, fr platform.FileRange) *VDSOParamPage { - return &VDSOParamPage{platform: platform, fr: fr} +// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { + return &VDSOParamPage{mfp: mfp, fr: fr} } // access returns a mapping of the param page. func (v *VDSOParamPage) access() (safemem.Block, error) { - bs, err := v.platform.Memory().MapInternal(v.fr, usermem.ReadWrite) + bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite) if err != nil { return safemem.Block{}, err } diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 1ea260a4e..66300f25a 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -39,7 +39,7 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/mm", - "//pkg/sentry/platform", + "//pkg/sentry/pgalloc", "//pkg/sentry/safemem", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index c070c7316..273f6b5b9 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -28,7 +28,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -217,7 +217,7 @@ type VDSO struct { // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. -func PrepareVDSO(p platform.Platform) (*VDSO, error) { +func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { vdsoFile := newByteReaderFile(vdsoBin) // First make sure the VDSO is valid. 
vdsoFile does not use ctx, so a @@ -234,35 +234,36 @@ func PrepareVDSO(p platform.Platform) (*VDSO, error) { return nil, fmt.Errorf("VDSO size overflows? %#x", len(vdsoBin)) } - vdso, err := p.Memory().Allocate(uint64(size), usage.System) + mf := mfp.MemoryFile() + vdso, err := mf.Allocate(uint64(size), usage.System) if err != nil { return nil, fmt.Errorf("unable to allocate VDSO memory: %v", err) } - ims, err := p.Memory().MapInternal(vdso, usermem.ReadWrite) + ims, err := mf.MapInternal(vdso, usermem.ReadWrite) if err != nil { - p.Memory().DecRef(vdso) + mf.DecRef(vdso) return nil, fmt.Errorf("unable to map VDSO memory: %v", err) } _, err = safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(vdsoBin))) if err != nil { - p.Memory().DecRef(vdso) + mf.DecRef(vdso) return nil, fmt.Errorf("unable to copy VDSO into memory: %v", err) } // Finally, allocate a param page for this VDSO. - paramPage, err := p.Memory().Allocate(usermem.PageSize, usage.System) + paramPage, err := mf.Allocate(usermem.PageSize, usage.System) if err != nil { - p.Memory().DecRef(vdso) + mf.DecRef(vdso) return nil, fmt.Errorf("unable to allocate VDSO param page: %v", err) } return &VDSO{ - ParamPage: mm.NewSpecialMappable("[vvar]", p, paramPage), + ParamPage: mm.NewSpecialMappable("[vvar]", mfp, paramPage), // TODO: Don't advertise the VDSO, as some applications may // not be able to handle multiple [vdso] hints. - vdso: mm.NewSpecialMappable("", p, vdso), + vdso: mm.NewSpecialMappable("", mfp, vdso), phdrs: info.phdrs, }, nil } diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go index 8d9fc64fb..bc2c72f55 100644 --- a/pkg/sentry/memutil/memutil_unsafe.go +++ b/pkg/sentry/memutil/memutil_unsafe.go @@ -15,6 +15,7 @@ package memutil import ( + "fmt" "syscall" "unsafe" @@ -22,14 +23,17 @@ import ( ) // CreateMemFD creates a memfd file and returns the fd. 
-func CreateMemFD(name string, flags int) (fd int, err error) { +func CreateMemFD(name string, flags int) (int, error) { p, err := syscall.BytePtrFromString(name) if err != nil { return -1, err } - r0, _, e0 := syscall.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) - if e0 != 0 { - return -1, e0 + fd, _, e := syscall.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) + if e != 0 { + if e == syscall.ENOSYS { + return -1, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") + } + return -1, e } - return int(r0), nil + return int(fd), nil } diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index a85ffdef8..c78cb4280 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -111,6 +111,7 @@ go_library( "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/safecopy", "//pkg/sentry/safemem", @@ -133,6 +134,7 @@ go_test( "//pkg/sentry/context/contexttest", "//pkg/sentry/limits", "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/usermem", "//pkg/syserror", diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md index e485a5ca5..e6efbf565 100644 --- a/pkg/sentry/mm/README.md +++ b/pkg/sentry/mm/README.md @@ -153,7 +153,7 @@ manner, and the sentry handles the fault: represented by a host file descriptor and offset, since (as noted in "Background") this is the memory mapping primitive provided by the host kernel. In general, memory is allocated from a temporary host file using the - `filemem` package. Supposing that the sentry allocates offset 0x3000 from + `pgalloc` package. Supposing that the sentry allocates offset 0x3000 from host file "memory-file", the resulting state is: Sentry VMA: VA:0x400000 -> /tmp/foo:0x0 @@ -274,7 +274,7 @@ In the sentry: methods [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. 
-[filemem]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/filemem/filemem.go [memmap]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/memmap/memmap.go [mm]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/mm/mm.go +[pgalloc]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/pgalloc/pgalloc.go [platform]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/platform/platform.go diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 5e86d3b49..6cec6387a 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -201,24 +202,24 @@ func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { type aioMappable struct { refs.AtomicRefCount - p platform.Platform - fr platform.FileRange + mfp pgalloc.MemoryFileProvider + fr platform.FileRange } var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) -func newAIOMappable(p platform.Platform) (*aioMappable, error) { - fr, err := p.Memory().Allocate(aioRingBufferSize, usage.Anonymous) +func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { + fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous) if err != nil { return nil, err } - return &aioMappable{p: p, fr: fr}, nil + return &aioMappable{mfp: mfp, fr: fr}, nil } // DecRef implements refs.RefCounter.DecRef. 
func (m *aioMappable) DecRef() { m.AtomicRefCount.DecRefWithDestructor(func() { - m.p.Memory().DecRef(m.fr) + m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -299,7 +300,7 @@ func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.M return []memmap.Translation{ { Source: source, - File: m.p.Memory(), + File: m.mfp.MemoryFile(), Offset: m.fr.Start + source.Start, }, }, err @@ -320,7 +321,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint // libaio peeks inside looking for a magic number. This function allocates // a page per context and keeps it set to zeroes to ensure it will not // match AIO_RING_MAGIC and make libaio happy. - m, err := newAIOMappable(mm.p) + m, err := newAIOMappable(mm.mfp) if err != nil { return 0, err } diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 1ee8ae74e..a71286f14 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -23,14 +23,16 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) // NewMemoryManager returns a new MemoryManager with no mappings and 1 user. 
-func NewMemoryManager(p platform.Platform) *MemoryManager { +func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager { return &MemoryManager{ p: p, + mfp: mfp, haveASIO: p.SupportsAddressSpaceIO(), privateRefs: &privateRefs{}, users: 1, @@ -60,6 +62,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ p: mm.p, + mfp: mm.mfp, haveASIO: mm.haveASIO, layout: mm.layout, privateRefs: mm.privateRefs, diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index e2c636f38..6ed838d64 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -40,6 +40,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -50,10 +51,9 @@ import ( // // +stateify savable type MemoryManager struct { - // p is the platform. - // - // p is immutable. - p platform.Platform + // p and mfp are immutable. + p platform.Platform + mfp pgalloc.MemoryFileProvider // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from // eliminating an indirect call in the hot I/O path, this makes @@ -369,8 +369,8 @@ func (v *vma) loadRealPerms(b int) { // +stateify savable type pma struct { // file is the file mapped by this pma. Only pmas for which file == - // platform.Platform.Memory() may be saved. pmas hold a reference to the - // corresponding file range while they exist. + // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to + // the corresponding file range while they exist. file platform.File `state:"nosave"` // off is the offset into file at which this pma begins. 
@@ -387,7 +387,7 @@ type pma struct { // private is true if this pma represents private memory. // - // If private is true, file must be platform.Platform.Memory(), the pma + // If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma // holds a reference on the mapped memory that is tracked in privateRefs, // and calls to Invalidate for which // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. @@ -405,9 +405,9 @@ type pma struct { type privateRefs struct { mu sync.Mutex `state:"nosave"` - // refs maps offsets into Platform.Memory() to the number of pmas (or, - // equivalently, MemoryManagers) that share ownership of the memory at that - // offset. + // refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of + // pmas (or, equivalently, MemoryManagers) that share ownership of the + // memory at that offset. refs fileRefcountSet } diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index f2db43196..e12cb3bd1 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -29,7 +30,8 @@ import ( func testMemoryManager(ctx context.Context) *MemoryManager { p := platform.FromContext(ctx) - mm := NewMemoryManager(p) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + mm := NewMemoryManager(p, mfp) mm.layout = arch.MmapLayout{ MinAddr: p.MinUserAddress(), MaxAddr: p.MaxUserAddress(), diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index d102035d8..bb779a45b 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -328,8 +328,8 @@ func (mm *MemoryManager) insertPMAsLocked(ctx 
context.Context, vseg vmaIterator, // Limit the range we allocate to ar, aligned to privateAllocUnit. maskAR := privateAligned(ar) allocAR := optAR.Intersect(maskAR) - mem := mm.p.Memory() - fr, err := mem.Allocate(uint64(allocAR.Length()), usage.Anonymous) + mf := mm.mfp.MemoryFile() + fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous) if err != nil { return pgap, err } @@ -342,10 +342,10 @@ func (mm *MemoryManager) insertPMAsLocked(ctx context.Context, vseg vmaIterator, } mm.addRSSLocked(allocAR) - mem.IncRef(fr) + mf.IncRef(fr) return mm.pmas.Insert(pgap, allocAR, pma{ - file: mem, + file: mf, off: fr.Start, vmaEffectivePerms: vma.effectivePerms, vmaMaxPerms: vma.maxPerms, @@ -426,7 +426,7 @@ func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.Add // Limit the range we copy to ar, aligned to privateAllocUnit. maskAR := privateAligned(ar) var invalidatedIterators, didUnmapAS bool - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for { if mm.isPMACopyOnWriteLocked(pseg) { // Determine the range to copy. @@ -438,7 +438,7 @@ func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.Add } // Copy contents. - fr, err := platform.AllocateAndFill(mem, uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) + fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) if _, ok := err.(safecopy.BusError); ok { // If we got SIGBUS during the copy, deliver SIGBUS to // userspace (instead of SIGSEGV) if we're breaking @@ -449,7 +449,7 @@ func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.Add return pseg.PrevGap(), invalidatedIterators, err } mm.incPrivateRef(fr) - mem.IncRef(fr) + mf.IncRef(fr) // Unmap all of maskAR, not just copyAR, to minimize host syscalls. // AddressSpace mappings must be removed before mm.decPrivateRef(). 
@@ -471,7 +471,7 @@ func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.Add } pma.file.DecRef(pseg.fileRange()) - pma.file = mem + pma.file = mf pma.off = fr.Start pma.private = true pma.needCOW = false @@ -881,9 +881,9 @@ func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { refSet.MergeAdjacent(fr) mm.privateRefs.mu.Unlock() - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for _, fr := range freed { - mem.DecRef(fr) + mf.DecRef(fr) } } diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go index 6e7080a84..46e0e0754 100644 --- a/pkg/sentry/mm/save_restore.go +++ b/pkg/sentry/mm/save_restore.go @@ -37,12 +37,12 @@ func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error { // beforeSave is invoked by stateify. func (mm *MemoryManager) beforeSave() { - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { - if pma := pseg.ValuePtr(); pma.file != mem { + if pma := pseg.ValuePtr(); pma.file != mf { // InvalidateUnsavable should have caused all such pmas to be // invalidated. - panic(fmt.Sprintf("Can't save pma %#v with non-Memory file of type %T:\n%s", pseg.Range(), pma.file, mm)) + panic(fmt.Sprintf("Can't save pma %#v with non-MemoryFile of type %T:\n%s", pseg.Range(), pma.file, mm)) } } } @@ -50,8 +50,8 @@ func (mm *MemoryManager) beforeSave() { // afterLoad is invoked by stateify. 
func (mm *MemoryManager) afterLoad() { mm.haveASIO = mm.p.SupportsAddressSpaceIO() - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { - pseg.ValuePtr().file = mem + pseg.ValuePtr().file = mf } } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 64d0dd3f6..aa94d7d6a 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -18,6 +18,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -33,24 +34,24 @@ import ( type SpecialMappable struct { refs.AtomicRefCount - p platform.Platform + mfp pgalloc.MemoryFileProvider fr platform.FileRange name string } // NewSpecialMappable returns a SpecialMappable that owns fr, which represents -// offsets in p.Memory() that contain the SpecialMappable's data. The +// offsets in mfp.MemoryFile() that contain the SpecialMappable's data. The // SpecialMappable will use the given name in /proc/[pid]/maps. // // Preconditions: fr.Length() != 0. -func NewSpecialMappable(name string, p platform.Platform, fr platform.FileRange) *SpecialMappable { - return &SpecialMappable{p: p, fr: fr, name: name} +func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { + return &SpecialMappable{mfp: mfp, fr: fr, name: name} } // DecRef implements refs.RefCounter.DecRef. 
func (m *SpecialMappable) DecRef() { m.AtomicRefCount.DecRefWithDestructor(func() { - m.p.Memory().DecRef(m.fr) + m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -99,7 +100,7 @@ func (m *SpecialMappable) Translate(ctx context.Context, required, optional memm return []memmap.Translation{ { Source: source, - File: m.p.Memory(), + File: m.mfp.MemoryFile(), Offset: m.fr.Start + source.Start, }, }, err @@ -109,19 +110,19 @@ func (m *SpecialMappable) Translate(ctx context.Context, required, optional memm // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error { - // Since data is stored in platform.Platform.Memory(), the contents of - // which are preserved across save/restore, we don't need to do anything. + // Since data is stored in pgalloc.MemoryFile, the contents of which are + // preserved across save/restore, we don't need to do anything. return nil } -// Platform returns the Platform whose Memory stores the SpecialMappable's -// contents. -func (m *SpecialMappable) Platform() platform.Platform { - return m.p +// MemoryFileProvider returns the MemoryFileProvider whose MemoryFile stores +// the SpecialMappable's contents. +func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { + return m.mfp } -// FileRange returns the offsets into Platform().Memory() that store the -// SpecialMappable's contents. +// FileRange returns the offsets into MemoryFileProvider().MemoryFile() that +// store the SpecialMappable's contents. func (m *SpecialMappable) FileRange() platform.FileRange { return m.fr } @@ -137,7 +138,7 @@ func (m *SpecialMappable) Length() uint64 { // TODO: The use of SpecialMappable is a lazy code reuse hack. Linux // uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should // do the same to get non-zero device and inode IDs. 
-func NewSharedAnonMappable(length uint64, p platform.Platform) (*SpecialMappable, error) { +func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { if length == 0 { return nil, syserror.EINVAL } @@ -145,10 +146,9 @@ func NewSharedAnonMappable(length uint64, p platform.Platform) (*SpecialMappable if !ok { return nil, syserror.EINVAL } - - fr, err := p.Memory().Allocate(uint64(alignedLen), usage.Anonymous) + fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous) if err != nil { return nil, err } - return NewSpecialMappable("/dev/zero (deleted)", p, fr), nil + return NewSpecialMappable("/dev/zero (deleted)", mfp, fr), nil } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index fd6929e08..b56e0d3b9 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -24,7 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -99,7 +99,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme if opts.MappingIdentity != nil { return 0, syserror.EINVAL } - m, err := NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) + m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) if err != nil { return 0, err } @@ -965,7 +965,7 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. 
var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) - mem := mm.p.Memory() + mf := mm.mfp.MemoryFile() for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vma := vseg.ValuePtr() if vma.mlockMode != memmap.MLockNone { @@ -984,7 +984,7 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { psegAR := pseg.Range().Intersect(ar) if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil { pseg = pseg.NextSegment() continue } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD new file mode 100644 index 000000000..7efa55c20 --- /dev/null +++ b/pkg/sentry/pgalloc/BUILD @@ -0,0 +1,57 @@ +package(licenses = ["notice"]) + +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +go_template_instance( + name = "usage_set", + out = "usage_set.go", + consts = { + "minDegree": "10", + }, + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "pgalloc", + prefix = "usage", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "usageInfo", + "Functions": "usageSetFunctions", + }, +) + +go_library( + name = "pgalloc", + srcs = [ + "context.go", + "pgalloc.go", + "pgalloc_unsafe.go", + "save_restore.go", + "usage_set.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/memutil", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) + +go_test( + name = "pgalloc_test", + size = 
"small", + srcs = ["pgalloc_test.go"], + embed = [":pgalloc"], + deps = ["//pkg/sentry/usermem"], +) diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go new file mode 100644 index 000000000..adc97e78f --- /dev/null +++ b/pkg/sentry/pgalloc/context.go @@ -0,0 +1,48 @@ +// Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pgalloc + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is this package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxMemoryFile is a Context.Value key for a MemoryFile. + CtxMemoryFile contextID = iota + + // CtxMemoryFileProvider is a Context.Value key for a MemoryFileProvider. + CtxMemoryFileProvider +) + +// MemoryFileFromContext returns the MemoryFile used by ctx, or nil if no such +// MemoryFile exists. +func MemoryFileFromContext(ctx context.Context) *MemoryFile { + if v := ctx.Value(CtxMemoryFile); v != nil { + return v.(*MemoryFile) + } + return nil +} + +// MemoryFileProviderFromContext returns the MemoryFileProvider used by ctx, or nil if no such +// MemoryFileProvider exists. 
+func MemoryFileProviderFromContext(ctx context.Context) MemoryFileProvider { + if v := ctx.Value(CtxMemoryFileProvider); v != nil { + return v.(MemoryFileProvider) + } + return nil +} diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go new file mode 100644 index 000000000..0754e608f --- /dev/null +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -0,0 +1,922 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pgalloc contains the page allocator subsystem, which manages memory +// that may be mapped into application address spaces. +// +// Lock order: +// +// pgalloc.MemoryFile.mu +// pgalloc.MemoryFile.mappingsMu +package pgalloc + +import ( + "fmt" + "math" + "os" + "sync" + "sync/atomic" + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MemoryFile is a platform.File whose pages may be allocated to arbitrary +// users. +type MemoryFile struct { + // MemoryFile owns a single backing file, which is modeled as follows: + // + // Each page in the file can be committed or uncommitted. A page is + // committed if the host kernel is spending resources to store its contents + // and uncommitted otherwise. 
This definition includes pages that the host + // kernel has swapped; this is intentional, to ensure that accounting does + // not change even if host kernel swapping behavior changes, and that + // memory used by pseudo-swap mechanisms like zswap is still accounted. + // + // The initial contents of uncommitted pages are implicitly zero bytes. A + // read or write to the contents of an uncommitted page causes it to be + // committed. This is the only event that can cause an uncommitted page to + // be committed. + // + // fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed + // pages to be uncommitted. This is the only event that can cause a + // committed page to be uncommitted. + // + // Memory accounting is based on identifying the set of committed pages. + // Since we do not have direct access to the MMU, tracking reads and writes + // to uncommitted pages to detect commitment would introduce additional + // page faults, which would be prohibitively expensive. Instead, we query + // the host kernel to determine which pages are committed. + + // file is the backing file. The file pointer is immutable. + file *os.File + + mu sync.Mutex + + // usage maps each page in the file to metadata for that page. Pages for + // which no segment exists in usage are both unallocated (not in use) and + // uncommitted. + // + // Since usage stores usageInfo objects by value, clients should usually + // use usageIterator.ValuePtr() instead of usageIterator.Value() to get a + // pointer to the usageInfo rather than a copy. + // + // usage must be kept maximally merged (that is, there should never be two + // adjacent segments with the same values). At least markReclaimed depends + // on this property. + // + // usage is protected by mu. + usage usageSet + + // The UpdateUsage function scans all segments with knownCommitted set + // to false, sees which pages are committed and creates corresponding + // segments with knownCommitted set to true. 
+ // + // In order to avoid unnecessary scans, usageExpected tracks the total + // file blocks expected. This is used to elide the scan when this + // matches the underlying file blocks. + // + // To track swapped pages, usageSwapped tracks the discrepancy between + // what is observed in core and what is reported by the file. When + // usageSwapped is non-zero, a sweep will be performed at least every + // second. The start of the last sweep is recorded in usageLast. + // + // All usage attributes are protected by mu. + usageExpected uint64 + usageSwapped uint64 + usageLast time.Time + + // minUnallocatedPage is the minimum page that may be unallocated. + // i.e., there are no unallocated pages below minUnallocatedPage. + // + // minUnallocatedPage is protected by mu. + minUnallocatedPage uint64 + + // fileSize is the size of the backing memory file in bytes. fileSize is + // always a power-of-two multiple of chunkSize. + // + // fileSize is protected by mu. + fileSize int64 + + // destroyed is set by Destroy to instruct the reclaimer goroutine to + // release resources and exit. destroyed is protected by mu. + destroyed bool + + // reclaimable is true if usage may contain reclaimable pages. reclaimable + // is protected by mu. + reclaimable bool + + // minReclaimablePage is the minimum page that may be reclaimable. + // i.e., all reclaimable pages are >= minReclaimablePage. + // + // minReclaimablePage is protected by mu. + minReclaimablePage uint64 + + // reclaimCond is signaled (with mu locked) when reclaimable or destroyed + // transitions from false to true. + reclaimCond sync.Cond + + // Pages from the backing file are mapped into the local address space on + // the granularity of large pieces called chunks. mappings is a []uintptr + // that stores, for each chunk, the start address of a mapping of that + // chunk in the current process' address space, or 0 if no such mapping + // exists. 
Once a chunk is mapped, it is never remapped or unmapped until + // the MemoryFile is destroyed. + // + // Mutating the mappings slice or its contents requires both holding + // mappingsMu and using atomic memory operations. (The slice is mutated + // whenever the file is expanded. Per the above, the only permitted + // mutation of the slice's contents is the assignment of a mapping to a + // chunk that was previously unmapped.) Reading the slice or its contents + // only requires *either* holding mappingsMu or using atomic memory + // operations. This allows MemoryFile.MapInternal to avoid locking in the + // common case where chunk mappings already exist. + mappingsMu sync.Mutex + mappings atomic.Value +} + +// usage tracks usage information. +// +// +stateify savable +type usageInfo struct { + // kind is the usage kind. + kind usage.MemoryKind + + // knownCommitted is true if the tracked region is definitely committed. + // (If it is false, the tracked region may or may not be committed.) + knownCommitted bool + + refs uint64 +} + +const ( + chunkShift = 24 + chunkSize = 1 << chunkShift // 16 MB + chunkMask = chunkSize - 1 + + initialSize = chunkSize + + // maxPage is the highest 64-bit page. + maxPage = math.MaxUint64 &^ (usermem.PageSize - 1) +) + +// NewMemoryFile creates a MemoryFile backed by the given file. If +// NewMemoryFile succeeds, ownership of file is transferred to the returned +// MemoryFile. +func NewMemoryFile(file *os.File) (*MemoryFile, error) { + // Truncate the file to 0 bytes first to ensure that it's empty. + if err := file.Truncate(0); err != nil { + return nil, err + } + if err := file.Truncate(initialSize); err != nil { + return nil, err + } + f := &MemoryFile{ + fileSize: initialSize, + file: file, + // No pages are reclaimable. DecRef will always be able to + // decrease minReclaimablePage from this point. 
+ minReclaimablePage: maxPage, + } + f.reclaimCond.L = &f.mu + f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + go f.runReclaim() // S/R-SAFE: f.mu + + // The Linux kernel contains an optional feature called "Integrity + // Measurement Architecture" (IMA). If IMA is enabled, it will checksum + // binaries the first time they are mapped PROT_EXEC. This is bad news for + // executable pages mapped from our backing file, which can grow to + // terabytes in (sparse) size. If IMA attempts to checksum a file that + // large, it will allocate all of the sparse pages and quickly exhaust all + // memory. + // + // Work around IMA by immediately creating a temporary PROT_EXEC mapping, + // while the backing file is still small. IMA will ignore any future + // mappings. + m, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + usermem.PageSize, + syscall.PROT_EXEC, + syscall.MAP_SHARED, + file.Fd(), + 0) + if errno != 0 { + // This isn't fatal (IMA may not even be in use). Log the error, but + // don't return it. + log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno) + } else { + if _, _, errno := syscall.Syscall( + syscall.SYS_MUNMAP, + m, + usermem.PageSize, + 0); errno != 0 { + panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno)) + } + } + + return f, nil +} + +// Destroy releases all resources used by f. +// +// Preconditions: All pages allocated by f have been freed. +// +// Postconditions: None of f's methods may be called after Destroy. +func (f *MemoryFile) Destroy() { + f.mu.Lock() + defer f.mu.Unlock() + f.destroyed = true + f.reclaimCond.Signal() +} + +// Allocate returns a range of initially-zeroed pages of the given length with +// the given accounting kind and a single reference held by the caller. When +// the last reference on an allocated page is released, ownership of the page +// is returned to the MemoryFile, allowing it to be returned by a future call +// to Allocate. 
+// +// Preconditions: length must be page-aligned and non-zero. +func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { + if length == 0 || length%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid allocation length: %#x", length)) + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Align hugepage-and-larger allocations on hugepage boundaries to try + // to take advantage of hugetmpfs. + alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment) + end := start + length + // File offsets are int64s. Since length must be strictly positive, end + // cannot legitimately be 0. + if end < start || int64(end) <= 0 { + return platform.FileRange{}, syserror.ENOMEM + } + + // Expand the file if needed. Double the file size on each expansion; + // uncommitted pages have effectively no cost. + fileSize := f.fileSize + for int64(end) > fileSize { + if fileSize >= 2*fileSize { + // fileSize overflow. + return platform.FileRange{}, syserror.ENOMEM + } + fileSize *= 2 + } + if fileSize > f.fileSize { + if err := f.file.Truncate(fileSize); err != nil { + return platform.FileRange{}, err + } + f.fileSize = fileSize + f.mappingsMu.Lock() + oldMappings := f.mappings.Load().([]uintptr) + newMappings := make([]uintptr, fileSize>>chunkShift) + copy(newMappings, oldMappings) + f.mappings.Store(newMappings) + f.mappingsMu.Unlock() + } + + // Mark selected pages as in use. + fr := platform.FileRange{start, end} + if !f.usage.Add(fr, usageInfo{ + kind: kind, + refs: 1, + }) { + panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage)) + } + + if minUnallocatedPage < start { + f.minUnallocatedPage = minUnallocatedPage + } else { + // start was the first unallocated page. The next must be + // somewhere beyond end. 
+ f.minUnallocatedPage = end + } + + return fr, nil +} + +// findUnallocatedRange returns the first unallocated page in usage of the +// specified length and alignment beginning at page start and the first single +// unallocated page. +func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) { + // Only searched until the first page is found. + firstPage := start + foundFirstPage := false + alignMask := alignment - 1 + for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() { + r := seg.Range() + + if !foundFirstPage && r.Start > firstPage { + foundFirstPage = true + } + + if start >= r.End { + // start was rounded up to an alignment boundary from the end + // of a previous segment and is now beyond r.End. + continue + } + // This segment represents allocated or reclaimable pages; only the + // range from start to the segment's beginning is allocatable, and the + // next allocatable range begins after the segment. + if r.Start > start && r.Start-start >= length { + break + } + start = (r.End + alignMask) &^ alignMask + if !foundFirstPage { + firstPage = r.End + } + } + return start, firstPage +} + +// AllocateAndFill allocates memory of the given kind and fills it by calling +// r.ReadToBlocks() repeatedly until either length bytes are read or a non-nil +// error is returned. It returns the memory filled by r, truncated down to the +// nearest page. If this is shorter than length bytes due to an error returned +// by r.ReadToBlocks(), it returns that error. +// +// Preconditions: length > 0. length must be page-aligned. 
+func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) { + fr, err := f.Allocate(length, kind) + if err != nil { + return platform.FileRange{}, err + } + dsts, err := f.MapInternal(fr, usermem.Write) + if err != nil { + f.DecRef(fr) + return platform.FileRange{}, err + } + n, err := safemem.ReadFullToBlocks(r, dsts) + un := uint64(usermem.Addr(n).RoundDown()) + if un < length { + // Free unused memory and update fr to contain only the memory that is + // still allocated. + f.DecRef(platform.FileRange{fr.Start + un, fr.End}) + fr.End = fr.Start + un + } + return fr, err +} + +// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. +const ( + _FALLOC_FL_KEEP_SIZE = 1 + _FALLOC_FL_PUNCH_HOLE = 2 +) + +// Decommit releases resources associated with maintaining the contents of the +// given pages. If Decommit succeeds, future accesses of the decommitted pages +// will read zeroes. +// +// Preconditions: fr.Length() > 0. +func (f *MemoryFile) Decommit(fr platform.FileRange) error { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + // "After a successful call, subsequent reads from this range will + // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with + // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) + err := syscall.Fallocate( + int(f.file.Fd()), + _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, + int64(fr.Start), + int64(fr.Length())) + if err != nil { + return err + } + f.markDecommitted(fr) + return nil +} + +func (f *MemoryFile) markDecommitted(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + // Since we're changing the knownCommitted attribute, we need to merge + // across the entire range to ensure that the usage tree is minimal. 
+ gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { + val := seg.ValuePtr() + if val.knownCommitted { + // Drop the usageExpected appropriately. + amount := seg.Range().Length() + usage.MemoryAccounting.Dec(amount, val.kind) + f.usageExpected -= amount + val.knownCommitted = false + } + }) + if gap.Ok() { + panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) + } + f.usage.MergeRange(fr) +} + +// runReclaim implements the reclaimer goroutine, which continuously decommits +// reclaimable pages in order to reduce memory usage and make them available +// for allocation. +func (f *MemoryFile) runReclaim() { + for { + fr, ok := f.findReclaimable() + if !ok { + break + } + + if err := f.Decommit(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the pages manually. This won't reduce memory usage, but at + // least ensures that the pages will be zero when reallocated. + f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) + // Pretend the pages were decommitted even though they weren't, + // since the memory accounting implementation has no idea how to + // deal with this. + f.markDecommitted(fr) + } + f.markReclaimed(fr) + } + // We only get here if findReclaimable finds f.destroyed set and returns + // false. + f.mu.Lock() + defer f.mu.Unlock() + if !f.destroyed { + panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") + } + f.file.Close() + // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd + // that has possibly been reassigned. + f.file = nil + mappings := f.mappings.Load().([]uintptr) + for i, m := range mappings { + if m != 0 { + _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) + if errno != 0 { + log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) + } + } + } + // Similarly, invalidate f.mappings. 
(atomic.Value.Store(nil) panics.) + f.mappings.Store([]uintptr{}) +} + +func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { + f.mu.Lock() + defer f.mu.Unlock() + for { + for { + if f.destroyed { + return platform.FileRange{}, false + } + if f.reclaimable { + break + } + f.reclaimCond.Wait() + } + // Allocate returns the first usable range in offset order and is + // currently a linear scan, so reclaiming from the beginning of the + // file minimizes the expected latency of Allocate. + for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { + if seg.ValuePtr().refs == 0 { + f.minReclaimablePage = seg.End() + return seg.Range(), true + } + } + // No pages are reclaimable. + f.reclaimable = false + f.minReclaimablePage = maxPage + } +} + +func (f *MemoryFile) markReclaimed(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + seg := f.usage.FindSegment(fr.Start) + // All of fr should be mapped to a single uncommitted reclaimable segment + // accounted to System. + if !seg.Ok() { + panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) + } + if !seg.Range().IsSupersetOf(fr) { + panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) + } + if got, want := seg.Value(), (usageInfo{ + kind: usage.System, + knownCommitted: false, + refs: 0, + }); got != want { + panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) + } + // Deallocate reclaimed pages. Even though all of seg is reclaimable, the + // caller of markReclaimed may not have decommitted it, so we can only mark + // fr as reclaimed. + f.usage.Remove(f.usage.Isolate(seg, fr)) + if fr.Start < f.minUnallocatedPage { + // We've deallocated at least one lower page. + f.minUnallocatedPage = fr.Start + } +} + +// IncRef implements platform.File.IncRef. 
+func (f *MemoryFile) IncRef(fr platform.FileRange) { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + f.mu.Lock() + defer f.mu.Unlock() + + gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { + seg.ValuePtr().refs++ + }) + if gap.Ok() { + panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) + } + + f.usage.MergeAdjacent(fr) +} + +// DecRef implements platform.File.DecRef. +func (f *MemoryFile) DecRef(fr platform.FileRange) { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + var freed bool + + f.mu.Lock() + defer f.mu.Unlock() + + for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() { + seg = f.usage.Isolate(seg, fr) + val := seg.ValuePtr() + if val.refs == 0 { + panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage)) + } + val.refs-- + if val.refs == 0 { + freed = true + // Reclassify memory as System, until it's freed by the reclaim + // goroutine. + if val.knownCommitted { + usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind) + } + val.kind = usage.System + } + } + f.usage.MergeAdjacent(fr) + + if freed { + if fr.Start < f.minReclaimablePage { + // We've freed at least one lower page. + f.minReclaimablePage = fr.Start + } + f.reclaimable = true + f.reclaimCond.Signal() + } +} + +// MapInternal implements platform.File.MapInternal. 
+func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + if !fr.WellFormed() || fr.Length() == 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + if at.Execute { + return safemem.BlockSeq{}, syserror.EACCES + } + + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + if chunks == 1 { + // Avoid an unnecessary slice allocation. + var seq safemem.BlockSeq + err := f.forEachMappingSlice(fr, func(bs []byte) { + seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingSlice(fr, func(bs []byte) { + blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) + }) + return safemem.BlockSeqFromSlice(blocks), err +} + +// forEachMappingSlice invokes fn on a sequence of byte slices that +// collectively map all bytes in fr. +func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { + mappings := f.mappings.Load().([]uintptr) + for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + chunk := int(chunkStart >> chunkShift) + m := atomic.LoadUintptr(&mappings[chunk]) + if m == 0 { + var err error + mappings, m, err = f.getChunkMapping(chunk) + if err != nil { + return err + } + } + startOff := uint64(0) + if chunkStart < fr.Start { + startOff = fr.Start - chunkStart + } + endOff := uint64(chunkSize) + if chunkStart+chunkSize > fr.End { + endOff = fr.End - chunkStart + } + fn(unsafeSlice(m, chunkSize)[startOff:endOff]) + } + return nil +} + +func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + // Another thread may have replaced f.mappings altogether due to file + // expansion. + mappings := f.mappings.Load().([]uintptr) + // Another thread may have already mapped the chunk. 
+ if m := mappings[chunk]; m != 0 { + return mappings, m, nil + } + m, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + chunkSize, + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED, + f.file.Fd(), + uintptr(chunk<>chunkShift) + f.mappings.Store(newMappings) + if err := state.Load(r, &f.usage, nil); err != nil { + return err + } + + // Try to map committed chunks concurrently: For any given chunk, either + // this loop or the following one will mmap the chunk first and cache it in + // f.mappings for the other, but this loop is likely to run ahead of the + // other since it doesn't do any work between mmaps. The rest of this + // function doesn't mutate f.usage, so it's safe to iterate concurrently. + mapperDone := make(chan struct{}) + mapperCanceled := int32(0) + go func() { // S/R-SAFE: see comment + defer func() { close(mapperDone) }() + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if atomic.LoadInt32(&mapperCanceled) != 0 { + return + } + if seg.Value().knownCommitted { + f.forEachMappingSlice(seg.Range(), func(s []byte) {}) + } + } + }() + defer func() { + atomic.StoreInt32(&mapperCanceled, 1) + <-mapperDone + }() + + // Load committed pages. + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if !seg.Value().knownCommitted { + continue + } + // Verify header. + length, object, err := state.ReadHeader(r) + if err != nil { + return err + } + if object { + // Not expected. + return fmt.Errorf("unexpected object") + } + if expected := uint64(seg.Range().Length()); length != expected { + // Size mismatch. + return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length) + } + // Read data. + var ioErr error + err = f.forEachMappingSlice(seg.Range(), func(s []byte) { + if ioErr != nil { + return + } + _, ioErr = io.ReadFull(r, s) + }) + if ioErr != nil { + return ioErr + } + if err != nil { + return err + } + + // Update accounting for restored pages. 
We need to do this here since + // these segments are marked as "known committed", and will be skipped + // over on accounting scans. + usage.MemoryAccounting.Inc(seg.End()-seg.Start(), seg.Value().kind) + } + + return nil +} + +// MemoryFileProvider provides the MemoryFile method. +// +// This type exists to work around a save/restore defect. The only object in a +// saved object graph that S/R allows to be replaced at time of restore is the +// starting point of the restore, kernel.Kernel. However, the MemoryFile +// changes between save and restore as well, so objects that need persistent +// access to the MemoryFile must instead store a pointer to the Kernel and call +// Kernel.MemoryFile() as required. In most cases, depending on the kernel +// package directly would create a package dependency loop, so the stored +// pointer must instead be a MemoryProvider interface object. Correspondingly, +// kernel.Kernel is the only implementation of this interface. +type MemoryFileProvider interface { + // MemoryFile returns the Kernel MemoryFile. 
+ MemoryFile() *MemoryFile +} diff --git a/pkg/sentry/platform/filemem/BUILD b/pkg/sentry/platform/filemem/BUILD deleted file mode 100644 index 1a61cfaa5..000000000 --- a/pkg/sentry/platform/filemem/BUILD +++ /dev/null @@ -1,56 +0,0 @@ -package(licenses = ["notice"]) - -load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - -go_template_instance( - name = "usage_set", - out = "usage_set.go", - consts = { - "minDegree": "10", - }, - imports = { - "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", - }, - package = "filemem", - prefix = "usage", - template = "//pkg/segment:generic_set", - types = { - "Key": "uint64", - "Range": "platform.FileRange", - "Value": "usageInfo", - "Functions": "usageSetFunctions", - }, -) - -go_library( - name = "filemem", - srcs = [ - "filemem.go", - "filemem_state.go", - "filemem_unsafe.go", - "usage_set.go", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/log", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/memutil", - "//pkg/sentry/platform", - "//pkg/sentry/safemem", - "//pkg/sentry/usage", - "//pkg/sentry/usermem", - "//pkg/state", - "//pkg/syserror", - ], -) - -go_test( - name = "filemem_test", - size = "small", - srcs = ["filemem_test.go"], - embed = [":filemem"], - deps = ["//pkg/sentry/usermem"], -) diff --git a/pkg/sentry/platform/filemem/filemem.go b/pkg/sentry/platform/filemem/filemem.go deleted file mode 100644 index f41c70ba5..000000000 --- a/pkg/sentry/platform/filemem/filemem.go +++ /dev/null @@ -1,879 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package filemem provides a reusable implementation of platform.Memory. -// -// It enables memory to be sourced from a memfd file. -// -// Lock order: -// -// filemem.FileMem.mu -// filemem.FileMem.mappingsMu -package filemem - -import ( - "fmt" - "math" - "os" - "sync" - "sync/atomic" - "syscall" - "time" - - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" - "gvisor.googlesource.com/gvisor/pkg/sentry/usage" - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" -) - -// FileMem is a platform.Memory that allocates from a host file that it owns. -type FileMem struct { - // Filemem models the backing file as follows: - // - // Each page in the file can be committed or uncommitted. A page is - // committed if the host kernel is spending resources to store its contents - // and uncommitted otherwise. This definition includes pages that the host - // kernel has swapped; this is intentional, to ensure that accounting does - // not change even if host kernel swapping behavior changes, and that - // memory used by pseudo-swap mechanisms like zswap is still accounted. - // - // The initial contents of uncommitted pages are implicitly zero bytes. A - // read or write to the contents of an uncommitted page causes it to be - // committed. This is the only event that can cause a uncommitted page to - // be committed. 
- // - // fallocate(FALLOC_FL_PUNCH_HOLE) (FileMem.Decommit) causes committed - // pages to be uncommitted. This is the only event that can cause a - // committed page to be uncommitted. - // - // Filemem's accounting is based on identifying the set of committed pages. - // Since filemem does not have direct access to the MMU, tracking reads and - // writes to uncommitted pages to detect commitment would introduce - // additional page faults, which would be prohibitively expensive. Instead, - // filemem queries the host kernel to determine which pages are committed. - - // file is the backing memory file. The file pointer is immutable. - file *os.File - - mu sync.Mutex - - // usage maps each page in the file to metadata for that page. Pages for - // which no segment exists in usage are both unallocated (not in use) and - // uncommitted. - // - // Since usage stores usageInfo objects by value, clients should usually - // use usageIterator.ValuePtr() instead of usageIterator.Value() to get a - // pointer to the usageInfo rather than a copy. - // - // usage must be kept maximally merged (that is, there should never be two - // adjacent segments with the same values). At least markReclaimed depends - // on this property. - // - // usage is protected by mu. - usage usageSet - - // The UpdateUsage function scans all segments with knownCommitted set - // to false, sees which pages are committed and creates corresponding - // segments with knownCommitted set to true. - // - // In order to avoid unnecessary scans, usageExpected tracks the total - // file blocks expected. This is used to elide the scan when this - // matches the underlying file blocks. - // - // To track swapped pages, usageSwapped tracks the discrepency between - // what is observed in core and what is reported by the file. When - // usageSwapped is non-zero, a sweep will be performed at least every - // second. The start of the last sweep is recorded in usageLast. 
- // - // All usage attributes are all protected by mu. - usageExpected uint64 - usageSwapped uint64 - usageLast time.Time - - // minUnallocatedPage is the minimum page that may be unallocated. - // i.e., there are no unallocated pages below minUnallocatedPage. - // - // minUnallocatedPage is protected by mu. - minUnallocatedPage uint64 - - // fileSize is the size of the backing memory file in bytes. fileSize is - // always a power-of-two multiple of chunkSize. - // - // fileSize is protected by mu. - fileSize int64 - - // destroyed is set by Destroy to instruct the reclaimer goroutine to - // release resources and exit. destroyed is protected by mu. - destroyed bool - - // reclaimable is true if usage may contain reclaimable pages. reclaimable - // is protected by mu. - reclaimable bool - - // minReclaimablePage is the minimum page that may be reclaimable. - // i.e., all reclaimable pages are >= minReclaimablePage. - // - // minReclaimablePage is protected by mu. - minReclaimablePage uint64 - - // reclaimCond is signaled (with mu locked) when reclaimable or destroyed - // transitions from false to true. - reclaimCond sync.Cond - - // Filemem pages are mapped into the local address space on the granularity - // of large pieces called chunks. mappings is a []uintptr that stores, for - // each chunk, the start address of a mapping of that chunk in the current - // process' address space, or 0 if no such mapping exists. Once a chunk is - // mapped, it is never remapped or unmapped until the filemem is destroyed. - // - // Mutating the mappings slice or its contents requires both holding - // mappingsMu and using atomic memory operations. (The slice is mutated - // whenever the file is expanded. Per the above, the only permitted - // mutation of the slice's contents is the assignment of a mapping to a - // chunk that was previously unmapped.) Reading the slice or its contents - // only requires *either* holding mappingsMu or using atomic memory - // operations. 
This allows FileMem.AccessPhysical to avoid locking in the - // common case where chunk mappings already exist. - - mappingsMu sync.Mutex - mappings atomic.Value -} - -// usage tracks usage information. -// -// +stateify savable -type usageInfo struct { - // kind is the usage kind. - kind usage.MemoryKind - - // knownCommitted indicates whether this region is known to be - // committed. If this is false, then the region may or may not have - // been touched. If it is true however, then mincore (below) has - // indicated that the page is present at least once. - knownCommitted bool - - refs uint64 -} - -const ( - chunkShift = 24 - chunkSize = 1 << chunkShift // 16 MB - chunkMask = chunkSize - 1 - - initialSize = chunkSize - - // maxPage is the highest 64-bit page. - maxPage = math.MaxUint64 &^ (usermem.PageSize - 1) -) - -// newFromFile creates a FileMem backed by the given file. -func newFromFile(file *os.File) (*FileMem, error) { - if err := file.Truncate(initialSize); err != nil { - return nil, err - } - f := &FileMem{ - fileSize: initialSize, - file: file, - // No pages are reclaimable. DecRef will always be able to - // decrease minReclaimablePage from this point. - minReclaimablePage: maxPage, - } - f.reclaimCond.L = &f.mu - f.mappings.Store(make([]uintptr, initialSize/chunkSize)) - go f.runReclaim() // S/R-SAFE: f.mu - - // The Linux kernel contains an optional feature called "Integrity - // Measurement Architecture" (IMA). If IMA is enabled, it will checksum - // binaries the first time they are mapped PROT_EXEC. This is bad news for - // executable pages mapped from FileMem, which can grow to terabytes in - // (sparse) size. If IMA attempts to checksum a file that large, it will - // allocate all of the sparse pages and quickly exhaust all memory. - // - // Work around IMA by immediately creating a temporary PROT_EXEC mapping, - // while FileMem is still small. IMA will ignore any future mappings. 
- m, _, errno := syscall.Syscall6( - syscall.SYS_MMAP, - 0, - usermem.PageSize, - syscall.PROT_EXEC, - syscall.MAP_SHARED, - f.file.Fd(), - 0) - if errno != 0 { - // This isn't fatal to filemem (IMA may not even be in use). Log the - // error, but don't return it. - log.Warningf("Failed to pre-map FileMem PROT_EXEC: %v", errno) - } else { - syscall.Syscall( - syscall.SYS_MUNMAP, - m, - usermem.PageSize, - 0) - } - - return f, nil -} - -// New creates a FileMem backed by a memfd file. -func New(name string) (*FileMem, error) { - fd, err := memutil.CreateMemFD(name, 0) - if err != nil { - if e, ok := err.(syscall.Errno); ok && e == syscall.ENOSYS { - return nil, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") - } - return nil, err - } - return newFromFile(os.NewFile(uintptr(fd), name)) -} - -// Destroy implements platform.Memory.Destroy. -func (f *FileMem) Destroy() { - f.mu.Lock() - defer f.mu.Unlock() - f.destroyed = true - f.reclaimCond.Signal() -} - -// Allocate implements platform.Memory.Allocate. -func (f *FileMem) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { - if length == 0 || length%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid allocation length: %#x", length)) - } - - f.mu.Lock() - defer f.mu.Unlock() - - // Align hugepage-and-larger allocations on hugepage boundaries to try - // to take advantage of hugetmpfs. - alignment := uint64(usermem.PageSize) - if length >= usermem.HugePageSize { - alignment = usermem.HugePageSize - } - - start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment) - end := start + length - // File offsets are int64s. Since length must be strictly positive, end - // cannot legitimately be 0. - if end < start || int64(end) <= 0 { - return platform.FileRange{}, syserror.ENOMEM - } - - // Expand the file if needed. Double the file size on each expansion; - // uncommitted pages have effectively no cost. 
- fileSize := f.fileSize - for int64(end) > fileSize { - if fileSize >= 2*fileSize { - // fileSize overflow. - return platform.FileRange{}, syserror.ENOMEM - } - fileSize *= 2 - } - if fileSize > f.fileSize { - if err := f.file.Truncate(fileSize); err != nil { - return platform.FileRange{}, err - } - f.fileSize = fileSize - f.mappingsMu.Lock() - oldMappings := f.mappings.Load().([]uintptr) - newMappings := make([]uintptr, fileSize>>chunkShift) - copy(newMappings, oldMappings) - f.mappings.Store(newMappings) - f.mappingsMu.Unlock() - } - - // Mark selected pages as in use. - fr := platform.FileRange{start, end} - if !f.usage.Add(fr, usageInfo{ - kind: kind, - refs: 1, - }) { - panic(fmt.Sprintf("allocating %v: failed to insert into f.usage:\n%v", fr, &f.usage)) - } - - if minUnallocatedPage < start { - f.minUnallocatedPage = minUnallocatedPage - } else { - // start was the first unallocated page. The next must be - // somewhere beyond end. - f.minUnallocatedPage = end - } - - return fr, nil -} - -// findUnallocatedRange returns the first unallocated page in usage of the -// specified length and alignment beginning at page start and the first single -// unallocated page. -func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) { - // Only searched until the first page is found. - firstPage := start - foundFirstPage := false - alignMask := alignment - 1 - for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() { - r := seg.Range() - - if !foundFirstPage && r.Start > firstPage { - foundFirstPage = true - } - - if start >= r.End { - // start was rounded up to an alignment boundary from the end - // of a previous segment and is now beyond r.End. - continue - } - // This segment represents allocated or reclaimable pages; only the - // range from start to the segment's beginning is allocatable, and the - // next allocatable range begins after the segment. 
- if r.Start > start && r.Start-start >= length { - break - } - start = (r.End + alignMask) &^ alignMask - if !foundFirstPage { - firstPage = r.End - } - } - return start, firstPage -} - -// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. -const ( - _FALLOC_FL_KEEP_SIZE = 1 - _FALLOC_FL_PUNCH_HOLE = 2 -) - -// Decommit implements platform.Memory.Decommit. -func (f *FileMem) Decommit(fr platform.FileRange) error { - if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - - // "After a successful call, subsequent reads from this range will - // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with - // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) - err := syscall.Fallocate( - int(f.file.Fd()), - _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, - int64(fr.Start), - int64(fr.Length())) - if err != nil { - return err - } - f.markDecommitted(fr) - return nil -} - -func (f *FileMem) markDecommitted(fr platform.FileRange) { - f.mu.Lock() - defer f.mu.Unlock() - // Since we're changing the knownCommitted attribute, we need to merge - // across the entire range to ensure that the usage tree is minimal. - gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { - val := seg.ValuePtr() - if val.knownCommitted { - // Drop the usageExpected appropriately. - amount := seg.Range().Length() - usage.MemoryAccounting.Dec(amount, val.kind) - f.usageExpected -= amount - val.knownCommitted = false - } - }) - if gap.Ok() { - panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) - } - f.usage.MergeRange(fr) -} - -// runReclaim implements the reclaimer goroutine, which continuously decommits -// reclaimable frames in order to reduce memory usage. 
-func (f *FileMem) runReclaim() { - for { - fr, ok := f.findReclaimable() - if !ok { - break - } - - if err := f.Decommit(fr); err != nil { - log.Warningf("Reclaim failed to decommit %v: %v", fr, err) - // Zero the frames manually. This won't reduce memory usage, but at - // least ensures that the frames will be zero when reallocated. - f.forEachMappingSlice(fr, func(bs []byte) { - for i := range bs { - bs[i] = 0 - } - }) - // Pretend the frames were decommitted even though they weren't, - // since the memory accounting implementation has no idea how to - // deal with this. - f.markDecommitted(fr) - } - f.markReclaimed(fr) - } - // We only get here if findReclaimable finds f.destroyed set and returns - // false. - f.mu.Lock() - defer f.mu.Unlock() - if !f.destroyed { - panic("findReclaimable broke out of reclaim loop, but f.destroyed is no longer set") - } - f.file.Close() - // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd - // that has possibly been reassigned. - f.file = nil - mappings := f.mappings.Load().([]uintptr) - for i, m := range mappings { - if m != 0 { - _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) - if errno != 0 { - log.Warningf("Failed to unmap mapping %#x for filemem chunk %d: %v", m, i, errno) - } - } - } - // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) - f.mappings.Store([]uintptr{}) -} - -func (f *FileMem) findReclaimable() (platform.FileRange, bool) { - f.mu.Lock() - defer f.mu.Unlock() - for { - for { - if f.destroyed { - return platform.FileRange{}, false - } - if f.reclaimable { - break - } - f.reclaimCond.Wait() - } - // Allocate returns the first usable range in offset order and is - // currently a linear scan, so reclaiming from the beginning of the - // file minimizes the expected latency of Allocate. 
- for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { - if seg.ValuePtr().refs == 0 { - f.minReclaimablePage = seg.End() - return seg.Range(), true - } - } - f.reclaimable = false - // No pages are reclaimable. - f.minReclaimablePage = maxPage - } -} - -func (f *FileMem) markReclaimed(fr platform.FileRange) { - f.mu.Lock() - defer f.mu.Unlock() - seg := f.usage.FindSegment(fr.Start) - // All of fr should be mapped to a single uncommitted reclaimable segment - // accounted to System. - if !seg.Ok() { - panic(fmt.Sprintf("Reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) - } - if !seg.Range().IsSupersetOf(fr) { - panic(fmt.Sprintf("Reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) - } - if got, want := seg.Value(), (usageInfo{ - kind: usage.System, - knownCommitted: false, - refs: 0, - }); got != want { - panic(fmt.Sprintf("Reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) - } - // Deallocate reclaimed pages. Even though all of seg is reclaimable, the - // caller of markReclaimed may not have decommitted it, so we can only mark - // fr as reclaimed. - f.usage.Remove(f.usage.Isolate(seg, fr)) - if fr.Start < f.minUnallocatedPage { - // We've deallocated at least one lower page. - f.minUnallocatedPage = fr.Start - } -} - -// IncRef implements platform.File.IncRef. 
-func (f *FileMem) IncRef(fr platform.FileRange) { - if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - - f.mu.Lock() - defer f.mu.Unlock() - - gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { - seg.ValuePtr().refs++ - }) - if gap.Ok() { - panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) - } - - f.usage.MergeAdjacent(fr) -} - -// DecRef implements platform.File.DecRef. -func (f *FileMem) DecRef(fr platform.FileRange) { - if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - - var freed bool - - f.mu.Lock() - defer f.mu.Unlock() - - for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() { - seg = f.usage.Isolate(seg, fr) - val := seg.ValuePtr() - if val.refs == 0 { - panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage)) - } - val.refs-- - if val.refs == 0 { - freed = true - // Reclassify memory as System, until it's freed by the reclaim - // goroutine. - if val.knownCommitted { - usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind) - } - val.kind = usage.System - } - } - f.usage.MergeAdjacent(fr) - - if freed { - if fr.Start < f.minReclaimablePage { - // We've freed at least one lower page. - f.minReclaimablePage = fr.Start - } - f.reclaimable = true - f.reclaimCond.Signal() - } -} - -// MapInternal implements platform.File.MapInternal. 
-func (f *FileMem) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { - if !fr.WellFormed() || fr.Length() == 0 { - panic(fmt.Sprintf("invalid range: %v", fr)) - } - if at.Execute { - return safemem.BlockSeq{}, syserror.EACCES - } - - chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) - if chunks == 1 { - // Avoid an unnecessary slice allocation. - var seq safemem.BlockSeq - err := f.forEachMappingSlice(fr, func(bs []byte) { - seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) - }) - return seq, err - } - blocks := make([]safemem.Block, 0, chunks) - err := f.forEachMappingSlice(fr, func(bs []byte) { - blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) - }) - return safemem.BlockSeqFromSlice(blocks), err -} - -// forEachMappingSlice invokes fn on a sequence of byte slices that -// collectively map all bytes in fr. -func (f *FileMem) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { - mappings := f.mappings.Load().([]uintptr) - for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { - chunk := int(chunkStart >> chunkShift) - m := atomic.LoadUintptr(&mappings[chunk]) - if m == 0 { - var err error - mappings, m, err = f.getChunkMapping(chunk) - if err != nil { - return err - } - } - startOff := uint64(0) - if chunkStart < fr.Start { - startOff = fr.Start - chunkStart - } - endOff := uint64(chunkSize) - if chunkStart+chunkSize > fr.End { - endOff = fr.End - chunkStart - } - fn(unsafeSlice(m, chunkSize)[startOff:endOff]) - } - return nil -} - -func (f *FileMem) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { - f.mappingsMu.Lock() - defer f.mappingsMu.Unlock() - // Another thread may have replaced f.mappings altogether due to file - // expansion. - mappings := f.mappings.Load().([]uintptr) - // Another thread may have already mapped the chunk. 
- if m := mappings[chunk]; m != 0 { - return mappings, m, nil - } - m, _, errno := syscall.Syscall6( - syscall.SYS_MMAP, - 0, - chunkSize, - syscall.PROT_READ|syscall.PROT_WRITE, - syscall.MAP_SHARED, - f.file.Fd(), - uintptr(chunk<>chunkShift) - f.mappings.Store(newMappings) - if err := state.Load(r, &f.usage, nil); err != nil { - return err - } - - // Try to map committed chunks concurrently: For any given chunk, either - // this loop or the following one will mmap the chunk first and cache it in - // f.mappings for the other, but this loop is likely to run ahead of the - // other since it doesn't do any work between mmaps. The rest of this - // function doesn't mutate f.usage, so it's safe to iterate concurrently. - mapperDone := make(chan struct{}) - mapperCanceled := int32(0) - go func() { // S/R-SAFE: see comment - defer func() { close(mapperDone) }() - for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - if atomic.LoadInt32(&mapperCanceled) != 0 { - return - } - if seg.Value().knownCommitted { - f.forEachMappingSlice(seg.Range(), func(s []byte) {}) - } - } - }() - defer func() { - atomic.StoreInt32(&mapperCanceled, 1) - <-mapperDone - }() - - // Load committed pages. - for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - if !seg.Value().knownCommitted { - continue - } - // Verify header. - length, object, err := state.ReadHeader(r) - if err != nil { - return err - } - if object { - // Not expected. - return fmt.Errorf("unexpected object") - } - if expected := uint64(seg.Range().Length()); length != expected { - // Size mismatch. - return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length) - } - // Read data. - var ioErr error - err = f.forEachMappingSlice(seg.Range(), func(s []byte) { - if ioErr != nil { - return - } - _, ioErr = io.ReadFull(r, s) - }) - if ioErr != nil { - return ioErr - } - if err != nil { - return err - } - - // Update accounting for restored pages. 
We need to do this here since - // these segments are marked as "known committed", and will be skipped - // over on accounting scans. - usage.MemoryAccounting.Inc(seg.End()-seg.Start(), seg.Value().kind) - } - - return nil -} diff --git a/pkg/sentry/platform/filemem/filemem_test.go b/pkg/sentry/platform/filemem/filemem_test.go deleted file mode 100644 index 9becec25f..000000000 --- a/pkg/sentry/platform/filemem/filemem_test.go +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package filemem - -import ( - "testing" - - "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" -) - -const ( - page = usermem.PageSize - hugepage = usermem.HugePageSize -) - -func TestFindUnallocatedRange(t *testing.T) { - for _, test := range []struct { - desc string - usage *usageSegmentDataSlices - start uint64 - length uint64 - alignment uint64 - unallocated uint64 - minUnallocated uint64 - }{ - { - desc: "Initial allocation succeeds", - usage: &usageSegmentDataSlices{}, - start: 0, - length: page, - alignment: page, - unallocated: 0, - minUnallocated: 0, - }, - { - desc: "Allocation begins at start of file", - usage: &usageSegmentDataSlices{ - Start: []uint64{page}, - End: []uint64{2 * page}, - Values: []usageInfo{{refs: 1}}, - }, - start: 0, - length: page, - alignment: page, - unallocated: 0, - minUnallocated: 0, - }, - { - desc: "In-use frames are not allocatable", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, page}, - End: []uint64{page, 2 * page}, - Values: []usageInfo{{refs: 1}, {refs: 2}}, - }, - start: 0, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, - }, - { - desc: "Reclaimable frames are not allocatable", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, page, 2 * page}, - End: []uint64{page, 2 * page, 3 * page}, - Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}}, - }, - start: 0, - length: page, - alignment: page, - unallocated: 3 * page, - minUnallocated: 3 * page, - }, - { - desc: "Gaps between in-use frames are allocatable", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, 2 * page}, - End: []uint64{page, 3 * page}, - Values: []usageInfo{{refs: 1}, {refs: 1}}, - }, - start: 0, - length: page, - alignment: page, - unallocated: page, - minUnallocated: page, - }, - { - desc: "Inadequately-sized gaps are rejected", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, 2 * page}, - End: []uint64{page, 3 * page}, - Values: []usageInfo{{refs: 1}, {refs: 1}}, - }, - start: 0, - 
length: 2 * page, - alignment: page, - unallocated: 3 * page, - minUnallocated: page, - }, - { - desc: "Hugepage alignment is honored", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, hugepage + page}, - // Hugepage-sized gap here that shouldn't be allocated from - // since it's incorrectly aligned. - End: []uint64{page, hugepage + 2*page}, - Values: []usageInfo{{refs: 1}, {refs: 1}}, - }, - start: 0, - length: hugepage, - alignment: hugepage, - unallocated: 2 * hugepage, - minUnallocated: page, - }, - { - desc: "Pages before start ignored", - usage: &usageSegmentDataSlices{ - Start: []uint64{page, 3 * page}, - End: []uint64{2 * page, 4 * page}, - Values: []usageInfo{{refs: 1}, {refs: 2}}, - }, - start: page, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, - }, - { - desc: "start may be in the middle of segment", - usage: &usageSegmentDataSlices{ - Start: []uint64{0, 3 * page}, - End: []uint64{2 * page, 4 * page}, - Values: []usageInfo{{refs: 1}, {refs: 2}}, - }, - start: page, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, - }, - } { - t.Run(test.desc, func(t *testing.T) { - var usage usageSet - if err := usage.ImportSortedSlices(test.usage); err != nil { - t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err) - } - unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment) - if unallocated != test.unallocated { - t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated) - } - if minUnallocated != test.minUnallocated { - t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated) - } - }) - } -} diff --git a/pkg/sentry/platform/filemem/filemem_unsafe.go b/pkg/sentry/platform/filemem/filemem_unsafe.go deleted 
file mode 100644 index 776aed74d..000000000 --- a/pkg/sentry/platform/filemem/filemem_unsafe.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package filemem - -import ( - "reflect" - "syscall" - "unsafe" -) - -func unsafeSlice(addr uintptr, length int) (slice []byte) { - sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - sh.Data = addr - sh.Len = length - sh.Cap = length - return -} - -func mincore(s []byte, buf []byte) error { - if _, _, errno := syscall.RawSyscall( - syscall.SYS_MINCORE, - uintptr(unsafe.Pointer(&s[0])), - uintptr(len(s)), - uintptr(unsafe.Pointer(&buf[0]))); errno != 0 { - return errno - } - return nil -} diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index b7bf88249..9999e58f4 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -34,7 +34,6 @@ go_library( "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/platform", - "//pkg/sentry/platform/filemem", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/procid", "//pkg/sentry/platform/ring0", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 6d8d8e65b..f2f7ab1e8 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -20,7 +20,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/atomicbitops" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - 
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -76,9 +75,6 @@ type addressSpace struct { // Note that the page tables themselves are not locked. mu sync.Mutex - // filemem is the memory instance. - filemem *filemem.FileMem - // machine is the underlying machine. machine *machine diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index d4f50024d..c5a4435b1 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -23,7 +23,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -33,9 +32,6 @@ import ( type KVM struct { platform.NoCPUPreemptionDetection - // filemem is our memory source. - *filemem.FileMem - // machine is the backing VM. machine *machine } @@ -56,12 +52,6 @@ func OpenDevice() (*os.File, error) { // New returns a new KVM-based implementation of the platform interface. func New(deviceFile *os.File) (*KVM, error) { - // Allocate physical memory for the vCPUs. - fm, err := filemem.New("kvm-memory") - if err != nil { - return nil, err - } - fd := deviceFile.Fd() // Ensure global initialization is done. @@ -90,7 +80,6 @@ func New(deviceFile *os.File) (*KVM, error) { // All set. return &KVM{ - FileMem: fm, machine: machine, }, nil } @@ -140,7 +129,6 @@ func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan stru // Return the new address space. 
return &addressSpace{ - filemem: k.FileMem, machine: k.machine, pageTables: pageTables, dirtySet: k.machine.newDirtySet(), @@ -153,8 +141,3 @@ func (k *KVM) NewContext() platform.Context { machine: k.machine, } } - -// Memory returns the platform memory used to do allocations. -func (k *KVM) Memory() platform.Memory { - return k.FileMem -} diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index fff463a6e..361200622 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -48,7 +48,6 @@ func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) { t.Fatalf("error creating KVM instance: %v", err) } defer k.machine.Destroy() - defer k.FileMem.Destroy() // Call additional setup. if setup != nil { diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index b2ce851da..d1c9458ea 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -19,17 +19,15 @@ package platform import ( "fmt" - "io" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" - "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// Platform provides abstractions for execution contexts (Context) and memory -// management (Memory, AddressSpace). +// Platform provides abstractions for execution contexts (Context, +// AddressSpace). type Platform interface { // SupportsAddressSpaceIO returns true if AddressSpaces returned by this // Platform support AddressSpaceIO methods. @@ -87,9 +85,6 @@ type Platform interface { // NewContext returns a new execution context. NewContext() Context - // Memory returns memory for allocations. - Memory() Memory - // PreemptAllCPUs causes all concurrent calls to Context.Switch(), as well // as the first following call to Context.Switch() for each Context, to // return ErrContextCPUPreempted. 
@@ -352,84 +347,3 @@ type File interface { func (fr FileRange) String() string { return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) } - -// Memory represents an allocatable File that may be mapped into any -// AddressSpace associated with the same Platform. -type Memory interface { - File - - // Allocate returns a range of initially-zeroed pages of the given length - // with the given accounting kind and a single reference held by the - // caller. When the last reference on an allocated page is released, - // ownership of the page is returned to the Memory, allowing it to be - // returned by a future call to Allocate. - // - // Preconditions: length must be page-aligned and non-zero. - Allocate(length uint64, kind usage.MemoryKind) (FileRange, error) - - // Decommit releases resources associated with maintaining the contents of - // the given frames. If Decommit succeeds, future accesses of the - // decommitted frames will read zeroes. - // - // Preconditions: fr.Length() > 0. - Decommit(fr FileRange) error - - // UpdateUsage updates the memory usage statistics. This must be called - // before the relevant memory statistics in usage.MemoryAccounting can - // be considered accurate. - UpdateUsage() error - - // TotalUsage returns an aggregate usage for all memory statistics - // except Mapped (which is external to the Memory implementation). This - // is generally much cheaper than UpdateUsage, but will not provide a - // fine-grained breakdown. - TotalUsage() (uint64, error) - - // TotalSize returns the current maximum size of the Memory in bytes. The - // value returned by TotalSize is permitted to change. - TotalSize() uint64 - - // Destroy releases all resources associated with the Memory. - // - // Preconditions: There are no remaining uses of any of the freed memory's - // frames. - // - // Postconditions: None of the Memory's methods may be called after Destroy. 
- Destroy() - - // SaveTo saves the memory state to the given stream, which will - // generally be a statefile. - SaveTo(w io.Writer) error - - // LoadFrom loads the memory state from the given stream, which will - // generally be a statefile. - LoadFrom(r io.Reader) error -} - -// AllocateAndFill allocates memory of the given kind from mem and fills it by -// calling r.ReadToBlocks() repeatedly until either length bytes are read or a -// non-nil error is returned. It returns the memory filled by r, truncated down -// to the nearest page. If this is shorter than length bytes due to an error -// returned by r.ReadToBlocks(), it returns that error. -// -// Preconditions: length > 0. length must be page-aligned. -func AllocateAndFill(mem Memory, length uint64, kind usage.MemoryKind, r safemem.Reader) (FileRange, error) { - fr, err := mem.Allocate(length, kind) - if err != nil { - return FileRange{}, err - } - dsts, err := mem.MapInternal(fr, usermem.Write) - if err != nil { - mem.DecRef(fr) - return FileRange{}, err - } - n, err := safemem.ReadFullToBlocks(r, dsts) - un := uint64(usermem.Addr(n).RoundDown()) - if un < length { - // Free unused memory and update fr to contain only the memory that is - // still allocated. 
- mem.DecRef(FileRange{fr.Start + un, fr.End}) - fr.End = fr.Start + un - } - return fr, err -} diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index f86790942..e9e4a0d16 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -23,7 +23,6 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", - "//pkg/sentry/platform/filemem", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/procid", "//pkg/sentry/platform/safecopy", diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 8d3f6ac9a..3c0713e95 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -50,7 +50,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -181,7 +180,6 @@ func (c *context) Interrupt() { type PTrace struct { platform.MMapMinAddr platform.NoCPUPreemptionDetection - *filemem.FileMem } // New returns a new ptrace-based implementation of the platform interface. @@ -202,12 +200,7 @@ func New() (*PTrace, error) { globalPool.master = master }) - fm, err := filemem.New("ptrace-memory") - if err != nil { - return nil, err - } - - return &PTrace{FileMem: fm}, nil + return &PTrace{}, nil } // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. @@ -243,8 +236,3 @@ func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan s func (*PTrace) NewContext() platform.Context { return &context{} } - -// Memory returns the platform memory used to do allocations. 
-func (p *PTrace) Memory() platform.Memory { - return p.FileMem -} diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 42c459acc..69385e23c 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -16,7 +16,6 @@ go_library( "//pkg/log", "//pkg/sentry/inet", "//pkg/sentry/kernel", - "//pkg/sentry/platform", "//pkg/sentry/watchdog", "//pkg/state/statefile", ], diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 70b33f190..67db78a56 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" "gvisor.googlesource.com/gvisor/pkg/state/statefile" ) @@ -95,7 +94,7 @@ type LoadOpts struct { } // Load loads the given kernel, setting the provided platform and stack. -func (opts LoadOpts) Load(k *kernel.Kernel, p platform.Platform, n inet.Stack) error { +func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack) error { // Open the file. r, m, err := statefile.NewReader(opts.Source, opts.Key) if err != nil { @@ -105,5 +104,5 @@ func (opts LoadOpts) Load(k *kernel.Kernel, p platform.Platform, n inet.Stack) e previousMetadata = m // Restore the Kernel object graph. 
- return k.LoadFrom(r, p, n) + return k.LoadFrom(r, n) } diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 5eeb3ba58..6f7acf98f 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -25,10 +25,10 @@ import ( func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() - mem := t.Kernel().Platform.Memory() - mem.UpdateUsage() + mf := t.Kernel().MemoryFile() + mf.UpdateUsage() _, totalUsage := usage.MemoryAccounting.Copy() - totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage) + totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) // Only a subset of the fields in sysinfo_t make sense to return. si := linux.Sysinfo{ diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 7e065cb76..5be9ed9c6 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -122,9 +122,6 @@ func Init() error { const name = "memory-usage" fd, err := memutil.CreateMemFD(name, 0) if err != nil { - if e, ok := err.(syscall.Errno); ok && e == syscall.ENOSYS { - return fmt.Errorf("memfd_create(2) is not implemented. 
Check that you have Linux 3.17 or higher") - } return fmt.Errorf("error creating usage file: %v", err) } file := os.NewFile(uintptr(fd), name) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index daa197437..df9907e52 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -51,6 +51,8 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/limits", "//pkg/sentry/loader", + "//pkg/sentry/memutil", + "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", "//pkg/sentry/platform/ptrace", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index a864be720..14e1eba5b 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -332,6 +332,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { k := &kernel.Kernel{ Platform: p, } + mf, err := createMemoryFile() + if err != nil { + return fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) cm.l.k = k // Set up the restore environment. @@ -362,7 +367,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { loadOpts := state.LoadOpts{ Source: o.FilePayload.Files[0], } - if err := loadOpts.Load(k, p, networkStack); err != nil { + if err := loadOpts.Load(k, networkStack); err != nil { return err } @@ -384,7 +389,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.mu.Lock() eid := execID{cid: o.SandboxID} cm.l.processes = map[execID]*execProcess{ - eid: &execProcess{ + eid: { tg: cm.l.k.GlobalInit(), }, } diff --git a/runsc/boot/events.go b/runsc/boot/events.go index f954b8c0b..717adfedd 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -68,7 +68,7 @@ func (cm *containerManager) Event(_ *struct{}, out *Event) error { } func (s *Stats) populateMemory(k *kernel.Kernel) { - mem := k.Platform.Memory() + mem := k.MemoryFile() mem.UpdateUsage() _, totalUsage := usage.MemoryAccounting.Copy() s.Memory.Usage = MemoryEntry{ diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go 
index 9ebe64dce..56cb137f0 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -37,6 +37,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" @@ -189,6 +191,13 @@ func New(args Args) (*Loader, error) { Platform: p, } + // Create memory file. + mf, err := createMemoryFile() + if err != nil { + return nil, fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) + // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. @@ -297,7 +306,7 @@ func New(args Args) (*Loader, error) { stdioFDs: args.StdioFDs, rootProcArgs: procArgs, sandboxID: args.ID, - processes: map[execID]*execProcess{eid: &execProcess{}}, + processes: map[execID]*execProcess{eid: {}}, } // We don't care about child signals; some platforms can generate a @@ -404,6 +413,21 @@ func createPlatform(conf *Config, deviceFD int) (platform.Platform, error) { } } +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "runsc-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} + // Run runs the root container.. 
func (l *Loader) Run() error { err := l.run() -- cgit v1.2.3 From cea1dd7d21b976ad5cb145b94be7b1bf879235be Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Mon, 18 Mar 2019 10:47:59 -0700 Subject: Remove racy access to shm fields. PiperOrigin-RevId: 239016776 Change-Id: Ia7af4258e7c69b16a4630a6f3278aa8e6b627746 --- pkg/sentry/kernel/shm/shm.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 4525aabf4..a7f0758ec 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -286,7 +286,7 @@ func (r *Registry) remove(s *Shm) { defer s.mu.Unlock() if s.key != linux.IPC_PRIVATE { - panic(fmt.Sprintf("Attempted to remove shm segment %+v from the registry whose key is still associated", s)) + panic(fmt.Sprintf("Attempted to remove shm segment %d (key=%d) from the registry whose key is still associated", s.ID, s.key)) } delete(r.shms, s.ID) -- cgit v1.2.3 From 8a499ae65f361fb01c2e4be03122f69910a8ba4a Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 18 Mar 2019 18:39:08 -0700 Subject: Remove references to replaced child in Rename in ramfs/agentfs In the case of a rename replacing an existing destination inode, ramfs Rename failed to first remove the replaced inode. This caused: 1. A leak of a reference to the inode (making it live indefinitely). 2. For directories, a leak of the replaced directory's .. link to the parent. This would cause the parent's link count to incorrectly increase. (2) is much simpler to test than (1), so that's what I've done. agentfs has a similar bug with link count only, so the Dirent layer informs the Inode if this is a replacing rename. 
Fixes #133 PiperOrigin-RevId: 239105698 Change-Id: I4450af2462d8ae3339def812287213d2cbeebde0 --- pkg/sentry/fs/dirent.go | 3 +- pkg/sentry/fs/fsutil/inode.go | 4 +- pkg/sentry/fs/gofer/path.go | 9 +++- pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/fs/inode.go | 9 ++-- pkg/sentry/fs/inode_operations.go | 13 +++-- pkg/sentry/fs/inode_overlay.go | 101 ++++++++++++++++++++++++-------------- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 24 +++++++-- pkg/sentry/fs/tmpfs/inode_file.go | 4 +- pkg/sentry/fs/tmpfs/tmpfs.go | 20 ++++---- test/BUILD | 48 ++++++++++++++++++ test/BUILD.opensource | 48 ------------------ test/syscalls/linux/rename.cc | 25 +++++++++- 14 files changed, 191 insertions(+), 121 deletions(-) create mode 100644 test/BUILD delete mode 100644 test/BUILD.opensource (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index d6a19dc81..15a0129ce 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1563,6 +1563,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } // newName doesn't exist; simply create it below. + replaced = nil } else { // Check constraints on the dirent being replaced. @@ -1620,7 +1621,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string replaced.DecRef() } - if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName); err != nil { + if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil { return err } diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index bd3bd1bb2..c1ad45e52 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -338,7 +338,7 @@ func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) err } // Rename implements fs.FileOperations.Rename. 
-func (InodeNotDirectory) Rename(context.Context, *fs.Inode, string, *fs.Inode, string) error { +func (InodeNotDirectory) Rename(context.Context, *fs.Inode, string, *fs.Inode, string, bool) error { return syserror.EINVAL } @@ -378,7 +378,7 @@ func (InodeNoopTruncate) Truncate(context.Context, *fs.Inode, int64) error { type InodeNotRenameable struct{} // Rename implements fs.InodeOperations.Rename. -func (InodeNotRenameable) Rename(context.Context, *fs.Inode, string, *fs.Inode, string) error { +func (InodeNotRenameable) Rename(context.Context, *fs.Inode, string, *fs.Inode, string, bool) error { return syserror.EINVAL } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 43f990d16..2ba400836 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -298,7 +298,7 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na } // Rename renames this node. -func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { +func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { // Unwrap the new parent to a *inodeOperations. newParentInodeOperations, ok := newParent.InodeOperations.(*inodeOperations) if !ok { @@ -323,7 +323,12 @@ func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldNa oldParentInodeOperations.cachingInodeOps.DecLinks(ctx) } if i.session().cachePolicy.cacheUAttrs(newParent) { - newParentInodeOperations.cachingInodeOps.IncLinks(ctx) + // Only IncLinks if there is a new addition to + // newParent. If this is replacement, then the total + // count remains the same. 
+ if !replacement { + newParentInodeOperations.cachingInodeOps.IncLinks(ctx) + } } } if i.session().cachePolicy.cacheReaddir() { diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 6ff6c3254..2030edcb4 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -296,7 +296,7 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na } // Rename implements fs.InodeOperations.Rename. -func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { +func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { op, ok := oldParent.InodeOperations.(*inodeOperations) if !ok { return syscall.EXDEV diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 08b5c5902..b8b5c1528 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -152,7 +152,8 @@ func (i *Inode) WriteOut(ctx context.Context) error { // Lookup calls i.InodeOperations.Lookup with i as the directory. func (i *Inode) Lookup(ctx context.Context, name string) (*Dirent, error) { if i.overlay != nil { - return overlayLookup(ctx, i.overlay, i, name) + d, _, err := overlayLookup(ctx, i.overlay, i, name) + return d, err } return i.InodeOperations.Lookup(ctx, i, name) } @@ -211,11 +212,11 @@ func (i *Inode) Remove(ctx context.Context, d *Dirent, remove *Dirent) error { } // Rename calls i.InodeOperations.Rename with the given arguments. 
-func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string) error { +func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string, replacement bool) error { if i.overlay != nil { - return overlayRename(ctx, i.overlay, oldParent, renamed, newParent, newName) + return overlayRename(ctx, i.overlay, oldParent, renamed, newParent, newName, replacement) } - return i.InodeOperations.Rename(ctx, oldParent.Inode, renamed.name, newParent.Inode, newName) + return i.InodeOperations.Rename(ctx, oldParent.Inode, renamed.name, newParent.Inode, newName, replacement) } // Bind calls i.InodeOperations.Bind with i as the directory. diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index db40b5256..548f1eb8b 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -133,12 +133,15 @@ type InodeOperations interface { // removed is empty. RemoveDirectory(ctx context.Context, dir *Inode, name string) error - // Rename atomically renames oldName under oldParent to newName - // under newParent where oldParent and newParent are directories. + // Rename atomically renames oldName under oldParent to newName under + // newParent where oldParent and newParent are directories. // - // Implementations are responsible for rejecting renames that - // replace non-empty directories. - Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string) error + // If replacement is true, then newName already exists and this call + // will replace it with oldName. + // + // Implementations are responsible for rejecting renames that replace + // non-empty directories. + Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string, replacement bool) error // Bind binds a new socket under dir at the given name. // Implementations must ensure that name does not already exist. 
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 92a77917a..6e1dfecf9 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -44,7 +44,11 @@ func overlayWriteOut(ctx context.Context, o *overlayEntry) error { return err } -func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name string) (*Dirent, error) { +// overlayLookup performs a lookup in parent. +// +// If name exists, it returns true if the Dirent is in the upper, false if the +// Dirent is in the lower. +func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name string) (*Dirent, bool, error) { // Hot path. Avoid defers. parent.copyMu.RLock() @@ -71,7 +75,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // We encountered an error that an overlay cannot handle, // we must propagate it to the caller. parent.copyMu.RUnlock() - return nil, err + return nil, false, err } if child != nil { if child.IsNegative() { @@ -93,23 +97,23 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // that negative Dirent being cached in // the Dirent tree, so we can return // one from the overlay. - return NewNegativeDirent(name), nil + return NewNegativeDirent(name), false, nil } // Upper fs is not OK with a negative Dirent // being cached in the Dirent tree, so don't // return one. - return nil, syserror.ENOENT + return nil, false, syserror.ENOENT } entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { // Don't leak resources. 
upperInode.DecRef() parent.copyMu.RUnlock() - return nil, err + return nil, false, err } d, err := NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil parent.copyMu.RUnlock() - return d, err + return d, true, err } } @@ -127,7 +131,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name upperInode.DecRef() } parent.copyMu.RUnlock() - return nil, err + return nil, false, err } if child != nil { if !child.IsNegative() { @@ -158,9 +162,9 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // one as well. See comments above regarding negativeUpperChild // for more info. if negativeUpperChild { - return NewNegativeDirent(name), nil + return NewNegativeDirent(name), false, nil } - return nil, syserror.ENOENT + return nil, false, syserror.ENOENT } // Did we find a lower Inode? Remember this because we may decide we don't @@ -195,11 +199,11 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name lowerInode.DecRef() } parent.copyMu.RUnlock() - return nil, err + return nil, false, err } d, err := NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil parent.copyMu.RUnlock() - return d, err + return d, upperInode != nil, err } func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name string, flags FileFlags, perm FilePermissions) (*File, error) { @@ -317,7 +321,7 @@ func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child * return nil } -func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string) error { +func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string, replacement bool) error { // To be able to copy these up below, they have to be part of an // overlay file system. 
// @@ -327,36 +331,57 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return syserror.EXDEV } - // Check here if the file to be replaced exists and is a non-empty - // directory. If we copy up first, we may end up copying the directory - // but none of its children, so the directory will appear empty in the - // upper fs, which will then allow the rename to proceed when it should - // return ENOTEMPTY. - replaced, err := newParent.Inode.Lookup(ctx, newName) - if err != nil && err != syserror.ENOENT { - return err - } - if err == nil { - // NOTE: We must drop the reference on replaced before we make - // the rename call. For that reason we can't use defer. - if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { - children, err := readdirOne(ctx, replaced) - if err != nil { - replaced.DecRef() - return err + if replacement { + // Check here if the file to be replaced exists and is a + // non-empty directory. If we copy up first, we may end up + // copying the directory but none of its children, so the + // directory will appear empty in the upper fs, which will then + // allow the rename to proceed when it should return ENOTEMPTY. + // + // NOTE: Ideally, we'd just pass in the replaced + // Dirent from Rename, but we must drop the reference on + // replaced before we make the rename call, so Rename can't + // pass the Dirent to the Inode without significantly + // complicating the API. Thus we look it up again here. + // + // For the same reason we can't use defer here. + replaced, inUpper, err := overlayLookup(ctx, newParent.Inode.overlay, newParent.Inode, newName) + // If err == ENOENT or a negative Dirent is returned, then + // newName has been removed out from under us. That's fine; + // filesystems where that can happen must handle stale + // 'replaced'. 
+ if err != nil && err != syserror.ENOENT { + return err + } + if err == nil { + if !inUpper { + // newName doesn't exist in + // newParent.Inode.overlay.upper, thus from + // that Inode's perspective this won't be a + // replacing rename. + replacement = false } - // readdirOne ensures that "." and ".." are not - // included among the returned children, so we don't - // need to bother checking for them. - if len(children) > 0 { - replaced.DecRef() - return syserror.ENOTEMPTY + if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { + children, err := readdirOne(ctx, replaced) + if err != nil { + replaced.DecRef() + return err + } + + // readdirOne ensures that "." and ".." are not + // included among the returned children, so we don't + // need to bother checking for them. + if len(children) > 0 { + replaced.DecRef() + return syserror.ENOTEMPTY + } } - } - replaced.DecRef() + replaced.DecRef() + } } + if err := copyUpLockedForRename(ctx, renamed); err != nil { return err } @@ -364,7 +389,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return err } oldName := renamed.name - if err := o.upper.InodeOperations.Rename(ctx, oldParent.Inode.overlay.upper, oldName, newParent.Inode.overlay.upper, newName); err != nil { + if err := o.upper.InodeOperations.Rename(ctx, oldParent.Inode.overlay.upper, oldName, newParent.Inode.overlay.upper, newName, replacement); err != nil { return err } if renamed.Inode.overlay.lowerExists { diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index abfdc6a25..118e30f63 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -132,7 +132,7 @@ func (n *MockInodeOperations) CreateDirectory(context.Context, *Inode, string, F } // Rename implements fs.InodeOperations.Rename. 
-func (n *MockInodeOperations) Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string) error { +func (n *MockInodeOperations) Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string, replacement bool) error { n.renameCalled = true return nil } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 4da876ebd..b60dab243 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -15,6 +15,7 @@ package ramfs import ( + "fmt" "sync" "syscall" @@ -383,8 +384,8 @@ func (d *Dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags } // Rename implements fs.InodeOperations.Rename. -func (*Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { - return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName) +func (*Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName, replacement) } // dirFileOperations implements fs.FileOperations for a ramfs directory. @@ -456,7 +457,7 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { } // Rename renames from a *ramfs.Dir to another *ramfs.Dir. -func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string) error { +func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string, replacement bool) error { op, ok := oldParent.(*Dir) if !ok { return syserror.EXDEV @@ -469,8 +470,14 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n np.mu.Lock() defer np.mu.Unlock() - // Check whether the ramfs entry to be replaced is a non-empty directory. 
- if replaced, ok := np.children[newName]; ok { + // Is this is an overwriting rename? + if replacement { + replaced, ok := np.children[newName] + if !ok { + panic(fmt.Sprintf("Dirent claims rename is replacement, but %q is missing from %+v", newName, np)) + } + + // Non-empty directories cannot be replaced. if fs.IsDir(replaced.StableAttr) { if ok, err := hasChildren(ctx, replaced); err != nil { return err @@ -478,6 +485,13 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n return syserror.ENOTEMPTY } } + + // Remove the replaced child and drop our reference on it. + inode, err := np.removeChildLocked(ctx, newName) + if err != nil { + return err + } + inode.DecRef() } // Be careful, we may have already grabbed this mutex above. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index a98fbf0f1..3c84b2977 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -107,8 +107,8 @@ func (f *fileInodeOperations) Mappable(*fs.Inode) memmap.Mappable { } // Rename implements fs.InodeOperations.Rename. -func (*fileInodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { - return rename(ctx, oldParent, oldName, newParent, newName) +func (*fileInodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + return rename(ctx, oldParent, oldName, newParent, newName, replacement) } // GetFile implements fs.InodeOperations.GetFile. diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 1a9d12c0b..a1672a4d0 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -38,7 +38,7 @@ var fsInfo = fs.Info{ } // rename implements fs.InodeOperations.Rename for tmpfs nodes. 
-func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { +func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { op, ok := oldParent.InodeOperations.(*Dir) if !ok { return syserror.EXDEV @@ -47,7 +47,7 @@ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent if !ok { return syserror.EXDEV } - return ramfs.Rename(ctx, op.ramfsDir, oldName, np.ramfsDir, newName) + return ramfs.Rename(ctx, op.ramfsDir, oldName, np.ramfsDir, newName, replacement) } // Dir is a directory. @@ -238,8 +238,8 @@ func (d *Dir) newCreateOps() *ramfs.CreateOps { } // Rename implements fs.InodeOperations.Rename. -func (d *Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { - return rename(ctx, oldParent, oldName, newParent, newName) +func (d *Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + return rename(ctx, oldParent, oldName, newParent, newName, replacement) } // StatFS implments fs.InodeOperations.StatFS. @@ -266,8 +266,8 @@ func NewSymlink(ctx context.Context, target string, owner fs.FileOwner, msrc *fs } // Rename implements fs.InodeOperations.Rename. -func (s *Symlink) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { - return rename(ctx, oldParent, oldName, newParent, newName) +func (s *Symlink) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + return rename(ctx, oldParent, oldName, newParent, newName, replacement) } // StatFS returns the tmpfs info. @@ -295,8 +295,8 @@ func NewSocket(ctx context.Context, socket transport.BoundEndpoint, owner fs.Fil } // Rename implements fs.InodeOperations.Rename. 
-func (s *Socket) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { - return rename(ctx, oldParent, oldName, newParent, newName) +func (s *Socket) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + return rename(ctx, oldParent, oldName, newParent, newName, replacement) } // StatFS returns the tmpfs info. @@ -332,8 +332,8 @@ func NewFifo(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, } // Rename implements fs.InodeOperations.Rename. -func (f *Fifo) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { - return rename(ctx, oldParent, oldName, newParent, newName) +func (f *Fifo) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + return rename(ctx, oldParent, oldName, newParent, newName, replacement) } // StatFS returns the tmpfs info. diff --git a/test/BUILD b/test/BUILD new file mode 100644 index 000000000..6b83757f6 --- /dev/null +++ b/test/BUILD @@ -0,0 +1,48 @@ +# gVisor is a general-purpose sandbox. + +package(licenses = ["notice"]) + +exports_files(["LICENSE"]) + +# We need to define a bazel platform and toolchain to specify dockerPrivileged +# and dockerRunAsRoot options, they are required to run tests on the RBE +# cluster in Kokoro. 
+alias( + name = "rbe_ubuntu1604", + actual = ":rbe_ubuntu1604_r346485", +) + +platform( + name = "rbe_ubuntu1604_r346485", + constraint_values = [ + "@bazel_tools//platforms:x86_64", + "@bazel_tools//platforms:linux", + "@bazel_tools//tools/cpp:clang", + "@bazel_toolchains//constraints:xenial", + "@bazel_toolchains//constraints/sanitizers:support_msan", + ], + remote_execution_properties = """ + properties: { + name: "container-image" + value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:f3120a030a19d67626ababdac79cc787e699a1aa924081431285118f87e7b375" + } + properties: { + name: "dockerAddCapabilities" + value: "SYS_ADMIN" + } + properties: { + name: "dockerPrivileged" + value: "true" + } + """, +) + +toolchain( + name = "cc-toolchain-clang-x86_64-default", + exec_compatible_with = [ + ], + target_compatible_with = [ + ], + toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/1.1/bazel_0.20.0/default:cc-compiler-k8", + toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", +) diff --git a/test/BUILD.opensource b/test/BUILD.opensource deleted file mode 100644 index 8d2969204..000000000 --- a/test/BUILD.opensource +++ /dev/null @@ -1,48 +0,0 @@ -# gVisor is a general-purpose sandbox. - -licenses(["notice"]) - -exports_files(["LICENSE"]) - -# We need to define a bazel platform and toolchain to specify dockerPrivileged -# and dockerRunAsRoot options, they are required to run tests on the RBE -# cluster in Kokoro. 
-alias( - name = "rbe_ubuntu1604", - actual = ":rbe_ubuntu1604_r346485", -) - -platform( - name = "rbe_ubuntu1604_r346485", - constraint_values = [ - "@bazel_tools//platforms:x86_64", - "@bazel_tools//platforms:linux", - "@bazel_tools//tools/cpp:clang", - "@bazel_toolchains//constraints:xenial", - "@bazel_toolchains//constraints/sanitizers:support_msan", - ], - remote_execution_properties = """ - properties: { - name: "container-image" - value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:f3120a030a19d67626ababdac79cc787e699a1aa924081431285118f87e7b375" - } - properties: { - name: "dockerAddCapabilities" - value: "SYS_ADMIN" - } - properties: { - name: "dockerPrivileged" - value: "true" - } - """, -) - -toolchain( - name = "cc-toolchain-clang-x86_64-default", - exec_compatible_with = [ - ], - target_compatible_with = [ - ], - toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/1.1/bazel_0.20.0/default:cc-compiler-k8", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc index f4c877a00..c0cbc7cd9 100644 --- a/test/syscalls/linux/rename.cc +++ b/test/syscalls/linux/rename.cc @@ -155,10 +155,11 @@ TEST(RenameTest, DirectoryToOwnChildDirectory) { } TEST(RenameTest, FileOverwritesFile) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( - GetAbsoluteTestTmpdir(), "first", TempPath::kDefaultFileMode)); + dir.path(), "first", TempPath::kDefaultFileMode)); auto f2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( - GetAbsoluteTestTmpdir(), "second", TempPath::kDefaultFileMode)); + dir.path(), "second", TempPath::kDefaultFileMode)); ASSERT_THAT(rename(f1.path().c_str(), f2.path().c_str()), SyscallSucceeds()); EXPECT_THAT(Exists(f1.path()), IsPosixErrorOkAndHolds(false)); @@ -168,6 +169,26 @@ TEST(RenameTest, FileOverwritesFile) { EXPECT_EQ("first", f2_contents); } +TEST(RenameTest, 
DirectoryOverwritesDirectoryLinkCount) { + auto parent1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(Links(parent1.path()), IsPosixErrorOkAndHolds(2)); + + auto parent2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(Links(parent2.path()), IsPosixErrorOkAndHolds(2)); + + auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent1.path())); + auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent2.path())); + + EXPECT_THAT(Links(parent1.path()), IsPosixErrorOkAndHolds(3)); + EXPECT_THAT(Links(parent2.path()), IsPosixErrorOkAndHolds(3)); + + ASSERT_THAT(rename(dir1.path().c_str(), dir2.path().c_str()), + SyscallSucceeds()); + + EXPECT_THAT(Links(parent1.path()), IsPosixErrorOkAndHolds(2)); + EXPECT_THAT(Links(parent2.path()), IsPosixErrorOkAndHolds(3)); +} + TEST(RenameTest, FileDoesNotExist) { auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const std::string source = JoinPath(dir.path(), "source"); -- cgit v1.2.3 From 7b33df68450bdb9519cf650a8d92fa4a81f37fa0 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 19 Mar 2019 10:37:46 -0700 Subject: Fix data race in netlink send buffer size PiperOrigin-RevId: 239221041 Change-Id: Icc19e32a00fa89167447ab2f45e90dcfd61bea04 --- pkg/sentry/socket/netlink/socket.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 5b0c11c84..7223773ad 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -291,6 +291,8 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (in if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } + s.mu.Lock() + defer s.mu.Unlock() return int32(s.sendBufferSize), nil case linux.SO_RCVBUF: @@ -335,7 +337,9 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy } else if size > maxSendBufferSize { size = maxSendBufferSize 
} + s.mu.Lock() s.sendBufferSize = size + s.mu.Unlock() return nil case linux.SO_RCVBUF: if len(opt) < sizeOfInt32 { -- cgit v1.2.3 From 87cce0ec08b9d629a5e3a88be411b1721d767301 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 19 Mar 2019 17:32:23 -0700 Subject: netstack: reduce MSS from SYN to account tcp options See: https://tools.ietf.org/html/rfc6691#section-2 PiperOrigin-RevId: 239305632 Change-Id: Ie8eb912a43332e6490045dc95570709c5b81855e --- pkg/sentry/fs/proc/README.md | 2 -- pkg/tcpip/transport/tcp/endpoint.go | 10 ++++++++++ pkg/tcpip/transport/tcp/snd.go | 13 +++++++------ runsc/test/README.md | 2 -- runsc/test/root/crictl_test.go | 3 +++ test/syscalls/linux/exec.cc | 1 + test/syscalls/linux/preadv.cc | 1 + test/syscalls/linux/proc.cc | 2 ++ test/syscalls/linux/sigaltstack.cc | 1 + test/syscalls/linux/time.cc | 1 + test/util/temp_path.cc | 1 + test/util/test_util.cc | 2 ++ test/util/test_util.h | 1 + 13 files changed, 30 insertions(+), 10 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index 686d40f0c..3cc5f197c 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -11,7 +11,6 @@ inconsistency, please file a bug. The following files are implemented: - | File /proc/ | Content | | :------------------------ | :---------------------------------------------------- | | [cpuinfo](#cpuinfo) | Info about the CPU | @@ -23,7 +22,6 @@ The following files are implemented: | [uptime](#uptime) | Wall clock since boot, combined idle time of all cpus | | [version](#version) | Kernel version | - ### cpuinfo ```bash diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 7d18e3612..5656890f6 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1596,6 +1596,16 @@ func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { } } +// maxOptionSize return the maximum size of TCP options. 
+func (e *endpoint) maxOptionSize() (size int) { + var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock + options := e.makeOptions(maxSackBlocks[:]) + size = len(options) + putOptions(options) + + return size +} + // completeState makes a full copy of the endpoint and returns it. This is used // before invoking the probe. The state returned may not be fully consistent if // there are intervening syscalls when the state is being copied. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index e38932df7..18365a673 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -172,6 +172,11 @@ type fastRecovery struct { } func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { + // The sender MUST reduce the TCP data length to account for any IP or + // TCP options that it is including in the packets that it sends. + // See: https://tools.ietf.org/html/rfc6691#section-2 + maxPayloadSize := int(mss) - ep.maxOptionSize() + s := &sender{ ep: ep, sndCwnd: InitialCwnd, @@ -183,7 +188,7 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint rto: 1 * time.Second, rttMeasureSeqNum: iss + 1, lastSendTime: time.Now(), - maxPayloadSize: int(mss), + maxPayloadSize: maxPayloadSize, maxSentAck: irs + 1, fr: fastRecovery{ // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. @@ -226,11 +231,7 @@ func (s *sender) initCongestionControl(congestionControlName CongestionControlOp func (s *sender) updateMaxPayloadSize(mtu, count int) { m := mtu - header.TCPMinimumSize - // Calculate the maximum option size. - var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock - options := s.ep.makeOptions(maxSackBlocks[:]) - m -= len(options) - putOptions(options) + m -= s.ep.maxOptionSize() // We don't adjust up for now. 
if m >= s.maxPayloadSize { diff --git a/runsc/test/README.md b/runsc/test/README.md index 5929cbeb6..f22a8e017 100644 --- a/runsc/test/README.md +++ b/runsc/test/README.md @@ -12,13 +12,11 @@ they may need extra setup in the test machine and extra configuration to run. The following setup steps are required in order to run these tests: - `./runsc/test/install.sh [--runtime ]` The tests expect the runtime name to be provided in the `RUNSC_RUNTIME` environment variable (default: `runsc-test`). To run the tests execute: - ``` bazel test --test_env=RUNSC_RUNTIME=runsc-test \ //runsc/test/image:image_test \ diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go index 556d95fff..37fe53ba3 100644 --- a/runsc/test/root/crictl_test.go +++ b/runsc/test/root/crictl_test.go @@ -36,6 +36,7 @@ import ( // Tests for crictl have to be run as root (rather than in a user namespace) // because crictl creates named network namespaces in /var/run/netns/. + func TestCrictlSanity(t *testing.T) { // Setup containerd and crictl. crictl, cleanup, err := setup(t) @@ -58,6 +59,7 @@ func TestCrictlSanity(t *testing.T) { t.Fatal(err) } } + func TestMountPaths(t *testing.T) { // Setup containerd and crictl. crictl, cleanup, err := setup(t) @@ -80,6 +82,7 @@ func TestMountPaths(t *testing.T) { t.Fatal(err) } } + func TestMountOverSymlinks(t *testing.T) { // Setup containerd and crictl. 
crictl, cleanup, err := setup(t) diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 2d2287c2a..d5a938a98 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -58,6 +58,7 @@ std::string WorkloadPath(absl::string_view binary) { if (test_src) { full_path = JoinPath(test_src, "__main__/test/syscalls/linux", binary); } + TEST_CHECK(full_path.empty() == false); return full_path; } diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc index 8d3aed43c..4a31123d8 100644 --- a/test/syscalls/linux/preadv.cc +++ b/test/syscalls/linux/preadv.cc @@ -37,6 +37,7 @@ namespace gvisor { namespace testing { namespace { + TEST(PreadvTest, MMConcurrencyStress) { // Fill a one-page file with zeroes (the contents don't really matter). const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 6ffe9aed6..0da682e7b 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1258,6 +1258,7 @@ TEST(ProcPidSymlink, SubprocessRunning) { EXPECT_THAT(ReadlinkWhileRunning("ns/user", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); } + // FIXME: Inconsistent behavior between gVisor and linux // on proc files. TEST(ProcPidSymlink, SubprocessZombied) { @@ -1362,6 +1363,7 @@ TEST(ProcPidFile, SubprocessRunning) { // Test whether /proc/PID/ files can be read for a zombie process. 
TEST(ProcPidFile, SubprocessZombie) { char buf[1]; + // 4.17: Succeeds and returns 1 // gVisor: Succeds and returns 0 EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds()); diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc index b1845ac85..5741720f4 100644 --- a/test/syscalls/linux/sigaltstack.cc +++ b/test/syscalls/linux/sigaltstack.cc @@ -101,6 +101,7 @@ TEST(SigaltstackTest, ResetByExecve) { if (test_src) { full_path = JoinPath(test_src, "../../linux/sigaltstack_check"); } + ASSERT_FALSE(full_path.empty()); pid_t child_pid = -1; diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc index 3abcd8098..5a3dfd026 100644 --- a/test/syscalls/linux/time.cc +++ b/test/syscalls/linux/time.cc @@ -61,6 +61,7 @@ TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) { EXPECT_EXIT(vsyscall_time(reinterpret_cast(0x1)), ::testing::KilledBySignal(SIGSEGV), ""); } + int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) { constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000; return reinterpret_cast( diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc index e45909655..11c14fb1a 100644 --- a/test/util/temp_path.cc +++ b/test/util/temp_path.cc @@ -75,6 +75,7 @@ std::string NewTempRelPath() { return NextTempBasename(); } std::string GetAbsoluteTestTmpdir() { char* env_tmpdir = getenv("TEST_TMPDIR"); std::string tmp_dir = env_tmpdir != nullptr ? std::string(env_tmpdir) : "/tmp"; + return MakeAbsolute(tmp_dir, "").ValueOrDie(); } diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 7b40260d1..ebcbca238 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -26,6 +26,7 @@ #include #include + #include "absl/base/attributes.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" @@ -234,6 +235,7 @@ bool Equivalent(uint64_t current, uint64_t target, double tolerance) { auto abs_diff = target > current ? 
target - current : current - target; return abs_diff <= static_cast(tolerance * target); } + void TestInit(int* argc, char*** argv) { ::testing::InitGoogleTest(argc, *argv); ::gflags::ParseCommandLineFlags(argc, argv, true); diff --git a/test/util/test_util.h b/test/util/test_util.h index cd71fdd64..37e40de8e 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -184,6 +184,7 @@ #include // NOLINT: using std::thread::hardware_concurrency(). #include #include + #include #include #include "gmock/gmock.h" -- cgit v1.2.3 From 81f4829d1195276d037f8bd23a2ef69e88f5ae6c Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Wed, 20 Mar 2019 14:30:00 -0700 Subject: Record sockets created during accept(2) for all families. Track new sockets created during accept(2) in the socket table for all families. Previously we were only doing this for unix domain sockets. PiperOrigin-RevId: 239475550 Change-Id: I16f009f24a06245bfd1d72ffd2175200f837c6ac --- pkg/sentry/socket/epsocket/epsocket.go | 2 ++ pkg/sentry/socket/hostinet/socket.go | 14 ++++++++------ pkg/sentry/socket/rpcinet/socket.go | 3 +++ 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 468e65373..e74bd1bdd 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -504,6 +504,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) + t.Kernel().RecordSocket(ns, s.family) + return fd, addr, addrLen, syserr.FromError(e) } diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 2c54e8de2..a0a8a3220 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -53,14 +53,15 @@ type socketOperations struct { fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout - fd int // must 
be O_NONBLOCK - queue waiter.Queue + family int // Read-only. + fd int // must be O_NONBLOCK + queue waiter.Queue } var _ = socket.Socket(&socketOperations{}) -func newSocketFile(ctx context.Context, fd int, nonblock bool) (*fs.File, *syserr.Error) { - s := &socketOperations{fd: fd} +func newSocketFile(ctx context.Context, family int, fd int, nonblock bool) (*fs.File, *syserr.Error) { + s := &socketOperations{family: family, fd: fd} if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) } @@ -218,7 +219,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) } - f, err := newSocketFile(t, fd, flags&syscall.SOCK_NONBLOCK != 0) + f, err := newSocketFile(t, s.family, fd, flags&syscall.SOCK_NONBLOCK != 0) if err != nil { syscall.Close(fd) return 0, nil, 0, err @@ -229,6 +230,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, } kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits()) + t.Kernel().RecordSocket(f, s.family) return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } @@ -552,7 +554,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p if err != nil { return nil, syserr.FromError(err) } - return newSocketFile(t, fd, stypeflags&syscall.SOCK_NONBLOCK != 0) + return newSocketFile(t, p.family, fd, stypeflags&syscall.SOCK_NONBLOCK != 0) } // Pair implements socket.Provider.Pair. diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 13681100e..548a22f32 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -52,6 +52,7 @@ type socketOperations struct { fsutil.FileNoMMap `state:"nosave"` socket.SendReceiveTimeout + family int // Read-only. 
fd uint32 // must be O_NONBLOCK wq *waiter.Queue rpcConn *conn.RPCConnection @@ -83,6 +84,7 @@ func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, pr dirent := socket.NewDirent(ctx, socketDevice) defer dirent.DecRef() return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{ + family: family, wq: &wq, fd: fd, rpcConn: stack.rpcConn, @@ -329,6 +331,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if err != nil { return 0, nil, 0, syserr.FromError(err) } + t.Kernel().RecordSocket(file, s.family) if peerRequested { return fd, payload.Address.Address, payload.Address.Length, nil -- cgit v1.2.3 From 064fda1a759fa3e73d25da3fd535d256ac8ccfb0 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 20 Mar 2019 18:39:57 -0700 Subject: gvisor: don't allocate a new credential object on fork A credential object is immutable, so we don't need to copy it for a new task. PiperOrigin-RevId: 239519266 Change-Id: I0632f641fdea9554779ac25d84bee4231d0d18f2 --- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_identity.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 114e7f858..daf974920 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -252,7 +252,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { TaskContext: tc, FSContext: fsc, FDMap: fds, - Credentials: creds.Fork(), + Credentials: creds, Niceness: t.Niceness(), NetworkNamespaced: t.netns, AllowedCPUMask: t.CPUMask(), diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 8f90ed786..e105eba13 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -372,6 +372,7 @@ func (t *Task) DropBoundingCapability(cp linux.Capability) error { if !t.creds.HasCapability(linux.CAP_SETPCAP) { 
return syserror.EPERM } + t.creds = t.creds.Fork() // See doc for creds. t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp) return nil } -- cgit v1.2.3 From ba828233b9e934992ac024232e5018ce9971f334 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 21 Mar 2019 13:18:00 -0700 Subject: Clear msghdr flags on successful recvmsg. .net sets these flags to -1 and then uses their result, especting it to be zero. Does not set actual flags (e.g. MSG_TRUNC), but setting to zero is more correct than what we did before. PiperOrigin-RevId: 239657951 Change-Id: I89c5f84bc9b94a2cd8ff84e8ecfea09e01142030 --- pkg/sentry/syscalls/linux/sys_socket.go | 21 ++++++++++ test/syscalls/linux/socket_generic.cc | 74 +++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 1513f28e7..564357bac 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -57,6 +57,10 @@ const nameLenOffset = 8 // to the ControlLen field. const controlLenOffset = 40 +// flagsOffset is the offset form the start of the MessageHeader64 struct +// to the Flags field. +const flagsOffset = 48 + // messageHeader64Len is the length of a MessageHeader64 struct. var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) @@ -743,6 +747,16 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) } cms.Unix.Release() + + if msg.Flags != 0 { + // Copy out the flags to the caller. + // + // TODO: Plumb through actual flags. + if _, err := t.CopyOut(msgPtr+flagsOffset, int32(0)); err != nil { + return 0, err + } + } + return uintptr(n), nil } @@ -787,6 +801,13 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } } + // Copy out the flags to the caller. + // + // TODO: Plumb through actual flags. 
+ if _, err := t.CopyOut(msgPtr+flagsOffset, int32(0)); err != nil { + return 0, err + } + return uintptr(n), nil } diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc index 974c0dd7b..c83fb82fe 100644 --- a/test/syscalls/linux/socket_generic.cc +++ b/test/syscalls/linux/socket_generic.cc @@ -183,6 +183,80 @@ TEST_P(AllSocketPairTest, SendmsgRecvmsg16KB) { memcmp(sent_data.data(), received_data.data(), sent_data.size())); } +TEST_P(AllSocketPairTest, RecvmsgMsghdrFlagsNotClearedOnFailure) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char received_data[10] = {}; + + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + struct msghdr msg = {}; + msg.msg_flags = -1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); + + // Check that msghdr flags were not changed. + EXPECT_EQ(msg.msg_flags, -1); +} + +TEST_P(AllSocketPairTest, RecvmsgMsghdrFlagsCleared) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[10]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data)] = {}; + + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + struct msghdr msg = {}; + msg.msg_flags = -1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(sent_data))); + + // Check that msghdr flags were cleared. 
+ EXPECT_EQ(msg.msg_flags, 0); +} + +TEST_P(AllSocketPairTest, RecvmsgPeekMsghdrFlagsCleared) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[10]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data)] = {}; + + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + struct msghdr msg = {}; + msg.msg_flags = -1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_PEEK), + SyscallSucceedsWithValue(sizeof(sent_data))); + EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(sent_data))); + + // Check that msghdr flags were cleared. + EXPECT_EQ(msg.msg_flags, 0); +} + TEST_P(AllSocketPairTest, RecvmmsgInvalidTimeout) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char buf[10]; -- cgit v1.2.3 From 0cd5f2004444b1c792ab3d4bd3b01699b11b9553 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Thu, 21 Mar 2019 18:03:49 -0700 Subject: Replace manual pty copies to/from userspace with safemem operations. Also, changing queue.writeBuf from a buffer.Bytes to a [][]byte should reduce copying and reallocating of slices. 
PiperOrigin-RevId: 239713547 Change-Id: I6ee5ff19c3ee2662f1af5749cae7b73db0569e96 --- pkg/sentry/fs/tty/BUILD | 1 + pkg/sentry/fs/tty/line_discipline.go | 46 ++++++----- pkg/sentry/fs/tty/queue.go | 154 ++++++++++++++++++++++------------- 3 files changed, 123 insertions(+), 78 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index bee2db3f3..908d9de09 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -24,6 +24,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/auth", + "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 484366f85..31b6344f2 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -140,9 +140,9 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc // buffer to its read buffer. Anything already in the read buffer is // now readable. if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { - if n := l.inQueue.pushWaitBuf(l); n > 0 { - l.slaveWaiter.Notify(waiter.EventIn) - } + l.inQueue.pushWaitBuf(l) + l.inQueue.readable = true + l.slaveWaiter.Notify(waiter.EventIn) } return 0, err @@ -263,7 +263,7 @@ type outputQueueTransformer struct{} // transform does output processing for one end of the pty. See // drivers/tty/n_tty.c:do_output_char for an analogous kernel function. // -// Precondition: +// Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be held. func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { @@ -271,11 +271,11 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte // master termios never has ICANON set. 
if !l.termios.OEnabled(linux.OPOST) { - n, _ := q.readBuf.Write(buf) - if q.readBuf.Len() > 0 { + q.readBuf = append(q.readBuf, buf...) + if len(q.readBuf) > 0 { q.readable = true } - return n + return len(buf) } var ret int @@ -289,7 +289,7 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte l.column = 0 } if l.termios.OEnabled(linux.ONLCR) { - q.readBuf.Write([]byte{'\r', '\n'}) + q.readBuf = append(q.readBuf, '\r', '\n') continue } case '\r': @@ -308,7 +308,7 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte spaces := spacesPerTab - l.column%spacesPerTab if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { l.column += spaces - q.readBuf.Write(bytes.Repeat([]byte{' '}, spacesPerTab)) + q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...) continue } l.column += spaces @@ -319,9 +319,12 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte default: l.column++ } - q.readBuf.WriteRune(c) + // The compiler optimizes this by growing readBuf without + // creating the intermediate slice. + q.readBuf = append(q.readBuf, make([]byte, size)...) + utf8.EncodeRune(q.readBuf[len(q.readBuf)-size:], c) } - if q.readBuf.Len() > 0 { + if len(q.readBuf) > 0 { q.readable = true } return ret @@ -338,7 +341,7 @@ type inputQueueTransformer struct{} // drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel // function. // -// Precondition: +// Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be held. 
func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { @@ -354,7 +357,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) } var ret int - for len(buf) > 0 && q.readBuf.Len() < canonMaxBytes { + for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { c, size := l.peekRune(buf) switch c { case '\r': @@ -381,7 +384,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) } // Stop if the buffer would be overfilled. - if q.readBuf.Len()+size > maxBytes { + if len(q.readBuf)+size > maxBytes { break } cBytes := buf[:size] @@ -394,12 +397,15 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) break } - q.readBuf.WriteRune(c) + // The compiler optimizes this by growing readBuf without + // creating the intermediate slice. + q.readBuf = append(q.readBuf, make([]byte, size)...) + utf8.EncodeRune(q.readBuf[len(q.readBuf)-size:], c) + // Anything written to the readBuf will have to be echoed. if l.termios.LEnabled(linux.ECHO) { - if l.outQueue.writeBytes(cBytes, l) > 0 { - l.masterWaiter.Notify(waiter.EventIn) - } + l.outQueue.writeBytes(cBytes, l) + l.masterWaiter.Notify(waiter.EventIn) } // If we finish a line, make it available for reading. @@ -410,7 +416,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) } // In noncanonical mode, everything is readable. - if !l.termios.LEnabled(linux.ICANON) && q.readBuf.Len() > 0 { + if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 { q.readable = true } @@ -425,7 +431,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) // * l.termiosMu must be held for reading. // * q.mu must be held. 
func (l *lineDiscipline) shouldDiscard(q *queue, c rune) bool { - return l.termios.LEnabled(linux.ICANON) && q.readBuf.Len()+utf8.RuneLen(c) >= canonMaxBytes && !l.termios.IsTerminating(c) + return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+utf8.RuneLen(c) >= canonMaxBytes && !l.termios.IsTerminating(c) } // peekRune returns the first rune from the byte array depending on whether diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index a09ca0119..f39f47941 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -15,17 +15,21 @@ package tty import ( - "bytes" "sync" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" ) +// waitBufMaxBytes is the maximum size of a wait buffer. It is based on +// TTYB_DEFAULT_MEM_LIMIT. +const waitBufMaxBytes = 131072 + // queue represents one of the input or output queues between a pty master and // slave. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. Bytes are @@ -40,12 +44,13 @@ type queue struct { // readBuf is buffer of data ready to be read when readable is true. // This data has been processed. - readBuf bytes.Buffer `state:".([]byte)"` + readBuf []byte // waitBuf contains data that can't fit into readBuf. It is put here // until it can be loaded into the read buffer. waitBuf contains data // that hasn't been processed. - waitBuf bytes.Buffer `state:".([]byte)"` + waitBuf [][]byte + waitBufLen uint64 // readable indicates whether the read buffer can be read from. 
In // canonical mode, there can be an unterminated line in the read buffer, @@ -58,31 +63,54 @@ type queue struct { transformer } -// saveReadBuf is invoked by stateify. -func (q *queue) saveReadBuf() []byte { - return append([]byte(nil), q.readBuf.Bytes()...) -} +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (q *queue) ReadToBlocks(dst safemem.BlockSeq) (uint64, error) { + src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.readBuf = q.readBuf[n:] -// loadReadBuf is invoked by stateify. -func (q *queue) loadReadBuf(b []byte) { - q.readBuf.Write(b) -} + // If we read everything, this queue is no longer readable. + if len(q.readBuf) == 0 { + q.readable = false + } -// saveWaitBuf is invoked by stateify. -func (q *queue) saveWaitBuf() []byte { - return append([]byte(nil), q.waitBuf.Bytes()...) + return n, nil } -// loadWaitBuf is invoked by stateify. -func (q *queue) loadWaitBuf(b []byte) { - q.waitBuf.Write(b) +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (q *queue) WriteFromBlocks(src safemem.BlockSeq) (uint64, error) { + copyLen := src.NumBytes() + room := waitBufMaxBytes - q.waitBufLen + // If out of room, return EAGAIN. + if room == 0 && copyLen > 0 { + return 0, syserror.ErrWouldBlock + } + // Cap the size of the wait buffer. + if copyLen > room { + copyLen = room + src = src.TakeFirst64(room) + } + buf := make([]byte, copyLen) + + // Copy the data into the wait buffer. + dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.waitBufAppend(buf) + + return n, nil } // readReadiness returns whether q is ready to be read from. 
func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { q.mu.Lock() defer q.mu.Unlock() - if q.readBuf.Len() > 0 && q.readable { + if len(q.readBuf) > 0 && q.readable { return waiter.EventIn } return waiter.EventMask(0) @@ -90,8 +118,10 @@ func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { // writeReadiness returns whether q is ready to be written to. func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { - // Like Linux, we don't impose a maximum size on what can be enqueued. - return waiter.EventOut + if q.waitBufLen < waitBufMaxBytes { + return waiter.EventOut + } + return waiter.EventMask(0) } // readableSize writes the number of readable bytes to userspace. @@ -100,7 +130,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca defer q.mu.Unlock() var size int32 if q.readable { - size = int32(q.readBuf.Len()) + size = int32(len(q.readBuf)) } _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ @@ -119,29 +149,19 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() + if !q.readable { return 0, false, syserror.ErrWouldBlock } - // Read out from the read buffer. - n := canonMaxBytes - if n > int(dst.NumBytes()) { - n = int(dst.NumBytes()) + if dst.NumBytes() > canonMaxBytes { + dst = dst.TakeFirst(canonMaxBytes) } - if n > q.readBuf.Len() { - n = q.readBuf.Len() - } - n, err := dst.Writer(ctx).Write(q.readBuf.Bytes()[:n]) + + n, err := dst.CopyOutFrom(ctx, q) if err != nil { return 0, false, err } - // Discard bytes read out. - q.readBuf.Next(n) - - // If we read everything, this queue is no longer readable. - if q.readBuf.Len() == 0 { - q.readable = false - } // Move data from the queue's wait buffer to its read buffer. 
nPushed := q.pushWaitBufLocked(l) @@ -154,37 +174,32 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl // Preconditions: // * l.termiosMu must be held for reading. func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { - // TODO: Use CopyInTo/safemem to avoid extra copying. - // Copy in the bytes to write from user-space. - b := make([]byte, src.NumBytes()) - n, err := src.CopyIn(ctx, b) + q.mu.Lock() + defer q.mu.Unlock() + + // Copy data into the wait buffer. + n, err := src.CopyInTo(ctx, q) if err != nil { return 0, err } - b = b[:n] - // If state changed, notify any waiters. If we were unable to write - // anything, let the caller know we could block. - if c := q.writeBytes(b, l); c > 0 { - return c, nil - } - return 0, syserror.ErrWouldBlock + // Push data from the wait to the read buffer. + q.pushWaitBufLocked(l) + + return n, nil } // writeBytes writes to q from b. // // Preconditions: // * l.termiosMu must be held for reading. -func (q *queue) writeBytes(b []byte, l *lineDiscipline) int64 { +func (q *queue) writeBytes(b []byte, l *lineDiscipline) { q.mu.Lock() defer q.mu.Unlock() - // Write as much as possible to the read buffer. - n := q.transform(l, q, b) - - // Write remaining data to the wait buffer. - nWaiting, _ := q.waitBuf.Write(b[n:]) - return int64(n + nWaiting) + // Write to the wait buffer. + q.waitBufAppend(b) + q.pushWaitBufLocked(l) } // pushWaitBuf fills the queue's read buffer with data from the wait buffer. @@ -201,9 +216,32 @@ func (q *queue) pushWaitBuf(l *lineDiscipline) int { // * l.termiosMu must be held for reading. // * q.mu must be locked. func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { - // Remove bytes from the wait buffer and move them to the read buffer. - n := q.transform(l, q, q.waitBuf.Bytes()) - q.waitBuf.Next(n) + if q.waitBufLen == 0 { + return 0 + } + + // Move data from the wait to the read buffer. 
+ var total int + var i int + for i = 0; i < len(q.waitBuf); i++ { + n := q.transform(l, q, q.waitBuf[i]) + total += n + if n != len(q.waitBuf[i]) { + // The read buffer filled up without consuming the + // entire buffer. + q.waitBuf[i] = q.waitBuf[i][n:] + break + } + } + + // Update wait buffer based on consumed data. + q.waitBuf = q.waitBuf[i:] + q.waitBufLen -= uint64(total) + + return total +} - return n +func (q *queue) waitBufAppend(b []byte) { + q.waitBuf = append(q.waitBuf, b) + q.waitBufLen += uint64(len(b)) } -- cgit v1.2.3 From 45ba52f8246a7060da48e250512a734a79187adf Mon Sep 17 00:00:00 2001 From: Yong He Date: Thu, 21 Mar 2019 22:03:34 -0700 Subject: Allow BP and OF can be called from user space Change the DPL from 0 to 3 for Breakpoint and Overflow, then user space could trigger Breakpoint and Overflow as excepected. Change-Id: Ibead65fb8c98b32b7737f316db93b3a8d9dcd648 PiperOrigin-RevId: 239736648 --- pkg/sentry/platform/ring0/kernel_amd64.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 9e8c56a54..5ed4342dd 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -27,9 +27,15 @@ func (k *Kernel) init(opts KernelOpts) { // Setup the IDT, which is uniform. for v, handler := range handlers { + // Allow Breakpoint and Overflow to be called from all + // privilege levels. + dpl := 0 + if v == Breakpoint || v == Overflow { + dpl = 3 + } // Note that we set all traps to use the interrupt stack, this // is defined below when setting up the TSS. 
- k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), 0 /* dpl */, 1 /* ist */) + k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */) } } -- cgit v1.2.3 From 3d0b960112c94379e4974fd9b60d4632548a4389 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 22 Mar 2019 08:54:30 -0700 Subject: Implement PTRACE_SEIZE, PTRACE_INTERRUPT, and PTRACE_LISTEN. PiperOrigin-RevId: 239803092 Change-Id: I42d612ed6a889e011e8474538958c6de90c6fcab --- pkg/sentry/kernel/ptrace.go | 242 +++++++++++++++++++++++++++----------- pkg/sentry/kernel/sessions.go | 2 +- pkg/sentry/kernel/task.go | 42 +++++-- pkg/sentry/kernel/task_exit.go | 26 +--- pkg/sentry/kernel/task_signals.go | 220 ++++++++++++++++++++-------------- pkg/sentry/kernel/thread_group.go | 47 ++++---- test/syscalls/linux/ptrace.cc | 135 +++++++++++++++++++-- 7 files changed, 496 insertions(+), 218 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index fa7a0d141..e8043bf8a 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -193,6 +193,10 @@ type ptraceStop struct { // If frozen is true, the stopped task's tracer is currently operating on // it, so Task.Kill should not remove the stop. frozen bool + + // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so + // ptraceFreeze should fail. + listen bool } // Killable implements TaskStop.Killable. @@ -216,11 +220,11 @@ func (t *Task) beginPtraceStopLocked() bool { // is what prevents tasks from entering ptrace-stops after being killed. // Note that if t was SIGKILLed and beingPtraceStopLocked is being called // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before - // entering the exit path, so t.killable() will no longer return true. This - // is consistent with Linux: "Bugs: ... A SIGKILL signal may still cause a - // PTRACE_EVENT_EXIT stop before actual signal death. 
This may be changed - // in the future; SIGKILL is meant to always immediately kill tasks even - // under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + // entering the exit path, so t.killedLocked() will no longer return true. + // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still + // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be + // changed in the future; SIGKILL is meant to always immediately kill tasks + // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2) if t.killedLocked() { return false } @@ -230,6 +234,10 @@ func (t *Task) beginPtraceStopLocked() bool { // Preconditions: The TaskSet mutex must be locked. func (t *Task) ptraceTrapLocked(code int32) { + // This is unconditional in ptrace_stop(). + t.tg.signalHandlers.mu.Lock() + t.trapStopPending = false + t.tg.signalHandlers.mu.Unlock() t.ptraceCode = code t.ptraceSiginfo = &arch.SignalInfo{ Signo: int32(linux.SIGTRAP), @@ -260,6 +268,9 @@ func (t *Task) ptraceFreeze() bool { if !ok { return false } + if s.listen { + return false + } s.frozen = true return true } @@ -273,6 +284,12 @@ func (t *Task) ptraceUnfreeze() { // preventing its thread group from completing execve. t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() + t.ptraceUnfreezeLocked() +} + +// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be +// locked. +func (t *Task) ptraceUnfreezeLocked() { // Do this even if the task has been killed to ensure a panic if t.stop is // nil or not a ptraceStop. t.stop.(*ptraceStop).frozen = false @@ -336,8 +353,9 @@ func (t *Task) ptraceTraceme() error { return nil } -// ptraceAttach implements ptrace(PTRACE_ATTACH, target). t is the caller. -func (t *Task) ptraceAttach(target *Task) error { +// ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and +// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. 
+func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if t.tg == target.tg { return syserror.EPERM } @@ -355,19 +373,31 @@ func (t *Task) ptraceAttach(target *Task) error { if target.exitState >= TaskExitZombie { return syserror.EPERM } + if seize { + if err := t.ptraceSetOptionsLocked(opts); err != nil { + return syserror.EIO + } + } target.ptraceTracer.Store(t) t.ptraceTracees[target] = struct{}{} + target.ptraceSeized = seize target.tg.signalHandlers.mu.Lock() - target.sendSignalLocked(&arch.SignalInfo{ - Signo: int32(linux.SIGSTOP), - Code: arch.SignalInfoUser, - }, false /* group */) + // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." - + // ptrace(2) + if !seize { + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + } // Undocumented Linux feature: If the tracee is already group-stopped (and // consequently will not report the SIGSTOP just sent), force it to leave // and re-enter the stop so that it will switch to a ptrace-stop. if target.stop == (*groupStop)(nil) { - target.groupStopRequired = true + target.trapStopPending = true target.endInternalStopLocked() + // TODO: Linux blocks ptrace_attach() until the task has + // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. } target.tg.signalHandlers.mu.Unlock() return nil @@ -418,6 +448,7 @@ func (t *Task) exitPtrace() { // // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) forgetTracerLocked() { + t.ptraceSeized = false t.ptraceOpts = ptraceOptions{} t.ptraceSyscallMode = ptraceSyscallNone t.ptraceSinglestep = false @@ -426,21 +457,25 @@ func (t *Task) forgetTracerLocked() { t.exitTracerAcked = true t.exitNotifyLocked(true) } - // If t is ptrace-stopped, but its thread group is in a group stop and t is - // eligible to participate, make it do so. 
This is essentially the reverse - // of the special case in ptraceAttach, which converts a group stop to a - // ptrace stop. ("Handling of restart from group-stop is currently buggy, - // but the "as planned" behavior is to leave tracee stopped and waiting for - // SIGCONT." - ptrace(2)) t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() - if t.stop == nil { - return + // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If + // it wasn't, it will be reset via t.groupStopPending after the following. + t.trapStopPending = false + // If t's thread group is in a group stop and t is eligible to participate, + // make it do so. This is essentially the reverse of the special case in + // ptraceAttach, which converts a group stop to a ptrace stop. ("Handling + // of restart from group-stop is currently buggy, but the "as planned" + // behavior is to leave tracee stopped and waiting for SIGCONT." - + // ptrace(2)) + if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated { + t.groupStopPending = true + // t already participated in the group stop when it unset + // groupStopPending. + t.groupStopAcknowledged = true + t.interrupt() } if _, ok := t.stop.(*ptraceStop); ok { - if t.exitState < TaskExitInitiated && t.tg.groupStopPhase >= groupStopInitiated { - t.groupStopRequired = true - } t.endInternalStopLocked() } } @@ -460,9 +495,9 @@ func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { // The tracer might change this signal into a stop signal, in which case // any SIGCONT received after the signal was originally dequeued should // cancel it. This is consistent with Linux. - if t.tg.groupStopPhase == groupStopNone { - t.tg.groupStopPhase = groupStopDequeued - } + t.tg.groupStopDequeued = true + // This is unconditional in ptrace_stop(). + t.trapStopPending = false // Can't lock the TaskSet mutex while holding a signal mutex. 
t.tg.signalHandlers.mu.Unlock() defer t.tg.signalHandlers.mu.Lock() @@ -612,22 +647,27 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions if tracer != nil { child.ptraceTracer.Store(tracer) tracer.ptraceTracees[child] = struct{}{} + // "The "seized" behavior ... is inherited by children that are + // automatically attached using PTRACE_O_TRACEFORK, + // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2) + child.ptraceSeized = t.ptraceSeized // "Flags are inherited by new tracees created and "auto-attached" // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or - // PTRACE_O_TRACECLONE options." + // PTRACE_O_TRACECLONE options." - ptrace(2) child.ptraceOpts = t.ptraceOpts child.tg.signalHandlers.mu.Lock() - // If the child is PT_SEIZED (currently not possible in the sentry - // because PTRACE_SEIZE is unimplemented, but for future - // reference), Linux just sets JOBCTL_TRAP_STOP instead, so the - // child skips signal-delivery-stop and goes directly to - // group-stop. - // - // The child will self-t.interrupt() when its task goroutine starts + // "PTRACE_SEIZE: ... Automatically attached children stop with + // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead + // of having SIGSTOP signal delivered to them." - ptrace(2) + if child.ptraceSeized { + child.trapStopPending = true + } else { + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }, nil) + } + // The child will self-interrupt() when its task goroutine starts // running, so we don't have to. - child.pendingSignals.enqueue(&arch.SignalInfo{ - Signo: int32(linux.SIGSTOP), - }, nil) child.tg.signalHandlers.mu.Unlock() } } @@ -681,6 +721,9 @@ func (t *Task) ptraceExec(oldTID ThreadID) { // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 // (SI_USER). This signal may be blocked by signal mask, and thus may be // delivered (much) later." 
- ptrace(2) + if t.ptraceSeized { + return + } t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.sendSignalLocked(&arch.SignalInfo{ @@ -749,6 +792,57 @@ func (t *Task) ptraceKill(target *Task) error { return nil } +func (t *Task) ptraceInterrupt(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + if !target.ptraceSeized { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.killedLocked() || target.exitState >= TaskExitInitiated { + return nil + } + target.trapStopPending = true + if s, ok := target.stop.(*ptraceStop); ok && s.listen { + target.endInternalStopLocked() + } + target.interrupt() + return nil +} + +// Preconditions: The TaskSet mutex must be locked for writing. t must have a +// tracer. +func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { + const valid = uintptr(linux.PTRACE_O_EXITKILL | + linux.PTRACE_O_TRACESYSGOOD | + linux.PTRACE_O_TRACECLONE | + linux.PTRACE_O_TRACEEXEC | + linux.PTRACE_O_TRACEEXIT | + linux.PTRACE_O_TRACEFORK | + linux.PTRACE_O_TRACESECCOMP | + linux.PTRACE_O_TRACEVFORK | + linux.PTRACE_O_TRACEVFORKDONE) + if opts&^valid != 0 { + return syserror.EINVAL + } + t.ptraceOpts = ptraceOptions{ + ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, + SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0, + TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0, + TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0, + TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0, + TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil +} + // Ptrace implements the ptrace system call. func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // PTRACE_TRACEME ignores all other arguments. 
@@ -762,16 +856,23 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return syserror.ESRCH } - // PTRACE_ATTACH (and PTRACE_SEIZE, which is unimplemented) do not require - // that target is not already a tracee. - if req == linux.PTRACE_ATTACH { - return t.ptraceAttach(target) + // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already + // a tracee. + if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { + seize := req == linux.PTRACE_SEIZE + if seize && addr != 0 { + return syserror.EIO + } + return t.ptraceAttach(target, seize, uintptr(data)) } - // PTRACE_KILL (and PTRACE_INTERRUPT, which is unimplemented) require that - // the target is a tracee, but does not require that it is ptrace-stopped. + // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee, + // but does not require that it is ptrace-stopped. if req == linux.PTRACE_KILL { return t.ptraceKill(target) } + if req == linux.PTRACE_INTERRUPT { + return t.ptraceInterrupt(target) + } // All other ptrace requests require that the target is a ptrace-stopped // tracee, and freeze the ptrace-stop so the tracee can be operated on. t.tg.pidns.owner.mu.RLock() @@ -801,6 +902,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { t.UninterruptibleSleepFinish(true) // Resuming commands end the ptrace stop, but only if successful. + // PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the + // target. 
switch req { case linux.PTRACE_DETACH: if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { @@ -808,37 +911,65 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return err } return nil + case linux.PTRACE_CONT: if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + case linux.PTRACE_SYSCALL: if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + case linux.PTRACE_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + case linux.PTRACE_SYSEMU: if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + case linux.PTRACE_SYSEMU_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + + case linux.PTRACE_LISTEN: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !target.ptraceSeized { + return syserror.EIO + } + if target.ptraceSiginfo == nil { + return syserror.EIO + } + if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.trapNotifyPending { + target.endInternalStopLocked() + } else { + target.stop.(*ptraceStop).listen = true + target.ptraceUnfreezeLocked() + } + return nil } + // All other ptrace requests expect us to unfreeze the stop. 
defer target.ptraceUnfreeze() @@ -958,30 +1089,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { case linux.PTRACE_SETOPTIONS: t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() - validOpts := uintptr(linux.PTRACE_O_EXITKILL | - linux.PTRACE_O_TRACESYSGOOD | - linux.PTRACE_O_TRACECLONE | - linux.PTRACE_O_TRACEEXEC | - linux.PTRACE_O_TRACEEXIT | - linux.PTRACE_O_TRACEFORK | - linux.PTRACE_O_TRACESECCOMP | - linux.PTRACE_O_TRACEVFORK | - linux.PTRACE_O_TRACEVFORKDONE) - if uintptr(data)&^validOpts != 0 { - return syserror.EINVAL - } - target.ptraceOpts = ptraceOptions{ - ExitKill: data&linux.PTRACE_O_EXITKILL != 0, - SysGood: data&linux.PTRACE_O_TRACESYSGOOD != 0, - TraceClone: data&linux.PTRACE_O_TRACECLONE != 0, - TraceExec: data&linux.PTRACE_O_TRACEEXEC != 0, - TraceExit: data&linux.PTRACE_O_TRACEEXIT != 0, - TraceFork: data&linux.PTRACE_O_TRACEFORK != 0, - TraceSeccomp: data&linux.PTRACE_O_TRACESECCOMP != 0, - TraceVfork: data&linux.PTRACE_O_TRACEVFORK != 0, - TraceVforkDone: data&linux.PTRACE_O_TRACEVFORKDONE != 0, - } - return nil + return target.ptraceSetOptionsLocked(uintptr(data)) case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 6fd65f2b0..ae6daac60 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -204,7 +204,7 @@ func (pg *ProcessGroup) handleOrphan() { return } tg.signalHandlers.mu.Lock() - if tg.groupStopPhase == groupStopComplete { + if tg.groupStopComplete { hasStopped = true } tg.signalHandlers.mu.Unlock() diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e9f133c0b..f958aba26 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -133,28 +133,42 @@ type Task struct { // signalStack is exclusive to the task goroutine. signalStack arch.SignalStack - // If groupStopRequired is true, the task should enter a group stop in the - // interrupt path. 
groupStopRequired is not redundant with - // tg.groupStopPhase != groupStopNone, because ptrace allows tracers to - // resume individual tasks from a group stop without ending the group stop - // as a whole. + // If groupStopPending is true, the task should participate in a group + // stop in the interrupt path. // - // groupStopRequired is analogous to JOBCTL_TRAP_STOP in Linux, except that - // Linux only uses that flag for ptraced tasks. + // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux. // - // groupStopRequired is protected by the signal mutex. - groupStopRequired bool + // groupStopPending is protected by the signal mutex. + groupStopPending bool // If groupStopAcknowledged is true, the task has already acknowledged that // it is entering the most recent group stop that has been initiated on its - // thread group. groupStopAcknowledged is only meaningful if - // tg.groupStopPhase == groupStopInitiated. + // thread group. // // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. // // groupStopAcknowledged is protected by the signal mutex. groupStopAcknowledged bool + // If trapStopPending is true, the task goroutine should enter a + // PTRACE_INTERRUPT-induced stop from the interrupt path. + // + // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that + // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects + // JOBCTL_STOP_PENDING. + // + // trapStopPending is protected by the signal mutex. + trapStopPending bool + + // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group + // stop has begun or ended since the last time the task entered a + // ptrace-stop from the group-stop path. + // + // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux. + // + // trapNotifyPending is protected by the signal mutex. + trapNotifyPending bool + // If stop is not nil, it is the internally-initiated condition that // currently prevents the task goroutine from running. 
// @@ -296,6 +310,12 @@ type Task struct { // ptraceTracees is protected by the TaskSet mutex. ptraceTracees map[*Task]struct{} + // ptraceSeized is true if ptraceTracer attached to this task with + // PTRACE_SEIZE. + // + // ptraceSeized is protected by the TaskSet mutex. + ptraceSeized bool + // ptraceOpts contains ptrace options explicitly set by the tracer. If // ptraceTracer is nil, ptraceOpts is expected to be the zero value. // diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 791cc9831..b9c558ccb 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -122,7 +122,6 @@ func (t *Task) killLocked() { if t.stop != nil && t.stop.Killable() { t.endInternalStopLocked() } - t.groupStopRequired = false t.pendingSignals.enqueue(&arch.SignalInfo{ Signo: int32(linux.SIGKILL), // Linux just sets SIGKILL in the pending signal bitmask without @@ -304,33 +303,16 @@ func (t *Task) exitThreadGroup() bool { t.setSignalMaskLocked(^linux.SignalSet(0)) // Check if this task's exit interacts with an initiated group stop. - if t.tg.groupStopPhase != groupStopInitiated { + if !t.groupStopPending { t.tg.signalHandlers.mu.Unlock() return last } - if t.groupStopAcknowledged { - // Un-acknowledge the group stop. - t.tg.groupStopCount-- - t.groupStopAcknowledged = false - // If the group stop wasn't complete before, then there is still at - // least one other task that hasn't acknowledged the group stop, so - // it is still not complete now. - t.tg.signalHandlers.mu.Unlock() - return last - } - if t.tg.groupStopCount != t.tg.activeTasks { - t.tg.signalHandlers.mu.Unlock() - return last - } - t.Debugf("Completing group stop") - t.tg.groupStopPhase = groupStopComplete - t.tg.groupStopWaitable = true + t.groupStopPending = false sig := t.tg.groupStopSignal - t.tg.groupContNotify = false - t.tg.groupContWaitable = false + notifyParent := t.participateGroupStopLocked() // signalStop must be called with t's signal mutex unlocked. 
t.tg.signalHandlers.mu.Unlock() - if t.tg.leader.parent != nil { + if notifyParent && t.tg.leader.parent != nil { t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) } diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 583acddb1..6a204aa59 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -748,48 +748,21 @@ type groupStop struct{} // Killable implements TaskStop.Killable. func (*groupStop) Killable() bool { return true } -type groupStopPhase int - -const ( - // groupStopNone indicates that a thread group is not in, or attempting to - // enter or leave, a group stop. - groupStopNone groupStopPhase = iota - - // groupStopDequeued indicates that at least one task in a thread group has - // dequeued a stop signal (or dequeued any signal and entered a - // signal-delivery-stop as a result, which allows ptrace to change the - // signal into a stop signal), but temporarily dropped the signal mutex - // without initiating the group stop. - // - // groupStopDequeued is analogous to JOBCTL_STOP_DEQUEUED in Linux. - groupStopDequeued - - // groupStopInitiated indicates that a task in a thread group has initiated - // a group stop, but not all tasks in the thread group have acknowledged - // entering the group stop. - // - // groupStopInitiated is represented by JOBCTL_STOP_PENDING && - // !SIGNAL_STOP_STOPPED in Linux. - groupStopInitiated - - // groupStopComplete indicates that all tasks in a thread group have - // acknowledged entering the group stop, and the last one to do so has - // notified the thread group's parent. - // - // groupStopComplete is represented by JOBCTL_STOP_PENDING && - // SIGNAL_STOP_STOPPED in Linux. - groupStopComplete -) - // initiateGroupStop attempts to initiate a group stop based on a // previously-dequeued stop signal. // // Preconditions: The caller must be running on the task goroutine. 
func (t *Task) initiateGroupStop(info *arch.SignalInfo) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() - if t.tg.groupStopPhase != groupStopDequeued { - t.Debugf("Signal %d: not stopping thread group: lost to racing signal", info.Signo) + if t.groupStopPending { + t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo) + return + } + if !t.tg.groupStopDequeued { + t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo) return } if t.tg.exiting { @@ -800,15 +773,27 @@ func (t *Task) initiateGroupStop(info *arch.SignalInfo) { t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) return } - t.Debugf("Signal %d: stopping thread group", info.Signo) - t.tg.groupStopPhase = groupStopInitiated - t.tg.groupStopSignal = linux.Signal(info.Signo) - t.tg.groupStopCount = 0 + if !t.tg.groupStopComplete { + t.tg.groupStopSignal = linux.Signal(info.Signo) + } + t.tg.groupStopPendingCount = 0 for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { - t2.groupStopRequired = true + if t2.killedLocked() || t2.exitState >= TaskExitInitiated { + t2.groupStopPending = false + continue + } + t2.groupStopPending = true t2.groupStopAcknowledged = false + if t2.ptraceSeized { + t2.trapNotifyPending = true + if s, ok := t2.stop.(*ptraceStop); ok && s.listen { + t2.endInternalStopLocked() + } + } t2.interrupt() + t.tg.groupStopPendingCount++ } + t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount) } // endGroupStopLocked ensures that all prior stop signals received by tg are @@ -820,37 +805,77 @@ func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { // Discard all previously-queued stop signals. 
linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) - if tg.groupStopPhase != groupStopNone { - tg.leader.Debugf("Ending group stop currently in phase %d", tg.groupStopPhase) - if tg.groupStopPhase == groupStopInitiated || tg.groupStopPhase == groupStopComplete { - tg.groupStopSignal = 0 - for t := tg.tasks.Front(); t != nil; t = t.Next() { - if _, ok := t.stop.(*groupStop); ok { - t.endInternalStopLocked() - } + if tg.groupStopPendingCount == 0 && !tg.groupStopComplete { + return + } + + completeStr := "incomplete" + if tg.groupStopComplete { + completeStr = "complete" + } + tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.groupStopPending = false + if t.ptraceSeized { + t.trapNotifyPending = true + if s, ok := t.stop.(*ptraceStop); ok && s.listen { + t.endInternalStopLocked() } - if broadcast { - // Instead of notifying the parent here, set groupContNotify so - // that one of the continuing tasks does so. (Linux does - // something similar.) The reason we do this is to keep locking - // sane. In order to send a signal to the parent, we need to - // lock its signal mutex, but we're already holding tg's signal - // mutex, and the TaskSet mutex must be locked for writing for - // us to hold two signal mutexes. Since we don't want to - // require this for endGroupStopLocked (which is called from - // signal-sending paths), nor do we want to lose atomicity by - // releasing the mutexes we're already holding, just let the - // continuing thread group deal with it. - tg.groupContNotify = true - tg.groupContInterrupted = tg.groupStopPhase == groupStopInitiated - tg.groupContWaitable = true + } else { + if _, ok := t.stop.(*groupStop); ok { + t.endInternalStopLocked() } } - // If groupStopPhase was groupStopDequeued, setting it to groupStopNone - // will cause following calls to initiateGroupStop to recognize that - // the group stop has been cancelled. 
- tg.groupStopPhase = groupStopNone } + if broadcast { + // Instead of notifying the parent here, set groupContNotify so that + // one of the continuing tasks does so. (Linux does something similar.) + // The reason we do this is to keep locking sane. In order to send a + // signal to the parent, we need to lock its signal mutex, but we're + // already holding tg's signal mutex, and the TaskSet mutex must be + // locked for writing for us to hold two signal mutexes. Since we don't + // want to require this for endGroupStopLocked (which is called from + // signal-sending paths), nor do we want to lose atomicity by releasing + // the mutexes we're already holding, just let the continuing thread + // group deal with it. + tg.groupContNotify = true + tg.groupContInterrupted = !tg.groupStopComplete + tg.groupContWaitable = true + } + // Unsetting groupStopDequeued will cause racing calls to initiateGroupStop + // to recognize that the group stop has been cancelled. + tg.groupStopDequeued = false + tg.groupStopSignal = 0 + tg.groupStopPendingCount = 0 + tg.groupStopComplete = false + tg.groupStopWaitable = false +} + +// participateGroupStopLocked is called to handle thread group side effects +// after t unsets t.groupStopPending. The caller must handle task side effects +// (e.g. placing the task goroutine into the group stop). It returns true if +// the caller must notify t.tg.leader's parent of a completed group stop (which +// participateGroupStopLocked cannot do due to holding the wrong locks). +// +// Preconditions: The signal mutex must be locked. 
+func (t *Task) participateGroupStopLocked() bool { + if t.groupStopAcknowledged { + return false + } + t.groupStopAcknowledged = true + t.tg.groupStopPendingCount-- + if t.tg.groupStopPendingCount != 0 { + return false + } + if t.tg.groupStopComplete { + return false + } + t.Debugf("Completing group stop") + t.tg.groupStopComplete = true + t.tg.groupStopWaitable = true + t.tg.groupContNotify = false + t.tg.groupContWaitable = false + return true } // signalStop sends a signal to t's thread group of a new group stop, group @@ -899,7 +924,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { // leader's) tracer are in the same thread group, deduplicate // notifications. notifyParent := t.tg.leader.parent != nil - if tracer := t.tg.leader.ptraceTracer.Load().(*Task); tracer != nil { + if tracer := t.tg.leader.Tracer(); tracer != nil { if notifyParent && tracer.tg == t.tg.leader.parent.tg { notifyParent = false } @@ -938,23 +963,21 @@ func (*runInterrupt) execute(t *Task) taskRunState { return (*runInterrupt)(nil) } - // Do we need to enter a group stop? - if t.groupStopRequired { - t.groupStopRequired = false + // Do we need to enter a group stop or related ptrace stop? This path is + // analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop() + // (with ptrace enabled) and do_jobctl_trap(). 
+ if t.groupStopPending || t.trapStopPending || t.trapNotifyPending { sig := t.tg.groupStopSignal notifyParent := false - if !t.groupStopAcknowledged { - t.groupStopAcknowledged = true - t.tg.groupStopCount++ - if t.tg.groupStopCount == t.tg.activeTasks { - t.Debugf("Completing group stop") - notifyParent = true - t.tg.groupStopPhase = groupStopComplete - t.tg.groupStopWaitable = true - t.tg.groupContNotify = false - t.tg.groupContWaitable = false - } + if t.groupStopPending { + t.groupStopPending = false + // We care about t.tg.groupStopSignal (for tracer notification) + // even if this doesn't complete a group stop, so keep the + // value of sig we've already read. + notifyParent = t.participateGroupStopLocked() } + t.trapStopPending = false + t.trapNotifyPending = false // Drop the signal mutex so we can take the TaskSet mutex. t.tg.signalHandlers.mu.Unlock() @@ -963,8 +986,26 @@ func (*runInterrupt) execute(t *Task) taskRunState { notifyParent = false } if tracer := t.Tracer(); tracer != nil { - t.ptraceCode = int32(sig) - t.ptraceSiginfo = nil + if t.ptraceSeized { + if sig == 0 { + sig = linux.SIGTRAP + } + // "If tracee was attached using PTRACE_SEIZE, group-stop is + // indicated by PTRACE_EVENT_STOP: status>>16 == + // PTRACE_EVENT_STOP. This allows detection of group-stops + // without requiring an extra PTRACE_GETSIGINFO call." 
- + // "Group-stop", ptrace(2) + t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8 + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(sig), + Code: t.ptraceCode, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } else { + t.ptraceCode = int32(sig) + t.ptraceSiginfo = nil + } if t.beginPtraceStopLocked() { tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) // For consistency with Linux, if the parent and tracer are in the @@ -994,12 +1035,11 @@ func (*runInterrupt) execute(t *Task) taskRunState { // Are there signals pending? if info := t.dequeueSignalLocked(t.signalMask); info != nil { - if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 && t.tg.groupStopPhase == groupStopNone { - // Indicate that we've dequeued a stop signal before - // unlocking the signal mutex; initiateGroupStop will check - // that the phase hasn't changed (or is at least another - // "stop signal dequeued" phase) after relocking it. - t.tg.groupStopPhase = groupStopDequeued + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { + // Indicate that we've dequeued a stop signal before unlocking the + // signal mutex; initiateGroupStop will check for races with + // endGroupStopLocked after relocking it. + t.tg.groupStopDequeued = true } if t.ptraceSignalLocked(info) { // Dequeueing the signal action must wait until after the diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index d7652f57c..1b7b74319 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -60,25 +60,35 @@ type ThreadGroup struct { // pendingSignals is protected by the signal mutex. pendingSignals pendingSignals - // groupStopPhase indicates the state of a group stop in progress on the - // thread group, if any. 
+ // If groupStopDequeued is true, a task in the thread group has dequeued a + // stop signal, but has not yet initiated the group stop. // - // groupStopPhase is protected by the signal mutex. - groupStopPhase groupStopPhase + // groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED. + // + // groupStopDequeued is protected by the signal mutex. + groupStopDequeued bool // groupStopSignal is the signal that caused a group stop to be initiated. - // groupStopSignal is only meaningful if groupStopPhase is - // groupStopInitiated or groupStopComplete. // // groupStopSignal is protected by the signal mutex. groupStopSignal linux.Signal - // groupStopCount is the number of non-exited tasks in the thread group - // that have acknowledged an initiated group stop. groupStopCount is only - // meaningful if groupStopPhase is groupStopInitiated. + // groupStopPendingCount is the number of active tasks in the thread group + // for which Task.groupStopPending is set. + // + // groupStopPendingCount is analogous to Linux's + // signal_struct::group_stop_count. // - // groupStopCount is protected by the signal mutex. - groupStopCount int + // groupStopPendingCount is protected by the signal mutex. + groupStopPendingCount int + + // If groupStopComplete is true, groupStopPendingCount transitioned from + // non-zero to zero without an intervening SIGCONT. + // + // groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED. + // + // groupStopComplete is protected by the signal mutex. + groupStopComplete bool // If groupStopWaitable is true, the thread group is indicating a waitable // group stop event (as defined by EventChildGroupStop). @@ -91,14 +101,9 @@ type ThreadGroup struct { // If groupContNotify is true, then a SIGCONT has recently ended a group // stop on this thread group, and the first task to observe it should - // notify its parent. - // - // groupContNotify is protected by the signal mutex. 
- groupContNotify bool - - // If groupContNotify is true, groupContInterrupted is true iff SIGCONT - // ended a group stop in phase groupStopInitiated. If groupContNotify is - // false, groupContInterrupted is meaningless. + // notify its parent. groupContInterrupted is true iff SIGCONT ended an + // incomplete group stop. If groupContNotify is false, groupContInterrupted is + // meaningless. // // Analogues in Linux: // @@ -110,7 +115,9 @@ type ThreadGroup struct { // // - !groupContNotify is represented by neither flag being set. // - // groupContInterrupted is protected by the signal mutex. + // groupContNotify and groupContInterrupted are protected by the signal + // mutex. + groupContNotify bool groupContInterrupted bool // If groupContWaitable is true, the thread group is indicating a waitable diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index 6f1701aef..6d5c425d8 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -44,6 +44,20 @@ namespace testing { namespace { +// PTRACE_GETSIGMASK and PTRACE_SETSIGMASK are not defined until glibc 2.23 +// (fb53a27c5741 "Add new header definitions from Linux 4.4 (plus older ptrace +// definitions)"). +constexpr auto kPtraceGetSigMask = static_cast<__ptrace_request>(0x420a); +constexpr auto kPtraceSetSigMask = static_cast<__ptrace_request>(0x420b); + +// PTRACE_SYSEMU is not defined until glibc 2.27 (c48831d0eebf "linux/x86: sync +// sys/ptrace.h with Linux 4.14 [BZ #22433]"). +constexpr auto kPtraceSysemu = static_cast<__ptrace_request>(31); + +// PTRACE_EVENT_STOP is not defined until glibc 2.26 (3f67d1a7021e "Add Linux +// PTRACE_EVENT_STOP"). +constexpr int kPtraceEventStop = 128; + // Sends sig to the current process with tgkill(2). // // glibc's raise(2) may change the signal mask before sending the signal. 
These @@ -146,10 +160,6 @@ TEST(PtraceTest, AttachParent_PeekData_PokeData_SignalSuppression) { } TEST(PtraceTest, GetSigMask) { - // doesn't define these until Linux 4.4, even though the features - // were added in 3.11. - constexpr auto kPtraceGetSigMask = static_cast(0x420a); - constexpr auto kPtraceSetSigMask = static_cast(0x420b); // glibc and the Linux kernel define a sigset_t with different sizes. To avoid // creating a kernel_sigset_t and recreating all the modification functions // (sigemptyset, etc), we just hardcode the kernel sigset size. @@ -878,9 +888,7 @@ TEST(PtraceTest, Sysemu_PokeUser) { << " status " << status; // Suppress the SIGSTOP and wait for the child to enter syscall-enter-stop - // for its first exit_group syscall. glibc doesn't necessarily define - // PTRACE_SYSEMU. - constexpr auto kPtraceSysemu = static_cast<__ptrace_request>(31); + // for its first exit_group syscall. ASSERT_THAT(ptrace(kPtraceSysemu, child_pid, 0, 0), SyscallSucceeds()); ASSERT_THAT(waitpid(child_pid, &status, 0), SyscallSucceedsWithValue(child_pid)); @@ -999,6 +1007,119 @@ TEST(PtraceTest, ERESTART_NoRandomSave) { } #endif // defined(__x86_64__) +TEST(PtraceTest, Seize_Interrupt_Listen) { + volatile long child_should_spin = 1; + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + while (child_should_spin) { + SleepSafe(absl::Seconds(1)); + } + _exit(1); + } + + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Attach to the child with PTRACE_SEIZE; doing so should not stop the child. + ASSERT_THAT(ptrace(PTRACE_SEIZE, child_pid, 0, 0), SyscallSucceeds()); + int status; + EXPECT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + + // Stop the child with PTRACE_INTERRUPT. 
+ ASSERT_THAT(ptrace(PTRACE_INTERRUPT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGTRAP | (kPtraceEventStop << 8), status >> 8); + + // Unset child_should_spin to verify that the child never leaves the spin + // loop. + ASSERT_THAT(ptrace(PTRACE_POKEDATA, child_pid, &child_should_spin, 0), + SyscallSucceeds()); + + // Send SIGSTOP to the child, then resume it, allowing it to proceed to + // signal-delivery-stop. + ASSERT_THAT(kill(child_pid, SIGSTOP), SyscallSucceeds()); + ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Release the child from signal-delivery-stop without suppressing the + // SIGSTOP, causing it to enter group-stop. + ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, SIGSTOP), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGSTOP | (kPtraceEventStop << 8), status >> 8); + + // "The state of the tracee after PTRACE_LISTEN is somewhat of a gray area: it + // is not in any ptrace-stop (ptrace commands won't work on it, and it will + // deliver waitpid(2) notifications), but it also may be considered 'stopped' + // because it is not executing instructions (is not scheduled), and if it was + // in group-stop before PTRACE_LISTEN, it will not respond to signals until + // SIGCONT is received." - ptrace(2). 
+ ASSERT_THAT(ptrace(PTRACE_LISTEN, child_pid, 0, 0), SyscallSucceeds()); + EXPECT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), + SyscallFailsWithErrno(ESRCH)); + EXPECT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(kill(child_pid, SIGTERM), SyscallSucceeds()); + absl::SleepFor(absl::Seconds(1)); + EXPECT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + + // Send SIGCONT to the child, causing it to leave group-stop and re-trap due + // to PTRACE_LISTEN. + EXPECT_THAT(kill(child_pid, SIGCONT), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGTRAP | (kPtraceEventStop << 8), status >> 8); + + // Detach the child and expect it to exit due to the SIGTERM we sent while + // it was stopped by PTRACE_LISTEN. + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGTERM) + << " status " << status; +} + +TEST(PtraceTest, Interrupt_Listen_RequireSeize) { + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + raise(SIGSTOP); + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // PTRACE_INTERRUPT and PTRACE_LISTEN should fail since the child wasn't + // attached with PTRACE_SEIZE, leaving the child in signal-delivery-stop. 
+ EXPECT_THAT(ptrace(PTRACE_INTERRUPT, child_pid, 0, 0), + SyscallFailsWithErrno(EIO)); + EXPECT_THAT(ptrace(PTRACE_LISTEN, child_pid, 0, 0), + SyscallFailsWithErrno(EIO)); + + // Suppress SIGSTOP and detach from the child, expecting it to exit normally. + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + } // namespace } // namespace testing -- cgit v1.2.3 From b81bfd6013ce871524e493272ac36b134f7fbbdf Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 22 Mar 2019 17:37:10 -0700 Subject: lstat should resolve the final path component if it ends in a slash. PiperOrigin-RevId: 239896221 Change-Id: I0949981fe50c57131c5631cdeb10b225648575c0 --- pkg/sentry/syscalls/linux/sys_stat.go | 6 +++++- test/syscalls/linux/stat.cc | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 95f161aac..8d6a8f616 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -78,7 +78,11 @@ func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } - return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + // If the path ends in a slash (i.e. dirPath is true), then we *do* + // want to resolve the final component. 
+ resolve := dirPath + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { return stat(t, d, dirPath, statAddr) }) } diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 625392375..f96da5706 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -374,6 +374,31 @@ TEST_F(StatTest, ChildOfNonDir) { EXPECT_THAT(lstat(filename.c_str(), &st), SyscallFailsWithErrno(ENOTDIR)); } +// Test lstating a symlink directory. +TEST_F(StatTest, LstatSymlinkDir) { + // Create a directory and symlink to it. + const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string symlink_to_dir = NewTempAbsPath(); + EXPECT_THAT(symlink(dir.path().c_str(), symlink_to_dir.c_str()), + SyscallSucceeds()); + auto cleanup = Cleanup([&symlink_to_dir]() { + EXPECT_THAT(unlink(symlink_to_dir.c_str()), SyscallSucceeds()); + }); + + // Lstat on the symlink should return symlink data. + struct stat st = {}; + ASSERT_THAT(lstat(symlink_to_dir.c_str(), &st), SyscallSucceeds()); + EXPECT_FALSE(S_ISDIR(st.st_mode)); + EXPECT_TRUE(S_ISLNK(st.st_mode)); + + // Lstat on the symlink with a trailing slash should return the directory + // data. + ASSERT_THAT(lstat(absl::StrCat(symlink_to_dir, "/").c_str(), &st), + SyscallSucceeds()); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + EXPECT_FALSE(S_ISLNK(st.st_mode)); +} + // Verify that we get an ELOOP from too many symbolic links even when there // are directories in the middle. TEST_F(StatTest, LstatELOOPPath) { -- cgit v1.2.3 From ddc05e3053e387be9c81aa98c621b6fc92b01000 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 25 Mar 2019 11:40:49 -0700 Subject: epoll: use ilist:generic_list instead of ilist:ilist ilist:generic_list works faster than ilist:ilist. Here is a beanchmark test to measure performance of epoll_wait, when readyList isn't empty. It shows about 30% better performance with these changes. 
Benchmark Time(ns) CPU(ns) Iterations Before: BM_EpollAllEvents 46725 46899 14286 After: BM_EpollAllEvents 33167 33300 18919 PiperOrigin-RevId: 240185278 Change-Id: I3e33f9b214db13ab840b91613400525de5b58d18 --- pkg/sentry/kernel/epoll/BUILD | 15 ++++++++++++++- pkg/sentry/kernel/epoll/epoll.go | 17 ++++++++--------- pkg/sentry/kernel/epoll/epoll_state.go | 14 ++++++-------- 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 1567d5050..3ac59e13e 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,17 +1,30 @@ package(licenses = ["notice"]) +load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +go_template_instance( + name = "epoll_list", + out = "epoll_list.go", + package = "epoll", + prefix = "pollEntry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*pollEntry", + "Linker": "*pollEntry", + }, +) + go_library( name = "epoll", srcs = [ "epoll.go", + "epoll_list.go", "epoll_state.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll", visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/ilist", "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/fs", diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 502395f18..61c0fb7c5 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -21,7 +21,6 @@ import ( "sync" "syscall" - "gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -70,7 +69,7 @@ type FileIdentifier struct { // // +stateify savable type pollEntry struct { - ilist.Entry + pollEntryEntry file *refs.WeakRef `state:"manual"` id FileIdentifier `state:"wait"` userData [2]int32 @@ -84,7 +83,7 @@ type pollEntry struct 
{ // struct, while state framework currently does not support such // in-struct pointers. Instead, EventPoll will properly set this field // in its loading logic. - curList *ilist.List `state:"nosave"` + curList *pollEntryList `state:"nosave"` } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. @@ -133,9 +132,9 @@ type EventPoll struct { // disabledList -- when the entry is disabled. This happens when // a one-shot entry gets delivered via readEvents(). listsMu sync.Mutex `state:"nosave"` - readyList ilist.List - waitingList ilist.List - disabledList ilist.List + readyList pollEntryList + waitingList pollEntryList + disabledList pollEntryList } // cycleMu is used to serialize all the cycle checks. This is only used when @@ -189,7 +188,7 @@ func (e *EventPoll) eventsAvailable() bool { e.listsMu.Lock() for it := e.readyList.Front(); it != nil; { - entry := it.(*pollEntry) + entry := it it = it.Next() // If the entry is ready, we know 'e' has at least one entry @@ -225,14 +224,14 @@ func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { // ReadEvents returns up to max available events. func (e *EventPoll) ReadEvents(max int) []Event { - var local ilist.List + var local pollEntryList var ret []Event e.listsMu.Lock() // Go through all entries we believe may be ready. for it := e.readyList.Front(); it != nil && len(ret) < max; { - entry := it.(*pollEntry) + entry := it it = it.Next() // Check the entry's readiness. 
It it's not really ready, we diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index 7f3e2004a..f6e3e4825 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -15,7 +15,6 @@ package epoll import ( - "gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -33,18 +32,17 @@ func (e *EventPoll) afterLoad() { e.listsMu.Lock() defer e.listsMu.Unlock() - for _, ls := range []*ilist.List{&e.waitingList, &e.readyList, &e.disabledList} { + for _, ls := range []*pollEntryList{&e.waitingList, &e.readyList, &e.disabledList} { for it := ls.Front(); it != nil; it = it.Next() { - it.(*pollEntry).curList = ls + it.curList = ls } } for it := e.waitingList.Front(); it != nil; it = it.Next() { - p := it.(*pollEntry) - if p.id.File.Readiness(p.mask) != 0 { - e.waitingList.Remove(p) - e.readyList.PushBack(p) - p.curList = &e.readyList + if it.id.File.Readiness(it.mask) != 0 { + e.waitingList.Remove(it) + e.readyList.PushBack(it) + it.curList = &e.readyList e.Notify(waiter.EventIn) } } -- cgit v1.2.3 From f3723f805989d0c5956fbb6aa88b1cd9ac20753c Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 25 Mar 2019 12:41:36 -0700 Subject: Call memmap.Mappable.Translate with more conservative usermem.AccessType. MM.insertPMAsLocked() passes vma.maxPerms to memmap.Mappable.Translate (although it unsets AccessType.Write if the vma is private). This somewhat simplifies handling of pmas, since it means only COW-break needs to replace existing pmas. However, it also means that a MAP_SHARED mapping of a file opened O_RDWR dirties the file, regardless of the mapping's permissions and whether or not the mapping is ever actually written to with I/O that ignores permissions (e.g. ptrace(PTRACE_POKEDATA)). To fix this: - Change the pma-getting path to request only the permissions that are required for the calling access. 
- Change memmap.Mappable.Translate to take requested permissions, and return allowed permissions. This preserves the existing behavior in the common cases where the memmap.Mappable isn't fsutil.CachingInodeOperations and doesn't care if the translated platform.File pages are written to. - Change the MM.getPMAsLocked path to support permission upgrading of pmas outside of copy-on-write. PiperOrigin-RevId: 240196979 Change-Id: Ie0147c62c1fbc409467a6fa16269a413f3d7d571 --- pkg/sentry/fs/binder/binder.go | 1 + pkg/sentry/fs/fsutil/host_mappable.go | 1 + pkg/sentry/fs/fsutil/inode_cached.go | 19 +- pkg/sentry/fs/tmpfs/inode_file.go | 1 + pkg/sentry/kernel/shm/shm.go | 1 + pkg/sentry/memmap/memmap.go | 17 +- pkg/sentry/mm/address_space.go | 2 +- pkg/sentry/mm/aio_context.go | 1 + pkg/sentry/mm/debug.go | 6 +- pkg/sentry/mm/io.go | 20 +- pkg/sentry/mm/lifecycle.go | 4 +- pkg/sentry/mm/mm.go | 27 +- pkg/sentry/mm/pma.go | 664 +++++++++++++++++----------------- pkg/sentry/mm/special_mappable.go | 1 + pkg/sentry/mm/syscalls.go | 22 +- pkg/sentry/usermem/access_type.go | 9 + 16 files changed, 416 insertions(+), 380 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 16fb4806f..188353961 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -244,6 +244,7 @@ func (bp *Proc) Translate(ctx context.Context, required, optional memmap.Mappabl Source: memmap.MappableRange{0, usermem.PageSize}, File: bp.mfp.MemoryFile(), Offset: bp.mapped.Start, + Perms: usermem.AnyAccess, }, }, err } diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 1bb5c6b6e..4a182baa1 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -94,6 +94,7 @@ func (h *HostMappable) Translate(ctx context.Context, required, optional memmap. 
Source: optional, File: h, Offset: optional.Start, + Perms: usermem.AnyAccess, }, }, nil } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 9bd923678..6ca51ab0d 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -739,6 +739,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option Source: optional, File: c, Offset: optional.Start, + Perms: usermem.AnyAccess, }, }, nil } @@ -768,16 +769,24 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option var translatedEnd uint64 for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) - ts = append(ts, memmap.Translation{ - Source: segMR, - File: mf, - Offset: seg.FileRangeOf(segMR).Start, - }) + // TODO: Make Translations writable even if writability is + // not required if already kept-dirty by another writable translation. + perms := usermem.AccessType{ + Read: true, + Execute: true, + } if at.Write { // From this point forward, this memory can be dirtied through the // mapping at any time. 
c.dirty.KeepDirty(segMR) + perms.Write = true } + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mf, + Offset: seg.FileRangeOf(segMR).Start, + Perms: perms, + }) translatedEnd = segMR.End } diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 3c84b2977..25bf2b9dd 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -481,6 +481,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional Source: segMR, File: mf, Offset: seg.FileRangeOf(segMR).Start, + Perms: usermem.AnyAccess, }) translatedEnd = segMR.End } diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index a7f0758ec..2b291e4f8 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -455,6 +455,7 @@ func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableR Source: source, File: s.mfp.MemoryFile(), Offset: s.fr.Start + source.Start, + Perms: usermem.AnyAccess, }, }, err } diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 70cdf428b..1ef1f0dd8 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -105,13 +105,22 @@ type Translation struct { // Offset is the offset into File at which this Translation begins. Offset uint64 + + // Perms is the set of permissions for which platform.AddressSpace.MapFile + // and platform.AddressSpace.MapInternal on this Translation is permitted. + Perms usermem.AccessType +} + +// FileRange returns the platform.FileRange represented by t. +func (t Translation) FileRange() platform.FileRange { + return platform.FileRange{t.Offset, t.Offset + t.Source.Length()} } // CheckTranslateResult returns an error if (ts, terr) does not satisfy all -// postconditions for Mappable.Translate(required, optional). +// postconditions for Mappable.Translate(required, optional, at). // // Preconditions: As for Mappable.Translate. 
-func CheckTranslateResult(required, optional MappableRange, ts []Translation, terr error) error { +func CheckTranslateResult(required, optional MappableRange, at usermem.AccessType, ts []Translation, terr error) error { // Verify that the inputs to Mappable.Translate were valid. if !required.WellFormed() || required.Length() <= 0 { panic(fmt.Sprintf("invalid required range: %v", required)) @@ -156,6 +165,10 @@ func CheckTranslateResult(required, optional MappableRange, ts []Translation, te if !optional.IsSupersetOf(t.Source) { return fmt.Errorf("Translation %+v lies outside optional range %v", t, optional) } + // Each Translation must permit a superset of requested accesses. + if !t.Perms.SupersetOf(at) { + return fmt.Errorf("Translation %+v does not permit all requested access types %v", t, at) + } } // If the set of Translations does not cover the entire required range, // Translate must return a non-nil error explaining why. diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 90cfef746..4dddcf7b5 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -179,7 +179,7 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) - perms := pma.vmaEffectivePerms + perms := pma.effectivePerms if pma.needCOW { perms.Write = false } diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 6cec6387a..f7ff06de0 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -302,6 +302,7 @@ func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.M Source: source, File: m.mfp.MemoryFile(), Offset: m.fr.Start + source.Start, + Perms: usermem.AnyAccess, }, }, err } diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go index d341b9c07..d075ee1ca 100644 --- a/pkg/sentry/mm/debug.go +++ b/pkg/sentry/mm/debug.go @@ -68,12 +68,12 @@ func (pseg 
pmaIterator) debugStringEntryLocked() []byte { fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End()) pma := pseg.ValuePtr() - if pma.vmaEffectivePerms.Read { + if pma.effectivePerms.Read { b.WriteByte('r') } else { b.WriteByte('-') } - if pma.vmaEffectivePerms.Write { + if pma.effectivePerms.Write { if pma.needCOW { b.WriteByte('c') } else { @@ -82,7 +82,7 @@ func (pseg pmaIterator) debugStringEntryLocked() []byte { } else { b.WriteByte('-') } - if pma.vmaEffectivePerms.Execute { + if pma.effectivePerms.Execute { b.WriteByte('x') } else { b.WriteByte('-') diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index e0cebef84..81787a6fd 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -466,9 +466,7 @@ func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, // Ensure that we have usable pmas. mm.activeMu.Lock() - pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ - breakCOW: at.Write, - }) + pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at) mm.mappingMu.RUnlock() if pendaddr := pend.Start(); pendaddr < ar.End { if pendaddr <= ar.Start { @@ -498,14 +496,10 @@ func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, // // Preconditions: 0 < ar.Length() <= math.MaxInt64. func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { - po := pmaOpts{ - breakCOW: at.Write, - } - // If pmas are already available, we can do IO without touching mm.vmas or // mm.mappingMu. mm.activeMu.RLock() - if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, po, true /* needInternalMappings */); pseg.Ok() { + if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() { n, err := f(mm.internalMappingsLocked(pseg, ar)) mm.activeMu.RUnlock() // Do not convert errors returned by f to EFAULT. 
@@ -526,7 +520,7 @@ func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.Ad // Ensure that we have usable pmas. mm.activeMu.Lock() - pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, po) + pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at) mm.mappingMu.RUnlock() if pendaddr := pend.Start(); pendaddr < ar.End { if pendaddr <= ar.Start { @@ -578,14 +572,10 @@ func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars userme return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f) } - po := pmaOpts{ - breakCOW: at.Write, - } - // If pmas are already available, we can do IO without touching mm.vmas or // mm.mappingMu. mm.activeMu.RLock() - if mm.existingVecPMAsLocked(ars, at, ignorePermissions, po, true /* needInternalMappings */) { + if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) { n, err := f(mm.vecInternalMappingsLocked(ars)) mm.activeMu.RUnlock() // Do not convert errors returned by f to EFAULT. @@ -603,7 +593,7 @@ func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars userme // Ensure that we have usable pmas. 
mm.activeMu.Lock() - pars, perr := mm.getVecPMAsLocked(ctx, vars, po) + pars, perr := mm.getVecPMAsLocked(ctx, vars, at) mm.mappingMu.RUnlock() if pars.NumBytes() == 0 { mm.activeMu.Unlock() diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a71286f14..2fe03172c 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -124,7 +124,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { } if !pma.needCOW { pma.needCOW = true - if pma.vmaEffectivePerms.Write { + if pma.effectivePerms.Write { // We don't want to unmap the whole address space, even though // doing so would reduce calls to unmapASLocked(), because mm // will most likely continue to be used after the fork, so @@ -139,7 +139,9 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { } unmapAR = srcpseg.Range() } + pma.effectivePerms.Write = false } + pma.maxPerms.Write = false } fr := srcpseg.fileRange() mm2.incPrivateRef(fr) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 6ed838d64..a3417a46e 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -71,9 +71,6 @@ type MemoryManager struct { // ownership is shared by one or more pmas instead of being owned by a // memmap.Mappable). // - // NOTE: This should be replaced using refcounts on - // platform.File. - // // privateRefs is immutable. privateRefs *privateRefs @@ -374,13 +371,27 @@ type pma struct { file platform.File `state:"nosave"` // off is the offset into file at which this pma begins. + // + // Note that pmas do *not* hold references on offsets in file! If private + // is true, MemoryManager.privateRefs holds the reference instead. If + // private is false, the corresponding memmap.Mappable holds the reference + // instead (per memmap.Mappable.Translate requirement). 
off uint64 - // vmaEffectivePerms and vmaMaxPerms are duplicated from the - // corresponding vma so that the IO implementation can avoid iterating - // mm.vmas when pmas already exist. - vmaEffectivePerms usermem.AccessType - vmaMaxPerms usermem.AccessType + // translatePerms is the permissions returned by memmap.Mappable.Translate. + // If private is true, translatePerms is usermem.AnyAccess. + translatePerms usermem.AccessType + + // effectivePerms is the permissions allowed for non-ignorePermissions + // accesses. maxPerms is the permissions allowed for ignorePermissions + // accesses. These are vma.effectivePerms and vma.maxPerms respectively, + // masked by pma.translatePerms and with Write disallowed if pma.needCOW is + // true. + // + // These are stored in the pma so that the IO implementation can avoid + // iterating mm.vmas when pmas already exist. + effectivePerms usermem.AccessType + maxPerms usermem.AccessType // needCOW is true if writes to the mapping must be propagated to a copy. needCOW bool diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index bb779a45b..e090537cc 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -27,18 +27,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" ) -type pmaOpts struct { - // If breakCOW is true, pmas must not be copy-on-write. - breakCOW bool -} - // existingPMAsLocked checks that pmas exist for all addresses in ar, and // support access of type (at, ignorePermissions). If so, it returns an // iterator to the pma containing ar.Start. Otherwise it returns a terminal // iterator. // // Preconditions: mm.activeMu must be locked. ar.Length() != 0. 
-func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, opts pmaOpts, needInternalMappings bool) pmaIterator { +func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator { if checkInvariants { if !ar.WellFormed() || ar.Length() <= 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) @@ -49,16 +44,11 @@ func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.Acc pseg := first for pseg.Ok() { pma := pseg.ValuePtr() - perms := pma.vmaEffectivePerms + perms := pma.effectivePerms if ignorePermissions { - perms = pma.vmaMaxPerms + perms = pma.maxPerms } if !perms.SupersetOf(at) { - // These are the vma's permissions, so the caller will get an error - // when they try to get new pmas. - return pmaIterator{} - } - if opts.breakCOW && pma.needCOW { return pmaIterator{} } if needInternalMappings && pma.internalMappings.IsEmpty() { @@ -79,17 +69,17 @@ func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.Acc // and support access of type (at, ignorePermissions). // // Preconditions: mm.activeMu must be locked. -func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, opts pmaOpts, needInternalMappings bool) bool { +func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) bool { for ; !ars.IsEmpty(); ars = ars.Tail() { - if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, opts, needInternalMappings).Ok() { + if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() { return false } } return true } -// getPMAsLocked ensures that pmas exist for all addresses in ar, subject to -// opts. 
It returns: +// getPMAsLocked ensures that pmas exist for all addresses in ar, and support +// access of type at. It returns: // // - An iterator to the pma containing ar.Start. If no pma contains ar.Start, // the iterator is unspecified. @@ -102,8 +92,9 @@ func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at user // // Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for // writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist -// for all addresses in ar. -func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, opts pmaOpts) (pmaIterator, pmaGapIterator, error) { +// for all addresses in ar, and support accesses of type at (i.e. permission +// checks must have been performed against vmas). +func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() <= 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) @@ -125,54 +116,40 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar } ar = usermem.AddrRange{ar.Start.RoundDown(), end} - pstart, pend, perr := mm.ensurePMAsLocked(ctx, vseg, ar) + pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at) if pend.Start() <= ar.Start { return pmaIterator{}, pend, perr } - // ensurePMAsLocked may not have pstart due to iterator invalidation. We - // need it, either to return it immediately or to pass to - // breakCopyOnWriteLocked. + // getPMAsInternalLocked may not have returned pstart due to iterator + // invalidation. if !pstart.Ok() { pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) } - - var cowerr error - if opts.breakCOW { - if pend.Start() < ar.End { - // Adjust ar to reflect missing pmas. 
- ar.End = pend.Start() - } - var invalidated bool - pend, invalidated, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) - if pend.Start() <= ar.Start { - return pmaIterator{}, pend, cowerr - } - if invalidated { - pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) - } - } - - if cowerr != nil { - return pstart, pend, cowerr - } if perr != nil { return pstart, pend, perr } return pstart, pend, alignerr } -// getVecPMAsLocked ensures that pmas exist for all addresses in ars. It -// returns the subset of ars for which pmas exist. If this is not equal to ars, -// it returns a non-nil error explaining why. +// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and +// support access of type at. It returns the subset of ars for which pmas +// exist. If this is not equal to ars, it returns a non-nil error explaining +// why. // // Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for -// writing. vmas must exist for all addresses in ars. -func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, opts pmaOpts) (usermem.AddrRangeSeq, error) { +// writing. vmas must exist for all addresses in ars, and support accesses of +// type at (i.e. permission checks must have been performed against vmas). +func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) { for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() if ar.Length() == 0 { continue } + if checkInvariants { + if !ar.WellFormed() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } // Page-align ar so that all AddrRanges are aligned. 
end, ok := ar.End.RoundUp() @@ -183,26 +160,7 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrR } ar = usermem.AddrRange{ar.Start.RoundDown(), end} - pstart, pend, perr := mm.ensurePMAsLocked(ctx, mm.vmas.FindSegment(ar.Start), ar) - if pend.Start() <= ar.Start { - return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr - } - - var cowerr error - if opts.breakCOW { - if !pstart.Ok() { - pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) - } - if pend.Start() < ar.End { - // Adjust ar to reflect missing pmas. - ar.End = pend.Start() - } - pend, _, cowerr = mm.breakCopyOnWriteLocked(pstart, ar) - } - - if cowerr != nil { - return truncatedAddrRangeSeq(ars, arsit, pend.Start()), cowerr - } + _, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at) if perr != nil { return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr } @@ -214,64 +172,303 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrR return ars, nil } -// ensurePMAsLocked ensures that pmas exist for all addresses in ar. It returns: +// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following +// exceptions: // -// - An iterator to the pma containing ar.Start, on a best-effort basis (that -// is, the returned iterator may be terminal, even if such a pma exists). -// Returning this iterator on a best-effort basis allows callers that require -// it to use it when it's cheaply available, while also avoiding the overhead -// of retrieving it when it's not. -// -// - An iterator to the gap after the last pma containing an address in ar. If -// pmas exist for no addresses in ar, the iterator is to a gap that begins -// before ar.Start. +// - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that +// is, the returned iterator may be terminal, even if a pma that contains +// ar.Start exists). 
Returning this iterator on a best-effort basis allows +// callers that require it to use it when it's cheaply available, while also +// avoiding the overhead of retrieving it when it's not. // -// - An error that is non-nil if pmas exist for only a subset of ar. +// - getPMAsInternalLocked additionally requires that ar is page-aligned. // -// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for -// writing. ar.Length() != 0. ar must be page-aligned. -// vseg.Range().Contains(ar.Start). vmas must exist for all addresses in ar. -func (mm *MemoryManager) ensurePMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange) (pmaIterator, pmaGapIterator, error) { +// getPMAsInternalLocked is an implementation helper for getPMAsLocked and +// getVecPMAsLocked; other clients should call one of those instead. +func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } + if !vseg.Ok() { + panic("terminal vma iterator") + } if !vseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) } } - pstart, pgap := mm.pmas.Find(ar.Start) - if pstart.Ok() { - pgap = pstart.NextGap() - } - for pgap.Start() < ar.End { - if pgap.Range().Length() == 0 { - pgap = pgap.NextGap() - continue - } - // A single pgap might be spanned by multiple vmas. Insert pmas to - // cover the first (vma, pgap) pair. 
- pgapAR := pgap.Range().Intersect(ar) - vseg = vseg.seekNextLowerBound(pgapAR.Start) - if checkInvariants { - if !vseg.Ok() { - panic(fmt.Sprintf("no vma after %#x", pgapAR.Start)) - } - if pgapAR.Start < vseg.Start() { - panic(fmt.Sprintf("no vma in [%#x, %#x)", pgapAR.Start, vseg.Start())) + mf := mm.mfp.MemoryFile() + // Limit the range we allocate to ar, aligned to privateAllocUnit. + maskAR := privateAligned(ar) + didUnmapAS := false + // The range in which we iterate vmas and pmas is still limited to ar, to + // ensure that we don't allocate or COW-break a pma we don't need. + pseg, pgap := mm.pmas.Find(ar.Start) + pstart := pseg + for { + // Get pmas for this vma. + vsegAR := vseg.Range().Intersect(ar) + vma := vseg.ValuePtr() + pmaLoop: + for { + switch { + case pgap.Ok() && pgap.Start() < vsegAR.End: + // Need a pma here. + optAR := vseg.Range().Intersect(pgap.Range()) + if checkInvariants { + if optAR.Length() <= 0 { + panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) + } + } + if vma.mappable == nil { + // Private anonymous mappings get pmas by allocating. + allocAR := optAR.Intersect(maskAR) + fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous) + if err != nil { + return pstart, pgap, err + } + if checkInvariants { + if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) { + panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr)) + } + } + mm.addRSSLocked(allocAR) + mm.incPrivateRef(fr) + mf.IncRef(fr) + pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{ + file: mf, + off: fr.Start, + translatePerms: usermem.AnyAccess, + effectivePerms: vma.effectivePerms, + maxPerms: vma.maxPerms, + // Since we just allocated this memory and have the + // only reference, the new pma does not need + // copy-on-write. + private: true, + }).NextNonEmpty() + pstart = pmaIterator{} // iterators invalidated + } else { + // Other mappings get pmas by translating. 
+ optMR := vseg.mappableRangeOf(optAR) + reqAR := optAR.Intersect(ar) + reqMR := vseg.mappableRangeOf(reqAR) + perms := at + if vma.private { + // This pma will be copy-on-write; don't require write + // permission, but do require read permission to + // facilitate the copy. + // + // If at.Write is true, we will need to break + // copy-on-write immediately, which occurs after + // translation below. + perms.Read = true + perms.Write = false + } + ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) + if checkInvariants { + if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil { + panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err)) + } + } + // Install a pma for each translation. + if len(ts) == 0 { + return pstart, pgap, err + } + pstart = pmaIterator{} // iterators invalidated + for _, t := range ts { + newpmaAR := vseg.addrRangeOf(t.Source) + newpma := pma{ + file: t.File, + off: t.Offset, + translatePerms: t.Perms, + effectivePerms: vma.effectivePerms.Intersect(t.Perms), + maxPerms: vma.maxPerms.Intersect(t.Perms), + } + if vma.private { + newpma.effectivePerms.Write = false + newpma.maxPerms.Write = false + newpma.needCOW = true + } + mm.addRSSLocked(newpmaAR) + t.File.IncRef(t.FileRange()) + // This is valid because memmap.Mappable.Translate is + // required to return Translations in increasing + // Translation.Source order. + pseg = mm.pmas.Insert(pgap, newpmaAR, newpma) + pgap = pseg.NextGap() + } + // The error returned by Translate is only significant if + // it occurred before ar.End. + if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End { + return pstart, pgap, err + } + // Rewind pseg to the first pma inserted and continue the + // loop to check if we need to break copy-on-write. 
+ pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{} + continue + } + + case pseg.Ok() && pseg.Start() < vsegAR.End: + oldpma := pseg.ValuePtr() + if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) { + // Break copy-on-write by copying. + if checkInvariants { + if !oldpma.maxPerms.Read { + panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma)) + } + } + copyAR := pseg.Range().Intersect(maskAR) + // Get internal mappings from the pma to copy from. + if err := pseg.getInternalMappingsLocked(); err != nil { + return pstart, pseg.PrevGap(), err + } + // Copy contents. + fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) + if _, ok := err.(safecopy.BusError); ok { + // If we got SIGBUS during the copy, deliver SIGBUS to + // userspace (instead of SIGSEGV) if we're breaking + // copy-on-write due to application page fault. + err = &memmap.BusError{err} + } + if fr.Length() == 0 { + return pstart, pseg.PrevGap(), err + } + // Unmap all of maskAR, not just copyAR, to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + if !didUnmapAS { + mm.unmapASLocked(maskAR) + didUnmapAS = true + } + // Replace the pma with a copy in the part of the address + // range where copying was successful. This doesn't change + // RSS. 
+ copyAR.End = copyAR.Start + usermem.Addr(fr.Length()) + if copyAR != pseg.Range() { + pseg = mm.pmas.Isolate(pseg, copyAR) + pstart = pmaIterator{} // iterators invalidated + } + oldpma = pseg.ValuePtr() + if oldpma.private { + mm.decPrivateRef(pseg.fileRange()) + } + oldpma.file.DecRef(pseg.fileRange()) + mm.incPrivateRef(fr) + mf.IncRef(fr) + oldpma.file = mf + oldpma.off = fr.Start + oldpma.translatePerms = usermem.AnyAccess + oldpma.effectivePerms = vma.effectivePerms + oldpma.maxPerms = vma.maxPerms + oldpma.needCOW = false + oldpma.private = true + oldpma.internalMappings = safemem.BlockSeq{} + // Try to merge the pma with its neighbors. + if prev := pseg.PrevSegment(); prev.Ok() { + if merged := mm.pmas.Merge(prev, pseg); merged.Ok() { + pseg = merged + pstart = pmaIterator{} // iterators invalidated + } + } + if next := pseg.NextSegment(); next.Ok() { + if merged := mm.pmas.Merge(pseg, next); merged.Ok() { + pseg = merged + pstart = pmaIterator{} // iterators invalidated + } + } + // The error returned by AllocateAndFill is only + // significant if it occurred before ar.End. + if err != nil && pseg.End() < ar.End { + return pstart, pseg.NextGap(), err + } + // Ensure pseg and pgap are correct for the next iteration + // of the loop. + pseg, pgap = pseg.NextNonEmpty() + } else if !oldpma.translatePerms.SupersetOf(at) { + // Get new pmas (with sufficient permissions) by calling + // memmap.Mappable.Translate again. + if checkInvariants { + if oldpma.private { + panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma)) + } + } + // Allow the entire pma to be replaced. 
+ optAR := pseg.Range() + optMR := vseg.mappableRangeOf(optAR) + reqAR := optAR.Intersect(ar) + reqMR := vseg.mappableRangeOf(reqAR) + perms := oldpma.translatePerms.Union(at) + ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) + if checkInvariants { + if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil { + panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err)) + } + } + // Remove the part of the existing pma covered by new + // Translations, then insert new pmas. This doesn't change + // RSS. Note that we don't need to call unmapASLocked: any + // existing AddressSpace mappings are still valid (though + // less permissive than the new pmas indicate) until + // Invalidate is called, and will be replaced by future + // calls to mapASLocked. + if len(ts) == 0 { + return pstart, pseg.PrevGap(), err + } + transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End} + transAR := vseg.addrRangeOf(transMR) + pseg = mm.pmas.Isolate(pseg, transAR) + pseg.ValuePtr().file.DecRef(pseg.fileRange()) + pgap = mm.pmas.Remove(pseg) + pstart = pmaIterator{} // iterators invalidated + for _, t := range ts { + newpmaAR := vseg.addrRangeOf(t.Source) + newpma := pma{ + file: t.File, + off: t.Offset, + translatePerms: t.Perms, + effectivePerms: vma.effectivePerms.Intersect(t.Perms), + maxPerms: vma.maxPerms.Intersect(t.Perms), + } + if vma.private { + newpma.effectivePerms.Write = false + newpma.maxPerms.Write = false + newpma.needCOW = true + } + t.File.IncRef(t.FileRange()) + pseg = mm.pmas.Insert(pgap, newpmaAR, newpma) + pgap = pseg.NextGap() + } + // The error returned by Translate is only significant if + // it occurred before ar.End. + if err != nil && pseg.End() < ar.End { + return pstart, pgap, err + } + // Ensure pseg and pgap are correct for the next iteration + // of the loop. 
+ if pgap.Range().Length() == 0 { + pseg, pgap = pgap.NextSegment(), pmaGapIterator{} + } else { + pseg = pmaIterator{} + } + } else { + // We have a usable pma; continue. + pseg, pgap = pseg.NextNonEmpty() + } + + default: + break pmaLoop } } - var err error - pgap, err = mm.insertPMAsLocked(ctx, vseg, pgap, ar) - // insertPMAsLocked most likely invalidated iterators, so pstart is now - // unknown. - pstart = pmaIterator{} - if err != nil { - return pstart, pgap, err + // Go to the next vma. + if ar.End <= vseg.End() { + if pgap.Ok() { + return pstart, pgap, nil + } + return pstart, pseg.PrevGap(), nil } + vseg = vseg.NextSegment() } - return pstart, pgap, nil } const ( @@ -299,215 +496,16 @@ func privateAligned(ar usermem.AddrRange) usermem.AddrRange { return aligned } -// insertPMAsLocked inserts pmas into pgap corresponding to the vma iterated by -// vseg, spanning at least ar. It returns: -// -// - An iterator to the gap after the last pma containing an address in ar. If -// pmas exist for no addresses in ar, the iterator is to a gap that begins -// before ar.Start. -// -// - An error that is non-nil if pmas exist for only a subset of ar. -// -// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for -// writing. vseg.Range().Intersect(pgap.Range()).Intersect(ar).Length() != 0. -// ar must be page-aligned. -func (mm *MemoryManager) insertPMAsLocked(ctx context.Context, vseg vmaIterator, pgap pmaGapIterator, ar usermem.AddrRange) (pmaGapIterator, error) { - optAR := vseg.Range().Intersect(pgap.Range()) - if checkInvariants { - if optAR.Length() <= 0 { - panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) - } - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { - panic(fmt.Sprintf("invalid ar %v", ar)) - } - } - vma := vseg.ValuePtr() - - // Private anonymous mappings get pmas by allocating. - if vma.mappable == nil { - // Limit the range we allocate to ar, aligned to privateAllocUnit. 
- maskAR := privateAligned(ar) - allocAR := optAR.Intersect(maskAR) - mf := mm.mfp.MemoryFile() - fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous) - if err != nil { - return pgap, err - } - mm.incPrivateRef(fr) - - if checkInvariants { - if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) { - panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr)) - } - } - - mm.addRSSLocked(allocAR) - mf.IncRef(fr) - - return mm.pmas.Insert(pgap, allocAR, pma{ - file: mf, - off: fr.Start, - vmaEffectivePerms: vma.effectivePerms, - vmaMaxPerms: vma.maxPerms, - private: true, - // Since we just allocated this memory and have the only reference, - // the new pma does not need copy-on-write. - }).NextGap(), nil - } - - // Other mappings get pmas by translating. Limit the required range - // to ar. - optMR := vseg.mappableRangeOf(optAR) - reqAR := optAR.Intersect(ar) - reqMR := vseg.mappableRangeOf(reqAR) - perms := vma.maxPerms - if vma.private { - perms.Write = false - } - ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) - if checkInvariants { - if err := memmap.CheckTranslateResult(reqMR, optMR, ts, err); err != nil { - panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v): %v", vma.mappable, reqMR, optMR, err)) - } - } - - // Install a pma for each Translation. - for _, t := range ts { - // This is valid because memmap.Mappable.Translate is required to - // return Translations in increasing Translation.Source order. - addrRange := vseg.addrRangeOf(t.Source) - mm.addRSSLocked(addrRange) - pseg := mm.pmas.Insert(pgap, addrRange, pma{ - file: t.File, - off: t.Offset, - vmaEffectivePerms: vma.effectivePerms, - vmaMaxPerms: vma.maxPerms, - needCOW: vma.private, - }) - // The new pseg may have been merged with existing segments, only take a - // ref on the inserted range. 
- t.File.IncRef(pseg.fileRangeOf(addrRange)) - pgap = pseg.NextGap() - } - - // Even if Translate returned an error, if we got to ar.End, - // insertPMAsLocked succeeded. - if ar.End <= pgap.Start() { - return pgap, nil - } - return pgap, err -} - -// breakCopyOnWriteLocked ensures that pmas in ar are not copy-on-write. It -// returns: +// isPMACopyOnWriteLocked returns true if the contents of the pma represented +// by pseg must be copied to a new private pma to be written to. // -// - An iterator to the gap after the last non-COW pma containing an address in -// ar. If non-COW pmas exist for no addresses in ar, the iterator is to a gap -// that begins before ar.Start. +// If the pma is a copy-on-write private pma, and holds the only reference on +// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory +// and update the pma to indicate that it does not require copy-on-write. // -// - A boolean that is true if iterators into mm.pmas may have been -// invalidated. -// -// - An error that is non-nil if non-COW pmas exist for only a subset of ar. -// -// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar -// must be page-aligned. pseg.Range().Contains(ar.Start). pmas must exist for -// all addresses in ar. -func (mm *MemoryManager) breakCopyOnWriteLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, bool, error) { - if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { - panic(fmt.Sprintf("invalid ar: %v", ar)) - } - if !pseg.Range().Contains(ar.Start) { - panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) - } - } - - // Limit the range we copy to ar, aligned to privateAllocUnit. - maskAR := privateAligned(ar) - var invalidatedIterators, didUnmapAS bool - mf := mm.mfp.MemoryFile() - for { - if mm.isPMACopyOnWriteLocked(pseg) { - // Determine the range to copy. 
- copyAR := pseg.Range().Intersect(maskAR) - - // Get internal mappings from the pma to copy from. - if err := pseg.getInternalMappingsLocked(); err != nil { - return pseg.PrevGap(), invalidatedIterators, err - } - - // Copy contents. - fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) - if _, ok := err.(safecopy.BusError); ok { - // If we got SIGBUS during the copy, deliver SIGBUS to - // userspace (instead of SIGSEGV) if we're breaking - // copy-on-write due to application page fault. - err = &memmap.BusError{err} - } - if fr.Length() == 0 { - return pseg.PrevGap(), invalidatedIterators, err - } - mm.incPrivateRef(fr) - mf.IncRef(fr) - - // Unmap all of maskAR, not just copyAR, to minimize host syscalls. - // AddressSpace mappings must be removed before mm.decPrivateRef(). - if !didUnmapAS { - mm.unmapASLocked(maskAR) - didUnmapAS = true - } - - // Replace the pma with a copy in the part of the address range - // where copying was successful. - copyAR.End = copyAR.Start + usermem.Addr(fr.Length()) - if copyAR != pseg.Range() { - pseg = mm.pmas.Isolate(pseg, copyAR) - invalidatedIterators = true - } - pma := pseg.ValuePtr() - if pma.private { - mm.decPrivateRef(pseg.fileRange()) - } - pma.file.DecRef(pseg.fileRange()) - - pma.file = mf - pma.off = fr.Start - pma.private = true - pma.needCOW = false - pma.internalMappings = safemem.BlockSeq{} - - // Try to merge pma with its neighbors. - if prev := pseg.PrevSegment(); prev.Ok() { - if merged := mm.pmas.Merge(prev, pseg); merged.Ok() { - pseg = merged - invalidatedIterators = true - } - } - if next := pseg.NextSegment(); next.Ok() { - if merged := mm.pmas.Merge(pseg, next); merged.Ok() { - pseg = merged - invalidatedIterators = true - } - } - - // If an error occurred after ar.End, breakCopyOnWriteLocked still - // did its job, so discard the error. 
- if err != nil && pseg.End() < ar.End { - return pseg.NextGap(), invalidatedIterators, err - } - } - // This checks against ar.End, not maskAR.End, so we will never break - // COW on a pma that does not intersect ar. - if ar.End <= pseg.End() { - return pseg.NextGap(), invalidatedIterators, nil - } - pseg = pseg.NextSegment() - } -} - -// Preconditions: mm.activeMu must be locked for writing. -func (mm *MemoryManager) isPMACopyOnWriteLocked(pseg pmaIterator) bool { +// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be +// locked. mm.activeMu must be locked for writing. +func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool { pma := pseg.ValuePtr() if !pma.needCOW { return false @@ -526,6 +524,10 @@ func (mm *MemoryManager) isPMACopyOnWriteLocked(pseg pmaIterator) bool { rseg := mm.privateRefs.refs.FindSegment(fr.Start) if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() { pma.needCOW = false + // pma.private => pma.translatePerms == usermem.AnyAccess + vma := vseg.ValuePtr() + pma.effectivePerms = vma.effectivePerms + pma.maxPerms = vma.maxPerms return false } return true @@ -617,9 +619,7 @@ func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at userm // Ensure that we have usable pmas. 
mm.activeMu.Lock() - pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ - breakCOW: at.Write, - }) + pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at) mm.mappingMu.RUnlock() if pendaddr := pend.Start(); pendaddr < ar.End { if pendaddr <= ar.Start { @@ -925,8 +925,9 @@ func (pmaSetFunctions) ClearValue(pma *pma) { func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRange, pma2 pma) (pma, bool) { if pma1.file != pma2.file || pma1.off+uint64(ar1.Length()) != pma2.off || - pma1.vmaEffectivePerms != pma2.vmaEffectivePerms || - pma1.vmaMaxPerms != pma2.vmaMaxPerms || + pma1.translatePerms != pma2.translatePerms || + pma1.effectivePerms != pma2.effectivePerms || + pma1.maxPerms != pma2.maxPerms || pma1.needCOW != pma2.needCOW || pma1.private != pma2.private { return pma{}, false @@ -979,20 +980,13 @@ func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pma func (pseg pmaIterator) getInternalMappingsLocked() error { pma := pseg.ValuePtr() if pma.internalMappings.IsEmpty() { - // Internal mappings are used for ignorePermissions accesses, - // so we need to use vma.maxPerms instead of - // vma.effectivePerms. However, we will never execute - // application code through an internal mapping, and we don't - // actually need a writable mapping if copy-on-write is in - // effect. (But get a writable mapping anyway if the pma is - // private, so that if breakCopyOnWriteLocked => - // isPMACopyOnWriteLocked takes ownership of the pma instead of - // copying, it doesn't need to get a new mapping.) - perms := pma.vmaMaxPerms + // This must use maxPerms (instead of perms) because some permission + // constraints are only visible to vmas; for example, mappings of + // read-only files have vma.maxPerms.Write unset, but this may not be + // visible to the memmap.Mappable. + perms := pma.maxPerms + // We will never execute application code through an internal mapping. 
perms.Execute = false - if pma.needCOW && !pma.private { - perms.Write = false - } ims, err := pma.file.MapInternal(pseg.fileRange(), perms) if err != nil { return err diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index aa94d7d6a..cfbf7a104 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -102,6 +102,7 @@ func (m *SpecialMappable) Translate(ctx context.Context, required, optional memm Source: source, File: m.mfp.MemoryFile(), Offset: m.fr.Start + source.Start, + Perms: usermem.AnyAccess, }, }, err } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index b56e0d3b9..3725c98aa 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -54,9 +54,7 @@ func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, // Ensure that we have a usable pma. mm.activeMu.Lock() - pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{ - breakCOW: at.Write, - }) + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at) mm.mappingMu.RUnlock() if err != nil { mm.activeMu.Unlock() @@ -186,7 +184,7 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u } // Ensure that we have usable pmas. - pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess) if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when @@ -231,7 +229,7 @@ func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaItera // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it // isn't needed at all for mapASLocked. 
mm.mappingMu.DowngradeLock() - pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess) mm.mappingMu.RUnlock() if err != nil { mm.activeMu.Unlock() @@ -651,13 +649,17 @@ func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms us for pseg.Ok() && pseg.Start() < vseg.End() { if pseg.Range().Overlaps(vseg.Range()) { pseg = mm.pmas.Isolate(pseg, vseg.Range()) - if !effectivePerms.SupersetOf(pseg.ValuePtr().vmaEffectivePerms) && !didUnmapAS { + pma := pseg.ValuePtr() + if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS { // Unmap all of ar, not just vseg.Range(), to minimize host // syscalls. mm.unmapASLocked(ar) didUnmapAS = true } - pseg.ValuePtr().vmaEffectivePerms = effectivePerms + pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms) + if pma.needCOW { + pma.effectivePerms.Write = false + } } pseg = pseg.NextSegment() } @@ -828,7 +830,7 @@ func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length ui mm.mappingMu.RUnlock() return syserror.ENOMEM } - _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) + _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), usermem.NoAccess) if err != nil { mm.activeMu.Unlock() mm.mappingMu.RUnlock() @@ -923,7 +925,7 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error mm.mappingMu.DowngradeLock() for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { if vseg.ValuePtr().effectivePerms.Any() { - mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) + mm.getPMAsLocked(ctx, vseg, vseg.Range(), usermem.NoAccess) } } @@ -981,7 +983,7 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { } for pseg.Ok() && pseg.Start() < vsegAR.End { pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + if pma.private && !mm.isPMACopyOnWriteLocked(vseg, pseg) { psegAR := 
pseg.Range().Intersect(ar) if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil { diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go index c71d05afe..9e6a27bcf 100644 --- a/pkg/sentry/usermem/access_type.go +++ b/pkg/sentry/usermem/access_type.go @@ -93,6 +93,15 @@ func (a AccessType) Intersect(other AccessType) AccessType { } } +// Union returns the access types set in either a or other. +func (a AccessType) Union(other AccessType) AccessType { + return AccessType{ + Read: a.Read || other.Read, + Write: a.Write || other.Write, + Execute: a.Execute || other.Execute, + } +} + // Effective returns the set of effective access types allowed by a, even if // some types are not explicitly allowed. func (a AccessType) Effective() AccessType { -- cgit v1.2.3 From 06ec97a3f823f1f5d928fc9c2beb3a11c2c88487 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Tue, 26 Mar 2019 16:15:55 -0700 Subject: Implement memfd_create. Memfds are simply anonymous tmpfs files with no associated mounts. Also implementing file seals, which Linux only implements for memfds at the moment. PiperOrigin-RevId: 240450031 Change-Id: I31de78b950101ae8d7a13d0e93fe52d98ea06f2f --- pkg/abi/linux/file.go | 18 ++ pkg/sentry/fs/tmpfs/inode_file.go | 151 ++++++++++ pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 59 ++++ test/syscalls/linux/BUILD | 17 ++ test/syscalls/linux/memfd.cc | 546 ++++++++++++++++++++++++++++++++++ 7 files changed, 793 insertions(+), 1 deletion(-) create mode 100644 test/syscalls/linux/memfd.cc (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index e5a51a9fd..46b10ca97 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -236,3 +236,21 @@ var fileType = abi.ValueSet{ ModeCharacterDevice: "S_IFCHR", ModeNamedPipe: "S_IFIFO", } + +// Constants for memfd_create(2). 
Source: include/uapi/linux/memfd.h +const ( + MFD_CLOEXEC = 0x0001 + MFD_ALLOW_SEALING = 0x0002 +) + +// Constants related to file seals. Source: include/uapi/{asm-generic,linux}/fcntl.h +const ( + F_LINUX_SPECIFIC_BASE = 1024 + F_ADD_SEALS = F_LINUX_SPECIFIC_BASE + 9 + F_GET_SEALS = F_LINUX_SPECIFIC_BASE + 10 + + F_SEAL_SEAL = 0x0001 // Prevent further seals from being set. + F_SEAL_SHRINK = 0x0002 // Prevent file from shrinking. + F_SEAL_GROW = 0x0004 // Prevent file from growing. + F_SEAL_WRITE = 0x0008 // Prevent writes. +) diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 25bf2b9dd..7c80d711b 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -15,10 +15,12 @@ package tmpfs import ( + "fmt" "io" "sync" "time" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -29,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) var ( @@ -42,6 +45,8 @@ var ( // These files are backed by pages allocated from a platform.Memory, and may be // directly mapped. // +// Lock order: attrMu -> mapsMu -> dataMu. +// // +stateify savable type fileInodeOperations struct { fsutil.InodeGenericChecker `state:"nosave"` @@ -74,6 +79,17 @@ type fileInodeOperations struct { // mappings is protected by mapsMu. mappings memmap.MappingSet + // writableMappingPages tracks how many pages of virtual memory are mapped + // as potentially writable from this file. If a page has multiple mappings, + // each mapping is counted separately. + // + // This counter is susceptible to overflow as we can potentially count + // mappings from many VMAs. We count pages rather than bytes to slightly + // mitigate this. 
+ // + // Protected by mapsMu. + writableMappingPages uint64 + dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into platform.Memory() that @@ -81,6 +97,11 @@ type fileInodeOperations struct { // // data is protected by dataMu. data fsutil.FileRangeSet + + // seals represents file seals on this inode. + // + // Protected by dataMu. + seals uint32 } var _ fs.InodeOperations = (*fileInodeOperations)(nil) @@ -91,9 +112,30 @@ func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.Unsta attr: uattr, kernel: kernel.KernelFromContext(ctx), memUsage: usage, + seals: linux.F_SEAL_SEAL, } } +// NewMemfdInode creates a new inode backing a memfd. Memory used by the memfd +// is backed by platform memory. +func NewMemfdInode(ctx context.Context, allowSeals bool) *fs.Inode { + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with + // S_IRWXUGO. + perms := fs.PermMask{Read: true, Write: true, Execute: true} + iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{User: perms, Group: perms, Other: perms}}).(*fileInodeOperations) + if allowSeals { + iops.seals = 0 + } + return fs.NewInode(iops, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.RegularFile, + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + }) +} + // Release implements fs.InodeOperations.Release. func (f *fileInodeOperations) Release(context.Context) { f.dataMu.Lock() @@ -170,6 +212,16 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in f.dataMu.Lock() oldSize := f.attr.Size + + // Check if current seals allow truncation. 
+ switch { + case size > oldSize && f.seals&linux.F_SEAL_GROW != 0: // Grow sealed + fallthrough + case oldSize > size && f.seals&linux.F_SEAL_SHRINK != 0: // Shrink sealed + f.dataMu.Unlock() + return syserror.EPERM + } + if oldSize != size { f.attr.Size = size // Update mtime and ctime. @@ -370,6 +422,34 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) return 0, nil } + // Check if seals prevent either file growth or all writes. + switch { + case rw.f.seals&linux.F_SEAL_WRITE != 0: // Write sealed + return 0, syserror.EPERM + case end > rw.f.attr.Size && rw.f.seals&linux.F_SEAL_GROW != 0: // Grow sealed + // When growth is sealed, Linux effectively allows writes which would + // normally grow the file to partially succeed up to the current EOF, + // rounded down to the page boundary before the EOF. + // + // This happens because writes (and thus the growth check) for tmpfs + // files proceed page-by-page on Linux, and the final write to the page + // containing EOF fails, resulting in a partial write up to the start of + // that page. + // + // To emulate this behaviour, artifically truncate the write to the + // start of the page containing the current EOF. + // + // See Linux, mm/filemap.c:generic_perform_write() and + // mm/shmem.c:shmem_write_begin(). + if pgstart := int64(usermem.Addr(rw.f.attr.Size).RoundDown()); end > pgstart { + end = pgstart + } + if end <= rw.offset { + // Truncation would result in no data being written. + return 0, syserror.EPERM + } + } + defer func() { // If the write ends beyond the file's previous size, it causes the // file to grow. 
@@ -431,7 +511,27 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { f.mapsMu.Lock() defer f.mapsMu.Unlock() + + f.dataMu.RLock() + defer f.dataMu.RUnlock() + + // Reject writable mapping if F_SEAL_WRITE is set. + if f.seals&linux.F_SEAL_WRITE != 0 && writable { + return syserror.EPERM + } + f.mappings.AddMapping(ms, ar, offset, writable) + if writable { + pagesBefore := f.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + f.writableMappingPages += uint64(ar.Length() / usermem.PageSize) + + if f.writableMappingPages < pagesBefore { + panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages)) + } + } + return nil } @@ -439,7 +539,19 @@ func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingS func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { f.mapsMu.Lock() defer f.mapsMu.Unlock() + f.mappings.RemoveMapping(ms, ar, offset, writable) + + if writable { + pagesBefore := f.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + f.writableMappingPages -= uint64(ar.Length() / usermem.PageSize) + + if f.writableMappingPages > pagesBefore { + panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages)) + } + } } // CopyMapping implements memmap.Mappable.CopyMapping. @@ -501,3 +613,42 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional func (f *fileInodeOperations) InvalidateUnsavable(ctx context.Context) error { return nil } + +// GetSeals returns the current set of seals on a memfd inode. 
+func GetSeals(inode *fs.Inode) (uint32, error) { + if f, ok := inode.InodeOperations.(*fileInodeOperations); ok { + f.dataMu.RLock() + defer f.dataMu.RUnlock() + return f.seals, nil + } + // Not a memfd inode. + return 0, syserror.EINVAL +} + +// AddSeals adds new file seals to a memfd inode. +func AddSeals(inode *fs.Inode, val uint32) error { + if f, ok := inode.InodeOperations.(*fileInodeOperations); ok { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + f.dataMu.Lock() + defer f.dataMu.Unlock() + + if f.seals&linux.F_SEAL_SEAL != 0 { + // Seal applied which prevents addition of any new seals. + return syserror.EPERM + } + + // F_SEAL_WRITE can only be added if there are no active writable maps. + if f.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { + if f.writableMappingPages > 0 { + return syserror.EBUSY + } + } + + // Seals can only be added, never removed. + f.seals |= val + return nil + } + // Not a memfd inode. + return syserror.EINVAL +} diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 846601881..6e2843b36 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -63,6 +63,7 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", + "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index e855590e6..888b5aa9f 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -367,7 +367,7 @@ var AMD64 = &kernel.SyscallTable{ // 316: Renameat2, TODO 317: Seccomp, 318: GetRandom, - // 319: MemfdCreate, TODO + 319: MemfdCreate, 320: syscalls.CapError(linux.CAP_SYS_BOOT), // KexecFileLoad, infeasible to support 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // Bpf, requires cap_sys_admin for all commands // 322: Execveat, TODO diff --git a/pkg/sentry/syscalls/linux/sys_file.go 
b/pkg/sentry/syscalls/linux/sys_file.go index cf6fdc190..3193718b5 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -23,6 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/fasync" @@ -933,6 +934,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETOWN: fSetOwn(t, file, args[2].Int()) return 0, nil, nil + case linux.F_GET_SEALS: + val, err := tmpfs.GetSeals(file.Dirent.Inode) + return uintptr(val), nil, err + case linux.F_ADD_SEALS: + if !file.Flags().Write { + return 0, nil, syserror.EPERM + } + err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint()) + return 0, nil, err default: // Everything else is not yet supported. return 0, nil, syserror.EINVAL @@ -2066,3 +2076,52 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // arbitrarily. return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile) } + +const ( + memfdPrefix = "/memfd:" + memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) + memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1 +) + +// MemfdCreate implements the linux syscall memfd_create(2). +func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^memfdAllFlags != 0 { + // Unknown bits in flags. 
+ return 0, nil, syserror.EINVAL + } + + allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 + cloExec := flags&linux.MFD_CLOEXEC != 0 + + name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix)) + if err != nil { + return 0, nil, err + } + if len(name) > memfdMaxNameLen { + return 0, nil, syserror.EINVAL + } + name = memfdPrefix + name + + inode := tmpfs.NewMemfdInode(t, allowSeals) + dirent := fs.NewDirent(inode, name) + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with + // FMODE_READ | FMODE_WRITE. + file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true}) + if err != nil { + return 0, nil, err + } + + defer dirent.DecRef() + defer file.DecRef() + + fdFlags := kernel.FDFlags{CloseOnExec: cloExec} + newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(newFD), nil, nil +} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 2c214925e..7dd63dd0a 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3261,3 +3261,20 @@ cc_binary( "@com_google_googletest//:gtest", ], ) + +cc_binary( + name = "memfd_test", + testonly = 1, + srcs = ["memfd.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:memory_util", + "//test/util:multiprocess_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc new file mode 100644 index 000000000..ccdddd4e5 --- /dev/null +++ b/test/syscalls/linux/memfd.cc @@ -0,0 +1,546 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/memory_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +// The header sys/memfd.h isn't available on all systems, so redefining some of +// the constants here. +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#define F_SEAL_SEAL 0x0001 +#define F_SEAL_SHRINK 0x0002 +#define F_SEAL_GROW 0x0004 +#define F_SEAL_WRITE 0x0008 + +using ::testing::StartsWith; + +const std::string kMemfdName = "some-memfd"; + +int memfd_create(const std::string& name, unsigned int flags) { + return syscall(__NR_memfd_create, name.c_str(), flags); +} + +PosixErrorOr MemfdCreate(const std::string& name, uint32_t flags) { + int fd = memfd_create(name, flags); + if (fd < 0) { + return PosixError( + errno, absl::StrFormat("memfd_create(\"%s\", %#x)", name, flags)); + } + MaybeSave(); + return FileDescriptor(fd); +} + +// Procfs entries for memfds display the appropriate name. 
+TEST(MemfdTest, Name) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + const std::string proc_name = ASSERT_NO_ERRNO_AND_VALUE( + ReadLink(absl::StrFormat("/proc/self/fd/%d", memfd.get()))); + EXPECT_THAT(proc_name, StartsWith("/memfd:" + kMemfdName)); +} + +// Memfds support read/write syscalls. +TEST(MemfdTest, WriteRead) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + + // Write a random page of data to the memfd via write(2). + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Read back the same data and verify. + std::vector buf2(kPageSize); + ASSERT_THAT(lseek(memfd.get(), 0, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(read(memfd.get(), buf2.data(), buf2.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(buf, buf2); +} + +// Memfds can be mapped and used as usual. +TEST(MemfdTest, Mmap) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + const Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + + // Write a random page of data to the memfd via mmap m1. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + memcpy(m1.ptr(), buf.data(), buf.size()); + + // Read the data back via a read syscall on the memfd. + std::vector buf2(kPageSize); + EXPECT_THAT(read(memfd.get(), buf2.data(), buf2.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(buf, buf2); + + // The same data should be accessible via a new mapping m2. 
+ const Mapping m2 = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + EXPECT_EQ(0, memcmp(m1.ptr(), m2.ptr(), kPageSize)); +} + +TEST(MemfdTest, DuplicateFDsShareContent) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + const Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + const FileDescriptor memfd2 = ASSERT_NO_ERRNO_AND_VALUE(memfd.Dup()); + + // Write a random page of data to the memfd via mmap m1. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + memcpy(m1.ptr(), buf.data(), buf.size()); + + // Read the data back via a read syscall on a duplicate fd. + std::vector buf2(kPageSize); + EXPECT_THAT(read(memfd2.get(), buf2.data(), buf2.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(buf, buf2); +} + +// File seals are disabled by default on memfds. +TEST(MemfdTest, SealingDisabledByDefault) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_SEAL)); + // Attempting to set any seal should fail. + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallFailsWithErrno(EPERM)); +} + +// Seals can be retrieved and updated for memfds. +TEST(MemfdTest, SealsGetSet) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + int seals; + ASSERT_THAT(seals = fcntl(memfd.get(), F_GET_SEALS), SyscallSucceeds()); + // No seals are set yet. + EXPECT_EQ(0, seals); + + // Set a seal and check that we can get it back. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE)); + + // Set some more seals and verify. 
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK), + SyscallSucceeds()); + EXPECT_THAT( + fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK)); + + // Attempting to set a seal that is already set is a no-op. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + EXPECT_THAT( + fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK)); + + // Add remaining seals and verify. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_SEAL), SyscallSucceeds()); + EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | + F_SEAL_SHRINK | F_SEAL_SEAL)); +} + +// F_SEAL_GROW prevents a memfd from being grown using ftruncate. +TEST(MemfdTest, SealGrowWithTruncate) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + + // Try grow the memfd by 1 page. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 2), + SyscallFailsWithErrno(EPERM)); + + // Ftruncate calls that don't actually grow the memfd are allowed. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize / 2), SyscallSucceeds()); + + // After shrinking, growing back is not allowed. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM)); +} + +// F_SEAL_GROW prevents a memfd from being grown using the write syscall. +TEST(MemfdTest, SealGrowWithWrite) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + + // Initially, writing to the memfd succeeds. 
+ const std::vector buf(kPageSize); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Apply F_SEAL_GROW, subsequent writes which extend the memfd should fail. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); + + // However, zero-length writes are ok since they don't grow the memfd. + EXPECT_THAT(write(memfd.get(), buf.data(), 0), SyscallSucceeds()); + + // Writing to existing parts of the memfd is also ok. + ASSERT_THAT(lseek(memfd.get(), 0, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Returning the end of the file and writing still not allowed. + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); +} + +// F_SEAL_GROW causes writes which partially extend off the current EOF to +// partially succeed, up to the page containing the EOF. +TEST(MemfdTest, SealGrowPartialWriteTruncated) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + + // FD offset: 1 page, EOF: 1 page. + + ASSERT_THAT(lseek(memfd.get(), kPageSize * 3 / 4, SEEK_SET), + SyscallSucceeds()); + + // FD offset: 3/4 page. Writing a full page now should only write 1/4 page + // worth of data. This partially succeeds because the first page is entirely + // within the file and requires no growth, but attempting to write the final + // 3/4 page would require growing the file. 
+ const std::vector buf(kPageSize); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize / 4)); +} + +// F_SEAL_GROW causes writes which partially extend off the current EOF to fail +// in its entirety if the only data written would be to the page containing the +// EOF. +TEST(MemfdTest, SealGrowPartialWriteTruncatedSamePage) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 3 / 4), SyscallSucceeds()); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + + // EOF: 3/4 page, writing 1/2 page starting at 1/2 page would cause the file + // to grow. Since this would require only the page containing the EOF to be + // modified, the write is rejected entirely. + const std::vector buf(kPageSize / 2); + EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size(), kPageSize / 2), + SyscallFailsWithErrno(EPERM)); + + // However, writing up to EOF is fine. + EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size() / 2, kPageSize / 2), + SyscallSucceedsWithValue(kPageSize / 4)); +} + +// F_SEAL_SHRINK prevents a memfd from being shrunk using ftruncate. +TEST(MemfdTest, SealShrink) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_SHRINK), + SyscallSucceeds()); + + // Shrink by half a page. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize / 2), + SyscallFailsWithErrno(EPERM)); + + // Ftruncate calls that don't actually shrink the file are allowed. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 2), SyscallSucceeds()); + + // After growing, shrinking is still not allowed. 
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM)); +} + +// F_SEAL_WRITE prevents a memfd from being written to through a write +// syscall. +TEST(MemfdTest, SealWriteWithWrite) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const std::vector buf(kPageSize); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + + // Attemping to write at the end of the file fails. + EXPECT_THAT(write(memfd.get(), buf.data(), 1), SyscallFailsWithErrno(EPERM)); + + // Attemping to overwrite an existing part of the memfd fails. + EXPECT_THAT(pwrite(memfd.get(), buf.data(), 1, 0), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size() / 2, kPageSize / 2), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size(), kPageSize / 2), + SyscallFailsWithErrno(EPERM)); + + // Zero-length writes however do not fail. + EXPECT_THAT(write(memfd.get(), buf.data(), 0), SyscallSucceeds()); +} + +// F_SEAL_WRITE prevents a memfd from being written to through an mmap. +TEST(MemfdTest, SealWriteWithMmap) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const std::vector buf(kPageSize); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + + // Can't create a shared mapping with writes sealed. + void* ret = mmap(nullptr, kPageSize, PROT_WRITE, MAP_SHARED, memfd.get(), 0); + EXPECT_EQ(ret, MAP_FAILED); + EXPECT_EQ(errno, EPERM); + ret = mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, memfd.get(), 0); + EXPECT_EQ(ret, MAP_FAILED); + EXPECT_EQ(errno, EPERM); + + // However, private mappings are ok. 
+ EXPECT_NO_ERRNO(Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + memfd.get(), 0)); +} + +// Adding F_SEAL_WRITE fails when there are outstanding writable mappings to a +// memfd. +TEST(MemfdTest, SealWriteWithOutstandingWritbleMapping) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const std::vector buf(kPageSize); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Attempting to add F_SEAL_WRITE with active shared mapping with any set of + // permissions fails. + + // Read-only shared mapping. + { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, memfd.get(), 0)); + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallFailsWithErrno(EBUSY)); + } + + // Write-only shared mapping. + { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallFailsWithErrno(EBUSY)); + } + + // Read-write shared mapping. + { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + memfd.get(), 0)); + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallFailsWithErrno(EBUSY)); + } + + // F_SEAL_WRITE can be set with private mappings with any permissions. + { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + memfd.get(), 0)); + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallSucceeds()); + } +} + +// When applying F_SEAL_WRITE fails due to outstanding writable mappings, any +// additional seals passed to the same add seal call are also rejected. 
+TEST(MemfdTest, NoPartialSealApplicationWhenWriteSealRejected) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + + // Try add some seals along with F_SEAL_WRITE. The seal application should + // fail since there exists an active shared mapping. + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE | F_SEAL_GROW), + SyscallFailsWithErrno(EBUSY)); + + // None of the seals should be applied. + EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), SyscallSucceedsWithValue(0)); +} + +// Seals are inode level properties, and apply to all file descriptors referring +// to a memfd. +TEST(MemfdTest, SealsAreInodeLevelProperties) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const FileDescriptor memfd2 = ASSERT_NO_ERRNO_AND_VALUE(memfd.Dup()); + + // Add seal through the original memfd, and verify that it appears on the + // dupped fd. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + EXPECT_THAT(fcntl(memfd2.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE)); + + // Verify the seal actually applies to both fds. + std::vector buf(kPageSize); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(write(memfd2.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); + + // Seals are enforced on new FDs that are dupped after the seal is already + // applied. + const FileDescriptor memfd3 = ASSERT_NO_ERRNO_AND_VALUE(memfd2.Dup()); + EXPECT_THAT(write(memfd3.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); + + // Try a new seal applied to one of the dupped fds. 
+ ASSERT_THAT(fcntl(memfd3.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + EXPECT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(ftruncate(memfd2.get(), kPageSize), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(ftruncate(memfd3.get(), kPageSize), SyscallFailsWithErrno(EPERM)); +} + +PosixErrorOr IsTmpfs(const std::string& path) { + struct statfs stat; + if (statfs(path.c_str(), &stat)) { + if (errno == ENOENT) { + // Nothing at path, don't raise this as an error. Instead, just report no + // tmpfs at path. + return false; + } + return PosixError(errno, + absl::StrFormat("statfs(\"%s\", %#p)", path, &stat)); + } + return stat.f_type == TMPFS_MAGIC; +} + +// Tmpfs files also support seals, but are created with F_SEAL_SEAL. +TEST(MemfdTest, TmpfsFilesHaveSealSeal) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs("/tmp"))); + const TempPath tmpfs_file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn("/tmp")); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfs_file.path(), O_RDWR, 0644)); + EXPECT_THAT(fcntl(fd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_SEAL)); +} + +// Can open a memfd from procfs and use as normal. +TEST(MemfdTest, CanOpenFromProcfs) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + + // Write a random page of data to the memfd via write(2). + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Read back the same data from the fd obtained from procfs and verify. 
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(absl::StrFormat("/proc/self/fd/%d", memfd.get()), O_RDWR)); + std::vector buf2(kPageSize); + EXPECT_THAT(pread(fd.get(), buf2.data(), buf2.size(), 0), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(buf, buf2); +} + +// Test that memfd permissions are set up correctly to allow another process to +// open it from procfs. +TEST(MemfdTest, OtherProcessCanOpenFromProcfs) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + pid_t pid = getpid(); + const auto rest = [&] { + ASSERT_NO_ERRNO( + Open(absl::StrFormat("/proc/self/%d/%d", pid, memfd.get()), O_RDWR)); + }; + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +// Test that only files opened as writable can have seals applied to them. +// Normally there's no way to specify file permissions on memfds, but we can +// obtain a read-only memfd by opening the corresponding procfs fd entry as +// read-only. +TEST(MemfdTest, MemfdMustBeWritableToModifySeals) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + + // Initially adding a seal works. + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + + // Re-open the memfd as read-only from procfs. + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(absl::StrFormat("/proc/self/fd/%d", memfd.get()), O_RDONLY)); + + // Can't add seals through an unwritable fd. + EXPECT_THAT(fcntl(fd.get(), F_ADD_SEALS, F_SEAL_GROW), + SyscallFailsWithErrno(EPERM)); +} + +// Test that the memfd implementation internally tracks potentially writable +// maps correctly. +TEST(MemfdTest, MultipleWritableAndNonWritableRefsToSameFileRegion) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + + // Populate with a random page of data. 
+ std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Read-only map to the page. This should cause an initial mapping to be + // created. + Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ, MAP_PRIVATE, memfd.get(), 0)); + + // Create a shared writable map to the page. This should cause the internal + // mapping to become potentially writable. + Mapping m2 = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + + // Drop the read-only mapping first. If writable-ness isn't tracked correctly, + // this can cause some misaccounting, which can trigger asserts internally. + m1.reset(); + m2.reset(); +} + +} // namespace +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From 26583e413e40666f70170025cd4c5224f45fdfa3 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 27 Mar 2019 10:45:17 -0700 Subject: Convert []byte to string without copying in usermem.CopyStringIn. This is the same technique used by Go's strings.Builder (https://golang.org/src/strings/builder.go#L45), and for the same reason. (We can't just use strings.Builder because there's no way to get the underlying []byte to pass to usermem.IO.CopyIn.) 
PiperOrigin-RevId: 240594892 Change-Id: Ic070e7e480aee53a71289c7c120850991358c52c --- pkg/sentry/usermem/BUILD | 1 + pkg/sentry/usermem/usermem.go | 14 +++++++++----- pkg/sentry/usermem/usermem_unsafe.go | 27 +++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 pkg/sentry/usermem/usermem_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index 1a560b6f3..e38b31b08 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -25,6 +25,7 @@ go_library( "bytes_io_unsafe.go", "usermem.go", "usermem_arm64.go", + "usermem_unsafe.go", "usermem_x86.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index c3c9c153b..99766a803 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -37,6 +37,8 @@ type IO interface { // // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or // any following locks in the lock order. + // + // Postconditions: CopyOut does not retain src. CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. @@ -45,6 +47,8 @@ type IO interface { // // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or // any following locks in the lock order. + // + // Postconditions: CopyIn does not retain dst. CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number @@ -237,7 +241,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt if !ok { // Last page of kernel memory. The application can't use this // anyway. - return string(buf[:done]), syserror.EFAULT + return stringFromImmutableBytes(buf[:done]), syserror.EFAULT } // Read up to copyStringIncrement bytes at a time. 
readlen := copyStringIncrement @@ -246,7 +250,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt } end, ok := start.AddLength(uint64(readlen)) if !ok { - return string(buf[:done]), syserror.EFAULT + return stringFromImmutableBytes(buf[:done]), syserror.EFAULT } // Shorten the read to avoid crossing page boundaries, since faulting // in a page unnecessarily is expensive. This also ensures that partial @@ -259,15 +263,15 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt // hitting err. for i, c := range buf[done : done+n] { if c == 0 { - return string(buf[:done+i]), nil + return stringFromImmutableBytes(buf[:done+i]), nil } } done += n if err != nil { - return string(buf[:done]), err + return stringFromImmutableBytes(buf[:done]), err } } - return string(buf), syserror.ENAMETOOLONG + return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG } // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The diff --git a/pkg/sentry/usermem/usermem_unsafe.go b/pkg/sentry/usermem/usermem_unsafe.go new file mode 100644 index 000000000..3895e7871 --- /dev/null +++ b/pkg/sentry/usermem/usermem_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package usermem + +import ( + "unsafe" +) + +// stringFromImmutableBytes is equivalent to string(bs), except that it never +// copies even if escape analysis can't prove that bs does not escape. This is +// only valid if bs is never mutated after stringFromImmutableBytes returns. +func stringFromImmutableBytes(bs []byte) string { + // Compare strings.Builder.String(). + return *(*string)(unsafe.Pointer(&bs)) +} -- cgit v1.2.3 From 645af7cdd8a183ce80218b1ad275001084c133ce Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 27 Mar 2019 11:07:41 -0700 Subject: Dev device methods should take pointer receiver. PiperOrigin-RevId: 240600504 Change-Id: I7dd5f27c8da31f24b68b48acdf8f1c19dbd0c32d --- pkg/sentry/fs/dev/dev.go | 2 +- pkg/sentry/fs/dev/full.go | 2 +- pkg/sentry/fs/dev/random.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 2ec4c9bff..e1eaa08cb 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -117,6 +117,6 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn type readZeros struct{} // Read implements fs.FileOperations.Read. -func (readZeros) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { +func (*readZeros) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { return dst.ZeroOut(ctx, math.MaxInt64) } diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index cbdd40161..0cb513004 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -73,6 +73,6 @@ type fullFileOperations struct { var _ fs.FileOperations = (*fullFileOperations)(nil) // Write implements FileOperations.Write. 
-func (fullFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { +func (*fullFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { return 0, syserror.ENOSPC } diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 837b7793a..b9b78db7a 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -51,7 +51,7 @@ func newRandomDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMod } // GetFile implements fs.InodeOperations.GetFile. -func (randomDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { +func (*randomDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { return fs.NewFile(ctx, dirent, flags, &randomFileOperations{}), nil } @@ -71,6 +71,6 @@ type randomFileOperations struct { var _ fs.FileOperations = (*randomFileOperations)(nil) // Read implements fs.FileOperations.Read. -func (randomFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { +func (*randomFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) } -- cgit v1.2.3 From 2d355f0e8fb8f7e72e6448fd4fcc4e79cdb6ba72 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 27 Mar 2019 12:40:18 -0700 Subject: Add start time to /proc//stat. The start time is the number of clock ticks between the boot time and application start time. 
PiperOrigin-RevId: 240619475 Change-Id: Ic8bd7a73e36627ed563988864b0c551c052492a5 --- pkg/sentry/fs/proc/task.go | 9 ++++++++- test/syscalls/linux/proc.cc | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 70578d3fa..335003dd8 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -461,7 +461,14 @@ func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) fmt.Fprintf(&buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) fmt.Fprintf(&buf, "%d %d ", s.t.Priority(), s.t.Niceness()) fmt.Fprintf(&buf, "%d ", s.t.ThreadGroup().Count()) - fmt.Fprintf(&buf, "0 0 " /* itrealvalue starttime */) + + // itrealvalue. Since kernel 2.6.17, this field is no longer + // maintained, and is hard coded as 0. + fmt.Fprintf(&buf, "0 ") + + // Start time is relative to boot time, expressed in clock ticks. + fmt.Fprintf(&buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + var vss, rss uint64 s.t.WithMuLocked(func(t *kernel.Task) { if mm := t.MemoryManager(); mm != nil { diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 0da682e7b..64d75d9fb 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -880,6 +880,14 @@ TEST_P(ProcPidStatTest, HasBasicFields) { EXPECT_EQ("R", fields[2]); // task state EXPECT_EQ(absl::StrCat(getppid()), fields[3]); + // If the test starts up quickly, then the process start time and the kernel + // boot time will be very close, and the proc starttime field (which is the + // delta of the two times) will be 0. For that unfortunate reason, we can + // only check that starttime >= 0, and not that it is strictly > 0. 
+ uint64_t starttime; + ASSERT_TRUE(absl::SimpleAtoi(fields[21], &starttime)); + EXPECT_GE(starttime, 0); + uint64_t vss; ASSERT_TRUE(absl::SimpleAtoi(fields[22], &vss)); EXPECT_GT(vss, 0); -- cgit v1.2.3 From 9c188978870051f0b42ceb1a3f16320286936976 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 27 Mar 2019 17:43:30 -0700 Subject: Add rsslim field in /proc/pid/stat. PiperOrigin-RevId: 240681675 Change-Id: Ib214106e303669fca2d5c744ed5c18e835775161 --- pkg/sentry/fs/proc/BUILD | 1 + pkg/sentry/fs/proc/task.go | 7 ++++++- test/syscalls/linux/proc.cc | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 666b0ab3a..3aa70a28e 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -42,6 +42,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/kdefs", "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", "//pkg/sentry/mm", "//pkg/sentry/socket/rpcinet", "//pkg/sentry/socket/unix", diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 335003dd8..5a90c5578 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -477,7 +478,11 @@ func (s *taskStatData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) } }) fmt.Fprintf(&buf, "%d %d ", vss, rss/usermem.PageSize) - fmt.Fprintf(&buf, "0 0 0 0 0 0 " /* rsslim startcode endcode startstack kstkesp kstkeip */) + + // rsslim. 
+ fmt.Fprintf(&buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) + + fmt.Fprintf(&buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) fmt.Fprintf(&buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) fmt.Fprintf(&buf, "0 0 " /* nswap cnswap */) terminationSignal := linux.Signal(0) diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 64d75d9fb..5b52ed71a 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -895,6 +895,10 @@ TEST_P(ProcPidStatTest, HasBasicFields) { uint64_t rss; ASSERT_TRUE(absl::SimpleAtoi(fields[23], &rss)); EXPECT_GT(rss, 0); + + uint64_t rsslim; + ASSERT_TRUE(absl::SimpleAtoi(fields[24], &rsslim)); + EXPECT_GT(rsslim, 0); } INSTANTIATE_TEST_CASE_P(SelfAndNumericPid, ProcPidStatTest, -- cgit v1.2.3 From f005350c93cb9e2a247b0d8a061e52f3160d36d4 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 28 Mar 2019 11:42:38 -0700 Subject: Clean up gofer handle caching. - Document fsutil.CachedFileObject.FD() requirements on access permissions, and change gofer.inodeFileState.FD() to honor them. Fixes #147. - Combine gofer.inodeFileState.readonly and gofer.inodeFileState.readthrough, and simplify handle caching logic. - Inline gofer.cachePolicy.cacheHandles into gofer.inodeFileState.setSharedHandles, because users with access to gofer.inodeFileState don't necessarily have access to the fs.Inode (predictably, this is a save/restore problem). 
Before this CL: $ docker run --runtime=runsc-d -v $(pwd)/gvisor/repro:/root/repro -it ubuntu bash root@34d51017ed67:/# /root/repro/runsc-b147 mmap: 0x7f3c01e45000 Segmentation fault After this CL: $ docker run --runtime=runsc-d -v $(pwd)/gvisor/repro:/root/repro -it ubuntu bash root@d3c3cb56bbf9:/# /root/repro/runsc-b147 mmap: 0x7f78987ec000 o PiperOrigin-RevId: 240818413 Change-Id: I49e1d4a81a0cb9177832b0a9f31a10da722a896b --- pkg/sentry/fs/fsutil/inode_cached.go | 11 +- pkg/sentry/fs/gofer/cache_policy.go | 12 --- pkg/sentry/fs/gofer/file_state.go | 3 +- pkg/sentry/fs/gofer/inode.go | 197 ++++++++++++++--------------------- pkg/sentry/fs/gofer/path.go | 6 +- test/syscalls/linux/mmap.cc | 23 ++++ 6 files changed, 118 insertions(+), 134 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 6ca51ab0d..b690cfe93 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -138,8 +138,15 @@ type CachedFileObject interface { // Sync instructs the remote filesystem to sync the file to stable storage. Sync(ctx context.Context) error - // FD returns a host file descriptor. Return value must be -1 or not -1 - // for the lifetime of the CachedFileObject. + // FD returns a host file descriptor. If it is possible for + // CachingInodeOperations.AddMapping to have ever been called with writable + // = true, the FD must have been opened O_RDWR; otherwise, it may have been + // opened O_RDONLY or O_RDWR. (mmap unconditionally requires that mapped + // files are readable.) If no host file descriptor is available, FD returns + // a negative number. + // + // For any given CachedFileObject, if FD() ever succeeds (returns a + // non-negative number), it must always succeed. // // FD is called iff the file has been memory mapped. This implies that // the file was opened (see fs.InodeOperations.GetFile). 
diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 507d6900f..d7fbb71b7 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -103,18 +103,6 @@ func (cp cachePolicy) useCachingInodeOps(inode *fs.Inode) bool { return cp == cacheAll || cp == cacheAllWritethrough } -// cacheHandles determine whether handles need to be cached with the given -// inode. Handles must be cached when inode can be mapped into memory to -// implement InodeOperations.Mappable with stable handles. -func (cp cachePolicy) cacheHandles(inode *fs.Inode) bool { - // Do cached IO for regular files only. Some "character devices" expect - // no caching. - if !fs.IsFile(inode.StableAttr) { - return false - } - return cp.useCachingInodeOps(inode) || cp == cacheRemoteRevalidating -} - // writeThough indicates whether writes to the file should be synced to the // gofer immediately. func (cp cachePolicy) writeThrough(inode *fs.Inode) bool { diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index dd4f817bf..f770ca4ea 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -29,11 +29,10 @@ func (f *fileOperations) afterLoad() { // Manually load the open handles. var err error // TODO: Context is not plumbed to save/restore. - f.handles, err = newHandles(context.Background(), f.inodeOperations.fileState.file, f.flags) + f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), f.flags) if err != nil { return fmt.Errorf("failed to re-open handle: %v", err) } - f.inodeOperations.fileState.setHandlesForCachedIO(f.flags, f.handles) return nil } fs.Async(fs.CatchError(load)) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 83fff7517..29af1010c 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -94,21 +94,23 @@ type inodeFileState struct { // handlesMu protects the below fields. 
handlesMu sync.RWMutex `state:"nosave"` - // Do minimal open handle caching: only for read only filesystems. - readonly *handles `state:"nosave"` - - // Maintain readthrough handles for populating page caches. - readthrough *handles `state:"nosave"` - - // Maintain writeback handles for syncing from page caches. - writeback *handles `state:"nosave"` - - // writebackRW indicates whether writeback is opened read-write. If - // it is not and a read-write handle could replace writeback (above), - // then writeback is replaced with the read-write handle. This - // ensures that files that were first opened write-only and then - // later are opened read-write to be mapped can in fact be mapped. - writebackRW bool + // If readHandles is non-nil, it holds handles that are either read-only or + // read/write. If writeHandles is non-nil, it holds write-only handles if + // writeHandlesRW is false, and read/write handles if writeHandlesRW is + // true. + // + // Once readHandles becomes non-nil, it can't be changed until + // inodeFileState.Release(), because of a defect in the + // fsutil.CachedFileObject interface: there's no way for the caller of + // fsutil.CachedFileObject.FD() to keep the returned FD open, so if we + // racily replace readHandles after inodeFileState.FD() has returned + // readHandles.Host.FD(), fsutil.CachingInodeOperations may use a closed + // FD. writeHandles can be changed if writeHandlesRW is false, since + // inodeFileState.FD() can't return a write-only FD, but can't be changed + // if writeHandlesRW is true for the same reason. + readHandles *handles `state:"nosave"` + writeHandles *handles `state:"nosave"` + writeHandlesRW bool `state:"nosave"` // loading is acquired when the inodeFileState begins an asynchronous // load. It releases when the load is complete. Callers that require all @@ -134,81 +136,82 @@ type inodeFileState struct { // Release releases file handles. 
func (i *inodeFileState) Release(ctx context.Context) { i.file.close(ctx) - if i.readonly != nil { - i.readonly.DecRef() - } - if i.readthrough != nil { - i.readthrough.DecRef() + if i.readHandles != nil { + i.readHandles.DecRef() } - if i.writeback != nil { - i.writeback.DecRef() + if i.writeHandles != nil { + i.writeHandles.DecRef() } } -// setHandlesForCachedIO installs file handles for reading and writing -// through fs.CachingInodeOperations. -func (i *inodeFileState) setHandlesForCachedIO(flags fs.FileFlags, h *handles) { - i.handlesMu.Lock() - defer i.handlesMu.Unlock() +func (i *inodeFileState) canShareHandles() bool { + // Only share handles for regular files, since for other file types, + // distinct handles may have special semantics even if they represent the + // same file. Disable handle sharing for cache policy cacheNone, since this + // is legacy behavior. + return fs.IsFile(i.sattr) && i.s.cachePolicy != cacheNone +} - if flags.Read { - if i.readthrough == nil { - h.IncRef() - i.readthrough = h - } +// Preconditions: i.handlesMu must be locked for writing. +func (i *inodeFileState) setSharedHandlesLocked(flags fs.FileFlags, h *handles) { + if flags.Read && i.readHandles == nil { + h.IncRef() + i.readHandles = h } if flags.Write { - if i.writeback == nil { + if i.writeHandles == nil { h.IncRef() - i.writeback = h - } else if !i.writebackRW && flags.Read { - i.writeback.DecRef() + i.writeHandles = h + i.writeHandlesRW = flags.Read + } else if !i.writeHandlesRW && flags.Read { + // Upgrade i.writeHandles. + i.writeHandles.DecRef() h.IncRef() - i.writeback = h - } - if flags.Read { - i.writebackRW = true + i.writeHandles = h + i.writeHandlesRW = flags.Read } } } -// getCachedHandles returns any cached handles which would accelerate -// performance generally. These handles should only be used if the mount -// supports caching. This is distinct from fs.CachingInodeOperations -// which is used for a limited set of file types (those that can be mapped). 
-func (i *inodeFileState) getCachedHandles(ctx context.Context, flags fs.FileFlags, msrc *fs.MountSource) (*handles, bool) { +// getHandles returns a set of handles for a new file using i opened with the +// given flags. +func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags) (*handles, error) { + if !i.canShareHandles() { + return newHandles(ctx, i.file, flags) + } i.handlesMu.Lock() defer i.handlesMu.Unlock() - - if flags.Read && !flags.Write && msrc.Flags.ReadOnly { - if i.readonly != nil { - i.readonly.IncRef() - return i.readonly, true - } - h, err := newHandles(ctx, i.file, flags) - if err != nil { - return nil, false + // Do we already have usable shared handles? + if flags.Write { + if i.writeHandles != nil && (i.writeHandlesRW || !flags.Read) { + i.writeHandles.IncRef() + return i.writeHandles, nil } - i.readonly = h - i.readonly.IncRef() - return i.readonly, true + } else if i.readHandles != nil { + i.readHandles.IncRef() + return i.readHandles, nil } - - return nil, false + // No; get new handles and cache them for future sharing. + h, err := newHandles(ctx, i.file, flags) + if err != nil { + return nil, err + } + i.setSharedHandlesLocked(flags, h) + return h, nil } // ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { i.handlesMu.RLock() defer i.handlesMu.RUnlock() - return i.readthrough.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts) + return i.readHandles.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts) } // WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. 
func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { i.handlesMu.RLock() defer i.handlesMu.RUnlock() - return i.writeback.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs) + return i.writeHandles.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs) } // SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. @@ -276,52 +279,31 @@ func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool { i.handlesMu.RLock() defer i.handlesMu.RUnlock() - return (i.readonly != nil && i.readonly.Host != nil) || - (i.readthrough != nil && i.readthrough.Host != nil) || - (i.writeback != nil && i.writeback.Host != nil) + return (i.readHandles != nil && i.readHandles.Host != nil) || + (i.writeHandles != nil && i.writeHandles.Host != nil) } // Sync implements fsutil.CachedFileObject.Sync. func (i *inodeFileState) Sync(ctx context.Context) error { i.handlesMu.RLock() defer i.handlesMu.RUnlock() - if i.writeback == nil { + if i.writeHandles == nil { return nil } - return i.writeback.File.fsync(ctx) + return i.writeHandles.File.fsync(ctx) } // FD implements fsutil.CachedFileObject.FD. -// -// FD meets the requirements of fsutil.CachedFileObject.FD because p9.File.Open -// returns a host file descriptor to back _both_ readthrough and writeback or -// not at all (e.g. both are nil). func (i *inodeFileState) FD() int { i.handlesMu.RLock() defer i.handlesMu.RUnlock() - return i.fdLocked() -} - -func (i *inodeFileState) fdLocked() int { - // Assert that the file was actually opened. - if i.writeback == nil && i.readthrough == nil { - panic("cannot get host FD for a file that was never opened") + if i.writeHandlesRW && i.writeHandles != nil && i.writeHandles.Host != nil { + return int(i.writeHandles.Host.FD()) } - // If this file is mapped, then it must have been opened - // read-write and i.writeback was upgraded to a read-write - // handle. Prefer that to map. 
- if i.writeback != nil { - if i.writeback.Host == nil { - return -1 - } - return int(i.writeback.Host.FD()) - } - // Otherwise the file may only have been opened readable - // so far. That's the only way it can be accessed. - if i.readthrough.Host == nil { - return -1 + if i.readHandles != nil && i.readHandles.Host != nil { + return int(i.readHandles.Host.FD()) } - return int(i.readthrough.Host.FD()) + return -1 } // waitForLoad makes sure any restore-issued loading is done. @@ -409,18 +391,15 @@ func (i *inodeOperations) getFileSocket(ctx context.Context, d *fs.Dirent, flags } func (i *inodeOperations) getFilePipe(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - // Try to open as a host pipe. - if pipeOps, err := fdpipe.Open(ctx, i, flags); err != errNotHostFile { - return fs.NewFile(ctx, d, flags, pipeOps), err + // Try to open as a host pipe; if that doesn't work, handle it normally. + pipeOps, err := fdpipe.Open(ctx, i, flags) + if err == errNotHostFile { + return i.getFileDefault(ctx, d, flags) } - - // If the error is due to the fact that this was never a host pipe, then back - // this file with its dirent. - h, err := newHandles(ctx, i.fileState.file, flags) if err != nil { return nil, err } - return NewFile(ctx, d, d.BaseName(), flags, i, h), nil + return fs.NewFile(ctx, d, flags, pipeOps), nil } // errNotHostFile indicates that the file is not a host file. 
@@ -454,24 +433,10 @@ func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (* } func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - if !i.session().cachePolicy.cacheHandles(d.Inode) { - h, err := newHandles(ctx, i.fileState.file, flags) - if err != nil { - return nil, err - } - return NewFile(ctx, d, d.BaseName(), flags, i, h), nil - } - - h, ok := i.fileState.getCachedHandles(ctx, flags, d.Inode.MountSource) - if !ok { - var err error - h, err = newHandles(ctx, i.fileState.file, flags) - if err != nil { - return nil, err - } + h, err := i.fileState.getHandles(ctx, flags) + if err != nil { + return nil, err } - i.fileState.setHandlesForCachedIO(flags, h) - return NewFile(ctx, d, d.BaseName(), flags, i, h), nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 2ba400836..5e1a8b623 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -129,8 +129,10 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string File: newFile, Host: hostFile, } - if iops.session().cachePolicy.cacheHandles(d.Inode) { - iops.fileState.setHandlesForCachedIO(flags, h) + if iops.fileState.canShareHandles() { + iops.fileState.handlesMu.Lock() + iops.fileState.setSharedHandlesLocked(flags, h) + iops.fileState.handlesMu.Unlock() } return NewFile(ctx, d, name, flags, iops, h), nil } diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index afe060d33..b500e79a4 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -1696,6 +1696,29 @@ TEST(MMapDeathTest, TruncateAfterCOWBreak) { ::testing::KilledBySignal(SIGBUS), ""); } +// Regression test for #147. 
+TEST(MMapNoFixtureTest, MapReadOnlyAfterCreateWriteOnly) { + std::string filename = NewTempAbsPath(); + + // We have to create the file O_RDONLY to reproduce the bug because + // fsgofer.localFile.Create() silently upgrades O_WRONLY to O_RDWR, causing + // the cached "write-only" FD to be read/write and therefore usable by mmap(). + auto const ro_fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(filename, O_RDONLY | O_CREAT | O_EXCL, 0666)); + + // Get a write-only FD for the same file, which should be ignored by mmap() + // (but isn't in #147). + auto const wo_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_WRONLY)); + ASSERT_THAT(ftruncate(wo_fd.get(), kPageSize), SyscallSucceeds()); + + auto const mapping = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, ro_fd.get(), 0)); + std::vector buf(kPageSize); + // The test passes if this survives. + std::copy(static_cast(mapping.ptr()), + static_cast(mapping.endptr()), buf.data()); +} + // Conditional on MAP_32BIT. #ifdef __x86_64__ -- cgit v1.2.3 From e373d3642e95768229f0414fe164beeb13170817 Mon Sep 17 00:00:00 2001 From: Googler Date: Thu, 28 Mar 2019 13:42:48 -0700 Subject: Internal change. PiperOrigin-RevId: 240842801 Change-Id: Ibbd6f849f9613edc1b1dd7a99a97d1ecdb6e9188 --- pkg/sentry/fs/proc/uid_gid_map.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index d6e278f79..a52e0cb1f 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -134,10 +134,23 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u if _, err := src.CopyIn(ctx, b); err != nil { return 0, err } - lines := bytes.SplitN(bytes.TrimSpace(b), []byte("\n"), maxIDMapLines+1) + + // Truncate from the first NULL byte. + var nul int64 + nul = int64(bytes.IndexByte(b, 0)) + if nul == -1 { + nul = srclen + } + b = b[:nul] + // Remove the last \n. 
+ if nul >= 1 && b[nul-1] == '\n' { + b = b[:nul-1] + } + lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) if len(lines) > maxIDMapLines { return 0, syserror.EINVAL } + entries := make([]auth.IDMapEntry, len(lines)) for i, l := range lines { var e auth.IDMapEntry -- cgit v1.2.3 From f2e5dcf21c270d5d56da63e03ed204845e192e56 Mon Sep 17 00:00:00 2001 From: Bert Muthalaly Date: Thu, 28 Mar 2019 14:08:11 -0700 Subject: Add ICMP stats PiperOrigin-RevId: 240848882 Change-Id: I23dd4599f073263437aeab357c3f767e1a432b82 --- pkg/sentry/socket/epsocket/epsocket.go | 66 +++++++++ pkg/tcpip/network/ip_test.go | 15 +- pkg/tcpip/network/ipv4/icmp.go | 45 +++++- pkg/tcpip/network/ipv6/icmp.go | 58 +++++++- pkg/tcpip/network/ipv6/icmp_test.go | 251 +++++++++++++++++++++++++-------- pkg/tcpip/tcpip.go | 169 ++++++++++++++++++++-- 6 files changed, 519 insertions(+), 85 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index e74bd1bdd..e170da169 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -66,6 +66,72 @@ var Metrics = tcpip.Stats{ UnknownProtocolRcvdPackets: mustCreateMetric("/netstack/unknown_protocol_received_packets", "Number of packets received by netstack that were for an unknown or unsupported protocol."), MalformedRcvdPackets: mustCreateMetric("/netstack/malformed_received_packets", "Number of packets received by netstack that were deemed malformed."), DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped by netstack due to full queues."), + ICMP: tcpip.ICMPStats{ + V4PacketsSent: tcpip.ICMPv4SentPacketStats{ + ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ + Echo: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo", "Total number of ICMPv4 echo packets sent by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Total number of ICMPv4 echo reply packets sent by 
netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Total number of ICMPv4 destination unreachable packets sent by netstack."), + SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Total number of ICMPv4 source quench packets sent by netstack."), + Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Total number of ICMPv4 redirect packets sent by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Total number of ICMPv4 time exceeded packets sent by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Total number of ICMPv4 parameter problem packets sent by netstack."), + Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Total number of ICMPv4 timestamp packets sent by netstack."), + TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Total number of ICMPv4 timestamp reply packets sent by netstack."), + InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Total number of ICMPv4 information request packets sent by netstack."), + InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Total number of ICMPv4 information reply packets sent by netstack."), + }, + Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Total number of ICMPv4 packets dropped by netstack due to link layer errors."), + }, + V4PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ + ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ + Echo: mustCreateMetric("/netstack/icmp/v4/packets_received/echo", "Total number of ICMPv4 echo packets received by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Total number of ICMPv4 echo reply packets received by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Total number of ICMPv4 
destination unreachable packets received by netstack."), + SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Total number of ICMPv4 source quench packets received by netstack."), + Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Total number of ICMPv4 redirect packets received by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Total number of ICMPv4 time exceeded packets received by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Total number of ICMPv4 parameter problem packets received by netstack."), + Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Total number of ICMPv4 timestamp packets received by netstack."), + TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Total number of ICMPv4 timestamp reply packets received by netstack."), + InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Total number of ICMPv4 information request packets received by netstack."), + InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Total number of ICMPv4 information reply packets received by netstack."), + }, + Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Total number of ICMPv4 packets received that the transport layer could not parse."), + }, + V6PacketsSent: tcpip.ICMPv6SentPacketStats{ + ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ + EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Total number of ICMPv6 echo request packets sent by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Total number of ICMPv6 echo reply packets sent by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Total number of ICMPv6 destination unreachable packets sent by netstack."), + 
PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Total number of ICMPv6 packet too big packets sent by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Total number of ICMPv6 time exceeded packets sent by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Total number of ICMPv6 parameter problem packets sent by netstack."), + RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Total number of ICMPv6 router solicit packets sent by netstack."), + RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Total number of ICMPv6 router advert packets sent by netstack."), + NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets sent by netstack."), + NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Total number of ICMPv6 neighbor advert packets sent by netstack."), + RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Total number of ICMPv6 redirect message packets sent by netstack."), + }, + Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Total number of ICMPv6 packets dropped by netstack due to link layer errors."), + }, + V6PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ + ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ + EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Total number of ICMPv6 echo request packets received by netstack."), + EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Total number of ICMPv6 echo reply packets received by netstack."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Total number of ICMPv6 destination unreachable packets received by netstack."), + PacketTooBig: 
mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Total number of ICMPv6 packet too big packets received by netstack."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Total number of ICMPv6 time exceeded packets received by netstack."), + ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Total number of ICMPv6 parameter problem packets received by netstack."), + RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Total number of ICMPv6 router solicit packets received by netstack."), + RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Total number of ICMPv6 router advert packets received by netstack."), + NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Total number of ICMPv6 neighbor solicit packets received by netstack."), + NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Total number of ICMPv6 neighbor advert packets received by netstack."), + RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Total number of ICMPv6 redirect message packets received by netstack."), + }, + Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Total number of ICMPv6 packets received that the transport layer could not parse."), + }, + }, IP: tcpip.IPStats{ PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Total number of IP packets received from the link layer in nic.DeliverNetworkPacket."), InvalidAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Total number of IP packets received with an unknown or invalid destination address."), diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index d79eba4b0..522009fac 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -287,9 +287,9 @@ func 
TestIPv4ReceiveControl(t *testing.T) { {"Non-zero fragment offset", 0, 100, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0}, {"Zero-length packet", 0, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv4MinimumSize + header.ICMPv4DstUnreachableMinimumSize + 8}, } - r := stack.Route{ - LocalAddress: localIpv4Addr, - RemoteAddress: "\x0a\x00\x00\xbb", + r, err := buildIPv4Route(localIpv4Addr, "\x0a\x00\x00\xbb") + if err != nil { + t.Fatal(err) } for _, c := range cases { t.Run(c.name, func(t *testing.T) { @@ -521,9 +521,12 @@ func TestIPv6ReceiveControl(t *testing.T) { {"Non-zero fragment offset", 0, newUint16(100), header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 0}, {"Zero-length packet", 0, nil, header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv6MinimumSize + header.ICMPv6DstUnreachableMinimumSize + 8}, } - r := stack.Route{ - LocalAddress: localIpv6Addr, - RemoteAddress: "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa", + r, err := buildIPv6Route( + localIpv6Addr, + "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa", + ) + if err != nil { + t.Fatal(err) } for _, c := range cases { t.Run(c.name, func(t *testing.T) { diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index a9650de03..ed9a4eee5 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -55,15 +55,21 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer. } func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) { + stats := r.Stats() + received := stats.ICMP.V4PacketsReceived v := vv.First() if len(v) < header.ICMPv4MinimumSize { + received.Invalid.Increment() return } h := header.ICMPv4(v) + // TODO: Meaningfully handle all ICMP types. 
switch h.Type() { case header.ICMPv4Echo: + received.Echo.Increment() if len(v) < header.ICMPv4EchoMinimumSize { + received.Invalid.Increment() return } // It's possible that a raw socket expects to receive this. @@ -76,16 +82,25 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V copy(pkt, h) pkt.SetType(header.ICMPv4EchoReply) pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0))) - r.WritePacket(nil /* gso */, hdr, vv, header.ICMPv4ProtocolNumber, r.DefaultTTL()) + sent := stats.ICMP.V4PacketsSent + if err := r.WritePacket(nil /* gso */, hdr, vv, header.ICMPv4ProtocolNumber, r.DefaultTTL()); err != nil { + sent.Dropped.Increment() + return + } + sent.EchoReply.Increment() case header.ICMPv4EchoReply: + received.EchoReply.Increment() if len(v) < header.ICMPv4EchoMinimumSize { + received.Invalid.Increment() return } e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, netHeader, vv) case header.ICMPv4DstUnreachable: + received.DstUnreachable.Increment() if len(v) < header.ICMPv4DstUnreachableMinimumSize { + received.Invalid.Increment() return } vv.TrimFront(header.ICMPv4DstUnreachableMinimumSize) @@ -97,6 +112,32 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V mtu := uint32(binary.BigEndian.Uint16(v[header.ICMPv4DstUnreachableMinimumSize-2:])) e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv) } + + case header.ICMPv4SrcQuench: + received.SrcQuench.Increment() + + case header.ICMPv4Redirect: + received.Redirect.Increment() + + case header.ICMPv4TimeExceeded: + received.TimeExceeded.Increment() + + case header.ICMPv4ParamProblem: + received.ParamProblem.Increment() + + case header.ICMPv4Timestamp: + received.Timestamp.Increment() + + case header.ICMPv4TimestampReply: + received.TimestampReply.Increment() + + case header.ICMPv4InfoRequest: + received.InfoRequest.Increment() + + case header.ICMPv4InfoReply: + received.InfoReply.Increment() + + default: + 
received.Invalid.Increment() } - // TODO: Handle other ICMP types. } diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 36d98caef..3210e6fc7 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -63,15 +63,22 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer. } func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.VectorisedView) { + stats := r.Stats().ICMP + sent := stats.V6PacketsSent + received := stats.V6PacketsReceived v := vv.First() if len(v) < header.ICMPv6MinimumSize { + received.Invalid.Increment() return } h := header.ICMPv6(v) + // TODO: Meaningfully handle all ICMP types. switch h.Type() { case header.ICMPv6PacketTooBig: + received.PacketTooBig.Increment() if len(v) < header.ICMPv6PacketTooBigMinimumSize { + received.Invalid.Increment() return } vv.TrimFront(header.ICMPv6PacketTooBigMinimumSize) @@ -79,7 +86,9 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv) case header.ICMPv6DstUnreachable: + received.DstUnreachable.Increment() if len(v) < header.ICMPv6DstUnreachableMinimumSize { + received.Invalid.Increment() return } vv.TrimFront(header.ICMPv6DstUnreachableMinimumSize) @@ -89,15 +98,21 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V } case header.ICMPv6NeighborSolicit: + received.NeighborSolicit.Increment() + + e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress) + if len(v) < header.ICMPv6NeighborSolicitMinimumSize { + received.Invalid.Increment() return } - targetAddr := tcpip.Address(v[8 : 8+16]) + targetAddr := tcpip.Address(v[8:][:16]) if e.linkAddrCache.CheckLocalAddress(e.nicid, ProtocolNumber, targetAddr) == 0 { // We don't have a useful answer; the best we can do is ignore the request. 
return } - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) + + hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertSize) pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) pkt.SetType(header.ICMPv6NeighborAdvert) pkt[icmpV6FlagOffset] = ndpSolicitedFlag | ndpOverrideFlag @@ -118,22 +133,29 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V defer r.Release() r.LocalAddress = targetAddr pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) - r.WritePacket(nil /* gso */, hdr, buffer.VectorisedView{}, header.ICMPv6ProtocolNumber, r.DefaultTTL()) - e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress) + if err := r.WritePacket(nil /* gso */, hdr, buffer.VectorisedView{}, header.ICMPv6ProtocolNumber, r.DefaultTTL()); err != nil { + sent.Dropped.Increment() + return + } + sent.NeighborAdvert.Increment() case header.ICMPv6NeighborAdvert: + received.NeighborAdvert.Increment() if len(v) < header.ICMPv6NeighborAdvertSize { + received.Invalid.Increment() return } - targetAddr := tcpip.Address(v[8 : 8+16]) + targetAddr := tcpip.Address(v[8:][:16]) e.linkAddrCache.AddLinkAddress(e.nicid, targetAddr, r.RemoteLinkAddress) if targetAddr != r.RemoteAddress { e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress) } case header.ICMPv6EchoRequest: + received.EchoRequest.Increment() if len(v) < header.ICMPv6EchoMinimumSize { + received.Invalid.Increment() return } @@ -143,14 +165,37 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V copy(pkt, h) pkt.SetType(header.ICMPv6EchoReply) pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, vv)) - r.WritePacket(nil /* gso */, hdr, vv, header.ICMPv6ProtocolNumber, r.DefaultTTL()) + if err := r.WritePacket(nil /* gso */, hdr, vv, header.ICMPv6ProtocolNumber, 
r.DefaultTTL()); err != nil { + sent.Dropped.Increment() + return + } + sent.EchoReply.Increment() case header.ICMPv6EchoReply: + received.EchoReply.Increment() if len(v) < header.ICMPv6EchoMinimumSize { + received.Invalid.Increment() return } e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, netHeader, vv) + case header.ICMPv6TimeExceeded: + received.TimeExceeded.Increment() + + case header.ICMPv6ParamProblem: + received.ParamProblem.Increment() + + case header.ICMPv6RouterSolicit: + received.RouterSolicit.Increment() + + case header.ICMPv6RouterAdvert: + received.RouterAdvert.Increment() + + case header.ICMPv6RedirectMsg: + received.RedirectMsg.Increment() + + default: + received.Invalid.Increment() } } @@ -202,6 +247,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack. DstAddr: r.RemoteAddress, }) + // TODO: count this in ICMP stats. return linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber) } diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index eee09f3af..8b57a0641 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -15,9 +15,10 @@ package ipv6 import ( + "fmt" + "reflect" "strings" "testing" - "time" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" @@ -39,20 +40,151 @@ var ( lladdr1 = header.LinkLocalAddr(linkAddr1) ) -type icmpInfo struct { - typ header.ICMPv6Type - src tcpip.Address +type stubLinkEndpoint struct { + stack.LinkEndpoint +} + +func (*stubLinkEndpoint) Capabilities() stack.LinkEndpointCapabilities { + return 0 +} + +func (*stubLinkEndpoint) MaxHeaderLength() uint16 { + return 0 +} + +func (*stubLinkEndpoint) LinkAddress() tcpip.LinkAddress { + return "" +} + +func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, buffer.Prependable, buffer.VectorisedView, tcpip.NetworkProtocolNumber) *tcpip.Error { + return nil +} + +func 
(*stubLinkEndpoint) Attach(stack.NetworkDispatcher) {} + +type stubDispatcher struct { + stack.TransportDispatcher +} + +func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, buffer.View, buffer.VectorisedView) { +} + +type stubLinkAddressCache struct { + stack.LinkAddressCache +} + +func (*stubLinkAddressCache) CheckLocalAddress(tcpip.NICID, tcpip.NetworkProtocolNumber, tcpip.Address) tcpip.NICID { + return 0 +} + +func (*stubLinkAddressCache) AddLinkAddress(tcpip.NICID, tcpip.Address, tcpip.LinkAddress) { +} + +func TestICMPCounts(t *testing.T) { + s := stack.New([]string{ProtocolName}, []string{icmp.ProtocolName6}, stack.Options{}) + { + id := stack.RegisterLinkEndpoint(&stubLinkEndpoint{}) + if err := s.CreateNIC(1, id); err != nil { + t.Fatalf("CreateNIC(_) = %s", err) + } + if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil { + t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err) + } + } + s.SetRouteTable( + []tcpip.Route{{ + Destination: lladdr1, + Mask: tcpip.AddressMask(strings.Repeat("\xff", 16)), + NIC: 1, + }}, + ) + + ep, err := s.NetworkProtocolInstance(ProtocolNumber).NewEndpoint(0, lladdr1, &stubLinkAddressCache{}, &stubDispatcher{}, nil) + if err != nil { + t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err) + } + + r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err) + } + defer r.Release() + + types := []struct { + typ header.ICMPv6Type + size int + }{ + {header.ICMPv6DstUnreachable, header.ICMPv6DstUnreachableMinimumSize}, + {header.ICMPv6PacketTooBig, header.ICMPv6PacketTooBigMinimumSize}, + {header.ICMPv6TimeExceeded, header.ICMPv6MinimumSize}, + {header.ICMPv6ParamProblem, header.ICMPv6MinimumSize}, + {header.ICMPv6EchoRequest, header.ICMPv6EchoMinimumSize}, + {header.ICMPv6EchoReply, header.ICMPv6EchoMinimumSize}, + {header.ICMPv6RouterSolicit, header.ICMPv6MinimumSize}, 
+ {header.ICMPv6RouterAdvert, header.ICMPv6MinimumSize}, + {header.ICMPv6NeighborSolicit, header.ICMPv6NeighborSolicitMinimumSize}, + {header.ICMPv6NeighborAdvert, header.ICMPv6NeighborAdvertSize}, + {header.ICMPv6RedirectMsg, header.ICMPv6MinimumSize}, + } + + handleIPv6Payload := func(hdr buffer.Prependable) { + payloadLength := hdr.UsedLength() + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLength), + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: r.DefaultTTL(), + SrcAddr: r.LocalAddress, + DstAddr: r.RemoteAddress, + }) + ep.HandlePacket(&r, hdr.View().ToVectorisedView()) + } + + for _, typ := range types { + hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size) + pkt := header.ICMPv6(hdr.Prepend(typ.size)) + pkt.SetType(typ.typ) + pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + + handleIPv6Payload(hdr) + } + + // Construct an empty ICMP packet so that + // Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented. 
+ handleIPv6Payload(buffer.NewPrependable(header.IPv6MinimumSize)) + + icmpv6Stats := s.Stats().ICMP.V6PacketsReceived + visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) { + if got, want := s.Value(), uint64(1); got != want { + t.Errorf("got %s = %d, want = %d", name, got, want) + } + }) + if t.Failed() { + t.Logf("stats:\n%+v", s.Stats()) + } +} + +func visitStats(v reflect.Value, f func(string, *tcpip.StatCounter)) { + t := v.Type() + for i := 0; i < v.NumField(); i++ { + v := v.Field(i) + switch v.Kind() { + case reflect.Ptr: + f(t.Field(i).Name, v.Interface().(*tcpip.StatCounter)) + case reflect.Struct: + visitStats(v, f) + default: + panic(fmt.Sprintf("unexpected type %s", v.Type())) + } + } } type testContext struct { - t *testing.T s0 *stack.Stack s1 *stack.Stack linkEP0 *channel.Endpoint linkEP1 *channel.Endpoint - - icmpCh chan icmpInfo } type endpointWithResolutionCapability struct { @@ -65,10 +197,8 @@ func (e endpointWithResolutionCapability) Capabilities() stack.LinkEndpointCapab func newTestContext(t *testing.T) *testContext { c := &testContext{ - t: t, - s0: stack.New([]string{ProtocolName}, []string{icmp.ProtocolName6}, stack.Options{}), - s1: stack.New([]string{ProtocolName}, []string{icmp.ProtocolName6}, stack.Options{}), - icmpCh: make(chan icmpInfo, 10), + s0: stack.New([]string{ProtocolName}, []string{icmp.ProtocolName6}, stack.Options{}), + s1: stack.New([]string{ProtocolName}, []string{icmp.ProtocolName6}, stack.Options{}), } const defaultMTU = 65536 @@ -118,50 +248,58 @@ func newTestContext(t *testing.T) *testContext { }}, ) - go c.routePackets(linkEP0.C, linkEP1) - go c.routePackets(linkEP1.C, linkEP0) - return c } -func (c *testContext) countPacket(pkt channel.PacketInfo) { +func (c *testContext) cleanup() { + close(c.linkEP0.C) + close(c.linkEP1.C) +} + +type routeArgs struct { + src, dst *channel.Endpoint + typ header.ICMPv6Type +} + +func routeICMPv6Packet(t *testing.T, args routeArgs, fn 
func(*testing.T, header.ICMPv6)) { + t.Helper() + + pkt := <-args.src.C + + { + views := []buffer.View{pkt.Header, pkt.Payload} + size := len(pkt.Header) + len(pkt.Payload) + vv := buffer.NewVectorisedView(size, views) + args.dst.InjectLinkAddr(pkt.Proto, args.dst.LinkAddress(), vv) + } + if pkt.Proto != ProtocolNumber { + t.Errorf("unexpected protocol number %d", pkt.Proto) return } ipv6 := header.IPv6(pkt.Header) transProto := tcpip.TransportProtocolNumber(ipv6.NextHeader()) if transProto != header.ICMPv6ProtocolNumber { + t.Errorf("unexpected transport protocol number %d", transProto) return } - b := pkt.Header[header.IPv6MinimumSize:] - icmp := header.ICMPv6(b) - c.icmpCh <- icmpInfo{ - typ: icmp.Type(), - src: ipv6.SourceAddress(), + icmpv6 := header.ICMPv6(ipv6.Payload()) + if got, want := icmpv6.Type(), args.typ; got != want { + t.Errorf("got ICMPv6 type = %d, want = %d", got, want) + return } -} - -func (c *testContext) routePackets(ch <-chan channel.PacketInfo, ep *channel.Endpoint) { - for pkt := range ch { - c.countPacket(pkt) - views := []buffer.View{pkt.Header, pkt.Payload} - size := len(pkt.Header) + len(pkt.Payload) - vv := buffer.NewVectorisedView(size, views) - ep.InjectLinkAddr(pkt.Proto, ep.LinkAddress(), vv) + if fn != nil { + fn(t, icmpv6) } } -func (c *testContext) cleanup() { - close(c.linkEP0.C) - close(c.linkEP1.C) -} - func TestLinkResolution(t *testing.T) { c := newTestContext(t) defer c.cleanup() + r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */) if err != nil { - t.Fatal(err) + t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err) } defer r.Release() @@ -176,14 +314,24 @@ func TestLinkResolution(t *testing.T) { var wq waiter.Queue ep, err := c.s0.NewEndpoint(header.ICMPv6ProtocolNumber, ProtocolNumber, &wq) if err != nil { - t.Fatal(err) + t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err) } for { _, resCh, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: 1, Addr: 
lladdr1}}) if resCh != nil { if err != tcpip.ErrNoLinkAddress { - t.Fatalf("ep.Write(_) = _, , %s want _, , tcpip.ErrNoLinkAddress", err) + t.Fatalf("ep.Write(_) = _, , %s, want = _, , tcpip.ErrNoLinkAddress", err) + } + for _, args := range []routeArgs{ + {src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit}, + {src: c.linkEP1, dst: c.linkEP0, typ: header.ICMPv6NeighborAdvert}, + } { + routeICMPv6Packet(t, args, func(t *testing.T, icmpv6 header.ICMPv6) { + if got, want := tcpip.Address(icmpv6[8:][:16]), lladdr1; got != want { + t.Errorf("%d: got target = %s, want = %s", icmpv6.Type(), got, want) + } + }) } <-resCh continue @@ -194,29 +342,10 @@ func TestLinkResolution(t *testing.T) { break } - stats := make(map[header.ICMPv6Type]int) - for { - // This actually takes about 10 milliseconds, so no need to wait for - // a multi-minute go test timeout if something is broken. - select { - case <-time.After(2 * time.Second): - t.Errorf("timeout waiting for ICMP, got: %#+v", stats) - return - case icmpInfo := <-c.icmpCh: - switch icmpInfo.typ { - case header.ICMPv6NeighborAdvert: - if got, want := icmpInfo.src, lladdr1; got != want { - t.Errorf("got ICMPv6NeighborAdvert.sourceAddress = %v, want = %v", got, want) - } - } - stats[icmpInfo.typ]++ - - if stats[header.ICMPv6NeighborSolicit] > 0 && - stats[header.ICMPv6NeighborAdvert] > 0 && - stats[header.ICMPv6EchoRequest] > 0 && - stats[header.ICMPv6EchoReply] > 0 { - return - } - } + for _, args := range []routeArgs{ + {src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6EchoRequest}, + {src: c.linkEP1, dst: c.linkEP0, typ: header.ICMPv6EchoReply}, + } { + routeICMPv6Packet(t, args, nil) } } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 825854148..e9f73635f 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -581,10 +581,156 @@ func (s *StatCounter) String() string { return strconv.FormatUint(s.Value(), 10) } +// ICMPv4PacketStats enumerates counts for all ICMPv4 packet types. 
+type ICMPv4PacketStats struct { + // Echo is the total number of ICMPv4 echo packets counted. + Echo *StatCounter + + // EchoReply is the total number of ICMPv4 echo reply packets counted. + EchoReply *StatCounter + + // DstUnreachable is the total number of ICMPv4 destination unreachable + // packets counted. + DstUnreachable *StatCounter + + // SrcQuench is the total number of ICMPv4 source quench packets + // counted. + SrcQuench *StatCounter + + // Redirect is the total number of ICMPv4 redirect packets counted. + Redirect *StatCounter + + // TimeExceeded is the total number of ICMPv4 time exceeded packets + // counted. + TimeExceeded *StatCounter + + // ParamProblem is the total number of ICMPv4 parameter problem packets + // counted. + ParamProblem *StatCounter + + // Timestamp is the total number of ICMPv4 timestamp packets counted. + Timestamp *StatCounter + + // TimestampReply is the total number of ICMPv4 timestamp reply packets + // counted. + TimestampReply *StatCounter + + // InfoRequest is the total number of ICMPv4 information request + // packets counted. + InfoRequest *StatCounter + + // InfoReply is the total number of ICMPv4 information reply packets + // counted. + InfoReply *StatCounter +} + +// ICMPv6PacketStats enumerates counts for all ICMPv6 packet types. +type ICMPv6PacketStats struct { + // EchoRequest is the total number of ICMPv6 echo request packets + // counted. + EchoRequest *StatCounter + + // EchoReply is the total number of ICMPv6 echo reply packets counted. + EchoReply *StatCounter + + // DstUnreachable is the total number of ICMPv6 destination unreachable + // packets counted. + DstUnreachable *StatCounter + + // PacketTooBig is the total number of ICMPv6 packet too big packets + // counted. + PacketTooBig *StatCounter + + // TimeExceeded is the total number of ICMPv6 time exceeded packets + // counted. + TimeExceeded *StatCounter + + // ParamProblem is the total number of ICMPv6 parameter problem packets + // counted. 
+ ParamProblem *StatCounter + + // RouterSolicit is the total number of ICMPv6 router solicit packets + // counted. + RouterSolicit *StatCounter + + // RouterAdvert is the total number of ICMPv6 router advert packets + // counted. + RouterAdvert *StatCounter + + // NeighborSolicit is the total number of ICMPv6 neighbor solicit + // packets counted. + NeighborSolicit *StatCounter + + // NeighborAdvert is the total number of ICMPv6 neighbor advert packets + // counted. + NeighborAdvert *StatCounter + + // RedirectMsg is the total number of ICMPv6 redirect message packets + // counted. + RedirectMsg *StatCounter +} + +// ICMPv4SentPacketStats collects outbound ICMPv4-specific stats. +type ICMPv4SentPacketStats struct { + ICMPv4PacketStats + + // Dropped is the total number of ICMPv4 packets dropped due to link + // layer errors. + Dropped *StatCounter +} + +// ICMPv4ReceivedPacketStats collects inbound ICMPv4-specific stats. +type ICMPv4ReceivedPacketStats struct { + ICMPv4PacketStats + + // Invalid is the total number of ICMPv4 packets received that the + // transport layer could not parse. + Invalid *StatCounter +} + +// ICMPv6SentPacketStats collects outbound ICMPv6-specific stats. +type ICMPv6SentPacketStats struct { + ICMPv6PacketStats + + // Dropped is the total number of ICMPv6 packets dropped due to link + // layer errors. + Dropped *StatCounter +} + +// ICMPv6ReceivedPacketStats collects inbound ICMPv6-specific stats. +type ICMPv6ReceivedPacketStats struct { + ICMPv6PacketStats + + // Invalid is the total number of ICMPv6 packets received that the + // transport layer could not parse. + Invalid *StatCounter +} + +// ICMPStats collects ICMP-specific stats (both v4 and v6). +type ICMPStats struct { + // ICMPv4SentPacketStats contains counts of sent packets by ICMPv4 packet type + // and a single count of packets which failed to write to the link + // layer. 
+ V4PacketsSent ICMPv4SentPacketStats + + // ICMPv4ReceivedPacketStats contains counts of received packets by ICMPv4 + // packet type and a single count of invalid packets received. + V4PacketsReceived ICMPv4ReceivedPacketStats + + // ICMPv6SentPacketStats contains counts of sent packets by ICMPv6 packet type + // and a single count of packets which failed to write to the link + // layer. + V6PacketsSent ICMPv6SentPacketStats + + // ICMPv6ReceivedPacketStats contains counts of received packets by ICMPv6 + // packet type and a single count of invalid packets received. + V6PacketsReceived ICMPv6ReceivedPacketStats +} + // IPStats collects IP-specific stats (both v4 and v6). type IPStats struct { - // PacketsReceived is the total number of IP packets received from the link - // layer in nic.DeliverNetworkPacket. + // PacketsReceived is the total number of IP packets received from the + // link layer in nic.DeliverNetworkPacket. PacketsReceived *StatCounter // InvalidAddressesReceived is the total number of IP packets received @@ -605,8 +751,8 @@ type IPStats struct { // TCPStats collects TCP-specific stats. type TCPStats struct { - // ActiveConnectionOpenings is the number of connections opened successfully - // via Connect. + // ActiveConnectionOpenings is the number of connections opened + // successfully via Connect. ActiveConnectionOpenings *StatCounter // PassiveConnectionOpenings is the number of connections opened @@ -617,8 +763,8 @@ type TCPStats struct { // (active and passive openings, respectively) that end in an error. FailedConnectionAttempts *StatCounter - // ValidSegmentsReceived is the number of TCP segments received that the - // transport layer successfully parsed. + // ValidSegmentsReceived is the number of TCP segments received that + // the transport layer successfully parsed. 
ValidSegmentsReceived *StatCounter // InvalidSegmentsReceived is the number of TCP segments received that @@ -694,6 +840,9 @@ type Stats struct { // DroppedPackets is the number of packets dropped due to full queues. DroppedPackets *StatCounter + // ICMP breaks out ICMP-specific stats (both v4 and v6). + ICMP ICMPStats + // IP breaks out IP-specific stats (both v4 and v6). IP IPStats @@ -709,13 +858,13 @@ func fillIn(v reflect.Value) { v := v.Field(i) switch v.Kind() { case reflect.Ptr: - if s, ok := v.Addr().Interface().(**StatCounter); ok { - if *s == nil { - *s = &StatCounter{} - } + if s := v.Addr().Interface().(**StatCounter); *s == nil { + *s = &StatCounter{} } case reflect.Struct: fillIn(v) + default: + panic(fmt.Sprintf("unexpected type %s", v.Type())) } } } -- cgit v1.2.3 From 99195b0e166536dd81c7096c0dc0f2cc527f1553 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 28 Mar 2019 14:14:13 -0700 Subject: Setting timestamps should trigger an inotify event. PiperOrigin-RevId: 240850187 Change-Id: I1458581b771a1031e47bba439e480829794927b8 --- pkg/sentry/syscalls/linux/sys_file.go | 8 +++++++- test/syscalls/linux/inotify.cc | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 3193718b5..5a874d935 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1698,7 +1698,13 @@ func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, r } } - return d.Inode.SetTimestamps(t, d, ts) + if err := d.Inode.SetTimestamps(t, d, ts); err != nil { + return err + } + + // File attribute changed, generate notification. 
+ d.InotifyEvent(linux.IN_ATTRIB, 0) + return nil } // From utimes.c: diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 88997094c..b99d339e5 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1232,6 +1233,26 @@ TEST(Inotify, LinkGeneratesAttribAndCreateEvents) { Event(IN_CREATE, root_wd, Basename(link1.path()))})); } +TEST(Inotify, UtimesGeneratesAttribEvent) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + struct timeval times[2] = {{1, 0}, {2, 0}}; + EXPECT_THAT(futimes(file1_fd.get(), times), SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file1.path()))})); +} + TEST(Inotify, HardlinksReuseSameWatch) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); TempPath file1 = -- cgit v1.2.3 From 31c2236e97db6a57ca9b6ab3771876cd2231fd85 Mon Sep 17 00:00:00 2001 From: "chris.zn" Date: Thu, 28 Mar 2019 18:04:32 -0700 Subject: set task's name when fork When fork a child process, the name filed of TaskContext is not set. It results in that when we cat /proc/{pid}/status, the name filed is null. 
Like this: Name: State: S (sleeping) Tgid: 28 Pid: 28 PPid: 26 TracerPid: 0 FDSize: 8 VmSize: 89712 kB VmRSS: 6648 kB Threads: 1 CapInh: 00000000a93d35fb CapPrm: 0000000000000000 CapEff: 0000000000000000 CapBnd: 00000000a93d35fb Seccomp: 0 Change-Id: I5d469098c37cedd19da16b7ffab2e546a28a321e PiperOrigin-RevId: 240893304 --- pkg/sentry/kernel/task_context.go | 1 + test/syscalls/linux/prctl.cc | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index d1c82f2aa..1b4d4cf2f 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -74,6 +74,7 @@ func (tc *TaskContext) release() { // of the original's. func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) { newTC := &TaskContext{ + Name: tc.Name, Arch: tc.Arch.Fork(), st: tc.st, } diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc index 44f3df6a3..854dec714 100644 --- a/test/syscalls/linux/prctl.cc +++ b/test/syscalls/linux/prctl.cc @@ -52,6 +52,30 @@ TEST(PrctlTest, SetNameLongName) { ASSERT_EQ(long_name.substr(0, truncated_length), std::string(truncated_name)); } +TEST(PrctlTest, ChildProcessName) { + constexpr size_t kMaxNameLength = 15; + + char parent_name[kMaxNameLength + 1] = {}; + memset(parent_name, 'a', kMaxNameLength); + + ASSERT_THAT(prctl(PR_SET_NAME, parent_name), SyscallSucceeds()); + + pid_t child_pid = fork(); + TEST_PCHECK(child_pid >= 0); + if (child_pid == 0) { + char child_name[kMaxNameLength + 1] = {}; + TEST_PCHECK(prctl(PR_GET_NAME, child_name) >= 0); + TEST_CHECK(memcmp(parent_name, child_name, sizeof(parent_name)) == 0); + _exit(0); + } + + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << "status =" << status; +} + // Offset added to exit code from test child 
to distinguish from other abnormal // exits. constexpr int kPrctlNoNewPrivsTestChildExitBase = 100; -- cgit v1.2.3 From ed23f547093e705ba3d6f82b2ce49592180f9a5a Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 29 Mar 2019 12:25:17 -0700 Subject: Treat ENOSPC as a state-file error during save. PiperOrigin-RevId: 241028806 Change-Id: I770bf751a2740869a93c3ab50370a727ae580470 --- pkg/sentry/kernel/kernel.go | 5 +++++ pkg/sentry/state/BUILD | 1 + pkg/sentry/state/state.go | 14 ++++++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 3533fd8f7..d9f3f4e24 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -365,6 +365,11 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { name, _ := desc.file.Dirent.FullName(nil /* root */) + // Wrapping this error not only allows + // for a more useful message, but is + // required to distinguish Fsync errors + // from state file errors in + // state.Save. 
return fmt.Errorf("%q was not sufficiently synced: %v", name, err) } } diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 69385e23c..cee18f681 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -18,5 +18,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/watchdog", "//pkg/state/statefile", + "//pkg/syserror", ], ) diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 67db78a56..224f8b709 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -24,18 +24,20 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" "gvisor.googlesource.com/gvisor/pkg/state/statefile" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) var previousMetadata map[string]string -// ErrStateFile is returned when the state file cannot be opened. +// ErrStateFile is returned when an error is encountered writing the statefile +// (which may occur during open or close calls in addition to write). type ErrStateFile struct { err error } // Error implements error.Error(). func (e ErrStateFile) Error() string { - return fmt.Sprintf("failed to open statefile: %v", e.err) + return fmt.Sprintf("statefile error: %v", e.err) } // SaveOpts contains save-related options. @@ -76,6 +78,14 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { } else { // Save the kernel. err = k.SaveTo(wc) + + // ENOSPC is a state file error. This error can only come from + // writing the state file, and not from fs.FileOperations.Fsync + // because we wrap those in kernel.TaskSet.flushWritesToFiles. + if err == syserror.ENOSPC { + err = ErrStateFile{err} + } + if closeErr := wc.Close(); err == nil && closeErr != nil { err = ErrStateFile{closeErr} } -- cgit v1.2.3 From 69afd0438e3213b8bf6d74bdf9c288772f81e834 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 29 Mar 2019 13:15:49 -0700 Subject: Return srclen in proc.idMapFileOperations.Write. 
PiperOrigin-RevId: 241037926 Change-Id: I4b0381ac1c7575e8b861291b068d3da22bc03850 --- pkg/sentry/fs/proc/uid_gid_map.go | 5 ++++- test/syscalls/linux/proc_pid_uid_gid_map.cc | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index a52e0cb1f..0c68bbfc9 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -169,5 +169,8 @@ func (imfo *idMapFileOperations) Write(ctx context.Context, file *fs.File, src u if err != nil { return 0, err } - return int64(len(b)), nil + + // On success, Linux's kernel/user_namespace.c:map_write() always returns + // count, even if fewer bytes were used. + return int64(srclen), nil } diff --git a/test/syscalls/linux/proc_pid_uid_gid_map.cc b/test/syscalls/linux/proc_pid_uid_gid_map.cc index bf0f8b2bb..e6a5265fa 100644 --- a/test/syscalls/linux/proc_pid_uid_gid_map.cc +++ b/test/syscalls/linux/proc_pid_uid_gid_map.cc @@ -129,6 +129,23 @@ TEST_P(ProcSelfUidGidMapTest, IdentityMapOwnID) { IsPosixErrorOkAndHolds(0)); } +TEST_P(ProcSelfUidGidMapTest, TrailingNewlineAndNULIgnored) { + // This is identical to IdentityMapOwnID, except that a trailing newline, NUL, + // and an invalid (incomplete) map entry are appended to the valid entry. The + // newline should be accepted, and everything after the NUL should be ignored. + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); + uint32_t id = CurrentID(); + std::string line = absl::StrCat(id, " ", id, " 1\n\0 4 3"); + EXPECT_THAT( + InNewUserNamespaceWithMapFD([&](int fd) { + DenySelfSetgroups(); + // The write should return the full size of the write, even though + // characters after the NUL were ignored. 
+ TEST_PCHECK(write(fd, line.c_str(), line.size()) == line.size()); + }), + IsPosixErrorOkAndHolds(0)); +} + TEST_P(ProcSelfUidGidMapTest, NonIdentityMapOwnID) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace())); SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveSetIDCapability())); -- cgit v1.2.3 From d11ef20a936536aec811aec4e156366c5aeaac47 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 29 Mar 2019 13:16:49 -0700 Subject: Drop reference on shared anon mappable We call NewSharedAnonMappable simply to use it for Mappable/MappingIdentity for shared anon mmap. From MMapOpts.MappingIdentity: "If MMapOpts is used to successfully create a memory mapping, a reference is taken on MappingIdentity." mm.createVMALocked (below) takes this additional reference, so we don't need the reference returned by NewSharedAnonMappable. Holding it leaks the mappable. PiperOrigin-RevId: 241038108 Change-Id: I78ee3af78e0cc7aac4063b274b30d0e41eb5677d --- pkg/sentry/mm/syscalls.go | 1 + 1 file changed, 1 insertion(+) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 3725c98aa..f8f095fed 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -101,6 +101,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme if err != nil { return 0, err } + defer m.DecRef() opts.MappingIdentity = m opts.Mappable = m } -- cgit v1.2.3 From e8fef3d873e4564f0979303d134478b11def8349 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 29 Mar 2019 14:47:16 -0700 Subject: Treat fsync errors during save as SaveRejection errors. 
PiperOrigin-RevId: 241055485 Change-Id: I70259e9fef59bdf9733b35a2cd3319359449dd45 --- pkg/sentry/kernel/kernel.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index d9f3f4e24..f5cbd6c23 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -365,12 +365,15 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { name, _ := desc.file.Dirent.FullName(nil /* root */) - // Wrapping this error not only allows - // for a more useful message, but is - // required to distinguish Fsync errors - // from state file errors in + // Wrap this error in ErrSaveRejection + // so that it will trigger a save + // error, rather than a panic. This + // also allows us to distinguish Fsync + // errors from state file errors in // state.Save. - return fmt.Errorf("%q was not sufficiently synced: %v", name, err) + return fs.ErrSaveRejection{ + Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), + } } } } -- cgit v1.2.3 From 26e8d9981fcf6d08199a9fd9c609d9715c3cf37e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 29 Mar 2019 16:24:29 -0700 Subject: Use kernel.Task.CopyScratchBuffer in syscalls/linux where possible. 
PiperOrigin-RevId: 241072126 Change-Id: Ib4d9f58f550732ac4c5153d3cf159a5b1a9749da --- pkg/sentry/kernel/task.go | 9 ++++----- pkg/sentry/syscalls/linux/sys_prctl.go | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 2 +- pkg/sentry/syscalls/linux/sys_stat.go | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index f958aba26..9c365e781 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -550,17 +550,16 @@ func (t *Task) afterLoad() { t.futexWaiter = futex.NewWaiter() } -// copyScratchBufferLen is the length of the copyScratchBuffer field of the Task -// struct. -const copyScratchBufferLen = 52 +// copyScratchBufferLen is the length of Task.copyScratchBuffer. +const copyScratchBufferLen = 144 // sizeof(struct stat) // CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut // functions. It must only be used within those functions and can only be used // by the task goroutine; it exists to improve performance and thus // intentionally lacks any synchronization. // -// Callers should pass a constant value as an argument, which will allow the -// compiler to inline and optimize out the if statement below. +// Callers should pass a constant value as an argument if possible, which will +// allow the compiler to inline and optimize out the if statement below. 
func (t *Task) CopyScratchBuffer(size int) []byte { if size > copyScratchBufferLen { return make([]byte, size) diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 4938f27bd..7a29bd9b7 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -75,7 +75,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.PR_GET_NAME: addr := args[1].Pointer() - buf := make([]byte, linux.TASK_COMM_LEN) + buf := t.CopyScratchBuffer(linux.TASK_COMM_LEN) len := copy(buf, t.Name()) if len < linux.TASK_COMM_LEN { buf[len] = 0 diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 564357bac..49e6f4aeb 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -516,7 +516,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if optLen > maxOptLen { return 0, nil, syscall.EINVAL } - buf := make([]byte, optLen) + buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyIn(optValAddr, &buf); err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 8d6a8f616..bdfb9b3ef 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -133,7 +133,7 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err // common syscall for many applications, and t.CopyObjectOut has // noticeable performance impact due to its many slice allocations and // use of reflection. 
- b := make([]byte, 0, linux.SizeOfStat) + b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] // Dev (uint64) b = binary.AppendUint64(b, usermem.ByteOrder, uint64(d.Inode.StableAttr.DeviceID)) -- cgit v1.2.3 From a4b34e26372528ef60140acef0b7c1ab1934f82a Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 1 Apr 2019 12:52:19 -0700 Subject: gvisor: convert ilist to ilist:generic_list ilist:generic_list works faster (cl/240185278) and the code looks cleaner without type casting. PiperOrigin-RevId: 241381175 Change-Id: I8487ab1d73637b3e9733c253c56dce9e79f0d35f --- pkg/sentry/fs/BUILD | 14 +++++++++++++- pkg/sentry/fs/inotify.go | 14 +++++--------- pkg/sentry/fs/inotify_event.go | 3 +-- pkg/sentry/kernel/pipe/BUILD | 15 ++++++++++++++- pkg/sentry/kernel/pipe/buffers.go | 6 +----- pkg/sentry/kernel/pipe/pipe.go | 8 +++----- pkg/waiter/BUILD | 27 +++++++++++++++++++++++++-- pkg/waiter/waiter.go | 12 ++++-------- 8 files changed, 66 insertions(+), 33 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 6957c1bbe..dda6a0c9f 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -14,6 +14,7 @@ go_library( "dirent_cache.go", "dirent_list.go", "dirent_state.go", + "event_list.go", "file.go", "file_operations.go", "file_overlay.go", @@ -46,7 +47,6 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", - "//pkg/ilist", "//pkg/log", "//pkg/metric", "//pkg/p9", @@ -83,6 +83,18 @@ go_template_instance( }, ) +go_template_instance( + name = "event_list", + out = "event_list.go", + package = "fs", + prefix = "event", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Event", + "Element": "*Event", + }, +) + go_test( name = "fs_x_test", size = "small", diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 51ece5ed0..5d6a7074b 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -19,7 +19,6 @@ import ( "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/abi/linux" - 
"gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -51,7 +50,7 @@ type Inotify struct { evMu sync.Mutex `state:"nosave"` // A list of pending events for this inotify instance. Protected by evMu. - events ilist.List + events eventList // A scratch buffer, use to serialize inotify events. Use allocate this // ahead of time and reuse performance. Protected by evMu. @@ -143,9 +142,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i } var writeLen int64 - for e := i.events.Front(); e != nil; e = e.Next() { - event := e.(*Event) - + for event := i.events.Front(); event != nil; event = event.Next() { // Does the buffer have enough remaining space to hold the event we're // about to write out? if dst.NumBytes() < int64(event.sizeOf()) { @@ -160,7 +157,7 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i // Linux always dequeues an available event as long as there's enough // buffer space to copy it out, even if the copy below fails. Emulate // this behaviour. - i.events.Remove(e) + i.events.Remove(event) // Buffer has enough space, copy event to the read buffer. n, err := event.CopyTo(ctx, i.scratch, dst) @@ -197,8 +194,7 @@ func (i *Inotify) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArg defer i.evMu.Unlock() var n uint32 for e := i.events.Front(); e != nil; e = e.Next() { - event := e.(*Event) - n += uint32(event.sizeOf()) + n += uint32(e.sizeOf()) } var buf [4]byte usermem.ByteOrder.PutUint32(buf[:], n) @@ -216,7 +212,7 @@ func (i *Inotify) queueEvent(ev *Event) { // Check if we should coalesce the event we're about to queue with the last // one currently in the queue. Events are coalesced if they are identical. 
if last := i.events.Back(); last != nil { - if ev.equals(last.(*Event)) { + if ev.equals(last) { // "Coalesce" the two events by simply not queuing the new one. We // don't need to raise a waiter.EventIn notification because no new // data is available for reading. diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index 9e3e9d816..f09928b68 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -18,7 +18,6 @@ import ( "bytes" "fmt" - "gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -31,7 +30,7 @@ const inotifyEventBaseSize = 16 // // +stateify savable type Event struct { - ilist.Entry + eventEntry wd int32 mask uint32 diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 011a3f349..6b23117d9 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,10 +1,24 @@ package(licenses = ["notice"]) +load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +go_template_instance( + name = "buffer_list", + out = "buffer_list.go", + package = "pipe", + prefix = "buffer", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Buffer", + "Linker": "*Buffer", + }, +) + go_library( name = "pipe", srcs = [ + "buffer_list.go", "buffers.go", "device.go", "node.go", @@ -18,7 +32,6 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", - "//pkg/ilist", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go index fa8045910..54e059f8b 100644 --- a/pkg/sentry/kernel/pipe/buffers.go +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -14,16 +14,12 @@ package pipe -import ( - "gvisor.googlesource.com/gvisor/pkg/ilist" -) - // Buffer encapsulates a queueable byte buffer that can // easily be truncated. 
It is designed only for use with pipes. // // +stateify savable type Buffer struct { - ilist.Entry + bufferEntry data []byte } diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index fad077d2d..357d1162e 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -25,7 +25,6 @@ import ( "sync/atomic" "syscall" - "gvisor.googlesource.com/gvisor/pkg/ilist" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -51,7 +50,7 @@ type Pipe struct { Dirent *fs.Dirent // The buffered byte queue. - data ilist.List + data bufferList // Max size of the pipe in bytes. When this max has been reached, // writers will get EWOULDBLOCK. @@ -170,13 +169,12 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) return 0, syserror.ErrWouldBlock } var n int64 - for b := p.data.Front(); b != nil; b = p.data.Front() { - buffer := b.(*Buffer) + for buffer := p.data.Front(); buffer != nil; buffer = p.data.Front() { n0, err := dst.CopyOut(ctx, buffer.bytes()) n += int64(n0) p.size -= n0 if buffer.truncate(n0) == 0 { - p.data.Remove(b) + p.data.Remove(buffer) } dst = dst.DropFirst(n0) if dst.NumBytes() == 0 || err != nil { diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index b748246da..48ce063d7 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,13 +1,28 @@ package(licenses = ["notice"]) +load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "waiter", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Entry", + "Linker": "*Entry", + }, +) + go_library( name = "waiter", - srcs = ["waiter.go"], + srcs = [ + "waiter.go", + "waiter_list.go", + ], importpath = "gvisor.googlesource.com/gvisor/pkg/waiter", visibility = 
["//visibility:public"], - deps = ["//pkg/ilist"], ) go_test( @@ -18,3 +33,11 @@ go_test( ], embed = [":waiter"], ) + +filegroup( + name = "autogen", + srcs = [ + "waiter_list.go", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 93390b299..fd429f733 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -59,8 +59,6 @@ package waiter import ( "sync" - - "gvisor.googlesource.com/gvisor/pkg/ilist" ) // EventMask represents io events as used in the poll() syscall. @@ -127,7 +125,7 @@ type Entry struct { // The following fields are protected by the queue lock. mask EventMask - ilist.Entry + waiterEntry } type channelCallback struct{} @@ -162,7 +160,7 @@ func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) { // // +stateify savable type Queue struct { - list ilist.List `state:"zerovalue"` + list waiterList `state:"zerovalue"` mu sync.RWMutex `state:"nosave"` } @@ -186,8 +184,7 @@ func (q *Queue) EventUnregister(e *Entry) { // in common with the notification mask. func (q *Queue) Notify(mask EventMask) { q.mu.RLock() - for it := q.list.Front(); it != nil; it = it.Next() { - e := it.(*Entry) + for e := q.list.Front(); e != nil; e = e.Next() { if mask&e.mask != 0 { e.Callback.Callback(e) } @@ -201,8 +198,7 @@ func (q *Queue) Events() EventMask { ret := EventMask(0) q.mu.RLock() - for it := q.list.Front(); it != nil; it = it.Next() { - e := it.(*Entry) + for e := q.list.Front(); e != nil; e = e.Next() { ret |= e.mask } q.mu.RUnlock() -- cgit v1.2.3 From b4006686d2752857b406c6c7e53a112efca826ff Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 1 Apr 2019 14:46:28 -0700 Subject: Don't expand COW-break on executable VMAs. 
PiperOrigin-RevId: 241403847 Change-Id: I4631ca05734142da6e80cdfa1a1d63ed68aa05cc --- pkg/sentry/mm/pma.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index e090537cc..0cca743ef 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -318,7 +318,23 @@ func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIter panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma)) } } - copyAR := pseg.Range().Intersect(maskAR) + // The majority of copy-on-write breaks on executable pages + // come from: + // + // - The ELF loader, which must zero out bytes on the last + // page of each segment after the end of the segment. + // + // - gdb's use of ptrace to insert breakpoints. + // + // Neither of these cases has enough spatial locality to + // benefit from copying nearby pages, so if the vma is + // executable, only copy the pages required. + var copyAR usermem.AddrRange + if vseg.ValuePtr().effectivePerms.Execute { + copyAR = pseg.Range().Intersect(ar) + } else { + copyAR = pseg.Range().Intersect(maskAR) + } // Get internal mappings from the pma to copy from. if err := pseg.getInternalMappingsLocked(); err != nil { return pstart, pseg.PrevGap(), err -- cgit v1.2.3 From 7cff746ef2bbe5351e5985bebc88efc9e0881c78 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Mon, 1 Apr 2019 15:38:08 -0700 Subject: Save/restore simple devices. We weren't saving simple devices' last allocated inode numbers, which caused inode number reuse across S/R. 
PiperOrigin-RevId: 241414245 Change-Id: I964289978841ef0a57d2fa48daf8eab7633c1284 --- pkg/sentry/device/BUILD | 4 +- pkg/sentry/device/device.go | 111 ++++++++++++++++++++++++++++++-------- pkg/sentry/kernel/BUILD | 2 + pkg/sentry/kernel/kernel.go | 3 ++ pkg/sentry/kernel/kernel_state.go | 11 ++++ test/syscalls/linux/stat.cc | 33 ++++++++++++ 6 files changed, 139 insertions(+), 25 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 01de708d3..4ccf0674d 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,7 +1,7 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - package(licenses = ["notice"]) +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + go_library( name = "device", srcs = ["device.go"], diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index 27e4eb258..ae4fa1d93 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -14,11 +14,6 @@ // Package device defines reserved virtual kernel devices and structures // for managing them. -// -// Saving and restoring devices is not necessary if the devices are initialized -// as package global variables. Package initialization happens in a single goroutine -// and in a deterministic order, so minor device numbers will be assigned in the -// same order as packages are loaded. package device import ( @@ -30,7 +25,83 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" ) +// Registry tracks all simple devices and related state on the system for +// save/restore. +// +// The set of devices across save/restore must remain consistent. That is, no +// devices may be created or removed on restore relative to the saved +// system. Practically, this means do not create new devices specifically as +// part of restore. +// +// +stateify savable +type Registry struct { + // lastAnonDeviceMinor is the last minor device number used for an anonymous + // device. 
Must be accessed atomically. + lastAnonDeviceMinor uint64 + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + devices map[ID]*Device +} + +// SimpleDevices is the system-wide simple device registry. This is +// saved/restored by kernel.Kernel, but defined here to allow access without +// depending on the kernel package. See kernel.Kernel.deviceRegistry. +var SimpleDevices = newRegistry() + +func newRegistry() *Registry { + return &Registry{ + devices: make(map[ID]*Device), + } +} + +// newAnonID assigns a major and minor number to an anonymous device ID. +func (r *Registry) newAnonID() ID { + return ID{ + // Anon devices always have a major number of 0. + Major: 0, + // Use the next minor number. + Minor: atomic.AddUint64(&r.lastAnonDeviceMinor, 1), + } +} + +// newAnonDevice allocates a new anonymous device with a unique minor device +// number, and registers it with r. +func (r *Registry) newAnonDevice() *Device { + r.mu.Lock() + defer r.mu.Unlock() + d := &Device{ + ID: r.newAnonID(), + } + r.devices[d.ID] = d + return d +} + +// LoadFrom initializes the internal state of all devices in r from other. The +// set of devices in both registries must match. Devices may not be created or +// destroyed across save/restore. +func (r *Registry) LoadFrom(other *Registry) { + r.mu.Lock() + defer r.mu.Unlock() + other.mu.Lock() + defer other.mu.Unlock() + if len(r.devices) != len(other.devices) { + panic(fmt.Sprintf("Devices were added or removed when restoring the registry:\nnew:\n%+v\nold:\n%+v", r.devices, other.devices)) + } + for id, otherD := range other.devices { + ourD, ok := r.devices[id] + if !ok { + panic(fmt.Sprintf("Device %+v could not be restored as it wasn't defined in the new registry", otherD)) + } + ourD.loadFrom(otherD) + } + atomic.StoreUint64(&r.lastAnonDeviceMinor, atomic.LoadUint64(&other.lastAnonDeviceMinor)) +} + // ID identifies a device. 
+// +// +stateify savable type ID struct { Major uint64 Minor uint64 @@ -41,18 +112,12 @@ func (i *ID) DeviceID() uint64 { return uint64(linux.MakeDeviceID(uint16(i.Major), uint32(i.Minor))) } -// nextAnonDeviceMinor is the next minor number for a new anonymous device. -// Must be accessed atomically. -var nextAnonDeviceMinor uint64 - // NewAnonDevice creates a new anonymous device. Packages that require an anonymous // device should initialize the device in a global variable in a file called device.go: // // var myDevice = device.NewAnonDevice() func NewAnonDevice() *Device { - return &Device{ - ID: newAnonID(), - } + return SimpleDevices.newAnonDevice() } // NewAnonMultiDevice creates a new multi-keyed anonymous device. Packages that require @@ -62,21 +127,13 @@ func NewAnonDevice() *Device { // var myDevice = device.NewAnonMultiDevice() func NewAnonMultiDevice() *MultiDevice { return &MultiDevice{ - ID: newAnonID(), - } -} - -// newAnonID assigns a major and minor number to an anonymous device ID. -func newAnonID() ID { - return ID{ - // Anon devices always have a major number of 0. - Major: 0, - // Use the next minor number. - Minor: atomic.AddUint64(&nextAnonDeviceMinor, 1), + ID: SimpleDevices.newAnonID(), } } // Device is a simple virtual kernel device. +// +// +stateify savable type Device struct { ID @@ -84,6 +141,14 @@ type Device struct { last uint64 } +// loadFrom initializes d from other. The IDs of both devices must match. 
+func (d *Device) loadFrom(other *Device) { + if d.ID != other.ID { + panic(fmt.Sprintf("Attempting to initialize a device %+v from %+v, but device IDs don't match", d, other)) + } + atomic.StoreUint64(&d.last, atomic.LoadUint64(&other.last)) +} + // NextIno generates a new inode number func (d *Device) NextIno() uint64 { return atomic.AddUint64(&d.last, 1) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 4d34bc733..99a2fd964 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -137,6 +137,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", imports = [ "gvisor.googlesource.com/gvisor/pkg/bpf", + "gvisor.googlesource.com/gvisor/pkg/sentry/device", "gvisor.googlesource.com/gvisor/pkg/tcpip", ], visibility = ["//:sandbox"], @@ -156,6 +157,7 @@ go_library( "//pkg/secio", "//pkg/sentry/arch", "//pkg/sentry/context", + "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index f5cbd6c23..f7f471aaa 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -185,6 +185,9 @@ type Kernel struct { // socketTable is used to track all sockets on the system. Protected by // extMu. socketTable map[int]map[*refs.WeakRef]struct{} + + // deviceRegistry is used to save/restore device.SimpleDevices. + deviceRegistry struct{} `state:".(*device.Registry)"` } // InitKernelArgs holds arguments to Init. diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go index a0a69b498..aae6f9ad2 100644 --- a/pkg/sentry/kernel/kernel_state.go +++ b/pkg/sentry/kernel/kernel_state.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" "gvisor.googlesource.com/gvisor/pkg/tcpip" ) @@ -29,3 +30,13 @@ func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) { tcpip.AddDanglingEndpoint(e) } } + +// saveDeviceRegistry is invoked by stateify. 
+func (k *Kernel) saveDeviceRegistry() *device.Registry { + return device.SimpleDevices +} + +// loadDeviceRegistry is invoked by stateify. +func (k *Kernel) loadDeviceRegistry(r *device.Registry) { + device.SimpleDevices.LoadFrom(r) +} diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index f96da5706..553fb7e56 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -429,6 +429,39 @@ TEST_F(StatTest, LstatELOOPPath) { ASSERT_THAT(lstat(path.c_str(), &s), SyscallFailsWithErrno(ELOOP)); } +// Ensure that inode allocation for anonymous devices work correctly across +// save/restore. In particular, inode numbers should be unique across S/R. +TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) { + // Use sockets as a convenient way to create inodes on an anonymous device. + int fd; + ASSERT_THAT(fd = socket(AF_UNIX, SOCK_STREAM, 0), SyscallSucceeds()); + FileDescriptor fd1(fd); + MaybeSave(); + ASSERT_THAT(fd = socket(AF_UNIX, SOCK_STREAM, 0), SyscallSucceeds()); + FileDescriptor fd2(fd); + + struct stat st1; + struct stat st2; + ASSERT_THAT(fstat(fd1.get(), &st1), SyscallSucceeds()); + ASSERT_THAT(fstat(fd2.get(), &st2), SyscallSucceeds()); + + // The two fds should have different inode numbers. Specifically, since fd2 + // was created later, it should have a higher inode number. + EXPECT_GT(st2.st_ino, st1.st_ino); + + // Verify again after another S/R cycle. The inode numbers should remain the + // same. 
+ MaybeSave(); + + struct stat st1_after; + struct stat st2_after; + ASSERT_THAT(fstat(fd1.get(), &st1_after), SyscallSucceeds()); + ASSERT_THAT(fstat(fd2.get(), &st2_after), SyscallSucceeds()); + + EXPECT_EQ(st1_after.st_ino, st1.st_ino); + EXPECT_EQ(st2_after.st_ino, st2.st_ino); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 1fcd40719dd89d92213c6ba5b4872b79198c309f Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Tue, 2 Apr 2019 14:50:09 -0700 Subject: device: fix device major/minor Current gvisor doesn't give devices a right major and minor number. When testing golang supporting of gvisor, I run the test case below: ``` $ docker run -ti --runtime runsc golang:1.12.1 bash -c "cd /usr/local/go/src && ./run.bash " ``` And it reports some errors, one of them is: "--- FAIL: TestDevices (0.00s) --- FAIL: TestDevices//dev/null_1:3 (0.00s) dev_linux_test.go:45: for /dev/null Major(0x0) == 0, want 1 dev_linux_test.go:48: for /dev/null Minor(0x0) == 0, want 3 dev_linux_test.go:51: for /dev/null Mkdev(1, 3) == 0x103, want 0x0 --- FAIL: TestDevices//dev/zero_1:5 (0.00s) dev_linux_test.go:45: for /dev/zero Major(0x0) == 0, want 1 dev_linux_test.go:48: for /dev/zero Minor(0x0) == 0, want 5 dev_linux_test.go:51: for /dev/zero Mkdev(1, 5) == 0x105, want 0x0 --- FAIL: TestDevices//dev/random_1:8 (0.00s) dev_linux_test.go:45: for /dev/random Major(0x0) == 0, want 1 dev_linux_test.go:48: for /dev/random Minor(0x0) == 0, want 8 dev_linux_test.go:51: for /dev/random Mkdev(1, 8) == 0x108, want 0x0 --- FAIL: TestDevices//dev/full_1:7 (0.00s) dev_linux_test.go:45: for /dev/full Major(0x0) == 0, want 1 dev_linux_test.go:48: for /dev/full Minor(0x0) == 0, want 7 dev_linux_test.go:51: for /dev/full Mkdev(1, 7) == 0x107, want 0x0 --- FAIL: TestDevices//dev/urandom_1:9 (0.00s) dev_linux_test.go:45: for /dev/urandom Major(0x0) == 0, want 1 dev_linux_test.go:48: for /dev/urandom Minor(0x0) == 0, want 9 dev_linux_test.go:51: for /dev/urandom Mkdev(1, 9) == 0x109, want 0x0 " So I 
think we'd better assign to them correct major/minor numbers following linux spec. Signed-off-by: Wei Zhang Change-Id: I4521ee7884b4e214fd3a261929e3b6dac537ada9 PiperOrigin-RevId: 241609021 --- pkg/sentry/fs/dev/dev.go | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index e1eaa08cb..fbc750a71 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -27,6 +27,19 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// Memory device numbers are from Linux's drivers/char/mem.c +const ( + // Mem device major. + memDevMajor uint16 = 1 + + // Mem device minors. + nullDevMinor uint32 = 3 + zeroDevMinor uint32 = 5 + fullDevMinor uint32 = 7 + randomDevMinor uint32 = 8 + urandomDevMinor uint32 = 9 +) + func newCharacterDevice(iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { return fs.NewInode(iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), @@ -36,6 +49,17 @@ func newCharacterDevice(iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode }) } +func newMemDevice(iops fs.InodeOperations, msrc *fs.MountSource, minor uint32) *fs.Inode { + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.CharacterDevice, + DeviceFileMajor: memDevMajor, + DeviceFileMinor: minor, + }) +} + func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(iops, msrc, fs.StableAttr{ @@ -64,17 +88,17 @@ func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEn "stdout": newSymlink(ctx, "/proc/self/fd/1", msrc), "stderr": newSymlink(ctx, "/proc/self/fd/2", msrc), - "null": newCharacterDevice(newNullDevice(ctx, fs.RootOwner, 0666), msrc), - "zero": newCharacterDevice(newZeroDevice(ctx, fs.RootOwner, 
0666), msrc), - "full": newCharacterDevice(newFullDevice(ctx, fs.RootOwner, 0666), msrc), + "null": newMemDevice(newNullDevice(ctx, fs.RootOwner, 0666), msrc, nullDevMinor), + "zero": newMemDevice(newZeroDevice(ctx, fs.RootOwner, 0666), msrc, zeroDevMinor), + "full": newMemDevice(newFullDevice(ctx, fs.RootOwner, 0666), msrc, fullDevMinor), // This is not as good as /dev/random in linux because go // runtime uses sys_random and /dev/urandom internally. // According to 'man 4 random', this will be sufficient unless // application uses this to generate long-lived GPG/SSL/SSH // keys. - "random": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), - "urandom": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), + "random": newMemDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc, randomDevMinor), + "urandom": newMemDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc, urandomDevMinor), "shm": tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc), -- cgit v1.2.3 From d14a7de65865e14383e3c4e68400446189b2e5e8 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Tue, 2 Apr 2019 16:45:27 -0700 Subject: Fix more data races in shm debug messages. 
PiperOrigin-RevId: 241630409 Change-Id: Ie0df5f5a2f20c2d32e615f16e2ba43c88f963181 --- pkg/sentry/kernel/shm/shm.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 2b291e4f8..349f2a26e 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -286,7 +286,7 @@ func (r *Registry) remove(s *Shm) { defer s.mu.Unlock() if s.key != linux.IPC_PRIVATE { - panic(fmt.Sprintf("Attempted to remove shm segment %d (key=%d) from the registry whose key is still associated", s.ID, s.key)) + panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) } delete(r.shms, s.ID) @@ -370,6 +370,12 @@ type Shm struct { pendingDestruction bool } +// Precondition: Caller must hold s.mu. +func (s *Shm) debugLocked() string { + return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", + s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction) +} + // MappedName implements memmap.MappingIdentity.MappedName. func (s *Shm) MappedName(ctx context.Context) string { s.mu.Lock() @@ -412,7 +418,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A } else { // AddMapping is called during a syscall, so ctx should always be a task // context. 
- log.Warningf("Adding mapping to shm %+v but couldn't get the current pid; not updating the last attach pid", s) + log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked()) } return nil } @@ -434,7 +440,7 @@ func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ userme if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { s.lastAttachDetachPID = pid } else { - log.Debugf("Couldn't obtain pid when removing mapping to shm %+v, not updating the last detach pid.", s) + log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked()) } } -- cgit v1.2.3 From 1776ab28f0fb934d399361e6012945c70dcd996f Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 2 Apr 2019 17:27:30 -0700 Subject: Add test that symlinking over a directory returns EEXIST. Also remove comments in InodeOperations that required that implementation of some Create* operations ensure that the name does not already exist, since these checks are all centralized in the Dirent. PiperOrigin-RevId: 241637335 Change-Id: Id098dc6063ff7c38347af29d1369075ad1e89a58 --- pkg/sentry/fs/inode_operations.go | 6 +----- pkg/sentry/fs/ramfs/dir.go | 4 ---- test/syscalls/linux/symlink.cc | 24 ++++++++++-------------- 3 files changed, 11 insertions(+), 23 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 548f1eb8b..e8b9ab96b 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -104,15 +104,12 @@ type InodeOperations interface { CreateLink(ctx context.Context, dir *Inode, oldname string, newname string) error // CreateHardLink creates a hard link under dir between the target - // Inode and name. Implementations must ensure that name does not - // already exist. + // Inode and name. // // The caller must ensure this operation is permitted. 
CreateHardLink(ctx context.Context, dir *Inode, target *Inode, name string) error // CreateFifo creates a new named pipe under dir at name. - // Implementations must ensure that an Inode at name does not - // already exist. // // The caller must ensure that this operation is permitted. CreateFifo(ctx context.Context, dir *Inode, name string, perm FilePermissions) error @@ -144,7 +141,6 @@ type InodeOperations interface { Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string, replacement bool) error // Bind binds a new socket under dir at the given name. - // Implementations must ensure that name does not already exist. // // The caller must ensure that this operation is permitted. Bind(ctx context.Context, dir *Inode, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index b60dab243..05d716afb 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -268,10 +268,6 @@ func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, make d.mu.Lock() defer d.mu.Unlock() - if _, ok := d.children[name]; ok { - return nil, syscall.EEXIST - } - inode, err := makeInodeOperations() if err != nil { return nil, err diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index cfc87bc8f..ea6baf76a 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -113,23 +113,19 @@ TEST(SymlinkTest, CannotCreateSymlinkInReadOnlyDir) { } TEST(SymlinkTest, CannotSymlinkOverExistingFile) { - const std::string oldname = NewTempAbsPath(); - const std::string newname = NewTempAbsPath(); - - int oldfd; - int newfd; - ASSERT_THAT(oldfd = open(oldname.c_str(), O_CREAT | O_RDWR, 0666), - SyscallSucceeds()); - EXPECT_THAT(close(oldfd), SyscallSucceeds()); - ASSERT_THAT(newfd = open(newname.c_str(), O_CREAT | O_RDWR, 0666), - SyscallSucceeds()); - EXPECT_THAT(close(newfd), 
SyscallSucceeds()); + const auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const auto newfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - EXPECT_THAT(symlink(oldname.c_str(), newname.c_str()), + EXPECT_THAT(symlink(oldfile.path().c_str(), newfile.path().c_str()), SyscallFailsWithErrno(EEXIST)); +} - EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds()); - EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); +TEST(SymlinkTest, CannotSymlinkOverExistingDir) { + const auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const auto newdir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + EXPECT_THAT(symlink(oldfile.path().c_str(), newdir.path().c_str()), + SyscallFailsWithErrno(EEXIST)); } TEST(SymlinkTest, OldnameIsEmpty) { -- cgit v1.2.3 From c4caccd54042ea80a6e8b5a8f5ce59ee87a7f424 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 2 Apr 2019 18:12:11 -0700 Subject: Set options on the correct Task in PTRACE_SEIZE. $ docker run --rm --runtime=runsc -it --cap-add=SYS_PTRACE debian bash -c "apt-get update && apt-get install strace && strace ls" ... Setting up strace (4.15-2) ... execve("/bin/ls", ["ls"], [/* 6 vars */]) = 0 brk(NULL) = 0x5646d8c1e000 uname({sysname="Linux", nodename="114ef93d2db3", ...}) = 0 ... 
PiperOrigin-RevId: 241643321 Change-Id: Ie4bce27a7fb147eef07bbae5895c6ef3f529e177 --- pkg/sentry/kernel/ptrace.go | 2 +- test/syscalls/linux/ptrace.cc | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index e8043bf8a..8d78b2fb3 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -374,7 +374,7 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { return syserror.EPERM } if seize { - if err := t.ptraceSetOptionsLocked(opts); err != nil { + if err := target.ptraceSetOptionsLocked(opts); err != nil { return syserror.EIO } } diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index 6d5c425d8..8fc0045ce 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -1120,6 +1120,46 @@ TEST(PtraceTest, Interrupt_Listen_RequireSeize) { << " status " << status; } +TEST(PtraceTest, SeizeSetOptions) { + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + while (true) { + SleepSafe(absl::Seconds(1)); + } + } + + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Attach to the child with PTRACE_SEIZE while setting PTRACE_O_TRACESYSGOOD. + ASSERT_THAT(ptrace(PTRACE_SEIZE, child_pid, 0, PTRACE_O_TRACESYSGOOD), + SyscallSucceeds()); + + // Stop the child with PTRACE_INTERRUPT. + ASSERT_THAT(ptrace(PTRACE_INTERRUPT, child_pid, 0, 0), SyscallSucceeds()); + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGTRAP | (kPtraceEventStop << 8), status >> 8); + + // Resume the child with PTRACE_SYSCALL and wait for it to enter + // syscall-enter-stop. The stop signal status from the syscall stop should be + // SIGTRAP|0x80, reflecting PTRACE_O_TRACESYSGOOD. 
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) + << " status " << status; + + // SIGKILL the child (detaching the tracer) and wait for it to exit. + ASSERT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) + << " status " << status; +} + } // namespace } // namespace testing -- cgit v1.2.3 From 77f01ee3c7f947a3be569e49e248187b2663607f Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Wed, 3 Apr 2019 03:09:26 -0700 Subject: Add syscall annotations for unimplemented syscalls Added syscall annotations for unimplemented syscalls for later generation into reference docs. Annotations are of the form: @Syscall(, , ...) Supported args and values are: - arg: A syscall option. This entry only applies to the syscall when given this option. - support: Indicates support level - UNIMPLEMENTED: Unimplemented (implies returns:ENOSYS) - PARTIAL: Partial support. Details should be provided in note. - FULL: Full support - returns: Indicates a known return value. Values are syscall errors. This is treated as a string so you can use something like "returns:EPERM or ENOSYS". - issue: A Github issue number. - note: A note Example: // @Syscall(mmap, arg:MAP_PRIVATE, support:FULL, note:Private memory fully supported) // @Syscall(mmap, arg:MAP_SHARED, support:UNIMPLEMENTED, issue:123, note:Shared memory not supported) // @Syscall(setxattr, returns:ENOTSUP, note:Requires file system support) Annotations should be placed as close to their implementation as possible (preferrably as part of a supporting function's Godoc) and should be updated as syscall support changes. 
PiperOrigin-RevId: 241697482 Change-Id: I7a846135db124e1271dc5057d788cba82ca312d4 --- pkg/sentry/syscalls/linux/linux64.go | 289 +++++++++++++++++++++++------------ 1 file changed, 192 insertions(+), 97 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 888b5aa9f..be793ca11 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -34,6 +34,33 @@ const _AUDIT_ARCH_X86_64 = 0xc000003e // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall // numbers from Linux 4.4. The entries commented out are those syscalls we // don't currently support. +// +// Syscall support is documented as annotations in Go comments of the form: +// @Syscall(, , ...) +// +// Supported args and values are: +// +// - arg: A syscall option. This entry only applies to the syscall when given +// this option. +// - support: Indicates support level +// - UNIMPLEMENTED: Unimplemented (default, implies returns:ENOSYS) +// - PARTIAL: Partial support. Details should be provided in note. +// - FULL: Full support +// - returns: Indicates a known return value. Values are syscall errors. This +// is treated as a string so you can use something like +// "returns:EPERM or ENOSYS". +// - issue: A Github issue number. +// - note: A note +// +// Example: +// // @Syscall(mmap, arg:MAP_PRIVATE, support:FULL, note:Private memory fully supported) +// // @Syscall(mmap, arg:MAP_SHARED, issue:123, note:Shared memory not supported) +// // @Syscall(setxattr, returns:ENOTSUP, note:Requires file system support) +// +// Annotations should be placed as close to their implementation as possible +// (preferrably as part of a supporting function's Godoc) and should be +// updated as syscall support changes. Unimplemented syscalls are documented +// here due to their lack of a supporting function or method. 
var AMD64 = &kernel.SyscallTable{ OS: abi.Linux, Arch: arch.AMD64, @@ -116,10 +143,10 @@ var AMD64 = &kernel.SyscallTable{ 65: Semop, 66: Semctl, 67: Shmdt, - // 68: Msgget, TODO - // 69: Msgsnd, TODO - // 70: Msgrcv, TODO - // 71: Msgctl, TODO + // 68: @Syscall(Msgget), TODO + // 69: @Syscall(Msgsnd), TODO + // 70: @Syscall(Msgrcv), TODO + // 71: @Syscall(Msgctl), TODO 72: Fcntl, 73: Flock, 74: Fsync, @@ -170,8 +197,8 @@ var AMD64 = &kernel.SyscallTable{ 119: Setresgid, 120: Getresgid, 121: Getpgid, - // 122: Setfsuid, TODO - // 123: Setfsgid, TODO + // 122: @Syscall(Setfsuid), TODO + // 123: @Syscall(Setfsgid), TODO 124: Getsid, 125: Capget, 126: Capset, @@ -182,93 +209,140 @@ var AMD64 = &kernel.SyscallTable{ 131: Sigaltstack, 132: Utime, 133: Mknod, - 134: syscalls.Error(syscall.ENOSYS), // Uselib, obsolete - 135: syscalls.ErrorWithEvent(syscall.EINVAL), // SetPersonality, unable to change personality - 136: syscalls.ErrorWithEvent(syscall.ENOSYS), // Ustat, needs filesystem support + // @Syscall(Uselib, note:Obsolete) + 134: syscalls.Error(syscall.ENOSYS), + // @Syscall(SetPersonality, returns:EINVAL, note:Unable to change personality) + 135: syscalls.ErrorWithEvent(syscall.EINVAL), + // @Syscall(Ustat, note:Needs filesystem support) + 136: syscalls.ErrorWithEvent(syscall.ENOSYS), 137: Statfs, 138: Fstatfs, - // 139: Sysfs, TODO + // 139: @Syscall(Sysfs), TODO 140: Getpriority, 141: Setpriority, - 142: syscalls.CapError(linux.CAP_SYS_NICE), // SchedSetparam, requires cap_sys_nice + // @Syscall(SchedSetparam, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) + 142: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice 143: SchedGetparam, 144: SchedSetscheduler, 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + // @Syscall(SchedRrGetInterval, returns:EPERM) + 148: 
syscalls.ErrorWithEvent(syscall.EPERM), 149: Mlock, 150: Munlock, 151: Mlockall, 152: Munlockall, - 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, - 154: syscalls.Error(syscall.EPERM), // ModifyLdt, - 155: syscalls.Error(syscall.EPERM), // PivotRoot, - 156: syscalls.Error(syscall.EPERM), // Sysctl, syscall is "worthless" + // @Syscall(Vhangup, returns:EPERM) + 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), + // @Syscall(ModifyLdt, returns:EPERM) + 154: syscalls.Error(syscall.EPERM), + // @Syscall(PivotRoot, returns:EPERM) + 155: syscalls.Error(syscall.EPERM), + // @Syscall(Sysctl, returns:EPERM) + 156: syscalls.Error(syscall.EPERM), // syscall is "worthless" 157: Prctl, 158: ArchPrctl, - 159: syscalls.CapError(linux.CAP_SYS_TIME), // Adjtimex, requires cap_sys_time + // @Syscall(Adjtimex, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise) + 159: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time 160: Setrlimit, 161: Chroot, 162: Sync, - 163: syscalls.CapError(linux.CAP_SYS_PACCT), // Acct, requires cap_sys_pacct - 164: syscalls.CapError(linux.CAP_SYS_TIME), // Settimeofday, requires cap_sys_time + // @Syscall(Acct, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_pacct; ENOSYS otherwise) + 163: syscalls.CapError(linux.CAP_SYS_PACCT), // requires cap_sys_pacct + // @Syscall(Settimeofday, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise) + 164: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time 165: Mount, 166: Umount2, - 167: syscalls.CapError(linux.CAP_SYS_ADMIN), // Swapon, requires cap_sys_admin - 168: syscalls.CapError(linux.CAP_SYS_ADMIN), // Swapoff, requires cap_sys_admin - 169: syscalls.CapError(linux.CAP_SYS_BOOT), // Reboot, requires cap_sys_boot + // @Syscall(Swapon, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) + 
167: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin + // @Syscall(Swapoff, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) + 168: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin + // @Syscall(Reboot, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) + 169: syscalls.CapError(linux.CAP_SYS_BOOT), // requires cap_sys_boot 170: Sethostname, 171: Setdomainname, - 172: syscalls.CapError(linux.CAP_SYS_RAWIO), // Iopl, requires cap_sys_rawio - 173: syscalls.CapError(linux.CAP_SYS_RAWIO), // Ioperm, requires cap_sys_rawio + // @Syscall(Iopl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise) + 172: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio + // @Syscall(Ioperm, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise) + 173: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio + // @Syscall(CreateModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) 174: syscalls.CapError(linux.CAP_SYS_MODULE), // CreateModule, requires cap_sys_module - 175: syscalls.CapError(linux.CAP_SYS_MODULE), // InitModule, requires cap_sys_module - 176: syscalls.CapError(linux.CAP_SYS_MODULE), // DeleteModule, requires cap_sys_module - 177: syscalls.Error(syscall.ENOSYS), // GetKernelSyms, not supported in > 2.6 - 178: syscalls.Error(syscall.ENOSYS), // QueryModule, not supported in > 2.6 - 179: syscalls.CapError(linux.CAP_SYS_ADMIN), // Quotactl, requires cap_sys_admin (most operations) - 180: syscalls.Error(syscall.ENOSYS), // Nfsservctl, does not exist > 3.1 - 181: syscalls.Error(syscall.ENOSYS), // Getpmsg, not implemented in Linux - 182: syscalls.Error(syscall.ENOSYS), // Putpmsg, not implemented in Linux - 183: syscalls.Error(syscall.ENOSYS), // 
AfsSyscall, not implemented in Linux - 184: syscalls.Error(syscall.ENOSYS), // Tuxcall, not implemented in Linux - 185: syscalls.Error(syscall.ENOSYS), // Security, not implemented in Linux + // @Syscall(InitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) + 175: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module + // @Syscall(DeleteModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) + 176: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module + // @Syscall(GetKernelSyms, note:Not supported in > 2.6) + 177: syscalls.Error(syscall.ENOSYS), + // @Syscall(QueryModule, note:Not supported in > 2.6) + 178: syscalls.Error(syscall.ENOSYS), + // @Syscall(Quotactl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) + 179: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin (most operations) + // @Syscall(Nfsservctl, note:Does not exist > 3.1) + 180: syscalls.Error(syscall.ENOSYS), + // @Syscall(Getpmsg, note:Not implemented in Linux) + 181: syscalls.Error(syscall.ENOSYS), + // @Syscall(Putpmsg, note:Not implemented in Linux) + 182: syscalls.Error(syscall.ENOSYS), + // @Syscall(AfsSyscall, note:Not implemented in Linux) + 183: syscalls.Error(syscall.ENOSYS), + // @Syscall(Tuxcall, note:Not implemented in Linux) + 184: syscalls.Error(syscall.ENOSYS), + // @Syscall(Security, note:Not implemented in Linux) + 185: syscalls.Error(syscall.ENOSYS), 186: Gettid, - 187: nil, // Readahead, TODO - 188: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Setxattr, requires filesystem support - 189: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lsetxattr, requires filesystem support - 190: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fsetxattr, requires filesystem support - 191: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Getxattr, requires filesystem support - 192: 
syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lgetxattr, requires filesystem support - 193: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fgetxattr, requires filesystem support - 194: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Listxattr, requires filesystem support - 195: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Llistxattr, requires filesystem support - 196: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Flistxattr, requires filesystem support - 197: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Removexattr, requires filesystem support - 198: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lremovexattr, requires filesystem support - 199: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fremovexattr, requires filesystem support + 187: nil, // @Syscall(Readahead), TODO + // @Syscall(Setxattr, returns:ENOTSUP, note:Requires filesystem support) + 188: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Lsetxattr, returns:ENOTSUP, note:Requires filesystem support) + 189: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Fsetxattr, returns:ENOTSUP, note:Requires filesystem support) + 190: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Getxattr, returns:ENOTSUP, note:Requires filesystem support) + 191: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Lgetxattr, returns:ENOTSUP, note:Requires filesystem support) + 192: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Fgetxattr, returns:ENOTSUP, note:Requires filesystem support) + 193: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Listxattr, returns:ENOTSUP, note:Requires filesystem support) + 194: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Llistxattr, returns:ENOTSUP, note:Requires filesystem support) + 195: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Flistxattr, returns:ENOTSUP, note:Requires filesystem support) + 196: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Removexattr, returns:ENOTSUP, note:Requires filesystem support) + 197: 
syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Lremovexattr, returns:ENOTSUP, note:Requires filesystem support) + 198: syscalls.ErrorWithEvent(syscall.ENOTSUP), + // @Syscall(Fremovexattr, returns:ENOTSUP, note:Requires filesystem support) + 199: syscalls.ErrorWithEvent(syscall.ENOTSUP), 200: Tkill, 201: Time, 202: Futex, 203: SchedSetaffinity, 204: SchedGetaffinity, - 205: syscalls.Error(syscall.ENOSYS), // SetThreadArea, expected to return ENOSYS on 64-bit + // @Syscall(SetThreadArea, note:Expected to return ENOSYS on 64-bit) + 205: syscalls.Error(syscall.ENOSYS), 206: IoSetup, 207: IoDestroy, 208: IoGetevents, 209: IoSubmit, 210: IoCancel, - 211: syscalls.Error(syscall.ENOSYS), // GetThreadArea, expected to return ENOSYS on 64-bit - 212: syscalls.CapError(linux.CAP_SYS_ADMIN), // LookupDcookie, requires cap_sys_admin + // @Syscall(GetThreadArea, note:Expected to return ENOSYS on 64-bit) + 211: syscalls.Error(syscall.ENOSYS), + // @Syscall(LookupDcookie, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) + 212: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin 213: EpollCreate, - 214: syscalls.ErrorWithEvent(syscall.ENOSYS), // EpollCtlOld, deprecated (afaik, unused) - 215: syscalls.ErrorWithEvent(syscall.ENOSYS), // EpollWaitOld, deprecated (afaik, unused) - 216: syscalls.ErrorWithEvent(syscall.ENOSYS), // RemapFilePages, deprecated since 3.16 + // @Syscall(EpollCtlOld, note:Deprecated) + 214: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused) + // @Syscall(EpollWaitOld, note:Deprecated) + 215: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused) + // @Syscall(RemapFilePages, note:Deprecated) + 216: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated since 3.16 217: Getdents64, 218: SetTidAddress, 219: RestartSyscall, - // 220: Semtimedop, TODO + // 220: @Syscall(Semtimedop), TODO 221: Fadvise64, 222: TimerCreate, 223: TimerSettime, @@ -284,27 
+358,35 @@ var AMD64 = &kernel.SyscallTable{ 233: EpollCtl, 234: Tgkill, 235: Utimes, - 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux - 237: syscalls.CapError(linux.CAP_SYS_NICE), // Mbind, may require cap_sys_nice TODO + // @Syscall(Vserver, note:Not implemented by Linux) + 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux + // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO + 237: syscalls.CapError(linux.CAP_SYS_NICE), // may require cap_sys_nice 238: SetMempolicy, 239: GetMempolicy, - // 240: MqOpen, TODO - // 241: MqUnlink, TODO - // 242: MqTimedsend, TODO - // 243: MqTimedreceive, TODO - // 244: MqNotify, TODO - // 245: MqGetsetattr, TODO + // 240: @Syscall(MqOpen), TODO + // 241: @Syscall(MqUnlink), TODO + // 242: @Syscall(MqTimedsend), TODO + // 243: @Syscall(MqTimedreceive), TODO + // 244: @Syscall(MqNotify), TODO + // 245: @Syscall(MqGetsetattr), TODO 246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot 247: Waitid, - 248: syscalls.Error(syscall.EACCES), // AddKey, not available to user - 249: syscalls.Error(syscall.EACCES), // RequestKey, not available to user - 250: syscalls.Error(syscall.EACCES), // Keyctl, not available to user - 251: syscalls.CapError(linux.CAP_SYS_ADMIN), // IoprioSet, requires cap_sys_nice or cap_sys_admin (depending) - 252: syscalls.CapError(linux.CAP_SYS_ADMIN), // IoprioGet, requires cap_sys_nice or cap_sys_admin (depending) + // @Syscall(AddKey, returns:EACCES, note:Not available to user) + 248: syscalls.Error(syscall.EACCES), + // @Syscall(RequestKey, returns:EACCES, note:Not available to user) + 249: syscalls.Error(syscall.EACCES), + // @Syscall(Keyctl, returns:EACCES, note:Not available to user) + 250: syscalls.Error(syscall.EACCES), + // @Syscall(IoprioSet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) + 251: 
syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending) + // @Syscall(IoprioGet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) + 252: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending) 253: InotifyInit, 254: InotifyAddWatch, 255: InotifyRmWatch, - 256: syscalls.CapError(linux.CAP_SYS_NICE), // MigratePages, requires cap_sys_nice + // @Syscall(MigratePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) + 256: syscalls.CapError(linux.CAP_SYS_NICE), 257: Openat, 258: Mkdirat, 259: Mknodat, @@ -321,23 +403,26 @@ var AMD64 = &kernel.SyscallTable{ 270: Pselect, 271: Ppoll, 272: Unshare, - 273: syscalls.Error(syscall.ENOSYS), // SetRobustList, obsolete - 274: syscalls.Error(syscall.ENOSYS), // GetRobustList, obsolete - // 275: Splice, TODO - // 276: Tee, TODO + // @Syscall(SetRobustList, note:Obsolete) + 273: syscalls.Error(syscall.ENOSYS), + // @Syscall(GetRobustList, note:Obsolete) + 274: syscalls.Error(syscall.ENOSYS), + // 275: @Syscall(Splice), TODO + // 276: @Syscall(Tee), TODO 277: SyncFileRange, - // 278: Vmsplice, TODO - 279: syscalls.CapError(linux.CAP_SYS_NICE), // MovePages, requires cap_sys_nice (mostly) + // 278: @Syscall(Vmsplice), TODO + // @Syscall(MovePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) + 279: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice (mostly) 280: Utimensat, 281: EpollPwait, - // 282: Signalfd, TODO + // 282: @Syscall(Signalfd), TODO 283: TimerfdCreate, 284: Eventfd, 285: Fallocate, 286: TimerfdSettime, 287: TimerfdGettime, 288: Accept4, - // 289: Signalfd4, TODO + // 289: @Syscall(Signalfd4), TODO 290: Eventfd2, 291: EpollCreate1, 292: Dup3, @@ -346,36 +431,46 @@ var AMD64 = &kernel.SyscallTable{ 295: Preadv, 296: Pwritev, 297: RtTgsigqueueinfo, - 298: 
syscalls.ErrorWithEvent(syscall.ENODEV), // PerfEventOpen, no support for perf counters + // @Syscall(PerfEventOpen, returns:ENODEV, note:No support for perf counters) + 298: syscalls.ErrorWithEvent(syscall.ENODEV), 299: RecvMMsg, - 300: syscalls.ErrorWithEvent(syscall.ENOSYS), // FanotifyInit, needs CONFIG_FANOTIFY - 301: syscalls.ErrorWithEvent(syscall.ENOSYS), // FanotifyMark, needs CONFIG_FANOTIFY + // @Syscall(FanotifyInit, note:Needs CONFIG_FANOTIFY) + 300: syscalls.ErrorWithEvent(syscall.ENOSYS), + // @Syscall(FanotifyMark, note:Needs CONFIG_FANOTIFY) + 301: syscalls.ErrorWithEvent(syscall.ENOSYS), 302: Prlimit64, - 303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), // NameToHandleAt, needs filesystem support - 304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), // OpenByHandleAt, needs filesystem support - 305: syscalls.CapError(linux.CAP_SYS_TIME), // ClockAdjtime, requires cap_sys_time + // @Syscall(NameToHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support) + 303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), + // @Syscall(OpenByHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support) + 304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), + // @Syscall(ClockAdjtime, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) + 305: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time 306: Syncfs, 307: SendMMsg, - // 308: Setns, TODO + // 308: @Syscall(Setns), TODO 309: Getcpu, - // 310: ProcessVmReadv, TODO may require cap_sys_ptrace - // 311: ProcessVmWritev, TODO may require cap_sys_ptrace - 312: syscalls.CapError(linux.CAP_SYS_PTRACE), // Kcmp, requires cap_sys_ptrace - 313: syscalls.CapError(linux.CAP_SYS_MODULE), // FinitModule, requires cap_sys_module - // 314: SchedSetattr, TODO, we have no scheduler - // 315: SchedGetattr, TODO, we have no scheduler - // 316: Renameat2, TODO + // 310: @Syscall(ProcessVmReadv), TODO may require cap_sys_ptrace + // 311: @Syscall(ProcessVmWritev), TODO 
may require cap_sys_ptrace + // @Syscall(Kcmp, returns:EPERM or ENOSYS, note:Requires cap_sys_ptrace) + 312: syscalls.CapError(linux.CAP_SYS_PTRACE), + // @Syscall(FinitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) + 313: syscalls.CapError(linux.CAP_SYS_MODULE), + // 314: @Syscall(SchedSetattr), TODO, we have no scheduler + // 315: @Syscall(SchedGetattr), TODO, we have no scheduler + // 316: @Syscall(Renameat2), TODO 317: Seccomp, 318: GetRandom, 319: MemfdCreate, - 320: syscalls.CapError(linux.CAP_SYS_BOOT), // KexecFileLoad, infeasible to support - 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // Bpf, requires cap_sys_admin for all commands - // 322: Execveat, TODO - // 323: Userfaultfd, TODO - // 324: Membarrier, TODO + // @Syscall(KexecFileLoad, EPERM or ENOSYS, note:Infeasible to support. Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) + 320: syscalls.CapError(linux.CAP_SYS_BOOT), + // @Syscall(Bpf, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) + 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin for all commands + // 322: @Syscall(Execveat), TODO + // 323: @Syscall(Userfaultfd), TODO + // 324: @Syscall(Membarrier), TODO 325: Mlock2, // Syscalls after 325 are "backports" from versions of Linux after 4.4. - // 326: CopyFileRange, + // 326: @Syscall(CopyFileRange), 327: Preadv2, 328: Pwritev2, }, -- cgit v1.2.3 From c79e81bd27cd9cccddb0cece30bf47efbfca41b7 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 3 Apr 2019 11:48:33 -0700 Subject: Addresses data race in tty implementation. Also makes the safemem reading and writing inline, as it makes it easier to see what locks are held. 
PiperOrigin-RevId: 241775201 Change-Id: Ib1072f246773ef2d08b5b9a042eb7e9e0284175c --- pkg/sentry/fs/tty/queue.go | 87 ++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 45 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index f39f47941..5e88d84d9 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -63,49 +63,6 @@ type queue struct { transformer } -// ReadToBlocks implements safemem.Reader.ReadToBlocks. -func (q *queue) ReadToBlocks(dst safemem.BlockSeq) (uint64, error) { - src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf)) - n, err := safemem.CopySeq(dst, src) - if err != nil { - return 0, err - } - q.readBuf = q.readBuf[n:] - - // If we read everything, this queue is no longer readable. - if len(q.readBuf) == 0 { - q.readable = false - } - - return n, nil -} - -// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. -func (q *queue) WriteFromBlocks(src safemem.BlockSeq) (uint64, error) { - copyLen := src.NumBytes() - room := waitBufMaxBytes - q.waitBufLen - // If out of room, return EAGAIN. - if room == 0 && copyLen > 0 { - return 0, syserror.ErrWouldBlock - } - // Cap the size of the wait buffer. - if copyLen > room { - copyLen = room - src = src.TakeFirst64(room) - } - buf := make([]byte, copyLen) - - // Copy the data into the wait buffer. - dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)) - n, err := safemem.CopySeq(dst, src) - if err != nil { - return 0, err - } - q.waitBufAppend(buf) - - return n, nil -} - // readReadiness returns whether q is ready to be read from. func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { q.mu.Lock() @@ -118,6 +75,8 @@ func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { // writeReadiness returns whether q is ready to be written to. 
func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() if q.waitBufLen < waitBufMaxBytes { return waiter.EventOut } @@ -158,7 +117,21 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl dst = dst.TakeFirst(canonMaxBytes) } - n, err := dst.CopyOutFrom(ctx, q) + n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) { + src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.readBuf = q.readBuf[n:] + + // If we read everything, this queue is no longer readable. + if len(q.readBuf) == 0 { + q.readable = false + } + + return n, nil + })) if err != nil { return 0, false, err } @@ -178,7 +151,30 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip defer q.mu.Unlock() // Copy data into the wait buffer. - n, err := src.CopyInTo(ctx, q) + n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) { + copyLen := src.NumBytes() + room := waitBufMaxBytes - q.waitBufLen + // If out of room, return EAGAIN. + if room == 0 && copyLen > 0 { + return 0, syserror.ErrWouldBlock + } + // Cap the size of the wait buffer. + if copyLen > room { + copyLen = room + src = src.TakeFirst64(room) + } + buf := make([]byte, copyLen) + + // Copy the data into the wait buffer. + dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.waitBufAppend(buf) + + return n, nil + })) if err != nil { return 0, err } @@ -241,6 +237,7 @@ func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { return total } +// Precondition: q.mu must be locked. 
func (q *queue) waitBufAppend(b []byte) { q.waitBuf = append(q.waitBuf, b) q.waitBufLen += uint64(len(b)) -- cgit v1.2.3 From 82529becaee6f5050cb3ebb4aaa7a798357c1cf1 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 3 Apr 2019 12:59:27 -0700 Subject: Fix index out of bounds in tty implementation. The previous implementation revolved around runes instead of bytes, which caused weird behavior when converting between the two. For example, peekRune would read the byte 0xff from a buffer, convert it to a rune, then return it. As rune is an alias of int32, 0xff was 0-padded to int32(255), which is the hex code point for ?. However, peekRune also returned the length of the byte (1). When calling utf8.EncodeRune, we only allocated 1 byte, but tried the write the 2-byte character ?. tl;dr: I apparently didn't understand runes when I wrote this. PiperOrigin-RevId: 241789081 Change-Id: I14c788af4d9754973137801500ef6af7ab8a8727 --- pkg/abi/linux/tty.go | 20 +++++++------ pkg/sentry/fs/tty/line_discipline.go | 55 ++++++++++++++++-------------------- test/syscalls/linux/pty.cc | 6 ++++ 3 files changed, 42 insertions(+), 39 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index e6f7c5b2a..bff882d89 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -14,10 +14,6 @@ package linux -import ( - "unicode/utf8" -) - const ( // NumControlCharacters is the number of control characters in Termios. NumControlCharacters = 19 @@ -104,11 +100,19 @@ func (t *KernelTermios) FromTermios(term Termios) { } // IsTerminating returns whether c is a line terminating character. -func (t *KernelTermios) IsTerminating(c rune) bool { +func (t *KernelTermios) IsTerminating(cBytes []byte) bool { + // All terminating characters are 1 byte. + if len(cBytes) != 1 { + return false + } + c := cBytes[0] + + // Is this the user-set EOF character? 
if t.IsEOF(c) { return true } - switch byte(c) { + + switch c { case disabledChar: return false case '\n', t.ControlCharacters[VEOL]: @@ -120,8 +124,8 @@ func (t *KernelTermios) IsTerminating(c rune) bool { } // IsEOF returns whether c is the EOF character. -func (t *KernelTermios) IsEOF(c rune) bool { - return utf8.RuneLen(c) == 1 && byte(c) == t.ControlCharacters[VEOF] && t.ControlCharacters[VEOF] != disabledChar +func (t *KernelTermios) IsEOF(c byte) bool { + return c == t.ControlCharacters[VEOF] && t.ControlCharacters[VEOF] != disabledChar } // Input flags. diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 31b6344f2..c4a364edb 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -280,10 +280,12 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte var ret int for len(buf) > 0 { - c, size := l.peekRune(buf) + size := l.peek(buf) + cBytes := append([]byte{}, buf[:size]...) ret += size buf = buf[size:] - switch c { + // We're guaranteed that cBytes has at least one element. + switch cBytes[0] { case '\n': if l.termios.OEnabled(linux.ONLRET) { l.column = 0 @@ -297,7 +299,7 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte continue } if l.termios.OEnabled(linux.OCRNL) { - c = '\n' + cBytes[0] = '\n' if l.termios.OEnabled(linux.ONLRET) { l.column = 0 } @@ -319,10 +321,7 @@ func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte default: l.column++ } - // The compiler optimizes this by growing readBuf without - // creating the intermediate slice. - q.readBuf = append(q.readBuf, make([]byte, size)...) - utf8.EncodeRune(q.readBuf[len(q.readBuf)-size:], c) + q.readBuf = append(q.readBuf, cBytes...) 
} if len(q.readBuf) > 0 { q.readable = true @@ -358,8 +357,10 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) var ret int for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { - c, size := l.peekRune(buf) - switch c { + size := l.peek(buf) + cBytes := append([]byte{}, buf[:size]...) + // We're guaranteed that cBytes has at least one element. + switch cBytes[0] { case '\r': if l.termios.IEnabled(linux.IGNCR) { buf = buf[size:] @@ -367,17 +368,17 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) continue } if l.termios.IEnabled(linux.ICRNL) { - c = '\n' + cBytes[0] = '\n' } case '\n': if l.termios.IEnabled(linux.INLCR) { - c = '\r' + cBytes[0] = '\r' } } // In canonical mode, we discard non-terminating characters // after the first 4095. - if l.shouldDiscard(q, c) { + if l.shouldDiscard(q, cBytes) { buf = buf[size:] ret += size continue @@ -387,20 +388,16 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) if len(q.readBuf)+size > maxBytes { break } - cBytes := buf[:size] buf = buf[size:] ret += size // If we get EOF, make the buffer available for reading. - if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(c) { + if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) { q.readable = true break } - // The compiler optimizes this by growing readBuf without - // creating the intermediate slice. - q.readBuf = append(q.readBuf, make([]byte, size)...) - utf8.EncodeRune(q.readBuf[len(q.readBuf)-size:], c) + q.readBuf = append(q.readBuf, cBytes...) // Anything written to the readBuf will have to be echoed. if l.termios.LEnabled(linux.ECHO) { @@ -409,7 +406,7 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) } // If we finish a line, make it available for reading. 
- if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(c) { + if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) { q.readable = true break } @@ -430,21 +427,17 @@ func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) // Precondition: // * l.termiosMu must be held for reading. // * q.mu must be held. -func (l *lineDiscipline) shouldDiscard(q *queue, c rune) bool { - return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+utf8.RuneLen(c) >= canonMaxBytes && !l.termios.IsTerminating(c) +func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool { + return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes) } -// peekRune returns the first rune from the byte array depending on whether -// UTF8 is enabled. -func (l *lineDiscipline) peekRune(b []byte) (rune, int) { - var c rune - var size int +// peek returns the size in bytes of the next character to process. As long as +// b isn't empty, peek returns a value of at least 1. +func (l *lineDiscipline) peek(b []byte) int { + size := 1 // If UTF-8 support is enabled, runes might be multiple bytes. if l.termios.IEnabled(linux.IUTF8) { - c, size = utf8.DecodeRune(b) - } else { - c = rune(b[0]) - size = 1 + _, size = utf8.DecodeRune(b) } - return c, size + return size } diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc index 253aa26ba..5b2dc9ccb 100644 --- a/test/syscalls/linux/pty.cc +++ b/test/syscalls/linux/pty.cc @@ -568,6 +568,12 @@ TEST_F(PtyTest, WriteSlaveToMaster) { EXPECT_EQ(memcmp(buf, kExpected, sizeof(kExpected)), 0); } +TEST_F(PtyTest, WriteInvalidUTF8) { + char c = 0xff; + ASSERT_THAT(syscall(__NR_write, master_.get(), &c, sizeof(c)), + SyscallSucceedsWithValue(sizeof(c))); +} + // Both the master and slave report the standard default termios settings. 
// // Note that TCGETS on the master actually redirects to the slave (see comment -- cgit v1.2.3 From 4968dd1341a04e93557bdd9f4b4b83eb508e026d Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 3 Apr 2019 16:21:38 -0700 Subject: Cache ThreadGroups in PIDNamespace If there are thousands of threads, ThreadGroupsAppend becomes very expensive as it must iterate over all Tasks to find the ThreadGroup leaders. Reduce the cost by maintaining a map of ThreadGroups which can be used to grab them all directly. The one somewhat visible change is to convert PID namespace init children zapping to a group-directed SIGKILL, as Linux did in 82058d668465 "signal: Use group_send_sig_info to kill all processes in a pid namespace". In a benchmark that creates N threads which sleep for two minutes, we see approximately this much CPU time in ThreadGroupsAppend: Before: 1 thread: 0ms 1024 threads: 30ms - 9130ms 4096 threads: 50ms - 2000ms 8192 threads: 18160ms 16384 threads: 17210ms After: 1 thread: 0ms 1024 threads: 0ms 4096 threads: 0ms 8192 threads: 0ms 16384 threads: 0ms The profiling is actually extremely noisy (likely due to cache effects), as some runs show almost no samples at 1024, 4096 threads, but obviously this does not scale to lots of threads. 
PiperOrigin-RevId: 241828039 Change-Id: I17827c90045df4b3c49b3174f3a05bca3026a72c --- pkg/sentry/kernel/kernel.go | 10 +++++----- pkg/sentry/kernel/sessions.go | 18 +++++++++--------- pkg/sentry/kernel/task_exec.go | 2 ++ pkg/sentry/kernel/task_exit.go | 18 +++++++++++------- pkg/sentry/kernel/task_start.go | 7 +++++++ pkg/sentry/kernel/threads.go | 24 ++++++++++++++---------- 6 files changed, 48 insertions(+), 31 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index f7f471aaa..1cd2653ff 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -866,14 +866,14 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { defer k.tasks.mu.RUnlock() var lastErr error - for t := range k.tasks.Root.tids { - if t == t.tg.leader && t.ContainerID() == cid { - t.tg.signalHandlers.mu.Lock() - defer t.tg.signalHandlers.mu.Unlock() + for tg := range k.tasks.Root.tgids { + if tg.leader.ContainerID() == cid { + tg.signalHandlers.mu.Lock() infoCopy := *info - if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { lastErr = err } + tg.signalHandlers.mu.Unlock() } } return lastErr diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index ae6daac60..65e2b73c4 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -240,14 +240,14 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { defer tasks.mu.RUnlock() var lastErr error - for t := range tasks.Root.tids { - if t == t.tg.leader && t.tg.ProcessGroup() == pg { - t.tg.signalHandlers.mu.Lock() - defer t.tg.signalHandlers.mu.Unlock() + for tg := range tasks.Root.tgids { + if tg.ProcessGroup() == pg { + tg.signalHandlers.mu.Lock() infoCopy := *info - if err := t.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + if err := tg.leader.sendSignalLocked(&infoCopy, true 
/*group*/); err != nil { lastErr = err } + tg.signalHandlers.mu.Unlock() } } return lastErr @@ -268,7 +268,7 @@ func (tg *ThreadGroup) CreateSession() error { // Precondition: callers must hold TaskSet.mu for writing. func (tg *ThreadGroup) createSession() error { // Get the ID for this thread in the current namespace. - id := tg.pidns.tids[tg.leader] + id := tg.pidns.tgids[tg] // Check if this ThreadGroup already leads a Session, or // if the proposed group is already taken. @@ -337,7 +337,7 @@ func (tg *ThreadGroup) createSession() error { // Ensure a translation is added to all namespaces. for ns := tg.pidns; ns != nil; ns = ns.parent { - local := ns.tids[tg.leader] + local := ns.tgids[tg] ns.sids[s] = SessionID(local) ns.sessions[SessionID(local)] = s ns.pgids[pg] = ProcessGroupID(local) @@ -356,7 +356,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error { defer tg.pidns.owner.mu.Unlock() // Get the ID for this thread in the current namespace. - id := tg.pidns.tids[tg.leader] + id := tg.pidns.tgids[tg] // Per above, check for a Session leader or existing group. for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { @@ -401,7 +401,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // Ensure this translation is added to all namespaces. for ns := tg.pidns; ns != nil; ns = ns.parent { - local := ns.tids[tg.leader] + local := ns.tgids[tg] ns.pgids[pg] = ProcessGroupID(local) ns.processGroups[ProcessGroupID(local)] = pg } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index a9b74da8e..9fca90a1c 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -234,6 +234,8 @@ func (t *Task) promoteLocked() { ns.tids[t] = leaderTID ns.tasks[oldTID] = oldLeader ns.tasks[leaderTID] = t + // Neither the ThreadGroup nor TGID change, so no need to + // update ns.tgids. } // Inherit the old leader's start time. 
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b9c558ccb..1a0734ab6 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -329,14 +329,15 @@ func (t *Task) exitChildren() { // signal." - pid_namespaces(7) t.Debugf("Init process terminating, killing namespace") t.tg.pidns.exiting = true - for other := range t.tg.pidns.tids { - if other.tg != t.tg { - other.tg.signalHandlers.mu.Lock() - other.sendSignalLocked(&arch.SignalInfo{ - Signo: int32(linux.SIGKILL), - }, false /* group */) - other.tg.signalHandlers.mu.Unlock() + for other := range t.tg.pidns.tgids { + if other == t.tg { + continue } + other.signalHandlers.mu.Lock() + other.leader.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, true /* group */) + other.signalHandlers.mu.Unlock() } // TODO: The init process waits for all processes in the // namespace to exit before completing its own exit @@ -652,6 +653,9 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { tid := ns.tids[t] delete(ns.tasks, tid) delete(ns.tids, t) + if t == t.tg.leader { + delete(ns.tgids, t.tg) + } } t.tg.exitedCPUStats.Accumulate(t.CPUStats()) t.tg.ioUsage.Accumulate(t.ioUsage) diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index c82a32c78..b7534c0a2 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -212,11 +212,18 @@ func (ts *TaskSet) assignTIDsLocked(t *Task) error { for _, a := range allocatedTIDs { delete(a.ns.tasks, a.tid) delete(a.ns.tids, t) + if t.tg.leader == nil { + delete(a.ns.tgids, t.tg) + } } return err } ns.tasks[tid] = t ns.tids[t] = tid + if t.tg.leader == nil { + // New thread group. 
+ ns.tgids[t.tg] = tid + } allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) } return nil diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index bdb907905..4af1b7dfa 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -100,10 +100,8 @@ func newTaskSet() *TaskSet { // // Preconditions: ts.mu must be locked (for reading or writing). func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { - for t := range ts.Root.tids { - if t == t.tg.leader { - f(t.tg) - } + for tg := range ts.Root.tgids { + f(tg) } } @@ -145,6 +143,13 @@ type PIDNamespace struct { // identifiers in this namespace. tids map[*Task]ThreadID + // tgids is a mapping from thread groups visible in this namespace to + // their identifiers in this namespace. + // + // The content of tgids is equivalent to tids[tg.leader]. This exists + // primarily as an optimization to quickly find all thread groups. + tgids map[*ThreadGroup]ThreadID + // sessions is a mapping from SessionIDs in this namespace to sessions // visible in the namespace. sessions map[SessionID]*Session @@ -173,6 +178,7 @@ func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespa userns: userns, tasks: make(map[ThreadID]*Task), tids: make(map[*Task]ThreadID), + tgids: make(map[*ThreadGroup]ThreadID), sessions: make(map[SessionID]*Session), sids: make(map[*Session]SessionID), processGroups: make(map[ProcessGroupID]*ProcessGroup), @@ -227,7 +233,7 @@ func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() - return ns.tids[tg.leader] + return ns.tgids[tg] } // Tasks returns a snapshot of the tasks in ns. 
@@ -250,10 +256,8 @@ func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() - for t := range ns.tids { - if t == t.tg.leader { - tgs = append(tgs, t.tg) - } + for tg := range ns.tgids { + tgs = append(tgs, tg) } return tgs } @@ -387,7 +391,7 @@ func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { func (tg *ThreadGroup) ID() ThreadID { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() - return tg.pidns.tids[tg.leader] + return tg.pidns.tgids[tg] } // A taskNode defines the relationship between a task and the rest of the -- cgit v1.2.3 From 61d8c361c6639ecbc262076be9f1214dd82065d1 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 3 Apr 2019 17:52:53 -0700 Subject: Don't release d.mu in checks for child-existence. Dirent.exists() is called in Create to check whether a child with the given name already exists. Dirent.exists() calls walk(), and before this CL allowed walk() to drop d.mu while calling d.Inode.Lookup. During this existence check, a racing Rename() can acquire d.mu and create a new child of the dirent with the same name. (Note that the source and destination of the rename must be in the same directory, otherwise renameMu will be taken preventing the race.) In this case, d.exists() can return false, even though a child with the same name actually does exist. This CL changes d.exists() so that it does not release d.mu while walking, thus preventing the race with Rename. It also adds comments noting that lockForRename may not take renameMu if the source and destination are in the same directory, as this is a bit surprising (at least it was to me). 
PiperOrigin-RevId: 241842579 Change-Id: I56524870e39dfcd18cab82054eb3088846c34813 --- pkg/sentry/fs/dirent.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 15a0129ce..3a1aa6c1e 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -629,7 +629,7 @@ func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, // - d.mu must be held. // - name must must not contain "/"s. func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { - child, err := d.walk(ctx, root, name, true /* may unlock */) + child, err := d.walk(ctx, root, name, false /* may unlock */) if err != nil { // Child may not exist. return false @@ -1377,8 +1377,13 @@ func (d *Dirent) dropExtendedReference() { // lockForRename takes locks on oldParent and newParent as required by Rename // and returns a function that will unlock the locks taken. The returned // function must be called even if a non-nil error is returned. +// +// Note that lockForRename does not take renameMu if the source and destination +// of the rename are within the same directory. func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) { if oldParent == newParent { + // Rename source and destination are in the same directory. In + // this case, we only need to take a lock on that directory. oldParent.mu.Lock() return oldParent.mu.Unlock, nil } -- cgit v1.2.3 From 9cf33960fc61309140a67587748570a36e78fc75 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 3 Apr 2019 18:05:30 -0700 Subject: Only CopyOut CPU when it changes This will save copies when preemption is not caused by a CPU migration. 
PiperOrigin-RevId: 241844399 Change-Id: I2ba3b64aa377846ab763425bd59b61158f576851 --- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/task_run.go | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 46b03c700..0a954bc16 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -88,6 +88,7 @@ func (t *Task) RSEQCPUAddr() usermem.Addr { func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error { t.rseqCPUAddr = addr if addr != 0 { + t.rseqCPU = int32(hostcpu.GetCPU()) if err := t.rseqCopyOutCPU(); err != nil { t.rseqCPUAddr = 0 t.rseqCPU = -1 @@ -102,7 +103,6 @@ func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error { // Preconditions: The caller must be running on the task goroutine. t's // AddressSpace must be active. func (t *Task) rseqCopyOutCPU() error { - t.rseqCPU = int32(hostcpu.GetCPU()) buf := t.CopyScratchBuffer(4) usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) _, err := t.CopyOutBytes(t.rseqCPUAddr, buf) diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 596b9aa16..359986b21 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -21,6 +21,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" @@ -169,12 +170,16 @@ func (*runApp) execute(t *Task) taskRunState { if t.rseqPreempted { t.rseqPreempted = false if t.rseqCPUAddr != 0 { - if err := t.rseqCopyOutCPU(); err != nil { - t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err) - t.forceSignal(linux.SIGSEGV, false) - t.SendSignal(sigPriv(linux.SIGSEGV)) - // Re-enter the task run loop for signal delivery. 
- return (*runApp)(nil) + cpu := int32(hostcpu.GetCPU()) + if t.rseqCPU != cpu { + t.rseqCPU = cpu + if err := t.rseqCopyOutCPU(); err != nil { + t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(sigPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } } } t.rseqInterrupt() -- cgit v1.2.3 From 75c8ac38e0f6cc4eb3726c89aee41357cd592c4b Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 4 Apr 2019 17:04:30 -0700 Subject: BUILD: Add useful go_path target Change-Id: Ibd6d8a1a63826af6e62a0f0669f8f0866c8091b4 PiperOrigin-RevId: 242037969 --- BUILD | 17 +++++++++++++++++ CONTRIBUTING.md | 18 ++++++++++++++++++ pkg/sentry/syscalls/linux/sys_utsname.go | 2 +- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/BUILD b/BUILD index 9ede10a9f..391791ca9 100644 --- a/BUILD +++ b/BUILD @@ -1,3 +1,7 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_path") + # The sandbox filegroup is used for sandbox-internal dependencies. package_group( name = "sandbox", @@ -5,3 +9,16 @@ package_group( "//...", ], ) + +# gopath defines a directory that is structured in a way that is compatible +# with standard Go tools. Things like godoc, editors and refactor tools should +# work as expected. +# +# The files in this tree are symlinks to the true sources. +go_path( + name = "gopath", + mode = "link", + deps = [ + "//runsc", + ], +) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c2d686d18..b91244de8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,6 +12,24 @@ You generally only need to submit a CLA once, so if you've already submitted one (even if it was for a different project), you probably don't need to do it again. +### Using GOPATH + +Some editors may require the code to be structured in a `GOPATH` directory tree. 
+In this case, you may use the `:gopath` target to generate a directory tree with +symlinks to the original source files. + +``` +bazel build :gopath +``` + +You can then set the `GOPATH` in your editor to `bazel-bin/gopath`. + +If you use this mechanism, keep in mind that the generated tree is not the +canonical source. You will still need to build and test with `bazel`. New files +will need to be added to the appropriate `BUILD` files, and the `:gopath` target +will need to be re-run to generate appropriate symlinks in the `GOPATH` +directory tree. + ### Coding Guidelines All code should conform to the [Go style guidelines][gostyle]. diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index 689f2f838..f7545b965 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -35,7 +35,7 @@ func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall copy(u.Nodename[:], uts.HostName()) copy(u.Release[:], version.Release) copy(u.Version[:], version.Version) - copy(u.Machine[:], "x86_64") // +build tag above. + copy(u.Machine[:], "x86_64") // build tag above. copy(u.Domainname[:], uts.DomainName()) // Copy out the result. -- cgit v1.2.3 From 75a5ccf5d98876c26305da0feff20e4a148027ec Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 4 Apr 2019 17:13:31 -0700 Subject: Remove defer from trivial ThreadID methods In particular, ns.IDOfTask and tg.ID are used for gettid and getpid, respectively, where removing defer saves ~100ns. This may be a small improvement to application logging, which may call gettid/getpid frequently. 
PiperOrigin-RevId: 242039616 Change-Id: I860beb62db3fe077519835e6bafa7c74cba6ca80 --- pkg/sentry/kernel/threads.go | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 4af1b7dfa..4fd6cf4e2 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -196,8 +196,9 @@ func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { // task has that TID, TaskWithID returns nil. func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { ns.owner.mu.RLock() - defer ns.owner.mu.RUnlock() - return ns.tasks[tid] + t := ns.tasks[tid] + ns.owner.mu.RUnlock() + return t } // ThreadGroupWithID returns the thread group lead by the task with thread ID @@ -224,16 +225,18 @@ func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { // 0. func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { ns.owner.mu.RLock() - defer ns.owner.mu.RUnlock() - return ns.tids[t] + id := ns.tids[t] + ns.owner.mu.RUnlock() + return id } // IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. // If the task is not visible in that namespace, IDOfThreadGroup returns 0. func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { ns.owner.mu.RLock() - defer ns.owner.mu.RUnlock() - return ns.tgids[tg] + id := ns.tgids[tg] + ns.owner.mu.RUnlock() + return id } // Tasks returns a snapshot of the tasks in ns. @@ -390,8 +393,9 @@ func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { // is dead, ID returns 0. 
func (tg *ThreadGroup) ID() ThreadID { tg.pidns.owner.mu.RLock() - defer tg.pidns.owner.mu.RUnlock() - return tg.pidns.tgids[tg] + id := tg.pidns.tgids[tg] + tg.pidns.owner.mu.RUnlock() + return id } // A taskNode defines the relationship between a task and the rest of the -- cgit v1.2.3 From 88409e983c463b6d9c8085e7fdbe7ff45b3c5184 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 4 Apr 2019 17:42:51 -0700 Subject: gvisor: Add support for the MS_NOEXEC mount option https://github.com/google/gvisor/issues/145 PiperOrigin-RevId: 242044115 Change-Id: I8f140fe05e32ecd438b6be218e224e4b7fe05878 --- pkg/sentry/fs/context.go | 5 +++++ pkg/sentry/fs/filesystems.go | 4 ++++ pkg/sentry/fs/proc/mounts.go | 3 +++ pkg/sentry/syscalls/linux/sys_mount.go | 5 ++++- runsc/boot/fs.go | 2 ++ runsc/specutils/fs.go | 5 ++--- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/mount.cc | 18 ++++++++++++++++++ 8 files changed, 39 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index 1775d3486..c0e6075e4 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -46,6 +46,11 @@ func ContextCanAccessFile(ctx context.Context, inode *Inode, reqPerms PermMask) p = uattr.Perms.Group } + // Do not allow programs to be executed if MS_NOEXEC is set. + if IsFile(inode.StableAttr) && reqPerms.Execute && inode.MountSource.Flags.NoExec { + return false + } + // Are permissions satisfied without capability checks? if p.SupersetOf(reqPerms) { return true diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index aa664b973..a6b27c402 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -140,6 +140,10 @@ type MountSourceFlags struct { // cache, even when the platform supports direct mapped I/O. This // doesn't correspond to any Linux mount options. 
ForcePageCache bool + + // NoExec corresponds to mount(2)'s "MS_NOEXEC" and indicates that + // binaries from this file system can't be executed. + NoExec bool } // GenericMountSourceOptions splits a string containing comma separated tokens of the diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 7111e5c0f..1e62af8c6 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -129,6 +129,9 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se if m.Flags.NoAtime { opts += ",noatime" } + if m.Flags.NoExec { + opts += ",noexec" + } fmt.Fprintf(&buf, "%s ", opts) // (7) Optional fields: zero or more fields of the form "tag[:value]". diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index 6b8d75d24..e110a553f 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -75,7 +75,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Silently allow MS_NOSUID, since we don't implement set-id bits // anyway. 
- const unsupportedFlags = linux.MS_NODEV | linux.MS_NOEXEC | + const unsupportedFlags = linux.MS_NODEV | linux.MS_NODIRATIME | linux.MS_STRICTATIME // Linux just allows passing any flags to mount(2) - it won't fail when @@ -100,6 +100,9 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if flags&linux.MS_RDONLY == linux.MS_RDONLY { superFlags.ReadOnly = true } + if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { + superFlags.NoExec = true + } rootInode, err := rsys.Mount(t, sourcePath, superFlags, data, nil) if err != nil { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 25e23c09b..8dfb6dce6 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -482,6 +482,8 @@ func mountFlags(opts []string) fs.MountSourceFlags { mf.ReadOnly = true case "noatime": mf.NoAtime = true + case "noexec": + mf.NoExec = true default: log.Warningf("ignoring unknown mount option %q", o) } diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go index aa17d4eb9..98c3b19c0 100644 --- a/runsc/specutils/fs.go +++ b/runsc/specutils/fs.go @@ -39,6 +39,7 @@ var optionsMap = map[string]mapping{ "diratime": {set: false, val: syscall.MS_NODIRATIME}, "dirsync": {set: true, val: syscall.MS_DIRSYNC}, "exec": {set: false, val: syscall.MS_NOEXEC}, + "noexec": {set: true, val: syscall.MS_NOEXEC}, "iversion": {set: true, val: syscall.MS_I_VERSION}, "loud": {set: false, val: syscall.MS_SILENT}, "mand": {set: true, val: syscall.MS_MANDLOCK}, @@ -76,9 +77,7 @@ var propOptionsMap = map[string]mapping{ // invalidOptions list options not allowed. // - shared: sandbox must be isolated from the host. Propagating mount changes // from the sandbox to the host breaks the isolation. -// - noexec: not yet supported. Don't ignore it since it could break -// in-sandbox security. -var invalidOptions = []string{"shared", "rshared", "noexec"} +var invalidOptions = []string{"shared", "rshared"} // OptionsToFlags converts mount options to syscall flags. 
func OptionsToFlags(opts []string) uint32 { diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 1e386193b..38faba267 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1080,6 +1080,7 @@ cc_binary( "//test/util:file_descriptor", "//test/util:fs_util", "//test/util:mount_util", + "//test/util:multiprocess_util", "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index 6bb4287a3..201b83e87 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -31,6 +31,7 @@ #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" #include "test/util/mount_util.h" +#include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -277,6 +278,23 @@ TEST(MountTest, MountNoAtime) { EXPECT_EQ(before, after); } +TEST(MountTest, MountNoExec) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), "tmpfs", MS_NOEXEC, "mode=0777", 0)); + + std::string const contents = "No no no, don't follow the instructions!"; + auto const file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), contents, 0777)); + + int execve_errno; + ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {}, {}, nullptr, &execve_errno)); + EXPECT_EQ(execve_errno, EACCES); +} + TEST(MountTest, RenameRemoveMountPoint) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); -- cgit v1.2.3 From ee7e6d33b2a017a53bebfdc55d182f53474d4d7d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 5 Apr 2019 15:48:26 -0700 Subject: Use string type for extended attribute values, instead of []byte. 
Strings are a better fit for this usage because they are immutable in Go, and can contain arbitrary bytes. It also allows us to avoid casting bytes to string (and the associated allocation) in the hot path when checking for overlay whiteouts. PiperOrigin-RevId: 242208856 Change-Id: I7699ae6302492eca71787dd0b72e0a5a217a3db2 --- pkg/sentry/fs/fsutil/inode.go | 16 ++++++++-------- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_operations.go | 4 ++-- pkg/sentry/fs/inode_overlay.go | 18 +++++++++--------- pkg/sentry/fs/inode_overlay_test.go | 6 +++--- pkg/sentry/fs/tmpfs/tmpfs.go | 4 ++-- 6 files changed, 25 insertions(+), 25 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index c1ad45e52..2673d73d7 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -197,25 +197,25 @@ func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { type InodeSimpleExtendedAttributes struct { // mu protects xattrs. mu sync.RWMutex `state:"nosave"` - xattrs map[string][]byte + xattrs map[string]string } // Getxattr implements fs.InodeOperations.Getxattr. -func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) ([]byte, error) { +func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) (string, error) { i.mu.RLock() value, ok := i.xattrs[name] i.mu.RUnlock() if !ok { - return nil, syserror.ENOATTR + return "", syserror.ENOATTR } return value, nil } // Setxattr implements fs.InodeOperations.Setxattr. 
-func (i *InodeSimpleExtendedAttributes) Setxattr(_ *fs.Inode, name string, value []byte) error { +func (i *InodeSimpleExtendedAttributes) Setxattr(_ *fs.Inode, name, value string) error { i.mu.Lock() if i.xattrs == nil { - i.xattrs = make(map[string][]byte) + i.xattrs = make(map[string]string) } i.xattrs[name] = value i.mu.Unlock() @@ -424,12 +424,12 @@ func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { type InodeNoExtendedAttributes struct{} // Getxattr implements fs.InodeOperations.Getxattr. -func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) ([]byte, error) { - return nil, syserror.EOPNOTSUPP +func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) (string, error) { + return "", syserror.EOPNOTSUPP } // Setxattr implements fs.InodeOperations.Setxattr. -func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, []byte) error { +func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, string) error { return syserror.EOPNOTSUPP } diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index b8b5c1528..d82f9740e 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -253,7 +253,7 @@ func (i *Inode) UnstableAttr(ctx context.Context) (UnstableAttr, error) { } // Getxattr calls i.InodeOperations.Getxattr with i as the Inode. -func (i *Inode) Getxattr(name string) ([]byte, error) { +func (i *Inode) Getxattr(name string) (string, error) { if i.overlay != nil { return overlayGetxattr(i.overlay, name) } diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index e8b9ab96b..ceacc7659 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -174,11 +174,11 @@ type InodeOperations interface { // do not support extended attributes return EOPNOTSUPP. Inodes that // support extended attributes but don't have a value at name return // ENODATA. 
- Getxattr(inode *Inode, name string) ([]byte, error) + Getxattr(inode *Inode, name string) (string, error) // Setxattr sets the value of extended attribute name. Inodes that // do not support extended attributes return EOPNOTSUPP. - Setxattr(inode *Inode, name string, value []byte) error + Setxattr(inode *Inode, name, value string) error // Listxattr returns the set of all extended attributes names that // have values. Inodes that do not support extended attributes return diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 6e1dfecf9..254646176 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -25,12 +25,12 @@ import ( ) func overlayHasWhiteout(parent *Inode, name string) bool { - buf, err := parent.Getxattr(XattrOverlayWhiteout(name)) - return err == nil && string(buf) == "y" + s, err := parent.Getxattr(XattrOverlayWhiteout(name)) + return err == nil && s == "y" } func overlayCreateWhiteout(parent *Inode, name string) error { - return parent.InodeOperations.Setxattr(parent, XattrOverlayWhiteout(name), []byte("y")) + return parent.InodeOperations.Setxattr(parent, XattrOverlayWhiteout(name), "y") } func overlayWriteOut(ctx context.Context, o *overlayEntry) error { @@ -491,28 +491,28 @@ func overlayUnstableAttr(ctx context.Context, o *overlayEntry) (UnstableAttr, er return attr, err } -func overlayGetxattr(o *overlayEntry, name string) ([]byte, error) { +func overlayGetxattr(o *overlayEntry, name string) (string, error) { // Hot path. This is how the overlay checks for whiteout files. // Avoid defers. var ( - b []byte + s string err error ) // Don't forward the value of the extended attribute if it would // unexpectedly change the behavior of a wrapping overlay layer. 
if strings.HasPrefix(XattrOverlayPrefix, name) { - return nil, syserror.ENODATA + return "", syserror.ENODATA } o.copyMu.RLock() if o.upper != nil { - b, err = o.upper.Getxattr(name) + s, err = o.upper.Getxattr(name) } else { - b, err = o.lower.Getxattr(name) + s, err = o.lower.Getxattr(name) } o.copyMu.RUnlock() - return b, err + return s, err } func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) { diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index bc91be226..fa8accf6c 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -383,13 +383,13 @@ type dir struct { } // Getxattr implements InodeOperations.Getxattr. -func (d *dir) Getxattr(inode *fs.Inode, name string) ([]byte, error) { +func (d *dir) Getxattr(inode *fs.Inode, name string) (string, error) { for _, n := range d.negative { if name == fs.XattrOverlayWhiteout(n) { - return []byte("y"), nil + return "y", nil } } - return nil, syserror.ENOATTR + return "", syserror.ENOATTR } // GetFile implements InodeOperations.GetFile. diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index a1672a4d0..555692505 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -150,12 +150,12 @@ func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms } // Getxattr implements fs.InodeOperations.Getxattr. -func (d *Dir) Getxattr(i *fs.Inode, name string) ([]byte, error) { +func (d *Dir) Getxattr(i *fs.Inode, name string) (string, error) { return d.ramfsDir.Getxattr(i, name) } // Setxattr implements fs.InodeOperations.Setxattr. 
-func (d *Dir) Setxattr(i *fs.Inode, name string, value []byte) error { +func (d *Dir) Setxattr(i *fs.Inode, name, value string) error { return d.ramfsDir.Setxattr(i, name, value) } -- cgit v1.2.3 From 70906f1d2428ec6e616fe9dada4a41f4ef4024a9 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 8 Apr 2019 11:55:31 -0700 Subject: Intermediate ram fs dirs should be writable. We construct a ramfs tree of "scaffolding" directories for all mount points, so that a directory exists that each mount point can be mounted over. We were creating these directories without write permissions, which meant that they were not writable even when underlayed under a writable filesystem. They should be writable. PiperOrigin-RevId: 242507789 Change-Id: I86645e35417560d862442ff5962da211dbe9b731 --- pkg/sentry/fs/ramfs/tree.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index f6d5ffdec..c1ac8a78b 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -28,6 +28,13 @@ import ( // MakeDirectoryTree constructs a ramfs tree of all directories containing // subdirs. Each element of subdir must be a clean path, and cannot be empty or // "/". +// +// All directories in the created tree will have full (read-write-execute) +// permissions, but note that file creation inside the directories is not +// actually supported because ramfs.Dir.CreateOpts == nil. However, these +// directory trees are normally "underlayed" under another filesystem (possibly +// the root), and file creation inside these directories in the overlay will be +// possible if the upper is writeable. 
func MakeDirectoryTree(ctx context.Context, msrc *fs.MountSource, subdirs []string) (*fs.Inode, error) { root := emptyDir(ctx, msrc) for _, subdir := range subdirs { @@ -58,9 +65,9 @@ func makeSubdir(ctx context.Context, msrc *fs.MountSource, root *Dir, subdir str } } -// emptyDir returns an empty *ramfs.Dir that is traversable but not writable. +// emptyDir returns an empty *ramfs.Dir with all permissions granted. func emptyDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - dir := NewDir(ctx, make(map[string]*fs.Inode), fs.RootOwner, fs.FilePermsFromMode(0555)) + dir := NewDir(ctx, make(map[string]*fs.Inode), fs.RootOwner, fs.FilePermsFromMode(0777)) return fs.NewInode(dir, msrc, fs.StableAttr{ DeviceID: anon.PseudoDevice.DeviceID(), InodeID: anon.PseudoDevice.NextIno(), -- cgit v1.2.3 From 9471c013483b0709479c51d470ac840621ae7d46 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 8 Apr 2019 16:31:06 -0700 Subject: Export kernel.SignalInfoPriv. Also add kernel.SignalInfoNoInfo, and use it in RLIMIT_FSIZE checks. 
PiperOrigin-RevId: 242562428 Change-Id: I4887c0e1c8f5fddcabfe6d4281bf76d2f2eafe90 --- pkg/sentry/fs/host/tty.go | 7 ++----- pkg/sentry/kernel/fasync/BUILD | 1 - pkg/sentry/kernel/fasync/fasync.go | 7 +------ pkg/sentry/kernel/sessions.go | 4 ++-- pkg/sentry/kernel/signal.go | 16 +++++++++++++--- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_sched.go | 10 +++++----- pkg/sentry/kernel/task_signals.go | 2 +- pkg/sentry/kernel/task_syscall.go | 8 ++++---- pkg/sentry/kernel/thread_group.go | 2 +- pkg/sentry/syscalls/linux/error.go | 7 ++----- test/syscalls/linux/write.cc | 17 ++++++++++++++--- 12 files changed, 46 insertions(+), 37 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 21db0086e..c5cb75df7 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -344,11 +344,8 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't // apply: the sentry will handle -ERESTARTSYS in // kernel.runApp.execute() even if the kernel.Task isn't interrupted. - si := arch.SignalInfo{ - Code: arch.SignalInfoKernel, - Signo: int32(sig), - } + // // Linux ignores the result of kill_pgrp(). 
- _ = pg.SendSignal(&si) + _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return kernel.ERESTARTSYS } diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 5faf95909..59b4a49e1 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -9,7 +9,6 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index aa4aac109..298d988ea 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -19,7 +19,6 @@ import ( "sync" "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -72,11 +71,7 @@ func (a *FileAsync) Callback(e *waiter.Entry) { a.requester.EffectiveKUID == c.RealKUID || a.requester.RealKUID == c.SavedKUID || a.requester.RealKUID == c.RealKUID { - t.SendSignal(&arch.SignalInfo{ - Signo: int32(linux.SIGIO), - // SEND_SIG_PRIV - Code: arch.SignalInfoKernel, - }) + t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO)) } a.mu.Unlock() } diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 65e2b73c4..070c2f930 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -219,8 +219,8 @@ func (pg *ProcessGroup) handleOrphan() { return } tg.signalHandlers.mu.Lock() - tg.leader.sendSignalLocked(sigPriv(linux.SIGHUP), true /* group */) - tg.leader.sendSignalLocked(sigPriv(linux.SIGCONT), true /* group */) + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGHUP), true /* group */) + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGCONT), true /* group */) tg.signalHandlers.mu.Unlock() }) diff --git a/pkg/sentry/kernel/signal.go 
b/pkg/sentry/kernel/signal.go index b066df132..22a56c6fc 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -56,11 +56,21 @@ func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) { } } -// sigPriv returns a SignalInfo representing a signal sent by the sentry. (The -// name reflects its equivalence to Linux's SEND_SIG_PRIV.) -func sigPriv(sig linux.Signal) *arch.SignalInfo { +// SignalInfoPriv returns a SignalInfo equivalent to Linux's SEND_SIG_PRIV. +func SignalInfoPriv(sig linux.Signal) *arch.SignalInfo { return &arch.SignalInfo{ Signo: int32(sig), Code: arch.SignalInfoKernel, } } + +// SignalInfoNoInfo returns a SignalInfo equivalent to Linux's SEND_SIG_NOINFO. +func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) + info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + return info +} diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 359986b21..6b5fe7165 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -176,7 +176,7 @@ func (*runApp) execute(t *Task) taskRunState { if err := t.rseqCopyOutCPU(); err != nil { t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err) t.forceSignal(linux.SIGSEGV, false) - t.SendSignal(sigPriv(linux.SIGSEGV)) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // Re-enter the task run loop for signal delivery. 
return (*runApp)(nil) } diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 3b3cdc24a..3d654bf93 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -394,7 +394,7 @@ func (ticker *kernelCPUClockTicker) Notify(exp uint64) { newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow) tg.itimerVirtSetting = newItimerVirtSetting if exp != 0 { - virtReceiver.sendSignalLocked(sigPriv(linux.SIGVTALRM), true) + virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true) } } if profReceiver != nil { @@ -402,18 +402,18 @@ func (ticker *kernelCPUClockTicker) Notify(exp uint64) { newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow) tg.itimerProfSetting = newItimerProfSetting if exp != 0 { - profReceiver.sendSignalLocked(sigPriv(linux.SIGPROF), true) + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true) } // RLIMIT_CPU soft limit newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow) tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting if exp != 0 { - profReceiver.sendSignalLocked(sigPriv(linux.SIGXCPU), true) + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true) } // RLIMIT_CPU hard limit rlimitCPUMax := tg.limits.Get(limits.CPU).Max if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) { - profReceiver.sendSignalLocked(sigPriv(linux.SIGKILL), true) + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) } } tg.signalHandlers.mu.Unlock() @@ -471,7 +471,7 @@ func (t *Task) NotifyRlimitCPUUpdated() { tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow()) tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds()) if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) { - t.sendSignalLocked(sigPriv(linux.SIGKILL), true) + t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) } } t.tg.updateCPUTimersEnabledLocked() diff --git a/pkg/sentry/kernel/task_signals.go 
b/pkg/sentry/kernel/task_signals.go index 6a204aa59..e177562d7 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -224,7 +224,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS // Send a forced SIGSEGV. If the signal that couldn't be delivered // was a SIGSEGV, force the handler to SIG_DFL. t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) - t.SendSignal(sigPriv(linux.SIGSEGV)) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) } default: diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 9e43f089a..52f5fde8d 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -250,7 +250,7 @@ type runSyscallAfterSyscallEnterStop struct{} func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { if sig := linux.Signal(t.ptraceCode); sig.IsValid() { t.tg.signalHandlers.mu.Lock() - t.sendSignalLocked(sigPriv(sig), false /* group */) + t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) t.tg.signalHandlers.mu.Unlock() } if t.killed() { @@ -270,7 +270,7 @@ type runSyscallAfterSysemuStop struct{} func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { if sig := linux.Signal(t.ptraceCode); sig.IsValid() { t.tg.signalHandlers.mu.Lock() - t.sendSignalLocked(sigPriv(sig), false /* group */) + t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) t.tg.signalHandlers.mu.Unlock() } if t.killed() { @@ -335,7 +335,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) - t.SendSignal(sigPriv(linux.SIGSEGV)) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return (*runApp)(nil) } @@ -405,7 +405,7 @@ func (t *Task) doVsyscallInvoke(sysno uintptr, 
args arch.SyscallArguments, calle t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) if err == syserror.EFAULT { t.forceSignal(linux.SIGSEGV, false /* unconditional */) - t.SendSignal(sigPriv(linux.SIGSEGV)) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // A return is not emulated in this case. return (*runApp)(nil) } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 1b7b74319..58f3a7ec9 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -322,7 +322,7 @@ type itimerRealListener struct { // Notify implements ktime.TimerListener.Notify. func (l *itimerRealListener) Notify(exp uint64) { - l.tg.SendSignal(sigPriv(linux.SIGALRM)) + l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM)) } // Destroy implements ktime.TimerListener.Destroy. diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 9fd002955..e86bed313 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -19,9 +19,9 @@ import ( "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/metric" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -53,10 +53,7 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). - t.SendSignal(&arch.SignalInfo{ - Signo: int32(syscall.SIGXFSZ), - Code: arch.SignalInfoKernel, - }) + t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) return syscall.EFBIG case syserror.ErrInterrupted: // The syscall was interrupted. 
Return nil if it completed diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc index ca6aafd18..432bd6066 100644 --- a/test/syscalls/linux/write.cc +++ b/test/syscalls/linux/write.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -105,23 +106,33 @@ TEST_F(WriteTest, WriteExceedsRLimit) { EXPECT_THAT(write(fd, buf.data(), target_lim + 1), SyscallSucceedsWithValue(target_lim)); EXPECT_THAT(write(fd, buf.data(), 1), SyscallFailsWithErrno(EFBIG)); + siginfo_t info; struct timespec timelimit = {0, 0}; - EXPECT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, nullptr, &timelimit), + ASSERT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, &info, &timelimit), SyscallSucceedsWithValue(SIGXFSZ)); + EXPECT_EQ(info.si_code, SI_USER); + EXPECT_EQ(info.si_pid, getpid()); + EXPECT_EQ(info.si_uid, getuid()); EXPECT_THAT(pwrite(fd, buf.data(), target_lim + 1, 1), SyscallSucceedsWithValue(target_lim - 1)); EXPECT_THAT(pwrite(fd, buf.data(), 1, target_lim), SyscallFailsWithErrno(EFBIG)); - EXPECT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, nullptr, &timelimit), + ASSERT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, &info, &timelimit), SyscallSucceedsWithValue(SIGXFSZ)); + EXPECT_EQ(info.si_code, SI_USER); + EXPECT_EQ(info.si_pid, getpid()); + EXPECT_EQ(info.si_uid, getuid()); EXPECT_THAT(pwrite64(fd, buf.data(), target_lim + 1, 1), SyscallSucceedsWithValue(target_lim - 1)); EXPECT_THAT(pwrite64(fd, buf.data(), 1, target_lim), SyscallFailsWithErrno(EFBIG)); - EXPECT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, nullptr, &timelimit), + ASSERT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, &info, &timelimit), SyscallSucceedsWithValue(SIGXFSZ)); + EXPECT_EQ(info.si_code, SI_USER); + EXPECT_EQ(info.si_pid, getpid()); + EXPECT_EQ(info.si_uid, getuid()); ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &filesize_mask, nullptr), SyscallSucceeds()); -- cgit v1.2.3 From eaac2806ffadbb3db6317e58c61b855b1350f0aa Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan 
Date: Tue, 9 Apr 2019 11:22:28 -0700 Subject: Add TCP checksum verification. PiperOrigin-RevId: 242704699 Change-Id: I87db368ca343b3b4bf4f969b17d3aa4ce2f8bd4f --- pkg/sentry/socket/epsocket/epsocket.go | 1 + pkg/tcpip/header/tcp.go | 68 ++++++++++++++++----------------- pkg/tcpip/link/fdbased/endpoint.go | 18 ++++++--- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/link/muxed/injectable.go | 2 +- pkg/tcpip/link/muxed/injectable_test.go | 4 +- pkg/tcpip/stack/registration.go | 10 ++++- pkg/tcpip/tcpip.go | 3 ++ pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 9 ++++- pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/tcp/segment.go | 31 +++++++++++++-- pkg/tcpip/transport/tcp/tcp_test.go | 29 +++++++++++++- pkg/tcpip/transport/udp/endpoint.go | 2 +- runsc/boot/network.go | 1 + 16 files changed, 129 insertions(+), 57 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index e170da169..5bcafad98 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -154,6 +154,7 @@ var Metrics = tcpip.Stats{ SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), + ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index 6e3ee2e50..e656ebb15 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ 
-22,16 +22,17 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum" ) +// These constants are the offsets of the respective fields in the TCP header. const ( - srcPort = 0 - dstPort = 2 - seqNum = 4 - ackNum = 8 - dataOffset = 12 - tcpFlags = 13 - winSize = 14 - tcpChecksum = 16 - urgentPtr = 18 + TCPSrcPortOffset = 0 + TCPDstPortOffset = 2 + TCPSeqNumOffset = 4 + TCPAckNumOffset = 8 + TCPDataOffset = 12 + TCPFlagsOffset = 13 + TCPWinSizeOffset = 14 + TCPChecksumOffset = 16 + TCPUrgentPtrOffset = 18 ) const ( @@ -179,27 +180,27 @@ const ( // SourcePort returns the "source port" field of the tcp header. func (b TCP) SourcePort() uint16 { - return binary.BigEndian.Uint16(b[srcPort:]) + return binary.BigEndian.Uint16(b[TCPSrcPortOffset:]) } // DestinationPort returns the "destination port" field of the tcp header. func (b TCP) DestinationPort() uint16 { - return binary.BigEndian.Uint16(b[dstPort:]) + return binary.BigEndian.Uint16(b[TCPDstPortOffset:]) } // SequenceNumber returns the "sequence number" field of the tcp header. func (b TCP) SequenceNumber() uint32 { - return binary.BigEndian.Uint32(b[seqNum:]) + return binary.BigEndian.Uint32(b[TCPSeqNumOffset:]) } // AckNumber returns the "ack number" field of the tcp header. func (b TCP) AckNumber() uint32 { - return binary.BigEndian.Uint32(b[ackNum:]) + return binary.BigEndian.Uint32(b[TCPAckNumOffset:]) } // DataOffset returns the "data offset" field of the tcp header. func (b TCP) DataOffset() uint8 { - return (b[dataOffset] >> 4) * 4 + return (b[TCPDataOffset] >> 4) * 4 } // Payload returns the data in the tcp packet. @@ -209,32 +210,32 @@ func (b TCP) Payload() []byte { // Flags returns the flags field of the tcp header. func (b TCP) Flags() uint8 { - return b[tcpFlags] + return b[TCPFlagsOffset] } // WindowSize returns the "window size" field of the tcp header. 
func (b TCP) WindowSize() uint16 { - return binary.BigEndian.Uint16(b[winSize:]) + return binary.BigEndian.Uint16(b[TCPWinSizeOffset:]) } // Checksum returns the "checksum" field of the tcp header. func (b TCP) Checksum() uint16 { - return binary.BigEndian.Uint16(b[tcpChecksum:]) + return binary.BigEndian.Uint16(b[TCPChecksumOffset:]) } // SetSourcePort sets the "source port" field of the tcp header. func (b TCP) SetSourcePort(port uint16) { - binary.BigEndian.PutUint16(b[srcPort:], port) + binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], port) } // SetDestinationPort sets the "destination port" field of the tcp header. func (b TCP) SetDestinationPort(port uint16) { - binary.BigEndian.PutUint16(b[dstPort:], port) + binary.BigEndian.PutUint16(b[TCPDstPortOffset:], port) } // SetChecksum sets the checksum field of the tcp header. func (b TCP) SetChecksum(checksum uint16) { - binary.BigEndian.PutUint16(b[tcpChecksum:], checksum) + binary.BigEndian.PutUint16(b[TCPChecksumOffset:], checksum) } // CalculateChecksum calculates the checksum of the tcp segment. @@ -258,20 +259,20 @@ func (b TCP) ParsedOptions() TCPOptions { } func (b TCP) encodeSubset(seq, ack uint32, flags uint8, rcvwnd uint16) { - binary.BigEndian.PutUint32(b[seqNum:], seq) - binary.BigEndian.PutUint32(b[ackNum:], ack) - b[tcpFlags] = flags - binary.BigEndian.PutUint16(b[winSize:], rcvwnd) + binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seq) + binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ack) + b[TCPFlagsOffset] = flags + binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd) } // Encode encodes all the fields of the tcp header. 
func (b TCP) Encode(t *TCPFields) { b.encodeSubset(t.SeqNum, t.AckNum, t.Flags, t.WindowSize) - binary.BigEndian.PutUint16(b[srcPort:], t.SrcPort) - binary.BigEndian.PutUint16(b[dstPort:], t.DstPort) - b[dataOffset] = (t.DataOffset / 4) << 4 - binary.BigEndian.PutUint16(b[tcpChecksum:], t.Checksum) - binary.BigEndian.PutUint16(b[urgentPtr:], t.UrgentPointer) + binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], t.SrcPort) + binary.BigEndian.PutUint16(b[TCPDstPortOffset:], t.DstPort) + b[TCPDataOffset] = (t.DataOffset / 4) << 4 + binary.BigEndian.PutUint16(b[TCPChecksumOffset:], t.Checksum) + binary.BigEndian.PutUint16(b[TCPUrgentPtrOffset:], t.UrgentPointer) } // EncodePartial updates a subset of the fields of the tcp header. It is useful @@ -290,18 +291,13 @@ func (b TCP) EncodePartial(partialChecksum, length uint16, seqnum, acknum uint32 b.encodeSubset(seqnum, acknum, flags, rcvwnd) // Add the contributions of the passed-in fields to the checksum. - checksum = Checksum(b[seqNum:seqNum+8], checksum) - checksum = Checksum(b[winSize:winSize+2], checksum) + checksum = Checksum(b[TCPSeqNumOffset:TCPSeqNumOffset+8], checksum) + checksum = Checksum(b[TCPWinSizeOffset:TCPWinSizeOffset+2], checksum) // Encode the checksum. b.SetChecksum(^checksum) } -// TCPChecksumOffset returns offset of the checksum field. -func TCPChecksumOffset() uint16 { - return tcpChecksum -} - // ParseSynOptions parses the options received in a SYN segment and returns the // relevant ones. opts should point to the option part of the TCP Header. 
func ParseSynOptions(opts []byte, isAck bool) TCPSynOptions { diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 20e34c5ee..84439a9ed 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -122,13 +122,14 @@ type Options struct { FD int MTU uint32 EthernetHeader bool - ChecksumOffload bool ClosedFunc func(*tcpip.Error) Address tcpip.LinkAddress SaveRestore bool DisconnectOk bool GSOMaxSize uint32 PacketDispatchMode PacketDispatchMode + TXChecksumOffload bool + RXChecksumOffload bool } // New creates a new fd-based endpoint. @@ -142,8 +143,12 @@ func New(opts *Options) tcpip.LinkEndpointID { } caps := stack.LinkEndpointCapabilities(0) - if opts.ChecksumOffload { - caps |= stack.CapabilityChecksumOffload + if opts.RXChecksumOffload { + caps |= stack.CapabilityRXChecksumOffload + } + + if opts.TXChecksumOffload { + caps |= stack.CapabilityTXChecksumOffload } hdrSize := 0 @@ -527,12 +532,13 @@ func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buf } // NewInjectable creates a new fd-based InjectableEndpoint. -func NewInjectable(fd int, mtu uint32) (tcpip.LinkEndpointID, *InjectableEndpoint) { +func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (tcpip.LinkEndpointID, *InjectableEndpoint) { syscall.SetNonblock(fd, true) e := &InjectableEndpoint{endpoint: endpoint{ - fd: fd, - mtu: mtu, + fd: fd, + mtu: mtu, + caps: capabilities, }} return stack.RegisterLinkEndpoint(e), e diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index d58c0f885..2dc4bcfda 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -56,7 +56,7 @@ func (*endpoint) MTU() uint32 { // Capabilities implements stack.LinkEndpoint.Capabilities. Loopback advertises // itself as supporting checksum offload, but in reality it's just omitted. 
func (*endpoint) Capabilities() stack.LinkEndpointCapabilities { - return stack.CapabilityChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback + return stack.CapabilityRXChecksumOffload | stack.CapabilityTXChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. Given that the diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index 99edc232d..b3e71c7fc 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -105,7 +105,7 @@ func (m *InjectableEndpoint) WriteRawPacket(dest tcpip.Address, packet []byte) * } // NewInjectableEndpoint creates a new multi-endpoint injectable endpoint. -func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint, mtu uint32) (tcpip.LinkEndpointID, *InjectableEndpoint) { +func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint) (tcpip.LinkEndpointID, *InjectableEndpoint) { e := &InjectableEndpoint{ routes: routes, } diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go index 7d25effad..031449a05 100644 --- a/pkg/tcpip/link/muxed/injectable_test.go +++ b/pkg/tcpip/link/muxed/injectable_test.go @@ -87,8 +87,8 @@ func makeTestInjectableEndpoint(t *testing.T) (*InjectableEndpoint, *os.File, tc if err != nil { t.Fatal("Failed to create socket pair:", err) } - _, underlyingEndpoint := fdbased.NewInjectable(pair[1], 6500) + _, underlyingEndpoint := fdbased.NewInjectable(pair[1], 6500, stack.CapabilityNone) routes := map[tcpip.Address]stack.InjectableLinkEndpoint{dstIP: underlyingEndpoint} - _, endpoint := NewInjectableEndpoint(routes, 6500) + _, endpoint := NewInjectableEndpoint(routes) return endpoint, os.NewFile(uintptr(pair[0]), "test route end"), dstIP } diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index f3cc849ec..6e1660051 100644 --- 
a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -232,7 +232,15 @@ type LinkEndpointCapabilities uint // The following are the supported link endpoint capabilities. const ( - CapabilityChecksumOffload LinkEndpointCapabilities = 1 << iota + CapabilityNone LinkEndpointCapabilities = 0 + // CapabilityTXChecksumOffload indicates that the link endpoint supports + // checksum computation for outgoing packets and the stack can skip + // computing checksums when sending packets. + CapabilityTXChecksumOffload LinkEndpointCapabilities = 1 << iota + // CapabilityRXChecksumOffload indicates that the link endpoint supports + // checksum verification on received packets and that it's safe for the + // stack to skip checksum verification. + CapabilityRXChecksumOffload CapabilityResolutionRequired CapabilitySaveRestore CapabilityDisconnectOk diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index e9f73635f..e898dcbca 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -801,6 +801,9 @@ type TCPStats struct { // Timeouts is the number of times the RTO expired. Timeouts *StatCounter + + // ChecksumErrors is the number of segments dropped due to bad checksums. + ChecksumErrors *StatCounter } // UDPStats collects UDP-specific stats. diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 056e0b09a..6c4a4d95e 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -595,7 +595,7 @@ func sendTCP(r *stack.Route, id stack.TransportEndpointID, data buffer.Vectorise // TCP header, then the kernel calculate a checksum of the // header and data and get the right sum of the TCP packet. 
tcp.SetChecksum(xsum) - } else if r.Capabilities()&stack.CapabilityChecksumOffload == 0 { + } else if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { xsum = header.ChecksumVV(data, xsum) tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 41c87cc7e..b5d05af7d 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1447,6 +1447,13 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv return } + if !s.csumValid { + e.stack.Stats().MalformedRcvdPackets.Increment() + e.stack.Stats().TCP.ChecksumErrors.Increment() + s.decRef() + return + } + e.stack.Stats().TCP.ValidSegmentsReceived.Increment() if (s.flags & header.TCPFlagRst) != 0 { e.stack.Stats().TCP.ResetsReceived.Increment() @@ -1721,7 +1728,7 @@ func (e *endpoint) initGSO() { panic(fmt.Sprintf("Unknown netProto: %v", e.netProto)) } gso.NeedsCsum = true - gso.CsumOffset = header.TCPChecksumOffset() + gso.CsumOffset = header.TCPChecksumOffset gso.MaxSize = e.route.GSOMaxSize() e.gso = gso } diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 7a6589cfd..6a7efaf1d 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -68,7 +68,7 @@ func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, n defer s.decRef() // We only care about well-formed SYN packets. 
- if !s.parse() || s.flags != header.TCPFlagSyn { + if !s.parse() || !s.csumValid || s.flags != header.TCPFlagSyn { return false } diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 230668b5d..b5fb160bc 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -130,7 +130,7 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo s := newSegment(r, id, vv) defer s.decRef() - if !s.parse() { + if !s.parse() || !s.csumValid { return false } diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index df8402bf9..c603fe713 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -45,6 +45,10 @@ type segment struct { ackNumber seqnum.Value flags uint8 window seqnum.Size + // csum is only populated for received segments. + csum uint16 + // csumValid is true if the csum in the received segment is valid. + csumValid bool // parsedOptions stores the parsed values from the options in the segment. parsedOptions header.TCPOptions @@ -124,7 +128,13 @@ func (s *segment) logicalLen() seqnum.Size { // parse populates the sequence & ack numbers, flags, and window fields of the // segment from the TCP header stored in the data. It then updates the view to -// skip the data. Returns boolean indicating if the parsing was successful. +// skip the header. +// +// Returns boolean indicating if the parsing was successful. +// +// If checksum verification is not offloaded then parse also verifies the +// TCP checksum and stores the checksum and result of checksum verification in +// the csum and csumValid fields of the segment. 
func (s *segment) parse() bool { h := header.TCP(s.data.First()) @@ -145,12 +155,27 @@ func (s *segment) parse() bool { s.options = []byte(h[header.TCPMinimumSize:offset]) s.parsedOptions = header.ParseTCPOptions(s.options) - s.data.TrimFront(offset) + + // Query the link capabilities to decide if checksum validation is + // required. + verifyChecksum := true + if s.route.Capabilities()&stack.CapabilityRXChecksumOffload != 0 { + s.csumValid = true + verifyChecksum = false + s.data.TrimFront(offset) + } + if verifyChecksum { + s.csum = h.Checksum() + xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size())) + xsum = h.CalculateChecksum(xsum) + s.data.TrimFront(offset) + xsum = header.ChecksumVV(s.data, xsum) + s.csumValid = xsum == 0xffff + } s.sequenceNumber = seqnum.Value(h.SequenceNumber()) s.ackNumber = seqnum.Value(h.AckNumber()) s.flags = h.Flags() s.window = seqnum.Size(h.WindowSize()) - return true } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 7f2615ca9..af50ac8af 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -2963,8 +2963,7 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) { RcvWnd: 30000, }) tcpbuf := vv.First()[header.IPv4MinimumSize:] - // 12 is the TCP header data offset. 
- tcpbuf[12] = ((header.TCPMinimumSize - 1) / 4) << 4 + tcpbuf[header.TCPDataOffset] = ((header.TCPMinimumSize - 1) / 4) << 4 c.SendSegment(vv) @@ -2973,6 +2972,32 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) { } } +func TestReceivedIncorrectChecksumIncrement(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + c.CreateConnected(789, 30000, nil) + stats := c.Stack().Stats() + want := stats.TCP.ChecksumErrors.Value() + 1 + vv := c.BuildSegment([]byte{0x1, 0x2, 0x3}, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + SeqNum: seqnum.Value(790), + AckNum: c.IRS.Add(1), + RcvWnd: 30000, + }) + tcpbuf := vv.First()[header.IPv4MinimumSize:] + // Overwrite a byte in the payload which should cause checksum + // verification to fail. + tcpbuf[(tcpbuf[header.TCPDataOffset]>>4)*4] = 0x4 + + c.SendSegment(vv) + + if got := stats.TCP.ChecksumErrors.Value(); got != want { + t.Errorf("got stats.TCP.ChecksumErrors.Value() = %d, want = %d", got, want) + } +} + func TestReceivedSegmentQueuing(t *testing.T) { // This test sends 200 segments containing a few bytes each to an // endpoint and checks that they're all received and acknowledged by diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 19e532180..1f9251de3 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -640,7 +640,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u }) // Only calculate the checksum if offloading isn't supported. 
- if r.Capabilities()&stack.CapabilityChecksumOffload == 0 { + if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) for _, v := range data.Views() { xsum = header.Checksum(v, xsum) diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 77291415b..3915a021f 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -142,6 +142,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct Address: mac, PacketDispatchMode: fdbased.PacketMMap, GSOMaxSize: link.GSOMaxSize, + RXChecksumOffload: true, }) log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) -- cgit v1.2.3 From b3b140ea4f9e1b632463cbf83c97f58464eceeac Mon Sep 17 00:00:00 2001 From: Li Qiang Date: Tue, 9 Apr 2019 14:56:04 -0700 Subject: syscalls: sendfile: limit the count to MAX_RW_COUNT From sendfile spec and also the linux kernel code, we should limit the count arg to 'MAX_RW_COUNT'. This patch export 'MAX_RW_COUNT' in kernel pkg and use it in the implementation of sendfile syscall. Signed-off-by: Li Qiang Change-Id: I1086fec0685587116984555abd22b07ac233fbd2 PiperOrigin-RevId: 242745831 --- pkg/sentry/kernel/task_usermem.go | 22 +++++++++++----------- pkg/sentry/syscalls/linux/sys_file.go | 4 ++++ 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index c8e973bd5..cb68799d3 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -22,10 +22,10 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" ) -// _MAX_RW_COUNT is the maximum size in bytes of a single read or write. +// MAX_RW_COUNT is the maximum size in bytes of a single read or write. // Reads and writes that exceed this size may be silently truncated. 
// (Linux: include/linux/fs.h:MAX_RW_COUNT) -var _MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) +var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) // Activate ensures that the task has an active address space. func (t *Task) Activate() { @@ -187,9 +187,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error // - If any AddrRange would include addresses outside the application address // range, CopyInIovecs returns EFAULT. // -// - The combined length of all AddrRanges is limited to _MAX_RW_COUNT. If the +// - The combined length of all AddrRanges is limited to MAX_RW_COUNT. If the // combined length of all AddrRanges would otherwise exceed this amount, ranges -// beyond _MAX_RW_COUNT are silently truncated. +// beyond MAX_RW_COUNT are silently truncated. // // Preconditions: As for usermem.IO.CopyIn. The caller must be running on the // task goroutine. t's AddressSpace must be active. @@ -228,7 +228,7 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange if numIovecs == 1 { // Special case to avoid allocating dst. - return usermem.AddrRangeSeqOf(ar).TakeFirst(_MAX_RW_COUNT), nil + return usermem.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil } dst = append(dst, ar) @@ -239,11 +239,11 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange return usermem.AddrRangeSeq{}, syserror.ENOSYS } - // Truncate to _MAX_RW_COUNT. + // Truncate to MAX_RW_COUNT. var total uint64 for i := range dst { dstlen := uint64(dst[i].Length()) - if rem := uint64(_MAX_RW_COUNT) - total; rem < dstlen { + if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { dst[i].End -= usermem.Addr(dstlen - rem) dstlen = rem } @@ -256,16 +256,16 @@ func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRange // SingleIOSequence returns a usermem.IOSequence representing [addr, // addr+length) in t's address space. 
If this contains addresses outside the // application address range, it returns EFAULT. If length exceeds -// _MAX_RW_COUNT, the range is silently truncated. +// MAX_RW_COUNT, the range is silently truncated. // // SingleIOSequence is analogous to Linux's // lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and // write syscalls in Linux do not use import_single_range(). However they check // access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address -// ranges are truncated to _MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) +// ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { - if length > _MAX_RW_COUNT { - length = _MAX_RW_COUNT + if length > MAX_RW_COUNT { + length = MAX_RW_COUNT } ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) if !ok { diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 5a874d935..d2d351449 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -2002,6 +2002,10 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.EINVAL } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + // Get files. outFile := t.FDMap().GetFile(outFD) if outFile == nil { -- cgit v1.2.3 From 7140b1fdca1cc9c9c711955a49e6e7fc41f339d9 Mon Sep 17 00:00:00 2001 From: Shiva Prasanth Date: Wed, 10 Apr 2019 10:48:28 -0700 Subject: Fixed /proc/cpuinfo permissions This also applies these permissions to other static proc files. 
Change-Id: I4167e585fed49ad271aa4e1f1260babb3239a73d PiperOrigin-RevId: 242898575 --- pkg/sentry/fs/fsutil/inode.go | 12 ++++++++++++ pkg/sentry/fs/proc/inode.go | 2 +- test/syscalls/linux/proc.cc | 4 ++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 2673d73d7..37490e5b2 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -452,3 +452,15 @@ type InodeGenericChecker struct{} func (InodeGenericChecker) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { return fs.ContextCanAccessFile(ctx, inode, p) } + +// InodeDenyWriteChecker implements fs.InodeOperations.Check which denies all +// write operations. +type InodeDenyWriteChecker struct{} + +// Check implements fs.InodeOperations.Check. +func (InodeDenyWriteChecker) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + if p.Write { + return false + } + return fs.ContextCanAccessFile(ctx, inode, p) +} diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index 3c36af5ea..8dde2ea46 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -53,7 +53,7 @@ func (i *taskOwnedInodeOps) UnstableAttr(ctx context.Context, inode *fs.Inode) ( // // +stateify savable type staticFileInodeOps struct { - fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeDenyWriteChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 2da7006cf..67d3d18c7 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -725,6 +725,10 @@ TEST(ProcCpuinfo, RequiredFieldsArePresent) { } } +TEST(ProcCpuinfo, DeniesWrite) { + EXPECT_THAT(open("/proc/cpuinfo", O_WRONLY), SyscallFailsWithErrno(EACCES)); +} + // Sanity checks that uptime is present. 
TEST(ProcUptime, IsPresent) { std::string proc_uptime = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/uptime")); -- cgit v1.2.3 From 0a0619216ec9ca96c181dd69d9bf31e7762090cb Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 10 Apr 2019 11:26:10 -0700 Subject: Start saving MountSource.DirentCache. DirentCache is already a savable type, and it ensures that it is empty at the point of Save. There is no reason not to save it along with the MountSource. This did uncover an issue where not all MountSources were properly flushed before Save. If a mount point has an open file and is then unmounted, we save the MountSource without flushing it first. This CL also fixes that by flushing all MountSources for all open FDs on Save. PiperOrigin-RevId: 242906637 Change-Id: I3acd9d52b6ce6b8c989f835a408016cb3e67018f --- pkg/sentry/fs/BUILD | 1 - pkg/sentry/fs/mount.go | 2 +- pkg/sentry/fs/mount_state.go | 25 -------------- pkg/sentry/kernel/kernel.go | 78 ++++++++++++++++++++++++++++++-------------- 4 files changed, 55 insertions(+), 51 deletions(-) delete mode 100644 pkg/sentry/fs/mount_state.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index dda6a0c9f..1742d3a65 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -32,7 +32,6 @@ go_library( "mock.go", "mount.go", "mount_overlay.go", - "mount_state.go", "mounts.go", "offset.go", "overlay.go", diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index dd6e64b4c..5cc777bef 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -119,7 +119,7 @@ type MountSource struct { // fscache keeps Dirents pinned beyond application references to them. // It must be flushed before kernel.SaveTo. - fscache *DirentCache `state:"nosave"` + fscache *DirentCache // direntRefs is the sum of references on all Dirents in this MountSource. 
// diff --git a/pkg/sentry/fs/mount_state.go b/pkg/sentry/fs/mount_state.go deleted file mode 100644 index 6344d5160..000000000 --- a/pkg/sentry/fs/mount_state.go +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fs - -// afterLoad is invoked by stateify. -// -// Beyond the cache, this method's existence is required to ensure that this -// object is not marked "complete" until all dependent objects are also marked -// "complete". Implementations (e.g. see gofer_state.go) reach into the -// MountSourceOperations through this object, this is necessary on restore. -func (msrc *MountSource) afterLoad() { - msrc.fscache = NewDirentCache(defaultDirentCacheSize) -} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1cd2653ff..a9994f23b 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -311,7 +311,9 @@ func (k *Kernel) SaveTo(w io.Writer) error { // Clear the dirent cache before saving because Dirents must be Loaded in a // particular order (parents before children), and Loading dirents from a cache // breaks that order. 
- k.mounts.FlushMountSourceRefs() + if err := k.flushMountSourceRefs(); err != nil { + return err + } // Ensure that all pending asynchronous work is complete: // - inode and mount release @@ -351,39 +353,67 @@ func (k *Kernel) SaveTo(w io.Writer) error { return nil } -func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { +// flushMountSourceRefs flushes the MountSources for all mounted filesystems +// and open FDs. +func (k *Kernel) flushMountSourceRefs() error { + // Flush all mount sources for currently mounted filesystems. + k.mounts.FlushMountSourceRefs() + + // There may be some open FDs whose filesystems have been unmounted. We + // must flush those as well. + return k.tasks.forEachFDPaused(func(desc descriptor) error { + desc.file.Dirent.Inode.MountSource.FlushDirentRefs() + return nil + }) +} + +// forEachFDPaused applies the given function to each open file descriptor in each +// task. +// +// Precondition: Must be called with the kernel paused. +func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if fdmap := t.fds; fdmap != nil { - for _, desc := range fdmap.files { - if flags := desc.file.Flags(); !flags.Write { - continue - } - if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { - continue - } - // Here we need all metadata synced. - syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) - if err := fs.SaveFileFsyncError(syncErr); err != nil { - name, _ := desc.file.Dirent.FullName(nil /* root */) - // Wrap this error in ErrSaveRejection - // so that it will trigger a save - // error, rather than a panic. This - // also allows us to distinguish Fsync - // errors from state file errors in - // state.Save. 
- return fs.ErrSaveRejection{ - Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), - } - } + if t.fds == nil { + continue + } + for _, desc := range t.fds.files { + if err := f(desc); err != nil { + return err } } } return nil } +func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { + return ts.forEachFDPaused(func(desc descriptor) error { + if flags := desc.file.Flags(); !flags.Write { + return nil + } + if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + return nil + } + // Here we need all metadata synced. + syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + if err := fs.SaveFileFsyncError(syncErr); err != nil { + name, _ := desc.file.Dirent.FullName(nil /* root */) + // Wrap this error in ErrSaveRejection + // so that it will trigger a save + // error, rather than a panic. This + // also allows us to distinguish Fsync + // errors from state file errors in + // state.Save. + return fs.ErrSaveRejection{ + Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), + } + } + return nil + }) +} + // Preconditions: The kernel must be paused. func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { invalidated := make(map[*mm.MemoryManager]struct{}) -- cgit v1.2.3 From f7aff0aaa4320505933df838cf5b551b69d5e513 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 10 Apr 2019 12:35:43 -0700 Subject: Allow threads with CAP_SYS_RESOURCE to raise hard rlimits. 
PiperOrigin-RevId: 242919489 Change-Id: Ie3267b3bcd8a54b54bc16a6556369a19e843376f --- pkg/sentry/kernel/fd_map_test.go | 8 ++++---- pkg/sentry/limits/limits.go | 8 ++++++-- pkg/sentry/limits/limits_test.go | 30 ++++++++++++++++++------------ pkg/sentry/limits/linux.go | 2 +- pkg/sentry/mm/mm_test.go | 2 +- pkg/sentry/syscalls/linux/sys_rlimit.go | 8 +++++++- test/syscalls/linux/rlimits.cc | 15 +++++++++++---- 7 files changed, 48 insertions(+), 25 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go index b49996137..9e76f0a2d 100644 --- a/pkg/sentry/kernel/fd_map_test.go +++ b/pkg/sentry/kernel/fd_map_test.go @@ -40,7 +40,7 @@ func newTestFDMap() *FDMap { func TestFDMapMany(t *testing.T) { file := filetest.NewTestFile(t) limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */) f := newTestFDMap() for i := 0; i < maxFD; i++ { @@ -64,7 +64,7 @@ func TestFDMapMany(t *testing.T) { func TestFDMap(t *testing.T) { file := filetest.NewTestFile(t) limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}) + limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true /* privileged */) f := newTestFDMap() if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { @@ -76,7 +76,7 @@ func TestFDMap(t *testing.T) { } largeLimit := limits.Limit{maxFD, maxFD} - limitSet.Set(limits.NumberOfFiles, largeLimit) + limitSet.Set(limits.NumberOfFiles, largeLimit, true /* privileged */) if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) @@ -117,7 +117,7 @@ func TestDescriptorFlags(t *testing.T) { file := filetest.NewTestFile(t) f := newTestFDMap() limitSet := limits.NewLimitSet() - limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) + 
limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true /* privileged */) origFlags := FDFlags{CloseOnExec: true} diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index eeca01876..b0571739f 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -113,13 +113,17 @@ func (l *LimitSet) SetUnchecked(t LimitType, v Limit) { } // Set assigns value v to resource of LimitType t and returns the old value. -func (l *LimitSet) Set(t LimitType, v Limit) (Limit, error) { +// privileged should be true only when either the caller has CAP_SYS_RESOURCE +// or when creating limits for a new kernel. +func (l *LimitSet) Set(t LimitType, v Limit, privileged bool) (Limit, error) { l.mu.Lock() defer l.mu.Unlock() + // If a limit is already set, make sure the new limit doesn't // exceed the previous max limit. if _, ok := l.data[t]; ok { - if l.data[t].Max < v.Max { + // Unprivileged users can only lower their hard limits. + if l.data[t].Max < v.Max && !privileged { return Limit{}, syscall.EPERM } if v.Cur > v.Max { diff --git a/pkg/sentry/limits/limits_test.go b/pkg/sentry/limits/limits_test.go index d41f62554..945428163 100644 --- a/pkg/sentry/limits/limits_test.go +++ b/pkg/sentry/limits/limits_test.go @@ -20,18 +20,24 @@ import ( ) func TestSet(t *testing.T) { - ls := NewLimitSet() - ls.Set(1, Limit{Cur: 50, Max: 50}) - if _, err := ls.Set(1, Limit{Cur: 20, Max: 50}); err != nil { - t.Fatalf("Tried to lower Limit to valid new value: got %v, wanted nil", err) - } - if _, err := ls.Set(1, Limit{Cur: 20, Max: 60}); err != syscall.EPERM { - t.Fatalf("Tried to raise limit.Max to invalid higher value: got %v, wanted syscall.EPERM", err) - } - if _, err := ls.Set(1, Limit{Cur: 60, Max: 50}); err != syscall.EINVAL { - t.Fatalf("Tried to raise limit.Cur to invalid higher value: got %v, wanted syscall.EINVAL", err) + testCases := []struct { + limit Limit + privileged bool + expectedErr error + }{ + {limit: Limit{Cur: 50, Max: 50}, 
privileged: false, expectedErr: nil}, + {limit: Limit{Cur: 20, Max: 50}, privileged: false, expectedErr: nil}, + {limit: Limit{Cur: 20, Max: 60}, privileged: false, expectedErr: syscall.EPERM}, + {limit: Limit{Cur: 60, Max: 50}, privileged: false, expectedErr: syscall.EINVAL}, + {limit: Limit{Cur: 11, Max: 10}, privileged: false, expectedErr: syscall.EINVAL}, + {limit: Limit{Cur: 20, Max: 60}, privileged: true, expectedErr: nil}, } - if _, err := ls.Set(1, Limit{Cur: 11, Max: 10}); err != syscall.EINVAL { - t.Fatalf("Tried to set new limit with Cur > Max: got %v, wanted syscall.EINVAL", err) + + ls := NewLimitSet() + for _, tc := range testCases { + if _, err := ls.Set(1, tc.limit, tc.privileged); err != tc.expectedErr { + t.Fatalf("Tried to set Limit to %+v and privilege %t: got %v, wanted %v", tc.limit, tc.privileged, err, tc.expectedErr) + } } + } diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 295f9c398..e09d0d2fb 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -95,6 +95,6 @@ func NewLinuxDistroLimitSet() (*LimitSet, error) { // 1,048,576 ought to be enough for anyone. 
l := ls.Get(ProcessCount) l.Cur = 1 << 20 - ls.Set(ProcessCount, l) + ls.Set(ProcessCount, l, true /* privileged */) return ls, nil } diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index e12cb3bd1..ae4fba478 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -70,7 +70,7 @@ func TestUsageASUpdates(t *testing.T) { func TestBrkDataLimitUpdates(t *testing.T) { limitSet := limits.NewLimitSet() - limitSet.Set(limits.Data, limits.Limit{}) // zero RLIMIT_DATA + limitSet.Set(limits.Data, limits.Limit{}, true /* privileged */) // zero RLIMIT_DATA ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) mm := testMemoryManager(ctx) diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index b0b216045..443334693 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -106,7 +106,13 @@ func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) if _, ok := setableLimits[resource]; !ok { return limits.Limit{}, syserror.EPERM } - oldLim, err := t.ThreadGroup().Limits().Set(resource, *newLim) + + // "A privileged process (under Linux: one with the CAP_SYS_RESOURCE + // capability in the initial user namespace) may make arbitrary changes + // to either limit value." 
+ privileged := t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.Kernel().RootUserNamespace()) + + oldLim, err := t.ThreadGroup().Limits().Set(resource, *newLim, privileged) if err != nil { return limits.Limit{}, err } diff --git a/test/syscalls/linux/rlimits.cc b/test/syscalls/linux/rlimits.cc index dc31bad9a..5a6174d99 100644 --- a/test/syscalls/linux/rlimits.cc +++ b/test/syscalls/linux/rlimits.cc @@ -25,15 +25,12 @@ namespace { TEST(RlimitTest, SetRlimitHigher) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE))); - SKIP_IF(!IsRunningOnGvisor()); struct rlimit rl = {}; EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); - // TODO: Even with CAP_SYS_RESOURCE, gVisor does not allow - // setting a higher rlimit. rl.rlim_max++; - EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); } TEST(RlimitTest, UnprivilegedSetRlimit) { @@ -56,6 +53,16 @@ TEST(RlimitTest, UnprivilegedSetRlimit) { EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallFailsWithErrno(EPERM)); } +TEST(RlimitTest, SetSoftRlimitAboveHard) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE))); + + struct rlimit rl = {}; + EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds()); + + rl.rlim_cur = rl.rlim_max + 1; + EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallFailsWithErrno(EINVAL)); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 89cc8eef9ba6440a5f1772bb2cd200da9b4890d9 Mon Sep 17 00:00:00 2001 From: Yong He Date: Wed, 10 Apr 2019 14:16:36 -0700 Subject: DATA RACE in fs.(*Dirent).fullName add renameMu.Lock when oldParent == newParent in order to avoid data race in following report: WARNING: DATA RACE Read at 0x00c000ba2160 by goroutine 405: gvisor.googlesource.com/gvisor/pkg/sentry/fs.(*Dirent).fullName() pkg/sentry/fs/dirent.go:246 +0x6c gvisor.googlesource.com/gvisor/pkg/sentry/fs.(*Dirent).FullName() pkg/sentry/fs/dirent.go:356 +0x8b 
gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*FDMap).String() pkg/sentry/kernel/fd_map.go:135 +0x1e0 fmt.(*pp).handleMethods() GOROOT/src/fmt/print.go:603 +0x404 fmt.(*pp).printArg() GOROOT/src/fmt/print.go:686 +0x255 fmt.(*pp).doPrintf() GOROOT/src/fmt/print.go:1003 +0x33f fmt.Fprintf() GOROOT/src/fmt/print.go:188 +0x7f gvisor.googlesource.com/gvisor/pkg/log.(*Writer).Emit() pkg/log/log.go:121 +0x89 gvisor.googlesource.com/gvisor/pkg/log.GoogleEmitter.Emit() pkg/log/glog.go:162 +0x1acc gvisor.googlesource.com/gvisor/pkg/log.(*GoogleEmitter).Emit() :1 +0xe1 gvisor.googlesource.com/gvisor/pkg/log.(*BasicLogger).Debugf() pkg/log/log.go:177 +0x111 gvisor.googlesource.com/gvisor/pkg/log.Debugf() pkg/log/log.go:235 +0x66 gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*Task).Debugf() pkg/sentry/kernel/task_log.go:48 +0xfe gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*Task).DebugDumpState() pkg/sentry/kernel/task_log.go:66 +0x11f gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*runApp).execute() pkg/sentry/kernel/task_run.go:272 +0xc80 gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*Task).run() pkg/sentry/kernel/task_run.go:91 +0x24b Previous write at 0x00c000ba2160 by goroutine 423: gvisor.googlesource.com/gvisor/pkg/sentry/fs.Rename() pkg/sentry/fs/dirent.go:1628 +0x61f gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux.renameAt.func1.1() pkg/sentry/syscalls/linux/sys_file.go:1864 +0x1f8 gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux.fileOpAt( gvisor.googlesource.com/g/linux/sys_file.go:51 +0x20f gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux.renameAt.func1() pkg/sentry/syscalls/linux/sys_file.go:1852 +0x218 gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux.fileOpAt() pkg/sentry/syscalls/linux/sys_file.go:51 +0x20f gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux.renameAt() pkg/sentry/syscalls/linux/sys_file.go:1840 +0x180 gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux.Rename() 
pkg/sentry/syscalls/linux/sys_file.go:1873 +0x60 gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*Task).executeSyscall() pkg/sentry/kernel/task_syscall.go:165 +0x17a gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*Task).doSyscallInvoke() pkg/sentry/kernel/task_syscall.go:283 +0xb4 gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*Task).doSyscallEnter() pkg/sentry/kernel/task_syscall.go:244 +0x10c gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*Task).doSyscall() pkg/sentry/kernel/task_syscall.go:219 +0x1e3 gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*runApp).execute() pkg/sentry/kernel/task_run.go:215 +0x15a9 gvisor.googlesource.com/gvisor/pkg/sentry/kernel.(*Task).run() pkg/sentry/kernel/task_run.go:91 +0x24b Reported-by: syzbot+e1babbf756fab380dfff@syzkaller.appspotmail.com Change-Id: Icd2620bb3ea28b817bf0672d454a22b9d8ee189a PiperOrigin-RevId: 242938741 --- pkg/sentry/fs/dirent.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 3a1aa6c1e..4870e7d40 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1377,15 +1377,14 @@ func (d *Dirent) dropExtendedReference() { // lockForRename takes locks on oldParent and newParent as required by Rename // and returns a function that will unlock the locks taken. The returned // function must be called even if a non-nil error is returned. -// -// Note that lockForRename does not take renameMu if the source and destination -// of the rename are within the same directory. func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) { + renameMu.Lock() if oldParent == newParent { - // Rename source and destination are in the same directory. In - // this case, we only need to take a lock on that directory. 
oldParent.mu.Lock() - return oldParent.mu.Unlock, nil + return func() { + oldParent.mu.Unlock() + renameMu.Unlock() + }, nil } // Renaming between directories is a bit subtle: @@ -1398,7 +1397,6 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName // lock on the ancestor; to avoid this, ensure we take locks in the same // ancestor-to-descendant order. (Holding renameMu prevents this // relationship from changing.) - renameMu.Lock() // First check if newParent is a descendant of oldParent. child := newParent -- cgit v1.2.3 From d93d19fd4eefdfd868919a73c9498e7da7eb9258 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 10 Apr 2019 16:35:22 -0700 Subject: Fix uses of RootFromContext. RootFromContext can return a dirent with reference taken, or nil. We must call DecRef if (and only if) a real dirent is returned. PiperOrigin-RevId: 242965515 Change-Id: Ie2b7b4cb19ee09b6ccf788b71f3fd7efcdf35a11 --- pkg/sentry/fs/copy_up.go | 10 +++++++--- pkg/sentry/fs/file.go | 6 +++++- pkg/sentry/fs/file_overlay.go | 10 ++++++++-- pkg/sentry/fs/fsutil/file.go | 4 +++- pkg/sentry/fs/gofer/file.go | 4 +++- pkg/sentry/fs/host/file.go | 4 +++- pkg/sentry/fs/proc/fds.go | 4 +++- pkg/sentry/fs/proc/proc.go | 4 +++- pkg/sentry/fs/proc/task.go | 4 +++- pkg/sentry/fs/ramfs/dir.go | 4 +++- pkg/sentry/fs/tty/dir.go | 4 +++- pkg/sentry/kernel/kernel.go | 4 ++++ 12 files changed, 48 insertions(+), 14 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index 6d4ebaaa4..ba69e718d 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -188,11 +188,15 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { var childUpperInode *Inode parentUpper := parent.Inode.overlay.upper + root := RootFromContext(ctx) + if root != nil { + defer root.DecRef() + } // Create the file in the upper filesystem and get an Inode for it. 
switch next.Inode.StableAttr.Type { case RegularFile: - childFile, err := parentUpper.Create(ctx, RootFromContext(ctx), next.name, FileFlags{Read: true, Write: true}, attrs.Perms) + childFile, err := parentUpper.Create(ctx, root, next.name, FileFlags{Read: true, Write: true}, attrs.Perms) if err != nil { log.Warningf("copy up failed to create file: %v", err) return syserror.EIO @@ -201,7 +205,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { childUpperInode = childFile.Dirent.Inode case Directory: - if err := parentUpper.CreateDirectory(ctx, RootFromContext(ctx), next.name, attrs.Perms); err != nil { + if err := parentUpper.CreateDirectory(ctx, root, next.name, attrs.Perms); err != nil { log.Warningf("copy up failed to create directory: %v", err) return syserror.EIO } @@ -221,7 +225,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { log.Warningf("copy up failed to read symlink value: %v", err) return syserror.EIO } - if err := parentUpper.CreateLink(ctx, RootFromContext(ctx), link, next.name); err != nil { + if err := parentUpper.CreateLink(ctx, root, link, next.name); err != nil { log.Warningf("copy up failed to create symlink: %v", err) return syserror.EIO } diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index d66813103..01c18647c 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -410,7 +410,11 @@ func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { // MappedName implements memmap.MappingIdentity.MappedName. 
func (f *File) MappedName(ctx context.Context) string { - name, _ := f.Dirent.FullName(RootFromContext(ctx)) + root := RootFromContext(ctx) + if root != nil { + defer root.DecRef() + } + name, _ := f.Dirent.FullName(root) return name } diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index cd231bdef..4efe85832 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -169,7 +169,9 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See // Readdir implements FileOperations.Readdir. func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) { root := RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } dirCtx := &DirCtx{ Serializer: serializer, DirCursor: &f.dirCursor, @@ -440,7 +442,11 @@ func (omi *overlayMappingIdentity) InodeID() uint64 { // MappedName implements MappingIdentity.MappedName. func (omi *overlayMappingIdentity) MappedName(ctx context.Context) string { - name, _ := omi.overlayFile.Dirent.FullName(RootFromContext(ctx)) + root := RootFromContext(ctx) + if root != nil { + defer root.DecRef() + } + name, _ := omi.overlayFile.Dirent.FullName(root) return name } diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 32f8133fb..ce329b37a 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -277,7 +277,9 @@ func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, dirCtx *fs. // Readdir implements fs.FileOperations.Readdir. 
func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } dirCtx := &fs.DirCtx{ Serializer: serializer, DirCursor: &sdfo.dirCursor, diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 631cc80ae..e49ae2201 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -120,7 +120,9 @@ func (f *fileOperations) Release() { // Readdir implements fs.FileOperations.Readdir. func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 4e84d1d6c..d67a0795f 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -166,7 +166,9 @@ func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { // Readdir implements fs.FileOperations.Readdir. func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } dirCtx := &fs.DirCtx{ Serializer: serializer, DirCursor: &f.dirCursor, diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index b8a0a5eff..3c471bad9 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -119,7 +119,9 @@ func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error // Readlink returns the current target. 
func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { root := fs.RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } n, _ := f.Dirent.FullName(root) return n, nil } diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 88018e707..c9e659533 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -210,7 +210,9 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent // Add dot and dotdot. root := fs.RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } dot, dotdot := file.Dirent.GetDotAttrs(root) names = append(names, ".", "..") m["."] = dot diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 5a90c5578..4b1f84942 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -162,7 +162,9 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry if offset == 0 { // Serialize "." and "..". root := fs.RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } dot, dotdot := file.Dirent.GetDotAttrs(root) if err := dirCtx.DirEmit(".", dot); err != nil { return offset, err diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 05d716afb..a3b33c0f8 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -417,7 +417,9 @@ func (dfo *dirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, // Readdir implements FileOperations.Readdir. 
func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } dirCtx := &fs.DirCtx{ Serializer: serializer, DirCursor: &dfo.dirCursor, diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 485cdb456..11bf736d6 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -315,7 +315,9 @@ func (df *dirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, // Readdir implements FileOperations.Readdir. func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) - defer root.DecRef() + if root != nil { + defer root.DecRef() + } dirCtx := &fs.DirCtx{ Serializer: serializer, DirCursor: &df.dirCursor, diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index a9994f23b..b8953657c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -676,6 +676,10 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := args.Root if root == nil { root = fs.RootFromContext(ctx) + // Is the root STILL nil? 
+ if root == nil { + return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context") + } } defer root.DecRef() args.Root = nil -- cgit v1.2.3 From cc48969bb72e3efdc22746c5e7463b79b1942c2b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 10 Apr 2019 17:59:02 -0700 Subject: Internal change PiperOrigin-RevId: 242978508 Change-Id: I0ea59ac5ba1dd499e87c53f2e24709371048679b --- pkg/sentry/arch/arch_x86.go | 4 ++-- pkg/sentry/arch/signal_amd64.go | 11 ++++++----- pkg/sentry/kernel/syscalls.go | 4 ++-- pkg/sentry/loader/loader.go | 6 +++--- pkg/sentry/loader/vdso.go | 5 +++-- pkg/sentry/mm/syscalls.go | 10 +++++----- pkg/sentry/strace/strace.go | 4 ++-- pkg/sentry/syscalls/linux/error.go | 4 ++-- test/syscalls/linux/32bit.cc | 4 ++-- test/syscalls/linux/exec_binary.cc | 10 +++++----- vdso/vdso.lds | 4 ++-- 11 files changed, 34 insertions(+), 32 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index e50a76083..c8bf0e7f2 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -306,8 +306,8 @@ func (s *State) ptraceGetRegs() syscall.PtraceRegs { // FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the // same in PtraceSetRegs.) // - // TODO: Remove this fixup since newer Linux doesn't have - // this behavior anymore. + // TODO: Remove this fixup since newer Linux + // doesn't have this behavior anymore. if regs.Fs == 0 && regs.Fs_base <= 0xffffffff { regs.Fs = _FS_TLS_SEL } diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index f7f054b0b..c9de36897 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -392,15 +392,16 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt Sigset: sigset, } - // TODO: Set SignalContext64.Err, Trapno, and Cr2 based on - // the fault that caused the signal. 
For now, leave Err and Trapno - // unset and assume CR2 == info.Addr() for SIGSEGVs and SIGBUSes. + // TODO: Set SignalContext64.Err, Trapno, and Cr2 + // based on the fault that caused the signal. For now, leave Err and + // Trapno unset and assume CR2 == info.Addr() for SIGSEGVs and + // SIGBUSes. if linux.Signal(info.Signo) == linux.SIGSEGV || linux.Signal(info.Signo) == linux.SIGBUS { uc.MContext.Cr2 = info.Addr() } - // "... the value (%rsp+8) is always a multiple of 16 (...) when control is - // transferred to the function entry point." - AMD64 ABI + // "... the value (%rsp+8) is always a multiple of 16 (...) when + // control is transferred to the function entry point." - AMD64 ABI ucSize := binary.Size(uc) if ucSize < 0 { // This can only happen if we've screwed up the definition of diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 19b711e9c..7eb99718d 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -165,8 +165,8 @@ type Stracer interface { // // The returned private data is passed to SyscallExit. // - // TODO: remove kernel imports from the strace package so - // that the type can be used directly. + // TODO: remove kernel imports from the strace + // package so that the type can be used directly. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} // SyscallExit is called on syscall exit. diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index deb8892f6..80ad59dde 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -70,9 +70,9 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m defer d.DecRef() perms := fs.PermMask{ - // TODO: Linux requires only execute permission, - // not read. However, our backing filesystems may prevent us - // from reading the file without read permission. + // TODO: Linux requires only execute + // permission, not read. 
However, our backing filesystems may + // prevent us from reading the file without read permission. // // Additionally, a task with a non-readable executable has // additional constraints on access via ptrace and procfs. diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 273f6b5b9..fabf0cbe4 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -261,8 +261,9 @@ func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { return &VDSO{ ParamPage: mm.NewSpecialMappable("[vvar]", mfp, paramPage), - // TODO: Don't advertise the VDSO, as some applications may - // not be able to handle multiple [vdso] hints. + // TODO: Don't advertise the VDSO, as + // some applications may not be able to handle multiple [vdso] + // hints. vdso: mm.NewSpecialMappable("", mfp, vdso), phdrs: info.phdrs, }, nil diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index f8f095fed..cc7eb76d2 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -698,11 +698,11 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad return mm.brk.End, syserror.EINVAL } - // TODO: This enforces RLIMIT_DATA, but is slightly more - // permissive than the usual data limit. In particular, this only - // limits the size of the heap; a true RLIMIT_DATA limits the size of - // heap + data + bss. The segment sizes need to be plumbed from the - // loader package to fully enforce RLIMIT_DATA. + // TODO: This enforces RLIMIT_DATA, but is + // slightly more permissive than the usual data limit. In particular, + // this only limits the size of the heap; a true RLIMIT_DATA limits the + // size of heap + data + bss. The segment sizes need to be plumbed from + // the loader package to fully enforce RLIMIT_DATA. 
if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 6c93d7de7..a7e9df268 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -686,8 +686,8 @@ func (s SyscallMap) Name(sysno uintptr) string { // N.B. This is not in an init function because we can't be sure all syscall // tables are registered with the kernel when init runs. // -// TODO: remove kernel package dependencies from this package and -// have the kernel package self-initialize all syscall tables. +// TODO: remove kernel package dependencies from this +// package and have the kernel package self-initialize all syscall tables. func Initialize() { for _, table := range kernel.SyscallTables() { // Is this known? diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index e86bed313..8759e5e32 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -89,8 +89,8 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // side is gone. The partial write is returned. EPIPE will be // returned on the next call. // - // TODO: In some cases SIGPIPE should also be sent - // to the application. + // TODO: In some cases SIGPIPE should + // also be sent to the application. return nil case syserror.ErrWouldBlock: // Syscall would block, but completed a partial read/write. diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc index b8d5f0355..230648c9b 100644 --- a/test/syscalls/linux/32bit.cc +++ b/test/syscalls/linux/32bit.cc @@ -84,8 +84,8 @@ TEST(Syscall32Bit, Int80) { // disabled). return; case Platform::kPtrace: - // TODO: The ptrace platform does not have a consistent story - // here. + // TODO: The ptrace platform does not have a + // consistent story here. 
return; case Platform::kNative: break; diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc index cfc898699..cb5c7ae51 100644 --- a/test/syscalls/linux/exec_binary.cc +++ b/test/syscalls/linux/exec_binary.cc @@ -285,9 +285,9 @@ ElfBinary<64> StandardElf() { elf.header.e_phoff = sizeof(elf.header); elf.header.e_phentsize = sizeof(decltype(elf)::ElfPhdr); - // TODO: Always include a PT_GNU_STACK segment to disable - // executable stacks. With this omitted the stack (and all PROT_READ) mappings - // should be executable, but gVisor doesn't support that. + // TODO: Always include a PT_GNU_STACK segment to + // disable executable stacks. With this omitted the stack (and all PROT_READ) + // mappings should be executable, but gVisor doesn't support that. decltype(elf)::ElfPhdr phdr = {}; phdr.p_type = PT_GNU_STACK; phdr.p_flags = PF_R | PF_W; @@ -1005,8 +1005,8 @@ TEST(ElfTest, NoExecute) { // Execute, but no read permissions on the binary works just fine. TEST(ElfTest, NoRead) { - // TODO: gVisor's backing filesystem may prevent the sentry from - // reading the executable. + // TODO: gVisor's backing filesystem may prevent the + // sentry from reading the executable. SKIP_IF(IsRunningOnGvisor()); ElfBinary<64> elf = StandardElf(); diff --git a/vdso/vdso.lds b/vdso/vdso.lds index 97bb6d0c1..166779931 100644 --- a/vdso/vdso.lds +++ b/vdso/vdso.lds @@ -56,8 +56,8 @@ SECTIONS { .altinstr_replacement : { *(.altinstr_replacement) } /* - * TODO: Remove this alignment? Then the VDSO would fit in a - * single page. + * TODO: Remove this alignment? Then the VDSO would fit + * in a single page. */ . = ALIGN(0x1000); .text : { *(.text*) } :text =0x90909090 -- cgit v1.2.3 From 4209edafb6a9eeff8741a4360100557179b47b35 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 11 Apr 2019 00:41:42 -0700 Subject: Use open fids when fstat()ing gofer files. 
PiperOrigin-RevId: 243018347 Change-Id: I1e5b80607c1df0747482abea61db7fcf24536d37 --- pkg/sentry/fs/ashmem/area.go | 9 ++++---- pkg/sentry/fs/binder/binder.go | 7 +++--- pkg/sentry/fs/dev/full.go | 19 +++++++-------- pkg/sentry/fs/dev/null.go | 40 +++++++++++++++++--------------- pkg/sentry/fs/dev/random.go | 19 +++++++-------- pkg/sentry/fs/fdpipe/pipe.go | 15 ++++++------ pkg/sentry/fs/file.go | 12 ++++++++++ pkg/sentry/fs/file_operations.go | 6 +++++ pkg/sentry/fs/file_overlay.go | 26 +++++++++++++++++++++ pkg/sentry/fs/filetest/filetest.go | 17 +++++++------- pkg/sentry/fs/fsutil/file.go | 35 ++++++++++++++++++---------- pkg/sentry/fs/fsutil/inode.go | 19 +++++++-------- pkg/sentry/fs/gofer/file.go | 16 +++++++++++++ pkg/sentry/fs/host/file.go | 5 ++-- pkg/sentry/fs/inotify.go | 5 ++++ pkg/sentry/fs/proc/exec_args.go | 19 +++++++-------- pkg/sentry/fs/proc/fds.go | 21 +++++++++-------- pkg/sentry/fs/proc/proc.go | 3 ++- pkg/sentry/fs/proc/rpcinet_proc.go | 17 +++++++------- pkg/sentry/fs/proc/seqfile/seqfile.go | 17 +++++++------- pkg/sentry/fs/proc/sys.go | 19 +++++++-------- pkg/sentry/fs/proc/sys_net.go | 34 ++++++++++++++------------- pkg/sentry/fs/proc/task.go | 41 ++++++++++++++++++--------------- pkg/sentry/fs/proc/uid_gid_map.go | 17 +++++++------- pkg/sentry/fs/proc/uptime.go | 19 +++++++-------- pkg/sentry/fs/ramfs/dir.go | 3 ++- pkg/sentry/fs/ramfs/socket.go | 21 +++++++++-------- pkg/sentry/fs/ramfs/symlink.go | 21 +++++++++-------- pkg/sentry/fs/timerfd/timerfd.go | 13 ++++++----- pkg/sentry/fs/tmpfs/file_regular.go | 15 ++++++------ pkg/sentry/fs/tty/dir.go | 15 ++++++------ pkg/sentry/fs/tty/master.go | 11 +++++---- pkg/sentry/fs/tty/slave.go | 11 +++++---- pkg/sentry/kernel/epoll/epoll.go | 13 ++++++----- pkg/sentry/kernel/eventfd/eventfd.go | 15 ++++++------ pkg/sentry/kernel/pipe/reader_writer.go | 11 +++++---- pkg/sentry/loader/vdso.go | 17 +++++++------- pkg/sentry/socket/epsocket/epsocket.go | 11 +++++---- 
pkg/sentry/socket/hostinet/socket.go | 11 +++++---- pkg/sentry/socket/netlink/socket.go | 11 +++++---- pkg/sentry/socket/rpcinet/socket.go | 11 +++++---- pkg/sentry/socket/unix/unix.go | 11 +++++---- pkg/sentry/syscalls/linux/sys_stat.go | 37 +++++++++++++++++++---------- 43 files changed, 422 insertions(+), 293 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index 710b5185f..651cbc164 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -42,10 +42,11 @@ const ( // // +stateify savable type Area struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` ad *Device diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index 188353961..a41b5dcae 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -85,9 +85,10 @@ func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) // // +stateify savable type Proc struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` bd *Device task *kernel.Task diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 0cb513004..82da9aae9 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -59,15 +59,16 @@ func (f *fullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type fullFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - 
fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - readZeros `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + readZeros `state:"nosave"` } var _ fs.FileOperations = (*fullFileOperations)(nil) diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 83f43c203..5d306d352 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -60,16 +60,17 @@ func (n *nullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type nullFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRead `state:"nosave"` - fsutil.FileNoopWrite `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRead `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` } var _ fs.FileOperations = (*nullFileOperations)(nil) @@ 
-100,15 +101,16 @@ func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // +stateify savable type zeroFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoopWrite `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - readZeros `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + readZeros `state:"nosave"` } var _ fs.FileOperations = (*zeroFileOperations)(nil) diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index b9b78db7a..ffd5cf6c3 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -57,15 +57,16 @@ func (*randomDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type randomFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoopWrite `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + 
fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` } var _ fs.FileOperations = (*randomFileOperations)(nil) diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index b4d11cb45..98483ab68 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -37,13 +37,14 @@ import ( // // +stateify savable type pipeOperations struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - waiter.Queue `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.Queue `state:"nosave"` // flags are the flags used to open the pipe. flags fs.FileFlags `state:".(fs.FileFlags)"` diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 01c18647c..2c2126f17 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -408,6 +408,18 @@ func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return f.FileOperations.ConfigureMMap(ctx, f, opts) } +// UnstableAttr calls f.FileOperations.UnstableAttr with f as the File. +// +// Returns syserror.ErrInterrupted if interrupted. +func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) { + if !f.mu.Lock(ctx) { + return UnstableAttr{}, syserror.ErrInterrupted + } + defer f.mu.Unlock() + + return f.FileOperations.UnstableAttr(ctx, f) +} + // MappedName implements memmap.MappingIdentity.MappedName. 
func (f *File) MappedName(ctx context.Context) string { root := RootFromContext(ctx) diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index 81c6e2b5d..e0fa5135f 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -96,6 +96,12 @@ type FileOperations interface { // memmap.Mappable. ConfigureMMap(ctx context.Context, file *File, opts *memmap.MMapOpts) error + // UnstableAttr returns the "unstable" attributes of the inode represented + // by the file. Most implementations can embed + // fsutil.FileUseInodeUnstableAttr, which delegates to + // InodeOperations.UnstableAttr. + UnstableAttr(ctx context.Context, file *File) (UnstableAttr, error) + // Ioctl implements the ioctl(2) linux syscall. // // io provides access to the virtual memory space to which pointers in args diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 4efe85832..e1f02f0f4 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -339,6 +339,32 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt return nil } +// UnstableAttr implements fs.FileOperations.UnstableAttr. +func (f *overlayFileOperations) UnstableAttr(ctx context.Context, file *File) (UnstableAttr, error) { + // Hot path. Avoid defers. + f.upperMu.Lock() + if f.upper != nil { + attr, err := f.upper.UnstableAttr(ctx) + f.upperMu.Unlock() + return attr, err + } + f.upperMu.Unlock() + + // It's possible that copy-up has occurred, but we haven't opened a upper + // file yet. If this is the case, just use the upper inode's UnstableAttr + // rather than opening a file. + o := file.Dirent.Inode.overlay + o.copyMu.RLock() + if o.upper != nil { + attr, err := o.upper.UnstableAttr(ctx) + o.copyMu.RUnlock() + return attr, err + } + o.copyMu.RUnlock() + + return f.lower.UnstableAttr(ctx) +} + // Ioctl implements fs.FileOperations.Ioctl and always returns ENOTTY. 
func (*overlayFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { return 0, syserror.ENOTTY diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 40d84d9f2..388a1ce36 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -31,14 +31,15 @@ import ( // TestFileOperations is an implementation of the File interface. It provides all // required methods. type TestFileOperations struct { - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } // NewTestFile creates and initializes a new test file. diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index ce329b37a..df34dc788 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -224,7 +224,7 @@ func (FileNoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallAr } // DirFileOperations implements most of fs.FileOperations for directories, -// except for Readdir which the embedding type must implement. +// except for Readdir and UnstableAttr which the embedding type must implement. 
type DirFileOperations struct { waiter.AlwaysReady FileGenericSeek @@ -250,7 +250,8 @@ func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, i // // +stateify savable type StaticDirFileOperations struct { - DirFileOperations + DirFileOperations `state:"nosave"` + FileUseInodeUnstableAttr `state:"nosave"` // dentryMap is a SortedDentryMap used to implement Readdir. dentryMap *fs.SortedDentryMap @@ -291,16 +292,17 @@ func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, // // +stateify savable type NoReadWriteFile struct { - waiter.AlwaysReady `state:"nosave"` - FileGenericSeek `state:"nosave"` - FileNoIoctl `state:"nosave"` - FileNoMMap `state:"nosave"` - FileNoopFsync `state:"nosave"` - FileNoopFlush `state:"nosave"` - FileNoopRelease `state:"nosave"` - FileNoRead `state:"nosave"` - FileNoWrite `state:"nosave"` - FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + FileGenericSeek `state:"nosave"` + FileNoIoctl `state:"nosave"` + FileNoMMap `state:"nosave"` + FileNoopFsync `state:"nosave"` + FileNoopFlush `state:"nosave"` + FileNoopRelease `state:"nosave"` + FileNoRead `state:"nosave"` + FileNoWrite `state:"nosave"` + FileNotDirReaddir `state:"nosave"` + FileUseInodeUnstableAttr `state:"nosave"` } var _ fs.FileOperations = (*NoReadWriteFile)(nil) @@ -365,3 +367,12 @@ type FileNoopRead struct{} func (FileNoopRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { return 0, nil } + +// FileUseInodeUnstableAttr implements fs.FileOperations.UnstableAttr by calling +// InodeOperations.UnstableAttr. +type FileUseInodeUnstableAttr struct{} + +// UnstableAttr implements fs.FileOperations.UnstableAttr. 
+func (FileUseInodeUnstableAttr) UnstableAttr(ctx context.Context, file *fs.File) (fs.UnstableAttr, error) { + return file.Dirent.Inode.UnstableAttr(ctx) +} diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 37490e5b2..468171a9b 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -238,15 +238,16 @@ func (i *InodeSimpleExtendedAttributes) Listxattr(_ *fs.Inode) (map[string]struc // // +stateify savable type staticFile struct { - waiter.AlwaysReady `state:"nosave"` - FileGenericSeek `state:"nosave"` - FileNoIoctl `state:"nosave"` - FileNoMMap `state:"nosave"` - FileNoopFsync `state:"nosave"` - FileNoopFlush `state:"nosave"` - FileNoopRelease `state:"nosave"` - FileNoopWrite `state:"nosave"` - FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + FileGenericSeek `state:"nosave"` + FileNoIoctl `state:"nosave"` + FileNoMMap `state:"nosave"` + FileNoopFsync `state:"nosave"` + FileNoopFlush `state:"nosave"` + FileNoopRelease `state:"nosave"` + FileNoopWrite `state:"nosave"` + FileNotDirReaddir `state:"nosave"` + FileUseInodeUnstableAttr `state:"nosave"` FileStaticContentReader } diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index e49ae2201..80d1e08a6 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -310,6 +310,22 @@ func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts return f.inodeOperations.configureMMap(file, opts) } +// UnstableAttr implements fs.FileOperations.UnstableAttr. +func (f *fileOperations) UnstableAttr(ctx context.Context, file *fs.File) (fs.UnstableAttr, error) { + s := f.inodeOperations.session() + if s.cachePolicy.cacheUAttrs(file.Dirent.Inode) { + return f.inodeOperations.cachingInodeOps.UnstableAttr(ctx, file.Dirent.Inode) + } + // Use f.handles.File, which represents 9P fids that have been opened, + // instead of inodeFileState.file, which represents 9P fids that have not. 
+ // This may be significantly more efficient in some implementations. + _, valid, pattr, err := getattr(ctx, f.handles.File) + if err != nil { + return fs.UnstableAttr{}, err + } + return unstable(ctx, valid, pattr, s.mounter, s.client), nil +} + // Seek implements fs.FileOperations.Seek. func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index d67a0795f..2a8f285ff 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -36,8 +36,9 @@ import ( // // +stateify savable type fileOperations struct { - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // iops are the Inode operations for this file. iops *inodeOperations `state:"wait"` diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 5d6a7074b..59fa662f3 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -186,6 +186,11 @@ func (*Inotify) ConfigureMMap(context.Context, *File, *memmap.MMapOpts) error { return syserror.ENODEV } +// UnstableAttr implements FileOperations.UnstableAttr. +func (i *Inotify) UnstableAttr(ctx context.Context, file *File) (UnstableAttr, error) { + return file.Dirent.Inode.UnstableAttr(ctx) +} + // Ioctl implements fs.FileOperations.Ioctl. func (i *Inotify) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch args[1].Int() { diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 9daad5d2b..fc21dfbbd 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -77,15 +77,16 @@ func (i *execArgInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs. 
// +stateify savable type execArgFile struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopWrite `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // arg is the type of exec argument this file contains. arg execArgType diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 3c471bad9..939ebaba1 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -91,7 +91,7 @@ func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func // fd implements fs.InodeOperations for a file in /proc/TID/fd/. type fd struct { ramfs.Symlink - *fs.File + file *fs.File } var _ fs.InodeOperations = (*fd)(nil) @@ -103,7 +103,7 @@ func newFd(t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { fd := &fd{ // RootOwner overridden by taskOwnedInodeOps.UnstableAttrs(). Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), - File: f, + file: f, } return newProcInode(fd, msrc, fs.Symlink, t) } @@ -112,8 +112,8 @@ func newFd(t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { // arguments are ignored. func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) { // Take a reference on the fs.File. - f.File.IncRef() - return f.File, nil + f.file.IncRef() + return f.file, nil } // Readlink returns the current target. 
@@ -122,14 +122,14 @@ func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { if root != nil { defer root.DecRef() } - n, _ := f.Dirent.FullName(root) + n, _ := f.file.Dirent.FullName(root) return n, nil } // Getlink implements fs.InodeOperations.Getlink. func (f *fd) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { - f.Dirent.IncRef() - return f.Dirent, nil + f.file.Dirent.IncRef() + return f.file.Dirent, nil } // Truncate is ignored. @@ -139,12 +139,12 @@ func (f *fd) Truncate(context.Context, *fs.Inode, int64) error { func (f *fd) Release(ctx context.Context) { f.Symlink.Release(ctx) - f.File.DecRef() + f.file.DecRef() } // Close releases the reference on the file. func (f *fd) Close() error { - f.DecRef() + f.file.DecRef() return nil } @@ -212,7 +212,8 @@ func (f *fdDir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFla // +stateify savable type fdDirFile struct { - fsutil.DirFileOperations `state:"nosave"` + fsutil.DirFileOperations `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` isInfoFile bool diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index c9e659533..64e1e1998 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -191,7 +191,8 @@ func (p *proc) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlag // // +stateify savable type rootProcFile struct { - fsutil.DirFileOperations `state:"nosave"` + fsutil.DirFileOperations `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` iops *proc } diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index 65faa21f2..81f64a28b 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -60,14 +60,15 @@ func (i *rpcInetInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs. // rpcInetFile implements fs.FileOperations as RPCs. 
type rpcInetFile struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` inode *rpcInetInode } diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 16fc6789e..0a0eb45e2 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -183,14 +183,15 @@ func (s *SeqFile) updateSourceLocked(ctx context.Context, record int) { // // +stateify savable type seqFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` seqFile *SeqFile } diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index ee6b9f262..a7bc9198e 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -134,15 +134,16 
@@ var _ fs.InodeOperations = (*hostname)(nil) // +stateify savable type hostnameFile struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoSeek `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` } // Read implements fs.FileOperations.Read. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 42e9bc47f..728a46a74 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -85,14 +85,15 @@ func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // +stateify savable type tcpMemFile struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` tcpMemInode *tcpMemInode } @@ -197,14 +198,15 @@ func (s *tcpSack) GetFile(ctx 
context.Context, dirent *fs.Dirent, flags fs.FileF // +stateify savable type tcpSackFile struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` tcpSack *tcpSack diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 4b1f84942..0edcdfce2 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -142,7 +142,8 @@ func (s *subtasks) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.File // +stateify savable type subtasksFile struct { - fsutil.DirFileOperations `state:"nosave"` + fsutil.DirFileOperations `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` t *kernel.Task pidns *kernel.PIDNamespace @@ -669,15 +670,16 @@ func (c *comm) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlag // +stateify savable type commFile struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush 
`state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` t *kernel.Task } @@ -724,15 +726,16 @@ func (a *auxvec) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFl // +stateify savable type auxvecFile struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` t *kernel.Task } diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 0c68bbfc9..d433632cf 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -81,14 +81,15 @@ func (imio *idMapInodeOperations) GetFile(ctx context.Context, dirent *fs.Dirent // +stateify savable type idMapFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap 
`state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` iops *idMapInodeOperations } diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index 40d0fd1fd..d7ae26fcf 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -54,15 +54,16 @@ func (u *uptime) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFl // +stateify savable type uptimeFile struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` startTime ktime.Time } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index a3b33c0f8..011cf3a16 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -388,7 +388,8 @@ func (*Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, new // // +stateify savable type dirFileOperations struct { - fsutil.DirFileOperations `state:"nosave"` + fsutil.DirFileOperations `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // dirCursor contains the name of the last directory entry that was // serialized. 
diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 2c1295897..5bcb6c364 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -67,16 +67,17 @@ func (s *Socket) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFl // +stateify savable type socketFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoRead `state:"nosave"` - fsutil.FileNoSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoRead `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` } var _ fs.FileOperations = (*socketFileOperations)(nil) diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 47dae380b..35dabdad2 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -88,16 +88,17 @@ func (s *Symlink) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileF // +stateify savable type symlinkFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoRead `state:"nosave"` - fsutil.FileNoSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` 
+ fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoRead `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` } var _ fs.FileOperations = (*symlinkFileOperations)(nil) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index b26466b9d..ef9a08854 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -33,12 +33,13 @@ import ( // // +stateify savable type TimerOperations struct { - fsutil.FileZeroSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileZeroSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` events waiter.Queue `state:"zerovalue"` timer *ktime.Timer diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index be6298130..d0c9b8bea 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -28,13 +28,14 @@ import ( // // +stateify savable type regularFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + 
fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // iops is the InodeOperations of a regular tmpfs file. It is // guaranteed to be the same as file.Dirent.Inode.InodeOperations, diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 11bf736d6..33b4c6438 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -285,13 +285,14 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { // // +stateify savable type dirFileOperations struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // di is the inode operations. 
di *dirInodeOperations diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index b5e13ab36..7c256abb0 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -95,11 +95,12 @@ func (mi *masterInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flag // // +stateify savable type masterFileOperations struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // d is the containing dir. d *dirInodeOperations diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 6dbce90b4..e8368bcdd 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -84,11 +84,12 @@ func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags // // +stateify savable type slaveFileOperations struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // si is the inode operations. 
si *slaveInodeOperations diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 61c0fb7c5..befefb11c 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -98,12 +98,13 @@ func (p *pollEntry) WeakRefGone() { // // +stateify savable type EventPoll struct { - fsutil.FilePipeSeek `state:"zerovalue"` - fsutil.FileNotDirReaddir `state:"zerovalue"` - fsutil.FileNoFsync `state:"zerovalue"` - fsutil.FileNoopFlush `state:"zerovalue"` - fsutil.FileNoMMap `state:"zerovalue"` - fsutil.FileNoIoctl `state:"zerovalue"` + fsutil.FilePipeSeek `state:"zerovalue"` + fsutil.FileNotDirReaddir `state:"zerovalue"` + fsutil.FileNoFsync `state:"zerovalue"` + fsutil.FileNoopFlush `state:"zerovalue"` + fsutil.FileNoMMap `state:"zerovalue"` + fsutil.FileNoIoctl `state:"zerovalue"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // Wait queue is used to notify interested parties when the event poll // object itself becomes readable or writable. diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 2d43c986d..b448ad813 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -38,13 +38,14 @@ import ( // // +stateify savable type EventOperations struct { - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` // Mutex that protects accesses to the fields of this event. 
mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 028175530..1090432d7 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -35,11 +35,12 @@ import ( // // +stateify savable type ReaderWriter struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` *Pipe } diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index fabf0cbe4..18b7e90d8 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -52,14 +52,15 @@ func (f *fileContext) Value(key interface{}) interface{} { // byteReader implements fs.FileOperations for reading from a []byte source. 
type byteReader struct { - waiter.AlwaysReady `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FilePipeSeek `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` data []byte } diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 5bcafad98..78f43178f 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -210,11 +210,12 @@ type commonEndpoint interface { // // +stateify savable type SocketOperations struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout *waiter.Queue diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index a0a8a3220..be63823d8 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -46,11 +46,12 @@ const ( // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. 
type socketOperations struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout family int // Read-only. diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 7223773ad..2503a67c5 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -65,11 +65,12 @@ var netlinkSocketDevice = device.NewAnonDevice() // // +stateify savable type Socket struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout // ports provides netlink port allocation. diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 548a22f32..896b5b7ce 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -45,11 +45,12 @@ import ( // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. 
type socketOperations struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout family int // Read-only. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index c857a0f33..92411c901 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -45,11 +45,12 @@ import ( // // +stateify savable type SocketOperations struct { - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` refs.AtomicRefCount socket.SendReceiveTimeout diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index bdfb9b3ef..02634b2dd 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -60,7 +60,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } defer file.DecRef() - return 0, nil, stat(t, file.Dirent, false, statAddr) + return 0, nil, fstat(t, file, statAddr) } return 0, nil, fileOpOn(t, fd, path, flags&linux.AT_SYMLINK_NOFOLLOW == 0, func(root *fs.Dirent, d *fs.Dirent) error { @@ -98,7 +98,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } defer file.DecRef() - return 0, nil, stat(t, 
file.Dirent, false /* dirPath */, statAddr) + return 0, nil, fstat(t, file, statAddr) } // stat implements stat from the given *fs.Dirent. @@ -110,9 +110,26 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err if err != nil { return err } + return copyOutStat(t, statAddr, d.Inode.StableAttr, uattr) +} + +// fstat implements fstat for the given *fs.File. +func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error { + uattr, err := f.UnstableAttr(t) + if err != nil { + return err + } + return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr) +} +// copyOutStat copies the attributes (sattr, uattr) to the struct stat at +// address dst in t's address space. It encodes the stat struct to bytes +// manually, as stat() is a very common syscall for many applications, and +// t.CopyObjectOut has noticeable performance impact due to its many slice +// allocations and use of reflection. +func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error { var mode uint32 - switch d.Inode.StableAttr.Type { + switch sattr.Type { case fs.RegularFile, fs.SpecialFile: mode |= linux.ModeRegular case fs.Symlink: @@ -129,16 +146,12 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err mode |= linux.ModeSocket } - // We encode the stat struct to bytes manually, as stat() is a very - // common syscall for many applications, and t.CopyObjectOut has - // noticeable performance impact due to its many slice allocations and - // use of reflection. 
b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] // Dev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(d.Inode.StableAttr.DeviceID)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID)) // Ino (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(d.Inode.StableAttr.InodeID)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID)) // Nlink (uint64) b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links) // Mode (uint32) @@ -150,11 +163,11 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err // Padding (uint32) b = binary.AppendUint32(b, usermem.ByteOrder, 0) // Rdev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(d.Inode.StableAttr.DeviceFileMajor, d.Inode.StableAttr.DeviceFileMinor))) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor))) // Size (uint64) b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) // Blksize (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(d.Inode.StableAttr.BlockSize)) + b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize)) // Blocks (uint64) b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) @@ -173,7 +186,7 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec)) b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) - _, err = t.CopyOutBytes(statAddr, b) + _, err := t.CopyOutBytes(dst, b) return err } -- cgit v1.2.3 From 6b24f7ab0863004a30c2f1aff88440fbb4cf3b3c Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 11 Apr 2019 16:47:26 -0700 Subject: Format FDs in strace logs Normal files display their path in the current mount namespace: I0410 10:57:54.964196 216336 x:0] [ 1] ls X read(0x3 /proc/filesystems, 0x55cee3bdb2c0 "nodev\t9p\nnodev\tdevpts 
\nnodev\tdevtmpfs\nnodev\tproc\nnodev\tramdiskfs\nnodev\tsysfs\nnodev\ttmpfs\n", 0x1000) = 0x58 (24.462?s) AT_FDCWD includes the CWD: I0411 12:58:48.278427 1526 x:0] [ 1] stat_test E newfstatat(AT_FDCWD /home/prattmic, 0x55ea719b564e /proc/self, 0x7ef5cefc2be8, 0x0) Sockets (and other non-vfs files) display an inode number (like /proc/PID/fd): I0410 10:54:38.909123 207684 x:0] [ 1] nc E bind(0x3 socket:[1], 0x55b5a1652040 {Family: AF_INET, Addr: , Port: 8080}, 0x10) I also fixed a few syscall args that should be Path. PiperOrigin-RevId: 243169025 Change-Id: Ic7dda6a82ae27062fe2a4a371557acfd6a21fa2a --- pkg/sentry/strace/BUILD | 1 + pkg/sentry/strace/linux64.go | 152 +++++++++++++++++++++--------------------- pkg/sentry/strace/strace.go | 32 +++++++++ pkg/sentry/strace/syscalls.go | 3 + 4 files changed, 112 insertions(+), 76 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 73f1e9814..bcd94b42e 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/kdefs", "//pkg/sentry/socket/control", "//pkg/sentry/socket/epsocket", "//pkg/sentry/socket/netlink", diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index ca695e80f..22b76449c 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -17,27 +17,27 @@ package strace // linuxAMD64 provides a mapping of the Linux amd64 syscalls and their argument // types for display / formatting. 
var linuxAMD64 = SyscallMap{ - 0: makeSyscallInfo("read", Hex, ReadBuffer, Hex), - 1: makeSyscallInfo("write", Hex, WriteBuffer, Hex), + 0: makeSyscallInfo("read", FD, ReadBuffer, Hex), + 1: makeSyscallInfo("write", FD, WriteBuffer, Hex), 2: makeSyscallInfo("open", Path, OpenFlags, Mode), - 3: makeSyscallInfo("close", Hex), + 3: makeSyscallInfo("close", FD), 4: makeSyscallInfo("stat", Path, Stat), - 5: makeSyscallInfo("fstat", Hex, Stat), + 5: makeSyscallInfo("fstat", FD, Stat), 6: makeSyscallInfo("lstat", Path, Stat), 7: makeSyscallInfo("poll", Hex, Hex, Hex), 8: makeSyscallInfo("lseek", Hex, Hex, Hex), - 9: makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, Hex, Hex), + 9: makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, FD, Hex), 10: makeSyscallInfo("mprotect", Hex, Hex, Hex), 11: makeSyscallInfo("munmap", Hex, Hex), 12: makeSyscallInfo("brk", Hex), 13: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction), 14: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex), 15: makeSyscallInfo("rt_sigreturn"), - 16: makeSyscallInfo("ioctl", Hex, Hex, Hex), - 17: makeSyscallInfo("pread64", Hex, ReadBuffer, Hex, Hex), - 18: makeSyscallInfo("pwrite64", Hex, WriteBuffer, Hex, Hex), - 19: makeSyscallInfo("readv", Hex, ReadIOVec, Hex), - 20: makeSyscallInfo("writev", Hex, WriteIOVec, Hex), + 16: makeSyscallInfo("ioctl", FD, Hex, Hex), + 17: makeSyscallInfo("pread64", FD, ReadBuffer, Hex, Hex), + 18: makeSyscallInfo("pwrite64", FD, WriteBuffer, Hex, Hex), + 19: makeSyscallInfo("readv", FD, ReadIOVec, Hex), + 20: makeSyscallInfo("writev", FD, WriteIOVec, Hex), 21: makeSyscallInfo("access", Path, Oct), 22: makeSyscallInfo("pipe", PipeFDs), 23: makeSyscallInfo("select", Hex, Hex, Hex, Hex, Timeval), @@ -49,30 +49,30 @@ var linuxAMD64 = SyscallMap{ 29: makeSyscallInfo("shmget", Hex, Hex, Hex), 30: makeSyscallInfo("shmat", Hex, Hex, Hex), 31: makeSyscallInfo("shmctl", Hex, Hex, Hex), - 32: makeSyscallInfo("dup", Hex), - 33: makeSyscallInfo("dup2", Hex, Hex), + 
32: makeSyscallInfo("dup", FD), + 33: makeSyscallInfo("dup2", FD, FD), 34: makeSyscallInfo("pause"), 35: makeSyscallInfo("nanosleep", Timespec, PostTimespec), 36: makeSyscallInfo("getitimer", ItimerType, PostItimerVal), 37: makeSyscallInfo("alarm", Hex), 38: makeSyscallInfo("setitimer", ItimerType, ItimerVal, PostItimerVal), 39: makeSyscallInfo("getpid"), - 40: makeSyscallInfo("sendfile", Hex, Hex, Hex, Hex), + 40: makeSyscallInfo("sendfile", FD, FD, Hex, Hex), 41: makeSyscallInfo("socket", SockFamily, SockType, SockProtocol), - 42: makeSyscallInfo("connect", Hex, SockAddr, Hex), - 43: makeSyscallInfo("accept", Hex, PostSockAddr, SockLen), - 44: makeSyscallInfo("sendto", Hex, Hex, Hex, Hex, SockAddr, Hex), - 45: makeSyscallInfo("recvfrom", Hex, Hex, Hex, Hex, PostSockAddr, SockLen), - 46: makeSyscallInfo("sendmsg", Hex, SendMsgHdr, Hex), - 47: makeSyscallInfo("recvmsg", Hex, RecvMsgHdr, Hex), - 48: makeSyscallInfo("shutdown", Hex, Hex), - 49: makeSyscallInfo("bind", Hex, SockAddr, Hex), - 50: makeSyscallInfo("listen", Hex, Hex), - 51: makeSyscallInfo("getsockname", Hex, PostSockAddr, SockLen), - 52: makeSyscallInfo("getpeername", Hex, PostSockAddr, SockLen), + 42: makeSyscallInfo("connect", FD, SockAddr, Hex), + 43: makeSyscallInfo("accept", FD, PostSockAddr, SockLen), + 44: makeSyscallInfo("sendto", FD, Hex, Hex, Hex, SockAddr, Hex), + 45: makeSyscallInfo("recvfrom", FD, Hex, Hex, Hex, PostSockAddr, SockLen), + 46: makeSyscallInfo("sendmsg", FD, SendMsgHdr, Hex), + 47: makeSyscallInfo("recvmsg", FD, RecvMsgHdr, Hex), + 48: makeSyscallInfo("shutdown", FD, Hex), + 49: makeSyscallInfo("bind", FD, SockAddr, Hex), + 50: makeSyscallInfo("listen", FD, Hex), + 51: makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen), + 52: makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen), 53: makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex), - 54: makeSyscallInfo("setsockopt", Hex, Hex, Hex, Hex, Hex), - 55: makeSyscallInfo("getsockopt", Hex, Hex, Hex, 
Hex, Hex), + 54: makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex), + 55: makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex), 56: makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex), 57: makeSyscallInfo("fork"), 58: makeSyscallInfo("vfork"), @@ -89,16 +89,16 @@ var linuxAMD64 = SyscallMap{ 69: makeSyscallInfo("msgsnd", Hex, Hex, Hex, Hex), 70: makeSyscallInfo("msgrcv", Hex, Hex, Hex, Hex, Hex), 71: makeSyscallInfo("msgctl", Hex, Hex, Hex), - 72: makeSyscallInfo("fcntl", Hex, Hex, Hex), - 73: makeSyscallInfo("flock", Hex, Hex), - 74: makeSyscallInfo("fsync", Hex), - 75: makeSyscallInfo("fdatasync", Hex), + 72: makeSyscallInfo("fcntl", FD, Hex, Hex), + 73: makeSyscallInfo("flock", FD, Hex), + 74: makeSyscallInfo("fsync", FD), + 75: makeSyscallInfo("fdatasync", FD), 76: makeSyscallInfo("truncate", Path, Hex), - 77: makeSyscallInfo("ftruncate", Hex, Hex), - 78: makeSyscallInfo("getdents", Hex, Hex, Hex), + 77: makeSyscallInfo("ftruncate", FD, Hex), + 78: makeSyscallInfo("getdents", FD, Hex, Hex), 79: makeSyscallInfo("getcwd", PostPath, Hex), 80: makeSyscallInfo("chdir", Path), - 81: makeSyscallInfo("fchdir", Hex), + 81: makeSyscallInfo("fchdir", FD), 82: makeSyscallInfo("rename", Path, Path), 83: makeSyscallInfo("mkdir", Path, Oct), 84: makeSyscallInfo("rmdir", Path), @@ -108,10 +108,10 @@ var linuxAMD64 = SyscallMap{ 88: makeSyscallInfo("symlink", Path, Path), 89: makeSyscallInfo("readlink", Path, ReadBuffer, Hex), 90: makeSyscallInfo("chmod", Path, Mode), - 91: makeSyscallInfo("fchmod", Hex, Mode), + 91: makeSyscallInfo("fchmod", FD, Mode), 92: makeSyscallInfo("chown", Path, Hex, Hex), - 93: makeSyscallInfo("fchown", Hex, Hex, Hex), - 94: makeSyscallInfo("lchown", Hex, Hex, Hex), + 93: makeSyscallInfo("fchown", FD, Hex, Hex), + 94: makeSyscallInfo("lchown", Path, Hex, Hex), 95: makeSyscallInfo("umask", Hex), 96: makeSyscallInfo("gettimeofday", Timeval, Hex), 97: makeSyscallInfo("getrlimit", Hex, Hex), @@ -155,7 +155,7 @@ var linuxAMD64 = SyscallMap{ 135: 
makeSyscallInfo("personality", Hex), 136: makeSyscallInfo("ustat", Hex, Hex), 137: makeSyscallInfo("statfs", Path, Hex), - 138: makeSyscallInfo("fstatfs", Hex, Hex), + 138: makeSyscallInfo("fstatfs", FD, Hex), 139: makeSyscallInfo("sysfs", Hex, Hex, Hex), 140: makeSyscallInfo("getpriority", Hex, Hex), 141: makeSyscallInfo("setpriority", Hex, Hex, Hex), @@ -172,7 +172,7 @@ var linuxAMD64 = SyscallMap{ 152: makeSyscallInfo("munlockall"), 153: makeSyscallInfo("vhangup"), 154: makeSyscallInfo("modify_ldt", Hex, Hex, Hex), - 155: makeSyscallInfo("pivot_root", Hex, Hex), + 155: makeSyscallInfo("pivot_root", Path, Path), 156: makeSyscallInfo("_sysctl", Hex), 157: makeSyscallInfo("prctl", Hex, Hex, Hex, Hex, Hex), 158: makeSyscallInfo("arch_prctl", Hex, Hex), @@ -207,16 +207,16 @@ var linuxAMD64 = SyscallMap{ 187: makeSyscallInfo("readahead", Hex, Hex, Hex), 188: makeSyscallInfo("setxattr", Path, Path, Hex, Hex, Hex), 189: makeSyscallInfo("lsetxattr", Path, Path, Hex, Hex, Hex), - 190: makeSyscallInfo("fsetxattr", Hex, Path, Hex, Hex, Hex), + 190: makeSyscallInfo("fsetxattr", FD, Path, Hex, Hex, Hex), 191: makeSyscallInfo("getxattr", Path, Path, Hex, Hex), 192: makeSyscallInfo("lgetxattr", Path, Path, Hex, Hex), - 193: makeSyscallInfo("fgetxattr", Hex, Path, Hex, Hex), + 193: makeSyscallInfo("fgetxattr", FD, Path, Hex, Hex), 194: makeSyscallInfo("listxattr", Path, Path, Hex), 195: makeSyscallInfo("llistxattr", Path, Path, Hex), - 196: makeSyscallInfo("flistxattr", Hex, Path, Hex), + 196: makeSyscallInfo("flistxattr", FD, Path, Hex), 197: makeSyscallInfo("removexattr", Path, Path), 198: makeSyscallInfo("lremovexattr", Path, Path), - 199: makeSyscallInfo("fremovexattr", Hex, Path), + 199: makeSyscallInfo("fremovexattr", FD, Path), 200: makeSyscallInfo("tkill", Hex, Signal), 201: makeSyscallInfo("time", Hex), 202: makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex), @@ -234,11 +234,11 @@ var linuxAMD64 = SyscallMap{ // 214: epoll_ctl_old (not implemented in the 
Linux kernel) // 215: epoll_wait_old (not implemented in the Linux kernel) 216: makeSyscallInfo("remap_file_pages", Hex, Hex, Hex, Hex, Hex), - 217: makeSyscallInfo("getdents64", Hex, Hex, Hex), + 217: makeSyscallInfo("getdents64", FD, Hex, Hex), 218: makeSyscallInfo("set_tid_address", Hex), 219: makeSyscallInfo("restart_syscall"), 220: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex), - 221: makeSyscallInfo("fadvise64", Hex, Hex, Hex, Hex), + 221: makeSyscallInfo("fadvise64", FD, Hex, Hex, Hex), 222: makeSyscallInfo("timer_create", Hex, Hex, Hex), 223: makeSyscallInfo("timer_settime", Hex, Hex, ItimerSpec, PostItimerSpec), 224: makeSyscallInfo("timer_gettime", Hex, PostItimerSpec), @@ -250,7 +250,7 @@ var linuxAMD64 = SyscallMap{ 230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec), 231: makeSyscallInfo("exit_group", Hex), 232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex), - 233: makeSyscallInfo("epoll_ctl", Hex, Hex, Hex, Hex), + 233: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex), 234: makeSyscallInfo("tgkill", Hex, Hex, Signal), 235: makeSyscallInfo("utimes", Path, Timeval), // 236: vserver (not implemented in the Linux kernel) @@ -274,58 +274,58 @@ var linuxAMD64 = SyscallMap{ 254: makeSyscallInfo("inotify_add_watch", Hex, Path, Hex), 255: makeSyscallInfo("inotify_rm_watch", Hex, Hex), 256: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex), - 257: makeSyscallInfo("openat", Hex, Path, OpenFlags, Mode), - 258: makeSyscallInfo("mkdirat", Hex, Path, Hex), - 259: makeSyscallInfo("mknodat", Hex, Path, Mode, Hex), - 260: makeSyscallInfo("fchownat", Hex, Path, Hex, Hex, Hex), - 261: makeSyscallInfo("futimesat", Hex, Path, Hex), - 262: makeSyscallInfo("newfstatat", Hex, Path, Stat, Hex), - 263: makeSyscallInfo("unlinkat", Hex, Path, Hex), - 264: makeSyscallInfo("renameat", Hex, Path, Hex, Path), - 265: makeSyscallInfo("linkat", Hex, Path, Hex, Path, Hex), + 257: makeSyscallInfo("openat", FD, Path, OpenFlags, Mode), + 258: 
makeSyscallInfo("mkdirat", FD, Path, Hex), + 259: makeSyscallInfo("mknodat", FD, Path, Mode, Hex), + 260: makeSyscallInfo("fchownat", FD, Path, Hex, Hex, Hex), + 261: makeSyscallInfo("futimesat", FD, Path, Hex), + 262: makeSyscallInfo("newfstatat", FD, Path, Stat, Hex), + 263: makeSyscallInfo("unlinkat", FD, Path, Hex), + 264: makeSyscallInfo("renameat", FD, Path, Hex, Path), + 265: makeSyscallInfo("linkat", FD, Path, Hex, Path, Hex), 266: makeSyscallInfo("symlinkat", Path, Hex, Path), - 267: makeSyscallInfo("readlinkat", Hex, Path, ReadBuffer, Hex), - 268: makeSyscallInfo("fchmodat", Hex, Path, Mode), - 269: makeSyscallInfo("faccessat", Hex, Path, Oct, Hex), + 267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex), + 268: makeSyscallInfo("fchmodat", FD, Path, Mode), + 269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex), 270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex), 271: makeSyscallInfo("ppoll", Hex, Hex, Timespec, SigSet, Hex), 272: makeSyscallInfo("unshare", CloneFlags), 273: makeSyscallInfo("set_robust_list", Hex, Hex), 274: makeSyscallInfo("get_robust_list", Hex, Hex, Hex), - 275: makeSyscallInfo("splice", Hex, Hex, Hex, Hex, Hex, Hex), - 276: makeSyscallInfo("tee", Hex, Hex, Hex, Hex), - 277: makeSyscallInfo("sync_file_range", Hex, Hex, Hex, Hex), - 278: makeSyscallInfo("vmsplice", Hex, Hex, Hex, Hex), + 275: makeSyscallInfo("splice", FD, Hex, FD, Hex, Hex, Hex), + 276: makeSyscallInfo("tee", FD, FD, Hex, Hex), + 277: makeSyscallInfo("sync_file_range", FD, Hex, Hex, Hex), + 278: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex), 279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex), - 280: makeSyscallInfo("utimensat", Hex, Path, UTimeTimespec, Hex), + 280: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex), 281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex), 282: makeSyscallInfo("signalfd", Hex, Hex, Hex), 283: makeSyscallInfo("timerfd_create", Hex, Hex), 284: makeSyscallInfo("eventfd", Hex), - 285: 
makeSyscallInfo("fallocate", Hex, Hex, Hex, Hex), - 286: makeSyscallInfo("timerfd_settime", Hex, Hex, ItimerSpec, PostItimerSpec), - 287: makeSyscallInfo("timerfd_gettime", Hex, PostItimerSpec), - 288: makeSyscallInfo("accept4", Hex, PostSockAddr, SockLen, SockFlags), + 285: makeSyscallInfo("fallocate", FD, Hex, Hex, Hex), + 286: makeSyscallInfo("timerfd_settime", FD, Hex, ItimerSpec, PostItimerSpec), + 287: makeSyscallInfo("timerfd_gettime", FD, PostItimerSpec), + 288: makeSyscallInfo("accept4", FD, PostSockAddr, SockLen, SockFlags), 289: makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex), 290: makeSyscallInfo("eventfd2", Hex, Hex), 291: makeSyscallInfo("epoll_create1", Hex), - 292: makeSyscallInfo("dup3", Hex, Hex, Hex), + 292: makeSyscallInfo("dup3", FD, FD, Hex), 293: makeSyscallInfo("pipe2", PipeFDs, Hex), 294: makeSyscallInfo("inotify_init1", Hex), - 295: makeSyscallInfo("preadv", Hex, ReadIOVec, Hex, Hex), - 296: makeSyscallInfo("pwritev", Hex, WriteIOVec, Hex, Hex), + 295: makeSyscallInfo("preadv", FD, ReadIOVec, Hex, Hex), + 296: makeSyscallInfo("pwritev", FD, WriteIOVec, Hex, Hex), 297: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Signal, Hex), 298: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex), - 299: makeSyscallInfo("recvmmsg", Hex, Hex, Hex, Hex, Hex), + 299: makeSyscallInfo("recvmmsg", FD, Hex, Hex, Hex, Hex), 300: makeSyscallInfo("fanotify_init", Hex, Hex), 301: makeSyscallInfo("fanotify_mark", Hex, Hex, Hex, Hex, Hex), 302: makeSyscallInfo("prlimit64", Hex, Hex, Hex, Hex), - 303: makeSyscallInfo("name_to_handle_at", Hex, Hex, Hex, Hex, Hex), - 304: makeSyscallInfo("open_by_handle_at", Hex, Hex, Hex), + 303: makeSyscallInfo("name_to_handle_at", FD, Hex, Hex, Hex, Hex), + 304: makeSyscallInfo("open_by_handle_at", FD, Hex, Hex), 305: makeSyscallInfo("clock_adjtime", Hex, Hex), - 306: makeSyscallInfo("syncfs", Hex), - 307: makeSyscallInfo("sendmmsg", Hex, Hex, Hex, Hex), - 308: makeSyscallInfo("setns", Hex, Hex), + 306: 
makeSyscallInfo("syncfs", FD), + 307: makeSyscallInfo("sendmmsg", FD, Hex, Hex, Hex), + 308: makeSyscallInfo("setns", FD, Hex), 309: makeSyscallInfo("getcpu", Hex, Hex, Hex), 310: makeSyscallInfo("process_vm_readv", Hex, ReadIOVec, Hex, IOVec, Hex, Hex), 311: makeSyscallInfo("process_vm_writev", Hex, IOVec, Hex, WriteIOVec, Hex, Hex), @@ -333,6 +333,6 @@ var linuxAMD64 = SyscallMap{ 313: makeSyscallInfo("finit_module", Hex, Hex, Hex), 314: makeSyscallInfo("sched_setattr", Hex, Hex, Hex), 315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex), - 316: makeSyscallInfo("renameat2", Hex, Path, Hex, Path, Hex), + 316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex), 317: makeSyscallInfo("seccomp", Hex, Hex, Hex), } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index a7e9df268..398035b65 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/seccomp" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" pb "gvisor.googlesource.com/gvisor/pkg/sentry/strace/strace_go_proto" slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -132,6 +133,35 @@ func path(t *kernel.Task, addr usermem.Addr) string { return fmt.Sprintf("%#x %s", addr, path) } +func fd(t *kernel.Task, fd kdefs.FD) string { + root := t.FSContext().RootDirectory() + if root != nil { + defer root.DecRef() + } + + if fd == linux.AT_FDCWD { + wd := t.FSContext().WorkingDirectory() + var name string + if wd != nil { + defer wd.DecRef() + name, _ = wd.FullName(root) + } else { + name = "(unknown cwd)" + } + return fmt.Sprintf("AT_FDCWD %s", name) + } + + file := t.FDMap().GetFile(fd) + if file == nil { + // Cast FD to uint64 to avoid printing negative hex. 
+ return fmt.Sprintf("%#x (bad FD)", uint64(fd)) + } + defer file.DecRef() + + name, _ := file.Dirent.FullName(root) + return fmt.Sprintf("%#x %s", fd, name) +} + func fdpair(t *kernel.Task, addr usermem.Addr) string { var fds [2]int32 _, err := t.CopyIn(addr, &fds) @@ -344,6 +374,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo break } switch i.format[arg] { + case FD: + output = append(output, fd(t, kdefs.FD(args[arg].Int()))) case WriteBuffer: output = append(output, dump(t, args[arg].Pointer(), args[arg+1].SizeT(), maximumBlobSize)) case WriteIOVec: diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index b2715856e..1f255c717 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -35,6 +35,9 @@ const ( // Oct is just an octal number. Oct + // FD is a file descriptor. + FD + // ReadBuffer is a buffer for a read-style call. The syscall return // value is used for the length. // -- cgit v1.2.3 From 08d99c5fbea76ecc92038280387d24ecdf7ed814 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 17 Apr 2019 12:13:46 -0700 Subject: Convert poll/select to operate more directly on linux.PollFD Current, doPoll copies the user struct pollfd array into a []syscalls.PollFD, which contains internal kdefs.FD and waiter.EventMask types. While these are currently binary-compatible with the Linux versions, we generally discourage copying directly to internal types (someone may inadvertantly change kdefs.FD to uint64). Instead, copy directly to a []linux.PollFD, which will certainly be binary compatible. Most of syscalls/polling.go is included directly into syscalls/linux/sys_poll.go, as it can then operate directly on linux.PollFD. The additional syscalls.PollFD type is providing little value. I've also added explicit conversion functions for waiter.EventMask, which creates the possibility of a different binary format. 
PiperOrigin-RevId: 244042947 Change-Id: I24e5b642002a32b3afb95a9dcb80d4acd1288abf --- pkg/fdnotifier/fdnotifier.go | 6 +- pkg/fdnotifier/poll_unsafe.go | 6 +- pkg/sentry/socket/rpcinet/notifier/notifier.go | 8 +- pkg/sentry/syscalls/BUILD | 2 - pkg/sentry/syscalls/linux/sys_epoll.go | 2 +- pkg/sentry/syscalls/linux/sys_poll.go | 133 +++++++++++++++++++++--- pkg/sentry/syscalls/polling.go | 137 ------------------------- pkg/waiter/waiter.go | 26 +++-- 8 files changed, 153 insertions(+), 167 deletions(-) delete mode 100644 pkg/sentry/syscalls/polling.go (limited to 'pkg/sentry') diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go index 624b1a0c5..aa4906ca0 100644 --- a/pkg/fdnotifier/fdnotifier.go +++ b/pkg/fdnotifier/fdnotifier.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build linux + // Package fdnotifier contains an adapter that translates IO events (e.g., a // file became readable/writable) from native FDs to the notifications in the // waiter package. It uses epoll in edge-triggered mode to receive notifications @@ -70,7 +72,7 @@ func (n *notifier) waitFD(fd int32, fi *fdInfo, mask waiter.EventMask) error { } e := syscall.EpollEvent{ - Events: uint32(mask) | -syscall.EPOLLET, + Events: mask.ToLinux() | -syscall.EPOLLET, Fd: fd, } @@ -155,7 +157,7 @@ func (n *notifier) waitAndNotify() error { n.mu.Lock() for i := 0; i < v; i++ { if fi, ok := n.fdMap[e[i].Fd]; ok { - fi.queue.Notify(waiter.EventMask(e[i].Events)) + fi.queue.Notify(waiter.EventMaskFromLinux(e[i].Events)) } } n.mu.Unlock() diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go index 8459d4c74..05be9aeb5 100644 --- a/pkg/fdnotifier/poll_unsafe.go +++ b/pkg/fdnotifier/poll_unsafe.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+// +build linux + package fdnotifier import ( @@ -30,7 +32,7 @@ func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask { revents int16 }{ fd: fd, - events: int16(mask), + events: int16(mask.ToLinux()), } for { @@ -51,7 +53,7 @@ func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask { } // Otherwise we got the ready events in the revents field. - return waiter.EventMask(e.revents) + return waiter.EventMaskFromLinux(uint32(e.revents)) } } diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index 73c255c33..d9bda78b0 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -76,7 +76,7 @@ func (n *Notifier) waitFD(fd uint32, fi *fdInfo, mask waiter.EventMask) error { } e := pb.EpollEvent{ - Events: uint32(mask) | -syscall.EPOLLET, + Events: mask.ToLinux() | -syscall.EPOLLET, Fd: fd, } @@ -178,7 +178,7 @@ func (n *Notifier) waitAndNotify() error { n.mu.Lock() for _, e := range res.(*pb.EpollWaitResponse_Events).Events.Events { if fi, ok := n.fdMap[e.Fd]; ok { - fi.queue.Notify(waiter.EventMask(e.Events)) + fi.queue.Notify(waiter.EventMaskFromLinux(e.Events)) } } n.mu.Unlock() @@ -214,7 +214,7 @@ func (n *Notifier) HasFD(fd uint32) bool { // although the syscall is non-blocking. 
func (n *Notifier) NonBlockingPoll(fd uint32, mask waiter.EventMask) waiter.EventMask { for { - id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Poll{&pb.PollRequest{Fd: fd, Events: uint32(mask)}}}, false /* ignoreResult */) + id, c := n.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Poll{&pb.PollRequest{Fd: fd, Events: mask.ToLinux()}}}, false /* ignoreResult */) <-c res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_Poll).Poll.Result @@ -225,6 +225,6 @@ func (n *Notifier) NonBlockingPoll(fd uint32, mask waiter.EventMask) waiter.Even return mask } - return waiter.EventMask(res.(*pb.PollResponse_Events).Events) + return waiter.EventMaskFromLinux(res.(*pb.PollResponse_Events).Events) } } diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 6b5469e45..877318fa9 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -6,7 +6,6 @@ go_library( name = "syscalls", srcs = [ "epoll.go", - "polling.go", "syscalls.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls", @@ -14,7 +13,6 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/sentry/arch", - "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/kdefs", diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 62272efcd..200c46355 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -87,7 +87,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc flags |= epoll.EdgeTriggered } - mask = waiter.EventMask(e.Events) + mask = waiter.EventMaskFromLinux(e.Events) data[0] = e.Fd data[1] = e.Pad } diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 0cf6aad7f..23fcb907f 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -19,11 +19,11 @@ import ( 
"gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -37,23 +37,130 @@ const fileCap = 1024 * 1024 const ( // selectReadEvents is analogous to the Linux kernel's // fs/select.c:POLLIN_SET. - selectReadEvents = waiter.EventIn | waiter.EventHUp | waiter.EventErr + selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR // selectWriteEvents is analogous to the Linux kernel's // fs/select.c:POLLOUT_SET. - selectWriteEvents = waiter.EventOut | waiter.EventErr + selectWriteEvents = linux.POLLOUT | linux.POLLERR // selectExceptEvents is analogous to the Linux kernel's // fs/select.c:POLLEX_SET. - selectExceptEvents = waiter.EventPri + selectExceptEvents = linux.POLLPRI ) +// pollState tracks the associated file descriptor and waiter of a PollFD. +type pollState struct { + file *fs.File + waiter waiter.Entry +} + +// initReadiness gets the current ready mask for the file represented by the FD +// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is +// used to register with the file for event notifications, and a reference to +// the file is stored in "state". 
+func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) { + if pfd.FD < 0 { + pfd.REvents = 0 + return + } + + file := t.FDMap().GetFile(kdefs.FD(pfd.FD)) + if file == nil { + pfd.REvents = linux.POLLNVAL + return + } + + if ch == nil { + defer file.DecRef() + } else { + state.file = file + state.waiter, _ = waiter.NewChannelEntry(ch) + file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events))) + } + + r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) + pfd.REvents = int16(r.ToLinux()) & pfd.Events +} + +// releaseState releases all the pollState in "state". +func releaseState(state []pollState) { + for i := range state { + if state[i].file != nil { + state[i].file.EventUnregister(&state[i].waiter) + state[i].file.DecRef() + } + } +} + +// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" +// when "timeout" is greater than zero. +// +// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or +// positive if interrupted by a signal. +func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { + var ch chan struct{} + if timeout != 0 { + ch = make(chan struct{}, 1) + } + + // Register for event notification in the files involved if we may + // block (timeout not zero). Once we find a file that has a non-zero + // result, we stop registering for events but still go through all files + // to get their ready masks. + state := make([]pollState, len(pfd)) + defer releaseState(state) + n := uintptr(0) + for i := range pfd { + initReadiness(t, &pfd[i], &state[i], ch) + if pfd[i].REvents != 0 { + n++ + ch = nil + } + } + + if timeout == 0 { + return timeout, n, nil + } + + forever := timeout < 0 + + for n == 0 { + var err error + // Wait for a notification. 
+ timeout, err = t.BlockWithTimeout(ch, !forever, timeout) + if err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + return timeout, 0, err + } + + // We got notified, count how many files are ready. If none, + // then this was a spurious notification, and we just go back + // to sleep with the remaining timeout. + for i := range state { + if state[i].file == nil { + continue + } + + r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) + rl := int16(r.ToLinux()) & pfd[i].Events + if rl != 0 { + pfd[i].REvents = rl + n++ + } + } + } + + return timeout, n, nil +} + func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { return timeout, 0, syserror.EINVAL } - pfd := make([]syscalls.PollFD, nfds) + pfd := make([]linux.PollFD, nfds) if nfds > 0 { if _, err := t.CopyIn(pfdAddr, &pfd); err != nil { return timeout, 0, err @@ -65,9 +172,9 @@ func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Durati // polling, changing event masks here is an application-visible difference. // (Linux also doesn't copy out event masks at all, only revents.) for i := range pfd { - pfd[i].Events |= waiter.EventHUp | waiter.EventErr + pfd[i].Events |= linux.POLLHUP | linux.POLLERR } - remainingTimeout, n, err := syscalls.Poll(t, pfd, timeout) + remainingTimeout, n, err := pollBlock(t, pfd, timeout) err = syserror.ConvertIntr(err, syserror.EINTR) // The poll entries are copied out regardless of whether @@ -136,8 +243,8 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add } // Build the PollFD array. 
- pfd := make([]syscalls.PollFD, 0, fdCount) - fd := kdefs.FD(0) + pfd := make([]linux.PollFD, 0, fdCount) + var fd int32 for i := 0; i < byteCount; i++ { rV, wV, eV := r[i], w[i], e[i] v := rV | wV | eV @@ -148,13 +255,13 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // immediately to ensure we don't leak. Note, another thread // might be about to close fd. This is racy, but that's // OK. Linux is racy in the same way. - file := t.FDMap().GetFile(fd) + file := t.FDMap().GetFile(kdefs.FD(fd)) if file == nil { return 0, syserror.EBADF } file.DecRef() - mask := waiter.EventMask(0) + var mask int16 if (rV & m) != 0 { mask |= selectReadEvents } @@ -167,7 +274,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add mask |= selectExceptEvents } - pfd = append(pfd, syscalls.PollFD{ + pfd = append(pfd, linux.PollFD{ FD: fd, Events: mask, }) @@ -179,7 +286,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add } // Do the syscall, then count the number of bits set. - _, _, err := syscalls.Poll(t, pfd, timeout) + _, _, err := pollBlock(t, pfd, timeout) if err != nil { return 0, syserror.ConvertIntr(err, syserror.EINTR) } diff --git a/pkg/sentry/syscalls/polling.go b/pkg/sentry/syscalls/polling.go deleted file mode 100644 index 2b33d6c19..000000000 --- a/pkg/sentry/syscalls/polling.go +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package syscalls - -import ( - "syscall" - "time" - - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.googlesource.com/gvisor/pkg/waiter" -) - -// PollFD describes a pollable FD. -type PollFD struct { - FD kdefs.FD - Events waiter.EventMask - REvents waiter.EventMask -} - -// pollState tracks the associated file descriptor and waiter of a PollFD. -type pollState struct { - file *fs.File - waiter waiter.Entry -} - -// initReadiness gets the current ready mask for the file represented by the FD -// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is -// used to register with the file for event notifications, and a reference to -// the file is stored in "state". -func (pfd *PollFD) initReadiness(t *kernel.Task, state *pollState, ch chan struct{}) { - if pfd.FD < 0 { - pfd.REvents = 0 - return - } - - file := t.FDMap().GetFile(pfd.FD) - if file == nil { - pfd.REvents = waiter.EventNVal - return - } - - if ch == nil { - defer file.DecRef() - } else { - state.file = file - state.waiter, _ = waiter.NewChannelEntry(ch) - file.EventRegister(&state.waiter, pfd.Events) - } - - pfd.REvents = file.Readiness(pfd.Events) & pfd.Events -} - -// releaseState releases all the pollState in "state". -func releaseState(state []pollState) { - for i := range state { - if state[i].file != nil { - state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() - } - } -} - -// Poll polls the PollFDs in "pfd" with a bounded time specified in "timeout" -// when "timeout" is greater than zero. -// -// Poll returns the remaining timeout, which is always 0 on a timeout; and 0 or -// positive if interrupted by a signal. 
-func Poll(t *kernel.Task, pfd []PollFD, timeout time.Duration) (time.Duration, uintptr, error) { - var ch chan struct{} - if timeout != 0 { - ch = make(chan struct{}, 1) - } - - // Register for event notification in the files involved if we may - // block (timeout not zero). Once we find a file that has a non-zero - // result, we stop registering for events but still go through all files - // to get their ready masks. - state := make([]pollState, len(pfd)) - defer releaseState(state) - n := uintptr(0) - for i := range pfd { - pfd[i].initReadiness(t, &state[i], ch) - if pfd[i].REvents != 0 { - n++ - ch = nil - } - } - - if timeout == 0 { - return timeout, n, nil - } - - forever := timeout < 0 - - for n == 0 { - var err error - // Wait for a notification. - timeout, err = t.BlockWithTimeout(ch, !forever, timeout) - if err != nil { - if err == syscall.ETIMEDOUT { - err = nil - } - return timeout, 0, err - } - - // We got notified, count how many files are ready. If none, - // then this was a spurious notification, and we just go back - // to sleep with the remaining timeout. - for i := range state { - if state[i].file == nil { - continue - } - - ready := state[i].file.Readiness(pfd[i].Events) & pfd[i].Events - if ready != 0 { - pfd[i].REvents = ready - n++ - } - } - } - - return timeout, n, nil -} diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index fd429f733..a6c9dff3c 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -67,14 +67,28 @@ type EventMask uint16 // Events that waiters can wait on. The meaning is the same as those in the // poll() syscall. const ( - EventIn EventMask = 0x01 // syscall.EPOLLIN - EventPri EventMask = 0x02 // syscall.EPOLLPRI - EventOut EventMask = 0x04 // syscall.EPOLLOUT - EventErr EventMask = 0x08 // syscall.EPOLLERR - EventHUp EventMask = 0x10 // syscall.EPOLLHUP - EventNVal EventMask = 0x20 // Not defined in syscall. 
+ EventIn EventMask = 0x01 // POLLIN + EventPri EventMask = 0x02 // POLLPRI + EventOut EventMask = 0x04 // POLLOUT + EventErr EventMask = 0x08 // POLLERR + EventHUp EventMask = 0x10 // POLLHUP + + allEvents EventMask = 0x1f ) +// EventMaskFromLinux returns an EventMask representing the supported events +// from the Linux events e, which is in the format used by poll(2). +func EventMaskFromLinux(e uint32) EventMask { + // Our flag definitions are currently identical to Linux. + return EventMask(e) & allEvents +} + +// ToLinux returns e in the format used by Linux poll(2). +func (e EventMask) ToLinux() uint32 { + // Our flag definitions are currently identical to Linux. + return uint32(e) +} + // Waitable contains the methods that need to be implemented by waitable // objects. type Waitable interface { -- cgit v1.2.3 From c8cee7108f1a1b37e89961c6dd69ccab97952c86 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 17 Apr 2019 12:56:23 -0700 Subject: Use FD limit and file size limit from host FD limit and file size limit is read from the host, instead of using hard-coded defaults, given that they effect the sandbox process. Also limit the direct cache to use no more than half if the available FDs. 
PiperOrigin-RevId: 244050323 Change-Id: I787ad0fdf07c49d589e51aebfeae477324fe26e6 --- pkg/sentry/fs/BUILD | 1 + pkg/sentry/fs/context.go | 12 +++++ pkg/sentry/fs/dirent_cache.go | 43 ++++++++++++++--- pkg/sentry/fs/dirent_cache_limiter.go | 55 +++++++++++++++++++++ pkg/sentry/fs/dirent_cache_test.go | 90 +++++++++++++++++++++++++++++++++++ pkg/sentry/fs/gofer/session.go | 9 ++++ pkg/sentry/fs/mount.go | 20 ++++++-- pkg/sentry/fs/mount_overlay.go | 11 ++++- pkg/sentry/kernel/kernel.go | 9 ++++ pkg/sentry/kernel/task.go | 2 + runsc/boot/fs.go | 19 +++++++- runsc/boot/limits.go | 77 +++++++++++++++++++++++++++++- runsc/boot/loader.go | 4 ++ test/syscalls/linux/poll.cc | 9 ++++ 14 files changed, 347 insertions(+), 14 deletions(-) create mode 100644 pkg/sentry/fs/dirent_cache_limiter.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 1742d3a65..1fd9e30f6 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -12,6 +12,7 @@ go_library( "dentry.go", "dirent.go", "dirent_cache.go", + "dirent_cache_limiter.go", "dirent_list.go", "dirent_state.go", "event_list.go", diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index c0e6075e4..4869428a8 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -26,6 +26,9 @@ type contextID int const ( // CtxRoot is a Context.Value key for a Dirent. CtxRoot contextID = iota + + // CtxDirentCacheLimiter is a Context.Value key for DirentCacheLimiter. + CtxDirentCacheLimiter ) // ContextCanAccessFile determines whether `file` can be accessed in the requested way @@ -100,3 +103,12 @@ func RootFromContext(ctx context.Context) *Dirent { } return nil } + +// DirentCacheLimiterFromContext returns the DirentCacheLimiter used by ctx, or +// nil if ctx does not have a dirent cache limiter. 
+func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter { + if v := ctx.Value(CtxDirentCacheLimiter); v != nil { + return v.(*DirentCacheLimiter) + } + return nil +} diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 502b0a09b..d26a06971 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -32,6 +32,10 @@ type DirentCache struct { // when cache is nil. maxSize uint64 + // limit restricts the number of entries in the cache amoung multiple caches. + // It may be nil if there are no global limit for this cache. + limit *DirentCacheLimiter + // mu protects currentSize and direntList. mu sync.Mutex `state:"nosave"` @@ -45,8 +49,7 @@ type DirentCache struct { list direntList `state:"zerovalue"` } -// NewDirentCache returns a new DirentCache with the given maxSize. If maxSize -// is 0, nil is returned. +// NewDirentCache returns a new DirentCache with the given maxSize. func NewDirentCache(maxSize uint64) *DirentCache { return &DirentCache{ maxSize: maxSize, @@ -71,15 +74,24 @@ func (c *DirentCache) Add(d *Dirent) { return } + // First check against the global limit. + for c.limit != nil && !c.limit.tryInc() { + if c.currentSize == 0 { + // If the global limit is reached, but there is nothing more to drop from + // this cache, there is not much else to do. + c.mu.Unlock() + return + } + c.remove(c.list.Back()) + } + // d is not in cache. Add it and take a reference. c.list.PushFront(d) d.IncRef() c.currentSize++ - // Remove the oldest until we are under the size limit. - for c.maxSize > 0 && c.currentSize > c.maxSize { - c.remove(c.list.Back()) - } + c.maybeShrink() + c.mu.Unlock() } @@ -92,6 +104,9 @@ func (c *DirentCache) remove(d *Dirent) { d.SetNext(nil) d.DecRef() c.currentSize-- + if c.limit != nil { + c.limit.dec() + } } // Remove removes the element from the cache and decrements its refCount. 
It @@ -142,3 +157,19 @@ func (c *DirentCache) Invalidate() { } c.mu.Unlock() } + +// setMaxSize sets cache max size. If current size is larger than max size, the +// cache shrinks to acommodate the new max. +func (c *DirentCache) setMaxSize(max uint64) { + c.mu.Lock() + c.maxSize = max + c.maybeShrink() + c.mu.Unlock() +} + +// shrink removes the oldest element until the list is under the size limit. +func (c *DirentCache) maybeShrink() { + for c.maxSize > 0 && c.currentSize > c.maxSize { + c.remove(c.list.Back()) + } +} diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go new file mode 100644 index 000000000..024c7b2d5 --- /dev/null +++ b/pkg/sentry/fs/dirent_cache_limiter.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" +) + +// DirentCacheLimiter acts as a global limit for all dirent caches in the +// process. +// +// +stateify savable +type DirentCacheLimiter struct { + mu sync.Mutex `state:"nosave"` + max uint64 + count uint64 `state:"zerovalue"` +} + +// NewDirentCacheLimiter creates a new DirentCacheLimiter. 
+func NewDirentCacheLimiter(max uint64) *DirentCacheLimiter { + return &DirentCacheLimiter{max: max} +} + +func (d *DirentCacheLimiter) tryInc() bool { + d.mu.Lock() + if d.count >= d.max { + d.mu.Unlock() + return false + } + d.count++ + d.mu.Unlock() + return true +} + +func (d *DirentCacheLimiter) dec() { + d.mu.Lock() + if d.count == 0 { + panic(fmt.Sprintf("underflowing DirentCacheLimiter count: %+v", d)) + } + d.count-- + d.mu.Unlock() +} diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go index 5d0e9d91c..93e8d415f 100644 --- a/pkg/sentry/fs/dirent_cache_test.go +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -120,6 +120,96 @@ func TestDirentCache(t *testing.T) { } } +func TestDirentCacheLimiter(t *testing.T) { + const ( + globalMaxSize = 5 + maxSize = 3 + ) + + limit := NewDirentCacheLimiter(globalMaxSize) + c1 := NewDirentCache(maxSize) + c1.limit = limit + c2 := NewDirentCache(maxSize) + c2.limit = limit + + // Create a Dirent d. + d := NewNegativeDirent("") + + // Add d to the cache. + c1.Add(d) + if got, want := c1.Size(), uint64(1); got != want { + t.Errorf("c1.Size() got %v, want %v", got, want) + } + + // Add maxSize-1 more elements. d should be oldest element. + for i := 0; i < maxSize-1; i++ { + c1.Add(NewNegativeDirent("")) + } + if got, want := c1.Size(), uint64(maxSize); got != want { + t.Errorf("c1.Size() got %v, want %v", got, want) + } + + // Check that d is still there. + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Fill up the other cache, it will start dropping old entries from the cache + // when the global limit is reached. + for i := 0; i < maxSize; i++ { + c2.Add(NewNegativeDirent("")) + } + + // Check is what's remaining from global max. + if got, want := c2.Size(), globalMaxSize-maxSize; int(got) != want { + t.Errorf("c2.Size() got %v, want %v", got, want) + } + + // Check that d was not dropped. 
+ if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Add an entry that will eventually be dropped. Check is done later... + drop := NewNegativeDirent("") + c1.Add(drop) + + // Check that d is bumped to front even when global limit is reached. + c1.Add(d) + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Add 2 more element and check that: + // - d is still in the list: to verify that d was bumped + // - d2/d3 are in the list: older entries are dropped when global limit is + // reached. + // - drop is not in the list: indeed older elements are dropped. + d2 := NewNegativeDirent("") + c1.Add(d2) + d3 := NewNegativeDirent("") + c1.Add(d3) + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + if got, want := c1.contains(d2), true; got != want { + t.Errorf("c1.contains(d2) got %v want %v", got, want) + } + if got, want := c1.contains(d3), true; got != want { + t.Errorf("c1.contains(d3) got %v want %v", got, want) + } + if got, want := c1.contains(drop), false; got != want { + t.Errorf("c1.contains(drop) got %v want %v", got, want) + } + + // Drop all entries from one cache. The other will be allowed to grow. + c1.Invalidate() + c2.Add(NewNegativeDirent("")) + if got, want := c2.Size(), uint64(maxSize); got != want { + t.Errorf("c2.Size() got %v, want %v", got, want) + } +} + // TestNilDirentCache tests that a nil cache supports all cache operations, but // treats them as noop. func TestNilDirentCache(t *testing.T) { diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index d626b86f5..ed5147c65 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -28,6 +28,10 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) +// DefaultDirentCacheSize is the default dirent cache size for 9P mounts. 
It can +// be adjusted independentely from the other dirent caches. +var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize + // +stateify savable type endpointMaps struct { // mu protexts the direntMap, the keyMap, and the pathMap below. @@ -249,6 +253,11 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF // Construct the MountSource with the session and superBlockFlags. m := fs.NewMountSource(s, filesystem, superBlockFlags) + // Given that gofer files can consume host FDs, restrict the number + // of files that can be held by the cache. + m.SetDirentCacheMaxSize(DefaultDirentCacheSize) + m.SetDirentCacheLimiter(fs.DirentCacheLimiterFromContext(ctx)) + // Send the Tversion request. s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 5cc777bef..1e245ae5f 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -151,9 +151,9 @@ type MountSource struct { children map[*MountSource]struct{} } -// defaultDirentCacheSize is the number of Dirents that the VFS can hold an extra -// reference on. -const defaultDirentCacheSize uint64 = 1000 +// DefaultDirentCacheSize is the number of Dirents that the VFS can hold an +// extra reference on. +const DefaultDirentCacheSize uint64 = 1000 // NewMountSource returns a new MountSource. Filesystem may be nil if there is no // filesystem backing the mount. @@ -162,7 +162,7 @@ func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags Mou MountSourceOperations: mops, Flags: flags, Filesystem: filesystem, - fscache: NewDirentCache(defaultDirentCacheSize), + fscache: NewDirentCache(DefaultDirentCacheSize), children: make(map[*MountSource]struct{}), } } @@ -246,6 +246,18 @@ func (msrc *MountSource) FlushDirentRefs() { msrc.fscache.Invalidate() } +// SetDirentCacheMaxSize sets the max size to the dirent cache associated with +// this mount source. 
+func (msrc *MountSource) SetDirentCacheMaxSize(max uint64) { + msrc.fscache.setMaxSize(max) +} + +// SetDirentCacheLimiter sets the limiter objcet to the dirent cache associated +// with this mount source. +func (msrc *MountSource) SetDirentCacheLimiter(l *DirentCacheLimiter) { + msrc.fscache.limit = l +} + // NewCachingMountSource returns a generic mount that will cache dirents // aggressively. func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 4c89673b5..fb60a1aec 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -31,10 +31,19 @@ type overlayMountSourceOperations struct { func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *MountSource { upper.IncRef() lower.IncRef() - return NewMountSource(&overlayMountSourceOperations{ + msrc := NewMountSource(&overlayMountSourceOperations{ upper: upper, lower: lower, }, &overlayFilesystem{}, flags) + + // Use the minimum number to keep resource usage under limits. + size := lower.fscache.maxSize + if size > upper.fscache.maxSize { + size = upper.fscache.maxSize + } + msrc.fscache.setMaxSize(size) + + return msrc } // Revalidate implements MountSourceOperations.Revalidate for an overlay by diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index b8953657c..290c4a53c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -188,6 +188,11 @@ type Kernel struct { // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` + + // DirentCacheLimiter controls the number of total dirent entries can be in + // caches. Not all caches use it, only the caches that use host resources use + // the limiter. It may be nil if disabled. + DirentCacheLimiter *fs.DirentCacheLimiter } // InitKernelArgs holds arguments to Init. 
@@ -626,6 +631,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k.mounts.Root() } return nil + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: @@ -1170,6 +1177,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return auth.NewRootCredentials(ctx.k.rootUserNamespace) case fs.CtxRoot: return ctx.k.mounts.Root() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 9c365e781..ed2175c37 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -601,6 +601,8 @@ func (t *Task) Value(key interface{}) interface{} { return int32(t.ThreadGroup().ID()) case fs.CtxRoot: return t.fsc.RootDirectory() + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter case inet.CtxStack: return t.NetworkContext() case ktime.CtxRealtimeClock: diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 8dfb6dce6..761142d98 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -20,10 +20,10 @@ import ( "path/filepath" "strconv" "strings" + "syscall" // Include filesystem types that OCI spec might mount. 
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" @@ -38,6 +38,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -81,6 +82,22 @@ func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } +func adjustDirentCache(k *kernel.Kernel) error { + var hl syscall.Rlimit + if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { + return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) + } + if int64(hl.Cur) != syscall.RLIM_INFINITY { + newSize := hl.Cur / 2 + if newSize < gofer.DefaultDirentCacheSize { + log.Infof("Setting gofer dirent cache size to %d", newSize) + gofer.DefaultDirentCacheSize = newSize + k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) + } + } + return nil +} + // setupRootContainerFS creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. // 'setMountNS' is called after namespace is created. 
It must set the mount NS diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index e3e716bf9..32e62cdf7 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -16,8 +16,11 @@ package boot import ( "fmt" + "sync" + "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" ) @@ -41,10 +44,43 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_STACK": limits.Stack, } -func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { +func findName(lt limits.LimitType) string { + for k, v := range fromLinuxResource { + if v == lt { + return k + } + } + return "unknown" +} + +var defaults defs + +type defs struct { + mu sync.Mutex + set *limits.LimitSet + err error +} + +func (d *defs) get() (*limits.LimitSet, error) { + d.mu.Lock() + defer d.mu.Unlock() + + if d.err != nil { + return nil, d.err + } + if d.set == nil { + if err := d.initDefaults(); err != nil { + d.err = err + return nil, err + } + } + return d.set, nil +} + +func (d *defs) initDefaults() error { ls, err := limits.NewLinuxLimitSet() if err != nil { - return nil, err + return err } // Set default limits based on what containers get by default, ex: @@ -66,6 +102,43 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity}) + // Read host limits that directly affect the sandbox and adjust the defaults + // based on them. 
+ for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} { + var hl syscall.Rlimit + if err := syscall.Getrlimit(res, &hl); err != nil { + return err + } + + lt, ok := limits.FromLinuxResource[res] + if !ok { + return fmt.Errorf("unknown rlimit type %v", res) + } + hostLimit := limits.Limit{ + Cur: limits.FromLinux(hl.Cur), + Max: limits.FromLinux(hl.Max), + } + + defaultLimit := ls.Get(lt) + if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur { + log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur) + } + if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max { + log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max) + ls.SetUnchecked(lt, hostLimit) + } + } + + d.set = ls + return nil +} + +func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { + ls, err := defaults.get() + if err != nil { + return nil, err + } + // Then apply overwrites on top of defaults. for _, rl := range spec.Process.Rlimits { lt, ok := fromLinuxResource[rl.Type] diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 56cb137f0..88a834aa5 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -274,6 +274,10 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } + if err := adjustDirentCache(k); err != nil { + return nil, err + } + // Turn on packet logging if enabled. if args.Conf.LogPackets { log.Infof("Packet logging enabled") diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc index 7a6a39444..67a86cc22 100644 --- a/test/syscalls/linux/poll.cc +++ b/test/syscalls/linux/poll.cc @@ -255,7 +255,16 @@ TEST_F(PollTest, Nfds) { // Stash value of RLIMIT_NOFILES. struct rlimit rlim; TEST_PCHECK(getrlimit(RLIMIT_NOFILE, &rlim) == 0); + + // gVisor caps the number of FDs that epoll can use beyond RLIMIT_NOFILE. 
+ constexpr rlim_t gVisorMax = 1048576; + if (rlim.rlim_cur > gVisorMax) { + rlim.rlim_cur = gVisorMax; + TEST_PCHECK(setrlimit(RLIMIT_NOFILE, &rlim) == 0); + } + rlim_t max_fds = rlim.rlim_cur; + LOG(INFO) << "Using limit: " << max_fds; // Create an eventfd. Since its value is initially zero, it is writable. FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()); -- cgit v1.2.3 From b52cbd60280342f25411561702e97fe650fdaa9c Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 17 Apr 2019 13:42:16 -0700 Subject: Don't allow sigtimedwait to catch unblockable signals The existing logic attempting to do this is incorrect. Unary ^ has higher precedence than &^, so mask always has UnblockableSignals cleared, allowing dequeueSignalLocked to dequeue unblockable signals (which allows userspace to ignore them). Switch the logic so that unblockable signals are always masked. PiperOrigin-RevId: 244058487 Change-Id: Ib19630ac04068a1fbfb9dc4a8eab1ccbdb21edc3 --- pkg/sentry/kernel/task_signals.go | 2 +- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/sigtimedwait.cc | 76 +++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index e177562d7..3a8e61900 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -307,7 +307,7 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { // set is the set of signals we're interested in; invert it to get the set // of signals to block. 
- mask := ^set &^ UnblockableSignals + mask := ^(set &^ UnblockableSignals) t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 38faba267..d99733fc9 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1825,6 +1825,7 @@ cc_binary( srcs = ["sigtimedwait.cc"], linkstatic = 1, deps = [ + "//test/util:file_descriptor", "//test/util:logging", "//test/util:signal_util", "//test/util:test_util", diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc index 3a350fc28..1df9c013f 100644 --- a/test/syscalls/linux/sigtimedwait.cc +++ b/test/syscalls/linux/sigtimedwait.cc @@ -18,6 +18,7 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" +#include "test/util/file_descriptor.h" #include "test/util/logging.h" #include "test/util/signal_util.h" #include "test/util/test_util.h" @@ -163,6 +164,81 @@ TEST(SigtimedwaitTest, ChildExitGeneratedSIGCHLDWithHandler) { EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status; } +// sigtimedwait cannot catch SIGKILL. +TEST(SigtimedwaitTest, SIGKILLUncaught) { + // This is a regression test for sigtimedwait dequeuing SIGKILLs, thus + // preventing the task from exiting. + // + // The explanation below is specific to behavior in gVisor. The Linux behavior + // here is irrelevant because without a bug that prevents delivery of SIGKILL, + // none of this behavior is visible (in Linux or gVisor). + // + // SIGKILL is rather intrusive. Simply sending the SIGKILL marks + // ThreadGroup.exitStatus as exiting with SIGKILL, before the SIGKILL is even + // delivered. + // + // As a result, we cannot simply exit the child with a different exit code if + // it survives and expect to see that code in waitpid because: + // 1. PrepareGroupExit will override Task.exitStatus with + // ThreadGroup.exitStatus. + // 2. 
waitpid(2) will always return ThreadGroup.exitStatus rather than + // Task.exitStatus. + // + // We could use exit(2) to set Task.exitStatus without override, and a SIGCHLD + // handler to receive Task.exitStatus in the parent, but with that much + // test complexity, it is cleaner to simply use a pipe to notify the parent + // that we survived. + constexpr auto kSigtimedwaitSetupTime = absl::Seconds(2); + + int pipe_fds[2]; + ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds()); + FileDescriptor rfd(pipe_fds[0]); + FileDescriptor wfd(pipe_fds[1]); + + pid_t pid = fork(); + if (pid == 0) { + rfd.reset(); + + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGKILL); + RetryEINTR(sigtimedwait)(&mask, nullptr, nullptr); + + // Survived. + char c = 'a'; + TEST_PCHECK(WriteFd(wfd.get(), &c, 1) == 1); + _exit(1); + } + ASSERT_THAT(pid, SyscallSucceeds()); + + wfd.reset(); + + // Wait for child to block in sigtimedwait, then kill it. + absl::SleepFor(kSigtimedwaitSetupTime); + + // Sending SIGKILL will attempt to enqueue the signal twice: once in the + // normal signal sending path, and once to all Tasks in the ThreadGroup when + // applying SIGKILL side-effects. + // + // If we use kill(2), the former will be on the ThreadGroup signal queue and + // the latter will be on the Task signal queue. sigtimedwait can only dequeue + // one signal, so the other would kill the Task, masking bugs. + // + // If we use tkill(2), the former will be on the Task signal queue and the + // latter will be dropped as a duplicate. Then sigtimedwait can theoretically + // dequeue the single SIGKILL. + EXPECT_THAT(syscall(SYS_tkill, pid, SIGKILL), SyscallSucceeds()); + + int status; + EXPECT_THAT(RetryEINTR(waitpid)(pid, &status, 0), + SyscallSucceedsWithValue(pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) << status; + + // Child shouldn't have survived. 
+ char c; + EXPECT_THAT(ReadFd(rfd.get(), &c, 1), SyscallSucceedsWithValue(0)); +} + TEST(SigtimedwaitTest, IgnoredUnmaskedSignal) { constexpr int kSigno = SIGUSR1; constexpr auto kSigtimedwaitSetupTime = absl::Seconds(2); -- cgit v1.2.3 From 133700007a8495c7d8df53801b1d34345d6c5cf8 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 18 Apr 2019 11:50:26 -0700 Subject: Only emit unimplemented syscall events for unsupported values. Only emit unimplemented syscall events for setting SO_OOBINLINE and SO_LINGER when attempting to set unsupported values. PiperOrigin-RevId: 244229675 Change-Id: Icc4562af8f733dd75a90404621711f01a32a9fc1 --- pkg/abi/linux/socket.go | 13 +++++++++++-- pkg/sentry/socket/epsocket/epsocket.go | 31 +++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 906776525..6fa4e7c3e 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -247,6 +247,15 @@ type SockAddrUnix struct { Path [UnixPathMax]int8 } +// Linger is struct linger, from include/linux/socket.h. +type Linger struct { + OnOff int32 + Linger int32 +} + +// SizeOfLinger is the binary size of a Linger struct. +const SizeOfLinger = 8 + // TCPInfo is a collection of TCP statistics. // // From uapi/linux/tcp.h. @@ -322,8 +331,8 @@ type TCPInfo struct { SndBufLimited uint64 } -// SizeOfTCPInfo is the binary size of a TCPInfo struct (104 bytes). -var SizeOfTCPInfo = binary.Size(TCPInfo{}) +// SizeOfTCPInfo is the binary size of a TCPInfo struct. +const SizeOfTCPInfo = 104 // Control message types, from linux/socket.h. 
const ( diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 78f43178f..f370b803b 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -783,10 +783,10 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return int32(v), nil case linux.SO_LINGER: - if outLen < syscall.SizeofLinger { + if outLen < linux.SizeOfLinger { return nil, syserr.ErrInvalidArgument } - return syscall.Linger{}, nil + return linux.Linger{}, nil case linux.SO_SNDTIMEO: // TODO: Linux allows shorter lengths for partial results. @@ -1126,6 +1126,33 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i s.SetRecvTimeout(v.ToNsecCapped()) return nil + case linux.SO_OOBINLINE: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + + if v == 0 { + socket.SetSockOptEmitUnimplementedEvent(t, name) + } + + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v))) + + case linux.SO_LINGER: + if len(optVal) < linux.SizeOfLinger { + return syserr.ErrInvalidArgument + } + + var v linux.Linger + binary.Unmarshal(optVal[:linux.SizeOfLinger], usermem.ByteOrder, &v) + + if v != (linux.Linger{}) { + socket.SetSockOptEmitUnimplementedEvent(t, name) + } + + return nil + default: socket.SetSockOptEmitUnimplementedEvent(t, name) } -- cgit v1.2.3 From c931c8e0829914718a729e20d7db0c2bf4e73f0b Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 18 Apr 2019 15:22:47 -0700 Subject: Format struct pollfd in poll(2)/ppoll(2) I0410 15:40:38.854295 3776 x:0] [ 1] poll_test E poll(0x2b00bfb5c020 [{FD: 0x3 anon_inode:[eventfd], Events: POLLOUT, REvents: ...}], 0x1, 0x1) I0410 15:40:38.854348 3776 x:0] [ 1] poll_test X poll(0x2b00bfb5c020 [{FD: 0x3 anon_inode:[eventfd], Events: POLLOUT|POLLERR|POLLHUP, REvents: POLLOUT}], 0x1, 0x1) = 0x1 (10.765?s) PiperOrigin-RevId: 244269879 Change-Id: 
If07ba54a486fdeaaedfc0123769b78d1da862307 --- pkg/sentry/strace/BUILD | 1 + pkg/sentry/strace/linux64.go | 4 +- pkg/sentry/strace/poll.go | 72 +++++++++++++++++++++++++++++++++++ pkg/sentry/strace/strace.go | 4 ++ pkg/sentry/strace/syscalls.go | 4 ++ pkg/sentry/syscalls/linux/sys_poll.go | 20 +++++++--- 6 files changed, 98 insertions(+), 7 deletions(-) create mode 100644 pkg/sentry/strace/poll.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index bcd94b42e..eaaa4d118 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -11,6 +11,7 @@ go_library( "futex.go", "linux64.go", "open.go", + "poll.go", "ptrace.go", "signal.go", "socket.go", diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 22b76449c..6043b8cb1 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -24,7 +24,7 @@ var linuxAMD64 = SyscallMap{ 4: makeSyscallInfo("stat", Path, Stat), 5: makeSyscallInfo("fstat", FD, Stat), 6: makeSyscallInfo("lstat", Path, Stat), - 7: makeSyscallInfo("poll", Hex, Hex, Hex), + 7: makeSyscallInfo("poll", PollFDs, Hex, Hex), 8: makeSyscallInfo("lseek", Hex, Hex, Hex), 9: makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, FD, Hex), 10: makeSyscallInfo("mprotect", Hex, Hex, Hex), @@ -288,7 +288,7 @@ var linuxAMD64 = SyscallMap{ 268: makeSyscallInfo("fchmodat", FD, Path, Mode), 269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex), 270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex), - 271: makeSyscallInfo("ppoll", Hex, Hex, Timespec, SigSet, Hex), + 271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex), 272: makeSyscallInfo("unshare", CloneFlags), 273: makeSyscallInfo("set_robust_list", Hex, Hex), 274: makeSyscallInfo("get_robust_list", Hex, Hex, Hex), diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go new file mode 100644 index 000000000..b6b05423c --- /dev/null +++ b/pkg/sentry/strace/poll.go @@ -0,0 +1,72 @@ +// Copyright 2019 Google 
LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "fmt" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// PollEventSet is the set of poll(2) event flags. +var PollEventSet = abi.FlagSet{ + {Flag: linux.POLLIN, Name: "POLLIN"}, + {Flag: linux.POLLPRI, Name: "POLLPRI"}, + {Flag: linux.POLLOUT, Name: "POLLOUT"}, + {Flag: linux.POLLERR, Name: "POLLERR"}, + {Flag: linux.POLLHUP, Name: "POLLHUP"}, + {Flag: linux.POLLNVAL, Name: "POLLNVAL"}, + {Flag: linux.POLLRDNORM, Name: "POLLRDNORM"}, + {Flag: linux.POLLRDBAND, Name: "POLLRDBAND"}, + {Flag: linux.POLLWRNORM, Name: "POLLWRNORM"}, + {Flag: linux.POLLWRBAND, Name: "POLLWRBAND"}, + {Flag: linux.POLLMSG, Name: "POLLMSG"}, + {Flag: linux.POLLREMOVE, Name: "POLLREMOVE"}, + {Flag: linux.POLLRDHUP, Name: "POLLRDHUP"}, + {Flag: linux.POLLFREE, Name: "POLLFREE"}, + {Flag: linux.POLL_BUSY_LOOP, Name: "POLL_BUSY_LOOP"}, +} + +func pollFD(t *kernel.Task, pfd *linux.PollFD, post bool) string { + revents := "..." 
+ if post { + revents = PollEventSet.Parse(uint64(pfd.REvents)) + } + return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, kdefs.FD(pfd.FD)), PollEventSet.Parse(uint64(pfd.Events)), revents) +} + +func pollFDs(t *kernel.Task, addr usermem.Addr, nfds uint, post bool) string { + if addr == 0 { + return "null" + } + + pfds, err := slinux.CopyInPollFDs(t, addr, nfds) + if err != nil { + return fmt.Sprintf("%#x (error decoding pollfds: %s)", addr, err) + } + + s := make([]string, 0, len(pfds)) + for i := range pfds { + s = append(s, pollFD(t, &pfds[i], post)) + } + + return fmt.Sprintf("%#x [%s]", addr, strings.Join(s, ", ")) +} diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 398035b65..a6d870b44 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -438,6 +438,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, capHeader(t, args[arg].Pointer())) case CapData: output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer())) + case PollFDs: + output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false)) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: @@ -502,6 +504,8 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint output[arg] = sigAction(t, args[arg].Pointer()) case PostCapData: output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer()) + case PollFDs: + output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true) } } } diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 1f255c717..8c897fcbe 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -202,6 +202,10 @@ const ( // PostCapData is the data argument to capget(2)/capset(2), formatted // after syscall execution. The previous argument must be CapHeader. 
PostCapData + + // PollFDs is an array of struct pollfd. The number of entries in the + // array is in the next argument. + PollFDs ) // defaultFormat is the syscall argument format to use if the actual format is diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 23fcb907f..17b6768e5 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -155,18 +155,28 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. return timeout, n, nil } -func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { +// CopyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. +func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) { if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { - return timeout, 0, syserror.EINVAL + return nil, syserror.EINVAL } pfd := make([]linux.PollFD, nfds) if nfds > 0 { - if _, err := t.CopyIn(pfdAddr, &pfd); err != nil { - return timeout, 0, err + if _, err := t.CopyIn(addr, &pfd); err != nil { + return nil, err } } + return pfd, nil +} + +func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { + pfd, err := CopyInPollFDs(t, addr, nfds) + if err != nil { + return timeout, 0, err + } + // Compatibility warning: Linux adds POLLHUP and POLLERR just before // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after // polling, changing event masks here is an application-visible difference. @@ -180,7 +190,7 @@ func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Durati // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. 
if nfds > 0 && err == nil { - if _, err := t.CopyOut(pfdAddr, pfd); err != nil { + if _, err := t.CopyOut(addr, pfd); err != nil { return remainingTimeout, 0, err } } -- cgit v1.2.3 From 358eb52a76ebd41baf52972f901af0ff398e131b Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Fri, 19 Apr 2019 16:15:37 -0700 Subject: Add support for the MSG_TRUNC msghdr flag. The MSG_TRUNC flag is set in the msghdr when a message is truncated. Fixes google/gvisor#200 PiperOrigin-RevId: 244440486 Change-Id: I03c7d5e7f5935c0c6b8d69b012db1780ac5b8456 --- pkg/sentry/socket/epsocket/epsocket.go | 45 ++++++++-------- pkg/sentry/socket/hostinet/socket.go | 12 +++-- pkg/sentry/socket/netlink/socket.go | 18 +++++-- pkg/sentry/socket/rpcinet/socket.go | 16 +++--- pkg/sentry/socket/socket.go | 2 +- pkg/sentry/socket/unix/unix.go | 31 +++++++---- pkg/sentry/syscalls/linux/sys_socket.go | 16 +++--- test/syscalls/linux/socket_netlink_route.cc | 80 +++++++++++++++++++++++++++++ test/syscalls/linux/socket_non_stream.cc | 55 ++++++++++++++++++++ test/syscalls/linux/socket_stream.cc | 27 ++++++++++ 10 files changed, 245 insertions(+), 57 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index f370b803b..23138d874 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -376,7 +376,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS if dst.NumBytes() == 0 { return 0, nil } - n, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { return int64(n), syserror.ErrWouldBlock } @@ -1696,7 +1696,7 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq // nonBlockingRead issues a non-blocking read. // // TODO: Support timestamps for stream sockets. 
-func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { +func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() // Fast path for regular reads from stream (e.g., TCP) endpoints. Note @@ -1712,14 +1712,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe s.readMu.Lock() n, err := s.coalescingRead(ctx, dst, trunc) s.readMu.Unlock() - return n, nil, 0, socket.ControlMessages{}, err + return n, 0, nil, 0, socket.ControlMessages{}, err } s.readMu.Lock() defer s.readMu.Unlock() if err := s.fetchReadView(); err != nil { - return 0, nil, 0, socket.ControlMessages{}, err + return 0, 0, nil, 0, socket.ControlMessages{}, err } if !isPacket && peek && trunc { @@ -1727,14 +1727,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // amount that could be read. var rql tcpip.ReceiveQueueSizeOption if err := s.Endpoint.GetSockOpt(&rql); err != nil { - return 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) } available := len(s.readView) + int(rql) bufLen := int(dst.NumBytes()) if available < bufLen { - return available, nil, 0, socket.ControlMessages{}, nil + return available, 0, nil, 0, socket.ControlMessages{}, nil } - return bufLen, nil, 0, socket.ControlMessages{}, nil + return bufLen, 0, nil, 0, socket.ControlMessages{}, nil } n, err := dst.CopyOut(ctx, s.readView) @@ -1751,11 +1751,11 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe if peek { if l := len(s.readView); trunc && l > n { // isPacket must be true. 
- return l, addr, addrLen, s.controlMessages(), syserr.FromError(err) + return l, linux.MSG_TRUNC, addr, addrLen, s.controlMessages(), syserr.FromError(err) } if isPacket || err != nil { - return int(n), addr, addrLen, s.controlMessages(), syserr.FromError(err) + return n, 0, addr, addrLen, s.controlMessages(), syserr.FromError(err) } // We need to peek beyond the first message. @@ -1773,7 +1773,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // We got some data, so no need to return an error. err = nil } - return int(n), nil, 0, s.controlMessages(), syserr.FromError(err) + return n, 0, nil, 0, s.controlMessages(), syserr.FromError(err) } var msgLen int @@ -1785,11 +1785,16 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe s.readView.TrimFront(int(n)) } + var flags int + if msgLen > int(n) { + flags |= linux.MSG_TRUNC + } + if trunc { - return msgLen, addr, addrLen, s.controlMessages(), syserr.FromError(err) + n = msgLen } - return int(n), addr, addrLen, s.controlMessages(), syserr.FromError(err) + return n, flags, addr, addrLen, s.controlMessages(), syserr.FromError(err) } func (s *SocketOperations) controlMessages() socket.ControlMessages { @@ -1810,7 +1815,7 @@ func (s *SocketOperations) updateTimestamp() { // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. 
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 @@ -1819,16 +1824,16 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // Stream sockets ignore the sender address. senderRequested = false } - n, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { // In this situation we should return EAGAIN. - return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } if err != nil && (err != syserr.ErrWouldBlock || dontWait) { // Read failed and we should not retry. 
- return 0, nil, 0, socket.ControlMessages{}, err + return 0, 0, nil, 0, socket.ControlMessages{}, err } if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) { @@ -1847,7 +1852,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags for { var rn int - rn, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) + rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) n += rn if err != nil && err != syserr.ErrWouldBlock { // Always stop on errors other than would block as we generally @@ -1866,12 +1871,12 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if n > 0 { - return n, senderAddr, senderAddrLen, controlMessages, nil + return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil } if err == syserror.ETIMEDOUT { - return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index be63823d8..c4848b313 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -345,14 +345,14 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ } // RecvMsg implements socket.Socket.RecvMsg. 
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // // FIXME: We can't support MSG_ERRQUEUE because it uses ancillary // messages that netstack/tcpip/transport/unix doesn't understand. Kill the // Socket interface's dependence on netstack. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { - return 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument } var senderAddr []byte @@ -360,6 +360,8 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags senderAddr = make([]byte, sizeofSockaddr) } + var msgFlags int + recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { // Refuse to do anything if any part of dst.Addrs was unusable. if uint64(dst.NumBytes()) != dsts.NumBytes() { @@ -391,6 +393,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags return 0, err } senderAddr = senderAddr[:msg.Namelen] + msgFlags = int(msg.Flags) return n, nil }) @@ -417,7 +420,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } } - return int(n), senderAddr, uint32(len(senderAddr)), socket.ControlMessages{}, syserr.FromError(err) + // We don't allow control messages. + msgFlags &^= linux.MSG_CTRUNC + + return int(n), msgFlags, senderAddr, uint32(len(senderAddr)), socket.ControlMessages{}, syserr.FromError(err) } // SendMsg implements socket.Socket.SendMsg. 
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 2503a67c5..0fe9b39b6 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -397,7 +397,7 @@ func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error } // RecvMsg implements socket.Socket.RecvMsg. -func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { +func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { from := linux.SockAddrNetlink{ Family: linux.AF_NETLINK, PortID: 0, @@ -412,10 +412,14 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have } if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { + var mflags int + if n < int64(r.MsgSize) { + mflags |= linux.MSG_TRUNC + } if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, socket.ControlMessages{}, syserr.FromError(err) + return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } // We'll have to block. 
Register for notification and keep trying to @@ -426,17 +430,21 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have for { if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock { + var mflags int + if n < int64(r.MsgSize) { + mflags |= linux.MSG_TRUNC + } if trunc { n = int64(r.MsgSize) } - return int(n), from, fromLen, socket.ControlMessages{}, syserr.FromError(err) + return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 896b5b7ce..3418a6d75 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -673,7 +673,7 @@ func (s *socketOperations) extractControlMessages(payload *pb.RecvmsgResponse_Re } // RecvMsg implements socket.Socket.RecvMsg. 
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{ Fd: s.fd, Length: uint32(dst.NumBytes()), @@ -694,10 +694,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } } c := s.extractControlMessages(res) - return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e) + return int(res.Length), 0, res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e) } if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 { - return 0, nil, 0, socket.ControlMessages{}, err + return 0, 0, nil, 0, socket.ControlMessages{}, err } // We'll have to block. Register for notifications and keep trying to @@ -718,23 +718,23 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } } c := s.extractControlMessages(res) - return int(res.Length), res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e) + return int(res.Length), 0, res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e) } if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain { - return 0, nil, 0, socket.ControlMessages{}, err + return 0, 0, nil, 0, socket.ControlMessages{}, err } if s.isShutRdSet() { // Blocking would have caused us to block indefinitely so we return 0, // this is the same behavior as Linux. 
- return 0, nil, 0, socket.ControlMessages{}, nil + return 0, 0, nil, 0, socket.ControlMessages{}, nil } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if err == syserror.ETIMEDOUT { - return 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) + return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 5ab423f3c..62ba13782 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -88,7 +88,7 @@ type Socket interface { // not necessarily the actual length of the address. // // If err != nil, the recv was not successful. - RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error) + RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error) // SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take // ownership of the ControlMessage on error. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 92411c901..01efd24d3 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -477,7 +477,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // a transport.Endpoint. 
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 @@ -515,11 +515,17 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if r.From != nil { from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) } - if trunc { - n = int64(r.MsgSize) - } + if err != nil || dontWait || !waitAll || s.isPacket || n >= dst.NumBytes() { - return int(n), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) + if s.isPacket && n < int64(r.MsgSize) { + msgFlags |= linux.MSG_TRUNC + } + + if trunc { + n = int64(r.MsgSize) + } + + return int(n), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } // Don't overwrite any data we received. @@ -541,14 +547,19 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) } if trunc { - n = int64(r.MsgSize) + // n and r.MsgSize are the same for streams. 
+ total += int64(r.MsgSize) + } else { + total += n } - total += n if err != nil || !waitAll || s.isPacket || n >= dst.NumBytes() { if total > 0 { err = nil } - return int(total), from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) + if s.isPacket && n < int64(r.MsgSize) { + msgFlags |= linux.MSG_TRUNC + } + return int(total), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } // Don't overwrite any data we received. @@ -560,9 +571,9 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags err = nil } if err == syserror.ETIMEDOUT { - return int(total), nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain + return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } - return int(total), nil, 0, socket.ControlMessages{}, syserr.FromError(err) + return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 49e6f4aeb..30ccc3f66 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -742,17 +742,15 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i // Fast path when no control message nor name buffers are provided. if msg.ControlLen == 0 && msg.NameLen == 0 { - n, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) + n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) } cms.Unix.Release() - if msg.Flags != 0 { + if int(msg.Flags) != mflags { // Copy out the flags to the caller. - // - // TODO: Plumb through actual flags. 
- if _, err := t.CopyOut(msgPtr+flagsOffset, int32(0)); err != nil { + if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } } @@ -763,7 +761,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if msg.ControlLen > maxControlLen { return 0, syscall.ENOBUFS } - n, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) + n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -802,9 +800,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } // Copy out the flags to the caller. - // - // TODO: Plumb through actual flags. - if _, err := t.CopyOut(msgPtr+flagsOffset, int32(0)); err != nil { + if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } @@ -856,7 +852,7 @@ func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, f flags |= linux.MSG_DONTWAIT } - n, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) + n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) cm.Unix.Release() if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index 5f83836df..fa895d841 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -220,6 +220,86 @@ TEST(NetlinkRouteTest, GetLinkDump) { EXPECT_TRUE(loopbackFound); } +TEST(NetlinkRouteTest, MsgHdrMsgTrunc) { + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket()); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + }; + + constexpr uint32_t kSeq = 12345; + + 
struct request req = {}; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + + struct iovec iov = {}; + iov.iov_base = &req; + iov.iov_len = sizeof(req); + + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + // No destination required; it defaults to pid 0, the kernel. + + ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds()); + + // Small enough to ensure that the response doesn't fit. + constexpr size_t kBufferSize = 10; + std::vector buf(kBufferSize); + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); + + ASSERT_THAT(RetryEINTR(recvmsg)(fd.get(), &msg, 0), + SyscallSucceedsWithValue(kBufferSize)); + EXPECT_EQ((msg.msg_flags & MSG_TRUNC), MSG_TRUNC); +} + +TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) { + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket()); + + struct request { + struct nlmsghdr hdr; + struct ifinfomsg ifm; + }; + + constexpr uint32_t kSeq = 12345; + + struct request req = {}; + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = kSeq; + req.ifm.ifi_family = AF_UNSPEC; + + struct iovec iov = {}; + iov.iov_base = &req; + iov.iov_len = sizeof(req); + + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + // No destination required; it defaults to pid 0, the kernel. + + ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds()); + + // Small enough to ensure that the response doesn't fit. 
+ constexpr size_t kBufferSize = 10; + std::vector buf(kBufferSize); + iov.iov_base = buf.data(); + iov.iov_len = buf.size(); + + int res = 0; + ASSERT_THAT(res = RetryEINTR(recvmsg)(fd.get(), &msg, MSG_TRUNC), + SyscallSucceeds()); + EXPECT_GT(res, kBufferSize); + EXPECT_EQ((msg.msg_flags & MSG_TRUNC), MSG_TRUNC); +} + TEST(NetlinkRouteTest, ControlMessageIgnored) { FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket()); uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get())); diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc index d49aab363..d170008a4 100644 --- a/test/syscalls/linux/socket_non_stream.cc +++ b/test/syscalls/linux/socket_non_stream.cc @@ -15,6 +15,7 @@ #include "test/syscalls/linux/socket_non_stream.h" #include +#include #include #include "gtest/gtest.h" @@ -89,6 +90,33 @@ TEST_P(NonStreamSocketPairTest, SingleRecv) { EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1))); } +TEST_P(NonStreamSocketPairTest, RecvmsgMsghdrFlagMsgTrunc) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[10]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data) / 2] = {}; + + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + struct msghdr msg = {}; + msg.msg_flags = -1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data))); + + // Check that msghdr flags were updated. + EXPECT_EQ(msg.msg_flags, MSG_TRUNC); +} + // Stream sockets allow data sent with multiple sends to be peeked at in a // single recv. Datagram sockets (except for unix sockets) do not. 
// @@ -142,6 +170,33 @@ TEST_P(NonStreamSocketPairTest, MsgTruncTruncation) { sizeof(sent_data) / 2)); } +TEST_P(NonStreamSocketPairTest, MsgTruncTruncationRecvmsgMsghdrFlagMsgTrunc) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[10]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data) / 2] = {}; + + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + struct msghdr msg = {}; + msg.msg_flags = -1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC), + SyscallSucceedsWithValue(sizeof(sent_data))); + EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data))); + + // Check that msghdr flags were updated. + EXPECT_EQ(msg.msg_flags, MSG_TRUNC); +} + TEST_P(NonStreamSocketPairTest, MsgTruncSameSize) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[512]; diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc index 32e9d958b..c8a8ad0f6 100644 --- a/test/syscalls/linux/socket_stream.cc +++ b/test/syscalls/linux/socket_stream.cc @@ -81,6 +81,33 @@ TEST_P(StreamSocketPairTest, WriteOneSideClosed) { SyscallFailsWithErrno(EPIPE)); } +TEST_P(StreamSocketPairTest, RecvmsgMsghdrFlagsNoMsgTrunc) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + char sent_data[10]; + RandomizeBuffer(sent_data, sizeof(sent_data)); + ASSERT_THAT( + RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0), + SyscallSucceedsWithValue(sizeof(sent_data))); + + char received_data[sizeof(sent_data) / 2] = {}; + + struct iovec iov; + iov.iov_base = received_data; + iov.iov_len = sizeof(received_data); + struct msghdr msg = {}; + msg.msg_flags = -1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + 
ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0), + SyscallSucceedsWithValue(sizeof(received_data))); + EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data))); + + // Check that msghdr flags were cleared (MSG_TRUNC was not set). + EXPECT_EQ(msg.msg_flags, 0); +} + TEST_P(StreamSocketPairTest, MsgTrunc) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[512]; -- cgit v1.2.3 From f86c35a51ff92718e36ff6075339300be11e09b3 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 22 Apr 2019 18:17:25 -0700 Subject: Clean up state error handling PiperOrigin-RevId: 244773836 Change-Id: I32223f79d2314fe1ac4ddfc63004fc22ff634adf --- pkg/sentry/control/state.go | 2 +- pkg/sentry/kernel/kernel.go | 23 ++++++++++++----------- pkg/state/state.go | 20 +++++++++++++++----- 3 files changed, 28 insertions(+), 17 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index 0a480c84a..b6bbf69fa 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -64,7 +64,7 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error { log.Infof("Save succeeded: exiting...") } else { log.Warningf("Save failed: exiting...") - s.Kernel.SetExitError(err) + s.Kernel.SetSaveError(err) } s.Kernel.Kill(kernel.ExitStatus{}) }, diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 290c4a53c..ee6334509 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -175,9 +175,9 @@ type Kernel struct { // netlinkPorts manages allocation of netlink socket port IDs. netlinkPorts *port.Manager - // exitErr is the error causing the sandbox to exit, if any. It is - // protected by extMu. - exitErr error `state:"nosave"` + // saveErr is the error causing the sandbox to exit during save, if + // any. It is protected by extMu. + saveErr error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. 
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` @@ -1029,20 +1029,21 @@ func (k *Kernel) NetlinkPorts() *port.Manager { return k.netlinkPorts } -// ExitError returns the sandbox error that caused the kernel to exit. -func (k *Kernel) ExitError() error { +// SaveError returns the sandbox error that caused the kernel to exit during +// save. +func (k *Kernel) SaveError() error { k.extMu.Lock() defer k.extMu.Unlock() - return k.exitErr + return k.saveErr } -// SetExitError sets the sandbox error that caused the kernel to exit, if one is -// not already set. -func (k *Kernel) SetExitError(err error) { +// SetSaveError sets the sandbox error that caused the kernel to exit during +// save, if one is not already set. +func (k *Kernel) SetSaveError(err error) { k.extMu.Lock() defer k.extMu.Unlock() - if k.exitErr == nil { - k.exitErr = err + if k.saveErr == nil { + k.saveErr = err } } diff --git a/pkg/state/state.go b/pkg/state/state.go index 4b141777e..4486f83a7 100644 --- a/pkg/state/state.go +++ b/pkg/state/state.go @@ -60,8 +60,8 @@ import ( // ErrState is returned when an error is encountered during encode/decode. type ErrState struct { - // Err is the underlying error. - Err error + // err is the underlying error. + err error // path is the visit path from root to the current object. path string @@ -72,7 +72,17 @@ type ErrState struct { // Error returns a sensible description of the state error. func (e *ErrState) Error() string { - return fmt.Sprintf("%v:\nstate path: %s\n%s", e.Err, e.path, e.trace) + return fmt.Sprintf("%v:\nstate path: %s\n%s", e.err, e.path, e.trace) +} + +// UnwrapErrState returns the underlying error in ErrState. +// +// If err is not *ErrState, err is returned directly. +func UnwrapErrState(err error) error { + if e, ok := err.(*ErrState); ok { + return e.err + } + return err } // Save saves the given object state. 
@@ -318,9 +328,9 @@ func (sr *recoverable) safely(fn func()) (err error) { if r := recover(); r != nil { es := new(ErrState) if e, ok := r.(error); ok { - es.Err = e + es.err = e } else { - es.Err = fmt.Errorf("%v", r) + es.err = fmt.Errorf("%v", r) } es.path = sr.path() -- cgit v1.2.3 From d6aac9387f6def9fa586f94dca39731fb3f6466d Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 22 Apr 2019 18:17:52 -0700 Subject: Fix doc typo PiperOrigin-RevId: 244773890 Change-Id: I2d0cd7789771276ba545b38efff6d3e24133baaa --- pkg/sentry/fs/fs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 36f263235..119689776 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -155,7 +155,7 @@ type ErrCorruption struct { Err error } -// Error returns a sensible description of the save rejection error. +// Error returns a sensible description of the restore error. func (e ErrCorruption) Error() string { return "restore failed due to external file system state in corruption: " + e.Err.Error() } -- cgit v1.2.3 From 17ff6063a37551e83eebab98616a21bbc7e58764 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 22 Apr 2019 20:06:09 -0700 Subject: Bugfix: fix fstatat symbol link to dir For a symbol link to some directory, eg. `/tmp/symlink -> /tmp/dir` `fstatat("/tmp/symlink")` should return symbol link data, but `fstatat("/tmp/symlink/")` (symlink with trailing slash) should return directory data it points following linux behaviour. Currently fstatat() a symlink with trailing slash will get "not a directory" error which is wrong. 
Signed-off-by: Wei Zhang Change-Id: I63469b1fb89d083d1c1255d32d52864606fbd7e2 PiperOrigin-RevId: 244783916 --- pkg/sentry/syscalls/linux/sys_stat.go | 6 ++- test/syscalls/linux/stat.cc | 92 +++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 02634b2dd..49c225011 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -63,7 +63,11 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, fstat(t, file, statAddr) } - return 0, nil, fileOpOn(t, fd, path, flags&linux.AT_SYMLINK_NOFOLLOW == 0, func(root *fs.Dirent, d *fs.Dirent) error { + // If the path ends in a slash (i.e. dirPath is true) or if AT_SYMLINK_NOFOLLOW is unset, + // then we must resolve the final component. + resolve := dirPath || flags&linux.AT_SYMLINK_NOFOLLOW == 0 + + return 0, nil, fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { return stat(t, d, dirPath, statAddr) }) } diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 553fb7e56..48a2059de 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -207,6 +207,98 @@ TEST_F(StatTest, TrailingSlashNotCleanedReturnsENOTDIR) { EXPECT_THAT(lstat(bad_path.c_str(), &buf), SyscallFailsWithErrno(ENOTDIR)); } +// Test fstatating a symlink directory. +TEST_F(StatTest, FstatatSymlinkDir) { + // Create a directory and symlink to it. + const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + const std::string symlink_to_dir = NewTempAbsPath(); + EXPECT_THAT(symlink(dir.path().c_str(), symlink_to_dir.c_str()), + SyscallSucceeds()); + auto cleanup = Cleanup([&symlink_to_dir]() { + EXPECT_THAT(unlink(symlink_to_dir.c_str()), SyscallSucceeds()); + }); + + // Fstatat the link with AT_SYMLINK_NOFOLLOW should return symlink data. 
+ struct stat st = {}; + EXPECT_THAT( + fstatat(AT_FDCWD, symlink_to_dir.c_str(), &st, AT_SYMLINK_NOFOLLOW), + SyscallSucceeds()); + EXPECT_FALSE(S_ISDIR(st.st_mode)); + EXPECT_TRUE(S_ISLNK(st.st_mode)); + + // Fstatat the link should return dir data. + EXPECT_THAT(fstatat(AT_FDCWD, symlink_to_dir.c_str(), &st, 0), + SyscallSucceeds()); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + EXPECT_FALSE(S_ISLNK(st.st_mode)); +} + +// Test fstatating a symlink directory with trailing slash. +TEST_F(StatTest, FstatatSymlinkDirWithTrailingSlash) { + // Create a directory and symlink to it. + const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string symlink_to_dir = NewTempAbsPath(); + EXPECT_THAT(symlink(dir.path().c_str(), symlink_to_dir.c_str()), + SyscallSucceeds()); + auto cleanup = Cleanup([&symlink_to_dir]() { + EXPECT_THAT(unlink(symlink_to_dir.c_str()), SyscallSucceeds()); + }); + + // Fstatat on the symlink with a trailing slash should return the directory + // data. + struct stat st = {}; + EXPECT_THAT( + fstatat(AT_FDCWD, absl::StrCat(symlink_to_dir, "/").c_str(), &st, 0), + SyscallSucceeds()); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + EXPECT_FALSE(S_ISLNK(st.st_mode)); + + // Fstatat on the symlink with a trailing slash with AT_SYMLINK_NOFOLLOW + // should return the directory data. + // Symlink to directory with trailing slash will ignore AT_SYMLINK_NOFOLLOW. + EXPECT_THAT(fstatat(AT_FDCWD, absl::StrCat(symlink_to_dir, "/").c_str(), &st, + AT_SYMLINK_NOFOLLOW), + SyscallSucceeds()); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + EXPECT_FALSE(S_ISLNK(st.st_mode)); +} + +// Test fstatating a symlink directory with a trailing slash +// should return same stat data with fstatating directory. +TEST_F(StatTest, FstatatSymlinkDirWithTrailingSlashSameInode) { + // Create a directory and symlink to it. 
+ const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // We are going to assert that the symlink inode id is the same as the linked + // dir's inode id. In order for the inode id to be stable across + // save/restore, it must be kept open. The FileDescriptor type will do that + // for us automatically. + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY)); + + const std::string symlink_to_dir = NewTempAbsPath(); + EXPECT_THAT(symlink(dir.path().c_str(), symlink_to_dir.c_str()), + SyscallSucceeds()); + auto cleanup = Cleanup([&symlink_to_dir]() { + EXPECT_THAT(unlink(symlink_to_dir.c_str()), SyscallSucceeds()); + }); + + // Fstatat on the symlink with a trailing slash should return the directory + // data. + struct stat st = {}; + EXPECT_THAT(fstatat(AT_FDCWD, absl::StrCat(symlink_to_dir, "/").c_str(), &st, + AT_SYMLINK_NOFOLLOW), + SyscallSucceeds()); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + + // Dir and symlink should point to same inode. + struct stat st_dir = {}; + EXPECT_THAT( + fstatat(AT_FDCWD, dir.path().c_str(), &st_dir, AT_SYMLINK_NOFOLLOW), + SyscallSucceeds()); + EXPECT_EQ(st.st_ino, st_dir.st_ino); +} + TEST_F(StatTest, LeadingDoubleSlash) { // Create a file, and make sure we can stat it. TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); -- cgit v1.2.3 From 6b76c172b48ecb2c342882c0fe6474b2b973dad0 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 25 Apr 2019 16:03:32 -0700 Subject: Don't enforce NAME_MAX in fs.Dirent.walk(). Maximum filename length is filesystem-dependent, and obtained via statfs::f_namelen. This limit is usually 255 bytes (NAME_MAX), but not always. For example, VFAT supports filenames of up to 255... UCS-2 characters, which Linux conservatively takes to mean UTF-8-encoded bytes: fs/fat/inode.c:fat_statfs(), FAT_LFN_LEN * NLS_MAX_CHARSET_SIZE. 
As a result, Linux's VFS does not enforce NAME_MAX: $ rg --maxdepth=1 '\WNAME_MAX\W' fs/ include/linux/ fs/libfs.c 38: buf->f_namelen = NAME_MAX; 64: if (dentry->d_name.len > NAME_MAX) include/linux/relay.h 74: char base_filename[NAME_MAX]; /* saved base filename */ include/linux/fscrypt.h 149: * filenames up to NAME_MAX bytes, since base64 encoding expands the length. include/linux/exportfs.h 176: * understanding that it is already pointing to a a %NAME_MAX+1 sized Remove this check from core VFS, and add it to ramfs (and by extension tmpfs), where it is actually applicable: mm/shmem.c:shmem_dir_inode_operations.lookup == simple_lookup *does* enforce NAME_MAX. PiperOrigin-RevId: 245324748 Change-Id: I17567c4324bfd60e31746a5270096e75db963fac --- pkg/sentry/fs/dirent.go | 5 ----- pkg/sentry/fs/gofer/path.go | 40 ++++++++++++++++++++++++++++++++++++++++ pkg/sentry/fs/ramfs/dir.go | 23 +++++++++++++++++++++++ test/syscalls/linux/chdir.cc | 5 ----- test/syscalls/linux/statfs.cc | 1 + 5 files changed, 64 insertions(+), 10 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4870e7d40..4bcdf530a 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -459,11 +459,6 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl return nil, syscall.ENOTDIR } - // The component must be less than NAME_MAX. - if len(name) > linux.NAME_MAX { - return nil, syscall.ENAMETOOLONG - } - if name == "" || name == "." { d.IncRef() return d, nil diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 5e1a8b623..8ae33d286 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -27,9 +27,17 @@ import ( "gvisor.googlesource.com/gvisor/pkg/syserror" ) +// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's +// encoding of strings, which uses 2 bytes for the length prefix. 
+const maxFilenameLen = (1 << 16) - 1 + // Lookup loads an Inode at name into a Dirent based on the session's cache // policy. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + if len(name) > maxFilenameLen { + return nil, syserror.ENAMETOOLONG + } + cp := i.session().cachePolicy if cp.cacheReaddir() { // Check to see if we have readdirCache that indicates the @@ -72,6 +80,10 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string // // Ownership is currently ignored. func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { + if len(name) > maxFilenameLen { + return nil, syserror.ENAMETOOLONG + } + // Create replaces the directory fid with the newly created/opened // file, so clone this directory so it doesn't change out from under // this node. @@ -139,6 +151,10 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string // CreateLink uses Create to create a symlink between oldname and newname. func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { + if len(newname) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + owner := fs.FileOwnerFromContext(ctx) if _, err := i.fileState.file.symlink(ctx, oldname, newname, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err @@ -149,6 +165,10 @@ func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname // CreateHardLink implements InodeOperations.CreateHardLink. 
func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, target *fs.Inode, newName string) error { + if len(newName) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + targetOpts, ok := target.InodeOperations.(*inodeOperations) if !ok { return syscall.EXDEV @@ -167,6 +187,10 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, t // CreateDirectory uses Create to create a directory named s under inodeOperations. func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s string, perm fs.FilePermissions) error { + if len(s) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + owner := fs.FileOwnerFromContext(ctx) if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err @@ -184,6 +208,10 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s // Bind implements InodeOperations.Bind. func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep transport.BoundEndpoint, perm fs.FilePermissions) (*fs.Dirent, error) { + if len(name) > maxFilenameLen { + return nil, syserror.ENAMETOOLONG + } + if i.session().endpoints == nil { return nil, syscall.EOPNOTSUPP } @@ -252,6 +280,10 @@ func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePe // Remove implements InodeOperations.Remove. func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { + if len(name) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + var key device.MultiDeviceKey removeSocket := false if i.session().endpoints != nil { @@ -284,6 +316,10 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string // Remove implements InodeOperations.RemoveDirectory. 
func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + if len(name) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + // 0x200 = AT_REMOVEDIR. if err := i.fileState.file.unlinkAt(ctx, name, 0x200); err != nil { return err @@ -301,6 +337,10 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na // Rename renames this node. func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + if len(newName) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + // Unwrap the new parent to a *inodeOperations. newParentInodeOperations, ok := newParent.InodeOperations.(*inodeOperations) if !ok { diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 011cf3a16..159fd2981 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -192,6 +192,10 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er // Remove removes the named non-directory. func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { + if len(name) > linux.NAME_MAX { + return syserror.ENAMETOOLONG + } + d.mu.Lock() defer d.mu.Unlock() inode, err := d.removeChildLocked(ctx, name) @@ -206,6 +210,10 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { // RemoveDirectory removes the named directory. func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) error { + if len(name) > linux.NAME_MAX { + return syserror.ENAMETOOLONG + } + d.mu.Lock() defer d.mu.Unlock() @@ -234,6 +242,10 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err // Lookup loads an inode at p into a Dirent. 
func (d *Dir) Lookup(ctx context.Context, _ *fs.Inode, p string) (*fs.Dirent, error) { + if len(p) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } + d.mu.Lock() defer d.mu.Unlock() @@ -265,6 +277,10 @@ func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { // createInodeOperationsCommon creates a new child node at this dir by calling // makeInodeOperations. It is the common logic for creating a new child. func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, makeInodeOperations func() (*fs.Inode, error)) (*fs.Inode, error) { + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } + d.mu.Lock() defer d.mu.Unlock() @@ -314,6 +330,10 @@ func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname st // CreateHardLink creates a new hard link. func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { + if len(name) > linux.NAME_MAX { + return syserror.ENAMETOOLONG + } + d.mu.Lock() defer d.mu.Unlock() @@ -465,6 +485,9 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n if !ok { return syserror.EXDEV } + if len(newName) > linux.NAME_MAX { + return syserror.ENAMETOOLONG + } np.mu.Lock() defer np.mu.Unlock() diff --git a/test/syscalls/linux/chdir.cc b/test/syscalls/linux/chdir.cc index 4905ffb23..a4b54f0ee 100644 --- a/test/syscalls/linux/chdir.cc +++ b/test/syscalls/linux/chdir.cc @@ -54,11 +54,6 @@ TEST(ChdirTest, NotDir) { EXPECT_THAT(chdir(temp_file.path().c_str()), SyscallFailsWithErrno(ENOTDIR)); } -TEST(ChdirTest, NameTooLong) { - std::string name(NAME_MAX + 1, 'a'); - ASSERT_THAT(chdir(name.c_str()), SyscallFailsWithErrno(ENAMETOOLONG)); -} - TEST(ChdirTest, NotExist) { EXPECT_THAT(chdir("/foo/bar"), SyscallFailsWithErrno(ENOENT)); } diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc index 1fc9758c9..e1e7fc707 100644 --- a/test/syscalls/linux/statfs.cc +++ 
b/test/syscalls/linux/statfs.cc @@ -49,6 +49,7 @@ TEST(StatfsTest, NameLen) { struct statfs st; EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds()); + // This assumes that /dev/shm is tmpfs. EXPECT_EQ(st.f_namelen, NAME_MAX); } -- cgit v1.2.3 From f17cfa4d53742923b5c91b149b82a05bcda3ea20 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 25 Apr 2019 17:45:56 -0700 Subject: Perform explicit CPUID and FP state compatibility checks on restore PiperOrigin-RevId: 245341004 Change-Id: Ic4d581039d034a8ae944b43e45e84eb2c3973657 --- pkg/cpuid/cpuid.go | 34 ++++++++++++++++++++++ pkg/sentry/arch/arch_state_x86.go | 59 ++++++++++++++++++++++++++++++--------- pkg/sentry/kernel/kernel.go | 30 ++++++++++++++++++++ 3 files changed, 110 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index 64e2e68f1..61441150e 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -446,6 +446,20 @@ const ( extendedFeatures // Returns some extended feature bits in edx and ecx. ) +// These are the extended floating point state features. They are used to +// enumerate floating point features in XCR0, XSTATE_BV, etc. +const ( + XSAVEFeatureX87 = 1 << 0 + XSAVEFeatureSSE = 1 << 1 + XSAVEFeatureAVX = 1 << 2 + XSAVEFeatureBNDREGS = 1 << 3 + XSAVEFeatureBNDCSR = 1 << 4 + XSAVEFeatureAVX512op = 1 << 5 + XSAVEFeatureAVX512zmm0 = 1 << 6 + XSAVEFeatureAVX512zmm16 = 1 << 7 + XSAVEFeaturePKRU = 1 << 9 +) + var cpuFreqMHz float64 // x86FeaturesFromString includes features from x86FeatureStrings and @@ -561,6 +575,26 @@ func (fs *FeatureSet) Intel() bool { return fs.VendorID == "GenuineIntel" } +// ErrIncompatible is returned by FeatureSet.HostCompatible if fs is not a +// subset of the host feature set. +type ErrIncompatible struct { + message string +} + +// Error implements error. +func (e ErrIncompatible) Error() string { + return e.message +} + +// CheckHostCompatible returns nil if fs is a subset of the host feature set. 
+func (fs *FeatureSet) CheckHostCompatible() error { + hfs := HostFeatureSet() + if diff := fs.Subtract(hfs); diff != nil { + return ErrIncompatible{fmt.Sprintf("CPU feature set %v incompatible with host feature set %v (missing: %v)", fs.FlagsString(false), hfs.FlagsString(false), diff)} + } + return nil +} + // Helper to convert 3 regs into 12-byte vendor ID. func vendorIDFromRegs(bx, cx, dx uint32) string { bytes := make([]byte, 0, 12) diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index 604bd08a6..01949049d 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -15,14 +15,31 @@ package arch import ( - "sync" + "fmt" "syscall" - "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) -// warnOnce is used to warn about truncated state only once. -var warnOnce sync.Once +// ErrFloatingPoint indicates a failed restore due to unusable floating point +// state. +type ErrFloatingPoint struct { + // supported is the supported floating point state. + supported uint64 + + // saved is the saved floating point state. + saved uint64 +} + +// Error returns a sensible description of the restore error. +func (e ErrFloatingPoint) Error() string { + return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supported, e.saved) +} + +// XSTATE_BV does not exist if FXSAVE is used, but FXSAVE implicitly saves x87 +// and SSE state, so this is the equivalent XSTATE_BV value. +const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE // afterLoad is invoked by stateify. func (s *State) afterLoad() { @@ -33,7 +50,8 @@ func (s *State) afterLoad() { // state that may be saved by the new CPU. Even if extraneous new state // is saved, the state we care about is guaranteed to be a subset of // new state. 
Later optimizations can use less space when using a - // smaller state component bitmap. Intel SDM section 13 has more info. + // smaller state component bitmap. Intel SDM Volume 1 Chapter 13 has + // more info. s.x86FPState = newX86FPState() // x86FPState always contains all the FP state supported by the host. @@ -41,15 +59,30 @@ func (s *State) afterLoad() { // which we cannot restore. // // The x86 FP state areas are backwards compatible, so we can simply - // truncate the additional floating point state. Applications should - // not depend on the truncated state because it should relate only to - // features that were not exposed in the app FeatureSet. + // truncate the additional floating point state. + // + // Applications should not depend on the truncated state because it + // should relate only to features that were not exposed in the app + // FeatureSet. However, because we do not *prevent* them from using + // this state, we must verify here that there is no in-use state + // (according to XSTATE_BV) which we do not support. if len(s.x86FPState) < len(old) { - warnOnce.Do(func() { - // This will occur on every instance of state, don't - // bother warning more than once. - log.Infof("dropping %d bytes of floating point state; the application should not depend on this state", len(old)-len(s.x86FPState)) - }) + // What do we support? + supportedBV := fxsaveBV + if fs := cpuid.HostFeatureSet(); fs.UseXsave() { + supportedBV = fs.ValidXCR0Mask() + } + + // What was in use? + savedBV := fxsaveBV + if len(old) >= xstateBVOffset+8 { + savedBV = usermem.ByteOrder.Uint64(old[xstateBVOffset:]) + } + + // Supported features must be a superset of saved features. + if savedBV&^supportedBV != 0 { + panic(ErrFloatingPoint{supported: supportedBV, saved: savedBV}) + } } // Copy to the new, aligned location. 
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index ee6334509..a1b2d7161 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -337,6 +337,17 @@ func (k *Kernel) SaveTo(w io.Writer) error { return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) } + // Save the CPUID FeatureSet before the rest of the kernel so we can + // verify its compatibility on restore before attempting to restore the + // entire kernel, which may fail on an incompatible machine. + // + // N.B. This will also be saved along with the full kernel save below. + cpuidStart := time.Now() + if err := state.Save(w, k.FeatureSet(), nil); err != nil { + return err + } + log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) + // Save the kernel state. kernelStart := time.Now() var stats state.Stats @@ -469,6 +480,25 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { initAppCores := k.applicationCores + // Load the pre-saved CPUID FeatureSet. + // + // N.B. This was also saved along with the full kernel below, so we + // don't need to explicitly install it in the Kernel. + cpuidStart := time.Now() + var features cpuid.FeatureSet + if err := state.Load(r, &features, nil); err != nil { + return err + } + log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) + + // Verify that the FeatureSet is usable on this host. We do this before + // Kernel load so that the explicit CPUID mismatch error has priority + // over floating point state restore errors that may occur on load on + // an incompatible machine. + if err := features.CheckHostCompatible(); err != nil { + return err + } + // Load the kernel state. kernelStart := time.Now() var stats state.Stats -- cgit v1.2.3 From 5f13338d30fb59241cf7f1aa6374c54c69677314 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Fri, 26 Apr 2019 11:08:37 -0700 Subject: Fix reference counting bug in /proc/PID/fdinfo/. 
PiperOrigin-RevId: 245452217 Change-Id: I7164d8f57fe34c17e601079eb9410a6d95af1869 --- pkg/sentry/fs/proc/fds.go | 19 +------------------ test/syscalls/linux/pipe.cc | 36 +++++++++++++++++++----------------- 2 files changed, 20 insertions(+), 35 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 939ebaba1..25da06f5d 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -236,24 +236,6 @@ func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySer }) } -// fdInfoInode is a single file in /proc/TID/fdinfo/. -// -// +stateify savable -type fdInfoInode struct { - staticFileInodeOps - - file *fs.File - flags fs.FileFlags - fdFlags kernel.FDFlags -} - -var _ fs.InodeOperations = (*fdInfoInode)(nil) - -// Release implements fs.InodeOperations.Release. -func (f *fdInfoInode) Release(ctx context.Context) { - f.file.DecRef() -} - // fdInfoDir implements /proc/TID/fdinfo. It embeds an fdDir, but overrides // Lookup and Readdir. // @@ -283,6 +265,7 @@ func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs // locks, and other data. For now we only have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() + file.DecRef() contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags)) return newStaticProcInode(ctx, dir.MountSource, contents) }) diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index 4731157e8..c49ec9f09 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -455,24 +455,26 @@ TEST_F(PipeTest, LargeFile) { EXPECT_EQ(rflags, 0); } -// Test that accessing /proc//fd/ correctly decrements the refcount of -// that file descriptor. +// Test that accesses of /proc//fd/ and /proc//fdinfo/ +// correctly decrement the refcount of that file descriptor. 
TEST_F(PipeTest, ProcFDReleasesFile) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - FileDescriptor rfd(fds[0]); - FileDescriptor wfd(fds[1]); - - // Stat the pipe FD, which shouldn't alter the refcount of the write end of - // the pipe. - struct stat wst; - ASSERT_THAT(lstat(absl::StrCat("/proc/self/fd/", wfd.get()).c_str(), &wst), - SyscallSucceeds()); - - // Close the write end of the pipe and ensure that read indicates EOF. - wfd.reset(); - char buf; - ASSERT_THAT(read(rfd.get(), &buf, 1), SyscallSucceedsWithValue(0)); + std::vector paths = {"/proc/self/fd/", "/proc/self/fdinfo/"}; + for (const std::string& path : paths) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + FileDescriptor rfd(fds[0]); + FileDescriptor wfd(fds[1]); + + // Stat the pipe FD, which shouldn't alter the refcount of the write end of + // the pipe. + struct stat wst; + ASSERT_THAT(lstat(absl::StrCat(path.c_str(), wfd.get()).c_str(), &wst), + SyscallSucceeds()); + // Close the write end of the pipe and ensure that read indicates EOF. + wfd.reset(); + char buf; + ASSERT_THAT(read(rfd.get(), &buf, 1), SyscallSucceedsWithValue(0)); + } } } // namespace -- cgit v1.2.3 From 5749f64314d38516badec156ab048d3523294a81 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 26 Apr 2019 13:51:48 -0700 Subject: kvm: remove non-sane sanity check Apparently some platforms don't have pSize < vSize. 
Fixes #208 PiperOrigin-RevId: 245480998 Change-Id: I2a98229912f4ccbfcd8e79dfa355104f14275a9c --- pkg/sentry/platform/kvm/physical_map.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index b908cae6a..9d7dca5b3 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -50,8 +50,9 @@ type physicalRegion struct { var physicalRegions []physicalRegion // fillAddressSpace fills the host address space with PROT_NONE mappings until -// the number of available bits until we have a host address space size that is -// equal to the physical address space. +// we have a host address space size that is less than or equal to the physical +// address space. This allows us to have an injective host virtual to guest +// physical mapping. // // The excluded regions are returned. func fillAddressSpace() (excludedRegions []region) { @@ -67,11 +68,6 @@ func fillAddressSpace() (excludedRegions []region) { pSize := uintptr(1) << ring0.PhysicalAddressBits() pSize -= reservedMemory - // Sanity check. - if vSize < pSize { - panic(fmt.Sprintf("vSize (%x) < pSize (%x)", vSize, pSize)) - } - // Add specifically excluded regions; see excludeVirtualRegion. applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) { @@ -81,6 +77,11 @@ func fillAddressSpace() (excludedRegions []region) { } }) + // Do we need any more work? + if vSize < pSize { + return excludedRegions + } + // Calculate the required space and fill it. // // Note carefully that we add faultBlockSize to required up front, and -- cgit v1.2.3 From 2df64cd6d2c835ce5b37a8b9111d24ad382b5d3d Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 29 Apr 2019 10:29:14 -0700 Subject: createAt should return all errors from FindInode except ENOENT. 
Previously, createAt was eating all errors from FindInode except for EACCES and proceeding with the creation. This is incorrect, as FindInode can return many other errors (like ENAMETOOLONG) that should stop creation. This CL changes createAt to return all errors encountered except for ENOENT, which we can ignore because we are about to create the thing. PiperOrigin-RevId: 245773222 Change-Id: I1b317021de70f0550fb865506f6d8147d4aebc56 --- pkg/sentry/syscalls/linux/sys_file.go | 9 +++++---- test/syscalls/linux/creat.cc | 11 +++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index d2d351449..50151f7b6 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -347,10 +347,9 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } defer newFile.DecRef() - case syserror.EACCES: - // Permission denied while walking to the file. - return err - default: + case syserror.ENOENT: + // File does not exist. Proceed with creation. + // Do we have write permissions on the parent? if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { return err @@ -365,6 +364,8 @@ func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mod } defer newFile.DecRef() targetDirent = newFile.Dirent + default: + return err } // Success. 
diff --git a/test/syscalls/linux/creat.cc b/test/syscalls/linux/creat.cc index 72a016b4c..df2cc0d5c 100644 --- a/test/syscalls/linux/creat.cc +++ b/test/syscalls/linux/creat.cc @@ -51,6 +51,17 @@ TEST(CreatTest, CreatTruncatesExistingFile) { EXPECT_EQ("", new_contents); } +TEST(CreatTest, CreatWithNameTooLong) { + // Start with a unique name, and pad it to NAME_MAX + 1; + std::string name = NewTempRelPath(); + int padding = (NAME_MAX + 1) - name.size(); + name.append(padding, 'x'); + const std::string& path = JoinPath(GetAbsoluteTestTmpdir(), name); + + // Creation should return ENAMETOOLONG. + ASSERT_THAT(creat(path.c_str(), kMode), SyscallFailsWithErrno(ENAMETOOLONG)); +} + } // namespace } // namespace testing -- cgit v1.2.3 From f4ce43e1f426148d99c28c1b0e5c43ddda17a8cb Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 29 Apr 2019 14:03:04 -0700 Subject: Allow and document bug ids in gVisor codebase. PiperOrigin-RevId: 245818639 Change-Id: I03703ef0fb9b6675955637b9fe2776204c545789 --- CONTRIBUTING.md | 7 +++ pkg/cpuid/cpuid_test.go | 2 +- pkg/dhcp/client.go | 2 +- pkg/log/glog.go | 2 +- pkg/metric/metric.go | 4 +- pkg/segment/set.go | 2 +- pkg/segment/test/set_functions.go | 2 +- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/arch_amd64.go | 4 +- pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/arch/signal_amd64.go | 6 +-- pkg/sentry/arch/stack.go | 6 +-- pkg/sentry/context/context.go | 2 +- pkg/sentry/control/proc.go | 2 +- pkg/sentry/fs/README.md | 2 +- pkg/sentry/fs/ashmem/area.go | 4 +- pkg/sentry/fs/binder/binder.go | 22 ++++---- pkg/sentry/fs/dentry.go | 2 +- pkg/sentry/fs/dirent.go | 8 +-- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/file_overlay.go | 4 +- pkg/sentry/fs/fsutil/file.go | 8 +-- pkg/sentry/fs/fsutil/inode_cached.go | 4 +- pkg/sentry/fs/gofer/cache_policy.go | 4 +- pkg/sentry/fs/gofer/file.go | 2 +- pkg/sentry/fs/gofer/file_state.go | 2 +- pkg/sentry/fs/gofer/handles.go | 2 +- pkg/sentry/fs/gofer/inode.go | 6 +-- 
pkg/sentry/fs/gofer/inode_state.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/host/fs.go | 4 +- pkg/sentry/fs/host/inode.go | 10 ++-- pkg/sentry/fs/inode.go | 6 +-- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 6 +-- pkg/sentry/fs/mount.go | 4 +- pkg/sentry/fs/mount_test.go | 2 +- pkg/sentry/fs/proc/README.md | 12 ++--- pkg/sentry/fs/proc/fds.go | 2 +- pkg/sentry/fs/proc/loadavg.go | 2 +- pkg/sentry/fs/proc/meminfo.go | 6 +-- pkg/sentry/fs/proc/mounts.go | 2 +- pkg/sentry/fs/proc/net.go | 2 +- pkg/sentry/fs/proc/stat.go | 12 ++--- pkg/sentry/fs/proc/sys_net.go | 2 +- pkg/sentry/fs/proc/task.go | 8 +-- pkg/sentry/fs/proc/version.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tmpfs/tmpfs.go | 2 +- pkg/sentry/fs/tty/dir.go | 6 +-- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/master.go | 6 +-- pkg/sentry/fs/tty/slave.go | 6 +-- pkg/sentry/kernel/auth/credentials.go | 2 +- pkg/sentry/kernel/auth/user_namespace.go | 2 +- pkg/sentry/kernel/pending_signals.go | 2 +- pkg/sentry/kernel/ptrace.go | 4 +- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/sched/cpuset.go | 2 +- pkg/sentry/kernel/semaphore/semaphore.go | 6 +-- pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 4 +- pkg/sentry/kernel/task_identity.go | 2 +- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_signals.go | 4 +- pkg/sentry/kernel/task_stop.go | 2 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/loader/vdso.go | 6 +-- pkg/sentry/memmap/memmap.go | 2 +- pkg/sentry/mm/aio_context.go | 2 +- pkg/sentry/mm/procfs.go | 10 ++-- pkg/sentry/mm/special_mappable.go | 2 +- pkg/sentry/mm/syscalls.go | 6 +-- pkg/sentry/mm/vma.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 2 +- 
pkg/sentry/platform/platform.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- pkg/sentry/platform/ring0/x86.go | 4 +- pkg/sentry/sighandling/sighandling.go | 2 +- pkg/sentry/sighandling/sighandling_unsafe.go | 2 +- pkg/sentry/socket/epsocket/epsocket.go | 32 ++++++------ pkg/sentry/socket/epsocket/save_restore.go | 2 +- pkg/sentry/socket/epsocket/stack.go | 2 +- pkg/sentry/socket/hostinet/socket.go | 2 +- pkg/sentry/socket/netlink/route/protocol.go | 8 +-- pkg/sentry/socket/netlink/socket.go | 10 ++-- pkg/sentry/socket/rpcinet/conn/conn.go | 2 +- pkg/sentry/socket/rpcinet/notifier/notifier.go | 4 +- pkg/sentry/socket/rpcinet/socket.go | 6 +-- pkg/sentry/socket/rpcinet/syscall_rpc.proto | 2 +- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/syscalls/linux/error.go | 2 +- pkg/sentry/syscalls/linux/linux64.go | 60 +++++++++++----------- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 4 +- pkg/sentry/syscalls/linux/sys_mmap.go | 4 +- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 4 +- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- pkg/sentry/syscalls/linux/sys_write.go | 4 +- pkg/sentry/time/calibrated_clock.go | 6 +-- pkg/sentry/time/parameters.go | 2 +- pkg/sentry/usermem/usermem.go | 4 +- pkg/sentry/watchdog/watchdog.go | 2 +- pkg/syserr/syserr.go | 10 ++-- pkg/tcpip/network/ipv4/icmp.go | 2 +- pkg/tcpip/network/ipv6/icmp.go | 4 +- pkg/tcpip/stack/nic.go | 6 +-- pkg/tcpip/stack/stack.go | 4 +- pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/transport/raw/raw.go | 2 +- pkg/tcpip/transport/tcp/BUILD | 2 +- pkg/unet/unet.go | 2 +- pkg/unet/unet_test.go | 2 +- runsc/boot/controller.go | 4 +- runsc/boot/fs.go | 6 +-- runsc/boot/loader.go | 2 +- runsc/cmd/checkpoint.go | 2 +- runsc/container/container.go | 2 +- runsc/container/container_test.go | 4 +- runsc/sandbox/sandbox.go | 6 +-- runsc/specutils/specutils.go | 4 
+- test/syscalls/BUILD | 6 +-- test/syscalls/build_defs.bzl | 4 +- test/syscalls/linux/32bit.cc | 14 ++--- test/syscalls/linux/aio.cc | 2 +- test/syscalls/linux/chmod.cc | 2 +- test/syscalls/linux/epoll.cc | 2 +- test/syscalls/linux/exec_binary.cc | 12 ++--- test/syscalls/linux/file_base.h | 4 +- test/syscalls/linux/ioctl.cc | 4 +- test/syscalls/linux/ip_socket_test_util.cc | 2 +- test/syscalls/linux/lseek.cc | 2 +- test/syscalls/linux/mkdir.cc | 2 +- test/syscalls/linux/mmap.cc | 18 +++---- test/syscalls/linux/open.cc | 2 +- test/syscalls/linux/partial_bad_buffer.cc | 18 +++---- test/syscalls/linux/pipe.cc | 6 +-- test/syscalls/linux/proc.cc | 32 ++++++------ test/syscalls/linux/proc_pid_smaps.cc | 2 +- test/syscalls/linux/ptrace.cc | 2 +- test/syscalls/linux/pwrite64.cc | 2 +- test/syscalls/linux/readv_socket.cc | 2 +- test/syscalls/linux/rtsignal.cc | 2 +- test/syscalls/linux/socket_inet_loopback.cc | 10 ++-- .../socket_ipv4_udp_unbound_external_networking.cc | 4 +- test/syscalls/linux/socket_netlink_route.cc | 4 +- test/syscalls/linux/socket_stream_blocking.cc | 2 +- test/syscalls/linux/socket_test_util.cc | 2 +- test/syscalls/linux/socket_unix.cc | 16 +++--- test/syscalls/linux/socket_unix_dgram.cc | 2 +- .../linux/socket_unix_dgram_non_blocking.cc | 2 +- test/syscalls/linux/socket_unix_non_stream.cc | 10 ++-- .../linux/socket_unix_unbound_seqpacket.cc | 2 +- test/syscalls/linux/socket_unix_unbound_stream.cc | 4 +- test/syscalls/linux/stat.cc | 2 +- test/syscalls/linux/stat_times.cc | 8 +-- test/syscalls/linux/tcp_socket.cc | 2 +- test/syscalls/linux/tkill.cc | 2 +- test/syscalls/linux/udp_bind.cc | 4 +- test/syscalls/linux/uidgid.cc | 2 +- test/syscalls/linux/utimes.cc | 4 +- test/syscalls/linux/wait.cc | 2 +- test/syscalls/linux/write.cc | 2 +- third_party/gvsync/downgradable_rwmutex_unsafe.go | 2 +- vdso/cycle_clock.h | 2 +- vdso/vdso_amd64.lds | 2 +- vdso/vdso_arm64.lds | 2 +- 176 files changed, 403 insertions(+), 396 deletions(-) (limited to 'pkg/sentry') 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d6dafc595..238dd6665 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -126,6 +126,13 @@ change. When approved, the change will be submitted by a team member and automatically merged into the repository. +### Bug IDs + +Some TODOs and NOTEs sprinkled throughout the code have associated IDs of the +form b/1234. These correspond to bugs in our internal bug tracker. Eventually +these bugs will be moved to the GitHub Issues, but until then they can simply be +ignored. + ### The small print Contributions made by corporations are covered by a different agreement than the diff --git a/pkg/cpuid/cpuid_test.go b/pkg/cpuid/cpuid_test.go index 35e7b8e50..64ade1cbe 100644 --- a/pkg/cpuid/cpuid_test.go +++ b/pkg/cpuid/cpuid_test.go @@ -78,7 +78,7 @@ func TestTakeFeatureIntersection(t *testing.T) { } } -// TODO: Run this test on a very old platform, and make sure more +// TODO(b/73346484): Run this test on a very old platform, and make sure more // bits are enabled than just FPU and PAE. This test currently may not detect // if HostFeatureSet gives back junk bits. func TestHostFeatureSet(t *testing.T) { diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 354205e63..2ba79be32 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -120,7 +120,7 @@ func (c *Client) Config() Config { // If the server sets a lease limit a timer is set to automatically // renew it. func (c *Client) Request(ctx context.Context, requestedAddr tcpip.Address) (cfg Config, reterr error) { - // TODO: remove calls to {Add,Remove}Address when they're no + // TODO(b/127321246): remove calls to {Add,Remove}Address when they're no // longer required to send and receive broadcast. 
if err := c.stack.AddAddressWithOptions(c.nicid, ipv4.ProtocolNumber, tcpipHeader.IPv4Any, stack.NeverPrimaryEndpoint); err != nil && err != tcpip.ErrDuplicateAddress { return Config{}, fmt.Errorf("dhcp: AddAddressWithOptions(): %s", err) diff --git a/pkg/log/glog.go b/pkg/log/glog.go index fbb58501b..24d5390d7 100644 --- a/pkg/log/glog.go +++ b/pkg/log/glog.go @@ -144,7 +144,7 @@ func (g GoogleEmitter) Emit(level Level, timestamp time.Time, format string, arg b.writeAll(pid) b.write(' ') - // FIXME: The caller, fabricated. This really sucks, but it + // FIXME(b/73383460): The caller, fabricated. This really sucks, but it // is unacceptable to put runtime.Callers() in the hot path. b.writeAll(caller) b.write(']') diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 02af75974..e5eb95f89 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -44,8 +44,8 @@ var ( // // Metrics are not saved across save/restore and thus reset to zero on restore. // -// TODO: Support non-cumulative metrics. -// TODO: Support metric fields. +// TODO(b/67298402): Support non-cumulative metrics. +// TODO(b/67298427): Support metric fields. // type Uint64Metric struct { // value is the actual value of the metric. 
It must be accessed diff --git a/pkg/segment/set.go b/pkg/segment/set.go index a9a3b8875..74a916ea3 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -1270,7 +1270,7 @@ func segmentAfterPosition(n *node, i int) Iterator { } func zeroValueSlice(slice []Value) { - // TODO: check if Go is actually smart enough to optimize a + // TODO(jamieliu): check if Go is actually smart enough to optimize a // ClearValue that assigns nil to a memset here for i := range slice { Functions{}.ClearValue(&slice[i]) diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go index 05ba5fbb9..41f649011 100644 --- a/pkg/segment/test/set_functions.go +++ b/pkg/segment/test/set_functions.go @@ -15,7 +15,7 @@ package segment // Basic numeric constants that we define because the math package doesn't. -// TODO: These should be Math.MaxInt64/MinInt64? +// TODO(nlacasse): These should be Math.MaxInt64/MinInt64? const ( maxInt = int(^uint(0) >> 1) minInt = -maxInt - 1 diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 4cd7a9af5..16d8eb2b2 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -53,7 +53,7 @@ type FloatingPointData byte // Context provides architecture-dependent information for a specific thread. // -// NOTE: Currently we use uintptr here to refer to a generic native +// NOTE(b/34169503): Currently we use uintptr here to refer to a generic native // register value. While this will work for the foreseeable future, it isn't // strictly correct. We may want to create some abstraction that makes this // more clear or enables us to store values of arbitrary widths. 
This is diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 2507774f7..7ec2f2c84 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -305,7 +305,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs()) return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil } - // TODO: debug registers + // TODO(b/34088053): debug registers return c.Native(0), nil } @@ -320,6 +320,6 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error { _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) return err } - // TODO: debug registers + // TODO(b/34088053): debug registers return nil } diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index c8bf0e7f2..4305fe2cb 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -306,7 +306,7 @@ func (s *State) ptraceGetRegs() syscall.PtraceRegs { // FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the // same in PtraceSetRegs.) // - // TODO: Remove this fixup since newer Linux + // TODO(gvisor.dev/issue/168): Remove this fixup since newer Linux // doesn't have this behavior anymore. if regs.Fs == 0 && regs.Fs_base <= 0xffffffff { regs.Fs = _FS_TLS_SEL diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index c9de36897..7f76eba27 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -319,7 +319,7 @@ func (c *context64) NewSignalStack() NativeSignalStack { // From Linux 'arch/x86/include/uapi/asm/sigcontext.h' the following is the // size of the magic cookie at the end of the xsave frame. // -// NOTE: Currently we don't actually populate the fpstate +// NOTE(b/33003106#comment11): Currently we don't actually populate the fpstate // on the signal stack. 
const _FP_XSTATE_MAGIC2_SIZE = 4 @@ -392,7 +392,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt Sigset: sigset, } - // TODO: Set SignalContext64.Err, Trapno, and Cr2 + // TODO(gvisor.dev/issue/159): Set SignalContext64.Err, Trapno, and Cr2 // based on the fault that caused the signal. For now, leave Err and // Trapno unset and assume CR2 == info.Addr() for SIGSEGVs and // SIGBUSes. @@ -505,7 +505,7 @@ func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalSt l := len(c.sigFPState) if l > 0 { c.x86FPState = c.sigFPState[l-1] - // NOTE: State save requires that any slice + // NOTE(cl/133042258): State save requires that any slice // elements from '[len:cap]' to be zero value. c.sigFPState[l-1] = nil c.sigFPState = c.sigFPState[0 : l-1] diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index f2cfb0426..2e33ccdf5 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -97,7 +97,7 @@ func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) { if c < 0 { return 0, fmt.Errorf("bad binary.Size for %T", v) } - // TODO: Use a real context.Context. + // TODO(b/38173783): Use a real context.Context. n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{}) if err != nil || c != n { return 0, err @@ -121,11 +121,11 @@ func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) { var err error if isVaddr { value := s.Arch.Native(uintptr(0)) - // TODO: Use a real context.Context. + // TODO(b/38173783): Use a real context.Context. n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{}) *vaddr = usermem.Addr(s.Arch.Value(value)) } else { - // TODO: Use a real context.Context. + // TODO(b/38173783): Use a real context.Context. 
n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{}) } if err != nil { diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index 7ed6a5e8a..eefc3e1b4 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -114,7 +114,7 @@ var bgContext = &logContext{Logger: log.Log()} // Background returns an empty context using the default logger. // // Users should be wary of using a Background context. Please tag any use with -// FIXME and a note to remove this use. +// FIXME(b/38173783) and a note to remove this use. // // Generally, one should use the Task as their context when available, or avoid // having to use a context in places where a Task is unavailable. diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index e848def14..aca2267a7 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -261,7 +261,7 @@ func (proc *Proc) Ps(args *PsArgs, out *string) error { } // Process contains information about a single process in a Sandbox. -// TODO: Implement TTY field. +// TODO(b/117881927): Implement TTY field. type Process struct { UID auth.KUID `json:"uid"` PID kernel.ThreadID `json:"pid"` diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md index a88a0cd3a..f53ed3eaa 100644 --- a/pkg/sentry/fs/README.md +++ b/pkg/sentry/fs/README.md @@ -59,7 +59,7 @@ two categories: The first is always necessary to save and restore. An application may never have any open file descriptors, but across save and restore it should see a coherent -view of any mount namespace. NOTE: Currently only one "initial" +view of any mount namespace. NOTE(b/63601033): Currently only one "initial" mount namespace is supported. 
The second is so that system calls across save and restore are coherent with diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index 651cbc164..1f61c5711 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -240,7 +240,7 @@ func (a *Area) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume return 0, syserror.EINVAL } - // TODO: If personality flag + // TODO(b/30946773,gvisor.dev/issue/153): If personality flag // READ_IMPLIES_EXEC is set, set PROT_EXEC if PORT_READ is set. a.perms = perms @@ -290,7 +290,7 @@ func (a *Area) pinOperation(pin linux.AshmemPin, op uint32) (uintptr, error) { return linux.AshmemNotPurged, nil case linux.AshmemUnpinIoctl: - // TODO: Implement purge on unpin. + // TODO(b/30946773): Implement purge on unpin. a.pb.UnpinRange(r) return 0, nil diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index a41b5dcae..d9f1559de 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -69,7 +69,7 @@ func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) * // GetFile implements fs.InodeOperations.GetFile. // -// TODO: Add functionality to GetFile: Additional fields will be +// TODO(b/30946773): Add functionality to GetFile: Additional fields will be // needed in the Device structure, initialize them here. Also, Device will need // to keep track of the created Procs in order to implement BINDER_READ_WRITE // ioctl. @@ -133,7 +133,7 @@ func (bp *Proc) Write(ctx context.Context, file *fs.File, src usermem.IOSequence // Flush implements fs.FileOperations.Flush. // -// TODO: Implement. +// TODO(b/30946773): Implement. 
func (bp *Proc) Flush(ctx context.Context, file *fs.File) error { return nil } @@ -149,7 +149,7 @@ func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.M } opts.MaxPerms.Write = false - // TODO: Binder sets VM_DONTCOPY, preventing the created vma + // TODO(b/30946773): Binder sets VM_DONTCOPY, preventing the created vma // from being copied across fork(), but we don't support this yet. As // a result, MMs containing a Binder mapping cannot be forked (MM.Fork will // fail when AddMapping returns EBUSY). @@ -159,7 +159,7 @@ func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.M // Ioctl implements fs.FileOperations.Ioctl. // -// TODO: Implement. +// TODO(b/30946773): Implement. func (bp *Proc) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Switch on ioctl request. switch uint32(args[1].Int()) { @@ -173,22 +173,22 @@ func (bp *Proc) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgum }) return 0, err case linux.BinderWriteReadIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderSetIdleTimeoutIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderSetMaxThreadsIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderSetIdlePriorityIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderSetContextMgrIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. fallthrough case linux.BinderThreadExitIoctl: - // TODO: Implement. + // TODO(b/30946773): Implement. return 0, syserror.ENOSYS default: // Ioctls irrelevant to Binder. @@ -228,7 +228,7 @@ func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, // Translate implements memmap.Mappable.Translate. 
func (bp *Proc) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { - // TODO: In addition to the page initially allocated and mapped + // TODO(b/30946773): In addition to the page initially allocated and mapped // in AddMapping (Linux: binder_mmap), Binder allocates and maps pages for // each transaction (Linux: binder_ioctl => binder_ioctl_write_read => // binder_thread_write => binder_transaction => binder_alloc_buf => diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index 4879df4d6..29fb155a4 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -83,7 +83,7 @@ type DirCtx struct { attrs map[string]DentAttr // DirCursor is the directory cursor. - // TODO: Once Handles are removed this can just live in the + // TODO(b/67778717): Once Handles are removed this can just live in the // respective FileOperations implementations and not need to get // plumbed everywhere. DirCursor *string diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 4bcdf530a..54fc11fe1 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -318,7 +318,7 @@ func (d *Dirent) SyncAll(ctx context.Context) { // There is nothing to sync for a read-only filesystem. if !d.Inode.MountSource.Flags.ReadOnly { - // FIXME: This should be a mount traversal, not a + // FIXME(b/34856369): This should be a mount traversal, not a // Dirent traversal, because some Inodes that need to be synced // may no longer be reachable by name (after sys_unlink). // @@ -1506,7 +1506,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } // Are we frozen? - // TODO: Is this the right errno? + // TODO(jamieliu): Is this the right errno? 
if oldParent.frozen && !oldParent.Inode.IsVirtual() { return syscall.ENOENT } @@ -1565,7 +1565,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string } else { // Check constraints on the dirent being replaced. - // NOTE: We don't want to keep replaced alive + // NOTE(b/111808347): We don't want to keep replaced alive // across the Rename, so must call DecRef manually (no defer). // Check that we can delete replaced. @@ -1606,7 +1606,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Allow the file system to drop extra references on replaced. replaced.dropExtendedReference() - // NOTE: Keeping a dirent + // NOTE(b/31798319,b/31867149,b/31867671): Keeping a dirent // open across renames is currently broken for multiple // reasons, so we flush all references on the replaced node and // its children. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 2c2126f17..5d5026661 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -65,7 +65,7 @@ const FileMaxOffset = math.MaxInt64 // under a single abortable mutex which also synchronizes lseek(2), read(2), // and write(2). // -// FIXME: Split synchronization from cancellation. +// FIXME(b/38451980): Split synchronization from cancellation. // // +stateify savable type File struct { diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index e1f02f0f4..6e680f0a4 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -160,7 +160,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See // If this was a seek on a directory, we must update the cursor. if seekDir && whence == SeekSet && offset == 0 { // Currently only seeking to 0 on a directory is supported. - // FIXME: Lift directory seeking limitations. + // FIXME(b/33075855): Lift directory seeking limitations. 
f.dirCursor = "" } return n, nil @@ -329,7 +329,7 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt if !o.isMappableLocked() { return syserror.ENODEV } - // FIXME: This is a copy/paste of fsutil.GenericConfigureMMap, + // FIXME(jamieliu): This is a copy/paste of fsutil.GenericConfigureMMap, // which we can't use because the overlay implementation is in package fs, // so depending on fs/fsutil would create a circular dependency. Move // overlay to fs/overlay. diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index df34dc788..42afdd11c 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -36,7 +36,7 @@ func (FileNoopRelease) Release() {} // // Currently only seeking to 0 on a directory is supported. // -// FIXME: Lift directory seeking limitations. +// FIXME(b/33075855): Lift directory seeking limitations. func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) { inode := file.Dirent.Inode current := file.Offset() @@ -50,7 +50,7 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, if fs.IsCharDevice(inode.StableAttr) { // Ignore seek requests. // - // FIXME: This preserves existing + // FIXME(b/34716638): This preserves existing // behavior but is not universally correct. return 0, nil } @@ -104,7 +104,7 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, return current, syserror.EINVAL } return sz + offset, nil - // FIXME: This is not universally correct. + // FIXME(b/34778850): This is not universally correct. // Remove SpecialDirectory. case fs.SpecialDirectory: if offset != 0 { @@ -112,7 +112,7 @@ func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, } // SEEK_END to 0 moves the directory "cursor" to the end. 
// - // FIXME: The ensures that after the seek, + // FIXME(b/35442290): The ensures that after the seek, // reading on the directory will get EOF. But it is not // correct in general because the directory can grow in // size; attempting to read those new entries will be diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index b690cfe93..ba33b9912 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -479,7 +479,7 @@ func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst us // common: getting a return value of 0 from a read syscall is the only way // to detect EOF. // - // TODO: Separate out c.attr.Size and use atomics instead of + // TODO(jamieliu): Separate out c.attr.Size and use atomics instead of // c.dataMu. c.dataMu.RLock() size := c.attr.Size @@ -776,7 +776,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option var translatedEnd uint64 for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) - // TODO: Make Translations writable even if writability is + // TODO(jamieliu): Make Translations writable even if writability is // not required if already kept-dirty by another writable translation. perms := usermem.AccessType{ Read: true, diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index d7fbb71b7..51c573aef 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -136,7 +136,7 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child // Walk from parent to child again. // - // TODO: If we have a directory FD in the parent + // TODO(b/112031682): If we have a directory FD in the parent // inodeOperations, then we can use fstatat(2) to get the inode // attributes instead of making this RPC. 
qids, _, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) @@ -171,7 +171,7 @@ func (cp cachePolicy) keep(d *fs.Dirent) bool { return false } sattr := d.Inode.StableAttr - // NOTE: Only cache files, directories, and symlinks. + // NOTE(b/31979197): Only cache files, directories, and symlinks. return fs.IsFile(sattr) || fs.IsDir(sattr) || fs.IsSymlink(sattr) } diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 80d1e08a6..35caa42cd 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -297,7 +297,7 @@ func (f *fileOperations) Flush(ctx context.Context, file *fs.File) error { // We do this because some p9 server implementations of Flush are // over-zealous. // - // FIXME: weaken these implementations and remove this check. + // FIXME(edahlgren): weaken these implementations and remove this check. if !file.Flags().Write { return nil } diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index f770ca4ea..d0c64003c 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -28,7 +28,7 @@ func (f *fileOperations) afterLoad() { // Manually load the open handles. var err error - // TODO: Context is not plumbed to save/restore. + // TODO(b/38173783): Context is not plumbed to save/restore. f.handles, err = f.inodeOperations.fileState.getHandles(context.Background(), f.flags) if err != nil { return fmt.Errorf("failed to re-open handle: %v", err) diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index f32e99ce0..0b33e80c3 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -49,7 +49,7 @@ func (h *handles) DecRef() { log.Warningf("error closing host file: %v", err) } } - // FIXME: Context is not plumbed here. + // FIXME(b/38173783): Context is not plumbed here. 
if err := h.File.close(context.Background()); err != nil { log.Warningf("error closing p9 file: %v", err) } diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 29af1010c..1181a24cc 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -570,13 +570,13 @@ func init() { } // AddLink implements InodeOperations.AddLink, but is currently a noop. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (*inodeOperations) AddLink() {} // DropLink implements InodeOperations.DropLink, but is currently a noop. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (*inodeOperations) DropLink() {} // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index ad4d3df58..44d76ba9f 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -123,7 +123,7 @@ func (i *inodeFileState) afterLoad() { // beforeSave. return fmt.Errorf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings)) } - // TODO: Context is not plumbed to save/restore. + // TODO(b/38173783): Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} _, i.file, err = i.s.attach.walk(ctx, splitAbsolutePath(name)) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index ed5147c65..4ed688ce5 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -134,7 +134,7 @@ type session struct { // socket files. 
This allows unix domain sockets to be used with paths that // belong to a gofer. // - // TODO(b/77154739): there are a few possible races with someone stat'ing the // file and another deleting it concurrently, where the file will not be // reported as socket file. endpoints *endpointMaps `state:"wait"` } diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 0ad5d63b5..b1f299be5 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -104,7 +104,7 @@ func (s *session) afterLoad() { // If private unix sockets are enabled, create and fill the session's endpoint // maps. if opts.privateunixsocket { - // TODO: Context is not plumbed to save/restore. + // TODO(b/38173783): Context is not plumbed to save/restore. ctx := &dummyClockContext{context.Background()} if err = s.restoreEndpointMaps(ctx); err != nil { diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index 800649211..de349a41a 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -87,7 +87,7 @@ func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFl options := fs.GenericMountSourceOptions(data) // Grab the whitelist if one was specified. - // TODO: require another option "testonly" in order to allow + // TODO(edahlgren/mpratt/hzy): require another option "testonly" in order to allow // no whitelist. if wl, ok := options[whitelistKey]; ok { f.paths = strings.Split(wl, "|") @@ -320,7 +320,7 @@ func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) { // Keep implements fs.MountSourceOperations.Keep. 
// -// TODO: It is possible to change the permissions on a +// TODO(b/72455313,b/77596690): It is possible to change the permissions on a // host file while it is in the dirent cache (say from RO to RW), but it is not // possible to re-open the file with more relaxed permissions, since the host // FD is already open and stored in the inode. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 2030edcb4..69c648f67 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -95,7 +95,7 @@ type inodeFileState struct { // ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { - // TODO: Using safemem.FromIOReader here is wasteful for two + // TODO(jamieliu): Using safemem.FromIOReader here is wasteful for two // reasons: // // - Using preadv instead of iterated preads saves on host system calls. @@ -325,7 +325,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // canMap returns true if this fs.Inode can be memory mapped. func canMap(inode *fs.Inode) bool { - // FIXME: Some obscure character devices can be mapped. + // FIXME(b/38213152): Some obscure character devices can be mapped. return fs.IsFile(inode.StableAttr) } @@ -428,15 +428,15 @@ func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { } // AddLink implements fs.InodeOperations.AddLink. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) AddLink() {} // DropLink implements fs.InodeOperations.DropLink. -// FIXME: Remove this from InodeOperations altogether. +// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) DropLink() {} // NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. -// FIXME: Remove this from InodeOperations altogether. 
+// FIXME(b/63117438): Remove this from InodeOperations altogether. func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} // readdirAll returns all of the directory entries in i. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index d82f9740e..fe411a766 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -93,10 +93,10 @@ func (i *Inode) DecRef() { // destroy releases the Inode and releases the msrc reference taken. func (i *Inode) destroy() { - // FIXME: Context is not plumbed here. + // FIXME(b/38173783): Context is not plumbed here. ctx := context.Background() if err := i.WriteOut(ctx); err != nil { - // FIXME: Mark as warning again once noatime is + // FIXME(b/65209558): Mark as warning again once noatime is // properly supported. log.Debugf("Inode %+v, failed to sync all metadata: %v", i.StableAttr, err) } @@ -359,7 +359,7 @@ func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) { // AddLink calls i.InodeOperations.AddLink. func (i *Inode) AddLink() { if i.overlay != nil { - // FIXME: Remove this from InodeOperations altogether. + // FIXME(b/63117438): Remove this from InodeOperations altogether. // // This interface is only used by ramfs to update metadata of // children. These filesystems should _never_ have overlay diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index ceacc7659..ff8b75f31 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -118,7 +118,7 @@ type InodeOperations interface { // // The caller must ensure that this operation is permitted. // - // TODO: merge Remove and RemoveDirectory, Remove + // TODO(b/67778723): merge Remove and RemoveDirectory, Remove // just needs a type flag. 
Remove(ctx context.Context, dir *Inode, name string) error diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 254646176..bda3e1861 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -142,7 +142,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } else { // If we have something from the upper, we can only use it if the types // match. - // NOTE: Allow SpecialDirectories and Directories to merge. + // NOTE(b/112312863): Allow SpecialDirectories and Directories to merge. // This is needed to allow submounts in /proc and /sys. if upperInode.StableAttr.Type == child.Inode.StableAttr.Type || (IsDir(upperInode.StableAttr) && IsDir(child.Inode.StableAttr)) { @@ -226,7 +226,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st return nil, err } - // NOTE: Replace the Dirent with a transient Dirent, since + // NOTE(b/71766861): Replace the Dirent with a transient Dirent, since // we are about to create the real Dirent: an overlay Dirent. // // This ensures the *fs.File returned from overlayCreate is in the same @@ -338,7 +338,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // directory will appear empty in the upper fs, which will then // allow the rename to proceed when it should return ENOTEMPTY. // - // NOTE: Ideally, we'd just pass in the replaced + // NOTE(b/111808347): Ideally, we'd just pass in the replaced // Dirent from Rename, but we must drop the reference on // replaced before we make the rename call, so Rename can't // pass the Dirent to the Inode without significantly diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 1e245ae5f..4d1693204 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -42,7 +42,7 @@ type DirentOperations interface { // MountSourceOperations contains filesystem specific operations. 
type MountSourceOperations interface { - // TODO: Add: + // TODO(b/67778729): Add: // BlockSize() int64 // FS() Filesystem @@ -101,7 +101,7 @@ func (i InodeMappings) String() string { // amalgamation implies that a mount source cannot be shared by multiple mounts // (e.g. cannot be mounted at different locations). // -// TODO: Move mount-specific information out of MountSource. +// TODO(b/63601033): Move mount-specific information out of MountSource. // // +stateify savable type MountSource struct { diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 269d6b9da..d7605b2c9 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -33,7 +33,7 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool { } // TestMountSourceOnlyCachedOnce tests that a Dirent that is mounted over only ends -// up in a single Dirent Cache. NOTE: Having a dirent in multiple +// up in a single Dirent Cache. NOTE(b/63848693): Having a dirent in multiple // caches causes major consistency issues. func TestMountSourceOnlyCachedOnce(t *testing.T) { ctx := contexttest.Context(t) diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index 3cc5f197c..5d4ec6c7b 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -91,7 +91,7 @@ CPU.IO utilization in last 10 minutes | Always zero Num currently running processes | Always zero Total num processes | Always zero -TODO: Populate the columns with accurate statistics. +TODO(b/62345059): Populate the columns with accurate statistics. 
### meminfo @@ -128,12 +128,12 @@ Field name | Notes Buffers | Always zero, no block devices SwapCache | Always zero, no swap Inactive(anon) | Always zero, see SwapCache -Unevictable | Always zero TODO -Mlocked | Always zero TODO +Unevictable | Always zero TODO(b/31823263) +Mlocked | Always zero TODO(b/31823263) SwapTotal | Always zero, no swap SwapFree | Always zero, no swap -Dirty | Always zero TODO -Writeback | Always zero TODO +Dirty | Always zero TODO(b/31823263) +Writeback | Always zero TODO(b/31823263) MemAvailable | Uses the same value as MemFree since there is no swap. Slab | Missing SReclaimable | Missing @@ -185,7 +185,7 @@ softirq 0 0 0 0 0 0 0 0 0 0 0 All fields except for `btime` are always zero. -TODO: Populate with accurate fields. +TODO(b/37226836): Populate with accurate fields. ### sys diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 25da06f5d..f2329e623 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -258,7 +258,7 @@ func newFdInfoDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { // Lookup loads an fd in /proc/TID/fdinfo into a Dirent. func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { inode, err := walkDescriptors(fdid.t, p, func(file *fs.File, fdFlags kernel.FDFlags) *fs.Inode { - // TODO: Using a static inode here means that the + // TODO(b/121266871): Using a static inode here means that the // data can be out-of-date if, for instance, the flags on the // FD change before we read this file. We should switch to // generating the data on Read(). Also, we should include pos, diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 78f3a1dc0..3ee0e570a 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -40,7 +40,7 @@ func (d *loadavgData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) var buf bytes.Buffer - // TODO: Include real data in fields. 
+ // TODO(b/62345059): Include real data in fields. // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. // Column 4-5: currently running processes and the total number of processes. // Column 6: the last process ID used. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 620e93ce3..75cbf3e77 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -58,7 +58,7 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) fmt.Fprintf(&buf, "MemTotal: %8d kB\n", totalSize/1024) memFree := (totalSize - totalUsage) / 1024 // We use MemFree as MemAvailable because we don't swap. - // TODO: When reclaim is implemented the value of MemAvailable + // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change. fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree) fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree) @@ -72,8 +72,8 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) fmt.Fprintf(&buf, "Inactive(anon): 0 kB\n") fmt.Fprintf(&buf, "Active(file): %8d kB\n", activeFile/1024) fmt.Fprintf(&buf, "Inactive(file): %8d kB\n", inactiveFile/1024) - fmt.Fprintf(&buf, "Unevictable: 0 kB\n") // TODO - fmt.Fprintf(&buf, "Mlocked: 0 kB\n") // TODO + fmt.Fprintf(&buf, "Unevictable: 0 kB\n") // TODO(b/31823263) + fmt.Fprintf(&buf, "Mlocked: 0 kB\n") // TODO(b/31823263) fmt.Fprintf(&buf, "SwapTotal: 0 kB\n") fmt.Fprintf(&buf, "SwapFree: 0 kB\n") fmt.Fprintf(&buf, "Dirty: 0 kB\n") diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 1e62af8c6..fe62b167b 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -114,7 +114,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se // (4) Root: the pathname of the directory in the filesystem // which forms the root of this mount. 
// - // NOTE: This will always be "/" until we implement + // NOTE(b/78135857): This will always be "/" until we implement // bind mounts. fmt.Fprintf(&buf, "/ ") diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 55a958f9e..d24b2d370 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -154,7 +154,7 @@ func (n *netDev) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se contents[1] = " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n" for _, i := range interfaces { - // TODO: Collect stats from each inet.Stack + // TODO(b/71872867): Collect stats from each inet.Stack // implementation (hostinet, epsocket, and rpcinet). // Implements the same format as diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index f2bbef375..18bd8e9b6 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -83,7 +83,7 @@ func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([] var buf bytes.Buffer - // TODO: We currently export only zero CPU stats. We could + // TODO(b/37226836): We currently export only zero CPU stats. We could // at least provide some aggregate stats. var cpu cpuStats fmt.Fprintf(&buf, "cpu %s\n", cpu) @@ -100,7 +100,7 @@ func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([] const numInterrupts = 256 // The Kernel doesn't handle real interrupts, so report all zeroes. - // TODO: We could count page faults as #PF. + // TODO(b/37226836): We could count page faults as #PF. fmt.Fprintf(&buf, "intr 0") // total for i := 0; i < numInterrupts; i++ { fmt.Fprintf(&buf, " 0") @@ -108,22 +108,22 @@ func (s *statData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([] fmt.Fprintf(&buf, "\n") // Total number of context switches. - // TODO: Count this. + // TODO(b/37226836): Count this. 
fmt.Fprintf(&buf, "ctxt 0\n") // CLOCK_REALTIME timestamp from boot, in seconds. fmt.Fprintf(&buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) // Total number of clones. - // TODO: Count this. + // TODO(b/37226836): Count this. fmt.Fprintf(&buf, "processes 0\n") // Number of runnable tasks. - // TODO: Count this. + // TODO(b/37226836): Count this. fmt.Fprintf(&buf, "procs_running 0\n") // Number of tasks waiting on IO. - // TODO: Count this. + // TODO(b/37226836): Count this. fmt.Fprintf(&buf, "procs_blocked 0\n") // Number of each softirq handled. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 728a46a74..0ce77f04f 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -39,7 +39,7 @@ const ( // tcpMemInode is used to read/write the size of netstack tcp buffers. // -// TODO: If we have multiple proc mounts, concurrent writes can +// TODO(b/121381035): If we have multiple proc mounts, concurrent writes can // leave netstack and the proc files in an inconsistent state. Since we set the // buffer size from these proc files on restore, we may also race and end up in // an inconsistent state on restore. diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 0edcdfce2..9f65a8337 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -77,7 +77,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "fd": newFdDir(t, msrc), "fdinfo": newFdInfoDir(t, msrc), "gid_map": newGIDMap(t, msrc), - // FIXME: create the correct io file for threads. + // FIXME(b/123511468): create the correct io file for threads. "io": newIO(t, msrc), "maps": newMaps(t, msrc), "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), @@ -93,7 +93,7 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace contents["task"] = newSubtasks(t, msrc, pidns) } - // TODO: Set EUID/EGID based on dumpability. 
+ // TODO(b/31916171): Set EUID/EGID based on dumpability. d := &taskDir{ Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), t: t, @@ -245,7 +245,7 @@ func (e *exe) executable() (d *fs.Dirent, err error) { e.t.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { - // TODO: Check shouldn't allow Readlink once the + // TODO(b/34851096): Check shouldn't allow Readlink once the // Task is zombied. err = syserror.EACCES return @@ -297,7 +297,7 @@ type namespaceSymlink struct { } func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { - // TODO: Namespace symlinks should contain the namespace name and the + // TODO(rahat): Namespace symlinks should contain the namespace name and the // inode number for the namespace instance, so for example user:[123456]. We // currently fake the inode number by sticking the symlink inode in its // place. diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index b6d49d5e9..58e0c793c 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -65,7 +65,7 @@ func (v *versionData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) // Since we don't really want to expose build information to // applications, those fields are omitted. // - // FIXME: Using Version from the init task SyscallTable + // FIXME(mpratt): Using Version from the init task SyscallTable // disregards the different version a task may have (e.g., in a uts // namespace). 
ver := init.Leader().SyscallTable().Version diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 159fd2981..c0400b67d 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -358,7 +358,7 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewDir(ctx, dir, perms) }) - // TODO: Support updating status times, as those should be + // TODO(nlacasse): Support updating status times, as those should be // updated by links. return err } diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index d0c93028f..8e44421b6 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -34,7 +34,7 @@ const ( // GID for the root directory. rootGIDKey = "gid" - // TODO: support a tmpfs size limit. + // TODO(edahlgren/mpratt): support a tmpfs size limit. // size = "size" // Permissions that exceed modeMask will be rejected. diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 7c80d711b..4450e1363 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -309,7 +309,7 @@ func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst userm // common: getting a return value of 0 from a read syscall is the only way // to detect EOF. // - // TODO: Separate out f.attr.Size and use atomics instead of + // TODO(jamieliu): Separate out f.attr.Size and use atomics instead of // f.dataMu. f.dataMu.RLock() size := f.attr.Size diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 555692505..5bb4922cb 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -32,7 +32,7 @@ import ( var fsInfo = fs.Info{ Type: linux.TMPFS_MAGIC, - // TODO: allow configuring a tmpfs size and enforce it. + // TODO(b/29637826): allow configuring a tmpfs size and enforce it. 
TotalBlocks: 0, FreeBlocks: 0, } diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 33b4c6438..f8713471a 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -66,7 +66,7 @@ type dirInodeOperations struct { // msrc is the super block this directory is on. // - // TODO: Plumb this through instead of storing it here. + // TODO(chrisko): Plumb this through instead of storing it here. msrc *fs.MountSource // mu protects the fields below. @@ -89,7 +89,7 @@ type dirInodeOperations struct { // next is the next pty index to use. // - // TODO: reuse indices when ptys are closed. + // TODO(b/29356795): reuse indices when ptys are closed. next uint32 } @@ -118,7 +118,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode { // N.B. Linux always uses inode id 1 for the directory. See // fs/devpts/inode.c:devpts_fill_super. // - // TODO: Since ptsDevice must be shared between + // TODO(b/75267214): Since ptsDevice must be shared between // different mounts, we must not assign fixed numbers. InodeID: ptsDevice.NextIno(), BlockSize: usermem.PageSize, diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 43e0e2a04..a53448c47 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -43,7 +43,7 @@ func (*filesystem) Name() string { // AllowUserMount allows users to mount(2) this file system. func (*filesystem) AllowUserMount() bool { - // TODO: Users may mount this once the terminals are in a + // TODO(b/29356795): Users may mount this once the terminals are in a // usable state. return false } diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 7c256abb0..e2686a074 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -51,7 +51,7 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn // N.B. Linux always uses inode id 2 for ptmx. See // fs/devpts/inode.c:mknod_ptmx. 
// - // TODO: Since ptsDevice must be shared between + // TODO(b/75267214): Since ptsDevice must be shared between // different mounts, we must not assign fixed numbers. InodeID: ptsDevice.NextIno(), Type: fs.CharacterDevice, @@ -157,7 +157,7 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args a // of the slave end. return mf.t.ld.setTermios(ctx, io, args) case linux.TCSETSW: - // TODO: This should drain the output queue first. + // TODO(b/29356795): This should drain the output queue first. return mf.t.ld.setTermios(ctx, io, args) case linux.TIOCGPTN: _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{ @@ -165,7 +165,7 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args a }) return 0, err case linux.TIOCSPTLCK: - // TODO: Implement pty locking. For now just pretend we do. + // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: return 0, mf.t.ld.windowSize(ctx, io, args) diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index e8368bcdd..ed080ca0f 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -56,7 +56,7 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne // N.B. Linux always uses inode id = tty index + 3. See // fs/devpts/inode.c:devpts_pty_new. // - // TODO: Since ptsDevice must be shared between + // TODO(b/75267214): Since ptsDevice must be shared between // different mounts, we must not assign fixed numbers. InodeID: ptsDevice.NextIno(), Type: fs.CharacterDevice, @@ -137,7 +137,7 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args ar case linux.TCSETS: return sf.si.t.ld.setTermios(ctx, io, args) case linux.TCSETSW: - // TODO: This should drain the output queue first. + // TODO(b/29356795): This should drain the output queue first. 
return sf.si.t.ld.setTermios(ctx, io, args) case linux.TIOCGPTN: _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{ @@ -151,7 +151,7 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args ar case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - // TODO: Implement once we have support for job + // TODO(b/129283598): Implement once we have support for job // control. return 0, nil default: diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index a843b9aab..2055da196 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -125,7 +125,7 @@ func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *T creds.EffectiveCaps = capabilities.EffectiveCaps creds.BoundingCaps = capabilities.BoundingCaps creds.InheritableCaps = capabilities.InheritableCaps - // TODO: Support ambient capabilities. + // TODO(nlacasse): Support ambient capabilities. } else { // If no capabilities are specified, grant capabilities consistent with // setresuid + setresgid from NewRootCredentials to the given uid and diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 30957bb9a..159940a69 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -49,7 +49,7 @@ type UserNamespace struct { gidMapFromParent idMapSet gidMapToParent idMapSet - // TODO: Support disabling setgroups(2). + // TODO(b/27454212): Support disabling setgroups(2). 
} // NewRootUserNamespace returns a UserNamespace that is appropriate for a diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index 373e11772..deff6def9 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -30,7 +30,7 @@ const ( // rtSignalCap is the maximum number of instances of a given realtime // signal that may be pending. // - // TODO: In Linux, the minimum signal queue size is + // TODO(igudger): In Linux, the minimum signal queue size is // RLIMIT_SIGPENDING, which is by default max_threads/2. rtSignalCap = 32 ) diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 8d78b2fb3..15f2e2964 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -162,7 +162,7 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { return false } - // TODO: dumpability check + // TODO(b/31916171): dumpability check if callerCreds.UserNamespace != targetCreds.UserNamespace { return false } @@ -396,7 +396,7 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if target.stop == (*groupStop)(nil) { target.trapStopPending = true target.endInternalStopLocked() - // TODO: Linux blocks ptrace_attach() until the task has + // TODO(jamieliu): Linux blocks ptrace_attach() until the task has // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. 
} target.tg.signalHandlers.mu.Unlock() diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 0a954bc16..6d3314e81 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -66,7 +66,7 @@ func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error { if rscr.CriticalSection.Contains(rscr.Restart) { return syserror.EINVAL } - // TODO: check that rscr.CriticalSection and rscr.Restart are in + // TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in // the application address range, for consistency with Linux t.tg.rscr.Store(&rscr) return nil diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go index 69aee9127..41ac1067d 100644 --- a/pkg/sentry/kernel/sched/cpuset.go +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -29,7 +29,7 @@ type CPUSet []byte // CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. func CPUSetSize(num uint) uint { - // NOTE: Applications may expect that the size of a CPUSet in + // NOTE(b/68859821): Applications may expect that the size of a CPUSet in // bytes is always a multiple of sizeof(unsigned long), since this is true // in Linux. Thus we always round up. 
bytes := (num + bitsPerByte - 1) / bitsPerByte diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 29a2eb804..2b7c1a9bc 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred return syserror.ERANGE } - // TODO: Clear undo entries in all processes + // TODO(b/29354920): Clear undo entries in all processes sem.value = val sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) @@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti for i, val := range vals { sem := &s.sems[i] - // TODO: Clear undo entries in all processes + // TODO(b/29354920): Clear undo entries in all processes sem.value = int16(val) sem.pid = pid sem.wakeWaiters() @@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch } // All operations succeeded, apply them. - // TODO: handle undo operations. + // TODO(b/29354920): handle undo operations. for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 349f2a26e..d4812a065 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -427,7 +427,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() - // TODO: RemoveMapping may be called during task exit, when ctx + // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx // is context.Background. Gracefully handle missing clocks. Failing to // update the detach time in these cases is ok, since no one can observe the // omission. 
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 7eb99718d..293b21249 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -165,7 +165,7 @@ type Stracer interface { // // The returned private data is passed to SyscallExit. // - // TODO: remove kernel imports from the strace + // TODO(gvisor.dev/issue/155): remove kernel imports from the strace // package so that the type can be used directly. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 1b4d4cf2f..ac38dd157 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -60,7 +60,7 @@ func (tc *TaskContext) release() { // Nil out pointers so that if the task is saved after release, it doesn't // follow the pointers to possibly now-invalid objects. if tc.MemoryManager != nil { - // TODO + // TODO(b/38173783) tc.MemoryManager.DecUsers(context.Background()) tc.MemoryManager = nil } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 9fca90a1c..b49f902a5 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -208,7 +208,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tc = *r.tc t.mu.Unlock() t.unstopVforkParent() - // NOTE: All locks must be dropped prior to calling Activate. + // NOTE(b/30316266): All locks must be dropped prior to calling Activate. 
t.MemoryManager().Activate() t.ptraceExec(oldTID) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 1a0734ab6..a07956208 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -339,7 +339,7 @@ func (t *Task) exitChildren() { }, true /* group */) other.signalHandlers.mu.Unlock() } - // TODO: The init process waits for all processes in the + // TODO(b/37722272): The init process waits for all processes in the // namespace to exit before completing its own exit // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all // other tasks in the namespace are dead, except possibly for this @@ -692,7 +692,7 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.Si info.Code = arch.CLD_EXITED info.SetStatus(int32(t.exitStatus.Code)) } - // TODO: Set utime, stime. + // TODO(b/72102453): Set utime, stime. return info } diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index e105eba13..6c9608f8d 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -421,7 +421,7 @@ func (t *Task) SetKeepCaps(k bool) { // updateCredsForExec updates t.creds to reflect an execve(). // -// NOTE: We currently do not implement privileged executables +// NOTE(b/30815691): We currently do not implement privileged executables // (set-user/group-ID bits and file capabilities). This allows us to make a lot // of simplifying assumptions: // diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 6b5fe7165..7115aa967 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -110,7 +110,7 @@ func (t *Task) doStop() { return } t.Deactivate() - // NOTE: t.Activate() must be called without any locks held, so + // NOTE(b/30316266): t.Activate() must be called without any locks held, so // this defer must precede the defer for unlocking the signal mutex. 
defer t.Activate() t.accountTaskGoroutineEnter(TaskGoroutineStopped) diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 3a8e61900..7f2e0df72 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -509,7 +509,7 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { if t.stop != nil { return false } - // - TODO: No special case for when t is also the sending task, + // - TODO(b/38173783): No special case for when t is also the sending task, // because the identity of the sender is unknown. // - Do not choose tasks that have already been interrupted, as they may be // busy handling another signal. @@ -895,7 +895,7 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { sigchld.SetPid(int32(t.tg.pidns.tids[target])) sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) sigchld.SetStatus(status) - // TODO: Set utime, stime. + // TODO(b/72102453): Set utime, stime. t.sendSignalLocked(sigchld, true /* group */) } } diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index 36846484c..1302cadc1 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -69,7 +69,7 @@ import ( // A TaskStop is a condition visible to the task control flow graph that // prevents a task goroutine from running or exiting, i.e. an internal stop. // -// NOTE: Most TaskStops don't contain any data; they're +// NOTE(b/30793614): Most TaskStops don't contain any data; they're // distinguished by their type. 
The obvious way to implement such a TaskStop // is: // diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 80ad59dde..79051befa 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -70,7 +70,7 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m defer d.DecRef() perms := fs.PermMask{ - // TODO: Linux requires only execute + // TODO(gvisor.dev/issue/160): Linux requires only execute // permission, not read. However, our backing filesystems may // prevent us from reading the file without read permission. // diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 18b7e90d8..8c196df84 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -194,7 +194,7 @@ func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) // VDSO describes a VDSO. // -// NOTE: to support multiple architectures or operating systems, this +// NOTE(mpratt): to support multiple architectures or operating systems, this // would need to contain a VDSO for each. // // +stateify savable @@ -262,7 +262,7 @@ func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { return &VDSO{ ParamPage: mm.NewSpecialMappable("[vvar]", mfp, paramPage), - // TODO: Don't advertise the VDSO, as + // TODO(gvisor.dev/issue/157): Don't advertise the VDSO, as // some applications may not be able to handle multiple [vdso] // hints. vdso: mm.NewSpecialMappable("", mfp, vdso), @@ -279,7 +279,7 @@ func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { // kernel simply directly maps the entire file into process memory, with very // little real ELF parsing. // -// NOTE: This means that userspace can, and unfortunately does, +// NOTE(b/25323870): This means that userspace can, and unfortunately does, // depend on parts of the ELF that would normally not be mapped. To maintain // compatibility with such binaries, we load the VDSO much like Linux. 
// diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 1ef1f0dd8..3f6f7ebd0 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -356,6 +356,6 @@ type MMapOpts struct { // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // - // TODO: Replace entirely with MappingIdentity? + // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string } diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index f7ff06de0..7075792e0 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -331,7 +331,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint Length: aioRingBufferSize, MappingIdentity: m, Mappable: m, - // TODO: Linux does "do_mmap_pgoff(..., PROT_READ | + // TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ | // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this // mapping read-only? Perms: usermem.Read, diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 0c4b8895d..7cdbf6e25 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -69,7 +69,7 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile start = *handle.(*usermem.Addr) } for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME: If we use a usermem.Addr for the handle, we get + // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get // "panic: autosave error: type usermem.Addr is not registered". vmaAddr := vseg.End() data = append(data, seqfile.SeqData{ @@ -88,7 +88,7 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile // // Artifically adjust the seqfile handle so we only output vsyscall entry once. if start != vsyscallEnd { - // FIXME: Can't get a pointer to constant vsyscallEnd. 
+ // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. vmaAddr := vsyscallEnd data = append(data, seqfile.SeqData{ Buf: []byte(vsyscallMapsEntry), @@ -134,7 +134,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI if vma.hint != "" { s = vma.hint } else if vma.id != nil { - // FIXME: We are holding mm.mappingMu here, which is + // FIXME(jamieliu): We are holding mm.mappingMu here, which is // consistent with Linux's holding mmap_sem in // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). // However, it's not clear that fs.File.MappedName() is actually @@ -162,7 +162,7 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil start = *handle.(*usermem.Addr) } for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { - // FIXME: If we use a usermem.Addr for the handle, we get + // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get // "panic: autosave error: type usermem.Addr is not registered". vmaAddr := vseg.End() data = append(data, seqfile.SeqData{ @@ -174,7 +174,7 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil // We always emulate vsyscall, so advertise it here. See // ReadMapsSeqFileData for additional commentary. if start != vsyscallEnd { - // FIXME: Can't get a pointer to constant vsyscallEnd. + // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd. vmaAddr := vsyscallEnd data = append(data, seqfile.SeqData{ Buf: []byte(vsyscallSmapsEntry), diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index cfbf7a104..3b5161998 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -136,7 +136,7 @@ func (m *SpecialMappable) Length() uint64 { // NewSharedAnonMappable returns a SpecialMappable that implements the // semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. 
// -// TODO: The use of SpecialMappable is a lazy code reuse hack. Linux +// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux // uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should // do the same to get non-zero device and inode IDs. func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index cc7eb76d2..7b675b9b5 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -137,7 +137,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return 0, err } - // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new + // TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => @@ -148,7 +148,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme mm.populateVMAAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: - // NOTE: Get pmas and map eagerly in the hope + // NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope // that doing so will save on future page faults. We only do this for // anonymous mappings, since otherwise the cost of // memmap.Mappable.Translate is unknown; and only for small mappings, @@ -698,7 +698,7 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad return mm.brk.End, syserror.EINVAL } - // TODO: This enforces RLIMIT_DATA, but is + // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is // slightly more permissive than the usual data limit. In particular, // this only limits the size of the heap; a true RLIMIT_DATA limits the // size of heap + data + bss. 
The segment sizes need to be plumbed from diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index e9c9a80ea..931995254 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -274,7 +274,7 @@ func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End(). vma := vseg.ValuePtr() if addr < vseg.Start() { - // TODO: Implement vma.growsDown here. + // TODO(jamieliu): Implement vma.growsDown here. return vbegin, vgap, syserror.EFAULT } diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index c0a0af92d..d0f6bb225 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -62,7 +62,7 @@ func updateSystemValues(fd int) error { // Calculate whether guestPCID is supported. // - // FIXME: These should go through the much more pleasant + // FIXME(ascannell): These should go through the much more pleasant // cpuid package interfaces, once a way to accept raw kvm CPUID entries // is plumbed (or some rough equivalent). for i := 0; i < int(cpuidSupported.nr); i++ { diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index d1c9458ea..0e48417b9 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -181,7 +181,7 @@ var ( // this signal both to Contexts and to the sentry itself, under the assumption // that they originate from races with Context.Interrupt(). // -// NOTE: The Go runtime only guarantees that a small subset +// NOTE(b/23420492): The Go runtime only guarantees that a small subset // of signals will be always be unblocked on all threads, one of which // is SIGCHLD. 
const SignalInterrupt = linux.SIGCHLD diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 82f125073..2a5d699ec 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -79,7 +79,7 @@ func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread) // Before creating a new thread, see if we can find a thread // whose system tid has disappeared. // - // TODO: Other parts of this package depend on + // TODO(b/77216482): Other parts of this package depend on // threads never exiting. for origTID, t := range tp.threads { // Signal zero is an easy existence check. diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index 7c88010d8..4c6daec22 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -116,7 +116,7 @@ const ( // // Note that sign-extension semantics apply to the highest order bit. // -// FIXME: This should use the cpuid passed to Init. +// FIXME(b/69382326): This should use the cpuid passed to Init. func VirtualAddressBits() uint32 { ax, _, _, _ := cpuid.HostID(0x80000008, 0) return (ax >> 8) & 0xff @@ -124,7 +124,7 @@ func VirtualAddressBits() uint32 { // PhysicalAddressBits returns the number of bits available for physical addresses. // -// FIXME: This should use the cpuid passed to Init. +// FIXME(b/69382326): This should use the cpuid passed to Init. func PhysicalAddressBits() uint32 { ax, _, _, _ := cpuid.HostID(0x80000008, 0) return ax & 0xff diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 6b5d5f993..571245ce5 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -86,7 +86,7 @@ func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, // // Otherwise ignore the signal. 
// - // TODO: Drop in Go 1.12, which uses tgkill + // TODO(b/114489875): Drop in Go 1.12, which uses tgkill // in runtime.raise. switch signal { case linux.SIGHUP, linux.SIGINT, linux.SIGTERM: diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index 5913d47a8..db6e71487 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -23,7 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" ) -// TODO: Move to pkg/abi/linux along with definitions in +// TODO(b/34161764): Move to pkg/abi/linux along with definitions in // pkg/sentry/arch. type sigaction struct { handler uintptr diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 23138d874..768fa0dfa 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -608,7 +608,7 @@ func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) { - // TODO: Unlike other socket options, SO_TIMESTAMP is + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for epsocket.SocketOperations rather than // commonEndpoint. commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need @@ -658,7 +658,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) { - // TODO: Stop rejecting short optLen values in getsockopt. 
+ // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_TYPE: if outLen < sizeOfInt32 { @@ -789,7 +789,7 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return linux.Linger{}, nil case linux.SO_SNDTIMEO: - // TODO: Linux allows shorter lengths for partial results. + // TODO(igudger): Linux allows shorter lengths for partial results. if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } @@ -797,7 +797,7 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return linux.NsecToTimeval(s.SendTimeout()), nil case linux.SO_RCVTIMEO: - // TODO: Linux allows shorter lengths for partial results. + // TODO(igudger): Linux allows shorter lengths for partial results. if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } @@ -894,7 +894,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return nil, syserr.TranslateNetstackError(err) } - // TODO: Translate fields once they are added to + // TODO(b/64800844): Translate fields once they are added to // tcpip.TCPInfoOption. info := linux.TCPInfo{} @@ -995,7 +995,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { - // TODO: Unlike other socket options, SO_TIMESTAMP is + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for epsocket.SocketOperations rather than // commonEndpoint. 
commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need @@ -1338,7 +1338,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), - // TODO: Change AddMembership to use the standard + // TODO(igudger): Change AddMembership to use the standard // any address representation. InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), @@ -1352,7 +1352,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), - // TODO: Change DropMembership to use the standard + // TODO(igudger): Change DropMembership to use the standard // any address representation. InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), @@ -1380,7 +1380,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s )) case linux.MCAST_JOIN_GROUP: - // FIXME: Implement MCAST_JOIN_GROUP. + // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. t.Kernel().EmitUnimplementedEvent(t) return syserr.ErrInvalidArgument @@ -1695,7 +1695,7 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq // nonBlockingRead issues a non-blocking read. // -// TODO: Support timestamps for stream sockets. +// TODO(b/78348848): Support timestamps for stream sockets. 
func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() @@ -1762,7 +1762,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe dst = dst.DropFirst(n) num, err := dst.CopyOutFrom(ctx, safemem.FromVecReaderFunc{func(dsts [][]byte) (int64, error) { n, _, err := s.Endpoint.Peek(dsts) - // TODO: Handle peek timestamp. + // TODO(b/78348848): Handle peek timestamp. if err != nil { return int64(n), syserr.TranslateNetstackError(err).ToError() } @@ -1963,7 +1963,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] func (s *SocketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // SIOCGSTAMP is implemented by epsocket rather than all commonEndpoint // sockets. - // TODO: Add a commonEndpoint method to support SIOCGSTAMP. + // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. if int(args[1].Int()) == syscall.SIOCGSTAMP { s.readMu.Lock() defer s.readMu.Unlock() @@ -2153,19 +2153,19 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe case syscall.SIOCGIFMAP: // Gets the hardware parameters of the device. - // TODO: Implement. + // TODO(b/71872867): Implement. case syscall.SIOCGIFTXQLEN: // Gets the transmit queue length of the device. - // TODO: Implement. + // TODO(b/71872867): Implement. case syscall.SIOCGIFDSTADDR: // Gets the destination address of a point-to-point device. - // TODO: Implement. + // TODO(b/71872867): Implement. case syscall.SIOCGIFBRDADDR: // Gets the broadcast address of a device. - // TODO: Implement. + // TODO(b/71872867): Implement. case syscall.SIOCGIFNETMASK: // Gets the network mask of a device. 
diff --git a/pkg/sentry/socket/epsocket/save_restore.go b/pkg/sentry/socket/epsocket/save_restore.go index 34d9a7cf0..f19afb6c0 100644 --- a/pkg/sentry/socket/epsocket/save_restore.go +++ b/pkg/sentry/socket/epsocket/save_restore.go @@ -20,7 +20,7 @@ import ( // afterLoad is invoked by stateify. func (s *Stack) afterLoad() { - s.Stack = stack.StackFromEnv // FIXME + s.Stack = stack.StackFromEnv // FIXME(b/36201077) if s.Stack == nil { panic("can't restore without netstack/tcpip/stack.Stack") } diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index c0081c819..37c48f4bc 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -77,7 +77,7 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { Family: family, PrefixLen: uint8(len(a.Address) * 8), Addr: []byte(a.Address), - // TODO: Other fields. + // TODO(b/68878065): Other fields. }) } nicAddrs[int32(id)] = addrs diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c4848b313..49349074f 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -348,7 +348,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // - // FIXME: We can't support MSG_ERRQUEUE because it uses ancillary + // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary // messages that netstack/tcpip/transport/unix doesn't understand. Kill the // Socket interface's dependence on netstack. 
if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 7e70b09b2..e414b829b 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -110,7 +110,7 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader m.PutAttr(linux.IFLA_ADDRESS, mac) m.PutAttr(linux.IFLA_BROADCAST, brd) - // TODO: There are many more attributes. + // TODO(b/68878065): There are many more attributes. } return nil @@ -122,7 +122,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. // - // TODO: Filter output by passed protocol family. + // TODO(b/68878065): Filter output by passed protocol family. // The RTM_GETADDR dump response is a set of RTM_NEWADDR messages each // containing an InterfaceAddrMessage followed by a set of netlink @@ -151,7 +151,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr)) - // TODO: There are many more attributes. + // TODO(b/68878065): There are many more attributes. } } @@ -175,7 +175,7 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH } } - // TODO: Only the dump variant of the types below are + // TODO(b/68878065): Only the dump variant of the types below are // supported. if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP { return syserr.ErrNotSupported diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 0fe9b39b6..a34f9d3ca 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -168,7 +168,7 @@ func (s *Socket) EventUnregister(e *waiter.Entry) { // Ioctl implements fs.FileOperations.Ioctl. 
func (s *Socket) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - // TODO: no ioctls supported. + // TODO(b/68878065): no ioctls supported. return 0, syserror.ENOTTY } @@ -319,7 +319,7 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (in t.Kernel().EmitUnimplementedEvent(t) } } - // TODO: other sockopts are not supported. + // TODO(b/68878065): other sockopts are not supported. return nil, syserr.ErrProtocolNotAvailable } @@ -369,7 +369,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy } } - // TODO: other sockopts are not supported. + // TODO(b/68878065): other sockopts are not supported. return syserr.ErrProtocolNotAvailable } @@ -389,7 +389,7 @@ func (s *Socket) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { sa := linux.SockAddrNetlink{ Family: linux.AF_NETLINK, - // TODO: Support non-kernel peers. For now the peer + // TODO(b/68878065): Support non-kernel peers. For now the peer // must be the kernel. PortID: 0, } @@ -540,7 +540,7 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error continue } - // TODO: ACKs not supported yet. + // TODO(b/68877377): ACKs not supported yet. if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { return syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index 9c749b888..64106c4b5 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -50,7 +50,7 @@ type RPCConnection struct { // NewRPCConnection initializes a RPC connection to a socket gofer. 
func NewRPCConnection(s *unet.Socket) *RPCConnection { conn := &RPCConnection{socket: s, requests: map[uint64]request{}} - go func() { // S/R-FIXME + go func() { // S/R-FIXME(b/77962828) var nums [16]byte for { for n := 0; n < len(nums); { diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index d9bda78b0..f06d12231 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -64,7 +64,7 @@ func NewRPCNotifier(cn *conn.RPCConnection) (*Notifier, error) { fdMap: make(map[uint32]*fdInfo), } - go w.waitAndNotify() // S/R-FIXME + go w.waitAndNotify() // S/R-FIXME(b/77962828) return w, nil } @@ -166,7 +166,7 @@ func (n *Notifier) waitAndNotify() error { res := n.rpcConn.Request(id).Result.(*pb.SyscallResponse_EpollWait).EpollWait.Result if e, ok := res.(*pb.EpollWaitResponse_ErrorNumber); ok { err := syscall.Errno(e.ErrorNumber) - // NOTE: I don't think epoll_wait can return EAGAIN but I'm being + // NOTE(magi): I don't think epoll_wait can return EAGAIN but I'm being // conseratively careful here since exiting the notification thread // would be really bad. if err == syscall.EINTR || err == syscall.EAGAIN { diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 3418a6d75..cf8f69efb 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -288,7 +288,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if blocking && se == syserr.ErrTryAgain { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) - // FIXME: This waiter.EventHUp is a partial + // FIXME(b/119878986): This waiter.EventHUp is a partial // measure, need to figure out how to translate linux events to // internal events. 
s.EventRegister(&e, waiter.EventIn|waiter.EventHUp) @@ -370,7 +370,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { // We save the shutdown state because of strange differences on linux // related to recvs on blocking vs. non-blocking sockets after a SHUT_RD. // We need to emulate that behavior on the blocking side. - // TODO: There is a possible race that can exist with loopback, + // TODO(b/120096741): There is a possible race that can exist with loopback, // where data could possibly be lost. s.setShutdownFlags(how) @@ -771,7 +771,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return 0, syserr.FromError(err) } - // TODO: this needs to change to map directly to a SendMsg syscall + // TODO(bgeffon): this needs to change to map directly to a SendMsg syscall // in the RPC. totalWritten := 0 n, err := rpcSendMsg(t, &pb.SyscallRequest_Sendmsg{&pb.SendmsgRequest{ diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto index c056e4c9d..9586f5923 100644 --- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto +++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto @@ -3,7 +3,7 @@ syntax = "proto3"; // package syscall_rpc is a set of networking related system calls that can be // forwarded to a socket gofer. // -// TODO: Document individual RPCs. +// TODO(b/77963526): Document individual RPCs. package syscall_rpc; message SendmsgRequest { diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index a6d870b44..434a200d9 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -722,7 +722,7 @@ func (s SyscallMap) Name(sysno uintptr) string { // N.B. This is not in an init function because we can't be sure all syscall // tables are registered with the kernel when init runs. 
// -// TODO: remove kernel package dependencies from this +// TODO(gvisor.dev/issue/155): remove kernel package dependencies from this // package and have the kernel package self-initialize all syscall tables. func Initialize() { for _, table := range kernel.SyscallTables() { diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 8759e5e32..304a12dde 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -89,7 +89,7 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // side is gone. The partial write is returned. EPIPE will be // returned on the next call. // - // TODO: In some cases SIGPIPE should + // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. return nil case syserror.ErrWouldBlock: diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index be793ca11..b9b4ccbd1 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -143,10 +143,10 @@ var AMD64 = &kernel.SyscallTable{ 65: Semop, 66: Semctl, 67: Shmdt, - // 68: @Syscall(Msgget), TODO - // 69: @Syscall(Msgsnd), TODO - // 70: @Syscall(Msgrcv), TODO - // 71: @Syscall(Msgctl), TODO + // 68: @Syscall(Msgget), TODO(b/29354921) + // 69: @Syscall(Msgsnd), TODO(b/29354921) + // 70: @Syscall(Msgrcv), TODO(b/29354921) + // 71: @Syscall(Msgctl), TODO(b/29354921) 72: Fcntl, 73: Flock, 74: Fsync, @@ -197,8 +197,8 @@ var AMD64 = &kernel.SyscallTable{ 119: Setresgid, 120: Getresgid, 121: Getpgid, - // 122: @Syscall(Setfsuid), TODO - // 123: @Syscall(Setfsgid), TODO + // 122: @Syscall(Setfsuid), TODO(b/112851702) + // 123: @Syscall(Setfsgid), TODO(b/112851702) 124: Getsid, 125: Capget, 126: Capset, @@ -217,7 +217,7 @@ var AMD64 = &kernel.SyscallTable{ 136: syscalls.ErrorWithEvent(syscall.ENOSYS), 137: Statfs, 138: Fstatfs, - // 139: @Syscall(Sysfs), TODO + // 139: @Syscall(Sysfs), 
TODO(gvisor.dev/issue/165) 140: Getpriority, 141: Setpriority, // @Syscall(SchedSetparam, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) @@ -291,7 +291,7 @@ var AMD64 = &kernel.SyscallTable{ // @Syscall(Security, note:Not implemented in Linux) 185: syscalls.Error(syscall.ENOSYS), 186: Gettid, - 187: nil, // @Syscall(Readahead), TODO + 187: nil, // @Syscall(Readahead), TODO(b/29351341) // @Syscall(Setxattr, returns:ENOTSUP, note:Requires filesystem support) 188: syscalls.ErrorWithEvent(syscall.ENOTSUP), // @Syscall(Lsetxattr, returns:ENOTSUP, note:Requires filesystem support) @@ -342,7 +342,7 @@ var AMD64 = &kernel.SyscallTable{ 217: Getdents64, 218: SetTidAddress, 219: RestartSyscall, - // 220: @Syscall(Semtimedop), TODO + // 220: @Syscall(Semtimedop), TODO(b/29354920) 221: Fadvise64, 222: TimerCreate, 223: TimerSettime, @@ -360,16 +360,16 @@ var AMD64 = &kernel.SyscallTable{ 235: Utimes, // @Syscall(Vserver, note:Not implemented by Linux) 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux - // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO + // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO(b/117792295) 237: syscalls.CapError(linux.CAP_SYS_NICE), // may require cap_sys_nice 238: SetMempolicy, 239: GetMempolicy, - // 240: @Syscall(MqOpen), TODO - // 241: @Syscall(MqUnlink), TODO - // 242: @Syscall(MqTimedsend), TODO - // 243: @Syscall(MqTimedreceive), TODO - // 244: @Syscall(MqNotify), TODO - // 245: @Syscall(MqGetsetattr), TODO + // 240: @Syscall(MqOpen), TODO(b/29354921) + // 241: @Syscall(MqUnlink), TODO(b/29354921) + // 242: @Syscall(MqTimedsend), TODO(b/29354921) + // 243: @Syscall(MqTimedreceive), TODO(b/29354921) + // 244: @Syscall(MqNotify), TODO(b/29354921) + // 245: @Syscall(MqGetsetattr), TODO(b/29354921) 246: 
syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot 247: Waitid, // @Syscall(AddKey, returns:EACCES, note:Not available to user) @@ -407,22 +407,22 @@ var AMD64 = &kernel.SyscallTable{ 273: syscalls.Error(syscall.ENOSYS), // @Syscall(GetRobustList, note:Obsolete) 274: syscalls.Error(syscall.ENOSYS), - // 275: @Syscall(Splice), TODO - // 276: @Syscall(Tee), TODO + // 275: @Syscall(Splice), TODO(b/29354098) + // 276: @Syscall(Tee), TODO(b/29354098) 277: SyncFileRange, - // 278: @Syscall(Vmsplice), TODO + // 278: @Syscall(Vmsplice), TODO(b/29354098) // @Syscall(MovePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) 279: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice (mostly) 280: Utimensat, 281: EpollPwait, - // 282: @Syscall(Signalfd), TODO + // 282: @Syscall(Signalfd), TODO(b/19846426) 283: TimerfdCreate, 284: Eventfd, 285: Fallocate, 286: TimerfdSettime, 287: TimerfdGettime, 288: Accept4, - // 289: @Syscall(Signalfd4), TODO + // 289: @Syscall(Signalfd4), TODO(b/19846426) 290: Eventfd2, 291: EpollCreate1, 292: Dup3, @@ -447,17 +447,17 @@ var AMD64 = &kernel.SyscallTable{ 305: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time 306: Syncfs, 307: SendMMsg, - // 308: @Syscall(Setns), TODO + // 308: @Syscall(Setns), TODO(b/29354995) 309: Getcpu, - // 310: @Syscall(ProcessVmReadv), TODO may require cap_sys_ptrace - // 311: @Syscall(ProcessVmWritev), TODO may require cap_sys_ptrace + // 310: @Syscall(ProcessVmReadv), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace + // 311: @Syscall(ProcessVmWritev), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace // @Syscall(Kcmp, returns:EPERM or ENOSYS, note:Requires cap_sys_ptrace) 312: syscalls.CapError(linux.CAP_SYS_PTRACE), // @Syscall(FinitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) 313: syscalls.CapError(linux.CAP_SYS_MODULE), - // 314: 
@Syscall(SchedSetattr), TODO, we have no scheduler - // 315: @Syscall(SchedGetattr), TODO, we have no scheduler - // 316: @Syscall(Renameat2), TODO + // 314: @Syscall(SchedSetattr), TODO(b/118902272), we have no scheduler + // 315: @Syscall(SchedGetattr), TODO(b/118902272), we have no scheduler + // 316: @Syscall(Renameat2), TODO(b/118902772) 317: Seccomp, 318: GetRandom, 319: MemfdCreate, @@ -465,9 +465,9 @@ var AMD64 = &kernel.SyscallTable{ 320: syscalls.CapError(linux.CAP_SYS_BOOT), // @Syscall(Bpf, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin for all commands - // 322: @Syscall(Execveat), TODO - // 323: @Syscall(Userfaultfd), TODO - // 324: @Syscall(Membarrier), TODO + // 322: @Syscall(Execveat), TODO(b/118901836) + // 323: @Syscall(Userfaultfd), TODO(b/118906345) + // 324: @Syscall(Membarrier), TODO(b/118904897) 325: Mlock2, // Syscalls after 325 are "backports" from versions of Linux after 4.4. // 326: @Syscall(CopyFileRange), diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 355071131..61c2647bf 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -120,7 +120,7 @@ func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Does not exist. return 0, nil, syserror.EINVAL } - // FIXME: Linux blocks until all AIO to the destroyed context is + // FIXME(fvoznika): Linux blocks until all AIO to the destroyed context is // done. 
return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 50151f7b6..967464c85 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -259,7 +259,7 @@ func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileM case linux.ModeCharacterDevice: fallthrough case linux.ModeBlockDevice: - // TODO: We don't support creating block or character + // TODO(b/72101894): We don't support creating block or character // devices at the moment. // // When we start supporting block and character devices, we'll @@ -1532,7 +1532,7 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { owner.GID = kgid } - // FIXME: This is racy; the inode's owner may have changed in + // FIXME(b/62949101): This is racy; the inode's owner may have changed in // the meantime. (Linux holds i_mutex while calling // fs/attr.c:notify_change() => inode_operations::setattr => // inode_change_ok().) diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 8732861e0..805b251b1 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -185,7 +185,7 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: fallthrough case linux.MADV_DONTDUMP, linux.MADV_DODUMP: - // TODO: Core dumping isn't implemented, so these are + // TODO(b/72045799): Core dumping isn't implemented, so these are // no-ops. fallthrough case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: @@ -223,7 +223,7 @@ func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. 
nodeFlag := flags&linux.MPOL_F_NODE != 0 addrFlag := flags&linux.MPOL_F_ADDR != 0 - // TODO: Once sysfs is implemented, report a single numa node in + // TODO(rahat): Once sysfs is implemented, report a single numa node in // /sys/devices/system/node. if nodemask != 0 && maxnode < 1 { return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 8105e9b43..50c7d7a74 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -192,7 +192,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // Preadv2 implements linux syscall preadv2(2). -// TODO: Implement RWF_HIPRI functionality. +// TODO(b/120162627): Implement RWF_HIPRI functionality. func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // While the syscall is // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 30ccc3f66..c8748958a 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -317,7 +317,7 @@ func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } if peerRequested { - // NOTE: Linux does not give you an error if it can't + // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL { return 0, err @@ -735,7 +735,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i return 0, err } - // FIXME: Pretend we have an empty error queue. + // FIXME(b/63594852): Pretend we have an empty error queue. 
if flags&linux.MSG_ERRQUEUE != 0 { return 0, syscall.EAGAIN } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 61cafefb9..ddcb5b789 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -350,7 +350,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } si.SetPid(int32(wr.TID)) si.SetUid(int32(wr.UID)) - // TODO: convert kernel.ExitStatus to functions and make + // TODO(b/73541790): convert kernel.ExitStatus to functions and make // WaitResult.Status a linux.WaitStatus s := syscall.WaitStatus(wr.Status) switch { diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index a5ad7efb2..e405608c4 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -192,8 +192,8 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Pwritev2 implements linux syscall pwritev2(2). -// TODO: Implement RWF_HIPRI functionality. -// TODO: Implement O_SYNC and D_SYNC functionality. +// TODO(b/120162627): Implement RWF_HIPRI functionality. +// TODO(b/120161091): Implement O_SYNC and D_SYNC functionality. func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // While the syscall is // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index c8cf4eca4..a98bcd7de 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -37,7 +37,7 @@ var fallbackMetric = metric.MustCreateNewUint64Metric("/time/fallback", false /* // clock. type CalibratedClock struct { // mu protects the fields below. - // TODO: consider a sequence counter for read locking. + // TODO(mpratt): consider a sequence counter for read locking. 
mu sync.RWMutex // ref sample the reference clock that this clock is calibrated @@ -140,7 +140,7 @@ func (c *CalibratedClock) updateParams(actual Parameters) { // N.B. logErrorAdjustment will have already logged the error // at warning level. // - // TODO: We could allow Realtime clock jumps here. + // TODO(mpratt): We could allow Realtime clock jumps here. c.resetLocked("Extreme clock error.") return } @@ -229,7 +229,7 @@ func (c *CalibratedClock) GetTime() (int64, error) { // CalibratedClocks contains calibrated monotonic and realtime clocks. // -// TODO: We know that Linux runs the monotonic and realtime clocks at +// TODO(mpratt): We know that Linux runs the monotonic and realtime clocks at // the same rate, so rather than tracking both individually, we could do one // calibration for both clocks. type CalibratedClocks struct { diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go index f3ad58454..8568b1193 100644 --- a/pkg/sentry/time/parameters.go +++ b/pkg/sentry/time/parameters.go @@ -43,7 +43,7 @@ const ( // These statements assume that the host clock does not change. Actual // error will depend upon host clock changes. // - // TODO: make error correction more robust to delayed + // TODO(b/68779214): make error correction more robust to delayed // updates. ApproxUpdateInterval = 1 * time.Second diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 99766a803..4c7d5014a 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -28,7 +28,7 @@ import ( // IO provides access to the contents of a virtual memory space. // -// FIXME: Implementations of IO cannot expect ctx to contain any +// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any // meaningful data. type IO interface { // CopyOut copies len(src) bytes from src to the memory mapped at addr. It @@ -85,7 +85,7 @@ type IO interface { // order. 
CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) - // TODO: The requirement that CopyOutFrom/CopyInTo call src/dst + // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst // at most once, which is unnecessary in most cases, forces implementations // to gather safemem.Blocks into a single slice to pass to src/dst. Add // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index c49b537a5..b4f1e3a4f 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -236,7 +236,7 @@ func (w *Watchdog) runTurn() { if !ok { // New stuck task detected. // - // TODO: Tasks blocked doing IO may be considered stuck in kernel. + // TODO(b/65849403): Tasks blocked doing IO may be considered stuck in kernel. tc = &offender{lastUpdateTime: lastUpdateTime} stuckTasks.Increment() newTaskFound = true diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index dad83e80c..232634dd4 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -49,7 +49,7 @@ func New(message string, linuxTranslation *linux.Errno) *Error { return err } - // TODO: Remove this. + // TODO(b/34162363): Remove this. errno := linuxTranslation.Number() if errno <= 0 || errno >= len(linuxBackwardsTranslations) { panic(fmt.Sprint("invalid errno: ", errno)) @@ -106,12 +106,12 @@ type linuxBackwardsTranslation struct { ok bool } -// TODO: Remove this. +// TODO(b/34162363): Remove this. var linuxBackwardsTranslations [maxErrno]linuxBackwardsTranslation // ToError translates an Error to a corresponding error value. // -// TODO: Remove this. +// TODO(b/34162363): Remove this. func (e *Error) ToError() error { if e == nil { return nil @@ -138,7 +138,7 @@ func (e *Error) ToLinux() *linux.Errno { return e.errno } -// TODO: Remove or replace most of these errors. +// TODO(b/34162363): Remove or replace most of these errors. 
// // Some of the errors should be replaced with package specific errors and // others should be removed entirely. @@ -278,7 +278,7 @@ var ( // FromError converts a generic error to an *Error. // -// TODO: Remove this function. +// TODO(b/34162363): Remove this function. func FromError(err error) *Error { if err == nil { return nil diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index ed9a4eee5..1c3acda4b 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -64,7 +64,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V } h := header.ICMPv4(v) - // TODO: Meaningfully handle all ICMP types. + // TODO(b/112892170): Meaningfully handle all ICMP types. switch h.Type() { case header.ICMPv4Echo: received.Echo.Increment() diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index 3210e6fc7..be28be36d 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -73,7 +73,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, vv buffer.V } h := header.ICMPv6(v) - // TODO: Meaningfully handle all ICMP types. + // TODO(b/112892170): Meaningfully handle all ICMP types. switch h.Type() { case header.ICMPv6PacketTooBig: received.PacketTooBig.Increment() @@ -247,7 +247,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack. DstAddr: r.RemoteAddress, }) - // TODO: count this in ICMP stats. + // TODO(stijlist): count this in ICMP stats. 
return linkEP.WritePacket(r, nil /* gso */, hdr, buffer.VectorisedView{}, ProtocolNumber) } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 8b6c17a90..c18571b0f 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -176,7 +176,7 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedN for e := list.Front(); e != nil; e = e.Next() { r := e.(*referencedNetworkEndpoint) - // TODO: allow broadcast address when SO_BROADCAST is set. + // TODO(crawshaw): allow broadcast address when SO_BROADCAST is set. switch r.ep.ID().LocalAddress { case header.IPv4Broadcast, header.IPv4Any: continue @@ -476,7 +476,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr n.mu.RUnlock() if ok && ref.tryIncRef() { r.RemoteAddress = src - // TODO: Update the source NIC as well. + // TODO(b/123449044): Update the source NIC as well. ref.ep.HandlePacket(&r, vv) ref.decRef() } else { @@ -485,7 +485,7 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddr hdr := buffer.NewPrependableFromView(vv.First()) vv.RemoveFirst() - // TODO: use route.WritePacket. + // TODO(b/128629022): use route.WritePacket. if err := n.linkEP.WritePacket(&r, nil /* gso */, hdr, vv, protocol); err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() } else { diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 8f7b6f781..cb9ffe9c2 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -476,7 +476,7 @@ func (s *Stack) Stats() tcpip.Stats { // SetForwarding enables or disables the packet forwarding between NICs. func (s *Stack) SetForwarding(enable bool) { - // TODO: Expose via /proc/sys/net/ipv4/ip_forward. + // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. s.mu.Lock() s.forwarding = enable s.mu.Unlock() @@ -484,7 +484,7 @@ func (s *Stack) SetForwarding(enable bool) { // Forwarding returns if the packet forwarding between NICs is enabled. 
func (s *Stack) Forwarding() bool { - // TODO: Expose via /proc/sys/net/ipv4/ip_forward. + // TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward. s.mu.RLock() defer s.mu.RUnlock() return s.forwarding diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index f2c6c9a8d..3d7e4b719 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -15,5 +15,5 @@ package stack // StackFromEnv is the global stack created in restore run. -// FIXME +// FIXME(b/36201077) var StackFromEnv *Stack diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 0c2589083..2df974bf2 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -453,7 +453,7 @@ func TestTransportForwarding(t *testing.T) { s := stack.New([]string{"fakeNet"}, []string{"fakeTrans"}, stack.Options{}) s.SetForwarding(true) - // TODO: Change this to a channel NIC. + // TODO(b/123449044): Change this to a channel NIC. id1 := loopback.New() if err := s.CreateNIC(1, id1); err != nil { t.Fatalf("CreateNIC #1 failed: %v", err) diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 80cd6b4e5..b09137f08 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -444,7 +444,7 @@ type PasscredOption int // TCPInfoOption is used by GetSockOpt to expose TCP statistics. // -// TODO: Add and populate stat fields. +// TODO(b/64800844): Add and populate stat fields. type TCPInfoOption struct { RTT time.Duration RTTVar time.Duration diff --git a/pkg/tcpip/transport/raw/raw.go b/pkg/tcpip/transport/raw/raw.go index 8dada2e4f..f0f60ce91 100644 --- a/pkg/tcpip/transport/raw/raw.go +++ b/pkg/tcpip/transport/raw/raw.go @@ -100,7 +100,7 @@ type endpoint struct { } // NewEndpoint returns a raw endpoint for the given protocols. -// TODO: IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. +// TODO(b/129292371): IP_HDRINCL, IPPROTO_RAW, and AF_PACKET. 
func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { if netProto != header.IPv4ProtocolNumber { return nil, tcpip.ErrUnknownProtocol diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index e5c05f8c0..d44d63e95 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -73,7 +73,7 @@ go_test( "tcp_test.go", "tcp_timestamp_test.go", ], - # FIXME + # FIXME(b/68809571) tags = ["flaky"], deps = [ ":tcp", diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index deeea078d..114fb8c5b 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -211,7 +211,7 @@ func SocketPair(packet bool) (*Socket, *Socket, error) { // variable between our two sockets. We only use SocketPair in tests // anyway. // - // NOTE: This is purely due to the fact that the raw + // NOTE(b/27107811): This is purely due to the fact that the raw // syscall does not serve as a boundary for the sanitizer. var race int32 a, err := NewSocket(fds[0]) diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go index ecc670925..db5485539 100644 --- a/pkg/unet/unet_test.go +++ b/pkg/unet/unet_test.go @@ -40,7 +40,7 @@ func randomFilename() (string, error) { return "", err } - // NOTE: We try to use relative path if possible. This is + // NOTE(b/26918832): We try to use relative path if possible. This is // to help conforming to the unix path length limit. if rel, err := filepath.Rel(cwd, file); err == nil { return rel, nil diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 2488981f9..712c50ee9 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -231,7 +231,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { } // Prevent CIDs containing ".." from confusing the sentry when creating // /containers/ directory. 
- // TODO: Once we have multiple independent roots, this + // TODO(b/129293409): Once we have multiple independent roots, this // check won't be necessary. if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) @@ -352,7 +352,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("creating network: %v", err) } if eps, ok := networkStack.(*epsocket.Stack); ok { - stack.StackFromEnv = eps.Stack // FIXME + stack.StackFromEnv = eps.Stack // FIXME(b/36201077) } info, err := o.FilePayload.Files[0].Stat() if err != nil { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 761142d98..07061b9b3 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -274,7 +274,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly default: - // TODO: Support all the mount types and make this a + // TODO(nlacasse): Support all the mount types and make this a // fatal error. Most applications will "just work" without // them, so this is a warning for now. // we do not support. @@ -425,7 +425,7 @@ func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, f if err != nil { return err } - // TODO: Fix this when we support all the mount types and + // TODO(nlacasse): Fix this when we support all the mount types and // make this a fatal error. if fsName == "" { return nil @@ -475,7 +475,7 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) } } - // TODO: handle '/tmp' properly (see mountTmp()). + // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). 
if !tmpMounted { tmpMount := specs.Mount{ Type: tmpfs, diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 48ecb2626..75ec19c32 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -577,7 +577,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config // sentry currently supports only 1 mount namespace, which is tied to a // single user namespace. Thus we must run in the same user namespace // to access mounts. - // TODO: Create a new mount namespace for the container. + // TODO(b/63601033): Create a new mount namespace for the container. creds := auth.NewUserCredentials( auth.KUID(spec.Process.User.UID), auth.KGID(spec.Process.User.GID), diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index d8f748aa0..f722df055 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -105,7 +105,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa return subcommands.ExitSuccess } - // TODO: Make it possible to restore into same container. + // TODO(b/110843694): Make it possible to restore into same container. // For now, we can fake it by destroying the container and making a // new container with the same ID. This hack does not work with docker // which uses the container pid to ensure that the restore-container is diff --git a/runsc/container/container.go b/runsc/container/container.go index 1bed1a97e..a30c217f7 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -529,7 +529,7 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // SignalContainer sends the signal to the container. If all is true and signal // is SIGKILL, then waits for all processes to exit before returning. // SignalContainer returns an error if the container is already stopped. -// TODO: Distinguish different error types. +// TODO(b/113680494): Distinguish different error types. 
func (c *Container) SignalContainer(sig syscall.Signal, all bool) error { log.Debugf("Signal container %q: %v", c.ID, sig) // Signaling container in Stopped state is allowed. When all=false, diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 9fe584aa3..603c4d929 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -242,10 +242,10 @@ func configs(opts ...configOption) []*boot.Config { case overlay: c.Overlay = true case kvm: - // TODO: KVM tests are flaky. Disable until fixed. + // TODO(b/112165693): KVM tests are flaky. Disable until fixed. continue - // TODO: KVM doesn't work with --race. + // TODO(b/68787993): KVM doesn't work with --race. if testutil.RaceEnabled { continue } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 92495c69e..48a0dafe2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -267,7 +267,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { defer conn.Close() var e boot.Event - // TODO: Pass in the container id (cid) here. The sandbox + // TODO(b/129292330): Pass in the container id (cid) here. The sandbox // should return events only for that container. if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil { return nil, fmt.Errorf("retrieving event data from sandbox: %v", err) @@ -457,7 +457,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } if conf.Platform == boot.PlatformPtrace { - // TODO: Also set a new PID namespace so that we limit + // TODO(b/75837838): Also set a new PID namespace so that we limit // access to other host processes. log.Infof("Sandbox will be started in the current PID namespace") } else { @@ -520,7 +520,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // root for itself, so it has to have the CAP_SYS_ADMIN // capability. 
// - // FIXME: The current implementations of + // FIXME(b/122554829): The current implementations of // os/exec doesn't allow to set ambient capabilities if // a process is started in a new user namespace. As a // workaround, we start the sandbox process with the 0 diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 32f81b8d4..ac85bec71 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -90,7 +90,7 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) } - // TODO: Apply seccomp to application inside sandbox. + // TODO(b/72226747): Apply seccomp to application inside sandbox. if spec.Linux != nil && spec.Linux.Seccomp != nil { log.Warningf("Seccomp spec is being ignored") } @@ -220,7 +220,7 @@ func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.Task if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { return nil, err } - // TODO: Support ambient capabilities. + // TODO(nlacasse): Support ambient capabilities. } return &caps, nil } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 94e0f24e0..d35f59433 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -277,7 +277,7 @@ syscall_test(test = "//test/syscalls/linux:sendfile_test") syscall_test(test = "//test/syscalls/linux:sigaction_test") -# TODO: Enable once the test passes in runsc. +# TODO(b/119826902): Enable once the test passes in runsc. # syscall_test(test = "//test/syscalls/linux:sigaltstack_test") syscall_test(test = "//test/syscalls/linux:sigiret_test") @@ -414,7 +414,7 @@ syscall_test( ) syscall_test( - # NOTE: Large sendmsg may stall a long time. + # NOTE(b/116636318): Large sendmsg may stall a long time. size = "enormous", test = "//test/syscalls/linux:socket_unix_dgram_local_test", ) @@ -437,7 +437,7 @@ syscall_test( ) syscall_test( - # NOTE: Large sendmsg may stall a long time. 
+ # NOTE(b/116636318): Large sendmsg may stall a long time. size = "enormous", test = "//test/syscalls/linux:socket_unix_seqpacket_local_test", ) diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl index 610b030b2..cd74a769d 100644 --- a/test/syscalls/build_defs.bzl +++ b/test/syscalls/build_defs.bzl @@ -78,10 +78,10 @@ def _syscall_test( tags += [full_platform, "file_" + file_access] # Add tag to prevent the tests from running in a Bazel sandbox. - # TODO: Make the tests run without this tag. + # TODO(b/120560048): Make the tests run without this tag. tags.append("no-sandbox") - # TODO: KVM tests are tagged "manual" to until the platform is + # TODO(b/112165693): KVM tests are tagged "manual" to until the platform is # more stable. if platform == "kvm": tags += ["manual"] diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc index 230648c9b..78baf548e 100644 --- a/test/syscalls/linux/32bit.cc +++ b/test/syscalls/linux/32bit.cc @@ -80,11 +80,11 @@ constexpr int kExitCode = 42; TEST(Syscall32Bit, Int80) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: 32-bit segments are broken (but not explictly + // TODO(b/111805002): 32-bit segments are broken (but not explictly // disabled). return; case Platform::kPtrace: - // TODO: The ptrace platform does not have a + // TODO(gvisor.dev/issue/167): The ptrace platform does not have a // consistent story here. return; case Platform::kNative: @@ -99,10 +99,10 @@ TEST(Syscall32Bit, Int80) { TEST(Syscall32Bit, Sysenter) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: See above. + // TODO(b/111805002): See above. return; case Platform::kPtrace: - // TODO: See above. + // TODO(gvisor.dev/issue/167): See above. return; case Platform::kNative: break; @@ -123,10 +123,10 @@ TEST(Syscall32Bit, Sysenter) { TEST(Syscall32Bit, Syscall) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: See above. + // TODO(b/111805002): See above. 
return; case Platform::kPtrace: - // TODO: See above. + // TODO(gvisor.dev/issue/167): See above. return; case Platform::kNative: break; @@ -207,7 +207,7 @@ void FarCall32() { TEST(Call32Bit, Disallowed) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: See above. + // TODO(b/111805002): See above. return; case Platform::kPtrace: // The ptrace platform cannot prevent switching to compatibility mode. diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc index 06643ccb8..b96aab9b9 100644 --- a/test/syscalls/linux/aio.cc +++ b/test/syscalls/linux/aio.cc @@ -103,7 +103,7 @@ TEST_F(AIOTest, BasicWrite) { // aio implementation uses aio_ring. gVisor doesn't and returns all zeroes. // Linux implements aio_ring, so skip the zeroes check. // - // TODO: Remove when gVisor implements aio_ring. + // TODO(b/65486370): Remove when gVisor implements aio_ring. auto ring = reinterpret_cast(ctx_); auto magic = IsRunningOnGvisor() ? 0 : AIO_RING_MAGIC; EXPECT_EQ(ring->magic, magic); diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc index 2f2ff3b7d..2f42fe326 100644 --- a/test/syscalls/linux/chmod.cc +++ b/test/syscalls/linux/chmod.cc @@ -235,7 +235,7 @@ TEST(ChmodTest, FchmodFileToNoPermissionsSucceeds_NoRandomSave) { // Verify that we can get a RW FD after chmod, even if a RO fd is left open. TEST(ChmodTest, ChmodWritableWithOpenFD) { - // FIXME: broken on hostfs. + // FIXME(b/72455313): broken on hostfs. 
if (IsRunningOnGvisor()) { return; } diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index 7b1d83ad8..b4a3bfcba 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -56,7 +56,7 @@ TEST(EpollTest, AllWritable) { struct epoll_event result[kFDsPerEpoll]; ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), SyscallSucceedsWithValue(kFDsPerEpoll)); - // TODO: Why do some tests check epoll_event::data, and others + // TODO(edahlgren): Why do some tests check epoll_event::data, and others // don't? Does Linux actually guarantee that, in any of these test cases, // epoll_wait will necessarily write out the epoll_events in the order that // they were registered? diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc index 187696ed9..c10d85398 100644 --- a/test/syscalls/linux/exec_binary.cc +++ b/test/syscalls/linux/exec_binary.cc @@ -285,7 +285,7 @@ ElfBinary<64> StandardElf() { elf.header.e_phoff = sizeof(elf.header); elf.header.e_phentsize = sizeof(decltype(elf)::ElfPhdr); - // TODO: Always include a PT_GNU_STACK segment to + // TODO(gvisor.dev/issue/153): Always include a PT_GNU_STACK segment to // disable executable stacks. With this omitted the stack (and all PROT_READ) // mappings should be executable, but gVisor doesn't support that. decltype(elf)::ElfPhdr phdr = {}; @@ -403,7 +403,7 @@ TEST(ElfTest, DataSegment) { // Linux will allow PT_LOAD segments to overlap. TEST(ElfTest, DirectlyOverlappingSegments) { - // NOTE: see PIEOutOfOrderSegments. + // NOTE(b/37289926): see PIEOutOfOrderSegments. SKIP_IF(IsRunningOnGvisor()); ElfBinary<64> elf = StandardElf(); @@ -439,7 +439,7 @@ TEST(ElfTest, DirectlyOverlappingSegments) { // Linux allows out-of-order PT_LOAD segments. TEST(ElfTest, OutOfOrderSegments) { - // NOTE: see PIEOutOfOrderSegments. + // NOTE(b/37289926): see PIEOutOfOrderSegments. 
SKIP_IF(IsRunningOnGvisor()); ElfBinary<64> elf = StandardElf(); @@ -670,7 +670,7 @@ TEST(ElfTest, PIENonZeroStart) { } TEST(ElfTest, PIEOutOfOrderSegments) { - // TODO: This triggers a bug in Linux where it computes the size + // TODO(b/37289926): This triggers a bug in Linux where it computes the size // of the binary as 0x20000 - 0x40000 = 0xfffffffffffe0000, which obviously // fails to map. // @@ -1005,7 +1005,7 @@ TEST(ElfTest, NoExecute) { // Execute, but no read permissions on the binary works just fine. TEST(ElfTest, NoRead) { - // TODO: gVisor's backing filesystem may prevent the + // TODO(gvisor.dev/issue/160): gVisor's backing filesystem may prevent the // sentry from reading the executable. SKIP_IF(IsRunningOnGvisor()); @@ -1024,7 +1024,7 @@ TEST(ElfTest, NoRead) { ASSERT_NO_ERRNO(WaitStopped(child)); - // TODO: A task with a non-readable executable is marked + // TODO(gvisor.dev/issue/160): A task with a non-readable executable is marked // non-dumpable, preventing access to proc files. gVisor does not implement // this behavior. } diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index 19c9a5053..43f568111 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -52,7 +52,7 @@ class FileTest : public ::testing::Test { test_file_fd_ = ASSERT_NO_ERRNO_AND_VALUE( Open(test_file_name_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR)); - // FIXME: enable when mknod syscall is supported. + // FIXME(edahlgren): enable when mknod syscall is supported. // test_fifo_name_ = NewTempAbsPath(); // ASSERT_THAT(mknod(test_fifo_name_.c_str()), S_IFIFO|0644, 0, // SyscallSucceeds()); @@ -97,7 +97,7 @@ class FileTest : public ::testing::Test { UnlinkFile(); ClosePipes(); - // FIXME: enable when mknod syscall is supported. + // FIXME(edahlgren): enable when mknod syscall is supported. 
// close(test_fifo_[0]); // close(test_fifo_[1]); // unlink(test_fifo_name_.c_str()); diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc index de29047e0..c7741a177 100644 --- a/test/syscalls/linux/ioctl.cc +++ b/test/syscalls/linux/ioctl.cc @@ -158,7 +158,7 @@ TEST_F(IoctlTest, FIOASYNCNoTarget) { } TEST_F(IoctlTest, FIOASYNCSelfTarget) { - // FIXME: gVisor erroneously sends SIGIO on close(2), which would + // FIXME(b/120624367): gVisor erroneously sends SIGIO on close(2), which would // kill the test when pair goes out of scope. Temporarily ignore SIGIO so that // that the close signal is ignored. struct sigaction sa; @@ -195,7 +195,7 @@ TEST_F(IoctlTest, FIOASYNCSelfTarget) { // Equivalent to FIOASYNCSelfTarget except that FIOSETOWN is called before // FIOASYNC. TEST_F(IoctlTest, FIOASYNCSelfTarget2) { - // FIXME: gVisor erroneously sends SIGIO on close(2), which would + // FIXME(b/120624367): gVisor erroneously sends SIGIO on close(2), which would // kill the test when pair goes out of scope. Temporarily ignore SIGIO so that // that the close signal is ignored. struct sigaction sa; diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index 4ad787cc0..0a149c2e5 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -24,7 +24,7 @@ namespace gvisor { namespace testing { PosixErrorOr InterfaceIndex(std::string name) { - // TODO: Consider using netlink. + // TODO(igudger): Consider using netlink. 
ifreq req = {}; memcpy(req.ifr_name, name.c_str(), name.size()); ASSIGN_OR_RETURN_ERRNO(auto sock, Socket(AF_INET, SOCK_DGRAM, 0)); diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc index fb6a1546e..6a4f1423c 100644 --- a/test/syscalls/linux/lseek.cc +++ b/test/syscalls/linux/lseek.cc @@ -194,7 +194,7 @@ TEST(LseekTest, EtcPasswdDup) { ASSERT_THAT(lseek(fd3.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(1000)); } -// TODO: Add tests where we have donated in sockets. +// TODO(magi): Add tests where we have donated in sockets. } // namespace diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc index 84db45eb3..50807b68f 100644 --- a/test/syscalls/linux/mkdir.cc +++ b/test/syscalls/linux/mkdir.cc @@ -36,7 +36,7 @@ class MkdirTest : public ::testing::Test { // TearDown unlinks created files. void TearDown() override { - // FIXME: We don't currently implement rmdir. + // FIXME(edahlgren): We don't currently implement rmdir. // We do this unconditionally because there's no harm in trying. rmdir(dirname_.c_str()); } diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index b500e79a4..a4fb9d1e0 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -816,7 +816,7 @@ class MMapFileTest : public MMapTest { // MAP_POPULATE allowed. // There isn't a good way to verify it actually did anything. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, MapPopulate) { ASSERT_THAT( Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0), @@ -825,7 +825,7 @@ TEST_F(MMapFileTest, MapPopulate) { // MAP_POPULATE on a short file. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, MapPopulateShort) { ASSERT_THAT(Map(0, 2 * kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0), @@ -923,7 +923,7 @@ TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) { // MAP_SHARED PROT_READ not allowed on write-only FDs. 
// -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); @@ -936,7 +936,7 @@ TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) { // MAP_SHARED PROT_WRITE not allowed on write-only FDs. // The FD must always be readable. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, WriteSharedOnWriteOnlyFd) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY)); @@ -1371,7 +1371,7 @@ TEST_F(MMapFileTest, WritePrivate) { // SIGBUS raised when writing past end of file to a private mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, SigBusDeathWritePrivate) { SetupGvisorDeathTest(); @@ -1390,7 +1390,7 @@ TEST_F(MMapFileTest, SigBusDeathWritePrivate) { // SIGBUS raised when reading past end of file on a shared mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, SigBusDeathReadShared) { SetupGvisorDeathTest(); @@ -1410,7 +1410,7 @@ TEST_F(MMapFileTest, SigBusDeathReadShared) { // SIGBUS raised when reading past end of file on a shared mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, SigBusDeathWriteShared) { SetupGvisorDeathTest(); @@ -1459,7 +1459,7 @@ TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWritePrivate) { // Tests that SIGBUS is not raised when reading from a file-mapped page // containing EOF, *after* the EOF for a shared mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. 
TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) { uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0), @@ -1476,7 +1476,7 @@ TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) { // Tests that SIGBUS is not raised when writing to a file-mapped page containing // EOF, *after* the EOF for a shared mapping. // -// FIXME: Parameterize. +// FIXME(b/37222275): Parameterize. TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWriteShared) { uintptr_t addr; ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index cdc226300..22e4666c2 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -279,7 +279,7 @@ TEST_F(OpenTest, Null) { ASSERT_THAT(open(&c, O_RDONLY), SyscallFailsWithErrno(ENOENT)); } -// NOTE: While the man pages specify that this behavior should be +// NOTE(b/119785738): While the man pages specify that this behavior should be // undefined, Linux truncates the file on opening read only if we have write // permission, so we will too. TEST_F(OpenTest, CanTruncateReadOnly) { diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc index 073a6b8c1..71288ebc4 100644 --- a/test/syscalls/linux/partial_bad_buffer.cc +++ b/test/syscalls/linux/partial_bad_buffer.cc @@ -158,7 +158,7 @@ TEST_F(PartialBadBufferTest, PreadvSmall) { } TEST_F(PartialBadBufferTest, WriteBig) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. 
SKIP_IF(IsRunningOnGvisor()); @@ -168,7 +168,7 @@ TEST_F(PartialBadBufferTest, WriteBig) { } TEST_F(PartialBadBufferTest, WriteSmall) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -178,7 +178,7 @@ TEST_F(PartialBadBufferTest, WriteSmall) { } TEST_F(PartialBadBufferTest, PwriteBig) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -188,7 +188,7 @@ TEST_F(PartialBadBufferTest, PwriteBig) { } TEST_F(PartialBadBufferTest, PwriteSmall) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -198,7 +198,7 @@ TEST_F(PartialBadBufferTest, PwriteSmall) { } TEST_F(PartialBadBufferTest, WritevBig) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -211,7 +211,7 @@ TEST_F(PartialBadBufferTest, WritevBig) { } TEST_F(PartialBadBufferTest, WritevSmall) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. 
SKIP_IF(IsRunningOnGvisor()); @@ -224,7 +224,7 @@ TEST_F(PartialBadBufferTest, WritevSmall) { } TEST_F(PartialBadBufferTest, PwritevBig) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -238,7 +238,7 @@ TEST_F(PartialBadBufferTest, PwritevBig) { } TEST_F(PartialBadBufferTest, PwritevSmall) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); @@ -279,7 +279,7 @@ TEST_F(PartialBadBufferTest, GetdentsOneEntry) { // Verify that when write returns EFAULT the kernel hasn't silently written // the initial valid bytes. TEST_F(PartialBadBufferTest, WriteEfaultIsntPartial) { - // FIXME: The sentry write syscalls will return immediately + // FIXME(b/24788078): The sentry write syscalls will return immediately // if Access returns an error, but Access may not return an error // and the sentry will instead perform a partial write. SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index c49ec9f09..abd10b11b 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -36,7 +36,7 @@ namespace { // Buffer size of a pipe. // -// TODO: Get this from F_GETPIPE_SZ. +// TODO(b/35762278): Get this from F_GETPIPE_SZ. constexpr int kPipeSize = 65536; class PipeTest : public ::testing::Test { @@ -316,7 +316,7 @@ TEST_F(PipeTest, BlockWriteClosed) { // Blocking write returns EPIPE when read end is closed even if something has // been written. 
// -// FIXME: Pipe writes blocking early allows S/R to interrupt the +// FIXME(b/35924046): Pipe writes blocking early allows S/R to interrupt the // write(2) call before the buffer is full. Then the next call will will return // non-zero instead of EPIPE. TEST_F(PipeTest, BlockPartialWriteClosed_NoRandomSave) { @@ -329,7 +329,7 @@ TEST_F(PipeTest, BlockPartialWriteClosed_NoRandomSave) { // Write more than fits in the buffer. Blocks then returns partial write // when the other end is closed. The next call returns EPIPE. if (IsRunningOnGvisor()) { - // FIXME: Pipe writes block early on gVisor, resulting in a + // FIXME(b/35924046): Pipe writes block early on gVisor, resulting in a // shorter than expected partial write. ASSERT_THAT(write(wfd, buf.data(), buf.size()), SyscallSucceedsWithValue(::testing::Gt(0))); diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 3ec31ae8b..7ba274226 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -61,7 +61,7 @@ #include "test/util/thread_util.h" #include "test/util/timer_util.h" -// NOTE: No, this isn't really a syscall but this is a really simple +// NOTE(magi): No, this isn't really a syscall but this is a really simple // way to get it tested on both gVisor, PTrace and Linux. using ::testing::AllOf; @@ -489,7 +489,7 @@ TEST(ProcSelfMaps, Map1) { } TEST(ProcSelfMaps, Map2) { - // NOTE: The permissions must be different or the pages will get merged. + // NOTE(magi): The permissions must be different or the pages will get merged. Mapping map1 = ASSERT_NO_ERRNO_AND_VALUE( MmapAnon(kPageSize, PROT_READ | PROT_EXEC, MAP_PRIVATE)); Mapping map2 = @@ -564,7 +564,7 @@ TEST(ProcSelfMaps, MapUnmap) { } TEST(ProcSelfMaps, Mprotect) { - // FIXME: Linux's mprotect() sometimes fails to merge VMAs in this + // FIXME(jamieliu): Linux's mprotect() sometimes fails to merge VMAs in this // case. 
SKIP_IF(!IsRunningOnGvisor()); @@ -977,7 +977,7 @@ void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) { *after = ASSERT_NO_ERRNO_AND_VALUE(CurrentRSS()); } -// TODO: Test for PROT_READ + MAP_POPULATE anonymous mappings. Their +// TODO(b/73896574): Test for PROT_READ + MAP_POPULATE anonymous mappings. Their // semantics are more subtle: // // Small pages -> Zero page mapped, not counted in RSS @@ -1140,7 +1140,7 @@ TEST(ProcPidStatusTest, ValuesAreTabDelimited) { // Threads properly counts running threads. // -// TODO: Test zombied threads while the thread group leader is still +// TODO(mpratt): Test zombied threads while the thread group leader is still // running with generalized fork and clone children from the wait test. TEST(ProcPidStatusTest, Threads) { char buf[4096] = {}; @@ -1274,7 +1274,7 @@ TEST(ProcPidSymlink, SubprocessRunning) { SyscallSucceedsWithValue(sizeof(buf))); } -// FIXME: Inconsistent behavior between gVisor and linux +// FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. TEST(ProcPidSymlink, SubprocessZombied) { ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); @@ -1298,13 +1298,13 @@ TEST(ProcPidSymlink, SubprocessZombied) { SyscallFailsWithErrno(want)); } - // FIXME: Inconsistent behavior between gVisor and linux + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. // 4.17 & gVisor: Syscall succeeds and returns 1 // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)), // SyscallFailsWithErrno(EACCES)); - // FIXME: Inconsistent behavior between gVisor and linux + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. // 4.17 & gVisor: Syscall succeeds and returns 1. // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)), @@ -1313,7 +1313,7 @@ TEST(ProcPidSymlink, SubprocessZombied) { // Test whether /proc/PID/ symlinks can be read for an exited process. 
TEST(ProcPidSymlink, SubprocessExited) { - // FIXME: These all succeed on gVisor. + // FIXME(gvisor.dev/issue/164): These all succeed on gVisor. SKIP_IF(IsRunningOnGvisor()); char buf[1]; @@ -1404,7 +1404,7 @@ TEST(ProcPidFile, SubprocessZombie) { EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); - // FIXME: Inconsistent behavior between gVisor and linux + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. // gVisor & 4.17: Succeeds and returns 1. // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)), @@ -1415,7 +1415,7 @@ TEST(ProcPidFile, SubprocessZombie) { TEST(ProcPidFile, SubprocessExited) { char buf[1]; - // FIXME: Inconsistent behavior between kernels + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels // gVisor: Fails with ESRCH. // 4.17: Succeeds and returns 1. // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)), @@ -1425,7 +1425,7 @@ TEST(ProcPidFile, SubprocessExited) { SyscallFailsWithErrno(ESRCH)); if (!IsRunningOnGvisor()) { - // FIXME: Succeeds on gVisor. + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. EXPECT_THAT(ReadWhileExited("comm", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } @@ -1434,25 +1434,25 @@ TEST(ProcPidFile, SubprocessExited) { SyscallSucceedsWithValue(sizeof(buf))); if (!IsRunningOnGvisor()) { - // FIXME: Succeeds on gVisor. + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. EXPECT_THAT(ReadWhileExited("io", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } if (!IsRunningOnGvisor()) { - // FIXME: Returns EOF on gVisor. + // FIXME(gvisor.dev/issue/164): Returns EOF on gVisor. EXPECT_THAT(ReadWhileExited("maps", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } if (!IsRunningOnGvisor()) { - // FIXME: Succeeds on gVisor. + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. 
EXPECT_THAT(ReadWhileExited("stat", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } if (!IsRunningOnGvisor()) { - // FIXME: Succeeds on gVisor. + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. EXPECT_THAT(ReadWhileExited("status", buf, sizeof(buf)), SyscallFailsWithErrno(ESRCH)); } diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc index 5f9c42ce5..cf5c462f3 100644 --- a/test/syscalls/linux/proc_pid_smaps.cc +++ b/test/syscalls/linux/proc_pid_smaps.cc @@ -82,7 +82,7 @@ struct ProcPidSmapsEntry { // Given the value part of a /proc/[pid]/smaps field containing a value in kB // (for example, " 4 kB", returns the value in kB (in this example, 4). PosixErrorOr SmapsValueKb(absl::string_view value) { - // TODO: let us use RE2 or + // TODO(jamieliu): let us use RE2 or std::pair parts = absl::StrSplit(value, ' ', absl::SkipEmpty()); if (parts.second != "kB") { diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index 1c9d7d4f4..e0c56f1fc 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -823,7 +823,7 @@ TEST(PtraceTest, TEST(PtraceTest, Int3) { switch (GvisorPlatform()) { case Platform::kKVM: - // TODO: int3 isn't handled properly. + // TODO(b/124248694): int3 isn't handled properly. return; default: break; diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc index 60ae6de1f..485b1e48d 100644 --- a/test/syscalls/linux/pwrite64.cc +++ b/test/syscalls/linux/pwrite64.cc @@ -30,7 +30,7 @@ namespace { // This test is currently very rudimentary. // -// TODO: +// TODO(edahlgren): // * bad buffer states (EFAULT). // * bad fds (wrong permission, wrong type of file, EBADF). // * check offset is not incremented. 
diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc index 2c129b7e8..cf22c395e 100644 --- a/test/syscalls/linux/readv_socket.cc +++ b/test/syscalls/linux/readv_socket.cc @@ -41,7 +41,7 @@ class ReadvSocketTest : public SocketTest { ASSERT_THAT(write(test_unix_seqpacket_socket_[1], kReadvTestData, kReadvTestDataSize), SyscallSucceedsWithValue(kReadvTestDataSize)); - // FIXME: Enable when possible. + // FIXME(b/69821513): Enable when possible. // ASSERT_THAT(write(test_tcp_socket_[1], kReadvTestData, // kReadvTestDataSize), // SyscallSucceedsWithValue(kReadvTestDataSize)); diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc index 1f2fed7cc..ff948f9d5 100644 --- a/test/syscalls/linux/rtsignal.cc +++ b/test/syscalls/linux/rtsignal.cc @@ -75,7 +75,7 @@ class RtSignalTest : public ::testing::Test { static int rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t* uinfo) { int ret; do { - // NOTE: rt_sigqueueinfo(2) could return EAGAIN for RT signals. + // NOTE(b/25434735): rt_sigqueueinfo(2) could return EAGAIN for RT signals. ret = syscall(SYS_rt_sigqueueinfo, tgid, sig, uinfo); } while (ret == -1 && errno == EAGAIN); return ret; diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index cdc5c0ce8..14d7827c2 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -221,7 +221,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) { std::atomic connects_received = ATOMIC_VAR_INIT(0); std::unique_ptr listen_thread[kThreadCount]; int accept_counts[kThreadCount] = {}; - // TODO: figure how to not disable S/R for the whole test. + // TODO(avagin): figure how to not disable S/R for the whole test. // We need to take into account that this test executes a lot of system // calls from many threads. 
DisableSave ds; @@ -325,7 +325,7 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread) { std::atomic packets_received = ATOMIC_VAR_INIT(0); std::unique_ptr receiver_thread[kThreadCount]; int packets_per_socket[kThreadCount] = {}; - // TODO: figure how to not disable S/R for the whole test. + // TODO(avagin): figure how to not disable S/R for the whole test. DisableSave ds; // Too expensive. for (int i = 0; i < kThreadCount; i++) { @@ -642,7 +642,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) { TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) { auto const& param = GetParam(); - // FIXME + // FIXME(b/114268588) SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); for (int i = 0; true; i++) { @@ -743,7 +743,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) { TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) { auto const& param = GetParam(); - // FIXME + // FIXME(b/114268588) SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); for (int i = 0; true; i++) { @@ -867,7 +867,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) { TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) { auto const& param = GetParam(); - // FIXME + // FIXME(b/114268588) SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_STREAM); for (int i = 0; true; i++) { diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc index 8b4fc57b6..9dd9e1bd6 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc @@ -244,7 +244,7 @@ TestAddress V4Multicast() { // set interface or group membership. 
TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelfNoGroup) { - // FIXME: A group membership is not required for external + // FIXME(b/125485338): A group membership is not required for external // multicast on gVisor. SKIP_IF(IsRunningOnGvisor()); @@ -371,7 +371,7 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, // Check that multicast packets won't be delivered to another socket with no // set interface or group membership. TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastNoGroup) { - // FIXME: A group membership is not required for external + // FIXME(b/125485338): A group membership is not required for external // multicast on gVisor. SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index 8d2e7d333..ed4ae1c71 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -180,7 +180,7 @@ void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) { // RTM_NEWLINK contains at least the header and ifinfomsg. EXPECT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg))); - // TODO: Check ifinfomsg contents and following attrs. + // TODO(mpratt): Check ifinfomsg contents and following attrs. } TEST(NetlinkRouteTest, GetLinkDump) { @@ -370,7 +370,7 @@ TEST(NetlinkRouteTest, GetAddrDump) { // RTM_NEWADDR contains at least the header and ifaddrmsg. EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct ifaddrmsg)); - // TODO: Check ifaddrmsg contents and following attrs. + // TODO(mpratt): Check ifaddrmsg contents and following attrs. 
})); } diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc index 8b3f6a647..f0f86c01c 100644 --- a/test/syscalls/linux/socket_stream_blocking.cc +++ b/test/syscalls/linux/socket_stream_blocking.cc @@ -33,7 +33,7 @@ namespace gvisor { namespace testing { TEST_P(BlockingStreamSocketPairTest, BlockPartialWriteClosed) { - // FIXME: gVisor doesn't support SO_SNDBUF on UDS, nor does it + // FIXME(b/35921550): gVisor doesn't support SO_SNDBUF on UDS, nor does it // enforce any limit; it will write arbitrary amounts of data without // blocking. SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc index 035087566..0be23e541 100644 --- a/test/syscalls/linux/socket_test_util.cc +++ b/test/syscalls/linux/socket_test_util.cc @@ -353,7 +353,7 @@ PosixErrorOr> CreateTCPAcceptBindSocketPair( } MaybeSave(); // Successful accept. - // FIXME + // FIXME(b/110484944) if (connect_result == -1) { absl::SleepFor(absl::Seconds(1)); } diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index 7332b768e..fafb23ad1 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -186,7 +186,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassNoSpace) { // BasicFDPassNoSpaceMsgCtrunc sends an FD, but does not provide any space to // receive it. It then verifies that the MSG_CTRUNC flag is set in the msghdr. TEST_P(UnixSocketPairTest, BasicFDPassNoSpaceMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -224,7 +224,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassNoSpaceMsgCtrunc) { // accomidate the FD, but msg_control is set to NULL. In this case, msg_control // should override msg_controllen. 
TEST_P(UnixSocketPairTest, BasicFDPassNullControlMsgCtrunc) { - // FIXME: Fix handling of NULL msg_control. + // FIXME(gvisor.dev/issue/207): Fix handling of NULL msg_control. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -259,7 +259,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassNullControlMsgCtrunc) { // space to receive it. It then verifies that the MSG_CTRUNC flag is set in the // msghdr. TEST_P(UnixSocketPairTest, BasicFDPassNotEnoughSpaceMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -296,7 +296,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassNotEnoughSpaceMsgCtrunc) { // space to receive two of them. It then verifies that the MSG_CTRUNC flag is // set in the msghdr. TEST_P(UnixSocketPairTest, BasicThreeFDPassTruncationMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -408,7 +408,7 @@ TEST_P(UnixSocketPairTest, BasicFDPassUnalignedRecvNoMsgTrunc) { // provides enough space to receive one of them. It then verifies that the // MSG_CTRUNC flag is set in the msghdr. TEST_P(UnixSocketPairTest, BasicTwoFDPassUnalignedRecvTruncationMsgTrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -1010,7 +1010,7 @@ TEST_P(UnixSocketPairTest, CredPassNoMsgCtrunc) { // the data without providing space for any credentials and verifies that // MSG_CTRUNC is set in the msghdr. TEST_P(UnixSocketPairTest, CredPassNoSpaceMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. 
SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -1061,7 +1061,7 @@ TEST_P(UnixSocketPairTest, CredPassNoSpaceMsgCtrunc) { // the data while providing enough space for only the first field of the // credentials and verifies that MSG_CTRUNC is set in the msghdr. TEST_P(UnixSocketPairTest, CredPassTruncatedMsgCtrunc) { - // FIXME: Support MSG_CTRUNC. + // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); @@ -1615,7 +1615,7 @@ TEST_P(UnixSocketPairTest, SocketShutdown) { } TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) { - // TODO: We should be returning ENXIO and NOT EIO. + // TODO(b/122310852): We should be returning ENXIO and NOT EIO. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc index c17d3990f..5dd5e6d77 100644 --- a/test/syscalls/linux/socket_unix_dgram.cc +++ b/test/syscalls/linux/socket_unix_dgram.cc @@ -28,7 +28,7 @@ namespace testing { namespace { TEST_P(DgramUnixSocketPairTest, WriteOneSideClosed) { - // FIXME: gVisor datagram sockets return EPIPE instead of + // FIXME(b/35925052): gVisor datagram sockets return EPIPE instead of // ECONNREFUSED. SKIP_IF(IsRunningOnGvisor()); diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc index 460eb8320..3becb513d 100644 --- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc +++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc @@ -31,7 +31,7 @@ using NonBlockingDgramUnixSocketPairTest = SocketPairTest; TEST_P(NonBlockingDgramUnixSocketPairTest, ReadOneSideClosed) { if (IsRunningOnGvisor()) { - // FIXME: gVisor datagram sockets return 0 instead of + // FIXME(b/70803293): gVisor datagram sockets return 0 instead of // EAGAIN. 
return; } diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc index 8e0cbee4c..a565978f9 100644 --- a/test/syscalls/linux/socket_unix_non_stream.cc +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -47,7 +47,7 @@ TEST_P(UnixNonStreamSocketPairTest, RecvMsgTooLarge) { const int ret = RetryEINTR(write)(sockets->second_fd(), write_buf.data(), write_buf.size()); if (ret < 0 && errno == ENOBUFS) { - // NOTE: Linux may stall the write for a long time and + // NOTE(b/116636318): Linux may stall the write for a long time and // ultimately return ENOBUFS. Allow this error, since a retry will likely // result in the same error. return; @@ -136,7 +136,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { // N.B. At minimum, the socketpair gofer should provide a socket that is // already the correct size. // - // TODO: When internal UDS support SO_SNDBUF, we can assert that + // TODO(b/35921550): When internal UDS support SO_SNDBUF, we can assert that // we always get the right SO_SNDBUF on gVisor. GTEST_SKIP() << "SO_SNDBUF = " << actual_sndbuf << ", want " << sndbuf; } @@ -156,7 +156,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) { msg.msg_iov = &iov; msg.msg_iovlen = 1; - // NOTE: Linux has poor behavior in the presence of + // NOTE(b/116636318,b/115833655): Linux has poor behavior in the presence of // physical memory fragmentation. As a result, this may stall for a long time // and ultimately return ENOBUFS. Allow this error, since it means that we // made it to the host kernel and started the sendmsg. @@ -192,7 +192,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) { // N.B. At minimum, the socketpair gofer should provide a socket that is // already the correct size. // - // TODO: When internal UDS support SO_SNDBUF, we can assert that + // TODO(b/35921550): When internal UDS support SO_SNDBUF, we can assert that // we always get the right SO_SNDBUF on gVisor. 
GTEST_SKIP() << "SO_SNDBUF = " << actual_sndbuf << ", want " << sndbuf; } @@ -201,7 +201,7 @@ TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) { const int ret = RetryEINTR(write)(sockets->first_fd(), write_buf.data(), write_buf.size()); if (ret < 0 && errno == ENOBUFS) { - // NOTE: Linux may stall the write for a long time and + // NOTE(b/116636318): Linux may stall the write for a long time and // ultimately return ENOBUFS. Allow this error, since a retry will likely // result in the same error. return; diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc index 270d7203f..21209b244 100644 --- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc @@ -42,7 +42,7 @@ TEST_P(UnboundUnixSeqpacketSocketPairTest, SendtoWithoutConnect) { } TEST_P(UnboundUnixSeqpacketSocketPairTest, SendtoWithoutConnectIgnoresAddr) { - // FIXME: gVisor tries to find /foo/bar and thus returns ENOENT. + // FIXME(b/68223466): gVisor tries to find /foo/bar and thus returns ENOENT. if (IsRunningOnGvisor()) { return; } diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc index 4db5b4be1..b95f9569e 100644 --- a/test/syscalls/linux/socket_unix_unbound_stream.cc +++ b/test/syscalls/linux/socket_unix_unbound_stream.cc @@ -269,7 +269,7 @@ TEST_P(UnixStreamSocketPairTest, SinglePeek) { // 9f389e35674f5b086edd70ed524ca0f287259725 which changes this behavior. We // used to target 3.11 compatibility, so disable this test on newer kernels. // - // NOTE: Bring this up to Linux 4.4 compatibility. + // NOTE(b/118902768): Bring this up to Linux 4.4 compatibility. 
auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion()); SKIP_IF(version.major > 4 || (version.major == 4 && version.minor >= 3)); } @@ -686,7 +686,7 @@ TEST_P(UnboundUnixStreamSocketPairTest, SendtoWithoutConnect) { } TEST_P(UnboundUnixStreamSocketPairTest, SendtoWithoutConnectIgnoresAddr) { - // FIXME: gVisor tries to find /foo/bar and thus returns ENOENT. + // FIXME(b/68223466): gVisor tries to find /foo/bar and thus returns ENOENT. if (IsRunningOnGvisor()) { return; } diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 48a2059de..746318d09 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -416,7 +416,7 @@ TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) { EXPECT_EQ(st_child_before.st_gid, st_child_fd.st_gid); EXPECT_EQ(st_child_before.st_size, st_child_fd.st_size); - // TODO: This isn't ideal but since fstatfs(2) will always return + // TODO(b/34861058): This isn't ideal but since fstatfs(2) will always return // OVERLAYFS_SUPER_MAGIC we have no way to know if this fs is backed by a // gofer which doesn't support links. EXPECT_TRUE(st_child_fd.st_nlink == 0 || st_child_fd.st_nlink == 1); diff --git a/test/syscalls/linux/stat_times.cc b/test/syscalls/linux/stat_times.cc index 442957c65..8346e9a8e 100644 --- a/test/syscalls/linux/stat_times.cc +++ b/test/syscalls/linux/stat_times.cc @@ -68,7 +68,7 @@ TEST_F(StatTimesTest, FileCreationTimes) { TEST_F(StatTimesTest, FileCtimeChanges) { auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - MaybeSave(); // FIXME: ctime is inconsistent. + MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. absl::Time atime, mtime, ctime; std::tie(atime, mtime, ctime) = GetTime(file); @@ -150,7 +150,7 @@ TEST_F(StatTimesTest, FileAtimeChanges) { const auto file = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), contents, 0666)); - MaybeSave(); // FIXME: ctime is inconsistent. 
+ MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. absl::Time atime, mtime, ctime; std::tie(atime, mtime, ctime) = GetTime(file); @@ -184,7 +184,7 @@ TEST_F(StatTimesTest, DirAtimeChanges) { const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); - MaybeSave(); // FIXME: ctime is inconsistent. + MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. absl::Time atime, mtime, ctime; std::tie(atime, mtime, ctime) = GetTime(dir); @@ -193,7 +193,7 @@ TEST_F(StatTimesTest, DirAtimeChanges) { const absl::Time before = absl::Now() - absl::Seconds(1); - // NOTE: Keep an fd open. This ensures that the inode backing the + // NOTE(b/37756234): Keep an fd open. This ensures that the inode backing the // directory won't be destroyed before the final GetTime to avoid writing out // timestamps and causing side effects. const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0)); diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 1057f5892..33620a874 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -191,7 +191,7 @@ TEST_P(TcpSocketTest, SenderAddressIgnoredOnPeek) { TEST_P(TcpSocketTest, SendtoAddressIgnored) { struct sockaddr_storage addr; memset(&addr, 0, sizeof(addr)); - addr.ss_family = GetParam(); // FIXME + addr.ss_family = GetParam(); // FIXME(b/63803955) char data = '\0'; EXPECT_THAT( diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc index 9842ccc9b..3e8ce5327 100644 --- a/test/syscalls/linux/tkill.cc +++ b/test/syscalls/linux/tkill.cc @@ -32,7 +32,7 @@ namespace { static int tkill(pid_t tid, int sig) { int ret; do { - // NOTE: tkill(2) could return EAGAIN for RT signals. + // NOTE(b/25434735): tkill(2) could return EAGAIN for RT signals. 
ret = syscall(SYS_tkill, tid, sig); } while (ret == -1 && errno == EAGAIN); return ret; diff --git a/test/syscalls/linux/udp_bind.cc b/test/syscalls/linux/udp_bind.cc index 902be47d3..547eb2a6c 100644 --- a/test/syscalls/linux/udp_bind.cc +++ b/test/syscalls/linux/udp_bind.cc @@ -286,7 +286,7 @@ INSTANTIATE_TEST_SUITE_P( []() { SendtoTestParam param = {}; param.description = "connected IPv6 sendto IPv4 mapped IPv6"; - // TODO: Determine if this inconsistent behavior is worth + // TODO(igudger): Determine if this inconsistent behavior is worth // implementing. param.skip_on_gvisor = true; param.send_domain = AF_INET6; @@ -299,7 +299,7 @@ INSTANTIATE_TEST_SUITE_P( []() { SendtoTestParam param = {}; param.description = "connected IPv6 sendto IPv4"; - // TODO: Determine if this inconsistent behavior is worth + // TODO(igudger): Determine if this inconsistent behavior is worth // implementing. param.skip_on_gvisor = true; param.send_domain = AF_INET6; diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc index c0c1f2960..d78a09b1e 100644 --- a/test/syscalls/linux/uidgid.cc +++ b/test/syscalls/linux/uidgid.cc @@ -169,7 +169,7 @@ TEST(UidGidRootTest, SetgidNotFromThreadGroupLeader) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot())); const gid_t gid = FLAGS_scratch_gid1; - // NOTE: Do setgid in a separate thread so that we can test if + // NOTE(b/64676707): Do setgid in a separate thread so that we can test if // info.si_pid is set correctly. 
ScopedThread([gid] { ASSERT_THAT(setgid(gid), SyscallSucceeds()); }); EXPECT_NO_ERRNO(CheckGIDs(gid, gid, gid)); diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc index d95ee74ec..bf776cd93 100644 --- a/test/syscalls/linux/utimes.cc +++ b/test/syscalls/linux/utimes.cc @@ -33,7 +33,7 @@ namespace testing { namespace { -// TODO: utimes(nullptr) does not pick the "now" time in the +// TODO(b/36516566): utimes(nullptr) does not pick the "now" time in the // application's time domain, so when asserting that times are within a window, // we expand the window to allow for differences between the time domains. constexpr absl::Duration kClockSlack = absl::Milliseconds(100); @@ -235,7 +235,7 @@ void TestUtimensat(int dirFd, std::string const& path) { EXPECT_LE(mtime3, after); if (!IsRunningOnGvisor()) { - // FIXME: Gofers set atime and mtime to different "now" times. + // FIXME(b/36516566): Gofers set atime and mtime to different "now" times. EXPECT_EQ(atime3, mtime3); } } diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc index cfab8a976..fcd606bec 100644 --- a/test/syscalls/linux/wait.cc +++ b/test/syscalls/linux/wait.cc @@ -40,7 +40,7 @@ using ::testing::UnorderedElementsAre; // These unit tests focus on the wait4(2) system call, but include a basic // checks for the i386 waitpid(2) syscall, which is a subset of wait4(2). // -// NOTE: Some functionality is not tested as +// NOTE(b/22640830,b/27680907,b/29049891): Some functionality is not tested as // it is not currently supported by gVisor: // * UID in waitid(2) siginfo. // * Process groups. diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc index 432bd6066..7f80b2fa8 100644 --- a/test/syscalls/linux/write.cc +++ b/test/syscalls/linux/write.cc @@ -33,7 +33,7 @@ namespace testing { namespace { // This test is currently very rudimentary. // -// TODO: +// TODO(edahlgren): // * bad buffer states (EFAULT). 
// * bad fds (wrong permission, wrong type of file, EBADF). // * check offset is incremented. diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go index a63a0d084..131f0a2ba 100644 --- a/third_party/gvsync/downgradable_rwmutex_unsafe.go +++ b/third_party/gvsync/downgradable_rwmutex_unsafe.go @@ -49,7 +49,7 @@ func (rw *DowngradableRWMutex) RLock() { // RUnlock undoes a single RLock call. func (rw *DowngradableRWMutex) RUnlock() { if RaceEnabled { - // TODO: Why does this need to be ReleaseMerge instead of + // TODO(jamieliu): Why does this need to be ReleaseMerge instead of // Release? IIUC this establishes Unlock happens-before RUnlock, which // seems unnecessary. RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) diff --git a/vdso/cycle_clock.h b/vdso/cycle_clock.h index 26d6690c0..309e07a3f 100644 --- a/vdso/cycle_clock.h +++ b/vdso/cycle_clock.h @@ -23,7 +23,7 @@ namespace vdso { #if __x86_64__ -// TODO: The appropriate barrier instruction to use with rdtsc on +// TODO(b/74613497): The appropriate barrier instruction to use with rdtsc on // x86_64 depends on the vendor. Intel processors can use lfence but AMD may // need mfence, depending on MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT. diff --git a/vdso/vdso_amd64.lds b/vdso/vdso_amd64.lds index 166779931..e2615ae9e 100644 --- a/vdso/vdso_amd64.lds +++ b/vdso/vdso_amd64.lds @@ -56,7 +56,7 @@ SECTIONS { .altinstr_replacement : { *(.altinstr_replacement) } /* - * TODO: Remove this alignment? Then the VDSO would fit + * TODO(gvisor.dev/issue/157): Remove this alignment? Then the VDSO would fit * in a single page. */ . = ALIGN(0x1000); diff --git a/vdso/vdso_arm64.lds b/vdso/vdso_arm64.lds index 19f8efa01..469185468 100644 --- a/vdso/vdso_arm64.lds +++ b/vdso/vdso_arm64.lds @@ -59,7 +59,7 @@ SECTIONS { .altinstr_replacement : { *(.altinstr_replacement) } /* - * TODO: Remove this alignment? 
Then the VDSO would fit + * TODO(gvisor.dev/issue/157): Remove this alignment? Then the VDSO would fit * in a single page. */ . = ALIGN(0x1000); -- cgit v1.2.3 From 4d52a5520101a88424fb63dd99412a1db33fbd06 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 29 Apr 2019 14:25:05 -0700 Subject: Change copyright notice to "The gVisor Authors" Based on the guidelines at https://opensource.google.com/docs/releasing/authors/. 1. $ rg -l "Google LLC" | xargs sed -i 's/Google LLC.*/The gVisor Authors./' 2. Manual fixup of "Google Inc" references. 3. Add AUTHORS file. Authors may request to be added to this file. 4. Point netstack AUTHORS to gVisor AUTHORS. Drop CONTRIBUTORS. Fixes #209 PiperOrigin-RevId: 245823212 Change-Id: I64530b24ad021a7d683137459cafc510f5ee1de9 --- AUTHORS | 8 ++++++++ kokoro/run_build.sh | 2 +- kokoro/run_tests.sh | 2 +- pkg/abi/abi.go | 2 +- pkg/abi/abi_linux.go | 2 +- pkg/abi/flag.go | 2 +- pkg/abi/linux/aio.go | 2 +- pkg/abi/linux/ashmem.go | 2 +- pkg/abi/linux/audit.go | 2 +- pkg/abi/linux/binder.go | 2 +- pkg/abi/linux/bpf.go | 2 +- pkg/abi/linux/capability.go | 2 +- pkg/abi/linux/dev.go | 2 +- pkg/abi/linux/elf.go | 2 +- pkg/abi/linux/errors.go | 2 +- pkg/abi/linux/eventfd.go | 2 +- pkg/abi/linux/exec.go | 2 +- pkg/abi/linux/fcntl.go | 2 +- pkg/abi/linux/file.go | 2 +- pkg/abi/linux/fs.go | 2 +- pkg/abi/linux/futex.go | 2 +- pkg/abi/linux/inotify.go | 2 +- pkg/abi/linux/ioctl.go | 2 +- pkg/abi/linux/ip.go | 2 +- pkg/abi/linux/ipc.go | 2 +- pkg/abi/linux/limits.go | 2 +- pkg/abi/linux/linux.go | 2 +- pkg/abi/linux/mm.go | 2 +- pkg/abi/linux/netdevice.go | 2 +- pkg/abi/linux/netlink.go | 2 +- pkg/abi/linux/netlink_route.go | 2 +- pkg/abi/linux/poll.go | 2 +- pkg/abi/linux/prctl.go | 2 +- pkg/abi/linux/ptrace.go | 2 +- pkg/abi/linux/rusage.go | 2 +- pkg/abi/linux/sched.go | 2 +- pkg/abi/linux/seccomp.go | 2 +- pkg/abi/linux/sem.go | 2 +- pkg/abi/linux/shm.go | 2 +- pkg/abi/linux/signal.go | 2 +- pkg/abi/linux/socket.go | 2 +- 
pkg/abi/linux/tcp.go | 2 +- pkg/abi/linux/time.go | 2 +- pkg/abi/linux/timer.go | 2 +- pkg/abi/linux/tty.go | 2 +- pkg/abi/linux/uio.go | 2 +- pkg/abi/linux/utsname.go | 2 +- pkg/amutex/amutex.go | 2 +- pkg/amutex/amutex_test.go | 2 +- pkg/atomicbitops/atomic_bitops.go | 2 +- pkg/atomicbitops/atomic_bitops_amd64.s | 2 +- pkg/atomicbitops/atomic_bitops_common.go | 2 +- pkg/atomicbitops/atomic_bitops_test.go | 2 +- pkg/binary/binary.go | 2 +- pkg/binary/binary_test.go | 2 +- pkg/bits/bits.go | 2 +- pkg/bits/bits_template.go | 2 +- pkg/bits/uint64_arch_amd64.go | 2 +- pkg/bits/uint64_arch_amd64_asm.s | 2 +- pkg/bits/uint64_arch_generic.go | 2 +- pkg/bits/uint64_test.go | 2 +- pkg/bpf/bpf.go | 2 +- pkg/bpf/decoder.go | 2 +- pkg/bpf/decoder_test.go | 2 +- pkg/bpf/input_bytes.go | 2 +- pkg/bpf/interpreter.go | 2 +- pkg/bpf/interpreter_test.go | 2 +- pkg/bpf/program_builder.go | 2 +- pkg/bpf/program_builder_test.go | 2 +- pkg/compressio/compressio.go | 2 +- pkg/compressio/compressio_test.go | 2 +- pkg/control/client/client.go | 2 +- pkg/control/server/server.go | 2 +- pkg/cpuid/cpu_amd64.s | 2 +- pkg/cpuid/cpuid.go | 2 +- pkg/cpuid/cpuid_parse_test.go | 2 +- pkg/cpuid/cpuid_test.go | 2 +- pkg/dhcp/client.go | 2 +- pkg/dhcp/dhcp.go | 2 +- pkg/dhcp/dhcp_string.go | 2 +- pkg/dhcp/dhcp_test.go | 2 +- pkg/dhcp/server.go | 2 +- pkg/eventchannel/event.go | 2 +- pkg/eventchannel/event.proto | 2 +- pkg/fd/fd.go | 2 +- pkg/fd/fd_test.go | 2 +- pkg/fdnotifier/fdnotifier.go | 2 +- pkg/fdnotifier/poll_unsafe.go | 2 +- pkg/gate/gate.go | 2 +- pkg/gate/gate_test.go | 2 +- pkg/ilist/list.go | 2 +- pkg/ilist/list_test.go | 2 +- pkg/linewriter/linewriter.go | 2 +- pkg/linewriter/linewriter_test.go | 2 +- pkg/log/glog.go | 2 +- pkg/log/glog_unsafe.go | 2 +- pkg/log/json.go | 2 +- pkg/log/json_k8s.go | 2 +- pkg/log/json_test.go | 2 +- pkg/log/log.go | 2 +- pkg/log/log_test.go | 2 +- pkg/metric/metric.go | 2 +- pkg/metric/metric.proto | 2 +- pkg/metric/metric_test.go | 2 +- pkg/p9/buffer.go | 
2 +- pkg/p9/buffer_test.go | 2 +- pkg/p9/client.go | 2 +- pkg/p9/client_file.go | 2 +- pkg/p9/client_test.go | 2 +- pkg/p9/file.go | 2 +- pkg/p9/handlers.go | 2 +- pkg/p9/local_server/local_server.go | 2 +- pkg/p9/messages.go | 2 +- pkg/p9/messages_test.go | 2 +- pkg/p9/p9.go | 2 +- pkg/p9/p9_test.go | 2 +- pkg/p9/p9test/client_test.go | 2 +- pkg/p9/p9test/p9test.go | 2 +- pkg/p9/path_tree.go | 2 +- pkg/p9/pool.go | 2 +- pkg/p9/pool_test.go | 2 +- pkg/p9/server.go | 2 +- pkg/p9/transport.go | 2 +- pkg/p9/transport_test.go | 2 +- pkg/p9/version.go | 2 +- pkg/p9/version_test.go | 2 +- pkg/rand/rand.go | 2 +- pkg/rand/rand_linux.go | 2 +- pkg/refs/refcounter.go | 2 +- pkg/refs/refcounter_state.go | 2 +- pkg/refs/refcounter_test.go | 2 +- pkg/seccomp/seccomp.go | 2 +- pkg/seccomp/seccomp_rules.go | 2 +- pkg/seccomp/seccomp_test.go | 2 +- pkg/seccomp/seccomp_test_victim.go | 2 +- pkg/seccomp/seccomp_unsafe.go | 2 +- pkg/secio/full_reader.go | 2 +- pkg/secio/secio.go | 2 +- pkg/secio/secio_test.go | 2 +- pkg/segment/range.go | 2 +- pkg/segment/set.go | 2 +- pkg/segment/set_state.go | 2 +- pkg/segment/test/segment_test.go | 2 +- pkg/segment/test/set_functions.go | 2 +- pkg/sentry/arch/aligned.go | 2 +- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/arch_amd64.go | 2 +- pkg/sentry/arch/arch_amd64.s | 2 +- pkg/sentry/arch/arch_state_x86.go | 2 +- pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/arch/auxv.go | 2 +- pkg/sentry/arch/registers.proto | 2 +- pkg/sentry/arch/signal_act.go | 2 +- pkg/sentry/arch/signal_amd64.go | 2 +- pkg/sentry/arch/signal_info.go | 2 +- pkg/sentry/arch/signal_stack.go | 2 +- pkg/sentry/arch/stack.go | 2 +- pkg/sentry/arch/syscalls_amd64.go | 2 +- pkg/sentry/context/context.go | 2 +- pkg/sentry/context/contexttest/contexttest.go | 2 +- pkg/sentry/control/control.go | 2 +- pkg/sentry/control/pprof.go | 2 +- pkg/sentry/control/proc.go | 2 +- pkg/sentry/control/proc_test.go | 2 +- pkg/sentry/control/state.go | 2 +- pkg/sentry/device/device.go | 2 +- 
pkg/sentry/device/device_test.go | 2 +- pkg/sentry/fs/anon/anon.go | 2 +- pkg/sentry/fs/anon/device.go | 2 +- pkg/sentry/fs/ashmem/area.go | 2 +- pkg/sentry/fs/ashmem/device.go | 2 +- pkg/sentry/fs/ashmem/pin_board.go | 2 +- pkg/sentry/fs/ashmem/pin_board_test.go | 2 +- pkg/sentry/fs/attr.go | 2 +- pkg/sentry/fs/binder/binder.go | 2 +- pkg/sentry/fs/context.go | 2 +- pkg/sentry/fs/copy_up.go | 2 +- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dentry.go | 2 +- pkg/sentry/fs/dev/dev.go | 2 +- pkg/sentry/fs/dev/device.go | 2 +- pkg/sentry/fs/dev/fs.go | 2 +- pkg/sentry/fs/dev/full.go | 2 +- pkg/sentry/fs/dev/null.go | 2 +- pkg/sentry/fs/dev/random.go | 2 +- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_cache.go | 2 +- pkg/sentry/fs/dirent_cache_limiter.go | 2 +- pkg/sentry/fs/dirent_cache_test.go | 2 +- pkg/sentry/fs/dirent_refs_test.go | 2 +- pkg/sentry/fs/dirent_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 2 +- pkg/sentry/fs/fdpipe/pipe_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe_test.go | 2 +- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/file_operations.go | 2 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/file_overlay_test.go | 2 +- pkg/sentry/fs/file_state.go | 2 +- pkg/sentry/fs/file_test.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/filetest/filetest.go | 2 +- pkg/sentry/fs/flags.go | 2 +- pkg/sentry/fs/fs.go | 2 +- pkg/sentry/fs/fsutil/dirty_set.go | 2 +- pkg/sentry/fs/fsutil/dirty_set_test.go | 2 +- pkg/sentry/fs/fsutil/file.go | 2 +- pkg/sentry/fs/fsutil/file_range_set.go | 2 +- pkg/sentry/fs/fsutil/frame_ref_set.go | 2 +- pkg/sentry/fs/fsutil/fsutil.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper_state.go | 2 +- pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 2 +- pkg/sentry/fs/fsutil/host_mappable.go | 2 +- pkg/sentry/fs/fsutil/inode.go | 2 +- pkg/sentry/fs/fsutil/inode_cached.go | 2 +- 
pkg/sentry/fs/fsutil/inode_cached_test.go | 2 +- pkg/sentry/fs/gofer/attr.go | 2 +- pkg/sentry/fs/gofer/cache_policy.go | 2 +- pkg/sentry/fs/gofer/context_file.go | 2 +- pkg/sentry/fs/gofer/device.go | 2 +- pkg/sentry/fs/gofer/file.go | 2 +- pkg/sentry/fs/gofer/file_state.go | 2 +- pkg/sentry/fs/gofer/fs.go | 2 +- pkg/sentry/fs/gofer/gofer_test.go | 2 +- pkg/sentry/fs/gofer/handles.go | 2 +- pkg/sentry/fs/gofer/inode.go | 2 +- pkg/sentry/fs/gofer/inode_state.go | 2 +- pkg/sentry/fs/gofer/path.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/gofer/socket.go | 2 +- pkg/sentry/fs/gofer/util.go | 2 +- pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/descriptor.go | 2 +- pkg/sentry/fs/host/descriptor_state.go | 2 +- pkg/sentry/fs/host/descriptor_test.go | 2 +- pkg/sentry/fs/host/device.go | 2 +- pkg/sentry/fs/host/file.go | 2 +- pkg/sentry/fs/host/fs.go | 2 +- pkg/sentry/fs/host/fs_test.go | 2 +- pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/fs/host/inode_state.go | 2 +- pkg/sentry/fs/host/inode_test.go | 2 +- pkg/sentry/fs/host/ioctl_unsafe.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/socket_iovec.go | 2 +- pkg/sentry/fs/host/socket_state.go | 2 +- pkg/sentry/fs/host/socket_test.go | 2 +- pkg/sentry/fs/host/socket_unsafe.go | 2 +- pkg/sentry/fs/host/tty.go | 2 +- pkg/sentry/fs/host/util.go | 2 +- pkg/sentry/fs/host/util_unsafe.go | 2 +- pkg/sentry/fs/host/wait_test.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_inotify.go | 2 +- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 2 +- pkg/sentry/fs/inode_overlay_test.go | 2 +- pkg/sentry/fs/inotify.go | 2 +- pkg/sentry/fs/inotify_event.go | 2 +- pkg/sentry/fs/inotify_watch.go | 2 +- pkg/sentry/fs/lock/lock.go | 2 +- pkg/sentry/fs/lock/lock_range_test.go | 2 +- pkg/sentry/fs/lock/lock_set_functions.go | 2 +- pkg/sentry/fs/lock/lock_test.go | 2 +- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 2 
+- pkg/sentry/fs/mount_overlay.go | 2 +- pkg/sentry/fs/mount_test.go | 2 +- pkg/sentry/fs/mounts.go | 2 +- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/offset.go | 2 +- pkg/sentry/fs/overlay.go | 2 +- pkg/sentry/fs/path.go | 2 +- pkg/sentry/fs/path_test.go | 2 +- pkg/sentry/fs/proc/cpuinfo.go | 2 +- pkg/sentry/fs/proc/device/device.go | 2 +- pkg/sentry/fs/proc/exec_args.go | 2 +- pkg/sentry/fs/proc/fds.go | 2 +- pkg/sentry/fs/proc/filesystems.go | 2 +- pkg/sentry/fs/proc/fs.go | 2 +- pkg/sentry/fs/proc/inode.go | 2 +- pkg/sentry/fs/proc/loadavg.go | 2 +- pkg/sentry/fs/proc/meminfo.go | 2 +- pkg/sentry/fs/proc/mounts.go | 2 +- pkg/sentry/fs/proc/net.go | 2 +- pkg/sentry/fs/proc/net_test.go | 2 +- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/rpcinet_proc.go | 2 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 2 +- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 2 +- pkg/sentry/fs/proc/stat.go | 2 +- pkg/sentry/fs/proc/sys.go | 2 +- pkg/sentry/fs/proc/sys_net.go | 2 +- pkg/sentry/fs/proc/sys_net_state.go | 2 +- pkg/sentry/fs/proc/sys_net_test.go | 2 +- pkg/sentry/fs/proc/task.go | 2 +- pkg/sentry/fs/proc/uid_gid_map.go | 2 +- pkg/sentry/fs/proc/uptime.go | 2 +- pkg/sentry/fs/proc/version.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/ramfs/socket.go | 2 +- pkg/sentry/fs/ramfs/symlink.go | 2 +- pkg/sentry/fs/ramfs/tree.go | 2 +- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/restore.go | 2 +- pkg/sentry/fs/save.go | 2 +- pkg/sentry/fs/seek.go | 2 +- pkg/sentry/fs/sync.go | 2 +- pkg/sentry/fs/sys/device.go | 2 +- pkg/sentry/fs/sys/devices.go | 2 +- pkg/sentry/fs/sys/fs.go | 2 +- pkg/sentry/fs/sys/sys.go | 2 +- pkg/sentry/fs/timerfd/timerfd.go | 2 +- pkg/sentry/fs/tmpfs/device.go | 2 +- pkg/sentry/fs/tmpfs/file_regular.go | 2 +- pkg/sentry/fs/tmpfs/file_test.go | 2 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tmpfs/tmpfs.go | 2 +- pkg/sentry/fs/tty/dir.go | 2 +- pkg/sentry/fs/tty/fs.go | 2 +- 
pkg/sentry/fs/tty/line_discipline.go | 2 +- pkg/sentry/fs/tty/master.go | 2 +- pkg/sentry/fs/tty/queue.go | 2 +- pkg/sentry/fs/tty/slave.go | 2 +- pkg/sentry/fs/tty/terminal.go | 2 +- pkg/sentry/fs/tty/tty_test.go | 2 +- pkg/sentry/hostcpu/getcpu_amd64.s | 2 +- pkg/sentry/hostcpu/hostcpu.go | 2 +- pkg/sentry/hostcpu/hostcpu_test.go | 2 +- pkg/sentry/inet/context.go | 2 +- pkg/sentry/inet/inet.go | 2 +- pkg/sentry/inet/test_stack.go | 2 +- pkg/sentry/kernel/abstract_socket_namespace.go | 2 +- pkg/sentry/kernel/auth/auth.go | 2 +- pkg/sentry/kernel/auth/capability_set.go | 2 +- pkg/sentry/kernel/auth/context.go | 2 +- pkg/sentry/kernel/auth/credentials.go | 2 +- pkg/sentry/kernel/auth/id.go | 2 +- pkg/sentry/kernel/auth/id_map.go | 2 +- pkg/sentry/kernel/auth/id_map_functions.go | 2 +- pkg/sentry/kernel/auth/user_namespace.go | 2 +- pkg/sentry/kernel/context.go | 2 +- pkg/sentry/kernel/contexttest/contexttest.go | 2 +- pkg/sentry/kernel/epoll/epoll.go | 2 +- pkg/sentry/kernel/epoll/epoll_state.go | 2 +- pkg/sentry/kernel/epoll/epoll_test.go | 2 +- pkg/sentry/kernel/eventfd/eventfd.go | 2 +- pkg/sentry/kernel/eventfd/eventfd_test.go | 2 +- pkg/sentry/kernel/fasync/fasync.go | 2 +- pkg/sentry/kernel/fd_map.go | 2 +- pkg/sentry/kernel/fd_map_test.go | 2 +- pkg/sentry/kernel/fs_context.go | 2 +- pkg/sentry/kernel/futex/futex.go | 2 +- pkg/sentry/kernel/futex/futex_test.go | 2 +- pkg/sentry/kernel/ipc_namespace.go | 2 +- pkg/sentry/kernel/kdefs/kdefs.go | 2 +- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/kernel/kernel_state.go | 2 +- pkg/sentry/kernel/memevent/memory_events.go | 2 +- pkg/sentry/kernel/memevent/memory_events.proto | 2 +- pkg/sentry/kernel/pending_signals.go | 2 +- pkg/sentry/kernel/pending_signals_state.go | 2 +- pkg/sentry/kernel/pipe/buffers.go | 2 +- pkg/sentry/kernel/pipe/device.go | 2 +- pkg/sentry/kernel/pipe/node.go | 2 +- pkg/sentry/kernel/pipe/node_test.go | 2 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_test.go | 2 +- 
pkg/sentry/kernel/pipe/reader.go | 2 +- pkg/sentry/kernel/pipe/reader_writer.go | 2 +- pkg/sentry/kernel/pipe/writer.go | 2 +- pkg/sentry/kernel/posixtimer.go | 2 +- pkg/sentry/kernel/ptrace.go | 2 +- pkg/sentry/kernel/ptrace_amd64.go | 2 +- pkg/sentry/kernel/ptrace_arm64.go | 2 +- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/sched/cpuset.go | 2 +- pkg/sentry/kernel/sched/cpuset_test.go | 2 +- pkg/sentry/kernel/sched/sched.go | 2 +- pkg/sentry/kernel/seccomp.go | 2 +- pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/semaphore/semaphore_test.go | 2 +- pkg/sentry/kernel/sessions.go | 2 +- pkg/sentry/kernel/shm/device.go | 2 +- pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/signal.go | 2 +- pkg/sentry/kernel/signal_handlers.go | 2 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/syscalls_state.go | 2 +- pkg/sentry/kernel/syslog.go | 2 +- pkg/sentry/kernel/table_test.go | 2 +- pkg/sentry/kernel/task.go | 2 +- pkg/sentry/kernel/task_acct.go | 2 +- pkg/sentry/kernel/task_block.go | 2 +- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exec.go | 2 +- pkg/sentry/kernel/task_exit.go | 2 +- pkg/sentry/kernel/task_futex.go | 2 +- pkg/sentry/kernel/task_identity.go | 2 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_net.go | 2 +- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_sched.go | 2 +- pkg/sentry/kernel/task_signals.go | 2 +- pkg/sentry/kernel/task_start.go | 2 +- pkg/sentry/kernel/task_stop.go | 2 +- pkg/sentry/kernel/task_syscall.go | 2 +- pkg/sentry/kernel/task_test.go | 2 +- pkg/sentry/kernel/task_usermem.go | 2 +- pkg/sentry/kernel/thread_group.go | 2 +- pkg/sentry/kernel/threads.go | 2 +- pkg/sentry/kernel/time/context.go | 2 +- pkg/sentry/kernel/time/time.go | 2 +- pkg/sentry/kernel/timekeeper.go | 2 +- pkg/sentry/kernel/timekeeper_state.go | 2 +- pkg/sentry/kernel/timekeeper_test.go | 2 +- pkg/sentry/kernel/uncaught_signal.proto | 2 +- 
pkg/sentry/kernel/uts_namespace.go | 2 +- pkg/sentry/kernel/vdso.go | 2 +- pkg/sentry/kernel/version.go | 2 +- pkg/sentry/limits/context.go | 2 +- pkg/sentry/limits/limits.go | 2 +- pkg/sentry/limits/limits_test.go | 2 +- pkg/sentry/limits/linux.go | 2 +- pkg/sentry/loader/elf.go | 2 +- pkg/sentry/loader/interpreter.go | 2 +- pkg/sentry/loader/loader.go | 2 +- pkg/sentry/loader/vdso.go | 2 +- pkg/sentry/loader/vdso_state.go | 2 +- pkg/sentry/memmap/mapping_set.go | 2 +- pkg/sentry/memmap/mapping_set_test.go | 2 +- pkg/sentry/memmap/memmap.go | 2 +- pkg/sentry/memutil/memutil.go | 2 +- pkg/sentry/memutil/memutil_unsafe.go | 2 +- pkg/sentry/mm/address_space.go | 2 +- pkg/sentry/mm/aio_context.go | 2 +- pkg/sentry/mm/aio_context_state.go | 2 +- pkg/sentry/mm/debug.go | 2 +- pkg/sentry/mm/io.go | 2 +- pkg/sentry/mm/lifecycle.go | 2 +- pkg/sentry/mm/metadata.go | 2 +- pkg/sentry/mm/mm.go | 2 +- pkg/sentry/mm/mm_test.go | 2 +- pkg/sentry/mm/pma.go | 2 +- pkg/sentry/mm/procfs.go | 2 +- pkg/sentry/mm/save_restore.go | 2 +- pkg/sentry/mm/shm.go | 2 +- pkg/sentry/mm/special_mappable.go | 2 +- pkg/sentry/mm/syscalls.go | 2 +- pkg/sentry/mm/vma.go | 2 +- pkg/sentry/pgalloc/context.go | 2 +- pkg/sentry/pgalloc/pgalloc.go | 2 +- pkg/sentry/pgalloc/pgalloc_test.go | 2 +- pkg/sentry/pgalloc/pgalloc_unsafe.go | 2 +- pkg/sentry/pgalloc/save_restore.go | 2 +- pkg/sentry/platform/context.go | 2 +- pkg/sentry/platform/interrupt/interrupt.go | 2 +- pkg/sentry/platform/interrupt/interrupt_test.go | 2 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/allocator.go | 2 +- pkg/sentry/platform/kvm/bluepill.go | 2 +- pkg/sentry/platform/kvm/bluepill_amd64.go | 2 +- pkg/sentry/platform/kvm/bluepill_amd64.s | 2 +- pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/bluepill_fault.go | 2 +- pkg/sentry/platform/kvm/bluepill_unsafe.go | 2 +- pkg/sentry/platform/kvm/context.go | 2 +- pkg/sentry/platform/kvm/kvm.go | 2 +- 
pkg/sentry/platform/kvm/kvm_amd64.go | 2 +- pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/kvm_const.go | 2 +- pkg/sentry/platform/kvm/kvm_test.go | 2 +- pkg/sentry/platform/kvm/machine.go | 2 +- pkg/sentry/platform/kvm/machine_amd64.go | 2 +- pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 2 +- pkg/sentry/platform/kvm/machine_unsafe.go | 2 +- pkg/sentry/platform/kvm/physical_map.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil_amd64.go | 2 +- pkg/sentry/platform/kvm/testutil/testutil_amd64.s | 2 +- pkg/sentry/platform/kvm/virtual_map.go | 2 +- pkg/sentry/platform/kvm/virtual_map_test.go | 2 +- pkg/sentry/platform/mmap_min_addr.go | 2 +- pkg/sentry/platform/platform.go | 2 +- pkg/sentry/platform/procid/procid.go | 2 +- pkg/sentry/platform/procid/procid_amd64.s | 2 +- pkg/sentry/platform/procid/procid_arm64.s | 2 +- pkg/sentry/platform/procid/procid_net_test.go | 2 +- pkg/sentry/platform/procid/procid_test.go | 2 +- pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/ptrace_unsafe.go | 2 +- pkg/sentry/platform/ptrace/stub_amd64.s | 2 +- pkg/sentry/platform/ptrace/stub_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- pkg/sentry/platform/ptrace/subprocess_amd64.go | 2 +- pkg/sentry/platform/ptrace/subprocess_linux.go | 2 +- pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go | 2 +- pkg/sentry/platform/ptrace/subprocess_unsafe.go | 2 +- pkg/sentry/platform/ring0/defs.go | 2 +- pkg/sentry/platform/ring0/defs_amd64.go | 2 +- pkg/sentry/platform/ring0/entry_amd64.go | 2 +- pkg/sentry/platform/ring0/entry_amd64.s | 2 +- pkg/sentry/platform/ring0/gen_offsets/main.go | 2 +- pkg/sentry/platform/ring0/kernel.go | 2 +- pkg/sentry/platform/ring0/kernel_amd64.go | 2 +- pkg/sentry/platform/ring0/kernel_unsafe.go | 2 +- pkg/sentry/platform/ring0/lib_amd64.go | 2 +- pkg/sentry/platform/ring0/lib_amd64.s | 2 +- pkg/sentry/platform/ring0/offsets_amd64.go | 2 
+- pkg/sentry/platform/ring0/pagetables/allocator.go | 2 +- pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_test.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables_x86.go | 2 +- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 2 +- pkg/sentry/platform/ring0/pagetables/walker_amd64.go | 2 +- pkg/sentry/platform/ring0/ring0.go | 2 +- pkg/sentry/platform/ring0/x86.go | 2 +- pkg/sentry/platform/safecopy/atomic_amd64.s | 2 +- pkg/sentry/platform/safecopy/safecopy.go | 2 +- pkg/sentry/platform/safecopy/safecopy_test.go | 2 +- pkg/sentry/platform/safecopy/safecopy_unsafe.go | 2 +- pkg/sentry/platform/safecopy/sighandler_amd64.s | 2 +- pkg/sentry/platform/safecopy/sighandler_arm64.s | 2 +- pkg/sentry/safemem/block_unsafe.go | 2 +- pkg/sentry/safemem/io.go | 2 +- pkg/sentry/safemem/io_test.go | 2 +- pkg/sentry/safemem/safemem.go | 2 +- pkg/sentry/safemem/seq_test.go | 2 +- pkg/sentry/safemem/seq_unsafe.go | 2 +- pkg/sentry/sighandling/sighandling.go | 2 +- pkg/sentry/sighandling/sighandling_unsafe.go | 2 +- pkg/sentry/socket/control/control.go | 2 +- pkg/sentry/socket/epsocket/device.go | 2 +- pkg/sentry/socket/epsocket/epsocket.go | 2 +- pkg/sentry/socket/epsocket/provider.go | 2 +- pkg/sentry/socket/epsocket/save_restore.go | 2 +- pkg/sentry/socket/epsocket/stack.go | 2 +- pkg/sentry/socket/hostinet/device.go | 2 +- pkg/sentry/socket/hostinet/hostinet.go | 2 +- pkg/sentry/socket/hostinet/save_restore.go | 2 +- pkg/sentry/socket/hostinet/socket.go | 2 +- pkg/sentry/socket/hostinet/socket_unsafe.go | 2 +- pkg/sentry/socket/hostinet/stack.go | 2 +- pkg/sentry/socket/netlink/message.go | 2 +- pkg/sentry/socket/netlink/port/port.go | 2 +- pkg/sentry/socket/netlink/port/port_test.go | 2 +- 
pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/route/protocol.go | 2 +- pkg/sentry/socket/netlink/socket.go | 2 +- pkg/sentry/socket/rpcinet/conn/conn.go | 2 +- pkg/sentry/socket/rpcinet/device.go | 2 +- pkg/sentry/socket/rpcinet/notifier/notifier.go | 2 +- pkg/sentry/socket/rpcinet/rpcinet.go | 2 +- pkg/sentry/socket/rpcinet/socket.go | 2 +- pkg/sentry/socket/rpcinet/stack.go | 2 +- pkg/sentry/socket/rpcinet/stack_unsafe.go | 2 +- pkg/sentry/socket/socket.go | 2 +- pkg/sentry/socket/unix/device.go | 2 +- pkg/sentry/socket/unix/io.go | 2 +- pkg/sentry/socket/unix/transport/connectioned.go | 2 +- pkg/sentry/socket/unix/transport/connectioned_state.go | 2 +- pkg/sentry/socket/unix/transport/connectionless.go | 2 +- pkg/sentry/socket/unix/transport/queue.go | 2 +- pkg/sentry/socket/unix/transport/unix.go | 2 +- pkg/sentry/socket/unix/unix.go | 2 +- pkg/sentry/state/state.go | 2 +- pkg/sentry/state/state_metadata.go | 2 +- pkg/sentry/state/state_unsafe.go | 2 +- pkg/sentry/strace/capability.go | 2 +- pkg/sentry/strace/clone.go | 2 +- pkg/sentry/strace/futex.go | 2 +- pkg/sentry/strace/linux64.go | 2 +- pkg/sentry/strace/open.go | 2 +- pkg/sentry/strace/poll.go | 2 +- pkg/sentry/strace/ptrace.go | 2 +- pkg/sentry/strace/signal.go | 2 +- pkg/sentry/strace/socket.go | 2 +- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/strace/strace.proto | 2 +- pkg/sentry/strace/syscalls.go | 2 +- pkg/sentry/syscalls/epoll.go | 2 +- pkg/sentry/syscalls/linux/error.go | 2 +- pkg/sentry/syscalls/linux/flags.go | 2 +- pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sigset.go | 2 +- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- pkg/sentry/syscalls/linux/sys_capability.go | 2 +- pkg/sentry/syscalls/linux/sys_epoll.go | 2 +- pkg/sentry/syscalls/linux/sys_eventfd.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 2 +- pkg/sentry/syscalls/linux/sys_futex.go | 2 +- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- pkg/sentry/syscalls/linux/sys_identity.go 
| 2 +- pkg/sentry/syscalls/linux/sys_inotify.go | 2 +- pkg/sentry/syscalls/linux/sys_lseek.go | 2 +- pkg/sentry/syscalls/linux/sys_mmap.go | 2 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- pkg/sentry/syscalls/linux/sys_pipe.go | 2 +- pkg/sentry/syscalls/linux/sys_poll.go | 2 +- pkg/sentry/syscalls/linux/sys_prctl.go | 2 +- pkg/sentry/syscalls/linux/sys_random.go | 2 +- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/sys_rlimit.go | 2 +- pkg/sentry/syscalls/linux/sys_rusage.go | 2 +- pkg/sentry/syscalls/linux/sys_sched.go | 2 +- pkg/sentry/syscalls/linux/sys_seccomp.go | 2 +- pkg/sentry/syscalls/linux/sys_sem.go | 2 +- pkg/sentry/syscalls/linux/sys_shm.go | 2 +- pkg/sentry/syscalls/linux/sys_signal.go | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 2 +- pkg/sentry/syscalls/linux/sys_stat.go | 2 +- pkg/sentry/syscalls/linux/sys_sync.go | 2 +- pkg/sentry/syscalls/linux/sys_sysinfo.go | 2 +- pkg/sentry/syscalls/linux/sys_syslog.go | 2 +- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- pkg/sentry/syscalls/linux/sys_time.go | 2 +- pkg/sentry/syscalls/linux/sys_timer.go | 2 +- pkg/sentry/syscalls/linux/sys_timerfd.go | 2 +- pkg/sentry/syscalls/linux/sys_tls.go | 2 +- pkg/sentry/syscalls/linux/sys_utsname.go | 2 +- pkg/sentry/syscalls/linux/sys_write.go | 2 +- pkg/sentry/syscalls/linux/timespec.go | 2 +- pkg/sentry/syscalls/syscalls.go | 2 +- pkg/sentry/time/calibrated_clock.go | 2 +- pkg/sentry/time/calibrated_clock_test.go | 2 +- pkg/sentry/time/clock_id.go | 2 +- pkg/sentry/time/clocks.go | 2 +- pkg/sentry/time/muldiv_amd64.s | 2 +- pkg/sentry/time/muldiv_arm64.s | 2 +- pkg/sentry/time/parameters.go | 2 +- pkg/sentry/time/parameters_test.go | 2 +- pkg/sentry/time/sampler.go | 2 +- pkg/sentry/time/sampler_test.go | 2 +- pkg/sentry/time/sampler_unsafe.go | 2 +- pkg/sentry/time/tsc_amd64.s | 2 +- pkg/sentry/time/tsc_arm64.s | 2 +- pkg/sentry/unimpl/events.go | 2 +- pkg/sentry/unimpl/unimplemented_syscall.proto | 2 +- pkg/sentry/uniqueid/context.go 
| 2 +- pkg/sentry/usage/cpu.go | 2 +- pkg/sentry/usage/io.go | 2 +- pkg/sentry/usage/memory.go | 2 +- pkg/sentry/usage/memory_unsafe.go | 2 +- pkg/sentry/usage/usage.go | 2 +- pkg/sentry/usermem/access_type.go | 2 +- pkg/sentry/usermem/addr.go | 2 +- pkg/sentry/usermem/addr_range_seq_test.go | 2 +- pkg/sentry/usermem/addr_range_seq_unsafe.go | 2 +- pkg/sentry/usermem/bytes_io.go | 2 +- pkg/sentry/usermem/bytes_io_unsafe.go | 2 +- pkg/sentry/usermem/usermem.go | 2 +- pkg/sentry/usermem/usermem_arm64.go | 2 +- pkg/sentry/usermem/usermem_test.go | 2 +- pkg/sentry/usermem/usermem_unsafe.go | 2 +- pkg/sentry/usermem/usermem_x86.go | 2 +- pkg/sentry/watchdog/watchdog.go | 2 +- pkg/sleep/commit_amd64.s | 2 +- pkg/sleep/commit_asm.go | 2 +- pkg/sleep/commit_noasm.go | 2 +- pkg/sleep/empty.s | 2 +- pkg/sleep/sleep_test.go | 2 +- pkg/sleep/sleep_unsafe.go | 2 +- pkg/state/decode.go | 2 +- pkg/state/encode.go | 2 +- pkg/state/encode_unsafe.go | 2 +- pkg/state/map.go | 2 +- pkg/state/object.proto | 2 +- pkg/state/printer.go | 2 +- pkg/state/state.go | 2 +- pkg/state/state_test.go | 2 +- pkg/state/statefile/statefile.go | 2 +- pkg/state/statefile/statefile_test.go | 2 +- pkg/state/stats.go | 2 +- pkg/syserr/host_linux.go | 2 +- pkg/syserr/netstack.go | 2 +- pkg/syserr/syserr.go | 2 +- pkg/syserror/syserror.go | 2 +- pkg/syserror/syserror_test.go | 2 +- pkg/tcpip/adapters/gonet/gonet.go | 2 +- pkg/tcpip/adapters/gonet/gonet_test.go | 2 +- pkg/tcpip/buffer/prependable.go | 2 +- pkg/tcpip/buffer/view.go | 2 +- pkg/tcpip/buffer/view_test.go | 2 +- pkg/tcpip/checker/checker.go | 2 +- pkg/tcpip/hash/jenkins/jenkins.go | 2 +- pkg/tcpip/hash/jenkins/jenkins_test.go | 2 +- pkg/tcpip/header/arp.go | 2 +- pkg/tcpip/header/checksum.go | 2 +- pkg/tcpip/header/eth.go | 2 +- pkg/tcpip/header/gue.go | 2 +- pkg/tcpip/header/icmpv4.go | 2 +- pkg/tcpip/header/icmpv6.go | 2 +- pkg/tcpip/header/interfaces.go | 2 +- pkg/tcpip/header/ipv4.go | 2 +- pkg/tcpip/header/ipv6.go | 2 +- 
pkg/tcpip/header/ipv6_fragment.go | 2 +- pkg/tcpip/header/ipversion_test.go | 2 +- pkg/tcpip/header/tcp.go | 2 +- pkg/tcpip/header/tcp_test.go | 2 +- pkg/tcpip/header/udp.go | 2 +- pkg/tcpip/link/channel/channel.go | 2 +- pkg/tcpip/link/fdbased/endpoint.go | 2 +- pkg/tcpip/link/fdbased/endpoint_test.go | 2 +- pkg/tcpip/link/fdbased/endpoint_unsafe.go | 2 +- pkg/tcpip/link/fdbased/mmap.go | 2 +- pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go | 2 +- pkg/tcpip/link/loopback/loopback.go | 2 +- pkg/tcpip/link/muxed/injectable.go | 2 +- pkg/tcpip/link/muxed/injectable_test.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_amd64.s | 2 +- pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go | 2 +- pkg/tcpip/link/rawfile/blockingpoll_unsafe.go | 2 +- pkg/tcpip/link/rawfile/errors.go | 2 +- pkg/tcpip/link/rawfile/rawfile_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe_test.go | 2 +- pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/pipe/rx.go | 2 +- pkg/tcpip/link/sharedmem/pipe/tx.go | 2 +- pkg/tcpip/link/sharedmem/queue/queue_test.go | 2 +- pkg/tcpip/link/sharedmem/queue/rx.go | 2 +- pkg/tcpip/link/sharedmem/queue/tx.go | 2 +- pkg/tcpip/link/sharedmem/rx.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_test.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_unsafe.go | 2 +- pkg/tcpip/link/sharedmem/tx.go | 2 +- pkg/tcpip/link/sniffer/pcap.go | 2 +- pkg/tcpip/link/sniffer/sniffer.go | 2 +- pkg/tcpip/link/tun/tun_unsafe.go | 2 +- pkg/tcpip/link/waitable/waitable.go | 2 +- pkg/tcpip/link/waitable/waitable_test.go | 2 +- pkg/tcpip/network/arp/arp.go | 2 +- pkg/tcpip/network/arp/arp_test.go | 2 +- pkg/tcpip/network/fragmentation/frag_heap.go | 2 +- pkg/tcpip/network/fragmentation/frag_heap_test.go | 2 +- pkg/tcpip/network/fragmentation/fragmentation.go | 2 +- pkg/tcpip/network/fragmentation/fragmentation_test.go | 2 +- pkg/tcpip/network/fragmentation/reassembler.go | 2 +- 
pkg/tcpip/network/fragmentation/reassembler_test.go | 2 +- pkg/tcpip/network/hash/hash.go | 2 +- pkg/tcpip/network/ip_test.go | 2 +- pkg/tcpip/network/ipv4/icmp.go | 2 +- pkg/tcpip/network/ipv4/ipv4.go | 2 +- pkg/tcpip/network/ipv4/ipv4_test.go | 2 +- pkg/tcpip/network/ipv6/icmp.go | 2 +- pkg/tcpip/network/ipv6/icmp_test.go | 2 +- pkg/tcpip/network/ipv6/ipv6.go | 2 +- pkg/tcpip/ports/ports.go | 2 +- pkg/tcpip/ports/ports_test.go | 2 +- pkg/tcpip/sample/tun_tcp_connect/main.go | 2 +- pkg/tcpip/sample/tun_tcp_echo/main.go | 2 +- pkg/tcpip/seqnum/seqnum.go | 2 +- pkg/tcpip/stack/linkaddrcache.go | 2 +- pkg/tcpip/stack/linkaddrcache_test.go | 2 +- pkg/tcpip/stack/nic.go | 2 +- pkg/tcpip/stack/registration.go | 2 +- pkg/tcpip/stack/route.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/stack/stack_global_state.go | 2 +- pkg/tcpip/stack/stack_test.go | 2 +- pkg/tcpip/stack/transport_demuxer.go | 2 +- pkg/tcpip/stack/transport_test.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/tcpip_test.go | 2 +- pkg/tcpip/time.s | 2 +- pkg/tcpip/time_unsafe.go | 2 +- pkg/tcpip/transport/icmp/endpoint.go | 2 +- pkg/tcpip/transport/icmp/endpoint_state.go | 2 +- pkg/tcpip/transport/icmp/protocol.go | 2 +- pkg/tcpip/transport/raw/raw.go | 2 +- pkg/tcpip/transport/raw/state.go | 2 +- pkg/tcpip/transport/tcp/accept.go | 2 +- pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/cubic.go | 2 +- pkg/tcpip/transport/tcp/dual_stack_test.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/tcp/rcv.go | 2 +- pkg/tcpip/transport/tcp/reno.go | 2 +- pkg/tcpip/transport/tcp/sack.go | 2 +- pkg/tcpip/transport/tcp/sack_scoreboard.go | 2 +- pkg/tcpip/transport/tcp/sack_scoreboard_test.go | 2 +- pkg/tcpip/transport/tcp/segment.go | 2 +- pkg/tcpip/transport/tcp/segment_heap.go | 2 +- pkg/tcpip/transport/tcp/segment_queue.go | 2 +- 
pkg/tcpip/transport/tcp/segment_state.go | 2 +- pkg/tcpip/transport/tcp/snd.go | 2 +- pkg/tcpip/transport/tcp/snd_state.go | 2 +- pkg/tcpip/transport/tcp/tcp_sack_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_timestamp_test.go | 2 +- pkg/tcpip/transport/tcp/testing/context/context.go | 2 +- pkg/tcpip/transport/tcp/timer.go | 2 +- pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go | 2 +- pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go | 2 +- pkg/tcpip/transport/udp/endpoint.go | 2 +- pkg/tcpip/transport/udp/endpoint_state.go | 2 +- pkg/tcpip/transport/udp/forwarder.go | 2 +- pkg/tcpip/transport/udp/protocol.go | 2 +- pkg/tcpip/transport/udp/udp_test.go | 2 +- pkg/tmutex/tmutex.go | 2 +- pkg/tmutex/tmutex_test.go | 2 +- pkg/unet/unet.go | 2 +- pkg/unet/unet_test.go | 2 +- pkg/unet/unet_unsafe.go | 2 +- pkg/urpc/urpc.go | 2 +- pkg/urpc/urpc_test.go | 2 +- pkg/waiter/waiter.go | 2 +- pkg/waiter/waiter_test.go | 2 +- runsc/boot/compat.go | 2 +- runsc/boot/compat_amd64.go | 2 +- runsc/boot/compat_test.go | 2 +- runsc/boot/config.go | 2 +- runsc/boot/controller.go | 2 +- runsc/boot/debug.go | 2 +- runsc/boot/events.go | 2 +- runsc/boot/fds.go | 2 +- runsc/boot/filter/config.go | 2 +- runsc/boot/filter/extra_filters.go | 2 +- runsc/boot/filter/extra_filters_msan.go | 2 +- runsc/boot/filter/extra_filters_race.go | 2 +- runsc/boot/filter/filter.go | 2 +- runsc/boot/fs.go | 2 +- runsc/boot/limits.go | 2 +- runsc/boot/loader.go | 2 +- runsc/boot/loader_test.go | 2 +- runsc/boot/network.go | 2 +- runsc/boot/strace.go | 2 +- runsc/cgroup/cgroup.go | 2 +- runsc/cgroup/cgroup_test.go | 2 +- runsc/cmd/boot.go | 2 +- runsc/cmd/capability.go | 2 +- runsc/cmd/capability_test.go | 2 +- runsc/cmd/checkpoint.go | 2 +- runsc/cmd/chroot.go | 2 +- runsc/cmd/cmd.go | 2 +- runsc/cmd/create.go | 2 +- runsc/cmd/debug.go | 2 +- runsc/cmd/delete.go | 2 +- runsc/cmd/delete_test.go | 2 +- runsc/cmd/do.go | 2 +- runsc/cmd/events.go | 2 +- 
runsc/cmd/exec.go | 2 +- runsc/cmd/exec_test.go | 2 +- runsc/cmd/gofer.go | 2 +- runsc/cmd/gofer_test.go | 2 +- runsc/cmd/kill.go | 2 +- runsc/cmd/list.go | 2 +- runsc/cmd/path.go | 2 +- runsc/cmd/pause.go | 2 +- runsc/cmd/ps.go | 2 +- runsc/cmd/restore.go | 2 +- runsc/cmd/resume.go | 2 +- runsc/cmd/run.go | 2 +- runsc/cmd/spec.go | 2 +- runsc/cmd/start.go | 2 +- runsc/cmd/state.go | 2 +- runsc/cmd/wait.go | 2 +- runsc/console/console.go | 2 +- runsc/container/console_test.go | 2 +- runsc/container/container.go | 2 +- runsc/container/container_test.go | 2 +- runsc/container/hook.go | 2 +- runsc/container/multi_container_test.go | 2 +- runsc/container/shared_volume_test.go | 2 +- runsc/container/status.go | 2 +- runsc/container/test_app.go | 2 +- runsc/fsgofer/filter/config.go | 2 +- runsc/fsgofer/filter/extra_filters.go | 2 +- runsc/fsgofer/filter/extra_filters_msan.go | 2 +- runsc/fsgofer/filter/extra_filters_race.go | 2 +- runsc/fsgofer/filter/filter.go | 2 +- runsc/fsgofer/fsgofer.go | 2 +- runsc/fsgofer/fsgofer_test.go | 2 +- runsc/fsgofer/fsgofer_unsafe.go | 2 +- runsc/main.go | 2 +- runsc/sandbox/network.go | 2 +- runsc/sandbox/network_unsafe.go | 2 +- runsc/sandbox/sandbox.go | 2 +- runsc/specutils/fs.go | 2 +- runsc/specutils/namespace.go | 2 +- runsc/specutils/specutils.go | 2 +- runsc/specutils/specutils_test.go | 2 +- runsc/test/image/image.go | 2 +- runsc/test/image/image_test.go | 2 +- runsc/test/image/mysql.sql | 2 +- runsc/test/image/ruby.rb | 2 +- runsc/test/image/ruby.sh | 2 +- runsc/test/install.sh | 2 +- runsc/test/integration/exec_test.go | 2 +- runsc/test/integration/integration.go | 2 +- runsc/test/integration/integration_test.go | 2 +- runsc/test/root/cgroup_test.go | 2 +- runsc/test/root/chroot_test.go | 2 +- runsc/test/root/crictl_test.go | 2 +- runsc/test/root/root.go | 2 +- runsc/test/root/testdata/busybox.go | 2 +- runsc/test/root/testdata/containerd_config.go | 2 +- runsc/test/root/testdata/httpd.go | 2 +- 
runsc/test/root/testdata/httpd_mount_paths.go | 2 +- runsc/test/root/testdata/sandbox.go | 2 +- runsc/test/testutil/crictl.go | 2 +- runsc/test/testutil/docker.go | 2 +- runsc/test/testutil/testutil.go | 2 +- runsc/test/testutil/testutil_race.go | 2 +- runsc/tools/dockercfg/dockercfg.go | 2 +- runsc/version.go | 2 +- test/syscalls/gtest/gtest.go | 2 +- test/syscalls/linux/32bit.cc | 2 +- test/syscalls/linux/accept_bind.cc | 2 +- test/syscalls/linux/accept_bind_stream.cc | 2 +- test/syscalls/linux/access.cc | 2 +- test/syscalls/linux/affinity.cc | 2 +- test/syscalls/linux/aio.cc | 2 +- test/syscalls/linux/alarm.cc | 2 +- test/syscalls/linux/arch_prctl.cc | 2 +- test/syscalls/linux/bad.cc | 2 +- test/syscalls/linux/base_poll_test.cc | 2 +- test/syscalls/linux/base_poll_test.h | 2 +- test/syscalls/linux/bind.cc | 2 +- test/syscalls/linux/brk.cc | 2 +- test/syscalls/linux/chdir.cc | 2 +- test/syscalls/linux/chmod.cc | 2 +- test/syscalls/linux/chown.cc | 2 +- test/syscalls/linux/chroot.cc | 2 +- test/syscalls/linux/clock_getres.cc | 2 +- test/syscalls/linux/clock_gettime.cc | 2 +- test/syscalls/linux/clock_nanosleep.cc | 2 +- test/syscalls/linux/concurrency.cc | 2 +- test/syscalls/linux/creat.cc | 2 +- test/syscalls/linux/dev.cc | 2 +- test/syscalls/linux/dup.cc | 2 +- test/syscalls/linux/epoll.cc | 2 +- test/syscalls/linux/eventfd.cc | 2 +- test/syscalls/linux/exceptions.cc | 2 +- test/syscalls/linux/exec.cc | 2 +- test/syscalls/linux/exec.h | 2 +- test/syscalls/linux/exec_assert_closed_workload.cc | 2 +- test/syscalls/linux/exec_basic_workload.cc | 2 +- test/syscalls/linux/exec_binary.cc | 2 +- test/syscalls/linux/exec_proc_exe_workload.cc | 2 +- test/syscalls/linux/exec_state_workload.cc | 2 +- test/syscalls/linux/exit.cc | 2 +- test/syscalls/linux/exit_script.sh | 2 +- test/syscalls/linux/fadvise64.cc | 2 +- test/syscalls/linux/fallocate.cc | 2 +- test/syscalls/linux/fault.cc | 2 +- test/syscalls/linux/fchdir.cc | 2 +- test/syscalls/linux/fcntl.cc | 2 +- 
test/syscalls/linux/file_base.h | 2 +- test/syscalls/linux/flock.cc | 2 +- test/syscalls/linux/fork.cc | 2 +- test/syscalls/linux/fpsig_fork.cc | 2 +- test/syscalls/linux/fpsig_nested.cc | 2 +- test/syscalls/linux/fsync.cc | 2 +- test/syscalls/linux/futex.cc | 2 +- test/syscalls/linux/getcpu.cc | 2 +- test/syscalls/linux/getdents.cc | 2 +- test/syscalls/linux/getrandom.cc | 2 +- test/syscalls/linux/getrusage.cc | 2 +- test/syscalls/linux/inotify.cc | 2 +- test/syscalls/linux/ioctl.cc | 2 +- test/syscalls/linux/ip_socket_test_util.cc | 2 +- test/syscalls/linux/ip_socket_test_util.h | 2 +- test/syscalls/linux/itimer.cc | 2 +- test/syscalls/linux/kill.cc | 2 +- test/syscalls/linux/link.cc | 2 +- test/syscalls/linux/lseek.cc | 2 +- test/syscalls/linux/madvise.cc | 2 +- test/syscalls/linux/memfd.cc | 2 +- test/syscalls/linux/memory_accounting.cc | 2 +- test/syscalls/linux/mempolicy.cc | 2 +- test/syscalls/linux/mincore.cc | 2 +- test/syscalls/linux/mkdir.cc | 2 +- test/syscalls/linux/mknod.cc | 2 +- test/syscalls/linux/mlock.cc | 2 +- test/syscalls/linux/mmap.cc | 2 +- test/syscalls/linux/mount.cc | 2 +- test/syscalls/linux/mremap.cc | 2 +- test/syscalls/linux/msync.cc | 2 +- test/syscalls/linux/munmap.cc | 2 +- test/syscalls/linux/open.cc | 2 +- test/syscalls/linux/open_create.cc | 2 +- test/syscalls/linux/partial_bad_buffer.cc | 2 +- test/syscalls/linux/pause.cc | 2 +- test/syscalls/linux/pipe.cc | 2 +- test/syscalls/linux/poll.cc | 2 +- test/syscalls/linux/ppoll.cc | 2 +- test/syscalls/linux/prctl.cc | 2 +- test/syscalls/linux/prctl_setuid.cc | 2 +- test/syscalls/linux/pread64.cc | 2 +- test/syscalls/linux/preadv.cc | 2 +- test/syscalls/linux/preadv2.cc | 2 +- test/syscalls/linux/priority.cc | 2 +- test/syscalls/linux/priority_execve.cc | 2 +- test/syscalls/linux/proc.cc | 2 +- test/syscalls/linux/proc_net.cc | 2 +- test/syscalls/linux/proc_net_unix.cc | 2 +- test/syscalls/linux/proc_pid_smaps.cc | 2 +- test/syscalls/linux/proc_pid_uid_gid_map.cc | 2 +- 
test/syscalls/linux/pselect.cc | 2 +- test/syscalls/linux/ptrace.cc | 2 +- test/syscalls/linux/pty.cc | 2 +- test/syscalls/linux/pwrite64.cc | 2 +- test/syscalls/linux/pwritev2.cc | 2 +- test/syscalls/linux/raw_socket_ipv4.cc | 2 +- test/syscalls/linux/read.cc | 2 +- test/syscalls/linux/readv.cc | 2 +- test/syscalls/linux/readv_common.cc | 2 +- test/syscalls/linux/readv_common.h | 2 +- test/syscalls/linux/readv_socket.cc | 2 +- test/syscalls/linux/rename.cc | 2 +- test/syscalls/linux/rlimits.cc | 2 +- test/syscalls/linux/rtsignal.cc | 2 +- test/syscalls/linux/sched.cc | 2 +- test/syscalls/linux/sched_yield.cc | 2 +- test/syscalls/linux/seccomp.cc | 2 +- test/syscalls/linux/select.cc | 2 +- test/syscalls/linux/semaphore.cc | 2 +- test/syscalls/linux/sendfile.cc | 2 +- test/syscalls/linux/sendfile_socket.cc | 2 +- test/syscalls/linux/shm.cc | 2 +- test/syscalls/linux/sigaction.cc | 2 +- test/syscalls/linux/sigaltstack.cc | 2 +- test/syscalls/linux/sigaltstack_check.cc | 2 +- test/syscalls/linux/sigiret.cc | 2 +- test/syscalls/linux/sigprocmask.cc | 2 +- test/syscalls/linux/sigstop.cc | 2 +- test/syscalls/linux/sigtimedwait.cc | 2 +- test/syscalls/linux/socket_abstract.cc | 2 +- test/syscalls/linux/socket_blocking.cc | 2 +- test/syscalls/linux/socket_blocking.h | 2 +- test/syscalls/linux/socket_filesystem.cc | 2 +- test/syscalls/linux/socket_generic.cc | 2 +- test/syscalls/linux/socket_generic.h | 2 +- test/syscalls/linux/socket_inet_loopback.cc | 2 +- test/syscalls/linux/socket_ip_loopback_blocking.cc | 2 +- test/syscalls/linux/socket_ip_tcp_generic.cc | 2 +- test/syscalls/linux/socket_ip_tcp_generic.h | 2 +- test/syscalls/linux/socket_ip_tcp_generic_loopback.cc | 2 +- test/syscalls/linux/socket_ip_tcp_loopback.cc | 2 +- test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc | 2 +- test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc | 2 +- test/syscalls/linux/socket_ip_tcp_udp_generic.cc | 2 +- test/syscalls/linux/socket_ip_udp_generic.cc | 2 +- 
test/syscalls/linux/socket_ip_udp_generic.h | 2 +- test/syscalls/linux/socket_ip_udp_loopback.cc | 2 +- test/syscalls/linux/socket_ip_udp_loopback_blocking.cc | 2 +- test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc | 2 +- .../syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc | 2 +- test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h | 2 +- .../linux/socket_ipv4_tcp_unbound_external_networking_test.cc | 2 +- test/syscalls/linux/socket_ipv4_udp_unbound.cc | 2 +- test/syscalls/linux/socket_ipv4_udp_unbound.h | 2 +- .../syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc | 2 +- test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h | 2 +- .../linux/socket_ipv4_udp_unbound_external_networking_test.cc | 2 +- test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc | 2 +- test/syscalls/linux/socket_netdevice.cc | 2 +- test/syscalls/linux/socket_netlink_route.cc | 2 +- test/syscalls/linux/socket_netlink_util.cc | 2 +- test/syscalls/linux/socket_netlink_util.h | 2 +- test/syscalls/linux/socket_non_blocking.cc | 2 +- test/syscalls/linux/socket_non_blocking.h | 2 +- test/syscalls/linux/socket_non_stream.cc | 2 +- test/syscalls/linux/socket_non_stream.h | 2 +- test/syscalls/linux/socket_non_stream_blocking.cc | 2 +- test/syscalls/linux/socket_non_stream_blocking.h | 2 +- test/syscalls/linux/socket_stream.cc | 2 +- test/syscalls/linux/socket_stream.h | 2 +- test/syscalls/linux/socket_stream_blocking.cc | 2 +- test/syscalls/linux/socket_stream_blocking.h | 2 +- test/syscalls/linux/socket_stream_nonblock.cc | 2 +- test/syscalls/linux/socket_stream_nonblock.h | 2 +- test/syscalls/linux/socket_test_util.cc | 2 +- test/syscalls/linux/socket_test_util.h | 2 +- test/syscalls/linux/socket_unix.cc | 2 +- test/syscalls/linux/socket_unix.h | 2 +- test/syscalls/linux/socket_unix_abstract.cc | 2 +- test/syscalls/linux/socket_unix_abstract_nonblock.cc | 2 +- test/syscalls/linux/socket_unix_blocking_local.cc | 2 +- 
test/syscalls/linux/socket_unix_dgram.cc | 2 +- test/syscalls/linux/socket_unix_dgram.h | 2 +- test/syscalls/linux/socket_unix_dgram_local.cc | 2 +- test/syscalls/linux/socket_unix_dgram_non_blocking.cc | 2 +- test/syscalls/linux/socket_unix_domain.cc | 2 +- test/syscalls/linux/socket_unix_filesystem.cc | 2 +- test/syscalls/linux/socket_unix_filesystem_nonblock.cc | 2 +- test/syscalls/linux/socket_unix_non_stream.cc | 2 +- test/syscalls/linux/socket_unix_non_stream.h | 2 +- test/syscalls/linux/socket_unix_non_stream_blocking_local.cc | 2 +- test/syscalls/linux/socket_unix_pair.cc | 2 +- test/syscalls/linux/socket_unix_pair_nonblock.cc | 2 +- test/syscalls/linux/socket_unix_seqpacket.cc | 2 +- test/syscalls/linux/socket_unix_seqpacket.h | 2 +- test/syscalls/linux/socket_unix_seqpacket_local.cc | 2 +- test/syscalls/linux/socket_unix_stream.cc | 2 +- test/syscalls/linux/socket_unix_stream_blocking_local.cc | 2 +- test/syscalls/linux/socket_unix_stream_local.cc | 2 +- test/syscalls/linux/socket_unix_stream_nonblock_local.cc | 2 +- test/syscalls/linux/socket_unix_unbound_abstract.cc | 2 +- test/syscalls/linux/socket_unix_unbound_dgram.cc | 2 +- test/syscalls/linux/socket_unix_unbound_filesystem.cc | 2 +- test/syscalls/linux/socket_unix_unbound_seqpacket.cc | 2 +- test/syscalls/linux/socket_unix_unbound_stream.cc | 2 +- test/syscalls/linux/stat.cc | 2 +- test/syscalls/linux/stat_times.cc | 2 +- test/syscalls/linux/statfs.cc | 2 +- test/syscalls/linux/sticky.cc | 2 +- test/syscalls/linux/symlink.cc | 2 +- test/syscalls/linux/sync.cc | 2 +- test/syscalls/linux/sync_file_range.cc | 2 +- test/syscalls/linux/sysinfo.cc | 2 +- test/syscalls/linux/syslog.cc | 2 +- test/syscalls/linux/sysret.cc | 2 +- test/syscalls/linux/tcp_socket.cc | 2 +- test/syscalls/linux/temp_umask.h | 2 +- test/syscalls/linux/tgkill.cc | 2 +- test/syscalls/linux/time.cc | 2 +- test/syscalls/linux/timerfd.cc | 2 +- test/syscalls/linux/timers.cc | 2 +- test/syscalls/linux/tkill.cc | 2 +- 
test/syscalls/linux/truncate.cc | 2 +- test/syscalls/linux/udp_bind.cc | 2 +- test/syscalls/linux/udp_socket.cc | 2 +- test/syscalls/linux/uidgid.cc | 2 +- test/syscalls/linux/uname.cc | 2 +- test/syscalls/linux/unix_domain_socket_test_util.cc | 2 +- test/syscalls/linux/unix_domain_socket_test_util.h | 2 +- test/syscalls/linux/unlink.cc | 2 +- test/syscalls/linux/unshare.cc | 2 +- test/syscalls/linux/utimes.cc | 2 +- test/syscalls/linux/vdso.cc | 2 +- test/syscalls/linux/vdso_clock_gettime.cc | 2 +- test/syscalls/linux/vfork.cc | 2 +- test/syscalls/linux/vsyscall.cc | 2 +- test/syscalls/linux/wait.cc | 2 +- test/syscalls/linux/write.cc | 2 +- test/syscalls/syscall_test_runner.go | 2 +- test/syscalls/syscall_test_runner.sh | 2 +- test/util/capability_util.cc | 2 +- test/util/capability_util.h | 2 +- test/util/cleanup.h | 2 +- test/util/epoll_util.cc | 2 +- test/util/epoll_util.h | 2 +- test/util/eventfd_util.h | 2 +- test/util/file_descriptor.h | 2 +- test/util/fs_util.cc | 2 +- test/util/fs_util.h | 2 +- test/util/fs_util_test.cc | 2 +- test/util/logging.cc | 2 +- test/util/logging.h | 2 +- test/util/memory_util.h | 2 +- test/util/mount_util.h | 2 +- test/util/multiprocess_util.cc | 2 +- test/util/multiprocess_util.h | 2 +- test/util/posix_error.cc | 2 +- test/util/posix_error.h | 2 +- test/util/posix_error_test.cc | 2 +- test/util/proc_util.cc | 2 +- test/util/proc_util.h | 2 +- test/util/proc_util_test.cc | 2 +- test/util/rlimit_util.cc | 2 +- test/util/rlimit_util.h | 2 +- test/util/save_util.cc | 2 +- test/util/save_util.h | 2 +- test/util/signal_util.cc | 2 +- test/util/signal_util.h | 2 +- test/util/temp_path.cc | 2 +- test/util/temp_path.h | 2 +- test/util/test_main.cc | 2 +- test/util/test_util.cc | 2 +- test/util/test_util.h | 2 +- test/util/test_util_test.cc | 2 +- test/util/thread_util.h | 2 +- test/util/timer_util.cc | 2 +- test/util/timer_util.h | 2 +- third_party/gvsync/atomicptr_unsafe.go | 2 +- third_party/gvsync/atomicptrtest/atomicptr_test.go | 2 
+- third_party/gvsync/downgradable_rwmutex_test.go | 2 +- third_party/gvsync/downgradable_rwmutex_unsafe.go | 2 +- third_party/gvsync/gvsync.go | 2 +- third_party/gvsync/memmove_unsafe.go | 2 +- third_party/gvsync/norace_unsafe.go | 2 +- third_party/gvsync/race_unsafe.go | 2 +- third_party/gvsync/seqatomic_unsafe.go | 2 +- third_party/gvsync/seqatomictest/seqatomic_test.go | 2 +- third_party/gvsync/seqcount.go | 2 +- third_party/gvsync/seqcount_test.go | 2 +- tools/go_generics/generics.go | 2 +- tools/go_generics/generics_tests/all_stmts/input.go | 2 +- tools/go_generics/generics_tests/all_stmts/output/output.go | 2 +- tools/go_generics/generics_tests/all_types/input.go | 2 +- tools/go_generics/generics_tests/all_types/lib/lib.go | 2 +- tools/go_generics/generics_tests/all_types/output/output.go | 2 +- tools/go_generics/generics_tests/consts/input.go | 2 +- tools/go_generics/generics_tests/consts/output/output.go | 2 +- tools/go_generics/generics_tests/imports/input.go | 2 +- tools/go_generics/generics_tests/imports/output/output.go | 2 +- tools/go_generics/generics_tests/remove_typedef/input.go | 2 +- tools/go_generics/generics_tests/remove_typedef/output/output.go | 2 +- tools/go_generics/generics_tests/simple/input.go | 2 +- tools/go_generics/generics_tests/simple/output/output.go | 2 +- tools/go_generics/globals/globals_visitor.go | 2 +- tools/go_generics/globals/scope.go | 2 +- tools/go_generics/go_generics_unittest.sh | 2 +- tools/go_generics/go_merge/main.go | 2 +- tools/go_generics/imports.go | 2 +- tools/go_generics/remove.go | 2 +- tools/go_generics/rules_tests/template.go | 2 +- tools/go_generics/rules_tests/template_test.go | 2 +- tools/go_stateify/main.go | 2 +- tools/tag_release.sh | 2 +- tools/workspace_status.sh | 2 +- vdso/barrier.h | 2 +- vdso/check_vdso.py | 2 +- vdso/compiler.h | 2 +- vdso/cycle_clock.h | 2 +- vdso/seqlock.h | 2 +- vdso/syscalls.h | 2 +- vdso/vdso.cc | 2 +- vdso/vdso_time.cc | 2 +- vdso/vdso_time.h | 2 +- 1235 files changed, 
1242 insertions(+), 1234 deletions(-) create mode 100644 AUTHORS (limited to 'pkg/sentry') diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..01ba46567 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,8 @@ +# This is the list of gVisor authors for copyright purposes. +# +# This does not necessarily list everyone who has contributed code, since in +# some cases, their employer may be the copyright holder. To see the full list +# of contributors, see the revision history in source control. +# +# Please send a patch if you would like to be included in this list. +Google LLC diff --git a/kokoro/run_build.sh b/kokoro/run_build.sh index 89e24b037..63fffda48 100755 --- a/kokoro/run_build.sh +++ b/kokoro/run_build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh index 8a3ce7402..08f678e39 100755 --- a/kokoro/run_tests.sh +++ b/kokoro/run_tests.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/pkg/abi/abi.go b/pkg/abi/abi.go index 7770f0405..d56c481c9 100644 --- a/pkg/abi/abi.go +++ b/pkg/abi/abi.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/abi_linux.go b/pkg/abi/abi_linux.go index 9d9f361a4..3059479bd 100644 --- a/pkg/abi/abi_linux.go +++ b/pkg/abi/abi_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/flag.go b/pkg/abi/flag.go index b48757da8..dcdd66d4e 100644 --- a/pkg/abi/flag.go +++ b/pkg/abi/flag.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/aio.go b/pkg/abi/linux/aio.go index 1b7ca714a..3c6e0079d 100644 --- a/pkg/abi/linux/aio.go +++ b/pkg/abi/linux/aio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ashmem.go b/pkg/abi/linux/ashmem.go index ced1e44d4..2a722abe0 100644 --- a/pkg/abi/linux/ashmem.go +++ b/pkg/abi/linux/ashmem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/audit.go b/pkg/abi/linux/audit.go index b39ba4515..6cca69af9 100644 --- a/pkg/abi/linux/audit.go +++ b/pkg/abi/linux/audit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/binder.go b/pkg/abi/linux/binder.go index 522dc6f53..63b08324a 100644 --- a/pkg/abi/linux/binder.go +++ b/pkg/abi/linux/binder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/bpf.go b/pkg/abi/linux/bpf.go index d9cd09948..aa3d3ce70 100644 --- a/pkg/abi/linux/bpf.go +++ b/pkg/abi/linux/bpf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go index 7d96f013e..c120cac64 100644 --- a/pkg/abi/linux/capability.go +++ b/pkg/abi/linux/capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index 5b1199aac..421e11256 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/elf.go b/pkg/abi/linux/elf.go index 928067c04..fb1c679d2 100644 --- a/pkg/abi/linux/elf.go +++ b/pkg/abi/linux/elf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/errors.go b/pkg/abi/linux/errors.go index e5f6f3f07..93f85a864 100644 --- a/pkg/abi/linux/errors.go +++ b/pkg/abi/linux/errors.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/eventfd.go b/pkg/abi/linux/eventfd.go index 5614f5cf1..9c479fc8f 100644 --- a/pkg/abi/linux/eventfd.go +++ b/pkg/abi/linux/eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/exec.go b/pkg/abi/linux/exec.go index a07c29243..579d46c41 100644 --- a/pkg/abi/linux/exec.go +++ b/pkg/abi/linux/exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index c8558933a..cc8f2702d 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 46b10ca97..753fec3ed 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index a9f2ba132..c82ab9b5b 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/futex.go b/pkg/abi/linux/futex.go index afdf4123b..08bfde3b5 100644 --- a/pkg/abi/linux/futex.go +++ b/pkg/abi/linux/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/inotify.go b/pkg/abi/linux/inotify.go index 79c5d3593..2d08194ba 100644 --- a/pkg/abi/linux/inotify.go +++ b/pkg/abi/linux/inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go index 191b26e4d..04bb767dc 100644 --- a/pkg/abi/linux/ioctl.go +++ b/pkg/abi/linux/ioctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go index 77ac1062c..31e56ffa6 100644 --- a/pkg/abi/linux/ip.go +++ b/pkg/abi/linux/ip.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ipc.go b/pkg/abi/linux/ipc.go index 10681768b..2ef8d6cbb 100644 --- a/pkg/abi/linux/ipc.go +++ b/pkg/abi/linux/ipc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index e0aa5b31d..c74dfcd53 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/linux.go b/pkg/abi/linux/linux.go index d365f693d..8a8f831cd 100644 --- a/pkg/abi/linux/linux.go +++ b/pkg/abi/linux/linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index eda8d9788..0b02f938a 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netdevice.go b/pkg/abi/linux/netdevice.go index e3b6b1e40..aef1acf75 100644 --- a/pkg/abi/linux/netdevice.go +++ b/pkg/abi/linux/netdevice.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/netlink.go b/pkg/abi/linux/netlink.go index 25c5e17fd..5e718c363 100644 --- a/pkg/abi/linux/netlink.go +++ b/pkg/abi/linux/netlink.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/netlink_route.go b/pkg/abi/linux/netlink_route.go index 4200b6506..630dc339a 100644 --- a/pkg/abi/linux/netlink_route.go +++ b/pkg/abi/linux/netlink_route.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/poll.go b/pkg/abi/linux/poll.go index 9f0b15d1c..c04d26e4c 100644 --- a/pkg/abi/linux/poll.go +++ b/pkg/abi/linux/poll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go index db3206f36..dae2de290 100644 --- a/pkg/abi/linux/prctl.go +++ b/pkg/abi/linux/prctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/ptrace.go b/pkg/abi/linux/ptrace.go index 7db4f5464..23e605ab2 100644 --- a/pkg/abi/linux/ptrace.go +++ b/pkg/abi/linux/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/rusage.go b/pkg/abi/linux/rusage.go index 7fea4b589..d8302dc85 100644 --- a/pkg/abi/linux/rusage.go +++ b/pkg/abi/linux/rusage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/sched.go b/pkg/abi/linux/sched.go index ef96a3801..193d9a242 100644 --- a/pkg/abi/linux/sched.go +++ b/pkg/abi/linux/sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go index 8673a27bf..4eeb5cd7a 100644 --- a/pkg/abi/linux/seccomp.go +++ b/pkg/abi/linux/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/sem.go b/pkg/abi/linux/sem.go index b80c93daf..de422c519 100644 --- a/pkg/abi/linux/sem.go +++ b/pkg/abi/linux/sem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/shm.go b/pkg/abi/linux/shm.go index 82a80e609..e45aadb10 100644 --- a/pkg/abi/linux/shm.go +++ b/pkg/abi/linux/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index 395f9f31e..9cbd77dda 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go index 6fa4e7c3e..417840731 100644 --- a/pkg/abi/linux/socket.go +++ b/pkg/abi/linux/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go index 67908deb9..174d470e2 100644 --- a/pkg/abi/linux/tcp.go +++ b/pkg/abi/linux/tcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index bbd21e726..fa9ee27e1 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/timer.go b/pkg/abi/linux/timer.go index a6f420bdb..e32d09e10 100644 --- a/pkg/abi/linux/timer.go +++ b/pkg/abi/linux/timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/tty.go b/pkg/abi/linux/tty.go index bff882d89..8ac02aee8 100644 --- a/pkg/abi/linux/tty.go +++ b/pkg/abi/linux/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/abi/linux/uio.go b/pkg/abi/linux/uio.go index 7e00d9959..1fd1e9802 100644 --- a/pkg/abi/linux/uio.go +++ b/pkg/abi/linux/uio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/abi/linux/utsname.go b/pkg/abi/linux/utsname.go index f80ed7d4a..60f220a67 100644 --- a/pkg/abi/linux/utsname.go +++ b/pkg/abi/linux/utsname.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index 26b674435..85e819304 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go index 104e0dab1..6a0af006e 100644 --- a/pkg/amutex/amutex_test.go +++ b/pkg/amutex/amutex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops.go b/pkg/atomicbitops/atomic_bitops.go index 9a57f9599..63aa2b7f1 100644 --- a/pkg/atomicbitops/atomic_bitops.go +++ b/pkg/atomicbitops/atomic_bitops.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/atomicbitops/atomic_bitops_amd64.s b/pkg/atomicbitops/atomic_bitops_amd64.s index b37e3aad3..db0972001 100644 --- a/pkg/atomicbitops/atomic_bitops_amd64.s +++ b/pkg/atomicbitops/atomic_bitops_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_common.go b/pkg/atomicbitops/atomic_bitops_common.go index b03242baa..b2a943dcb 100644 --- a/pkg/atomicbitops/atomic_bitops_common.go +++ b/pkg/atomicbitops/atomic_bitops_common.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go index ee6207cb3..965e9be79 100644 --- a/pkg/atomicbitops/atomic_bitops_test.go +++ b/pkg/atomicbitops/atomic_bitops_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/binary/binary.go b/pkg/binary/binary.go index 02f7e9fb8..631785f7b 100644 --- a/pkg/binary/binary.go +++ b/pkg/binary/binary.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/binary/binary_test.go b/pkg/binary/binary_test.go index 200961c70..4d609a438 100644 --- a/pkg/binary/binary_test.go +++ b/pkg/binary/binary_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/bits.go b/pkg/bits/bits.go index eb3c80f49..a26433ad6 100644 --- a/pkg/bits/bits.go +++ b/pkg/bits/bits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go index 8c578cca2..93a435b80 100644 --- a/pkg/bits/bits_template.go +++ b/pkg/bits/bits_template.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_amd64.go b/pkg/bits/uint64_arch_amd64.go index 1fef89394..faccaa61a 100644 --- a/pkg/bits/uint64_arch_amd64.go +++ b/pkg/bits/uint64_arch_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_amd64_asm.s b/pkg/bits/uint64_arch_amd64_asm.s index 8c7322f0f..8ff364181 100644 --- a/pkg/bits/uint64_arch_amd64_asm.s +++ b/pkg/bits/uint64_arch_amd64_asm.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_arch_generic.go b/pkg/bits/uint64_arch_generic.go index cfb47400b..7dd2d1480 100644 --- a/pkg/bits/uint64_arch_generic.go +++ b/pkg/bits/uint64_arch_generic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go index d6dbaf602..1b018d808 100644 --- a/pkg/bits/uint64_test.go +++ b/pkg/bits/uint64_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/bpf.go b/pkg/bpf/bpf.go index 98d44d911..eb546f48f 100644 --- a/pkg/bpf/bpf.go +++ b/pkg/bpf/bpf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/decoder.go b/pkg/bpf/decoder.go index ae6b8839a..45c192215 100644 --- a/pkg/bpf/decoder.go +++ b/pkg/bpf/decoder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/decoder_test.go b/pkg/bpf/decoder_test.go index f093e1e41..8c4bdad21 100644 --- a/pkg/bpf/decoder_test.go +++ b/pkg/bpf/decoder_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/input_bytes.go b/pkg/bpf/input_bytes.go index 745c0749b..86b216cfc 100644 --- a/pkg/bpf/input_bytes.go +++ b/pkg/bpf/input_bytes.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/bpf/interpreter.go b/pkg/bpf/interpreter.go index 86c7add4d..86de523a2 100644 --- a/pkg/bpf/interpreter.go +++ b/pkg/bpf/interpreter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/interpreter_test.go b/pkg/bpf/interpreter_test.go index c46a43991..67b00ffe3 100644 --- a/pkg/bpf/interpreter_test.go +++ b/pkg/bpf/interpreter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/program_builder.go b/pkg/bpf/program_builder.go index b4ce228e1..fc9d27203 100644 --- a/pkg/bpf/program_builder.go +++ b/pkg/bpf/program_builder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/bpf/program_builder_test.go b/pkg/bpf/program_builder_test.go index 0e0b79d88..5b2ad67de 100644 --- a/pkg/bpf/program_builder_test.go +++ b/pkg/bpf/program_builder_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go index 4daaa82b6..8c14ccbfa 100644 --- a/pkg/compressio/compressio.go +++ b/pkg/compressio/compressio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/compressio/compressio_test.go b/pkg/compressio/compressio_test.go index 1bbabee79..86dc47e44 100644 --- a/pkg/compressio/compressio_test.go +++ b/pkg/compressio/compressio_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/control/client/client.go b/pkg/control/client/client.go index 0d0c9f148..3fec27846 100644 --- a/pkg/control/client/client.go +++ b/pkg/control/client/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go index c46b5d70b..1a15da1a8 100644 --- a/pkg/control/server/server.go +++ b/pkg/control/server/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpu_amd64.s b/pkg/cpuid/cpu_amd64.s index 905c1d12e..ac80d3c8a 100644 --- a/pkg/cpuid/cpu_amd64.s +++ b/pkg/cpuid/cpu_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index 61441150e..3eb2bcd2b 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/cpuid/cpuid_parse_test.go b/pkg/cpuid/cpuid_parse_test.go index e8f87a10e..dd9969db4 100644 --- a/pkg/cpuid/cpuid_parse_test.go +++ b/pkg/cpuid/cpuid_parse_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/cpuid/cpuid_test.go b/pkg/cpuid/cpuid_test.go index 64ade1cbe..6ae14d2da 100644 --- a/pkg/cpuid/cpuid_test.go +++ b/pkg/cpuid/cpuid_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/client.go b/pkg/dhcp/client.go index 2ba79be32..b7cde3819 100644 --- a/pkg/dhcp/client.go +++ b/pkg/dhcp/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp.go b/pkg/dhcp/dhcp.go index 6945bcd35..f96ffd891 100644 --- a/pkg/dhcp/dhcp.go +++ b/pkg/dhcp/dhcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/dhcp_string.go b/pkg/dhcp/dhcp_string.go index 8533895bd..29ce98593 100644 --- a/pkg/dhcp/dhcp_string.go +++ b/pkg/dhcp/dhcp_string.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/dhcp/dhcp_test.go b/pkg/dhcp/dhcp_test.go index e1d8ef603..751626bb0 100644 --- a/pkg/dhcp/dhcp_test.go +++ b/pkg/dhcp/dhcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/dhcp/server.go b/pkg/dhcp/server.go index 9549ff705..6a1972860 100644 --- a/pkg/dhcp/server.go +++ b/pkg/dhcp/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index 41a7b5ed3..4c8ae573b 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/eventchannel/event.proto b/pkg/eventchannel/event.proto index c1679c7e7..34468f072 100644 --- a/pkg/eventchannel/event.proto +++ b/pkg/eventchannel/event.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fd/fd.go b/pkg/fd/fd.go index d40758c22..2785243a2 100644 --- a/pkg/fd/fd.go +++ b/pkg/fd/fd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/fd/fd_test.go b/pkg/fd/fd_test.go index 42bb3ef6c..5fb0ad47d 100644 --- a/pkg/fd/fd_test.go +++ b/pkg/fd/fd_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go index aa4906ca0..f0b028b0b 100644 --- a/pkg/fdnotifier/fdnotifier.go +++ b/pkg/fdnotifier/fdnotifier.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/fdnotifier/poll_unsafe.go b/pkg/fdnotifier/poll_unsafe.go index 05be9aeb5..bc5e0ac44 100644 --- a/pkg/fdnotifier/poll_unsafe.go +++ b/pkg/fdnotifier/poll_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/gate/gate.go b/pkg/gate/gate.go index 48122bf5a..bda6aae09 100644 --- a/pkg/gate/gate.go +++ b/pkg/gate/gate.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go index 95620fa8e..7467e7d07 100644 --- a/pkg/gate/gate_test.go +++ b/pkg/gate/gate_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/ilist/list.go b/pkg/ilist/list.go index 51c9b6df3..019caadca 100644 --- a/pkg/ilist/list.go +++ b/pkg/ilist/list.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/ilist/list_test.go b/pkg/ilist/list_test.go index f37946dc2..3f9abfb56 100644 --- a/pkg/ilist/list_test.go +++ b/pkg/ilist/list_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/linewriter/linewriter.go b/pkg/linewriter/linewriter.go index 5fbd4e779..cd6e4e2ce 100644 --- a/pkg/linewriter/linewriter.go +++ b/pkg/linewriter/linewriter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/linewriter/linewriter_test.go b/pkg/linewriter/linewriter_test.go index 9140ee6af..96dc7e6e0 100644 --- a/pkg/linewriter/linewriter_test.go +++ b/pkg/linewriter/linewriter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/glog.go b/pkg/log/glog.go index 24d5390d7..5732785b4 100644 --- a/pkg/log/glog.go +++ b/pkg/log/glog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/log/glog_unsafe.go b/pkg/log/glog_unsafe.go index bb06aa7d3..ea17ae349 100644 --- a/pkg/log/glog_unsafe.go +++ b/pkg/log/glog_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json.go b/pkg/log/json.go index 96bd13d87..a278c8fc8 100644 --- a/pkg/log/json.go +++ b/pkg/log/json.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json_k8s.go b/pkg/log/json_k8s.go index 9c2f8d2b7..c2c019915 100644 --- a/pkg/log/json_k8s.go +++ b/pkg/log/json_k8s.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/json_test.go b/pkg/log/json_test.go index b8c7a795e..f25224fe1 100644 --- a/pkg/log/json_test.go +++ b/pkg/log/json_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/log/log.go b/pkg/log/log.go index b8d456aae..7d563241e 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/log/log_test.go b/pkg/log/log_test.go index a59d457dd..0634e7c1f 100644 --- a/pkg/log/log_test.go +++ b/pkg/log/log_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index e5eb95f89..803709cc4 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric.proto b/pkg/metric/metric.proto index 917fda1ac..a2c2bd1ba 100644 --- a/pkg/metric/metric.proto +++ b/pkg/metric/metric.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metric/metric_test.go b/pkg/metric/metric_test.go index 40034a589..b8b124c83 100644 --- a/pkg/metric/metric_test.go +++ b/pkg/metric/metric_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/buffer.go b/pkg/p9/buffer.go index b7bb14ef9..4c8c6555d 100644 --- a/pkg/p9/buffer.go +++ b/pkg/p9/buffer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/p9/buffer_test.go b/pkg/p9/buffer_test.go index 18d55e5c0..a9c75f86b 100644 --- a/pkg/p9/buffer_test.go +++ b/pkg/p9/buffer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 67887874a..2f9c716d0 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 992d1daf7..63c65129a 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/client_test.go b/pkg/p9/client_test.go index f7145452d..fc49729d8 100644 --- a/pkg/p9/client_test.go +++ b/pkg/p9/client_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/file.go b/pkg/p9/file.go index 55ceb52e1..a52a0f3e7 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index c1d1ac1e8..6da2ce4e3 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index 69b90c6cd..f4077a9d4 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index 97decd3cc..833defbd6 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index 68395a396..10a0587cf 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 4ea9f2f9a..78c7d3f86 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/p9/p9_test.go b/pkg/p9/p9_test.go index 02498346c..8dda6cc64 100644 --- a/pkg/p9/p9_test.go +++ b/pkg/p9/p9_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 242d81b95..e00dd03ab 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go index f9bacbf84..1c8eff200 100644 --- a/pkg/p9/p9test/p9test.go +++ b/pkg/p9/p9test/p9test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/path_tree.go b/pkg/p9/path_tree.go index 60b20578e..f37ad4ab2 100644 --- a/pkg/p9/path_tree.go +++ b/pkg/p9/path_tree.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go index 34ed898e8..52de889e1 100644 --- a/pkg/p9/pool.go +++ b/pkg/p9/pool.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/p9/pool_test.go b/pkg/p9/pool_test.go index 71052d8c4..e4746b8da 100644 --- a/pkg/p9/pool_test.go +++ b/pkg/p9/pool_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 3ef151595..b2a86d8fa 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index bafb377de..ef59077ff 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/transport_test.go b/pkg/p9/transport_test.go index b7b7825bd..c833d1c9c 100644 --- a/pkg/p9/transport_test.go +++ b/pkg/p9/transport_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/p9/version.go b/pkg/p9/version.go index ceb6fabbf..a36a499a1 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/p9/version_test.go b/pkg/p9/version_test.go index c053614c9..291e8580e 100644 --- a/pkg/p9/version_test.go +++ b/pkg/p9/version_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/rand/rand.go b/pkg/rand/rand.go index 593a14380..a2714784d 100644 --- a/pkg/rand/rand.go +++ b/pkg/rand/rand.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go index 7ebe8f3b0..2b92db3e6 100644 --- a/pkg/rand/rand_linux.go +++ b/pkg/rand/rand_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 8f08c74c7..20f515391 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/refs/refcounter_state.go b/pkg/refs/refcounter_state.go index 136f06fbf..7c99fd2b5 100644 --- a/pkg/refs/refcounter_state.go +++ b/pkg/refs/refcounter_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index abaa87453..ffd3d3f07 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go index e113f3574..50c9409e4 100644 --- a/pkg/seccomp/seccomp.go +++ b/pkg/seccomp/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index a9278c64b..29eec8db1 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go index 11ed90eb4..47ecac6f7 100644 --- a/pkg/seccomp/seccomp_test.go +++ b/pkg/seccomp/seccomp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go index dd5ed0041..afc2f755f 100644 --- a/pkg/seccomp/seccomp_test_victim.go +++ b/pkg/seccomp/seccomp_test_victim.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go index a31c6471d..ccd40d9db 100644 --- a/pkg/seccomp/seccomp_unsafe.go +++ b/pkg/seccomp/seccomp_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/full_reader.go b/pkg/secio/full_reader.go index 90b1772a7..aed2564bd 100644 --- a/pkg/secio/full_reader.go +++ b/pkg/secio/full_reader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/secio.go b/pkg/secio/secio.go index e5f74a497..b43226035 100644 --- a/pkg/secio/secio.go +++ b/pkg/secio/secio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/secio/secio_test.go b/pkg/secio/secio_test.go index 8304c4f74..d1d905187 100644 --- a/pkg/secio/secio_test.go +++ b/pkg/secio/secio_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/range.go b/pkg/segment/range.go index 057bcd7ff..4d4aeffef 100644 --- a/pkg/segment/range.go +++ b/pkg/segment/range.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/segment/set.go b/pkg/segment/set.go index 74a916ea3..982eb3fdd 100644 --- a/pkg/segment/set.go +++ b/pkg/segment/set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/set_state.go b/pkg/segment/set_state.go index b86e1b75f..76de92591 100644 --- a/pkg/segment/set_state.go +++ b/pkg/segment/set_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go index 0825105db..f19a005f3 100644 --- a/pkg/segment/test/segment_test.go +++ b/pkg/segment/test/segment_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go index 41f649011..bcddb39bb 100644 --- a/pkg/segment/test/set_functions.go +++ b/pkg/segment/test/set_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/aligned.go b/pkg/sentry/arch/aligned.go index c88c034f6..df01a903d 100644 --- a/pkg/sentry/arch/aligned.go +++ b/pkg/sentry/arch/aligned.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 16d8eb2b2..53f0c9018 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 7ec2f2c84..135c2ee1f 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s index fa9857df7..bd61402cf 100644 --- a/pkg/sentry/arch/arch_amd64.s +++ b/pkg/sentry/arch/arch_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index 01949049d..bb52d8db0 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 4305fe2cb..4d167ce98 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 5df65a691..80c923103 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto index f4c2f7043..9dc83e241 100644 --- a/pkg/sentry/arch/registers.proto +++ b/pkg/sentry/arch/registers.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go index ad098c746..f9ca2e74e 100644 --- a/pkg/sentry/arch/signal_act.go +++ b/pkg/sentry/arch/signal_act.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 7f76eba27..aa030fd70 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_info.go b/pkg/sentry/arch/signal_info.go index fa0ecbec5..f93ee8b46 100644 --- a/pkg/sentry/arch/signal_info.go +++ b/pkg/sentry/arch/signal_info.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index c02ae3b7c..a442f9fdc 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 2e33ccdf5..7e6324e82 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/arch/syscalls_amd64.go b/pkg/sentry/arch/syscalls_amd64.go index 47c31d4b9..8b4f23007 100644 --- a/pkg/sentry/arch/syscalls_amd64.go +++ b/pkg/sentry/arch/syscalls_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go index eefc3e1b4..d70f3a5c3 100644 --- a/pkg/sentry/context/context.go +++ b/pkg/sentry/context/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index a29087775..a42038711 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/control.go b/pkg/sentry/control/control.go index 32d30b6ea..6060b9b4f 100644 --- a/pkg/sentry/control/control.go +++ b/pkg/sentry/control/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 1af092af3..94ed149f2 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index aca2267a7..f7f02a3e1 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go index 5d52cd829..b7895d03c 100644 --- a/pkg/sentry/control/proc_test.go +++ b/pkg/sentry/control/proc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index b6bbf69fa..11efcaba1 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index ae4fa1d93..458d03b30 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/device/device_test.go b/pkg/sentry/device/device_test.go index 5d8805c2f..e3f51ce4f 100644 --- a/pkg/sentry/device/device_test.go +++ b/pkg/sentry/device/device_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go index a5e8c4f0d..a6ea8b9e7 100644 --- a/pkg/sentry/fs/anon/anon.go +++ b/pkg/sentry/fs/anon/anon.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/anon/device.go b/pkg/sentry/fs/anon/device.go index 2d1249299..5927bd11e 100644 --- a/pkg/sentry/fs/anon/device.go +++ b/pkg/sentry/fs/anon/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index 1f61c5711..b53746519 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index 5369d1b0d..5e005bc2e 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go index 7c997f533..bdf23b371 100644 --- a/pkg/sentry/fs/ashmem/pin_board.go +++ b/pkg/sentry/fs/ashmem/pin_board.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ashmem/pin_board_test.go b/pkg/sentry/fs/ashmem/pin_board_test.go index 736e628dc..24f5d86d6 100644 --- a/pkg/sentry/fs/ashmem/pin_board_test.go +++ b/pkg/sentry/fs/ashmem/pin_board_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 3523b068a..591e35e6a 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index d9f1559de..acbbd5466 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index 4869428a8..c80ea0175 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ba69e718d..ee2d3d115 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 98a0b7638..54810afca 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index 29fb155a4..fe656cc24 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index fbc750a71..34ac01173 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/device.go b/pkg/sentry/fs/dev/device.go index 3cecdf6e2..9f4e41fc9 100644 --- a/pkg/sentry/fs/dev/device.go +++ b/pkg/sentry/fs/dev/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index cf4e7d00f..6096a40f8 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 82da9aae9..6b11afa44 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 5d306d352..069212b6d 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index ffd5cf6c3..de0f3e5e5 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 54fc11fe1..c0bc261a2 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index d26a06971..71f2d11de 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go index 024c7b2d5..ebb80bd50 100644 --- a/pkg/sentry/fs/dirent_cache_limiter.go +++ b/pkg/sentry/fs/dirent_cache_limiter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go index 93e8d415f..395c879f5 100644 --- a/pkg/sentry/fs/dirent_cache_test.go +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 325404e27..db88d850e 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index 5cf151dab..18652b809 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 98483ab68..95e66ea8d 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index 92ab6ff0e..0cabe2e18 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 69516e048..8c8b1b40c 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index 4395666ad..8b347aa11 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 7e3ee5257..b59a6aa0e 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 5d5026661..62b35dabc 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index e0fa5135f..ab0acb6eb 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 6e680f0a4..948ce9c6f 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index a4ac58763..6a2b8007c 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/file_state.go b/pkg/sentry/fs/file_state.go index 1c3bae3e8..523182d59 100644 --- a/pkg/sentry/fs/file_state.go +++ b/pkg/sentry/fs/file_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/file_test.go b/pkg/sentry/fs/file_test.go index f3ed9a70b..d867a0257 100644 --- a/pkg/sentry/fs/file_test.go +++ b/pkg/sentry/fs/file_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index a6b27c402..acd84dfcc 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 388a1ce36..f6b827800 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go index bf2a20b33..5c8cb773f 100644 --- a/pkg/sentry/fs/flags.go +++ b/pkg/sentry/fs/flags.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 119689776..632055cce 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 5add16ac4..9cd196d7d 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go index f5c9d9215..d9c68baa3 100644 --- a/pkg/sentry/fs/fsutil/dirty_set_test.go +++ b/pkg/sentry/fs/fsutil/dirty_set_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 42afdd11c..e355d8594 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index 32ebf64ff..b5ac6c71c 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index b6e783614..6565c28c8 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go index 319c4841b..c9587b1d9 100644 --- a/pkg/sentry/fs/fsutil/fsutil.go +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 9599665f0..2bdfc0db6 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go index bbd15b30b..576d2a3df 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_state.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go index 86df76822..7167be263 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 4a182baa1..28686f3b3 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 468171a9b..b6366d906 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index ba33b9912..919d2534c 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 2a8a1639c..661ec41f6 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 98700d014..c572f3396 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 51c573aef..35cd0c1d6 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 455953237..d512afefc 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/device.go b/pkg/sentry/fs/gofer/device.go index 52c5acf48..1de6c247c 100644 --- a/pkg/sentry/fs/gofer/device.go +++ b/pkg/sentry/fs/gofer/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 35caa42cd..bc2be546e 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index d0c64003c..31264e065 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index adff0abac..6ab89fcc2 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 36201f017..29d34da7e 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index 0b33e80c3..c7098cd36 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 1181a24cc..f6f20844d 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 44d76ba9f..ac22ee4b1 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 8ae33d286..4cbf9e9d9 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 4ed688ce5..4cb65e7c6 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index b1f299be5..68fbf3417 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index ce6d3d5c3..cbd5b9a84 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go index 1a759370d..d0e1096ce 100644 --- a/pkg/sentry/fs/gofer/util.go +++ b/pkg/sentry/fs/gofer/util.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 0753640a2..480f0c8f4 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go index 554e1693a..ffcd57a94 100644 --- a/pkg/sentry/fs/host/descriptor.go +++ b/pkg/sentry/fs/host/descriptor.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go index 530c0109f..8167390a9 100644 --- a/pkg/sentry/fs/host/descriptor_state.go +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/descriptor_test.go b/pkg/sentry/fs/host/descriptor_test.go index 5dec84ab2..ff08e43af 100644 --- a/pkg/sentry/fs/host/descriptor_test.go +++ b/pkg/sentry/fs/host/descriptor_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go index b5adedf44..055024c44 100644 --- a/pkg/sentry/fs/host/device.go +++ b/pkg/sentry/fs/host/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 2a8f285ff..82e2ae3b9 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index de349a41a..b1b8dc0b6 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index c83b29a16..16c89ddf1 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 69c648f67..20e077f77 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index b7c1a9581..26cc755bc 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index 9f1561bd5..ad1878b5a 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index 175dca613..b5a85c4d9 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index be2c3581f..3034e9441 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index d4ce4a8c1..5efbb3ae8 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go index 2932c1f16..5676c451a 100644 --- a/pkg/sentry/fs/host/socket_state.go +++ b/pkg/sentry/fs/host/socket_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 83e8e1b3c..cc760a7e1 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index f35e2492d..8873705c0 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index c5cb75df7..e45b339f5 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index 40c450660..94ff7708e 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go index a8721d197..b95a57c3f 100644 --- a/pkg/sentry/fs/host/util_unsafe.go +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index 9ca8c399f..afcb74724 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index fe411a766..d764ef93d 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index d2b653bc7..0f2a66a79 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index ff8b75f31..ac287e1e4 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index bda3e1861..3d015328e 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index fa8accf6c..66b3da2d0 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 59fa662f3..2652582c3 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index f09928b68..d52f956e4 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index d33e7e498..a0b488467 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 5ff800d2d..f2aee4512 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_range_test.go b/pkg/sentry/fs/lock/lock_range_test.go index b0ab882b9..6221199d1 100644 --- a/pkg/sentry/fs/lock/lock_range_test.go +++ b/pkg/sentry/fs/lock/lock_range_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index 395592a4b..8a3ace0c1 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index 67fa4b1dd..ba002aeb7 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 118e30f63..cf359a1f1 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 4d1693204..a169ea4c9 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index fb60a1aec..535f812c8 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index d7605b2c9..9f7fbeff2 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index f6f7be0aa..01eb4607e 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index 54000614f..56d726dd1 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/offset.go b/pkg/sentry/fs/offset.go index 38aee765a..3f68da149 100644 --- a/pkg/sentry/fs/offset.go +++ b/pkg/sentry/fs/offset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index f3e2d5cbe..db89a5f70 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/path.go b/pkg/sentry/fs/path.go index 52139b648..e4dc02dbb 100644 --- a/pkg/sentry/fs/path.go +++ b/pkg/sentry/fs/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/path_test.go b/pkg/sentry/fs/path_test.go index 4ba1498f6..e6f57ebba 100644 --- a/pkg/sentry/fs/path_test.go +++ b/pkg/sentry/fs/path_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index f756c45bf..15031234e 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/proc/device/device.go b/pkg/sentry/fs/proc/device/device.go index 04b687bcf..0de466c73 100644 --- a/pkg/sentry/fs/proc/device/device.go +++ b/pkg/sentry/fs/proc/device/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index fc21dfbbd..d49dad685 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index f2329e623..744b31c74 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index c050a00be..7bb081d0e 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 666a2d054..7c5f8484a 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index 8dde2ea46..b03807043 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index 3ee0e570a..2dfe7089a 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 75cbf3e77..d2b9b92c7 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index fe62b167b..37ed30724 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index d24b2d370..4a107c739 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/net_test.go b/pkg/sentry/fs/proc/net_test.go index 94677cc1d..9aed5fdca 100644 --- a/pkg/sentry/fs/proc/net_test.go +++ b/pkg/sentry/fs/proc/net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 64e1e1998..196fa5128 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index 81f64a28b..db53686f6 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 0a0eb45e2..10ea1f55d 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index 35403ab7f..c4de565eb 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index 18bd8e9b6..397f9ec6b 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index a7bc9198e..b889ed625 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 0ce77f04f..e49794a48 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go index 5f481a1cf..6eba709c6 100644 --- a/pkg/sentry/fs/proc/sys_net_state.go +++ b/pkg/sentry/fs/proc/sys_net_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index ea0d94fce..78135ba13 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 9f65a8337..0f400e80f 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index d433632cf..d649da0f1 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index d7ae26fcf..1ddf9fafa 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index 58e0c793c..a5479990c 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index c0400b67d..a6b6a5c33 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 5bcb6c364..9406a07ca 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 35dabdad2..f7835fe05 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index c1ac8a78b..8c6b31f70 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index 8bee9cfc1..27abeb6ba 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go index a6645b41e..f10168125 100644 --- a/pkg/sentry/fs/restore.go +++ b/pkg/sentry/fs/restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/save.go b/pkg/sentry/fs/save.go index 90988d385..2eaf6ab69 100644 --- a/pkg/sentry/fs/save.go +++ b/pkg/sentry/fs/save.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/seek.go b/pkg/sentry/fs/seek.go index 72f3fb632..0f43918ad 100644 --- a/pkg/sentry/fs/seek.go +++ b/pkg/sentry/fs/seek.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sync.go b/pkg/sentry/fs/sync.go index 6dcc2fe8d..1fff8059c 100644 --- a/pkg/sentry/fs/sync.go +++ b/pkg/sentry/fs/sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/device.go b/pkg/sentry/fs/sys/device.go index 38ecd0c18..128d3a9d9 100644 --- a/pkg/sentry/fs/sys/device.go +++ b/pkg/sentry/fs/sys/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index 8b728a4e4..db91de435 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index 44ae43754..f0c2322e0 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index c5b56fe69..d20ef91fa 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index ef9a08854..749961f51 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/device.go b/pkg/sentry/fs/tmpfs/device.go index aade93c26..179c3a46f 100644 --- a/pkg/sentry/fs/tmpfs/device.go +++ b/pkg/sentry/fs/tmpfs/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index d0c9b8bea..1ef256511 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index 743061190..b44c06556 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index 8e44421b6..b7c29a4d1 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 4450e1363..f89d86c83 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 5bb4922cb..832914453 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index f8713471a..0fc777e67 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index a53448c47..701b2f7d9 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index c4a364edb..20d29d130 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index e2686a074..45e167e5f 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 5e88d84d9..11fb92be3 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index ed080ca0f..0ae57a02c 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 79f9d76d7..2b4160ba5 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index ad535838f..d2e75a511 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/getcpu_amd64.s b/pkg/sentry/hostcpu/getcpu_amd64.s index 409db1450..aa00316da 100644 --- a/pkg/sentry/hostcpu/getcpu_amd64.s +++ b/pkg/sentry/hostcpu/getcpu_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/hostcpu.go b/pkg/sentry/hostcpu/hostcpu.go index 3adc847bb..d78f78402 100644 --- a/pkg/sentry/hostcpu/hostcpu.go +++ b/pkg/sentry/hostcpu/hostcpu.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/hostcpu/hostcpu_test.go b/pkg/sentry/hostcpu/hostcpu_test.go index 38de0e1f6..7d6885c9e 100644 --- a/pkg/sentry/hostcpu/hostcpu_test.go +++ b/pkg/sentry/hostcpu/hostcpu_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/context.go b/pkg/sentry/inet/context.go index d05e96f15..8550c4793 100644 --- a/pkg/sentry/inet/context.go +++ b/pkg/sentry/inet/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 8206377cc..7c104fd47 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index 05c1a1792..624371eb6 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 1ea2cee36..5ce52e66c 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go index 19f15fd36..847d121aa 100644 --- a/pkg/sentry/kernel/auth/auth.go +++ b/pkg/sentry/kernel/auth/auth.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go index 88d6243aa..7a0c967cd 100644 --- a/pkg/sentry/kernel/auth/capability_set.go +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index f7e945599..16d110610 100644 --- a/pkg/sentry/kernel/auth/context.go +++ b/pkg/sentry/kernel/auth/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index 2055da196..1511a0324 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go index e5bed44d7..0a58ba17c 100644 --- a/pkg/sentry/kernel/auth/id.go +++ b/pkg/sentry/kernel/auth/id.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index 43f439825..e5d6028d6 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go index 8f1a189ec..432dbfb6d 100644 --- a/pkg/sentry/kernel/auth/id_map_functions.go +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 159940a69..a40dd668f 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index b629521eb..a1a084eab 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go index eb56a6a07..ae67e2a25 100644 --- a/pkg/sentry/kernel/contexttest/contexttest.go +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index befefb11c..2399ae6f2 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index f6e3e4825..4c3c38f9e 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index d89c1b745..49b781b69 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index b448ad813..5d3139eef 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go index 14e8996d9..1159638e5 100644 --- a/pkg/sentry/kernel/eventfd/eventfd_test.go +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 298d988ea..84cd08501 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go index 715f4714d..c5636d233 100644 --- a/pkg/sentry/kernel/fd_map.go +++ b/pkg/sentry/kernel/fd_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go index 9e76f0a2d..22db4c7cf 100644 --- a/pkg/sentry/kernel/fd_map_test.go +++ b/pkg/sentry/kernel/fd_map_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 3cf0db280..d8115f59a 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index cd7d51621..bb38eb81e 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 9d44ee8e5..2de5239bf 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 9ceb9bd92..ebe12812c 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go index 8eafe810b..304da2032 100644 --- a/pkg/sentry/kernel/kdefs/kdefs.go +++ b/pkg/sentry/kernel/kdefs/kdefs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index a1b2d7161..0468dd678 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go index aae6f9ad2..48c3ff5a9 100644 --- a/pkg/sentry/kernel/kernel_state.go +++ b/pkg/sentry/kernel/kernel_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index d09d6debf..0e2cee807 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto index 43b8deb76..bf8029ff5 100644 --- a/pkg/sentry/kernel/memevent/memory_events.proto +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index deff6def9..c93f6598a 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go index 72be6702f..2c902c7e3 100644 --- a/pkg/sentry/kernel/pending_signals_state.go +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go index 54e059f8b..ba53fd482 100644 --- a/pkg/sentry/kernel/pipe/buffers.go +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go index eec5c5de8..eb59e15a1 100644 --- a/pkg/sentry/kernel/pipe/device.go +++ b/pkg/sentry/kernel/pipe/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 1336b6293..99188dddf 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index ad103b195..7ddecdad8 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 357d1162e..bd7649d2f 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index 3b9895927..de340c40c 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index f27379969..48fab45d1 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 1090432d7..ddcc5e09a 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index 6fea9769c..0f29fbc43 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index 40b5acca3..a016b4087 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 15f2e2964..4423e7efd 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 1f88efca3..048eeaa3f 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 4636405e6..4899c813f 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 6d3314e81..c4fb2c56c 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go index 41ac1067d..c6c436690 100644 --- a/pkg/sentry/kernel/sched/cpuset.go +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go index a036ed513..3af9f1197 100644 --- a/pkg/sentry/kernel/sched/cpuset_test.go +++ b/pkg/sentry/kernel/sched/cpuset_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go index e59909baf..de18c9d02 100644 --- a/pkg/sentry/kernel/sched/sched.go +++ b/pkg/sentry/kernel/sched/sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 4bed4d373..cc75eb08a 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 2b7c1a9bc..9d0620e02 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index 2e51e6ee5..abfcd0fb4 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 070c2f930..610e199da 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go index bbc653ed8..3cb759072 100644 --- a/pkg/sentry/kernel/shm/device.go +++ b/pkg/sentry/kernel/shm/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index d4812a065..00393b5f0 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index 22a56c6fc..b528ec0dc 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 60cbe85b8..ce8bcb5e5 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 293b21249..0572053db 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go index 981455d46..00358326b 100644 --- a/pkg/sentry/kernel/syscalls_state.go +++ b/pkg/sentry/kernel/syscalls_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 2aecf3eea..175d1b247 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go index 3b29d3c6a..8f7cdb9f3 100644 --- a/pkg/sentry/kernel/table_test.go +++ b/pkg/sentry/kernel/table_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index ed2175c37..f9378c2de 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index 24230af89..1ca2a82eb 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index e5027e551..30a7f6b1e 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index daf974920..bba8ddd39 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index ac38dd157..bbd294141 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index b49f902a5..5d1425d5c 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index a07956208..6e9701b01 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index 351cf47d7..f98097c2c 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 6c9608f8d..17f08729a 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index f4c881c2d..e0e57e8bd 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go index fc7cefc1f..04c684c1a 100644 --- a/pkg/sentry/kernel/task_net.go +++ b/pkg/sentry/kernel/task_net.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 7115aa967..4549b437e 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 3d654bf93..5455f6ea9 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 7f2e0df72..654cf7525 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index b7534c0a2..b42531e57 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index 1302cadc1..e735a5dd0 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 52f5fde8d..a9283d0df 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go index 3f37f505d..b895361d0 100644 --- a/pkg/sentry/kernel/task_test.go +++ b/pkg/sentry/kernel/task_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index cb68799d3..461bd7316 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 58f3a7ec9..8bd53928e 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 4fd6cf4e2..656bbd46c 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go index 3675ea20d..c0660d362 100644 --- a/pkg/sentry/kernel/time/context.go +++ b/pkg/sentry/kernel/time/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index ca0f4ba2e..3846cf1ea 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index d7bd85e78..505a4fa4f 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go index f3a3ed543..6ce358a05 100644 --- a/pkg/sentry/kernel/timekeeper_state.go +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 6084bcb18..a92ad689e 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/uncaught_signal.proto b/pkg/sentry/kernel/uncaught_signal.proto index c7f6a1978..0bdb062cb 100644 --- a/pkg/sentry/kernel/uncaught_signal.proto +++ b/pkg/sentry/kernel/uncaught_signal.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index ed5f0c031..96fe3cbb9 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index 3a35f1d00..d40ad74f4 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go index 8d2f14209..5640dd71d 100644 --- a/pkg/sentry/kernel/version.go +++ b/pkg/sentry/kernel/version.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go index bf413eb7d..9200edb52 100644 --- a/pkg/sentry/limits/context.go +++ b/pkg/sentry/limits/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index b0571739f..b6c22656b 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/limits_test.go b/pkg/sentry/limits/limits_test.go index 945428163..658a20f56 100644 --- a/pkg/sentry/limits/limits_test.go +++ b/pkg/sentry/limits/limits_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index e09d0d2fb..a2b401e3d 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 385ad0102..97e32c8ba 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 35b83654d..b88062ae5 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 79051befa..dc1a52398 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 8c196df84..207d8ed3d 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/loader/vdso_state.go b/pkg/sentry/loader/vdso_state.go index b327f0e1e..db378e90a 100644 --- a/pkg/sentry/loader/vdso_state.go +++ b/pkg/sentry/loader/vdso_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index bd07e9aac..3cf2b338f 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go index 45d1d4688..c702555ce 100644 --- a/pkg/sentry/memmap/mapping_set_test.go +++ b/pkg/sentry/memmap/mapping_set_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 3f6f7ebd0..0106c857d 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memutil/memutil.go b/pkg/sentry/memutil/memutil.go index 286d50ca4..a4154c42a 100644 --- a/pkg/sentry/memutil/memutil.go +++ b/pkg/sentry/memutil/memutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go index bc2c72f55..92eab8a26 100644 --- a/pkg/sentry/memutil/memutil_unsafe.go +++ b/pkg/sentry/memutil/memutil_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 4dddcf7b5..06f587fde 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 7075792e0..5c61acf36 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go index 192a6f744..c37fc9f7b 100644 --- a/pkg/sentry/mm/aio_context_state.go +++ b/pkg/sentry/mm/aio_context_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go index d075ee1ca..fe58cfc4c 100644 --- a/pkg/sentry/mm/debug.go +++ b/pkg/sentry/mm/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index 81787a6fd..e4c057d28 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 2fe03172c..e6aa6f9ef 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 5ef1ba0b1..9768e51f1 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index a3417a46e..d25aa5136 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index ae4fba478..f4917419f 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 0cca743ef..ece561ff0 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 7cdbf6e25..c8302a553 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go index 46e0e0754..0385957bd 100644 --- a/pkg/sentry/mm/save_restore.go +++ b/pkg/sentry/mm/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go index 3bc48c7e7..12913007b 100644 --- a/pkg/sentry/mm/shm.go +++ b/pkg/sentry/mm/shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 3b5161998..687959005 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 7b675b9b5..a25318abb 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 931995254..ad901344b 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go index adc97e78f..cb9809b1f 100644 --- a/pkg/sentry/pgalloc/context.go +++ b/pkg/sentry/pgalloc/context.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google Inc. +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 0754e608f..411dafa07 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go index 726623c1a..14a39bb9e 100644 --- a/pkg/sentry/pgalloc/pgalloc_test.go +++ b/pkg/sentry/pgalloc/pgalloc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/pgalloc/pgalloc_unsafe.go b/pkg/sentry/pgalloc/pgalloc_unsafe.go index 33b0a68a8..a4b5d581c 100644 --- a/pkg/sentry/pgalloc/pgalloc_unsafe.go +++ b/pkg/sentry/pgalloc/pgalloc_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index 21024e656..cf169af55 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go index cca21a23e..793f57fd7 100644 --- a/pkg/sentry/platform/context.go +++ b/pkg/sentry/platform/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go index 9c83f41eb..a4651f500 100644 --- a/pkg/sentry/platform/interrupt/interrupt.go +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/interrupt/interrupt_test.go b/pkg/sentry/platform/interrupt/interrupt_test.go index fb3284395..0ecdf6e7a 100644 --- a/pkg/sentry/platform/interrupt/interrupt_test.go +++ b/pkg/sentry/platform/interrupt/interrupt_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index f2f7ab1e8..689122175 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go index b25cad155..42bcc9733 100644 --- a/pkg/sentry/platform/kvm/allocator.go +++ b/pkg/sentry/platform/kvm/allocator.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index f24f1c662..a926e6f8b 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 6520682d7..c258408f9 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 65b01f358..2bc34a435 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 21de2488e..92fde7ee0 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index e79a30ef2..3c452f5ba 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 2605f8c93..4184939e5 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index c75a4b415..0eb0020f7 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index c5a4435b1..ed0521c3f 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index 70d0ac63b..61493ccaf 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go index d0f6bb225..46c4b9113 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index cac8d9937..d05f05c29 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 361200622..e83db71e9 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index b8b3c9a4a..f5953b96e 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index ccfe837b5..b6821122a 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 69ba67ced..06a2e3b0c 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 22ae60b63..452d88d7f 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 9d7dca5b3..450eb8201 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go index 0d496561d..6cf2359a3 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index fcba33813..203d71528 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s index f1da41a44..491ec0c2a 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go index 0343e9267..28a1b4414 100644 --- a/pkg/sentry/platform/kvm/virtual_map.go +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go index 935e0eb93..d03ec654a 100644 --- a/pkg/sentry/platform/kvm/virtual_map_test.go +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go index 1bcc1f8e9..90976735b 100644 --- a/pkg/sentry/platform/mmap_min_addr.go +++ b/pkg/sentry/platform/mmap_min_addr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 0e48417b9..ae37276ad 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/sentry/platform/procid/procid.go index 3f49ab093..78b92422c 100644 --- a/pkg/sentry/platform/procid/procid.go +++ b/pkg/sentry/platform/procid/procid.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s index ef3439c03..272c9fc14 100644 --- a/pkg/sentry/platform/procid/procid_amd64.s +++ b/pkg/sentry/platform/procid/procid_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/sentry/platform/procid/procid_arm64.s index 02e907b6b..7a1684a18 100644 --- a/pkg/sentry/platform/procid/procid_arm64.s +++ b/pkg/sentry/platform/procid/procid_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/procid/procid_net_test.go b/pkg/sentry/platform/procid/procid_net_test.go index e8dcc479d..b628e2285 100644 --- a/pkg/sentry/platform/procid/procid_net_test.go +++ b/pkg/sentry/platform/procid/procid_net_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/procid/procid_test.go b/pkg/sentry/platform/procid/procid_test.go index 7a57c7cdc..88dd0b3ae 100644 --- a/pkg/sentry/platform/procid/procid_test.go +++ b/pkg/sentry/platform/procid/procid_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 3c0713e95..6a890dd81 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 223b23199..585f6c1fb 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s index 63f98e40d..64c718d21 100644 --- a/pkg/sentry/platform/ptrace/stub_amd64.s +++ b/pkg/sentry/platform/ptrace/stub_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go index 48c16c4a1..54d5021a9 100644 --- a/pkg/sentry/platform/ptrace/stub_unsafe.go +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 2a5d699ec..83b43057f 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index d23a1133e..77a0e908f 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index e2aab8135..2c07b4ac3 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go index 0c9263060..1bf7eab28 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index ca6c4ac97..17736b05b 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 98d0a6de0..5bbd4612d 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 67242b92b..413c3dbc4 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index 4a9affe64..a5ce67885 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index afb040a6f..8cb8c4996 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go index 11c49855f..a4927da2f 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/main.go +++ b/pkg/sentry/platform/ring0/gen_offsets/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index 19ac6eb7c..900c0bba7 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 5ed4342dd..3577b5127 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go index faf4240e5..16955ad91 100644 --- a/pkg/sentry/platform/ring0/kernel_unsafe.go +++ b/pkg/sentry/platform/ring0/kernel_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index 2b95a0141..9c5f26962 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s index 98a130525..75d742750 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index 806e07ec0..85cc3fdad 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go index ee6e90a11..23fd5c352 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go index f48647b3a..1b996b4e2 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index c7207ec18..e5dcaada7 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index 746f614e5..7aa6c524e 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go index 2f82c4353..a1ec4b109 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index 3e5dc7dc7..36e424495 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index 6bd8c3584..ff427fbe9 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index 0d9a51aa5..0f029f25d 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go index c4c71d23e..8f9dacd93 100644 --- a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go index 10c51e88d..cdeb1b43a 100644 --- a/pkg/sentry/platform/ring0/ring0.go +++ b/pkg/sentry/platform/ring0/ring0.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index 4c6daec22..7e5ceafdb 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s index f90b4bfd1..a0cd78f33 100644 --- a/pkg/sentry/platform/safecopy/atomic_amd64.s +++ b/pkg/sentry/platform/safecopy/atomic_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go index 69c66a3b7..5126871eb 100644 --- a/pkg/sentry/platform/safecopy/safecopy.go +++ b/pkg/sentry/platform/safecopy/safecopy.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/safecopy_test.go b/pkg/sentry/platform/safecopy/safecopy_test.go index 1a682d28a..5818f7f9b 100644 --- a/pkg/sentry/platform/safecopy/safecopy_test.go +++ b/pkg/sentry/platform/safecopy/safecopy_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go index f84527484..eef028e68 100644 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s index db7701a29..475ae48e9 100644 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s index cdfca8207..53e4ac2c1 100644 --- a/pkg/sentry/platform/safecopy/sighandler_arm64.s +++ b/pkg/sentry/platform/safecopy/sighandler_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go index c3a9780d2..1f72deb61 100644 --- a/pkg/sentry/safemem/block_unsafe.go +++ b/pkg/sentry/safemem/block_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go index 6cb52439f..5c3d73eb7 100644 --- a/pkg/sentry/safemem/io.go +++ b/pkg/sentry/safemem/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/io_test.go b/pkg/sentry/safemem/io_test.go index 2eda8c3bb..629741bee 100644 --- a/pkg/sentry/safemem/io_test.go +++ b/pkg/sentry/safemem/io_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/safemem.go b/pkg/sentry/safemem/safemem.go index 090932d3e..3e70d33a2 100644 --- a/pkg/sentry/safemem/safemem.go +++ b/pkg/sentry/safemem/safemem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/seq_test.go b/pkg/sentry/safemem/seq_test.go index fddcaf714..eba4bb535 100644 --- a/pkg/sentry/safemem/seq_test.go +++ b/pkg/sentry/safemem/seq_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/safemem/seq_unsafe.go b/pkg/sentry/safemem/seq_unsafe.go index 83a6b7183..354a95dde 100644 --- a/pkg/sentry/safemem/seq_unsafe.go +++ b/pkg/sentry/safemem/seq_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go index 571245ce5..659b43363 100644 --- a/pkg/sentry/sighandling/sighandling.go +++ b/pkg/sentry/sighandling/sighandling.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index db6e71487..aca77888a 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index d44f5e88a..abda364c9 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/device.go b/pkg/sentry/socket/epsocket/device.go index 3cc138eb0..ab4083efe 100644 --- a/pkg/sentry/socket/epsocket/device.go +++ b/pkg/sentry/socket/epsocket/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 768fa0dfa..520d82f68 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 0d9c2df24..5a89a63fb 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/save_restore.go b/pkg/sentry/socket/epsocket/save_restore.go index f19afb6c0..feaafb7cc 100644 --- a/pkg/sentry/socket/epsocket/save_restore.go +++ b/pkg/sentry/socket/epsocket/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go index 37c48f4bc..edefa225b 100644 --- a/pkg/sentry/socket/epsocket/stack.go +++ b/pkg/sentry/socket/epsocket/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/hostinet/device.go b/pkg/sentry/socket/hostinet/device.go index c5133f3bb..4267e3691 100644 --- a/pkg/sentry/socket/hostinet/device.go +++ b/pkg/sentry/socket/hostinet/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/hostinet.go b/pkg/sentry/socket/hostinet/hostinet.go index 7858892ab..0d6f51d2b 100644 --- a/pkg/sentry/socket/hostinet/hostinet.go +++ b/pkg/sentry/socket/hostinet/hostinet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/save_restore.go b/pkg/sentry/socket/hostinet/save_restore.go index 3827f082a..1dec33897 100644 --- a/pkg/sentry/socket/hostinet/save_restore.go +++ b/pkg/sentry/socket/hostinet/save_restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 49349074f..71884d3db 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index 59c8910ca..eed0c7837 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 4ce73c1f1..9c45991ba 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go index a95172cba..5bd3b49ce 100644 --- a/pkg/sentry/socket/netlink/message.go +++ b/pkg/sentry/socket/netlink/message.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index 20b9a6e37..e9d3275b1 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/netlink/port/port_test.go b/pkg/sentry/socket/netlink/port/port_test.go index 49b3b48ab..516f6cd6c 100644 --- a/pkg/sentry/socket/netlink/port/port_test.go +++ b/pkg/sentry/socket/netlink/port/port_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 06786bd50..76cf12fd4 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index e414b829b..9f0a81403 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index a34f9d3ca..dc688eb00 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index 64106c4b5..f537c7f63 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/device.go b/pkg/sentry/socket/rpcinet/device.go index d2b9f9222..44c0a39b7 100644 --- a/pkg/sentry/socket/rpcinet/device.go +++ b/pkg/sentry/socket/rpcinet/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index f06d12231..601e05994 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/rpcinet.go b/pkg/sentry/socket/rpcinet/rpcinet.go index 6c98e6acb..5d4fd4dac 100644 --- a/pkg/sentry/socket/rpcinet/rpcinet.go +++ b/pkg/sentry/socket/rpcinet/rpcinet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index cf8f69efb..c028ed4dd 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go index cb8344ec6..a1be711df 100644 --- a/pkg/sentry/socket/rpcinet/stack.go +++ b/pkg/sentry/socket/rpcinet/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/rpcinet/stack_unsafe.go b/pkg/sentry/socket/rpcinet/stack_unsafe.go index d04fb2069..e53f578ba 100644 --- a/pkg/sentry/socket/rpcinet/stack_unsafe.go +++ b/pkg/sentry/socket/rpcinet/stack_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 62ba13782..7e840b452 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/device.go b/pkg/sentry/socket/unix/device.go index 41820dbb3..734d39ee6 100644 --- a/pkg/sentry/socket/unix/device.go +++ b/pkg/sentry/socket/unix/device.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 7d80e4393..382911d51 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 62641bb34..18e492862 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectioned_state.go b/pkg/sentry/socket/unix/transport/connectioned_state.go index 608a6a97a..7e02a5db8 100644 --- a/pkg/sentry/socket/unix/transport/connectioned_state.go +++ b/pkg/sentry/socket/unix/transport/connectioned_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 728863f3f..43ff875e4 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index 45a58c600..b650caae7 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 12b1576bd..d5f7f7aa8 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 01efd24d3..e9607aa01 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 224f8b709..27fde505b 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/state/state_metadata.go b/pkg/sentry/state/state_metadata.go index 7f047b808..b8e128c40 100644 --- a/pkg/sentry/state/state_metadata.go +++ b/pkg/sentry/state/state_metadata.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/state/state_unsafe.go b/pkg/sentry/state/state_unsafe.go index f02e12b2a..7745b6ac6 100644 --- a/pkg/sentry/state/state_unsafe.go +++ b/pkg/sentry/state/state_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/capability.go b/pkg/sentry/strace/capability.go index 9001181e7..f85d6636e 100644 --- a/pkg/sentry/strace/capability.go +++ b/pkg/sentry/strace/capability.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/clone.go b/pkg/sentry/strace/clone.go index e18ce84dc..ff6a432c6 100644 --- a/pkg/sentry/strace/clone.go +++ b/pkg/sentry/strace/clone.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/futex.go b/pkg/sentry/strace/futex.go index f4aa7fcad..24301bda6 100644 --- a/pkg/sentry/strace/futex.go +++ b/pkg/sentry/strace/futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64.go index 6043b8cb1..3650fd6e1 100644 --- a/pkg/sentry/strace/linux64.go +++ b/pkg/sentry/strace/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/open.go b/pkg/sentry/strace/open.go index 3bf348d7a..140727b02 100644 --- a/pkg/sentry/strace/open.go +++ b/pkg/sentry/strace/open.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go index b6b05423c..15605187d 100644 --- a/pkg/sentry/strace/poll.go +++ b/pkg/sentry/strace/poll.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/ptrace.go b/pkg/sentry/strace/ptrace.go index 8c4b79227..485aacb8a 100644 --- a/pkg/sentry/strace/ptrace.go +++ b/pkg/sentry/strace/ptrace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/signal.go b/pkg/sentry/strace/signal.go index 524be0e15..f82460e1c 100644 --- a/pkg/sentry/strace/signal.go +++ b/pkg/sentry/strace/signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index 4c1a9d469..dbe53b9a2 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 434a200d9..f4c1be4ce 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/strace.proto b/pkg/sentry/strace/strace.proto index f1fc539d6..4b2f73a5f 100644 --- a/pkg/sentry/strace/strace.proto +++ b/pkg/sentry/strace/strace.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 8c897fcbe..eae2d6c12 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index b90d191b7..ec1eab331 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 304a12dde..1ba3695fb 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go index d2aec963a..d83e12971 100644 --- a/pkg/sentry/syscalls/linux/flags.go +++ b/pkg/sentry/syscalls/linux/flags.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index b9b4ccbd1..9a460ebdf 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index a033b7c70..5438b664b 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index 61c2647bf..1b27b2415 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go index cf972dc28..622cb8d0d 100644 --- a/pkg/sentry/syscalls/linux/sys_capability.go +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 200c46355..1467feb4e 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 903172890..ca4ead488 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 967464c85..893322647 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index f0c89cba4..7cef4b50c 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 4b441b31b..1b597d5bc 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go index 8d594aa83..27e765a2d 100644 --- a/pkg/sentry/syscalls/linux/sys_identity.go +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index 26a505782..20269a769 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index ad3bfd761..8aadc6d8c 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 805b251b1..64a6e639c 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index e110a553f..cf613bad0 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 3652c429e..036845c13 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 17b6768e5..e32099dd4 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 7a29bd9b7..117ae1a0e 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index 452dff058..fc3959a7e 100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 50c7d7a74..48b0fd49d 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 443334693..8b0379779 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go index ab07c77f9..003d718da 100644 --- a/pkg/sentry/syscalls/linux/sys_rusage.go +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index e679a6694..8aea03abe 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index f08fdf5cb..b4262162a 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index 86f850ef1..5bd61ab87 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index a0d3a73c5..d0eceac7c 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index a539354c5..7fbeb4fcd 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index c8748958a..69862f110 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 49c225011..10fc201ef 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 68488330f..4352482fb 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 6f7acf98f..ecf88edc1 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go index 7193b7aed..9efc58d34 100644 --- a/pkg/sentry/syscalls/linux/sys_syslog.go +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index ddcb5b789..23c2f7035 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 063fbb106..b4f2609c0 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index 6baf4599b..04ea7a4e9 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index f70d13682..ec0155cbb 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go index 8ea78093b..1e8312e00 100644 --- a/pkg/sentry/syscalls/linux/sys_tls.go +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index f7545b965..fa81fe10e 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index e405608c4..1da72d606 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 752ec326d..fa6fcdc0b 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 425ce900c..5d10b3824 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index a98bcd7de..c27e391c9 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/calibrated_clock_test.go b/pkg/sentry/time/calibrated_clock_test.go index a9237630e..d6622bfe2 100644 --- a/pkg/sentry/time/calibrated_clock_test.go +++ b/pkg/sentry/time/calibrated_clock_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/clock_id.go b/pkg/sentry/time/clock_id.go index 1317a5dad..724f59dd9 100644 --- a/pkg/sentry/time/clock_id.go +++ b/pkg/sentry/time/clock_id.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/clocks.go b/pkg/sentry/time/clocks.go index e26386520..837e86094 100644 --- a/pkg/sentry/time/clocks.go +++ b/pkg/sentry/time/clocks.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/muldiv_amd64.s b/pkg/sentry/time/muldiv_amd64.s index bfcb8c724..028c6684e 100644 --- a/pkg/sentry/time/muldiv_amd64.s +++ b/pkg/sentry/time/muldiv_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/muldiv_arm64.s b/pkg/sentry/time/muldiv_arm64.s index 5fa82a136..5ad57a8a3 100644 --- a/pkg/sentry/time/muldiv_arm64.s +++ b/pkg/sentry/time/muldiv_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go index 8568b1193..63cf7c4a3 100644 --- a/pkg/sentry/time/parameters.go +++ b/pkg/sentry/time/parameters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go index 4a0c4e880..e1b9084ac 100644 --- a/pkg/sentry/time/parameters_test.go +++ b/pkg/sentry/time/parameters_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler.go b/pkg/sentry/time/sampler.go index 445690d49..2140a99b7 100644 --- a/pkg/sentry/time/sampler.go +++ b/pkg/sentry/time/sampler.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler_test.go b/pkg/sentry/time/sampler_test.go index ec0e442b6..3e70a1134 100644 --- a/pkg/sentry/time/sampler_test.go +++ b/pkg/sentry/time/sampler_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/sampler_unsafe.go b/pkg/sentry/time/sampler_unsafe.go index 0f8eb4fc8..e76180217 100644 --- a/pkg/sentry/time/sampler_unsafe.go +++ b/pkg/sentry/time/sampler_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/tsc_amd64.s b/pkg/sentry/time/tsc_amd64.s index e53d477f7..6a8eed664 100644 --- a/pkg/sentry/time/tsc_amd64.s +++ b/pkg/sentry/time/tsc_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/time/tsc_arm64.s b/pkg/sentry/time/tsc_arm64.s index c1c9760ef..da9fa4112 100644 --- a/pkg/sentry/time/tsc_arm64.s +++ b/pkg/sentry/time/tsc_arm64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/unimpl/events.go b/pkg/sentry/unimpl/events.go index f78f8c981..d92766e2d 100644 --- a/pkg/sentry/unimpl/events.go +++ b/pkg/sentry/unimpl/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/unimpl/unimplemented_syscall.proto b/pkg/sentry/unimpl/unimplemented_syscall.proto index 41579b016..0d7a94be7 100644 --- a/pkg/sentry/unimpl/unimplemented_syscall.proto +++ b/pkg/sentry/unimpl/unimplemented_syscall.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go index 399d98c29..e55b89689 100644 --- a/pkg/sentry/uniqueid/context.go +++ b/pkg/sentry/uniqueid/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/usage/cpu.go b/pkg/sentry/usage/cpu.go index cbd7cfe19..bfc282d69 100644 --- a/pkg/sentry/usage/cpu.go +++ b/pkg/sentry/usage/cpu.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/io.go b/pkg/sentry/usage/io.go index 8e27a0a88..dfcd3a49d 100644 --- a/pkg/sentry/usage/io.go +++ b/pkg/sentry/usage/io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 5be9ed9c6..c316f1597 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/memory_unsafe.go b/pkg/sentry/usage/memory_unsafe.go index a3ae668a5..9e0014ca0 100644 --- a/pkg/sentry/usage/memory_unsafe.go +++ b/pkg/sentry/usage/memory_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usage/usage.go b/pkg/sentry/usage/usage.go index ab327f8e2..e3d33a965 100644 --- a/pkg/sentry/usage/usage.go +++ b/pkg/sentry/usage/usage.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go index 9e6a27bcf..9c1742a59 100644 --- a/pkg/sentry/usermem/access_type.go +++ b/pkg/sentry/usermem/access_type.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go index 2a75aa60c..e79210804 100644 --- a/pkg/sentry/usermem/addr.go +++ b/pkg/sentry/usermem/addr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr_range_seq_test.go b/pkg/sentry/usermem/addr_range_seq_test.go index bd6a1ec8a..82f735026 100644 --- a/pkg/sentry/usermem/addr_range_seq_test.go +++ b/pkg/sentry/usermem/addr_range_seq_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/addr_range_seq_unsafe.go b/pkg/sentry/usermem/addr_range_seq_unsafe.go index f5fd446fa..c09337c15 100644 --- a/pkg/sentry/usermem/addr_range_seq_unsafe.go +++ b/pkg/sentry/usermem/addr_range_seq_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go index 274f568d0..f98d82168 100644 --- a/pkg/sentry/usermem/bytes_io.go +++ b/pkg/sentry/usermem/bytes_io.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go index 7add8bc82..bb49d2ff3 100644 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ b/pkg/sentry/usermem/bytes_io_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 4c7d5014a..31e4d6ada 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_arm64.go b/pkg/sentry/usermem/usermem_arm64.go index 7fd4ce963..fdfc30a66 100644 --- a/pkg/sentry/usermem/usermem_arm64.go +++ b/pkg/sentry/usermem/usermem_arm64.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go index 1991a9641..4a07118b7 100644 --- a/pkg/sentry/usermem/usermem_test.go +++ b/pkg/sentry/usermem/usermem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/sentry/usermem/usermem_unsafe.go b/pkg/sentry/usermem/usermem_unsafe.go index 3895e7871..876783e78 100644 --- a/pkg/sentry/usermem/usermem_unsafe.go +++ b/pkg/sentry/usermem/usermem_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/usermem/usermem_x86.go b/pkg/sentry/usermem/usermem_x86.go index 9ec90f9ff..8059b72d2 100644 --- a/pkg/sentry/usermem/usermem_x86.go +++ b/pkg/sentry/usermem/usermem_x86.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index b4f1e3a4f..2fc4472dd 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_amd64.s b/pkg/sleep/commit_amd64.s index d08df7f37..bc4ac2c3c 100644 --- a/pkg/sleep/commit_amd64.s +++ b/pkg/sleep/commit_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_asm.go b/pkg/sleep/commit_asm.go index 90eef4cbc..35e2cc337 100644 --- a/pkg/sleep/commit_asm.go +++ b/pkg/sleep/commit_asm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/commit_noasm.go b/pkg/sleep/commit_noasm.go index 967d22e24..686b1da3d 100644 --- a/pkg/sleep/commit_noasm.go +++ b/pkg/sleep/commit_noasm.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/empty.s b/pkg/sleep/empty.s index 85d52cd9c..fb37360ac 100644 --- a/pkg/sleep/empty.s +++ b/pkg/sleep/empty.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go index 8feb9ffc2..130806c86 100644 --- a/pkg/sleep/sleep_test.go +++ b/pkg/sleep/sleep_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go index 45fb6f0ea..62e0abc34 100644 --- a/pkg/sleep/sleep_unsafe.go +++ b/pkg/sleep/sleep_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/decode.go b/pkg/state/decode.go index 54b5ad8b8..73a59f871 100644 --- a/pkg/state/decode.go +++ b/pkg/state/decode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/encode.go b/pkg/state/encode.go index fe8512bbf..b0714170b 100644 --- a/pkg/state/encode.go +++ b/pkg/state/encode.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/encode_unsafe.go b/pkg/state/encode_unsafe.go index be94742a8..457e6dbb7 100644 --- a/pkg/state/encode_unsafe.go +++ b/pkg/state/encode_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/map.go b/pkg/state/map.go index 0035d7250..1fb9b47b8 100644 --- a/pkg/state/map.go +++ b/pkg/state/map.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/object.proto b/pkg/state/object.proto index d3b46ea97..952289069 100644 --- a/pkg/state/object.proto +++ b/pkg/state/object.proto @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/printer.go b/pkg/state/printer.go index aee4b69fb..5174c3ba3 100644 --- a/pkg/state/printer.go +++ b/pkg/state/printer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/state/state.go b/pkg/state/state.go index 4486f83a7..cf7df803a 100644 --- a/pkg/state/state.go +++ b/pkg/state/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/state_test.go b/pkg/state/state_test.go index 22bcad9e1..7c24bbcda 100644 --- a/pkg/state/state_test.go +++ b/pkg/state/state_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/statefile/statefile.go b/pkg/state/statefile/statefile.go index c21e3bb0e..ad4e3b43e 100644 --- a/pkg/state/statefile/statefile.go +++ b/pkg/state/statefile/statefile.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/statefile/statefile_test.go b/pkg/state/statefile/statefile_test.go index b4f400e01..60b769895 100644 --- a/pkg/state/statefile/statefile_test.go +++ b/pkg/state/statefile/statefile_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/state/stats.go b/pkg/state/stats.go index 17ca258fc..eb51cda47 100644 --- a/pkg/state/stats.go +++ b/pkg/state/stats.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/syserr/host_linux.go b/pkg/syserr/host_linux.go index 74bbe9f5b..fc6ef60a1 100644 --- a/pkg/syserr/host_linux.go +++ b/pkg/syserr/host_linux.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index 1a23919ef..bd489b424 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserr/syserr.go b/pkg/syserr/syserr.go index 232634dd4..4ddbd3322 100644 --- a/pkg/syserr/syserr.go +++ b/pkg/syserr/syserr.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 5558cccff..345653544 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/syserror/syserror_test.go b/pkg/syserror/syserror_test.go index 0f0da5781..f2a10ee7b 100644 --- a/pkg/syserror/syserror_test.go +++ b/pkg/syserror/syserror_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index 628e28f57..df8bf435d 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index e84f73feb..2c81c5697 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go index d3a9a0f88..43cbbc74c 100644 --- a/pkg/tcpip/buffer/prependable.go +++ b/pkg/tcpip/buffer/prependable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index 43cbb9461..1a9d40778 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go index 74a0a96fc..ebc3a17b7 100644 --- a/pkg/tcpip/buffer/view_test.go +++ b/pkg/tcpip/buffer/view_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index 5dfb3ca1d..6e7edf3ab 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/hash/jenkins/jenkins.go b/pkg/tcpip/hash/jenkins/jenkins.go index e66d5f12b..52c22230e 100644 --- a/pkg/tcpip/hash/jenkins/jenkins.go +++ b/pkg/tcpip/hash/jenkins/jenkins.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/hash/jenkins/jenkins_test.go b/pkg/tcpip/hash/jenkins/jenkins_test.go index 9d86174aa..4c78b5808 100644 --- a/pkg/tcpip/hash/jenkins/jenkins_test.go +++ b/pkg/tcpip/hash/jenkins/jenkins_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/arp.go b/pkg/tcpip/header/arp.go index 22b259ccb..55fe7292c 100644 --- a/pkg/tcpip/header/arp.go +++ b/pkg/tcpip/header/arp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go index 2e8c65fac..2eaa7938a 100644 --- a/pkg/tcpip/header/checksum.go +++ b/pkg/tcpip/header/checksum.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index 77365bc41..76143f454 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/gue.go b/pkg/tcpip/header/gue.go index 2ad13955a..10d358c0e 100644 --- a/pkg/tcpip/header/gue.go +++ b/pkg/tcpip/header/gue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index 3ac89cdae..782e1053c 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go index e317975e8..d0b10d849 100644 --- a/pkg/tcpip/header/icmpv6.go +++ b/pkg/tcpip/header/icmpv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/header/interfaces.go b/pkg/tcpip/header/interfaces.go index ac327d8a5..fb250ea30 100644 --- a/pkg/tcpip/header/interfaces.go +++ b/pkg/tcpip/header/interfaces.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index c3b8fb00e..96e461491 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 3d24736c7..66820a466 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipv6_fragment.go b/pkg/tcpip/header/ipv6_fragment.go index e36d5177b..6d896355a 100644 --- a/pkg/tcpip/header/ipv6_fragment.go +++ b/pkg/tcpip/header/ipv6_fragment.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/ipversion_test.go b/pkg/tcpip/header/ipversion_test.go index 8301ba5cf..0c830180e 100644 --- a/pkg/tcpip/header/ipversion_test.go +++ b/pkg/tcpip/header/ipversion_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go index e656ebb15..0cd89b992 100644 --- a/pkg/tcpip/header/tcp.go +++ b/pkg/tcpip/header/tcp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/tcp_test.go b/pkg/tcpip/header/tcp_test.go index 7cd98df3b..9a2b99489 100644 --- a/pkg/tcpip/header/tcp_test.go +++ b/pkg/tcpip/header/tcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go index e8c860436..2205fec18 100644 --- a/pkg/tcpip/header/udp.go +++ b/pkg/tcpip/header/udp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index f7501a1bc..ee9dd8700 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 8f4d67074..4da376774 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index c8b037d57..31138e4ac 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go index 36e7fe5a9..97a477b61 100644 --- a/pkg/tcpip/link/fdbased/endpoint_unsafe.go +++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go index f1e71c233..430c85a42 100644 --- a/pkg/tcpip/link/fdbased/mmap.go +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go index e5ac7996d..135da2498 100644 --- a/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go +++ b/pkg/tcpip/link/fdbased/mmap_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 2dc4bcfda..2c1148123 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index b3e71c7fc..be07b7c29 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go index 031449a05..5d40dfacc 100644 --- a/pkg/tcpip/link/muxed/injectable_test.go +++ b/pkg/tcpip/link/muxed/injectable_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s index 9dade5421..b54131573 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go index 3ba96a123..0b51982c6 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go index 94ddad8ea..4eab77c74 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go index 7359849b1..8bde41637 100644 --- a/pkg/tcpip/link/rawfile/errors.go +++ b/pkg/tcpip/link/rawfile/errors.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index fe2779125..86db7a487 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe.go b/pkg/tcpip/link/sharedmem/pipe/pipe.go index e014324cc..74c9f0311 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go index 30742ccb1..59ef69a8b 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go index f491d74a2..62d17029e 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/sharedmem/pipe/rx.go b/pkg/tcpip/link/sharedmem/pipe/rx.go index 8d641c76f..f22e533ac 100644 --- a/pkg/tcpip/link/sharedmem/pipe/rx.go +++ b/pkg/tcpip/link/sharedmem/pipe/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/pipe/tx.go b/pkg/tcpip/link/sharedmem/pipe/tx.go index e75175d98..9841eb231 100644 --- a/pkg/tcpip/link/sharedmem/pipe/tx.go +++ b/pkg/tcpip/link/sharedmem/pipe/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/queue_test.go b/pkg/tcpip/link/sharedmem/queue/queue_test.go index 391165bc3..d3f8f4b8b 100644 --- a/pkg/tcpip/link/sharedmem/queue/queue_test.go +++ b/pkg/tcpip/link/sharedmem/queue/queue_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/queue/rx.go b/pkg/tcpip/link/sharedmem/queue/rx.go index d3a5da08a..d9aecf2d9 100644 --- a/pkg/tcpip/link/sharedmem/queue/rx.go +++ b/pkg/tcpip/link/sharedmem/queue/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/sharedmem/queue/tx.go b/pkg/tcpip/link/sharedmem/queue/tx.go index 845108db1..a24dccd11 100644 --- a/pkg/tcpip/link/sharedmem/queue/tx.go +++ b/pkg/tcpip/link/sharedmem/queue/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go index 3eeab769e..215cb607f 100644 --- a/pkg/tcpip/link/sharedmem/rx.go +++ b/pkg/tcpip/link/sharedmem/rx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 6e6aa5a13..e34b780f8 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 1f44e224c..65b9d7085 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go index b91adbaf7..f7e816a41 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sharedmem/tx.go b/pkg/tcpip/link/sharedmem/tx.go index 37da34831..ac3577aa6 100644 --- a/pkg/tcpip/link/sharedmem/tx.go +++ b/pkg/tcpip/link/sharedmem/tx.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sniffer/pcap.go b/pkg/tcpip/link/sniffer/pcap.go index 3d0d8d852..c16c19647 100644 --- a/pkg/tcpip/link/sniffer/pcap.go +++ b/pkg/tcpip/link/sniffer/pcap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 462a6e3a3..e87ae07d7 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/tun/tun_unsafe.go b/pkg/tcpip/link/tun/tun_unsafe.go index e4c589dda..09ca9b527 100644 --- a/pkg/tcpip/link/tun/tun_unsafe.go +++ b/pkg/tcpip/link/tun/tun_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index bd9f9845b..21690a226 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index a2df6be95..62054fb7f 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 975919e80..a3f2bce3e 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 14b9cb8b6..1b971b1a3 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/network/fragmentation/frag_heap.go b/pkg/tcpip/network/fragmentation/frag_heap.go index 55615c8e6..9ad3e5a8a 100644 --- a/pkg/tcpip/network/fragmentation/frag_heap.go +++ b/pkg/tcpip/network/fragmentation/frag_heap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/frag_heap_test.go b/pkg/tcpip/network/fragmentation/frag_heap_test.go index 1b1b72e88..3a2486ba8 100644 --- a/pkg/tcpip/network/fragmentation/frag_heap_test.go +++ b/pkg/tcpip/network/fragmentation/frag_heap_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index a5dda0398..e90edb375 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go index 5bf3463a9..99ded68a3 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation_test.go +++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index c9ad2bef6..04f9ab964 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go index a2bc9707a..7eee0710d 100644 --- a/pkg/tcpip/network/fragmentation/reassembler_test.go +++ b/pkg/tcpip/network/fragmentation/reassembler_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/hash/hash.go b/pkg/tcpip/network/hash/hash.go index 07960ddf0..0c91905dc 100644 --- a/pkg/tcpip/network/hash/hash.go +++ b/pkg/tcpip/network/hash/hash.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 522009fac..4b822e2c6 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index 1c3acda4b..9cb81245a 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index cbdca98a5..c6af0db79 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 42e85564e..146143ab3 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index be28be36d..9c011e107 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index 8b57a0641..d8737a616 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index 9a743ea80..4b8cd496b 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index d212a5792..a1712b590 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go index 01e7320b4..8466c661b 100644 --- a/pkg/tcpip/ports/ports_test.go +++ b/pkg/tcpip/ports/ports_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go index cf8900c4d..1681de56e 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/main.go +++ b/pkg/tcpip/sample/tun_tcp_connect/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go index da6202f97..642607f83 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/main.go +++ b/pkg/tcpip/sample/tun_tcp_echo/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/seqnum/seqnum.go b/pkg/tcpip/seqnum/seqnum.go index f2b988839..b40a3c212 100644 --- a/pkg/tcpip/seqnum/seqnum.go +++ b/pkg/tcpip/seqnum/seqnum.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 40e4bdb4a..42b9768ae 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index 77a09ca86..91b2ffea8 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index c18571b0f..8008d9870 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index 6e1660051..c70533a35 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 8ae562dcd..3d4c282a9 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index cb9ffe9c2..f204ca790 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go index 3d7e4b719..dfec4258a 100644 --- a/pkg/tcpip/stack/stack_global_state.go +++ b/pkg/tcpip/stack/stack_global_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index b5375df3c..351f63221 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index a8ac18e72..e8b562ad9 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index 2df974bf2..8d74f1543 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index b09137f08..9367c8c02 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go index 1f7b04398..ebb1c1b56 100644 --- a/pkg/tcpip/tcpip_test.go +++ b/pkg/tcpip/tcpip_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/time.s b/pkg/tcpip/time.s index 85d52cd9c..fb37360ac 100644 --- a/pkg/tcpip/time.s +++ b/pkg/tcpip/time.s @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 7ec5741af..1a307483b 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index 8f2e3aa20..00840cfcf 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/icmp/endpoint_state.go b/pkg/tcpip/transport/icmp/endpoint_state.go index 8a7909246..332b3cd33 100644 --- a/pkg/tcpip/transport/icmp/endpoint_state.go +++ b/pkg/tcpip/transport/icmp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index 09ee2f892..954fde9d8 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/raw/raw.go b/pkg/tcpip/transport/raw/raw.go index f0f60ce91..7004c7ff4 100644 --- a/pkg/tcpip/transport/raw/raw.go +++ b/pkg/tcpip/transport/raw/raw.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/raw/state.go b/pkg/tcpip/transport/raw/state.go index e3891a8b8..e8907ebb1 100644 --- a/pkg/tcpip/transport/raw/state.go +++ b/pkg/tcpip/transport/raw/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index a3894ed8f..e506d7133 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 6c4a4d95e..eaa67aeb7 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/cubic.go b/pkg/tcpip/transport/tcp/cubic.go index 003525d86..e618cd2b9 100644 --- a/pkg/tcpip/transport/tcp/cubic.go +++ b/pkg/tcpip/transport/tcp/cubic.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go index 2886cc707..43bcfa070 100644 --- a/pkg/tcpip/transport/tcp/dual_stack_test.go +++ b/pkg/tcpip/transport/tcp/dual_stack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 09eff5be1..982f491cc 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 7f9dabb4d..27b0be046 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 6a7efaf1d..e088e24cb 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index b5fb160bc..b86473891 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index fa6bdddba..b08a0e356 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/reno.go b/pkg/tcpip/transport/tcp/reno.go index e4f8b7d5a..f83ebc717 100644 --- a/pkg/tcpip/transport/tcp/reno.go +++ b/pkg/tcpip/transport/tcp/reno.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/sack.go b/pkg/tcpip/transport/tcp/sack.go index 24e48fe7b..6a013d99b 100644 --- a/pkg/tcpip/transport/tcp/sack.go +++ b/pkg/tcpip/transport/tcp/sack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go index 21878ad82..99560d5b4 100644 --- a/pkg/tcpip/transport/tcp/sack_scoreboard.go +++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard_test.go b/pkg/tcpip/transport/tcp/sack_scoreboard_test.go index 3cf2ff451..8f6890cdf 100644 --- a/pkg/tcpip/transport/tcp/sack_scoreboard_test.go +++ b/pkg/tcpip/transport/tcp/sack_scoreboard_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index c603fe713..187effb6b 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go index 98422fadf..9fd061d7d 100644 --- a/pkg/tcpip/transport/tcp/segment_heap.go +++ b/pkg/tcpip/transport/tcp/segment_heap.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index 0c637d7ad..3b020e580 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go index 68b049f06..dd7e14aa6 100644 --- a/pkg/tcpip/transport/tcp/segment_state.go +++ b/pkg/tcpip/transport/tcp/segment_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 6317748cf..50743670e 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go index 86bbd643f..12eff8afc 100644 --- a/pkg/tcpip/transport/tcp/snd_state.go +++ b/pkg/tcpip/transport/tcp/snd_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index 06b0702c5..dbfbd5c4f 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index c5732ad1c..a8b290dae 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go index 87c640967..039bbcfba 100644 --- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 6e2fed880..fa721a7f8 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/tcp/timer.go b/pkg/tcpip/transport/tcp/timer.go index 38240d2d5..fc1c7cbd2 100644 --- a/pkg/tcpip/transport/tcp/timer.go +++ b/pkg/tcpip/transport/tcp/timer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go index b94568fb1..f1dcd36d5 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go index aaeae9b18..435e136de 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 1f9251de3..db65a4e88 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go index b2daaf751..163dcbc13 100644 --- a/pkg/tcpip/transport/udp/endpoint_state.go +++ b/pkg/tcpip/transport/udp/endpoint_state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go index d80c47e34..25bdd2929 100644 --- a/pkg/tcpip/transport/udp/forwarder.go +++ b/pkg/tcpip/transport/udp/forwarder.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 616a9f388..8b47cce17 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 2f4e94c58..86a8fa19b 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tmutex/tmutex.go b/pkg/tmutex/tmutex.go index df61d89f5..c4685020d 100644 --- a/pkg/tmutex/tmutex.go +++ b/pkg/tmutex/tmutex.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go index a4537cb3b..ce34c7962 100644 --- a/pkg/tmutex/tmutex_test.go +++ b/pkg/tmutex/tmutex_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet.go b/pkg/unet/unet.go index 114fb8c5b..2aa1af4ff 100644 --- a/pkg/unet/unet.go +++ b/pkg/unet/unet.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go index db5485539..763b23c7c 100644 --- a/pkg/unet/unet_test.go +++ b/pkg/unet/unet_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/unet/unet_unsafe.go b/pkg/unet/unet_unsafe.go index 1d6ec286c..fa0916439 100644 --- a/pkg/unet/unet_unsafe.go +++ b/pkg/unet/unet_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index 719f0e92f..0f155ec74 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/pkg/urpc/urpc_test.go b/pkg/urpc/urpc_test.go index f1b9a85ca..5bf2c5ed2 100644 --- a/pkg/urpc/urpc_test.go +++ b/pkg/urpc/urpc_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index a6c9dff3c..8a65ed164 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/waiter/waiter_test.go b/pkg/waiter/waiter_test.go index 60853f9c1..c1b94a4f3 100644 --- a/pkg/waiter/waiter_test.go +++ b/pkg/waiter/waiter_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index b3499bcde..c1b33c551 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 0c9472f18..99df5e614 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go index f1940dd72..ccec3d20c 100644 --- a/runsc/boot/compat_test.go +++ b/runsc/boot/compat_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/config.go b/runsc/boot/config.go index ba47effc1..b6771de30 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 712c50ee9..ab7c58838 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go index d224d08b7..79f7387ac 100644 --- a/runsc/boot/debug.go +++ b/runsc/boot/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/events.go b/runsc/boot/events.go index 717adfedd..ffd99f5e9 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index a3d21d963..4e428b49c 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 9c72e3b1a..652da1cef 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go index 67f3101fe..5c5ec4e06 100644 --- a/runsc/boot/filter/extra_filters.go +++ b/runsc/boot/filter/extra_filters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go index fb95283ab..ac5a0f1aa 100644 --- a/runsc/boot/filter/extra_filters_msan.go +++ b/runsc/boot/filter/extra_filters_msan.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index 02a122c95..ba3c1ce87 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index fb197f9b1..17479e0dd 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 07061b9b3..aeb1c52cc 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 32e62cdf7..3364aa5e6 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 75ec19c32..0b5be0a42 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 01578cfc5..9a864ad3f 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 35baa36ad..598ec969e 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index 028bcc1f4..19c7f8fbd 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 2b338b6c6..7431b17d6 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index ecc184f74..548c80e9a 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index ff2fa2fb9..ac937f7bc 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go index e5da021e5..312e5b471 100644 --- a/runsc/cmd/capability.go +++ b/runsc/cmd/capability.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index dd278b32d..ee74d33d8 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index f722df055..96d3c3378 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go index ed1dafef1..1a774db04 100644 --- a/runsc/cmd/chroot.go +++ b/runsc/cmd/chroot.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go index 208cf5304..aa7b1a636 100644 --- a/runsc/cmd/cmd.go +++ b/runsc/cmd/cmd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 30c8fa283..629c198fd 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 3ee9a9b49..000f694c7 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 3206b267a..9039723e9 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go index 4a5b4774a..45fc91016 100644 --- a/runsc/cmd/delete_test.go +++ b/runsc/cmd/delete_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 343461130..67d415733 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index 208d2f74b..c6bc8fc3a 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 718d01067..ad2508405 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go index 686c5e150..6f0f258c0 100644 --- a/runsc/cmd/exec_test.go +++ b/runsc/cmd/exec_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 82487887c..bccb29397 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/gofer_test.go b/runsc/cmd/gofer_test.go index 8e692feb9..cbea7f127 100644 --- a/runsc/cmd/gofer_test.go +++ b/runsc/cmd/gofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index e67f82473..aed5f3291 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index 1dcea2af0..1f5ca2473 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go index 1276f0dbd..0e9ef7fa5 100644 --- a/runsc/cmd/path.go +++ b/runsc/cmd/path.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index 2c93e5f3e..11b36aa10 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index 060d796f2..3a3e6f17a 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 66b23c38e..27b06713a 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index 5551d1450..9a2ade41e 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index be1c1b678..4d5f5c139 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index 063bd39c5..344da13ba 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 9e2e0c11d..657726251 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index c3ef65ab5..f0d449b19 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 6498dd15c..a55a682f3 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/console/console.go b/runsc/console/console.go index 2eb9a8807..64b23639a 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 0b0dfb4cb..b8af27c15 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/container.go b/runsc/container/container.go index a30c217f7..884bbc0fb 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 603c4d929..9458dbb90 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/hook.go b/runsc/container/hook.go index 6b9e5550a..acae6781e 100644 --- a/runsc/container/hook.go +++ b/runsc/container/hook.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 8922e6dbe..e554237cf 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go index 8f81ed630..9d5a592a5 100644 --- a/runsc/container/shared_volume_test.go +++ b/runsc/container/shared_volume_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/status.go b/runsc/container/status.go index 234ffb0dd..91d9112f1 100644 --- a/runsc/container/status.go +++ b/runsc/container/status.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index b5071ada6..62923f1ef 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 75a087848..a1ad49fb2 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go index 67f3101fe..5c5ec4e06 100644 --- a/runsc/fsgofer/filter/extra_filters.go +++ b/runsc/fsgofer/filter/extra_filters.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go index 7e142b790..553060bc3 100644 --- a/runsc/fsgofer/filter/extra_filters_msan.go +++ b/runsc/fsgofer/filter/extra_filters_msan.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go index 3cd29472a..28555f898 100644 --- a/runsc/fsgofer/filter/extra_filters_race.go +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index c120d57a6..ff8154369 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index c964a2a3b..158f22ddc 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index e74df7ede..695836927 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go index 94413db86..58af5e44d 100644 --- a/runsc/fsgofer/fsgofer_unsafe.go +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/main.go b/runsc/main.go index b35726a74..11bc73f75 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 6c6b665a0..2a68d7043 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/network_unsafe.go b/runsc/sandbox/network_unsafe.go index f7447f002..2a2a0fb7e 100644 --- a/runsc/sandbox/network_unsafe.go +++ b/runsc/sandbox/network_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 48a0dafe2..dac35ca0b 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go index 98c3b19c0..1f3afb4e4 100644 --- a/runsc/specutils/fs.go +++ b/runsc/specutils/fs.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 35da789f4..7d194335c 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index ac85bec71..c72207fb4 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index 02af6e6ad..2c86fffe8 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/image.go b/runsc/test/image/image.go index bcb6f876f..297f1ab92 100644 --- a/runsc/test/image/image.go +++ b/runsc/test/image/image.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/test/image/image_test.go b/runsc/test/image/image_test.go index f7e750d71..0c45602f9 100644 --- a/runsc/test/image/image_test.go +++ b/runsc/test/image/image_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/image/mysql.sql b/runsc/test/image/mysql.sql index c1271e719..51554b98d 100644 --- a/runsc/test/image/mysql.sql +++ b/runsc/test/image/mysql.sql @@ -1,4 +1,4 @@ -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/image/ruby.rb b/runsc/test/image/ruby.rb index 25d1ac129..aced49c6d 100644 --- a/runsc/test/image/ruby.rb +++ b/runsc/test/image/ruby.rb @@ -1,4 +1,4 @@ -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/image/ruby.sh b/runsc/test/image/ruby.sh index d3a9b5656..ebe8d5b0e 100644 --- a/runsc/test/image/ruby.sh +++ b/runsc/test/image/ruby.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/runsc/test/install.sh b/runsc/test/install.sh index 32e1e884e..457df2d26 100755 --- a/runsc/test/install.sh +++ b/runsc/test/install.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index d87957e2d..7af064d79 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/integration.go b/runsc/test/integration/integration.go index e15321c87..4cd5f6c24 100644 --- a/runsc/test/integration/integration.go +++ b/runsc/test/integration/integration.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 4a2770d48..b2e86aacc 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/cgroup_test.go b/runsc/test/root/cgroup_test.go index 91839048c..edb6dee1d 100644 --- a/runsc/test/root/cgroup_test.go +++ b/runsc/test/root/cgroup_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/chroot_test.go b/runsc/test/root/chroot_test.go index 0deca0532..da2f473b9 100644 --- a/runsc/test/root/chroot_test.go +++ b/runsc/test/root/chroot_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/crictl_test.go b/runsc/test/root/crictl_test.go index 37fe53ba3..3cc176104 100644 --- a/runsc/test/root/crictl_test.go +++ b/runsc/test/root/crictl_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/root.go b/runsc/test/root/root.go index 586ea0fe3..349c752cc 100644 --- a/runsc/test/root/root.go +++ b/runsc/test/root/root.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/testdata/busybox.go b/runsc/test/root/testdata/busybox.go index 544571c63..e4dbd2843 100644 --- a/runsc/test/root/testdata/busybox.go +++ b/runsc/test/root/testdata/busybox.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/testdata/containerd_config.go b/runsc/test/root/testdata/containerd_config.go index 949354987..e12f1ec88 100644 --- a/runsc/test/root/testdata/containerd_config.go +++ b/runsc/test/root/testdata/containerd_config.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/runsc/test/root/testdata/httpd.go b/runsc/test/root/testdata/httpd.go index f65b1da5d..45d5e33d4 100644 --- a/runsc/test/root/testdata/httpd.go +++ b/runsc/test/root/testdata/httpd.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/testdata/httpd_mount_paths.go b/runsc/test/root/testdata/httpd_mount_paths.go index 5ca14340e..ac3f4446a 100644 --- a/runsc/test/root/testdata/httpd_mount_paths.go +++ b/runsc/test/root/testdata/httpd_mount_paths.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/root/testdata/sandbox.go b/runsc/test/root/testdata/sandbox.go index 194242a27..0db210370 100644 --- a/runsc/test/root/testdata/sandbox.go +++ b/runsc/test/root/testdata/sandbox.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/crictl.go b/runsc/test/testutil/crictl.go index 84bb4475a..4f9ee0c05 100644 --- a/runsc/test/testutil/crictl.go +++ b/runsc/test/testutil/crictl.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index b651319ed..29ef505b4 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 79f0a8b6b..6a4c045a8 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/test/testutil/testutil_race.go b/runsc/test/testutil/testutil_race.go index 9267af150..86db6ffa1 100644 --- a/runsc/test/testutil/testutil_race.go +++ b/runsc/test/testutil/testutil_race.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/tools/dockercfg/dockercfg.go b/runsc/tools/dockercfg/dockercfg.go index cc7a67816..6fb134558 100644 --- a/runsc/tools/dockercfg/dockercfg.go +++ b/runsc/tools/dockercfg/dockercfg.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/runsc/version.go b/runsc/version.go index 4894f2de6..ce0573a9b 100644 --- a/runsc/version.go +++ b/runsc/version.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/gtest/gtest.go b/test/syscalls/gtest/gtest.go index dfe5037cd..bdec8eb07 100644 --- a/test/syscalls/gtest/gtest.go +++ b/test/syscalls/gtest/gtest.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc index 78baf548e..a7cbee06b 100644 --- a/test/syscalls/linux/32bit.cc +++ b/test/syscalls/linux/32bit.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc index c2bb4a7ce..56377feab 100644 --- a/test/syscalls/linux/accept_bind.cc +++ b/test/syscalls/linux/accept_bind.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc index 1501e526e..b6cdb3f4f 100644 --- a/test/syscalls/linux/accept_bind_stream.cc +++ b/test/syscalls/linux/accept_bind_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/access.cc b/test/syscalls/linux/access.cc index 6ea070a5d..bcc25cef4 100644 --- a/test/syscalls/linux/access.cc +++ b/test/syscalls/linux/access.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/affinity.cc b/test/syscalls/linux/affinity.cc index 81bd9bcb5..f2d8375b6 100644 --- a/test/syscalls/linux/affinity.cc +++ b/test/syscalls/linux/affinity.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc index b96aab9b9..68dc05417 100644 --- a/test/syscalls/linux/aio.cc +++ b/test/syscalls/linux/aio.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/alarm.cc b/test/syscalls/linux/alarm.cc index e0ddbb415..d89269985 100644 --- a/test/syscalls/linux/alarm.cc +++ b/test/syscalls/linux/alarm.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc index 5687ceb86..81bf5a775 100644 --- a/test/syscalls/linux/arch_prctl.cc +++ b/test/syscalls/linux/arch_prctl.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc index a2634a8bf..f246a799e 100644 --- a/test/syscalls/linux/bad.cc +++ b/test/syscalls/linux/bad.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/base_poll_test.cc b/test/syscalls/linux/base_poll_test.cc index bba0108ea..ab7a19dd0 100644 --- a/test/syscalls/linux/base_poll_test.cc +++ b/test/syscalls/linux/base_poll_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/base_poll_test.h b/test/syscalls/linux/base_poll_test.h index 9b9b81933..088831f9f 100644 --- a/test/syscalls/linux/base_poll_test.h +++ b/test/syscalls/linux/base_poll_test.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/bind.cc b/test/syscalls/linux/bind.cc index f5aa9c500..de8cca53b 100644 --- a/test/syscalls/linux/bind.cc +++ b/test/syscalls/linux/bind.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/brk.cc b/test/syscalls/linux/brk.cc index 33d353959..a03a44465 100644 --- a/test/syscalls/linux/brk.cc +++ b/test/syscalls/linux/brk.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/chdir.cc b/test/syscalls/linux/chdir.cc index a4b54f0ee..3182c228b 100644 --- a/test/syscalls/linux/chdir.cc +++ b/test/syscalls/linux/chdir.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc index 2f42fe326..79e98597f 100644 --- a/test/syscalls/linux/chmod.cc +++ b/test/syscalls/linux/chmod.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/chown.cc b/test/syscalls/linux/chown.cc index ad892cf6a..eb1762ddf 100644 --- a/test/syscalls/linux/chown.cc +++ b/test/syscalls/linux/chown.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc index 6c200f63e..a4354ff62 100644 --- a/test/syscalls/linux/chroot.cc +++ b/test/syscalls/linux/chroot.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/clock_getres.cc b/test/syscalls/linux/clock_getres.cc index 8f8842299..c408b936c 100644 --- a/test/syscalls/linux/clock_getres.cc +++ b/test/syscalls/linux/clock_getres.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc index 4ecb5f5b1..082ae1c39 100644 --- a/test/syscalls/linux/clock_gettime.cc +++ b/test/syscalls/linux/clock_gettime.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/clock_nanosleep.cc b/test/syscalls/linux/clock_nanosleep.cc index 61c67a5ff..52a69d230 100644 --- a/test/syscalls/linux/clock_nanosleep.cc +++ b/test/syscalls/linux/clock_nanosleep.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc index 7978845c1..4e0a13f8b 100644 --- a/test/syscalls/linux/concurrency.cc +++ b/test/syscalls/linux/concurrency.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/creat.cc b/test/syscalls/linux/creat.cc index df2cc0d5c..3c270d6da 100644 --- a/test/syscalls/linux/creat.cc +++ b/test/syscalls/linux/creat.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc index a140d3b30..b86ebe233 100644 --- a/test/syscalls/linux/dev.cc +++ b/test/syscalls/linux/dev.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/dup.cc b/test/syscalls/linux/dup.cc index e8de2f4c4..4f773bc75 100644 --- a/test/syscalls/linux/dup.cc +++ b/test/syscalls/linux/dup.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index b4a3bfcba..a4f8f3cec 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc index 8111da30e..5e5c39d44 100644 --- a/test/syscalls/linux/eventfd.cc +++ b/test/syscalls/linux/eventfd.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc index 3f0aa8bf1..0da4c817d 100644 --- a/test/syscalls/linux/exceptions.cc +++ b/test/syscalls/linux/exceptions.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc index 30bc4b608..06c322a99 100644 --- a/test/syscalls/linux/exec.cc +++ b/test/syscalls/linux/exec.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec.h b/test/syscalls/linux/exec.h index b82bfffd1..5c0f7e654 100644 --- a/test/syscalls/linux/exec.h +++ b/test/syscalls/linux/exec.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec_assert_closed_workload.cc b/test/syscalls/linux/exec_assert_closed_workload.cc index 4448431e1..95643618d 100644 --- a/test/syscalls/linux/exec_assert_closed_workload.cc +++ b/test/syscalls/linux/exec_assert_closed_workload.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/exec_basic_workload.cc b/test/syscalls/linux/exec_basic_workload.cc index d4bdf511f..1bbd6437e 100644 --- a/test/syscalls/linux/exec_basic_workload.cc +++ b/test/syscalls/linux/exec_basic_workload.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc index c10d85398..bdd6eb10b 100644 --- a/test/syscalls/linux/exec_binary.cc +++ b/test/syscalls/linux/exec_binary.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc index b9a4ac749..b3fbd5042 100644 --- a/test/syscalls/linux/exec_proc_exe_workload.cc +++ b/test/syscalls/linux/exec_proc_exe_workload.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exec_state_workload.cc b/test/syscalls/linux/exec_state_workload.cc index b66e22565..725c2977f 100644 --- a/test/syscalls/linux/exec_state_workload.cc +++ b/test/syscalls/linux/exec_state_workload.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/exit.cc b/test/syscalls/linux/exit.cc index 7246a7b3b..99de2b376 100644 --- a/test/syscalls/linux/exit.cc +++ b/test/syscalls/linux/exit.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/exit_script.sh b/test/syscalls/linux/exit_script.sh index f014fcf99..527518e06 100755 --- a/test/syscalls/linux/exit_script.sh +++ b/test/syscalls/linux/exit_script.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fadvise64.cc b/test/syscalls/linux/fadvise64.cc index 041e8b7b6..2af7aa6d9 100644 --- a/test/syscalls/linux/fadvise64.cc +++ b/test/syscalls/linux/fadvise64.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc index e51538734..61b8acc7a 100644 --- a/test/syscalls/linux/fallocate.cc +++ b/test/syscalls/linux/fallocate.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fault.cc b/test/syscalls/linux/fault.cc index cfa7d0d1f..f6e19026f 100644 --- a/test/syscalls/linux/fault.cc +++ b/test/syscalls/linux/fault.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fchdir.cc b/test/syscalls/linux/fchdir.cc index 2b13e36c3..08bcae1e8 100644 --- a/test/syscalls/linux/fchdir.cc +++ b/test/syscalls/linux/fchdir.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 32a90a163..2f8e7c9dd 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index 43f568111..b5b972c07 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc index 1388d3839..d89cfcbd7 100644 --- a/test/syscalls/linux/flock.cc +++ b/test/syscalls/linux/flock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc index 73ac885b5..dd6e1a422 100644 --- a/test/syscalls/linux/fork.cc +++ b/test/syscalls/linux/fork.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fpsig_fork.cc b/test/syscalls/linux/fpsig_fork.cc index e8f1dfa8a..e7e9f06a1 100644 --- a/test/syscalls/linux/fpsig_fork.cc +++ b/test/syscalls/linux/fpsig_fork.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fpsig_nested.cc b/test/syscalls/linux/fpsig_nested.cc index 2fa40b42d..395463aed 100644 --- a/test/syscalls/linux/fpsig_nested.cc +++ b/test/syscalls/linux/fpsig_nested.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/fsync.cc b/test/syscalls/linux/fsync.cc index b34229248..e7e057f06 100644 --- a/test/syscalls/linux/fsync.cc +++ b/test/syscalls/linux/fsync.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc index c7a709a0a..bfec95466 100644 --- a/test/syscalls/linux/futex.cc +++ b/test/syscalls/linux/futex.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/getcpu.cc b/test/syscalls/linux/getcpu.cc index 3a52b25fa..f4d94bd6a 100644 --- a/test/syscalls/linux/getcpu.cc +++ b/test/syscalls/linux/getcpu.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc index e8a7bcd43..d146c8db7 100644 --- a/test/syscalls/linux/getdents.cc +++ b/test/syscalls/linux/getdents.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/getrandom.cc b/test/syscalls/linux/getrandom.cc index be5325497..f97f60029 100644 --- a/test/syscalls/linux/getrandom.cc +++ b/test/syscalls/linux/getrandom.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/getrusage.cc b/test/syscalls/linux/getrusage.cc index 1ae603858..9bdb1e4cd 100644 --- a/test/syscalls/linux/getrusage.cc +++ b/test/syscalls/linux/getrusage.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index b99d339e5..6a3539e22 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc index c7741a177..c525d41d2 100644 --- a/test/syscalls/linux/ioctl.cc +++ b/test/syscalls/linux/ioctl.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc index 0a149c2e5..7612919d4 100644 --- a/test/syscalls/linux/ip_socket_test_util.cc +++ b/test/syscalls/linux/ip_socket_test_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h index cac790e64..6898effb8 100644 --- a/test/syscalls/linux/ip_socket_test_util.h +++ b/test/syscalls/linux/ip_socket_test_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc index ddfbc28fc..57ffd1595 100644 --- a/test/syscalls/linux/itimer.cc +++ b/test/syscalls/linux/itimer.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/kill.cc b/test/syscalls/linux/kill.cc index cd98de41f..18ad923b8 100644 --- a/test/syscalls/linux/kill.cc +++ b/test/syscalls/linux/kill.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc index ed74437bc..a91703070 100644 --- a/test/syscalls/linux/link.cc +++ b/test/syscalls/linux/link.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc index 6a4f1423c..a8af8e545 100644 --- a/test/syscalls/linux/lseek.cc +++ b/test/syscalls/linux/lseek.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc index a79c8c75d..f6ad4d18b 100644 --- a/test/syscalls/linux/madvise.cc +++ b/test/syscalls/linux/madvise.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc index c2513682d..7e103124b 100644 --- a/test/syscalls/linux/memfd.cc +++ b/test/syscalls/linux/memfd.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc index b4b680c34..a6e20f9c3 100644 --- a/test/syscalls/linux/memory_accounting.cc +++ b/test/syscalls/linux/memory_accounting.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc index 9f8033bdf..4ac4cb88f 100644 --- a/test/syscalls/linux/mempolicy.cc +++ b/test/syscalls/linux/mempolicy.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mincore.cc b/test/syscalls/linux/mincore.cc index c572bf5ec..5c1240c89 100644 --- a/test/syscalls/linux/mincore.cc +++ b/test/syscalls/linux/mincore.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc index 50807b68f..cf138d328 100644 --- a/test/syscalls/linux/mkdir.cc +++ b/test/syscalls/linux/mkdir.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc index 361ca299b..b1675b9c7 100644 --- a/test/syscalls/linux/mknod.cc +++ b/test/syscalls/linux/mknod.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc index a492b2404..aee4f7d1a 100644 --- a/test/syscalls/linux/mlock.cc +++ b/test/syscalls/linux/mlock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc index a4fb9d1e0..5b5b4c2e8 100644 --- a/test/syscalls/linux/mmap.cc +++ b/test/syscalls/linux/mmap.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index 201b83e87..3a17672aa 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/mremap.cc b/test/syscalls/linux/mremap.cc index 01116c1ab..7298d4ca8 100644 --- a/test/syscalls/linux/mremap.cc +++ b/test/syscalls/linux/mremap.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc index 5afbfce72..ac7146017 100644 --- a/test/syscalls/linux/msync.cc +++ b/test/syscalls/linux/msync.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/munmap.cc b/test/syscalls/linux/munmap.cc index e20039950..067241f4d 100644 --- a/test/syscalls/linux/munmap.cc +++ b/test/syscalls/linux/munmap.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index 22e4666c2..42646bb02 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc index b2cbd63d1..e5a85ef9d 100644 --- a/test/syscalls/linux/open_create.cc +++ b/test/syscalls/linux/open_create.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc index 71288ebc4..83b1ad4e4 100644 --- a/test/syscalls/linux/partial_bad_buffer.cc +++ b/test/syscalls/linux/partial_bad_buffer.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pause.cc b/test/syscalls/linux/pause.cc index 4e1148c24..8c05efd6f 100644 --- a/test/syscalls/linux/pause.cc +++ b/test/syscalls/linux/pause.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index abd10b11b..8698295b3 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc index cd2161bb1..9e5aa7fd0 100644 --- a/test/syscalls/linux/poll.cc +++ b/test/syscalls/linux/poll.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ppoll.cc b/test/syscalls/linux/ppoll.cc index f8c388c00..8245a11e8 100644 --- a/test/syscalls/linux/ppoll.cc +++ b/test/syscalls/linux/ppoll.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc index 854dec714..bce42dc74 100644 --- a/test/syscalls/linux/prctl.cc +++ b/test/syscalls/linux/prctl.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc index c1b561464..00dd6523e 100644 --- a/test/syscalls/linux/prctl_setuid.cc +++ b/test/syscalls/linux/prctl_setuid.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc index 4e5bcfcde..5e3eb1735 100644 --- a/test/syscalls/linux/pread64.cc +++ b/test/syscalls/linux/pread64.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc index 4a31123d8..eebd129f2 100644 --- a/test/syscalls/linux/preadv.cc +++ b/test/syscalls/linux/preadv.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc index 58a4f9224..aac960130 100644 --- a/test/syscalls/linux/preadv2.cc +++ b/test/syscalls/linux/preadv2.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/priority.cc b/test/syscalls/linux/priority.cc index 3906c7132..1d9bdfa70 100644 --- a/test/syscalls/linux/priority.cc +++ b/test/syscalls/linux/priority.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/priority_execve.cc b/test/syscalls/linux/priority_execve.cc index 5604bd3d0..5cb343bad 100644 --- a/test/syscalls/linux/priority_execve.cc +++ b/test/syscalls/linux/priority_execve.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 7ba274226..654f26242 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc index 6060d0644..03d0665eb 100644 --- a/test/syscalls/linux/proc_net.cc +++ b/test/syscalls/linux/proc_net.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc index ea7c93012..6d745f728 100644 --- a/test/syscalls/linux/proc_net_unix.cc +++ b/test/syscalls/linux/proc_net_unix.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc index cf5c462f3..7f2e8f203 100644 --- a/test/syscalls/linux/proc_pid_smaps.cc +++ b/test/syscalls/linux/proc_pid_smaps.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/proc_pid_uid_gid_map.cc b/test/syscalls/linux/proc_pid_uid_gid_map.cc index 96c58c564..df70b7eb9 100644 --- a/test/syscalls/linux/proc_pid_uid_gid_map.cc +++ b/test/syscalls/linux/proc_pid_uid_gid_map.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pselect.cc b/test/syscalls/linux/pselect.cc index 3294f6c14..4e43c4d7f 100644 --- a/test/syscalls/linux/pselect.cc +++ b/test/syscalls/linux/pselect.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index e0c56f1fc..4c212836c 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc index 5b2dc9ccb..0485d187c 100644 --- a/test/syscalls/linux/pty.cc +++ b/test/syscalls/linux/pty.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc index 485b1e48d..e1603fc2d 100644 --- a/test/syscalls/linux/pwrite64.cc +++ b/test/syscalls/linux/pwrite64.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc index a6949f08e..db519f4e0 100644 --- a/test/syscalls/linux/pwritev2.cc +++ b/test/syscalls/linux/pwritev2.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/raw_socket_ipv4.cc b/test/syscalls/linux/raw_socket_ipv4.cc index 8b8d032cb..e20b5cb50 100644 --- a/test/syscalls/linux/raw_socket_ipv4.cc +++ b/test/syscalls/linux/raw_socket_ipv4.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/read.cc b/test/syscalls/linux/read.cc index eb1b5bc10..4430fa3c2 100644 --- a/test/syscalls/linux/read.cc +++ b/test/syscalls/linux/read.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc index 0b933673a..f327ec3a9 100644 --- a/test/syscalls/linux/readv.cc +++ b/test/syscalls/linux/readv.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc index 349b80d7f..35d2dd9e3 100644 --- a/test/syscalls/linux/readv_common.cc +++ b/test/syscalls/linux/readv_common.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/readv_common.h b/test/syscalls/linux/readv_common.h index e261d545a..b16179fca 100644 --- a/test/syscalls/linux/readv_common.h +++ b/test/syscalls/linux/readv_common.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc index cf22c395e..3c315cc02 100644 --- a/test/syscalls/linux/readv_socket.cc +++ b/test/syscalls/linux/readv_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc index c0cbc7cd9..c9d76c2e2 100644 --- a/test/syscalls/linux/rename.cc +++ b/test/syscalls/linux/rename.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/rlimits.cc b/test/syscalls/linux/rlimits.cc index 7b255d0f6..860f0f688 100644 --- a/test/syscalls/linux/rlimits.cc +++ b/test/syscalls/linux/rlimits.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc index ff948f9d5..81d193ffd 100644 --- a/test/syscalls/linux/rtsignal.cc +++ b/test/syscalls/linux/rtsignal.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sched.cc b/test/syscalls/linux/sched.cc index 60cb6c443..735e99411 100644 --- a/test/syscalls/linux/sched.cc +++ b/test/syscalls/linux/sched.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sched_yield.cc b/test/syscalls/linux/sched_yield.cc index fc45aa5c2..5d24f5b58 100644 --- a/test/syscalls/linux/sched_yield.cc +++ b/test/syscalls/linux/sched_yield.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc index 27740d7ef..e77586852 100644 --- a/test/syscalls/linux/seccomp.cc +++ b/test/syscalls/linux/seccomp.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc index 41e6043cc..88c010aec 100644 --- a/test/syscalls/linux/select.cc +++ b/test/syscalls/linux/select.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc index 1c47b6851..421318fcb 100644 --- a/test/syscalls/linux/semaphore.cc +++ b/test/syscalls/linux/semaphore.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc index 15fd01ff0..2fbb3f4ef 100644 --- a/test/syscalls/linux/sendfile.cc +++ b/test/syscalls/linux/sendfile.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc index e2ccf17ce..66adda515 100644 --- a/test/syscalls/linux/sendfile_socket.cc +++ b/test/syscalls/linux/sendfile_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc index 2c0f9b04a..eb7a3966f 100644 --- a/test/syscalls/linux/shm.cc +++ b/test/syscalls/linux/shm.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigaction.cc b/test/syscalls/linux/sigaction.cc index cdd2dbf31..9a53fd3e0 100644 --- a/test/syscalls/linux/sigaction.cc +++ b/test/syscalls/linux/sigaction.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc index 5741720f4..7d4a12c1d 100644 --- a/test/syscalls/linux/sigaltstack.cc +++ b/test/syscalls/linux/sigaltstack.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigaltstack_check.cc b/test/syscalls/linux/sigaltstack_check.cc index b71f812a8..5ac1b661d 100644 --- a/test/syscalls/linux/sigaltstack_check.cc +++ b/test/syscalls/linux/sigaltstack_check.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc index 1b7cecccb..a47c781ea 100644 --- a/test/syscalls/linux/sigiret.cc +++ b/test/syscalls/linux/sigiret.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc index 1aea1ecb8..654c6a47f 100644 --- a/test/syscalls/linux/sigprocmask.cc +++ b/test/syscalls/linux/sigprocmask.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigstop.cc b/test/syscalls/linux/sigstop.cc index e21d23d51..9c7210e17 100644 --- a/test/syscalls/linux/sigstop.cc +++ b/test/syscalls/linux/sigstop.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc index 1df9c013f..1e5bf5942 100644 --- a/test/syscalls/linux/sigtimedwait.cc +++ b/test/syscalls/linux/sigtimedwait.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc index 639cd4e59..2faf678f7 100644 --- a/test/syscalls/linux/socket_abstract.cc +++ b/test/syscalls/linux/socket_abstract.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc index c1bca467f..00c50d1bf 100644 --- a/test/syscalls/linux/socket_blocking.cc +++ b/test/syscalls/linux/socket_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_blocking.h b/test/syscalls/linux/socket_blocking.h index 5cddee54b..db26e5ef5 100644 --- a/test/syscalls/linux/socket_blocking.h +++ b/test/syscalls/linux/socket_blocking.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc index 2653be158..f7cb72df4 100644 --- a/test/syscalls/linux/socket_filesystem.cc +++ b/test/syscalls/linux/socket_filesystem.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc index d04d5abe0..f99f3fe62 100644 --- a/test/syscalls/linux/socket_generic.cc +++ b/test/syscalls/linux/socket_generic.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_generic.h b/test/syscalls/linux/socket_generic.h index cd826abcf..00ae7bfc3 100644 --- a/test/syscalls/linux/socket_generic.h +++ b/test/syscalls/linux/socket_generic.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index 14d7827c2..f86a0f30c 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc index 9cec7a71d..d7fc20aad 100644 --- a/test/syscalls/linux/socket_ip_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc index 54f00cd9b..5b198f49d 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_generic.h b/test/syscalls/linux/socket_ip_tcp_generic.h index f38500d14..a3eff3c73 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic.h +++ b/test/syscalls/linux/socket_ip_tcp_generic.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc index 1963d5deb..2c6ae17bf 100644 --- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc +++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc index 7e36c35d2..831de53b8 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc index 9e2a18d3e..d1ea8ef12 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc index 54053360f..96c1b3b3d 100644 --- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc +++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc index 5bf1de7c6..251817a9f 100644 --- a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index ac15154f2..044394ba7 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ip_udp_generic.h b/test/syscalls/linux/socket_ip_udp_generic.h index 8b8fc7c6e..106c54e9f 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.h +++ b/test/syscalls/linux/socket_ip_udp_generic.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc index 0e4463649..fc124e9ef 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc index 0c3b669bf..1c3d1c0ad 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc index 7bf8597fe..7554b08d5 100644 --- a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc +++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc index 8e1c13ff4..3a068aacf 100644 --- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h index b23de08d1..fb582b224 100644 --- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc index 773d84b13..040bb176e 100644 --- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc +++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc index c99958ed5..709172580 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.h b/test/syscalls/linux/socket_ipv4_udp_unbound.h index a780c0144..8e07bfbbf 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound.h +++ b/test/syscalls/linux/socket_ipv4_udp_unbound.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc index 9dd9e1bd6..53dcd58cd 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h index 5cf9fa8eb..45e1d37ea 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc index 535a5fa10..ffbb8e6eb 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc index d6a8e428c..cb0105471 100644 --- a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc +++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc index b4e9fe51b..6a5fa8965 100644 --- a/test/syscalls/linux/socket_netdevice.cc +++ b/test/syscalls/linux/socket_netdevice.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc index ed4ae1c71..c8693225f 100644 --- a/test/syscalls/linux/socket_netlink_route.cc +++ b/test/syscalls/linux/socket_netlink_route.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc index edf549544..728d25434 100644 --- a/test/syscalls/linux/socket_netlink_util.cc +++ b/test/syscalls/linux/socket_netlink_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h index 44b1f148c..bea449107 100644 --- a/test/syscalls/linux/socket_netlink_util.h +++ b/test/syscalls/linux/socket_netlink_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_blocking.cc b/test/syscalls/linux/socket_non_blocking.cc index 1bcc6fb7f..73e6dc618 100644 --- a/test/syscalls/linux/socket_non_blocking.cc +++ b/test/syscalls/linux/socket_non_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_blocking.h b/test/syscalls/linux/socket_non_blocking.h index 287e096bb..bd3e02fd2 100644 --- a/test/syscalls/linux/socket_non_blocking.h +++ b/test/syscalls/linux/socket_non_blocking.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc index d170008a4..3c599b6e8 100644 --- a/test/syscalls/linux/socket_non_stream.cc +++ b/test/syscalls/linux/socket_non_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_stream.h b/test/syscalls/linux/socket_non_stream.h index 02dd2a958..469fbe6a2 100644 --- a/test/syscalls/linux/socket_non_stream.h +++ b/test/syscalls/linux/socket_non_stream.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc index 9e92628c3..76127d181 100644 --- a/test/syscalls/linux/socket_non_stream_blocking.cc +++ b/test/syscalls/linux/socket_non_stream_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_non_stream_blocking.h b/test/syscalls/linux/socket_non_stream_blocking.h index bde355452..6e205a039 100644 --- a/test/syscalls/linux/socket_non_stream_blocking.h +++ b/test/syscalls/linux/socket_non_stream_blocking.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc index c8a8ad0f6..0417dd347 100644 --- a/test/syscalls/linux/socket_stream.cc +++ b/test/syscalls/linux/socket_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream.h b/test/syscalls/linux/socket_stream.h index 35e591e17..b837b8f8c 100644 --- a/test/syscalls/linux/socket_stream.h +++ b/test/syscalls/linux/socket_stream.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc index f0f86c01c..8367460d2 100644 --- a/test/syscalls/linux/socket_stream_blocking.cc +++ b/test/syscalls/linux/socket_stream_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream_blocking.h b/test/syscalls/linux/socket_stream_blocking.h index 06113ad03..9fd19ff90 100644 --- a/test/syscalls/linux/socket_stream_blocking.h +++ b/test/syscalls/linux/socket_stream_blocking.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_stream_nonblock.cc b/test/syscalls/linux/socket_stream_nonblock.cc index a3202ffe4..b00748b97 100644 --- a/test/syscalls/linux/socket_stream_nonblock.cc +++ b/test/syscalls/linux/socket_stream_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_stream_nonblock.h b/test/syscalls/linux/socket_stream_nonblock.h index 491f53848..c3b7fad91 100644 --- a/test/syscalls/linux/socket_stream_nonblock.h +++ b/test/syscalls/linux/socket_stream_nonblock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc index 0be23e541..da69de37c 100644 --- a/test/syscalls/linux/socket_test_util.cc +++ b/test/syscalls/linux/socket_test_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h index dfabdf179..058313986 100644 --- a/test/syscalls/linux/socket_test_util.h +++ b/test/syscalls/linux/socket_test_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index fafb23ad1..bb3397fa2 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix.h b/test/syscalls/linux/socket_unix.h index d2a16afb2..3625cc404 100644 --- a/test/syscalls/linux/socket_unix.h +++ b/test/syscalls/linux/socket_unix.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_abstract.cc b/test/syscalls/linux/socket_unix_abstract.cc index c4a3c889c..8241bf997 100644 --- a/test/syscalls/linux/socket_unix_abstract.cc +++ b/test/syscalls/linux/socket_unix_abstract.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc index a69ee027e..9de0f6dfe 100644 --- a/test/syscalls/linux/socket_unix_abstract_nonblock.cc +++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc index 57af118c5..320915b0f 100644 --- a/test/syscalls/linux/socket_unix_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_blocking_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc index 5dd5e6d77..3e0f611d2 100644 --- a/test/syscalls/linux/socket_unix_dgram.cc +++ b/test/syscalls/linux/socket_unix_dgram.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_dgram.h b/test/syscalls/linux/socket_unix_dgram.h index 722a3d8e6..0764ef85b 100644 --- a/test/syscalls/linux/socket_unix_dgram.h +++ b/test/syscalls/linux/socket_unix_dgram.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc index da8f59704..4ba2c80ae 100644 --- a/test/syscalls/linux/socket_unix_dgram_local.cc +++ b/test/syscalls/linux/socket_unix_dgram_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc index 3becb513d..9fe86cee8 100644 --- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc +++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_domain.cc b/test/syscalls/linux/socket_unix_domain.cc index f081c601f..fa3efc7f8 100644 --- a/test/syscalls/linux/socket_unix_domain.cc +++ b/test/syscalls/linux/socket_unix_domain.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_filesystem.cc b/test/syscalls/linux/socket_unix_filesystem.cc index 6a67da75f..5dbe67773 100644 --- a/test/syscalls/linux/socket_unix_filesystem.cc +++ b/test/syscalls/linux/socket_unix_filesystem.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc index c13a1e564..137db53c4 100644 --- a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc +++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc index a565978f9..dafe82494 100644 --- a/test/syscalls/linux/socket_unix_non_stream.cc +++ b/test/syscalls/linux/socket_unix_non_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_non_stream.h b/test/syscalls/linux/socket_unix_non_stream.h index e4214d949..7478ab172 100644 --- a/test/syscalls/linux/socket_unix_non_stream.h +++ b/test/syscalls/linux/socket_unix_non_stream.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc index 6c435669b..98cf1fe8a 100644 --- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc index c575fdcb2..bacfc11e4 100644 --- a/test/syscalls/linux/socket_unix_pair.cc +++ b/test/syscalls/linux/socket_unix_pair.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc index 1ae7f9b5e..583506f08 100644 --- a/test/syscalls/linux/socket_unix_pair_nonblock.cc +++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc index ad0af77e9..6f6367dd5 100644 --- a/test/syscalls/linux/socket_unix_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_seqpacket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_seqpacket.h b/test/syscalls/linux/socket_unix_seqpacket.h index da8eb2b2b..30d9b9edf 100644 --- a/test/syscalls/linux/socket_unix_seqpacket.h +++ b/test/syscalls/linux/socket_unix_seqpacket.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc index e6484d9b4..b903a9e8f 100644 --- a/test/syscalls/linux/socket_unix_seqpacket_local.cc +++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc index 95f454251..659c93945 100644 --- a/test/syscalls/linux/socket_unix_stream.cc +++ b/test/syscalls/linux/socket_unix_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc index ec0fc6955..ce0f1e50d 100644 --- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc +++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc index bf4c5f2eb..6b840189c 100644 --- a/test/syscalls/linux/socket_unix_stream_local.cc +++ b/test/syscalls/linux/socket_unix_stream_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc index df80b105a..ebec4e0ec 100644 --- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc +++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc index b6fe7a9ce..4b5832de8 100644 --- a/test/syscalls/linux/socket_unix_unbound_abstract.cc +++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc index 1ec11a08d..2ddc5c11f 100644 --- a/test/syscalls/linux/socket_unix_unbound_dgram.cc +++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc index d09142aa6..8cb03c450 100644 --- a/test/syscalls/linux/socket_unix_unbound_filesystem.cc +++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc index 21209b244..0575f2e1d 100644 --- a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc +++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc index b95f9569e..091d546b3 100644 --- a/test/syscalls/linux/socket_unix_unbound_stream.cc +++ b/test/syscalls/linux/socket_unix_unbound_stream.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 746318d09..80ba67496 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/stat_times.cc b/test/syscalls/linux/stat_times.cc index 8346e9a8e..9b53739a0 100644 --- a/test/syscalls/linux/stat_times.cc +++ b/test/syscalls/linux/stat_times.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc index e1e7fc707..aca51d30f 100644 --- a/test/syscalls/linux/statfs.cc +++ b/test/syscalls/linux/statfs.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc index 58cf0d014..59fb5dfe6 100644 --- a/test/syscalls/linux/sticky.cc +++ b/test/syscalls/linux/sticky.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index 318917f4b..494072a9b 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sync.cc b/test/syscalls/linux/sync.cc index 5b777b6eb..fe479390d 100644 --- a/test/syscalls/linux/sync.cc +++ b/test/syscalls/linux/sync.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sync_file_range.cc b/test/syscalls/linux/sync_file_range.cc index d11f58481..36cc42043 100644 --- a/test/syscalls/linux/sync_file_range.cc +++ b/test/syscalls/linux/sync_file_range.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sysinfo.cc b/test/syscalls/linux/sysinfo.cc index a0dd82640..1a71256da 100644 --- a/test/syscalls/linux/sysinfo.cc +++ b/test/syscalls/linux/sysinfo.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/syslog.cc b/test/syscalls/linux/syslog.cc index 5bd0d1cc3..9a7407d96 100644 --- a/test/syscalls/linux/syslog.cc +++ b/test/syscalls/linux/syslog.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc index 8e10220eb..819fa655a 100644 --- a/test/syscalls/linux/sysret.cc +++ b/test/syscalls/linux/sysret.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index 33620a874..e3f9f9f9d 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/temp_umask.h b/test/syscalls/linux/temp_umask.h index f202dfa59..81a25440c 100644 --- a/test/syscalls/linux/temp_umask.h +++ b/test/syscalls/linux/temp_umask.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/tgkill.cc b/test/syscalls/linux/tgkill.cc index 2d258ef11..80acae5de 100644 --- a/test/syscalls/linux/tgkill.cc +++ b/test/syscalls/linux/tgkill.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc index 5a3dfd026..c7eead17e 100644 --- a/test/syscalls/linux/time.cc +++ b/test/syscalls/linux/time.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/timerfd.cc b/test/syscalls/linux/timerfd.cc index b85321795..9df53612f 100644 --- a/test/syscalls/linux/timerfd.cc +++ b/test/syscalls/linux/timerfd.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc index 14506eb12..fd42e81e1 100644 --- a/test/syscalls/linux/timers.cc +++ b/test/syscalls/linux/timers.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc index 3e8ce5327..bae377c69 100644 --- a/test/syscalls/linux/tkill.cc +++ b/test/syscalls/linux/tkill.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/truncate.cc b/test/syscalls/linux/truncate.cc index 2616a9147..e5cc5d97c 100644 --- a/test/syscalls/linux/truncate.cc +++ b/test/syscalls/linux/truncate.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/udp_bind.cc b/test/syscalls/linux/udp_bind.cc index 547eb2a6c..6d92bdbeb 100644 --- a/test/syscalls/linux/udp_bind.cc +++ b/test/syscalls/linux/udp_bind.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index f39281d5c..31db8a2ad 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc index d78a09b1e..bf1ca8679 100644 --- a/test/syscalls/linux/uidgid.cc +++ b/test/syscalls/linux/uidgid.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/uname.cc b/test/syscalls/linux/uname.cc index d22a34bd7..0a5d91017 100644 --- a/test/syscalls/linux/uname.cc +++ b/test/syscalls/linux/uname.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/unix_domain_socket_test_util.cc b/test/syscalls/linux/unix_domain_socket_test_util.cc index 2d7a530b9..6f49e3660 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.cc +++ b/test/syscalls/linux/unix_domain_socket_test_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h index 1b09aeae7..aae990245 100644 --- a/test/syscalls/linux/unix_domain_socket_test_util.h +++ b/test/syscalls/linux/unix_domain_socket_test_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/unlink.cc b/test/syscalls/linux/unlink.cc index b10aae025..b6f65e027 100644 --- a/test/syscalls/linux/unlink.cc +++ b/test/syscalls/linux/unlink.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/unshare.cc b/test/syscalls/linux/unshare.cc index 9dd6ec4b6..e32619efe 100644 --- a/test/syscalls/linux/unshare.cc +++ b/test/syscalls/linux/unshare.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc index bf776cd93..80716859a 100644 --- a/test/syscalls/linux/utimes.cc +++ b/test/syscalls/linux/utimes.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/vdso.cc b/test/syscalls/linux/vdso.cc index 0f6e1c7c6..19c80add8 100644 --- a/test/syscalls/linux/vdso.cc +++ b/test/syscalls/linux/vdso.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc index 0e936594b..759a50569 100644 --- a/test/syscalls/linux/vdso_clock_gettime.cc +++ b/test/syscalls/linux/vdso_clock_gettime.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc index 9999a909e..631a53654 100644 --- a/test/syscalls/linux/vfork.cc +++ b/test/syscalls/linux/vfork.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc index cb6840cc6..2c2303358 100644 --- a/test/syscalls/linux/vsyscall.cc +++ b/test/syscalls/linux/vsyscall.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc index fcd606bec..50d0725a7 100644 --- a/test/syscalls/linux/wait.cc +++ b/test/syscalls/linux/wait.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc index 7f80b2fa8..9b219cfd6 100644 --- a/test/syscalls/linux/write.cc +++ b/test/syscalls/linux/write.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go index c4af28103..28f312b8b 100644 --- a/test/syscalls/syscall_test_runner.go +++ b/test/syscalls/syscall_test_runner.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/syscalls/syscall_test_runner.sh b/test/syscalls/syscall_test_runner.sh index 87d62786b..864bb2de4 100755 --- a/test/syscalls/syscall_test_runner.sh +++ b/test/syscalls/syscall_test_runner.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/test/util/capability_util.cc b/test/util/capability_util.cc index d1dd95e76..5d733887b 100644 --- a/test/util/capability_util.cc +++ b/test/util/capability_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/capability_util.h b/test/util/capability_util.h index 8708f5e69..e968a2583 100644 --- a/test/util/capability_util.h +++ b/test/util/capability_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/cleanup.h b/test/util/cleanup.h index fb4724f97..c76482ef4 100644 --- a/test/util/cleanup.h +++ b/test/util/cleanup.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/epoll_util.cc b/test/util/epoll_util.cc index 0b95aa8cd..2e5051468 100644 --- a/test/util/epoll_util.cc +++ b/test/util/epoll_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/epoll_util.h b/test/util/epoll_util.h index 521e7a3d3..f233b37d5 100644 --- a/test/util/epoll_util.h +++ b/test/util/epoll_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/eventfd_util.h b/test/util/eventfd_util.h index 1fdb07d3b..cb9ce829c 100644 --- a/test/util/eventfd_util.h +++ b/test/util/eventfd_util.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/file_descriptor.h b/test/util/file_descriptor.h index be8812d01..fc5caa55b 100644 --- a/test/util/file_descriptor.h +++ b/test/util/file_descriptor.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc index 6bd424417..bc90bd78e 100644 --- a/test/util/fs_util.cc +++ b/test/util/fs_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/fs_util.h b/test/util/fs_util.h index 9412b2f71..eb7cdaa24 100644 --- a/test/util/fs_util.h +++ b/test/util/fs_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/fs_util_test.cc b/test/util/fs_util_test.cc index ce70d58aa..4e12076a1 100644 --- a/test/util/fs_util_test.cc +++ b/test/util/fs_util_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/logging.cc b/test/util/logging.cc index 86ea71df3..cc71d77b0 100644 --- a/test/util/logging.cc +++ b/test/util/logging.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/logging.h b/test/util/logging.h index 6e957b172..589166fab 100644 --- a/test/util/logging.h +++ b/test/util/logging.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/memory_util.h b/test/util/memory_util.h index 8f6e99ba6..8c77778ea 100644 --- a/test/util/memory_util.h +++ b/test/util/memory_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/mount_util.h b/test/util/mount_util.h index 468170646..7782e6bf2 100644 --- a/test/util/mount_util.h +++ b/test/util/mount_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/multiprocess_util.cc b/test/util/multiprocess_util.cc index 12637db8c..95f5f3b4f 100644 --- a/test/util/multiprocess_util.cc +++ b/test/util/multiprocess_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/multiprocess_util.h b/test/util/multiprocess_util.h index ba5f2601f..0aecd3439 100644 --- a/test/util/multiprocess_util.h +++ b/test/util/multiprocess_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/posix_error.cc b/test/util/posix_error.cc index ead9ede16..cebf7e0ac 100644 --- a/test/util/posix_error.cc +++ b/test/util/posix_error.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/posix_error.h b/test/util/posix_error.h index 2a66e2e94..b604f4f8f 100644 --- a/test/util/posix_error.h +++ b/test/util/posix_error.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/posix_error_test.cc b/test/util/posix_error_test.cc index c5427b8e5..d67270842 100644 --- a/test/util/posix_error_test.cc +++ b/test/util/posix_error_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/proc_util.cc b/test/util/proc_util.cc index 2d9eb1986..9d4db37c3 100644 --- a/test/util/proc_util.cc +++ b/test/util/proc_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/proc_util.h b/test/util/proc_util.h index e1ee2db9c..af209a51e 100644 --- a/test/util/proc_util.h +++ b/test/util/proc_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/proc_util_test.cc b/test/util/proc_util_test.cc index 75335415a..71dd2355e 100644 --- a/test/util/proc_util_test.cc +++ b/test/util/proc_util_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/rlimit_util.cc b/test/util/rlimit_util.cc index a9912c372..684253f78 100644 --- a/test/util/rlimit_util.cc +++ b/test/util/rlimit_util.cc @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/rlimit_util.h b/test/util/rlimit_util.h index fa5cc70dc..873252a32 100644 --- a/test/util/rlimit_util.h +++ b/test/util/rlimit_util.h @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/save_util.cc b/test/util/save_util.cc index 5540e2146..05f52b80d 100644 --- a/test/util/save_util.cc +++ b/test/util/save_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/save_util.h b/test/util/save_util.h index 919e4af3d..90460701e 100644 --- a/test/util/save_util.h +++ b/test/util/save_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/signal_util.cc b/test/util/signal_util.cc index 3e2df32a6..26738864f 100644 --- a/test/util/signal_util.cc +++ b/test/util/signal_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/signal_util.h b/test/util/signal_util.h index 80f1808f6..7fd2af015 100644 --- a/test/util/signal_util.h +++ b/test/util/signal_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc index 48ce82d20..c5d8fc635 100644 --- a/test/util/temp_path.cc +++ b/test/util/temp_path.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/temp_path.h b/test/util/temp_path.h index 33eb6a72c..89302e0fd 100644 --- a/test/util/temp_path.h +++ b/test/util/temp_path.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/test_main.cc b/test/util/test_main.cc index 4c6b5e860..5c7ee0064 100644 --- a/test/util/test_main.cc +++ b/test/util/test_main.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 9b7cfa4dc..c52fd9a4a 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/test_util.h b/test/util/test_util.h index 905412b24..8f5eb5089 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/test_util_test.cc b/test/util/test_util_test.cc index 5889651d1..b7300d9e5 100644 --- a/test/util/test_util_test.cc +++ b/test/util/test_util_test.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/thread_util.h b/test/util/thread_util.h index df09ac8cf..860e77531 100644 --- a/test/util/thread_util.h +++ b/test/util/thread_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/test/util/timer_util.cc b/test/util/timer_util.cc index 681fafb69..43a26b0d3 100644 --- a/test/util/timer_util.cc +++ b/test/util/timer_util.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/util/timer_util.h b/test/util/timer_util.h index 9bdc51a57..2cebfa5d1 100644 --- a/test/util/timer_util.h +++ b/test/util/timer_util.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/third_party/gvsync/atomicptr_unsafe.go b/third_party/gvsync/atomicptr_unsafe.go index da9f16240..53a943282 100644 --- a/third_party/gvsync/atomicptr_unsafe.go +++ b/third_party/gvsync/atomicptr_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/atomicptrtest/atomicptr_test.go b/third_party/gvsync/atomicptrtest/atomicptr_test.go index 15d0936d4..8fdc5112e 100644 --- a/third_party/gvsync/atomicptrtest/atomicptr_test.go +++ b/third_party/gvsync/atomicptrtest/atomicptr_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/downgradable_rwmutex_test.go b/third_party/gvsync/downgradable_rwmutex_test.go index 6517dd5dc..40c384b8b 100644 --- a/third_party/gvsync/downgradable_rwmutex_test.go +++ b/third_party/gvsync/downgradable_rwmutex_test.go @@ -1,5 +1,5 @@ // Copyright 2009 The Go Authors. All rights reserved. 
-// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go index 131f0a2ba..4d43eb765 100644 --- a/third_party/gvsync/downgradable_rwmutex_unsafe.go +++ b/third_party/gvsync/downgradable_rwmutex_unsafe.go @@ -1,5 +1,5 @@ // Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/gvsync.go b/third_party/gvsync/gvsync.go index 46a2565fd..3bbef13c3 100644 --- a/third_party/gvsync/gvsync.go +++ b/third_party/gvsync/gvsync.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/memmove_unsafe.go b/third_party/gvsync/memmove_unsafe.go index d483fc739..4c8aa9ab6 100644 --- a/third_party/gvsync/memmove_unsafe.go +++ b/third_party/gvsync/memmove_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/norace_unsafe.go b/third_party/gvsync/norace_unsafe.go index f9c88d13f..e3852db8c 100644 --- a/third_party/gvsync/norace_unsafe.go +++ b/third_party/gvsync/norace_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
diff --git a/third_party/gvsync/race_unsafe.go b/third_party/gvsync/race_unsafe.go index 2cdcdf7f7..13c02a830 100644 --- a/third_party/gvsync/race_unsafe.go +++ b/third_party/gvsync/race_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/seqatomic_unsafe.go b/third_party/gvsync/seqatomic_unsafe.go index ef61503e2..c52d378f1 100644 --- a/third_party/gvsync/seqatomic_unsafe.go +++ b/third_party/gvsync/seqatomic_unsafe.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/seqatomictest/seqatomic_test.go b/third_party/gvsync/seqatomictest/seqatomic_test.go index d0c373bae..2da73cf96 100644 --- a/third_party/gvsync/seqatomictest/seqatomic_test.go +++ b/third_party/gvsync/seqatomictest/seqatomic_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/third_party/gvsync/seqcount.go b/third_party/gvsync/seqcount.go index c7ae91cfa..2c9c2c3d6 100644 --- a/third_party/gvsync/seqcount.go +++ b/third_party/gvsync/seqcount.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/third_party/gvsync/seqcount_test.go b/third_party/gvsync/seqcount_test.go index ee6579ed8..085e574b3 100644 --- a/third_party/gvsync/seqcount_test.go +++ b/third_party/gvsync/seqcount_test.go @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC +// Copyright 2019 The gVisor Authors. 
// // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go index eaf5c4970..ca414d8cb 100644 --- a/tools/go_generics/generics.go +++ b/tools/go_generics/generics.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_stmts/input.go b/tools/go_generics/generics_tests/all_stmts/input.go index 19184a3fe..4791d1ff1 100644 --- a/tools/go_generics/generics_tests/all_stmts/input.go +++ b/tools/go_generics/generics_tests/all_stmts/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_stmts/output/output.go b/tools/go_generics/generics_tests/all_stmts/output/output.go index 51582346c..a53d84535 100644 --- a/tools/go_generics/generics_tests/all_stmts/output/output.go +++ b/tools/go_generics/generics_tests/all_stmts/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/input.go b/tools/go_generics/generics_tests/all_types/input.go index ed6e97c29..3575d02ec 100644 --- a/tools/go_generics/generics_tests/all_types/input.go +++ b/tools/go_generics/generics_tests/all_types/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/lib/lib.go b/tools/go_generics/generics_tests/all_types/lib/lib.go index 7e73e678e..988786496 100644 --- a/tools/go_generics/generics_tests/all_types/lib/lib.go +++ b/tools/go_generics/generics_tests/all_types/lib/lib.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/all_types/output/output.go b/tools/go_generics/generics_tests/all_types/output/output.go index ec09a6be4..41fd147a1 100644 --- a/tools/go_generics/generics_tests/all_types/output/output.go +++ b/tools/go_generics/generics_tests/all_types/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/consts/input.go b/tools/go_generics/generics_tests/consts/input.go index 394bcc262..04b95fcc6 100644 --- a/tools/go_generics/generics_tests/consts/input.go +++ b/tools/go_generics/generics_tests/consts/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/tools/go_generics/generics_tests/consts/output/output.go b/tools/go_generics/generics_tests/consts/output/output.go index 91a07fdc2..18d316cc9 100644 --- a/tools/go_generics/generics_tests/consts/output/output.go +++ b/tools/go_generics/generics_tests/consts/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/imports/input.go b/tools/go_generics/generics_tests/imports/input.go index 22e6641a6..0f032c2a1 100644 --- a/tools/go_generics/generics_tests/imports/input.go +++ b/tools/go_generics/generics_tests/imports/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/imports/output/output.go b/tools/go_generics/generics_tests/imports/output/output.go index 2555c0004..2488ca58c 100644 --- a/tools/go_generics/generics_tests/imports/output/output.go +++ b/tools/go_generics/generics_tests/imports/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/remove_typedef/input.go b/tools/go_generics/generics_tests/remove_typedef/input.go index d9c9b8530..cf632bae7 100644 --- a/tools/go_generics/generics_tests/remove_typedef/input.go +++ b/tools/go_generics/generics_tests/remove_typedef/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/remove_typedef/output/output.go b/tools/go_generics/generics_tests/remove_typedef/output/output.go index f111a9426..d44fd8e1c 100644 --- a/tools/go_generics/generics_tests/remove_typedef/output/output.go +++ b/tools/go_generics/generics_tests/remove_typedef/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/simple/input.go b/tools/go_generics/generics_tests/simple/input.go index 711687cf5..2a917f16c 100644 --- a/tools/go_generics/generics_tests/simple/input.go +++ b/tools/go_generics/generics_tests/simple/input.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/generics_tests/simple/output/output.go b/tools/go_generics/generics_tests/simple/output/output.go index 139c9bf9d..6bfa0b25b 100644 --- a/tools/go_generics/generics_tests/simple/output/output.go +++ b/tools/go_generics/generics_tests/simple/output/output.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/globals/globals_visitor.go b/tools/go_generics/globals/globals_visitor.go index daaa17b1d..7ae48c662 100644 --- a/tools/go_generics/globals/globals_visitor.go +++ b/tools/go_generics/globals/globals_visitor.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/globals/scope.go b/tools/go_generics/globals/scope.go index b75a91689..96c965ea2 100644 --- a/tools/go_generics/globals/scope.go +++ b/tools/go_generics/globals/scope.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/go_generics_unittest.sh b/tools/go_generics/go_generics_unittest.sh index e7553a071..44b22db91 100755 --- a/tools/go_generics/go_generics_unittest.sh +++ b/tools/go_generics/go_generics_unittest.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/go_generics/go_merge/main.go b/tools/go_generics/go_merge/main.go index 2f83facf8..f6a331123 100644 --- a/tools/go_generics/go_merge/main.go +++ b/tools/go_generics/go_merge/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/imports.go b/tools/go_generics/imports.go index 57f7c3dce..3a7230c97 100644 --- a/tools/go_generics/imports.go +++ b/tools/go_generics/imports.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/tools/go_generics/remove.go b/tools/go_generics/remove.go index 139d03955..568a6bbd3 100644 --- a/tools/go_generics/remove.go +++ b/tools/go_generics/remove.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/rules_tests/template.go b/tools/go_generics/rules_tests/template.go index f3f31ae8e..aace61da1 100644 --- a/tools/go_generics/rules_tests/template.go +++ b/tools/go_generics/rules_tests/template.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_generics/rules_tests/template_test.go b/tools/go_generics/rules_tests/template_test.go index 3a38c8629..b2a3446ef 100644 --- a/tools/go_generics/rules_tests/template_test.go +++ b/tools/go_generics/rules_tests/template_test.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go index 9e2c8e106..db7a7107b 100644 --- a/tools/go_stateify/main.go +++ b/tools/go_stateify/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tools/tag_release.sh b/tools/tag_release.sh index 6906a952f..02a49cdf1 100755 --- a/tools/tag_release.sh +++ b/tools/tag_release.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2019 Google LLC +# Copyright 2019 The gVisor Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh index a0e646e45..64a905fc9 100755 --- a/tools/workspace_status.sh +++ b/tools/workspace_status.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vdso/barrier.h b/vdso/barrier.h index 5b6c763f6..edba4afb5 100644 --- a/vdso/barrier.h +++ b/vdso/barrier.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/check_vdso.py b/vdso/check_vdso.py index 6f7d7e7ec..e41b09709 100644 --- a/vdso/check_vdso.py +++ b/vdso/check_vdso.py @@ -1,4 +1,4 @@ -# Copyright 2018 Google LLC +# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vdso/compiler.h b/vdso/compiler.h index d65f148fb..54a510000 100644 --- a/vdso/compiler.h +++ b/vdso/compiler.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/cycle_clock.h b/vdso/cycle_clock.h index 309e07a3f..5d3fbb257 100644 --- a/vdso/cycle_clock.h +++ b/vdso/cycle_clock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/vdso/seqlock.h b/vdso/seqlock.h index ab2f3fda3..7a173174b 100644 --- a/vdso/seqlock.h +++ b/vdso/seqlock.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/syscalls.h b/vdso/syscalls.h index 90fb424ce..f5865bb72 100644 --- a/vdso/syscalls.h +++ b/vdso/syscalls.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso.cc b/vdso/vdso.cc index 550729035..6265ad217 100644 --- a/vdso/vdso.cc +++ b/vdso/vdso.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso_time.cc b/vdso/vdso_time.cc index 9fc262f60..1bb4bb86b 100644 --- a/vdso/vdso_time.cc +++ b/vdso/vdso_time.cc @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/vdso/vdso_time.h b/vdso/vdso_time.h index 464dadff2..70d079efc 100644 --- a/vdso/vdso_time.h +++ b/vdso/vdso_time.h @@ -1,4 +1,4 @@ -// Copyright 2018 Google LLC +// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. -- cgit v1.2.3 From 81ecd8b6eab7457b331762626f8c210fec3504e6 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Mon, 29 Apr 2019 21:20:05 -0700 Subject: Implement the MSG_CTRUNC msghdr flag for Unix sockets. 
Updates google/gvisor#206 PiperOrigin-RevId: 245880573 Change-Id: Ifa715e98d47f64b8a32b04ae9378d6cd6bd4025e --- pkg/sentry/fs/host/control.go | 7 ++++-- pkg/sentry/fs/host/socket.go | 16 ++++++------ pkg/sentry/fs/host/socket_test.go | 2 +- pkg/sentry/fs/host/socket_unsafe.go | 12 +++++---- pkg/sentry/socket/control/control.go | 42 ++++++++++++++++++++------------ pkg/sentry/socket/socket.go | 5 ++++ pkg/sentry/socket/unix/io.go | 7 +++++- pkg/sentry/socket/unix/transport/unix.go | 35 +++++++++++++++----------- pkg/sentry/socket/unix/unix.go | 13 ++++++++++ pkg/sentry/syscalls/linux/sys_socket.go | 9 ++++--- test/syscalls/linux/socket_unix.cc | 18 -------------- 11 files changed, 99 insertions(+), 67 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 480f0c8f4..9ebb9bbb3 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -32,17 +32,20 @@ func newSCMRights(fds []int) control.SCMRights { } // Files implements control.SCMRights.Files. -func (c *scmRights) Files(ctx context.Context, max int) control.RightsFiles { +func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) { n := max + var trunc bool if l := len(c.fds); n > l { n = l + } else if n < l { + trunc = true } rf := control.RightsFiles(fdsToFiles(ctx, c.fds[:n])) // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). c.fds = c.fds[len(rf):] - return rf + return rf, trunc } // Clone implements transport.RightsControlMessage.Clone. diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 3034e9441..3ed137006 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -282,11 +282,11 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. 
-func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() if c.readClosed { - return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive } var cm unet.ControlMessage @@ -296,7 +296,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p // N.B. Unix sockets don't have a receive buffer, the send buffer // serves both purposes. - rl, ml, cl, err := fdReadVec(c.file.FD(), data, []byte(cm), peek, c.sndbuf) + rl, ml, cl, cTrunc, err := fdReadVec(c.file.FD(), data, []byte(cm), peek, c.sndbuf) if rl > 0 && err != nil { // We got some data, so all we need to do on error is return // the data that we got. Short reads are fine, no need to @@ -304,7 +304,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p err = nil } if err != nil { - return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, syserr.FromError(err) + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err) } // There is no need for the callee to call RecvNotify because fdReadVec uses @@ -317,18 +317,18 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, p // Avoid extra allocations in the case where there isn't any control data. 
if len(cm) == 0 { - return rl, ml, transport.ControlMessages{}, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil } fds, err := cm.ExtractFDs() if err != nil { - return 0, 0, transport.ControlMessages{}, tcpip.FullAddress{}, false, syserr.FromError(err) + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err) } if len(fds) == 0 { - return rl, ml, transport.ControlMessages{}, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil } - return rl, ml, control.New(nil, nil, newSCMRights(fds)), tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil + return rl, ml, control.New(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, nil } // close releases all resources related to the endpoint. diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index cc760a7e1..06392a65a 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -207,7 +207,7 @@ func TestSend(t *testing.T) { func TestRecv(t *testing.T) { e := ConnectedEndpoint{readClosed: true} - if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != syserr.ErrClosedForReceive { + if _, _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != syserr.ErrClosedForReceive { t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, syserr.ErrClosedForReceive) } } diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index 8873705c0..e57be0506 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -23,7 +23,7 @@ import ( // // If the total length of bufs is > maxlen, fdReadVec will do a partial read // and err will indicate why the message was truncated. 
-func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) { +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, controlTrunc bool, err error) { flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) if peek { flags |= syscall.MSG_PEEK @@ -34,7 +34,7 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (re length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true) if err != nil && len(iovecs) == 0 { // No partial write to do, return error immediately. - return 0, 0, 0, err + return 0, 0, 0, false, err } var msg syscall.Msghdr @@ -51,7 +51,7 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (re n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) if e != 0 { // N.B. prioritize the syscall error over the buildIovec error. - return 0, 0, 0, e + return 0, 0, 0, false, e } // Copy data back to bufs. @@ -59,11 +59,13 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (re copyToMulti(bufs, intermediate) } + controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC + if n > length { - return length, n, msg.Controllen, err + return length, n, msg.Controllen, controlTrunc, err } - return n, n, msg.Controllen, err + return n, n, msg.Controllen, controlTrunc, err } // fdWriteVec sends from bufs to fd. diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index abda364c9..c0238691d 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -45,7 +45,10 @@ type SCMRights interface { transport.RightsControlMessage // Files returns up to max RightsFiles. - Files(ctx context.Context, max int) RightsFiles + // + // Returned files are consumed and ownership is transferred to the caller. 
+ // Subsequent calls to Files will return the next files. + Files(ctx context.Context, max int) (rf RightsFiles, truncated bool) } // RightsFiles represents a SCM_RIGHTS socket control message. A reference is @@ -71,14 +74,17 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { } // Files implements SCMRights.Files. -func (fs *RightsFiles) Files(ctx context.Context, max int) RightsFiles { +func (fs *RightsFiles) Files(ctx context.Context, max int) (RightsFiles, bool) { n := max + var trunc bool if l := len(*fs); n > l { n = l + } else if n < l { + trunc = true } rf := (*fs)[:n] *fs = (*fs)[n:] - return rf + return rf, trunc } // Clone implements transport.RightsControlMessage.Clone. @@ -99,8 +105,8 @@ func (fs *RightsFiles) Release() { } // rightsFDs gets up to the specified maximum number of FDs. -func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) []int32 { - files := rights.Files(t, max) +func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32, bool) { + files, trunc := rights.Files(t, max) fds := make([]int32, 0, len(files)) for i := 0; i < max && len(files) > 0; i++ { fd, err := t.FDMap().NewFDFrom(0, files[0], kernel.FDFlags{cloexec}, t.ThreadGroup().Limits()) @@ -114,19 +120,23 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) []int32 fds = append(fds, int32(fd)) } - return fds + return fds, trunc } // PackRights packs as many FDs as will fit into the unused capacity of buf. -func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte) []byte { +func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flags int) ([]byte, int) { maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 // Linux does not return any FDs if none fit. 
if maxFDs <= 0 { - return buf + flags |= linux.MSG_CTRUNC + return buf, flags + } + fds, trunc := rightsFDs(t, rights, cloexec, maxFDs) + if trunc { + flags |= linux.MSG_CTRUNC } - fds := rightsFDs(t, rights, cloexec, maxFDs) align := t.Arch().Width() - return putCmsg(buf, linux.SCM_RIGHTS, align, fds) + return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) } // scmCredentials represents an SCM_CREDENTIALS socket control message. @@ -176,7 +186,7 @@ func putUint32(buf []byte, n uint32) []byte { // putCmsg writes a control message header and as much data as will fit into // the unused capacity of a buffer. -func putCmsg(buf []byte, msgType uint32, align uint, data []int32) []byte { +func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) { space := AlignDown(cap(buf)-len(buf), 4) // We can't write to space that doesn't exist, so if we are going to align @@ -193,7 +203,8 @@ func putCmsg(buf []byte, msgType uint32, align uint, data []int32) []byte { // a partial int32, so the length of the message will be // min(aligned length, header + datas). if space < linux.SizeOfControlMessageHeader { - return buf + flags |= linux.MSG_CTRUNC + return buf, flags } length := 4*len(data) + linux.SizeOfControlMessageHeader @@ -205,11 +216,12 @@ func putCmsg(buf []byte, msgType uint32, align uint, data []int32) []byte { buf = putUint32(buf, msgType) for _, d := range data { if len(buf)+4 > cap(buf) { + flags |= linux.MSG_CTRUNC break } buf = putUint32(buf, uint32(d)) } - return alignSlice(buf, align) + return alignSlice(buf, align), flags } func putCmsgStruct(buf []byte, msgType uint32, align uint, data interface{}) []byte { @@ -253,7 +265,7 @@ func (c *scmCredentials) Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, // PackCredentials packs the credentials in the control message (or default // credentials if none) into a buffer. 
-func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte) []byte { +func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int) ([]byte, int) { align := t.Arch().Width() // Default credentials if none are available. @@ -265,7 +277,7 @@ func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte) []byte { pid, uid, gid = creds.Credentials(t) } c := []int32{int32(pid), int32(uid), int32(gid)} - return putCmsg(buf, linux.SCM_CREDENTIALS, align, c) + return putCmsg(buf, flags, linux.SCM_CREDENTIALS, align, c) } // AlignUp rounds a length up to an alignment. align must be a power of 2. diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 7e840b452..9393acd28 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -87,6 +87,11 @@ type Socket interface { // senderAddrLen is the address length to be returned to the application, // not necessarily the actual length of the address. // + // flags control how RecvMsg should be completed. msgFlags indicate how + // the RecvMsg call was completed. Note that control message truncation + // may still be required even if the MSG_CTRUNC bit is not set in + // msgFlags. In that case, the caller should set MSG_CTRUNC appropriately. + // // If err != nil, the recv was not successful. RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error) diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 382911d51..5a1475ec2 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -72,13 +72,18 @@ type EndpointReader struct { // Control contains the received control messages. 
Control transport.ControlMessages + + // ControlTrunc indicates that SCM_RIGHTS FDs were discarded based on + // the value of NumRights. + ControlTrunc bool } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) { - n, ms, c, err := r.Endpoint.RecvMsg(bufs, r.Creds, r.NumRights, r.Peek, r.From) + n, ms, c, ct, err := r.Endpoint.RecvMsg(bufs, r.Creds, r.NumRights, r.Peek, r.From) r.Control = c + r.ControlTrunc = ct r.MsgSize = ms if err != nil { return int64(n), err.ToError() diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index d5f7f7aa8..b734b4c20 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -130,7 +130,11 @@ type Endpoint interface { // // msgLen is the length of the read message consumed for datagram Endpoints. // msgLen is always the same as recvLen for stream Endpoints. - RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen uintptr, cm ControlMessages, err *syserr.Error) + // + // CMTruncated indicates that the numRights hint was used to receive fewer + // than the total available SCM_RIGHTS FDs. Additional truncation may be + // required by the caller. + RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen uintptr, cm ControlMessages, CMTruncated bool, err *syserr.Error) // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. @@ -288,7 +292,7 @@ type Receiver interface { // See Endpoint.RecvMsg for documentation on shared arguments. // // notify indicates if RecvNotify should be called. 
- Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (recvLen, msgLen uintptr, cm ControlMessages, source tcpip.FullAddress, notify bool, err *syserr.Error) + Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (recvLen, msgLen uintptr, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) // RecvNotify notifies the Receiver of a successful Recv. This must not be // called while holding any endpoint locks. @@ -328,7 +332,7 @@ type queueReceiver struct { } // Recv implements Receiver.Recv. -func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *syserr.Error) { +func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { var m *message var notify bool var err *syserr.Error @@ -338,7 +342,7 @@ func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek m, notify, err = q.readQueue.Dequeue() } if err != nil { - return 0, 0, ControlMessages{}, tcpip.FullAddress{}, false, err + return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err } src := []byte(m.Data) var copied uintptr @@ -347,7 +351,7 @@ func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek copied += uintptr(n) src = src[n:] } - return copied, uintptr(len(m.Data)), m.Control, m.Address, notify, nil + return copied, uintptr(len(m.Data)), m.Control, false, m.Address, notify, nil } // RecvNotify implements Receiver.RecvNotify. @@ -440,7 +444,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { } // Recv implements Receiver.Recv. 
-func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, tcpip.FullAddress, bool, *syserr.Error) { +func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() @@ -453,7 +457,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint // the next time Recv() is called. m, n, err := q.readQueue.Dequeue() if err != nil { - return 0, 0, ControlMessages{}, tcpip.FullAddress{}, false, err + return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err } notify = n q.buffer = []byte(m.Data) @@ -469,7 +473,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint // Don't consume data since we are peeking. copied, data, _ = vecCopy(data, q.buffer) - return copied, copied, c, q.addr, notify, nil + return copied, copied, c, false, q.addr, notify, nil } // Consume data and control message since we are not peeking. @@ -484,9 +488,11 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint c.Credentials = nil } + var cmTruncated bool if c.Rights != nil && numRights == 0 { c.Rights.Release() c.Rights = nil + cmTruncated = true } haveRights := c.Rights != nil @@ -538,6 +544,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint if q.control.Rights != nil { // Consume rights. if numRights == 0 { + cmTruncated = true q.control.Rights.Release() } else { c.Rights = q.control.Rights @@ -546,7 +553,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint q.control.Rights = nil } } - return copied, copied, c, q.addr, notify, nil + return copied, copied, c, cmTruncated, q.addr, notify, nil } // A ConnectedEndpoint is an Endpoint that can be used to send Messages. 
@@ -775,18 +782,18 @@ func (e *baseEndpoint) Connected() bool { } // RecvMsg reads data and a control message from the endpoint. -func (e *baseEndpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, ControlMessages, *syserr.Error) { +func (e *baseEndpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, ControlMessages, bool, *syserr.Error) { e.Lock() if e.receiver == nil { e.Unlock() - return 0, 0, ControlMessages{}, syserr.ErrNotConnected + return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected } - recvLen, msgLen, cms, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) + recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) e.Unlock() if err != nil { - return 0, 0, ControlMessages{}, err + return 0, 0, ControlMessages{}, false, err } if notify { @@ -796,7 +803,7 @@ func (e *baseEndpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, pee if addr != nil { *addr = a } - return recvLen, msgLen, cms, nil + return recvLen, msgLen, cms, cmt, nil } // SendMsg writes data and a control message to the endpoint's peer. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index e9607aa01..26788ec31 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -490,6 +490,9 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if s.Passcred() { // Credentials take priority if they are enabled and there is space. 
wantCreds = rightsLen > 0 + if !wantCreds { + msgFlags |= linux.MSG_CTRUNC + } credLen := syscall.CmsgSpace(syscall.SizeofUcred) rightsLen -= credLen } @@ -516,6 +519,10 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) } + if r.ControlTrunc { + msgFlags |= linux.MSG_CTRUNC + } + if err != nil || dontWait || !waitAll || s.isPacket || n >= dst.NumBytes() { if s.isPacket && n < int64(r.MsgSize) { msgFlags |= linux.MSG_TRUNC @@ -546,12 +553,18 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if r.From != nil { from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) } + + if r.ControlTrunc { + msgFlags |= linux.MSG_CTRUNC + } + if trunc { // n and r.MsgSize are the same for streams. total += int64(r.MsgSize) } else { total += n } + if err != nil || !waitAll || s.isPacket || n >= dst.NumBytes() { if total > 0 { err = nil diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 69862f110..8f4dbf3bc 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -746,7 +746,10 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if err != nil { return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) } - cms.Unix.Release() + if !cms.Unix.Empty() { + mflags |= linux.MSG_CTRUNC + cms.Unix.Release() + } if int(msg.Flags) != mflags { // Copy out the flags to the caller. 
@@ -771,7 +774,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { creds, _ := cms.Unix.Credentials.(control.SCMCredentials) - controlData = control.PackCredentials(t, creds, controlData) + controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) } if cms.IP.HasTimestamp { @@ -779,7 +782,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if cms.Unix.Rights != nil { - controlData = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData) + controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) } // Copy the address to the caller. diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index bb3397fa2..09a1c1c6e 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -186,9 +186,6 @@ TEST_P(UnixSocketPairTest, BasicFDPassNoSpace) { // BasicFDPassNoSpaceMsgCtrunc sends an FD, but does not provide any space to // receive it. It then verifies that the MSG_CTRUNC flag is set in the msghdr. TEST_P(UnixSocketPairTest, BasicFDPassNoSpaceMsgCtrunc) { - // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. - SKIP_IF(IsRunningOnGvisor()); - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[20]; @@ -259,9 +256,6 @@ TEST_P(UnixSocketPairTest, BasicFDPassNullControlMsgCtrunc) { // space to receive it. It then verifies that the MSG_CTRUNC flag is set in the // msghdr. TEST_P(UnixSocketPairTest, BasicFDPassNotEnoughSpaceMsgCtrunc) { - // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. - SKIP_IF(IsRunningOnGvisor()); - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[20]; @@ -296,9 +290,6 @@ TEST_P(UnixSocketPairTest, BasicFDPassNotEnoughSpaceMsgCtrunc) { // space to receive two of them. 
It then verifies that the MSG_CTRUNC flag is // set in the msghdr. TEST_P(UnixSocketPairTest, BasicThreeFDPassTruncationMsgCtrunc) { - // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. - SKIP_IF(IsRunningOnGvisor()); - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[20]; @@ -408,9 +399,6 @@ TEST_P(UnixSocketPairTest, BasicFDPassUnalignedRecvNoMsgTrunc) { // provides enough space to receive one of them. It then verifies that the // MSG_CTRUNC flag is set in the msghdr. TEST_P(UnixSocketPairTest, BasicTwoFDPassUnalignedRecvTruncationMsgTrunc) { - // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. - SKIP_IF(IsRunningOnGvisor()); - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[20]; @@ -1010,9 +998,6 @@ TEST_P(UnixSocketPairTest, CredPassNoMsgCtrunc) { // the data without providing space for any credentials and verifies that // MSG_CTRUNC is set in the msghdr. TEST_P(UnixSocketPairTest, CredPassNoSpaceMsgCtrunc) { - // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. - SKIP_IF(IsRunningOnGvisor()); - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[20]; @@ -1061,9 +1046,6 @@ TEST_P(UnixSocketPairTest, CredPassNoSpaceMsgCtrunc) { // the data while providing enough space for only the first field of the // credentials and verifies that MSG_CTRUNC is set in the msghdr. TEST_P(UnixSocketPairTest, CredPassTruncatedMsgCtrunc) { - // FIXME(gvisor.dev/issue/206): Support MSG_CTRUNC. - SKIP_IF(IsRunningOnGvisor()); - auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); char sent_data[20]; -- cgit v1.2.3 From 8bfb83d0acdea553082b897d3fd0ad1c1580eaa9 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 30 Apr 2019 13:55:41 -0700 Subject: Implement async MemoryFile eviction, and use it in CachingInodeOperations. This feature allows MemoryFile to delay eviction of "optional" allocations, such as unused cached file pages. 
Note that this incidentally makes CachingInodeOperations writeback asynchronous, in the sense that it doesn't occur until eviction; this is necessary because between when a cached page becomes evictable and when it's evicted, file writes (via CachingInodeOperations.Write) may dirty the page. As currently implemented, this feature won't meaningfully impact steady-state memory usage or caching; the reclaimer goroutine will schedule eviction as soon as it runs out of other work to do. Future CLs increase caching by adding constraints on when eviction is scheduled. PiperOrigin-RevId: 246014822 Change-Id: Ia85feb25a2de92a48359eb84434b6ec6f9bea2cb --- pkg/sentry/context/contexttest/contexttest.go | 2 +- pkg/sentry/fs/fsutil/dirty_set.go | 22 ++ pkg/sentry/fs/fsutil/inode_cached.go | 78 +++- pkg/sentry/fs/fsutil/inode_cached_test.go | 8 +- pkg/sentry/kernel/kernel.go | 5 + pkg/sentry/pgalloc/BUILD | 27 ++ pkg/sentry/pgalloc/pgalloc.go | 504 +++++++++++++++++++------- pkg/sentry/pgalloc/save_restore.go | 14 + runsc/boot/loader.go | 2 +- 9 files changed, 511 insertions(+), 151 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index a42038711..210a235d2 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -44,7 +44,7 @@ func Context(tb testing.TB) context.Context { tb.Fatalf("error creating application memory file: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 9cd196d7d..f1451d77a 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -107,6 +107,7 @@ func (ds *DirtySet) setDirty(mr 
memmap.MappableRange, keep bool) { var changedAny bool defer func() { if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. ds.MergeRange(mr) } }() @@ -132,6 +133,26 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { } } +// AllowClean allows MarkClean to mark offsets in mr as not dirty, ending the +// effect of a previous call to KeepDirty. (It does not itself mark those +// offsets as not dirty.) +func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { + var changedAny bool + defer func() { + if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. + ds.MergeRange(mr) + } + }() + for seg := ds.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { + if seg.Value().Keep { + changedAny = true + seg = ds.Isolate(seg, mr) + seg.ValuePtr().Keep = false + } + } +} + // SyncDirty passes pages in the range mr that are stored in cache and // identified as dirty to writeAt, updating dirty to reflect successful writes. // If writeAt returns a successful partial write, SyncDirty will call it @@ -142,6 +163,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet var changedDirty bool defer func() { if changedDirty { + // Merge segments split by Isolate to reduce cost of iteration. dirty.MergeRange(mr) } }() diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 919d2534c..76644e69d 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -175,11 +175,22 @@ func (c *CachingInodeOperations) Release() { defer c.mapsMu.Unlock() c.dataMu.Lock() defer c.dataMu.Unlock() - // The cache should be empty (something has gone terribly wrong if we're - // releasing an inode that is still memory-mapped). 
- if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() { - panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty)) + + // Something has gone terribly wrong if we're releasing an inode that is + // still memory-mapped. + if !c.mappings.IsEmpty() { + panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings)) + } + + // Drop any cached pages that are still awaiting MemoryFile eviction. (This + // means that MemoryFile no longer needs to evict them.) + mf := c.mfp.MemoryFile() + mf.MarkAllUnevictable(c) + if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + panic(fmt.Sprintf("Failed to writeback cached data: %v", err)) } + c.cache.DropAll(mf) + c.dirty.RemoveAll() } // UnstableAttr implements fs.InodeOperations.UnstableAttr. @@ -679,6 +690,13 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error return done, nil } +// useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O +// and memory mappings, and false if c.cache may contain data cached from +// c.backingFile. +func (c *CachingInodeOperations) useHostPageCache() bool { + return !c.forcePageCache && c.backingFile.FD() >= 0 +} + // AddMapping implements memmap.Mappable.AddMapping. func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { // Hot path. Avoid defers. @@ -689,7 +707,15 @@ func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.Mappi for _, r := range mapped { c.hostFileMapper.IncRefOn(r) } - if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 { + if !c.useHostPageCache() { + // c.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. 
+ mf := c.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End}) + } + } + if c.useHostPageCache() && !usage.IncrementalMappedAccounting { for _, r := range mapped { usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) } @@ -706,7 +732,7 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma for _, r := range unmapped { c.hostFileMapper.DecRefOn(r) } - if !c.forcePageCache && c.backingFile.FD() >= 0 { + if c.useHostPageCache() { if !usage.IncrementalMappedAccounting { for _, r := range unmapped { usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) @@ -716,17 +742,16 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma return } - // Writeback dirty mapped memory now that there are no longer any - // mappings that reference it. This is our naive memory eviction - // strategy. + // Pages that are no longer referenced by any application memory mappings + // are now considered unused; allow MemoryFile to evict them when + // necessary. mf := c.mfp.MemoryFile() c.dataMu.Lock() for _, r := range unmapped { - if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { - log.Warningf("Failed to writeback cached data %v: %v", r, err) - } - c.cache.Drop(r, mf) - c.dirty.KeepClean(r) + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + c.dirty.AllowClean(r) + mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End}) } c.dataMu.Unlock() c.mapsMu.Unlock() @@ -740,7 +765,7 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp // Translate implements memmap.Mappable.Translate. func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { // Hot path. Avoid defer. 
- if !c.forcePageCache && c.backingFile.FD() >= 0 { + if c.useHostPageCache() { return []memmap.Translation{ { Source: optional, @@ -853,6 +878,29 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error return nil } +// Evict implements pgalloc.EvictableMemoryUser.Evict. +func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := c.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + c.cache.Drop(mgapMR, mf) + c.dirty.KeepClean(mgapMR) + } +} + // IncRef implements platform.File.IncRef. This is used when we directly map an // underlying host fd and CachingInodeOperations is used as the platform.File // during translation. diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 661ec41f6..3f10efc12 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -311,12 +311,10 @@ func TestRead(t *testing.T) { t.Errorf("Read back bytes %v, want %v", rbuf, buf) } - // Delete the memory mapping and expect it to cause the cached page to be - // uncached. + // Delete the memory mapping before iops.Release(). The cached page will + // either be evicted by ctx's pgalloc.MemoryFile, or dropped by + // iops.Release(). 
iops.RemoveMapping(ctx, ms, ar, usermem.PageSize, true) - if cached := iops.cache.Span(); cached != 0 { - t.Fatalf("Span got %d, want 0", cached) - } } func TestWrite(t *testing.T) { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 0468dd678..91889b573 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -303,7 +303,12 @@ func (k *Kernel) SaveTo(w io.Writer) error { k.pauseTimeLocked() defer k.resumeTimeLocked() + // Evict all evictable MemoryFile allocations. + k.mf.FlushEvictions() + // Flush write operations on open files so data reaches backing storage. + // This must come after k.mf.FlushEvictions() since eviction may cause file + // writes. if err := k.tasks.flushWritesToFiles(ctx); err != nil { return err } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 7efa55c20..8a8a0e4e4 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -3,6 +3,31 @@ package(licenses = ["notice"]) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") +go_template_instance( + name = "evictable_range", + out = "evictable_range.go", + package = "pgalloc", + prefix = "Evictable", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "evictable_range_set", + out = "evictable_range_set.go", + package = "pgalloc", + prefix = "evictableRange", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "EvictableRange", + "Value": "evictableRangeSetValue", + "Functions": "evictableRangeSetFunctions", + }, +) + go_template_instance( name = "usage_set", out = "usage_set.go", @@ -27,6 +52,8 @@ go_library( name = "pgalloc", srcs = [ "context.go", + "evictable_range.go", + "evictable_range_set.go", "pgalloc.go", "pgalloc_unsafe.go", "save_restore.go", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 411dafa07..9c1313f6f 
100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -31,6 +31,7 @@ import ( "time" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -41,6 +42,9 @@ import ( // MemoryFile is a platform.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { + // opts holds options passed to NewMemoryFile. opts is immutable. + opts MemoryFileOpts + // MemoryFile owns a single backing file, which is modeled as follows: // // Each page in the file can be committed or uncommitted. A page is @@ -115,6 +119,24 @@ type MemoryFile struct { // fileSize is protected by mu. fileSize int64 + // Pages from the backing file are mapped into the local address space on + // the granularity of large pieces called chunks. mappings is a []uintptr + // that stores, for each chunk, the start address of a mapping of that + // chunk in the current process' address space, or 0 if no such mapping + // exists. Once a chunk is mapped, it is never remapped or unmapped until + // the MemoryFile is destroyed. + // + // Mutating the mappings slice or its contents requires both holding + // mappingsMu and using atomic memory operations. (The slice is mutated + // whenever the file is expanded. Per the above, the only permitted + // mutation of the slice's contents is the assignment of a mapping to a + // chunk that was previously unmapped.) Reading the slice or its contents + // only requires *either* holding mappingsMu or using atomic memory + // operations. This allows MemoryFile.MapInternal to avoid locking in the + // common case where chunk mappings already exist. + mappingsMu sync.Mutex + mappings atomic.Value + // destroyed is set by Destroy to instruct the reclaimer goroutine to // release resources and exit. destroyed is protected by mu. 
destroyed bool @@ -133,26 +155,44 @@ type MemoryFile struct { // transitions from false to true. reclaimCond sync.Cond - // Pages from the backing file are mapped into the local address space on - // the granularity of large pieces called chunks. mappings is a []uintptr - // that stores, for each chunk, the start address of a mapping of that - // chunk in the current process' address space, or 0 if no such mapping - // exists. Once a chunk is mapped, it is never remapped or unmapped until - // the MemoryFile is destroyed. + // evictable maps EvictableMemoryUsers to eviction state. // - // Mutating the mappings slice or its contents requires both holding - // mappingsMu and using atomic memory operations. (The slice is mutated - // whenever the file is expanded. Per the above, the only permitted - // mutation of the slice's contents is the assignment of a mapping to a - // chunk that was previously unmapped.) Reading the slice or its contents - // only requires *either* holding mappingsMu or using atomic memory - // operations. This allows MemoryFile.MapInternal to avoid locking in the - // common case where chunk mappings already exist. - mappingsMu sync.Mutex - mappings atomic.Value + // evictable is protected by mu. + evictable map[EvictableMemoryUser]*evictableMemoryUserInfo + + // evictionWG counts the number of goroutines currently performing evictions. + evictionWG sync.WaitGroup +} + +// MemoryFileOpts provides options to NewMemoryFile. +type MemoryFileOpts struct { + // DelayedEviction controls the extent to which the MemoryFile may delay + // eviction of evictable allocations. + DelayedEviction DelayedEvictionType } -// usage tracks usage information. +// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. +type DelayedEvictionType int + +const ( + // DelayedEvictionDefault has unspecified behavior. 
+ DelayedEvictionDefault DelayedEvictionType = iota + + // DelayedEvictionDisabled requires that evictable allocations are evicted + // as soon as possible. + DelayedEvictionDisabled + + // DelayedEvictionEnabled requests that the MemoryFile delay eviction of + // evictable allocations until doing so is considered necessary to avoid + // performance degradation due to host memory pressure, or OOM kills. + // + // As of this writing, DelayedEvictionEnabled delays evictions until the + // reclaimer goroutine is out of work (pages to reclaim), then evicts all + // pending evictable allocations immediately. + DelayedEvictionEnabled +) + +// usageInfo tracks usage information. // // +stateify savable type usageInfo struct { @@ -166,6 +206,46 @@ type usageInfo struct { refs uint64 } +// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that +// may be asked to deallocate that memory in the presence of memory pressure. +type EvictableMemoryUser interface { + // Evict requests that the EvictableMemoryUser deallocate memory used by + // er, which was registered as evictable by a previous call to + // MemoryFile.MarkEvictable. + // + // Evict is not required to deallocate memory. In particular, since pgalloc + // must call Evict without holding locks to avoid circular lock ordering, + // it is possible that the passed range has already been marked as + // unevictable by a racing call to MemoryFile.MarkUnevictable. + // Implementations of EvictableMemoryUser must detect such races and handle + // them by making Evict have no effect on unevictable ranges. + // + // After a call to Evict, the MemoryFile will consider the evicted range + // unevictable (i.e. it will not call Evict on the same range again) until + // informed otherwise by a subsequent call to MarkEvictable. + Evict(ctx context.Context, er EvictableRange) +} + +// An EvictableRange represents a range of uint64 offsets in an +// EvictableMemoryUser. 
+// +// In practice, most EvictableMemoryUsers will probably be implementations of +// memmap.Mappable, and EvictableRange therefore corresponds to +// memmap.MappableRange. However, this package cannot depend on the memmap +// package, since doing so would create a circular dependency. +// +// type EvictableRange + +// evictableMemoryUserInfo is the value type of MemoryFile.evictable. +type evictableMemoryUserInfo struct { + // ranges tracks all evictable ranges for the given user. + ranges evictableRangeSet + + // If evicting is true, there is a goroutine currently evicting all + // evictable ranges for this user. + evicting bool +} + const ( chunkShift = 24 chunkSize = 1 << chunkShift // 16 MB @@ -180,7 +260,15 @@ const ( // NewMemoryFile creates a MemoryFile backed by the given file. If // NewMemoryFile succeeds, ownership of file is transferred to the returned // MemoryFile. -func NewMemoryFile(file *os.File) (*MemoryFile, error) { +func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { + switch opts.DelayedEviction { + case DelayedEvictionDefault: + opts.DelayedEviction = DelayedEvictionEnabled + case DelayedEvictionDisabled, DelayedEvictionEnabled: + default: + return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) + } + // Truncate the file to 0 bytes first to ensure that it's empty. if err := file.Truncate(0); err != nil { return nil, err @@ -189,14 +277,16 @@ func NewMemoryFile(file *os.File) (*MemoryFile, error) { return nil, err } f := &MemoryFile{ + opts: opts, fileSize: initialSize, file: file, // No pages are reclaimable. DecRef will always be able to // decrease minReclaimablePage from this point. 
minReclaimablePage: maxPage, + evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), } - f.reclaimCond.L = &f.mu f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + f.reclaimCond.L = &f.mu go f.runReclaim() // S/R-SAFE: f.mu // The Linux kernel contains an optional feature called "Integrity @@ -434,113 +524,6 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) { f.usage.MergeRange(fr) } -// runReclaim implements the reclaimer goroutine, which continuously decommits -// reclaimable pages in order to reduce memory usage and make them available -// for allocation. -func (f *MemoryFile) runReclaim() { - for { - fr, ok := f.findReclaimable() - if !ok { - break - } - - if err := f.Decommit(fr); err != nil { - log.Warningf("Reclaim failed to decommit %v: %v", fr, err) - // Zero the pages manually. This won't reduce memory usage, but at - // least ensures that the pages will be zero when reallocated. - f.forEachMappingSlice(fr, func(bs []byte) { - for i := range bs { - bs[i] = 0 - } - }) - // Pretend the pages were decommitted even though they weren't, - // since the memory accounting implementation has no idea how to - // deal with this. - f.markDecommitted(fr) - } - f.markReclaimed(fr) - } - // We only get here if findReclaimable finds f.destroyed set and returns - // false. - f.mu.Lock() - defer f.mu.Unlock() - if !f.destroyed { - panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") - } - f.file.Close() - // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd - // that has possibly been reassigned. - f.file = nil - mappings := f.mappings.Load().([]uintptr) - for i, m := range mappings { - if m != 0 { - _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) - if errno != 0 { - log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) - } - } - } - // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) 
- f.mappings.Store([]uintptr{}) -} - -func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { - f.mu.Lock() - defer f.mu.Unlock() - for { - for { - if f.destroyed { - return platform.FileRange{}, false - } - if f.reclaimable { - break - } - f.reclaimCond.Wait() - } - // Allocate returns the first usable range in offset order and is - // currently a linear scan, so reclaiming from the beginning of the - // file minimizes the expected latency of Allocate. - for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { - if seg.ValuePtr().refs == 0 { - f.minReclaimablePage = seg.End() - return seg.Range(), true - } - } - // No pages are reclaimable. - f.reclaimable = false - f.minReclaimablePage = maxPage - } -} - -func (f *MemoryFile) markReclaimed(fr platform.FileRange) { - f.mu.Lock() - defer f.mu.Unlock() - seg := f.usage.FindSegment(fr.Start) - // All of fr should be mapped to a single uncommitted reclaimable segment - // accounted to System. - if !seg.Ok() { - panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) - } - if !seg.Range().IsSupersetOf(fr) { - panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) - } - if got, want := seg.Value(), (usageInfo{ - kind: usage.System, - knownCommitted: false, - refs: 0, - }); got != want { - panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) - } - // Deallocate reclaimed pages. Even though all of seg is reclaimable, the - // caller of markReclaimed may not have decommitted it, so we can only mark - // fr as reclaimed. - f.usage.Remove(f.usage.Isolate(seg, fr)) - if fr.Start < f.minUnallocatedPage { - // We've deallocated at least one lower page. - f.minUnallocatedPage = fr.Start - } -} - // IncRef implements platform.File.IncRef. 
func (f *MemoryFile) IncRef(fr platform.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { @@ -677,9 +660,82 @@ func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { return mappings, m, nil } -// FD implements platform.File.FD. -func (f *MemoryFile) FD() int { - return int(f.file.Fd()) +// MarkEvictable allows f to request memory deallocation by calling +// user.Evict(er) in the future. +// +// Redundantly marking an already-evictable range as evictable has no effect. +func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + info = &evictableMemoryUserInfo{} + f.evictable[user] = info + } + gap := info.ranges.LowerBoundGap(er.Start) + for gap.Ok() && gap.Start() < er.End { + gapER := gap.Range().Intersect(er) + if gapER.Length() == 0 { + gap = gap.NextGap() + continue + } + gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap() + } + if !info.evicting { + switch f.opts.DelayedEviction { + case DelayedEvictionDisabled: + // Kick off eviction immediately. + f.startEvictionGoroutineLocked(user, info) + case DelayedEvictionEnabled: + // Ensure that the reclaimer goroutine is running, so that it can + // start eviction when necessary. + f.reclaimCond.Signal() + } + } +} + +// MarkUnevictable informs f that user no longer considers er to be evictable, +// so the MemoryFile should no longer call user.Evict(er). Note that, per +// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be +// called even after MarkUnevictable returns due to race conditions, and +// implementations of EvictableMemoryUser must handle this possibility. +// +// Redundantly marking an already-unevictable range as unevictable has no +// effect. 
+func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + seg := info.ranges.LowerBoundSegment(er.Start) + for seg.Ok() && seg.Start() < er.End { + seg = info.ranges.Isolate(seg, er) + seg = info.ranges.Remove(seg).NextSegment() + } + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting && info.ranges.IsEmpty() { + delete(f.evictable, user) + } +} + +// MarkAllUnevictable informs f that user no longer considers any offsets to be +// evictable. It otherwise has the same semantics as MarkUnevictable. +func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + info.ranges.RemoveAll() + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting { + delete(f.evictable, user) + } } // UpdateUsage ensures that the memory usage statistics in @@ -889,6 +945,11 @@ func (f *MemoryFile) File() *os.File { return f.file } +// FD implements platform.File.FD. +func (f *MemoryFile) FD() int { + return int(f.file.Fd()) +} + // String implements fmt.Stringer.String. // // Note that because f.String locks f.mu, calling f.String internally @@ -900,6 +961,167 @@ func (f *MemoryFile) String() string { return f.usage.String() } +// runReclaim implements the reclaimer goroutine, which continuously decommits +// reclaimable pages in order to reduce memory usage and make them available +// for allocation. +func (f *MemoryFile) runReclaim() { + for { + fr, ok := f.findReclaimable() + if !ok { + break + } + + if err := f.Decommit(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the pages manually. This won't reduce memory usage, but at + // least ensures that the pages will be zero when reallocated. 
+ f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) + // Pretend the pages were decommitted even though they weren't, + // since the memory accounting implementation has no idea how to + // deal with this. + f.markDecommitted(fr) + } + f.markReclaimed(fr) + } + // We only get here if findReclaimable finds f.destroyed set and returns + // false. + f.mu.Lock() + defer f.mu.Unlock() + if !f.destroyed { + panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") + } + f.file.Close() + // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd + // that has possibly been reassigned. + f.file = nil + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + mappings := f.mappings.Load().([]uintptr) + for i, m := range mappings { + if m != 0 { + _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) + if errno != 0 { + log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) + } + } + } + // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) + f.mappings.Store([]uintptr{}) +} + +func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { + f.mu.Lock() + defer f.mu.Unlock() + for { + for { + if f.destroyed { + return platform.FileRange{}, false + } + if f.reclaimable { + break + } + if f.opts.DelayedEviction == DelayedEvictionEnabled { + // No work to do. Evict any pending evictable allocations to + // get more reclaimable pages before going to sleep. + f.startEvictionsLocked() + } + f.reclaimCond.Wait() + } + // Allocate returns the first usable range in offset order and is + // currently a linear scan, so reclaiming from the beginning of the + // file minimizes the expected latency of Allocate. + for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { + if seg.ValuePtr().refs == 0 { + f.minReclaimablePage = seg.End() + return seg.Range(), true + } + } + // No pages are reclaimable. 
+ f.reclaimable = false + f.minReclaimablePage = maxPage + } +} + +func (f *MemoryFile) markReclaimed(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + seg := f.usage.FindSegment(fr.Start) + // All of fr should be mapped to a single uncommitted reclaimable segment + // accounted to System. + if !seg.Ok() { + panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) + } + if !seg.Range().IsSupersetOf(fr) { + panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) + } + if got, want := seg.Value(), (usageInfo{ + kind: usage.System, + knownCommitted: false, + refs: 0, + }); got != want { + panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) + } + // Deallocate reclaimed pages. Even though all of seg is reclaimable, the + // caller of markReclaimed may not have decommitted it, so we can only mark + // fr as reclaimed. + f.usage.Remove(f.usage.Isolate(seg, fr)) + if fr.Start < f.minUnallocatedPage { + // We've deallocated at least one lower page. + f.minUnallocatedPage = fr.Start + } +} + +// Preconditions: f.mu must be locked. +func (f *MemoryFile) startEvictionsLocked() { + for user, info := range f.evictable { + // Don't start multiple goroutines to evict the same user's + // allocations. + if !info.evicting { + f.startEvictionGoroutineLocked(user, info) + } + } +} + +// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be +// locked. +func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { + info.evicting = true + f.evictionWG.Add(1) + go func() { // S/R-SAFE: f.evictionWG + defer f.evictionWG.Done() + for { + f.mu.Lock() + info, ok := f.evictable[user] + if !ok { + // This shouldn't happen: only this goroutine is permitted + // to delete this entry. 
+ f.mu.Unlock() + panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) + } + if info.ranges.IsEmpty() { + delete(f.evictable, user) + f.mu.Unlock() + return + } + // Evict from the end of info.ranges, under the assumption that + // if ranges in user start being used again (and are + // consequently marked unevictable), such uses are more likely + // to start from the beginning of user. + seg := info.ranges.LastSegment() + er := seg.Range() + info.ranges.Remove(seg) + // user.Evict() must be called without holding f.mu to avoid + // circular lock ordering. + f.mu.Unlock() + user.Evict(context.Background(), er) + } + }() +} + type usageSetFunctions struct{} func (usageSetFunctions) MinKey() uint64 { @@ -920,3 +1142,27 @@ func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform. func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } + +// evictableRangeSetValue is the value type of evictableRangeSet. 
+type evictableRangeSetValue struct{} + +type evictableRangeSetFunctions struct{} + +func (evictableRangeSetFunctions) MinKey() uint64 { + return 0 +} + +func (evictableRangeSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { +} + +func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { + return evictableRangeSetValue{}, true +} + +func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { + return evictableRangeSetValue{}, evictableRangeSetValue{} +} diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index cf169af55..9534d1aed 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -28,6 +28,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/state" ) +// FlushEvictions blocks until f has finished evicting all evictable +// allocations. +func (f *MemoryFile) FlushEvictions() { + f.mu.Lock() + f.startEvictionsLocked() + f.mu.Unlock() + f.evictionWG.Wait() +} + // SaveTo writes f's state to the given stream. func (f *MemoryFile) SaveTo(w io.Writer) error { // Wait for reclaim. @@ -40,6 +49,11 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { f.mu.Lock() } + // Ensure that there are no pending evictions. + if len(f.evictable) != 0 { + panic(fmt.Sprintf("evictions still pending for %d users; call FlushEvictions before SaveTo", len(f.evictable))) + } + // Ensure that all pages that contain data have knownCommitted set, since // we only store knownCommitted pages below. 
zeroPage := make([]byte, usermem.PageSize) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0b5be0a42..05122a6a8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -424,7 +424,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) -- cgit v1.2.3 From 23ca9886c6cfe499438f1b994ee66a4f803673ae Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 30 Apr 2019 15:41:42 -0700 Subject: Update reference to old type PiperOrigin-RevId: 246036806 Change-Id: I5554a43a1f8146c927402db3bf98488a2da0fbe7 --- pkg/sentry/fs/dentry.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index fe656cc24..c0fc74723 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -73,8 +73,8 @@ func (c *CollectEntriesSerializer) Written() int { return len(c.Entries) } -// DirCtx is used by node.Readdir to emit directory entries. It is not -// thread-safe. +// DirCtx is used in FileOperations.IterateDir to emit directory entries. It is +// not thread-safe. type DirCtx struct { // Serializer is used to serialize the node attributes. 
Serializer DentrySerializer -- cgit v1.2.3 From 24d8656585e6072ff7d5a00a7eb4bd25cba42dc4 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 3 May 2019 14:00:31 -0700 Subject: gofer: don't leak file descriptors Fixes #219 PiperOrigin-RevId: 246568639 Change-Id: Ic7afd15dde922638d77f6429c508d1cbe2e4288a --- pkg/sentry/fs/gofer/cache_policy.go | 3 ++- pkg/sentry/fs/gofer/path.go | 4 ++++ runsc/fsgofer/fsgofer.go | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 35cd0c1d6..c59344589 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -139,11 +139,12 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child // TODO(b/112031682): If we have a directory FD in the parent // inodeOperations, then we can use fstatat(2) to get the inode // attributes instead of making this RPC. - qids, _, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) + qids, f, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { // Can't look up the name. Trigger reload. return true } + f.close(ctx) // If the Path has changed, then we are not looking at the file file. // We must reload. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 4cbf9e9d9..aa3d3aaa6 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -109,6 +109,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string hostFile, err := newFile.create(ctx, name, openFlags, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) if err != nil { // Could not create the file. 
+ newFile.close(ctx) return nil, err } @@ -120,11 +121,14 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string qids, unopened, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { newFile.close(ctx) + hostFile.Close() return nil, err } if len(qids) != 1 { log.Warningf("WalkGetAttr(%s) succeeded, but returned %d QIDs (%v), wanted 1", name, len(qids), qids) newFile.close(ctx) + hostFile.Close() + unopened.close(ctx) return nil, syserror.EIO } qid := qids[0] diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 158f22ddc..3a0806837 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -502,6 +502,9 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { last := l for _, name := range names { f, path, err := openAnyFileFromParent(last, name) + if last != l { + last.Close() + } if err != nil { return nil, nil, extractErrno(err) } -- cgit v1.2.3 From 14f0e7618e28dac78ca7b00ec61fcec062159009 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 6 May 2019 16:38:37 -0700 Subject: Ensure all uses of MM.brk occur under MM.mappingMu in MM.Brk(). PiperOrigin-RevId: 246921386 Change-Id: I71d8908858f45a9a33a0483470d0240eaf0fd012 --- pkg/sentry/mm/syscalls.go | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index a25318abb..70c9aa7f6 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -694,8 +694,9 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // Can't defer mm.mappingMu.Unlock(); see below. 
if addr < mm.brk.Start { + addr = mm.brk.End mm.mappingMu.Unlock() - return mm.brk.End, syserror.EINVAL + return addr, syserror.EINVAL } // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is @@ -704,22 +705,20 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // size of heap + data + bss. The segment sizes need to be plumbed from // the loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + addr = mm.brk.End mm.mappingMu.Unlock() - return mm.brk.End, syserror.ENOMEM + return addr, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { + addr = mm.brk.End mm.mappingMu.Unlock() - return mm.brk.End, syserror.EFAULT + return addr, syserror.EFAULT } switch { - case newbrkpg < oldbrkpg: - mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) - mm.mappingMu.Unlock() - case oldbrkpg < newbrkpg: vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), @@ -736,21 +735,26 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Hint: "[heap]", }) if err != nil { + addr = mm.brk.End mm.mappingMu.Unlock() - return mm.brk.End, err + return addr, err } + mm.brk.End = addr if mm.defMLockMode == memmap.MLockEager { mm.populateVMAAndUnlock(ctx, vseg, ar, true) } else { mm.mappingMu.Unlock() } + case newbrkpg < oldbrkpg: + mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + fallthrough + default: - // Nothing to do. + mm.brk.End = addr mm.mappingMu.Unlock() } - mm.brk.End = addr return addr, nil } -- cgit v1.2.3 From e5432fa1b365edcebf9c8c01e2c40ade3014f282 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 7 May 2019 10:53:50 -0700 Subject: Remove defers from gofer.contextFile Most are single line methods in hot paths. 
PiperOrigin-RevId: 247050267 Change-Id: I428d78723fe00b57483185899dc8fa9e1f01e2ea --- pkg/sentry/fs/gofer/context_file.go | 118 ++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 59 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index d512afefc..842a34af8 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -28,156 +28,156 @@ type contextFile struct { func (c *contextFile) walk(ctx context.Context, names []string) ([]p9.QID, contextFile, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) q, f, err := c.file.Walk(names) if err != nil { + ctx.UninterruptibleSleepFinish(false) return nil, contextFile{}, err } + ctx.UninterruptibleSleepFinish(false) return q, contextFile{file: f}, nil } func (c *contextFile) statFS(ctx context.Context) (p9.FSStat, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.StatFS() + s, err := c.file.StatFS() + ctx.UninterruptibleSleepFinish(false) + return s, err } func (c *contextFile) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.GetAttr(req) + q, m, a, err := c.file.GetAttr(req) + ctx.UninterruptibleSleepFinish(false) + return q, m, a, err } func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.SetAttr(valid, attr) + err := c.file.SetAttr(valid, attr) + ctx.UninterruptibleSleepFinish(false) + return err } func (c *contextFile) rename(ctx context.Context, directory contextFile, name string) error { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Rename(directory.file, name) 
+ err := c.file.Rename(directory.file, name) + ctx.UninterruptibleSleepFinish(false) + return err } func (c *contextFile) close(ctx context.Context) error { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Close() + err := c.file.Close() + ctx.UninterruptibleSleepFinish(false) + return err } func (c *contextFile) open(ctx context.Context, mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Open(mode) + f, q, u, err := c.file.Open(mode) + ctx.UninterruptibleSleepFinish(false) + return f, q, u, err } func (c *contextFile) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.ReadAt(p, offset) + n, err := c.file.ReadAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err } func (c *contextFile) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.WriteAt(p, offset) + n, err := c.file.WriteAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err } func (c *contextFile) fsync(ctx context.Context) error { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.FSync() + err := c.file.FSync() + ctx.UninterruptibleSleepFinish(false) + return err } func (c *contextFile) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - fd, _, _, _, err := c.file.Create(name, flags, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) return fd, err } func (c *contextFile) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, 
error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Mkdir(name, permissions, uid, gid) + q, err := c.file.Mkdir(name, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return q, err } func (c *contextFile) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Symlink(oldName, newName, uid, gid) + q, err := c.file.Symlink(oldName, newName, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return q, err } func (c *contextFile) link(ctx context.Context, target *contextFile, newName string) error { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Link(target.file, newName) + err := c.file.Link(target.file, newName) + ctx.UninterruptibleSleepFinish(false) + return err } func (c *contextFile) mknod(ctx context.Context, name string, permissions p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Mknod(name, permissions, major, minor, uid, gid) + q, err := c.file.Mknod(name, permissions, major, minor, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return q, err } func (c *contextFile) unlinkAt(ctx context.Context, name string, flags uint32) error { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.UnlinkAt(name, flags) + err := c.file.UnlinkAt(name, flags) + ctx.UninterruptibleSleepFinish(false) + return err } func (c *contextFile) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Readdir(offset, count) + d, err := c.file.Readdir(offset, count) + ctx.UninterruptibleSleepFinish(false) + 
return d, err } func (c *contextFile) readlink(ctx context.Context) (string, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Readlink() + s, err := c.file.Readlink() + ctx.UninterruptibleSleepFinish(false) + return s, err } func (c *contextFile) flush(ctx context.Context) error { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Flush() + err := c.file.Flush() + ctx.UninterruptibleSleepFinish(false) + return err } func (c *contextFile) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, contextFile, p9.AttrMask, p9.Attr, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - q, f, m, a, err := c.file.WalkGetAttr(names) if err != nil { + ctx.UninterruptibleSleepFinish(false) return nil, contextFile{}, p9.AttrMask{}, p9.Attr{}, err } + ctx.UninterruptibleSleepFinish(false) return q, contextFile{file: f}, m, a, nil } func (c *contextFile) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { ctx.UninterruptibleSleepStart(false) - defer ctx.UninterruptibleSleepFinish(false) - - return c.file.Connect(flags) + f, err := c.file.Connect(flags) + ctx.UninterruptibleSleepFinish(false) + return f, err } -- cgit v1.2.3 From bfd9f75ba4390de824d2c3d44c15bdca9dd0ff35 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 8 May 2019 14:34:01 -0700 Subject: Set the FilesytemType in MountSource from the Filesystem. And stop storing the Filesystem in the MountSource. This allows us to decouple the MountSource filesystem type from the name of the filesystem. 
PiperOrigin-RevId: 247292982 Change-Id: I49cbcce3c17883b7aa918ba76203dfd6d1b03cc8 --- pkg/sentry/fs/mount.go | 11 +++++++---- pkg/sentry/fs/proc/mounts.go | 12 ++---------- pkg/tcpip/link/muxed/BUILD | 4 +++- pkg/tcpip/transport/raw/BUILD | 4 +++- runsc/BUILD | 4 +++- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index a169ea4c9..9740f1fc6 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -110,9 +110,8 @@ type MountSource struct { // MountSourceOperations defines filesystem specific behavior. MountSourceOperations - // Filesystem is the filesystem backing the mount. Can be nil if there - // is no filesystem backing the mount. - Filesystem Filesystem + // FilesystemType is the type of the filesystem backing this mount. + FilesystemType string // Flags are the flags that this filesystem was mounted with. Flags MountSourceFlags @@ -158,10 +157,14 @@ const DefaultDirentCacheSize uint64 = 1000 // NewMountSource returns a new MountSource. Filesystem may be nil if there is no // filesystem backing the mount. func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags MountSourceFlags) *MountSource { + fsType := "none" + if filesystem != nil { + fsType = filesystem.Name() + } return &MountSource{ MountSourceOperations: mops, Flags: flags, - Filesystem: filesystem, + FilesystemType: fsType, fscache: NewDirentCache(DefaultDirentCacheSize), children: make(map[*MountSource]struct{}), } diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 37ed30724..b5e01301f 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -139,11 +139,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se fmt.Fprintf(&buf, "- ") // (9) Filesystem type. 
- name := "none" - if m.Filesystem != nil { - name = m.Filesystem.Name() - } - fmt.Fprintf(&buf, "%s ", name) + fmt.Fprintf(&buf, "%s ", m.FilesystemType) // (10) Mount source: filesystem-specific information or "none". fmt.Fprintf(&buf, "none ") @@ -190,11 +186,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan if m.Flags.ReadOnly { opts = "ro" } - name := "none" - if m.Filesystem != nil { - name = m.Filesystem.Name() - } - fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, name, opts, 0, 0) + fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, m.FilesystemType, opts, 0, 0) }) return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0 diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD index f991dca83..84cfae784 100644 --- a/pkg/tcpip/link/muxed/BUILD +++ b/pkg/tcpip/link/muxed/BUILD @@ -1,6 +1,8 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) go_library( name = "muxed", diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 52f6b9759..6d3f0130e 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -1,4 +1,6 @@ -package(licenses = ["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library") diff --git a/runsc/BUILD b/runsc/BUILD index 4d2046ed3..af8e928c5 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,4 +1,6 @@ -package(licenses = ["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load("@io_bazel_rules_go//go:def.bzl", "go_binary") load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar") -- cgit v1.2.3 From 1bee43be13549b01e18d87df194ac219845de5cf Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 9 May 2019 15:34:44 -0700 Subject: Implement 
fallocate(2) Closes #225 PiperOrigin-RevId: 247508791 Change-Id: I04f47cf2770b30043e5a272aba4ba6e11d0476cc --- pkg/abi/linux/file.go | 11 +++ pkg/p9/BUILD | 1 + pkg/p9/client_file.go | 12 +++ pkg/p9/file.go | 4 + pkg/p9/handlers.go | 34 +++++++++ pkg/p9/local_server/local_server.go | 5 ++ pkg/p9/messages.go | 59 ++++++++++++++ pkg/p9/p9.go | 81 ++++++++++++++++++++ pkg/p9/version.go | 7 +- pkg/sentry/fs/ashmem/device.go | 1 + pkg/sentry/fs/binder/binder.go | 1 + pkg/sentry/fs/dev/full.go | 3 +- pkg/sentry/fs/dev/null.go | 9 ++- pkg/sentry/fs/dev/random.go | 9 ++- pkg/sentry/fs/fsutil/BUILD | 1 + pkg/sentry/fs/fsutil/host_mappable.go | 10 ++- pkg/sentry/fs/fsutil/inode.go | 25 ++++++ pkg/sentry/fs/fsutil/inode_cached.go | 28 +++++++ pkg/sentry/fs/fsutil/inode_cached_test.go | 9 +++ pkg/sentry/fs/gofer/context_file.go | 7 ++ pkg/sentry/fs/gofer/inode.go | 24 ++++++ pkg/sentry/fs/host/inode.go | 18 +++++ pkg/sentry/fs/inode.go | 7 ++ pkg/sentry/fs/inode_operations.go | 4 + pkg/sentry/fs/inode_overlay.go | 7 ++ pkg/sentry/fs/inode_overlay_test.go | 1 + pkg/sentry/fs/mock.go | 5 ++ pkg/sentry/fs/proc/inode.go | 1 + pkg/sentry/fs/proc/seqfile/seqfile.go | 3 +- pkg/sentry/fs/proc/uid_gid_map.go | 3 +- pkg/sentry/fs/ramfs/dir.go | 1 + pkg/sentry/fs/ramfs/socket.go | 3 +- pkg/sentry/fs/ramfs/symlink.go | 5 +- pkg/sentry/fs/sys/devices.go | 3 +- pkg/sentry/fs/tmpfs/inode_file.go | 27 +++++++ pkg/sentry/fs/tmpfs/tmpfs.go | 8 +- pkg/sentry/fs/tty/dir.go | 5 +- pkg/sentry/kernel/pipe/node.go | 4 + pkg/sentry/syscalls/linux/sys_file.go | 38 ++++++++- runsc/boot/compat.go | 2 +- runsc/fsgofer/filter/config.go | 10 ++- runsc/fsgofer/fsgofer.go | 12 +++ test/syscalls/linux/BUILD | 2 + test/syscalls/linux/fallocate.cc | 123 +++++++++++++++++++++++++----- 44 files changed, 589 insertions(+), 44 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 753fec3ed..81ff9fe9e 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ 
-254,3 +254,14 @@ const ( F_SEAL_GROW = 0x0004 // Prevent file from growing. F_SEAL_WRITE = 0x0008 // Prevent writes. ) + +// Constants related to fallocate(2). Source: include/uapi/linux/falloc.h +const ( + FALLOC_FL_KEEP_SIZE = 0x01 + FALLOC_FL_PUNCH_HOLE = 0x02 + FALLOC_FL_NO_HIDE_STALE = 0x04 + FALLOC_FL_COLLAPSE_RANGE = 0x08 + FALLOC_FL_ZERO_RANGE = 0x10 + FALLOC_FL_INSERT_RANGE = 0x20 + FALLOC_FL_UNSHARE_RANGE = 0x40 +) diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index 5d972309d..36b2ec5f6 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -26,6 +26,7 @@ go_library( "//pkg/fd", "//pkg/log", "//pkg/unet", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 63c65129a..471c3a80b 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -171,6 +171,18 @@ func (c *clientFile) SetAttr(valid SetAttrMask, attr SetAttr) error { return c.client.sendRecv(&Tsetattr{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattr{}) } +// Allocate implements File.Allocate. +func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error { + if atomic.LoadUint32(&c.closed) != 0 { + return syscall.EBADF + } + if !versionSupportsTallocate(c.client.version) { + return syscall.EOPNOTSUPP + } + + return c.client.sendRecv(&Tallocate{FID: c.fid, Mode: mode, Offset: offset, Length: length}, &Rallocate{}) +} + // Remove implements File.Remove. // // N.B. This method is no longer part of the file interface and should be diff --git a/pkg/p9/file.go b/pkg/p9/file.go index a52a0f3e7..89e814d50 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -89,6 +89,10 @@ type File interface { // On the server, SetAttr has a write concurrency guarantee. SetAttr(valid SetAttrMask, attr SetAttr) error + // Allocate allows the caller to directly manipulate the allocated disk space + // for the file. See fallocate(2) for more details. 
+ Allocate(mode AllocateMode, offset, length uint64) error + // Close is called when all references are dropped on the server side, // and Close should be called by the client to drop all references. // diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 6da2ce4e3..533ead98a 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -877,6 +877,40 @@ func (t *Tsetattr) handle(cs *connState) message { return &Rsetattr{} } +// handle implements handler.handle. +func (t *Tallocate) handle(cs *connState) message { + // Lookup the FID. + ref, ok := cs.LookupFID(t.FID) + if !ok { + return newErr(syscall.EBADF) + } + defer ref.DecRef() + + if err := ref.safelyWrite(func() error { + // Has it been opened already? + openFlags, opened := ref.OpenFlags() + if !opened { + return syscall.EINVAL + } + + // Can it be written? Check permissions. + if openFlags&OpenFlagsModeMask == ReadOnly { + return syscall.EBADF + } + + // We don't allow allocate on files that have been deleted. + if ref.isDeleted() { + return syscall.EINVAL + } + + return ref.file.Allocate(t.Mode, t.Offset, t.Length) + }); err != nil { + return newErr(err) + } + + return &Rallocate{} +} + // handle implements handler.handle. func (t *Txattrwalk) handle(cs *connState) message { // Lookup the FID. diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index f4077a9d4..d49d94550 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -323,6 +323,11 @@ func (l *local) Renamed(parent p9.File, newName string) { l.path = path.Join(parent.(*local).path, newName) } +// Allocate implements p9.File.Allocate. 
+func (l *local) Allocate(mode p9.AllocateMode, offset, length uint64) error { + return syscall.Fallocate(int(l.file.Fd()), mode.ToLinux(), int64(offset), int64(length)) +} + func main() { log.SetLevel(log.Debug) diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index 3c7898cc1..703753c31 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -1424,6 +1424,63 @@ func (r *Rsetattr) String() string { return fmt.Sprintf("Rsetattr{}") } +// Tallocate is an allocate request. This is an extension to 9P protocol, not +// present in the 9P2000.L standard. +type Tallocate struct { + FID FID + Mode AllocateMode + Offset uint64 + Length uint64 +} + +// Decode implements encoder.Decode. +func (t *Tallocate) Decode(b *buffer) { + t.FID = b.ReadFID() + t.Mode.Decode(b) + t.Offset = b.Read64() + t.Length = b.Read64() +} + +// Encode implements encoder.Encode. +func (t *Tallocate) Encode(b *buffer) { + b.WriteFID(t.FID) + t.Mode.Encode(b) + b.Write64(t.Offset) + b.Write64(t.Length) +} + +// Type implements message.Type. +func (*Tallocate) Type() MsgType { + return MsgTallocate +} + +// String implements fmt.Stringer. +func (t *Tallocate) String() string { + return fmt.Sprintf("Tallocate{FID: %d, Offset: %d, Length: %d}", t.FID, t.Offset, t.Length) +} + +// Rallocate is an allocate response. +type Rallocate struct { +} + +// Decode implements encoder.Decode. +func (*Rallocate) Decode(b *buffer) { +} + +// Encode implements encoder.Encode. +func (*Rallocate) Encode(b *buffer) { +} + +// Type implements message.Type. +func (*Rallocate) Type() MsgType { + return MsgRallocate +} + +// String implements fmt.Stringer. +func (r *Rallocate) String() string { + return fmt.Sprintf("Rallocate{}") +} + // Txattrwalk walks extended attributes. type Txattrwalk struct { // FID is the FID to check for attributes. 
@@ -2297,4 +2354,6 @@ func init() { msgRegistry.register(MsgRusymlink, func() message { return &Rusymlink{} }) msgRegistry.register(MsgTlconnect, func() message { return &Tlconnect{} }) msgRegistry.register(MsgRlconnect, func() message { return &Rlconnect{} }) + msgRegistry.register(MsgTallocate, func() message { return &Tallocate{} }) + msgRegistry.register(MsgRallocate, func() message { return &Rallocate{} }) } diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 78c7d3f86..4039862e6 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -22,6 +22,8 @@ import ( "strings" "sync/atomic" "syscall" + + "golang.org/x/sys/unix" ) // OpenFlags is the mode passed to Open and Create operations. @@ -374,6 +376,8 @@ const ( MsgRusymlink = 135 MsgTlconnect = 136 MsgRlconnect = 137 + MsgTallocate = 138 + MsgRallocate = 139 ) // QIDType represents the file type for QIDs. @@ -1058,3 +1062,80 @@ func (d *Dirent) Encode(b *buffer) { b.WriteQIDType(d.Type) b.WriteString(d.Name) } + +// AllocateMode are possible modes to p9.File.Allocate(). +type AllocateMode struct { + KeepSize bool + PunchHole bool + NoHideStale bool + CollapseRange bool + ZeroRange bool + InsertRange bool + Unshare bool +} + +// ToLinux converts to a value compatible with fallocate(2)'s mode. +func (a *AllocateMode) ToLinux() uint32 { + rv := uint32(0) + if a.KeepSize { + rv |= unix.FALLOC_FL_KEEP_SIZE + } + if a.PunchHole { + rv |= unix.FALLOC_FL_PUNCH_HOLE + } + if a.NoHideStale { + rv |= unix.FALLOC_FL_NO_HIDE_STALE + } + if a.CollapseRange { + rv |= unix.FALLOC_FL_COLLAPSE_RANGE + } + if a.ZeroRange { + rv |= unix.FALLOC_FL_ZERO_RANGE + } + if a.InsertRange { + rv |= unix.FALLOC_FL_INSERT_RANGE + } + if a.Unshare { + rv |= unix.FALLOC_FL_UNSHARE_RANGE + } + return rv +} + +// Decode implements encoder.Decode. 
+func (a *AllocateMode) Decode(b *buffer) { + mask := b.Read32() + a.KeepSize = mask&0x01 != 0 + a.PunchHole = mask&0x02 != 0 + a.NoHideStale = mask&0x04 != 0 + a.CollapseRange = mask&0x08 != 0 + a.ZeroRange = mask&0x10 != 0 + a.InsertRange = mask&0x20 != 0 + a.Unshare = mask&0x40 != 0 +} + +// Encode implements encoder.Encode. +func (a *AllocateMode) Encode(b *buffer) { + mask := uint32(0) + if a.KeepSize { + mask |= 0x01 + } + if a.PunchHole { + mask |= 0x02 + } + if a.NoHideStale { + mask |= 0x04 + } + if a.CollapseRange { + mask |= 0x08 + } + if a.ZeroRange { + mask |= 0x10 + } + if a.InsertRange { + mask |= 0x20 + } + if a.Unshare { + mask |= 0x40 + } + b.Write32(mask) +} diff --git a/pkg/p9/version.go b/pkg/p9/version.go index a36a499a1..c2a2885ae 100644 --- a/pkg/p9/version.go +++ b/pkg/p9/version.go @@ -26,7 +26,7 @@ const ( // // Clients are expected to start requesting this version number and // to continuously decrement it until a Tversion request succeeds. - highestSupportedVersion uint32 = 6 + highestSupportedVersion uint32 = 7 // lowestSupportedVersion is the lowest supported version X in a // version string of the format 9P2000.L.Google.X. @@ -143,3 +143,8 @@ func VersionSupportsAnonymous(v uint32) bool { func VersionSupportsMultiUser(v uint32) bool { return v >= 6 } + +// versionSupportsTallocate returns true if version v supports Allocate(). 
+func versionSupportsTallocate(v uint32) bool { + return v >= 7 +} diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go index 5e005bc2e..22e1530e9 100644 --- a/pkg/sentry/fs/ashmem/device.go +++ b/pkg/sentry/fs/ashmem/device.go @@ -29,6 +29,7 @@ import ( type Device struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index acbbd5466..a992253e6 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -46,6 +46,7 @@ const ( type Device struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 6b11afa44..17d68b5c4 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -30,6 +30,7 @@ import ( type fullDevice struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` @@ -59,7 +60,6 @@ func (f *fullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type fullFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` @@ -69,6 +69,7 @@ type fullFileOperations struct { fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` readZeros 
`state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*fullFileOperations)(nil) diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 069212b6d..ee13183c8 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -29,6 +29,7 @@ import ( type nullDevice struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` @@ -60,17 +61,17 @@ func (n *nullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type nullFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRead `state:"nosave"` - fsutil.FileNoopWrite `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopWrite `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*nullFileOperations)(nil) @@ -101,16 +102,16 @@ func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // +stateify savable type zeroFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNoopWrite `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` readZeros `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*zeroFileOperations)(nil) diff --git 
a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index de0f3e5e5..b0a412382 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -29,6 +29,7 @@ import ( type randomDevice struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` @@ -57,16 +58,16 @@ func (*randomDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type randomFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNoopWrite `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*randomFileOperations)(nil) diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 01098675d..44f43b965 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -113,5 +113,6 @@ go_test( "//pkg/sentry/memmap", "//pkg/sentry/safemem", "//pkg/sentry/usermem", + "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 28686f3b3..ad0518b8f 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -149,7 +149,7 @@ func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error { } // Invalidate COW mappings that may exist beyond the new size in case the file - // is being shrunk. 
Other mappinsg don't need to be invalidated because + // is being shrunk. Other mappings don't need to be invalidated because // translate will just return identical mappings after invalidation anyway, // and SIGBUS will be raised and handled when the mappings are touched. // @@ -167,6 +167,14 @@ func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error { return nil } +// Allocate reserves space in the backing file. +func (h *HostMappable) Allocate(ctx context.Context, offset int64, length int64) error { + h.truncateMu.RLock() + err := h.backingFile.Allocate(ctx, offset, length) + h.truncateMu.RUnlock() + return err +} + // Write writes to the file backing this mappable. func (h *HostMappable) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { h.truncateMu.RLock() diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index b6366d906..151be1d0d 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -34,6 +34,7 @@ type SimpleFileInode struct { InodeNoExtendedAttributes `state:"nosave"` InodeNoopRelease `state:"nosave"` InodeNoopWriteOut `state:"nosave"` + InodeNotAllocatable `state:"nosave"` InodeNotDirectory `state:"nosave"` InodeNotMappable `state:"nosave"` InodeNotOpenable `state:"nosave"` @@ -61,6 +62,7 @@ type NoReadWriteFileInode struct { InodeNoExtendedAttributes `state:"nosave"` InodeNoopRelease `state:"nosave"` InodeNoopWriteOut `state:"nosave"` + InodeNotAllocatable `state:"nosave"` InodeNotDirectory `state:"nosave"` InodeNotMappable `state:"nosave"` InodeNotSocket `state:"nosave"` @@ -465,3 +467,26 @@ func (InodeDenyWriteChecker) Check(ctx context.Context, inode *fs.Inode, p fs.Pe } return fs.ContextCanAccessFile(ctx, inode, p) } + +//InodeNotAllocatable can be used by Inodes that do not support Allocate(). 
+type InodeNotAllocatable struct{} + +func (InodeNotAllocatable) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EOPNOTSUPP +} + +// InodeNoopAllocate implements fs.InodeOperations.Allocate as a noop. +type InodeNoopAllocate struct{} + +// Allocate implements fs.InodeOperations.Allocate. +func (InodeNoopAllocate) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return nil +} + +// InodeIsDirAllocate implements fs.InodeOperations.Allocate for directories. +type InodeIsDirAllocate struct{} + +// Allocate implements fs.InodeOperations.Allocate. +func (InodeIsDirAllocate) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EISDIR +} diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 76644e69d..03cad37f3 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -135,6 +135,10 @@ type CachedFileObject interface { // the file was opened. SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error + // Allocate allows the caller to reserve disk space for the inode. + // It's equivalent to fallocate(2) with 'mode=0'. + Allocate(ctx context.Context, offset int64, length int64) error + // Sync instructs the remote filesystem to sync the file to stable storage. Sync(ctx context.Context) error @@ -336,6 +340,30 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, return nil } +// Allocate implements fs.InodeOperations.Allocate. +func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length int64) error { + newSize := offset + length + + // c.attr.Size is protected by both c.attrMu and c.dataMu. 
+ c.attrMu.Lock() + defer c.attrMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + + if newSize <= c.attr.Size { + return nil + } + + now := ktime.NowFromContext(ctx) + if err := c.backingFile.Allocate(ctx, offset, length); err != nil { + return err + } + + c.attr.Size = newSize + c.touchModificationTimeLocked(now) + return nil +} + // WriteOut implements fs.InodeOperations.WriteOut. func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { c.attrMu.Lock() diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 3f10efc12..be3d4b6fc 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -26,6 +26,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) type noopBackingFile struct{} @@ -50,6 +51,10 @@ func (noopBackingFile) FD() int { return -1 } +func (noopBackingFile) Allocate(ctx context.Context, offset int64, length int64) error { + return nil +} + func TestSetPermissions(t *testing.T) { ctx := contexttest.Context(t) @@ -237,6 +242,10 @@ func (*sliceBackingFile) FD() int { return -1 } +func (f *sliceBackingFile) Allocate(ctx context.Context, offset int64, length int64) error { + return syserror.EOPNOTSUPP +} + type noopMappingSpace struct{} // Invalidate implements memmap.MappingSpace.Invalidate. 
diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 842a34af8..be53ac4d9 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -59,6 +59,13 @@ func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9 return err } +func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { + ctx.UninterruptibleSleepStart(false) + err := c.file.Allocate(mode, offset, length) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (c *contextFile) rename(ctx context.Context, directory contextFile, name string) error { ctx.UninterruptibleSleepStart(false) err := c.file.Rename(directory.file, name) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index f6f20844d..dcb3b2880 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -322,6 +322,15 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err return unstable(ctx, valid, pattr, i.s.mounter, i.s.client), nil } +func (i *inodeFileState) Allocate(ctx context.Context, offset, length int64) error { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + + // No options are supported for now. + mode := p9.AllocateMode{} + return i.writeHandles.File.allocate(ctx, mode, uint64(offset), uint64(length)) +} + // session extracts the gofer's session from the MountSource. func (i *inodeOperations) session() *session { return i.fileState.s @@ -498,6 +507,21 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)}) } +// Allocate implements fs.InodeOperations.Allocate. +func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error { + // This can only be called for files anyway. 
+ if i.session().cachePolicy.useCachingInodeOps(inode) { + return i.cachingInodeOps.Allocate(ctx, offset, length) + } + if i.session().cachePolicy == cacheRemoteRevalidating { + return i.fileState.hostMappable.Allocate(ctx, offset, length) + } + + // No options are supported for now. + mode := p9.AllocateMode{} + return i.fileState.file.allocate(ctx, mode, uint64(offset), uint64(length)) +} + // WriteOut implements fs.InodeOperations.WriteOut. func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { if !i.session().cachePolicy.cacheUAttrs(inode) { diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 20e077f77..d36ac9a87 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -163,6 +163,11 @@ func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, err return unstableAttr(i.mops, &s), nil } +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +func (i *inodeFileState) Allocate(_ context.Context, offset, length int64) error { + return syscall.Fallocate(i.FD(), 0, offset, length) +} + // inodeOperations implements fs.InodeOperations. var _ fs.InodeOperations = (*inodeOperations)(nil) @@ -397,6 +402,19 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size in return i.cachingInodeOps.Truncate(ctx, inode, size) } +// Allocate implements fs.InodeOperations.Allocate. +func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error { + // Is the file not memory-mappable? + if !canMap(inode) { + // Then just send the call to the FD, the host will synchronize the metadata + // update with any host inode and page cache. + return i.fileState.Allocate(ctx, offset, length) + } + // Otherwise we need to go through cachingInodeOps, even if the host page + // cache is in use, to invalidate private copies of truncated pages. 
+ return i.cachingInodeOps.Allocate(ctx, offset, length) +} + // WriteOut implements fs.InodeOperations.WriteOut. func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { // Have we been using host kernel metadata caches? diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index d764ef93d..22f316daf 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -340,6 +340,13 @@ func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { return i.InodeOperations.Truncate(ctx, i, size) } +func (i *Inode) Allocate(ctx context.Context, d *Dirent, offset int64, length int64) error { + if i.overlay != nil { + return overlayAllocate(ctx, i.overlay, d, offset, length) + } + return i.InodeOperations.Allocate(ctx, i, offset, length) +} + // Readlink calls i.InodeOperations.Readlnk with i as the Inode. func (i *Inode) Readlink(ctx context.Context) (string, error) { if i.overlay != nil { diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index ac287e1e4..abafe4791 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -223,6 +223,10 @@ type InodeOperations interface { // Implementations need not check that length >= 0. Truncate(ctx context.Context, inode *Inode, size int64) error + // Allocate allows the caller to reserve disk space for the inode. + // It's equivalent to fallocate(2) with 'mode=0'. + Allocate(ctx context.Context, inode *Inode, offset int64, length int64) error + // WriteOut writes cached Inode state to a backing filesystem in a // synchronous manner. 
// diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 3d015328e..ead487097 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -582,6 +582,13 @@ func overlayTruncate(ctx context.Context, o *overlayEntry, d *Dirent, size int64 return o.upper.InodeOperations.Truncate(ctx, o.upper, size) } +func overlayAllocate(ctx context.Context, o *overlayEntry, d *Dirent, offset, length int64) error { + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.InodeOperations.Allocate(ctx, o.upper, offset, length) +} + func overlayReadlink(ctx context.Context, o *overlayEntry) (string, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 66b3da2d0..52ce1d29e 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -422,6 +422,7 @@ type inode struct { fsutil.InodeNoExtendedAttributes `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index cf359a1f1..a71144b2c 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -150,6 +150,11 @@ func (n *MockInodeOperations) Truncate(ctx context.Context, inode *Inode, size i return nil } +// Allocate implements fs.InodeOperations.Allocate. +func (n *MockInodeOperations) Allocate(ctx context.Context, inode *Inode, offset, length int64) error { + return nil +} + // Remove implements fs.InodeOperations.Remove. 
func (n *MockInodeOperations) Remove(context.Context, *Inode, string) error { return nil diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index b03807043..379569823 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -55,6 +55,7 @@ func (i *taskOwnedInodeOps) UnstableAttr(ctx context.Context, inode *fs.Inode) ( type staticFileInodeOps struct { fsutil.InodeDenyWriteChecker `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopTruncate `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 10ea1f55d..6b0ae9e60 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -93,6 +93,7 @@ type SeqFile struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` @@ -183,7 +184,6 @@ func (s *SeqFile) updateSourceLocked(ctx context.Context, record int) { // // +stateify savable type seqFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` @@ -192,6 +192,7 @@ type seqFileOperations struct { fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` seqFile *SeqFile } diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index d649da0f1..5df3cee13 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -38,6 +38,7 @@ type idMapInodeOperations struct { 
fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` @@ -81,7 +82,6 @@ func (imio *idMapInodeOperations) GetFile(ctx context.Context, dirent *fs.Dirent // +stateify savable type idMapFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` @@ -90,6 +90,7 @@ type idMapFileOperations struct { fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` iops *idMapInodeOperations } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index a6b6a5c33..eb98b59cc 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -50,6 +50,7 @@ type CreateOps struct { // +stateify savable type Dir struct { fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeIsDirAllocate `state:"nosave"` fsutil.InodeIsDirTruncate `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index 9406a07ca..a7cb1bb86 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -30,6 +30,7 @@ type Socket struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` @@ -67,7 +68,6 @@ func (s *Socket) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFl // +stateify savable type socketFileOperations struct { - 
waiter.AlwaysReady `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` @@ -78,6 +78,7 @@ type socketFileOperations struct { fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*socketFileOperations)(nil) diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index f7835fe05..dd2585b02 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -29,10 +29,11 @@ type Symlink struct { fsutil.InodeGenericChecker `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` - fsutil.InodeNotTruncatable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotTruncatable `state:"nosave"` fsutil.InodeVirtual `state:"nosave"` fsutil.InodeSimpleAttributes @@ -88,7 +89,6 @@ func (s *Symlink) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileF // +stateify savable type symlinkFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` @@ -99,6 +99,7 @@ type symlinkFileOperations struct { fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*symlinkFileOperations)(nil) diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index db91de435..bacc93af8 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -30,12 +30,13 @@ type cpunum struct { fsutil.InodeNoExtendedAttributes `state:"nosave"` fsutil.InodeNoopRelease `state:"nosave"` 
fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` fsutil.InodeNotDirectory `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` fsutil.InodeSimpleAttributes fsutil.InodeStaticFileGetter diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index f89d86c83..c90062a22 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -259,6 +259,33 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in return nil } +// Allocate implements fs.InodeOperations.Allocate. +func (f *fileInodeOperations) Allocate(ctx context.Context, _ *fs.Inode, offset, length int64) error { + newSize := offset + length + + f.attrMu.Lock() + defer f.attrMu.Unlock() + f.dataMu.Lock() + defer f.dataMu.Unlock() + + if newSize <= f.attr.Size { + return nil + } + + // Check if current seals allow growth. + if f.seals&linux.F_SEAL_GROW != 0 { + return syserror.EPERM + } + + f.attr.Size = newSize + + now := ktime.NowFromContext(ctx) + f.attr.ModificationTime = now + f.attr.StatusChangeTime = now + + return nil +} + // AddLink implements fs.InodeOperations.AddLink. func (f *fileInodeOperations) AddLink() { f.attrMu.Lock() diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 832914453..6ad5c5adb 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -242,11 +242,16 @@ func (d *Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, n return rename(ctx, oldParent, oldName, newParent, newName, replacement) } -// StatFS implments fs.InodeOperations.StatFS. +// StatFS implements fs.InodeOperations.StatFS. 
func (*Dir) StatFS(context.Context) (fs.Info, error) { return fsInfo, nil } +// Allocate implements fs.InodeOperations.Allocate. +func (d *Dir) Allocate(ctx context.Context, node *fs.Inode, offset, length int64) error { + return d.ramfsDir.Allocate(ctx, node, offset, length) +} + // Symlink is a symlink. // // +stateify savable @@ -281,6 +286,7 @@ func (s *Symlink) StatFS(context.Context) (fs.Info, error) { type Socket struct { ramfs.Socket fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotAllocatable `state:"nosave"` } // NewSocket returns a new socket with the provided permissions. diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 0fc777e67..8dc40e1f2 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -53,13 +53,14 @@ import ( // +stateify savable type dirInodeOperations struct { fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeIsDirAllocate `state:"nosave"` + fsutil.InodeIsDirTruncate `state:"nosave"` fsutil.InodeNoExtendedAttributes `state:"nosave"` fsutil.InodeNoopWriteOut `state:"nosave"` fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotRenameable `state:"nosave"` - fsutil.InodeNotSymlink `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` - fsutil.InodeNotTruncatable `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` fsutil.InodeVirtual `state:"nosave"` fsutil.InodeSimpleAttributes diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 99188dddf..7c3739360 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -191,3 +191,7 @@ func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) { *wakeupChan = nil } } + +func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EPIPE +} diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 893322647..1764bb4b6 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ 
b/pkg/sentry/syscalls/linux/sys_file.go @@ -1900,9 +1900,9 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } // Fallocate implements linux system call fallocate(2). -// (well, not really, but at least we return the expected error codes) func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := kdefs.FD(args[0].Int()) + mode := args[1].Int64() offset := args[2].Int64() length := args[3].Int64() @@ -1915,8 +1915,42 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL } + if mode != 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOTSUP + } + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + if fs.IsPipe(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ESPIPE + } + if fs.IsDir(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EISDIR + } + if !fs.IsRegular(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ENODEV + } + size := offset + length + if size < 0 { + return 0, nil, syserror.EFBIG + } + if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(syscall.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil { + return 0, nil, err + } + + // File length modified, generate notification. + file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) - return 0, nil, syserror.EOPNOTSUPP + return 0, nil, nil } // Flock implements linux syscall flock(2). diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index c1b33c551..c369e4d64 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -99,7 +99,7 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { // args: cmd, ... 
tr = newArgsTracker(0) - case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX: + case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX, syscall.SYS_FALLOCATE: // args: fd/addr, cmd, ... tr = newArgsTracker(1) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index a1ad49fb2..4faab2946 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -62,8 +62,14 @@ var allowedSyscalls = seccomp.SyscallRules{ }, syscall.SYS_EXIT: {}, syscall.SYS_EXIT_GROUP: {}, - syscall.SYS_FCHMOD: {}, - syscall.SYS_FCHOWNAT: {}, + syscall.SYS_FALLOCATE: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCHOWNAT: {}, syscall.SYS_FCNTL: []seccomp.Rule{ { seccomp.AllowAny{}, diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 3a0806837..b185015b6 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -731,6 +731,18 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { return err } +// Allocate implements p9.File. +func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error { + if !l.isOpen() { + return syscall.EBADF + } + + if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil { + return extractErrno(err) + } + return nil +} + // Rename implements p9.File; this should never be called. 
func (l *localFile) Rename(p9.File, string) error { panic("rename called directly") diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index d99733fc9..7ff4e4883 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -649,6 +649,8 @@ cc_binary( srcs = ["fallocate.cc"], linkstatic = 1, deps = [ + ":file_base", + "//test/util:cleanup", "//test/util:file_descriptor", "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc index 61b8acc7a..1c3d00287 100644 --- a/test/syscalls/linux/fallocate.cc +++ b/test/syscalls/linux/fallocate.cc @@ -12,45 +12,130 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include +#include +#include +#include +#include +#include #include #include "gtest/gtest.h" +#include "test/syscalls/linux/file_base.h" +#include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" namespace gvisor { namespace testing { - namespace { -// These tests are very rudimentary because fallocate is not -// implemented. We just want to make sure the expected error codes are -// returned. +int fallocate(int fd, int mode, off_t offset, off_t len) { + return syscall(__NR_fallocate, fd, mode, offset, len); +} + +class AllocateTest : public FileTest { + void SetUp() override { FileTest::SetUp(); } +}; + +TEST_F(AllocateTest, Fallocate) { + // Check that it starts at size zero. + struct stat buf; + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 0); + + // Grow to ten bytes. 
+ EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 10); -TEST(FallocateTest, NotImplemented) { - auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); + // Allocate to a smaller size should be noop. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 10); - // Test that a completely unassigned fallocate mode returns EOPNOTSUPP. - ASSERT_THAT(fallocate(fd.get(), 0x80, 0, 32768), - SyscallFailsWithErrno(EOPNOTSUPP)); + // Grow again. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 20); + + // Grow with offset. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 30); + + // Grow with offset beyond EOF. + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds()); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 40); } -TEST(FallocateTest, BadOffset) { - auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); - ASSERT_THAT(fallocate(fd.get(), 0, -1, 32768), SyscallFailsWithErrno(EINVAL)); +TEST_F(AllocateTest, FallocateInvalid) { + // Invalid FD + EXPECT_THAT(fallocate(-1, 0, 0, 10), SyscallFailsWithErrno(EBADF)); + + // Negative offset and size. 
+ EXPECT_THAT(fallocate(test_file_fd_.get(), 0, -1, 10), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, -1), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, -1, -1), + SyscallFailsWithErrno(EINVAL)); } -TEST(FallocateTest, BadLength) { - auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_path.path(), O_RDWR)); - ASSERT_THAT(fallocate(fd.get(), 0, 0, -1), SyscallFailsWithErrno(EINVAL)); +TEST_F(AllocateTest, FallocateReadonly) { + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + EXPECT_THAT(fallocate(fd.get(), 0, 0, 10), SyscallFailsWithErrno(EBADF)); } -} // namespace +TEST_F(AllocateTest, FallocatePipe) { + int pipes[2]; + EXPECT_THAT(pipe(pipes), SyscallSucceeds()); + auto cleanup = Cleanup([&pipes] { + EXPECT_THAT(close(pipes[0]), SyscallSucceeds()); + EXPECT_THAT(close(pipes[1]), SyscallSucceeds()); + }); + + EXPECT_THAT(fallocate(pipes[1], 0, 0, 10), SyscallFailsWithErrno(ESPIPE)); +} + +TEST_F(AllocateTest, FallocateChar) { + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDWR)); + EXPECT_THAT(fallocate(fd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); +} + +TEST_F(AllocateTest, FallocateRlimit) { + // Get the current rlimit and restore after test run. + struct rlimit initial_lim; + ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + auto cleanup = Cleanup([&initial_lim] { + EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds()); + }); + + // Try growing past the file size limit. 
+ sigset_t new_mask; + sigemptyset(&new_mask); + sigaddset(&new_mask, SIGXFSZ); + sigprocmask(SIG_BLOCK, &new_mask, nullptr); + struct rlimit setlim = {}; + setlim.rlim_cur = 1024; + setlim.rlim_max = RLIM_INFINITY; + ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds()); + + EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 1025), + SyscallFailsWithErrno(EFBIG)); + + struct timespec timelimit = {}; + timelimit.tv_sec = 10; + EXPECT_EQ(sigtimedwait(&new_mask, nullptr, &timelimit), SIGXFSZ); + ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds()); +} + +} // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 5ee8218483ce172400c21780d5dbc1ec2ba54d63 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 10 May 2019 13:36:56 -0700 Subject: Add pgalloc.DelayedEvictionManual. PiperOrigin-RevId: 247667272 Change-Id: I16b04e11bb93f50b7e05e888992303f730e4a877 --- pkg/sentry/kernel/kernel.go | 5 +++-- pkg/sentry/pgalloc/pgalloc.go | 21 ++++++++++++++++++++- pkg/sentry/pgalloc/save_restore.go | 11 +---------- 3 files changed, 24 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 91889b573..85d73ace2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -304,10 +304,11 @@ func (k *Kernel) SaveTo(w io.Writer) error { defer k.resumeTimeLocked() // Evict all evictable MemoryFile allocations. - k.mf.FlushEvictions() + k.mf.StartEvictions() + k.mf.WaitForEvictions() // Flush write operations on open files so data reaches backing storage. - // This must come after k.mf.FlushEvictions() since eviction may cause file + // This must come after MemoryFile eviction since eviction may cause file // writes. 
if err := k.tasks.flushWritesToFiles(ctx); err != nil { return err diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 9c1313f6f..2b9924ad7 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -190,6 +190,11 @@ const ( // reclaimer goroutine is out of work (pages to reclaim), then evicts all // pending evictable allocations immediately. DelayedEvictionEnabled + + // DelayedEvictionManual requires that evictable allocations are only + // evicted when MemoryFile.StartEvictions() is called. This is extremely + // dangerous outside of tests. + DelayedEvictionManual ) // usageInfo tracks usage information. @@ -264,7 +269,7 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { switch opts.DelayedEviction { case DelayedEvictionDefault: opts.DelayedEviction = DelayedEvictionEnabled - case DelayedEvictionDisabled, DelayedEvictionEnabled: + case DelayedEvictionDisabled, DelayedEvictionEnabled, DelayedEvictionManual: default: return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) } @@ -1075,6 +1080,14 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) { } } +// StartEvictions requests that f evict all evictable allocations. It does not +// wait for eviction to complete; for this, see MemoryFile.WaitForEvictions. +func (f *MemoryFile) StartEvictions() { + f.mu.Lock() + defer f.mu.Unlock() + f.startEvictionsLocked() +} + // Preconditions: f.mu must be locked. func (f *MemoryFile) startEvictionsLocked() { for user, info := range f.evictable { @@ -1122,6 +1135,12 @@ func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info }() } +// WaitForEvictions blocks until f is no longer evicting any evictable +// allocations. 
+func (f *MemoryFile) WaitForEvictions() { + f.evictionWG.Wait() +} + type usageSetFunctions struct{} func (usageSetFunctions) MinKey() uint64 { diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index 9534d1aed..d4ba384b1 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -28,15 +28,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/state" ) -// FlushEvictions blocks until f has finished evicting all evictable -// allocations. -func (f *MemoryFile) FlushEvictions() { - f.mu.Lock() - f.startEvictionsLocked() - f.mu.Unlock() - f.evictionWG.Wait() -} - // SaveTo writes f's state to the given stream. func (f *MemoryFile) SaveTo(w io.Writer) error { // Wait for reclaim. @@ -51,7 +42,7 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { // Ensure that there are no pending evictions. if len(f.evictable) != 0 { - panic(fmt.Sprintf("evictions still pending for %d users; call FlushEvictions before SaveTo", len(f.evictable))) + panic(fmt.Sprintf("evictions still pending for %d users; call StartEvictions and WaitForEvictions before SaveTo", len(f.evictable))) } // Ensure that all pages that contain data have knownCommitted set, since -- cgit v1.2.3 From 330a1bbd04815b846e9396e9f5763afe6350b537 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 14 May 2019 18:04:28 -0700 Subject: Remove false comment PiperOrigin-RevId: 248249285 Change-Id: I9b6d267baa666798b22def590ff20c9a118efd47 --- pkg/sentry/fs/gofer/session.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 4cb65e7c6..085a358fe 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -101,7 +101,7 @@ type session struct { // version is the value of the version mount option, see fs/gofer/fs.go. version string `state:"wait"` - // cachePolicy is the cache policy. It may be either cacheAll or cacheNone. 
+ // cachePolicy is the cache policy. cachePolicy cachePolicy `state:"wait"` // aname is the value of the aname mount option, see fs/gofer/fs.go. -- cgit v1.2.3 From dd153c014de69968dac96629e457ee17944e410e Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 14 May 2019 20:33:44 -0700 Subject: Start of support for /proc/pid/cgroup file. PiperOrigin-RevId: 248263378 Change-Id: Ic057d2bb0b6212110f43ac4df3f0ac9bf931ab98 --- pkg/sentry/fs/proc/BUILD | 1 + pkg/sentry/fs/proc/cgroup.go | 41 +++++++++++++++++++++++++++++++++++++++++ pkg/sentry/fs/proc/fs.go | 9 +++++++-- pkg/sentry/fs/proc/proc.go | 16 +++++++++++----- pkg/sentry/fs/proc/task.go | 34 ++++++++++++++++++---------------- 5 files changed, 78 insertions(+), 23 deletions(-) create mode 100644 pkg/sentry/fs/proc/cgroup.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 3aa70a28e..d19c360e0 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -5,6 +5,7 @@ load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_library( name = "proc", srcs = [ + "cgroup.go", "cpuinfo.go", "exec_args.go", "fds.go", diff --git a/pkg/sentry/fs/proc/cgroup.go b/pkg/sentry/fs/proc/cgroup.go new file mode 100644 index 000000000..1019f862a --- /dev/null +++ b/pkg/sentry/fs/proc/cgroup.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package proc + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +func newCGroupInode(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string]string) *fs.Inode { + // From man 7 cgroups: "For each cgroup hierarchy of which the process + // is a member, there is one entry containing three colon-separated + // fields: hierarchy-ID:controller-list:cgroup-path" + + // The hierarchy ids must be positive integers (for cgroup v1), but the + // exact number does not matter, so long as they are unique. We can + // just use a counter, but since linux sorts this file in descending + // order, we must count down to perserve this behavior. + i := len(cgroupControllers) + var data string + for name, dir := range cgroupControllers { + data += fmt.Sprintf("%d:%s:%s\n", i, name, dir) + i-- + } + + return newStaticProcInode(ctx, msrc, []byte(data)) +} diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 7c5f8484a..d57d6cc5d 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -57,7 +57,7 @@ func (*filesystem) Flags() fs.FilesystemFlags { } // Mount returns the root of a procfs that can be positioned in the vfs. -func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, _ interface{}) (*fs.Inode, error) { +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, cgroupsInt interface{}) (*fs.Inode, error) { // device is always ignored. // Parse generic comma-separated key=value options, this file system expects them. @@ -70,7 +70,12 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou return nil, fmt.Errorf("unsupported mount options: %v", options) } + var cgroups map[string]string + if cgroupsInt != nil { + cgroups = cgroupsInt.(map[string]string) + } + // Construct the procfs root. 
Since procfs files are all virtual, we // never want them cached. - return New(ctx, fs.NewNonCachingMountSource(f, flags)) + return New(ctx, fs.NewNonCachingMountSource(f, flags), cgroups) } diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 196fa5128..0e15894b4 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -43,10 +43,15 @@ type proc struct { // pidns is the PID namespace of the task that mounted the proc filesystem // that this node represents. pidns *kernel.PIDNamespace + + // cgroupControllers is a map of controller name to directory in the + // cgroup hierarchy. These controllers are immutable and will be listed + // in /proc/pid/cgroup if not nil. + cgroupControllers map[string]string } // New returns the root node of a partial simple procfs. -func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { +func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string]string) (*fs.Inode, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, fmt.Errorf("procfs requires a kernel") @@ -73,9 +78,10 @@ func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { // Construct the proc InodeOperations. p := &proc{ - Dir: *ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), - k: k, - pidns: pidns, + Dir: *ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), + k: k, + pidns: pidns, + cgroupControllers: cgroupControllers, } // Add more contents that need proc to be initialized. @@ -178,7 +184,7 @@ func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dire } // Wrap it in a taskDir. 
- td := newTaskDir(otherTask, dir.MountSource, p.pidns, true) + td := p.newTaskDir(otherTask, dir.MountSource, true) return fs.NewDirent(td, name), nil } diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 0f400e80f..66d76d194 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -67,7 +67,7 @@ type taskDir struct { var _ fs.InodeOperations = (*taskDir)(nil) // newTaskDir creates a new proc task entry. -func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace, showSubtasks bool) *fs.Inode { +func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool) *fs.Inode { contents := map[string]*fs.Inode{ "auxv": newAuxvec(t, msrc), "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), @@ -84,20 +84,22 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), "ns": newNamespaceDir(t, msrc), "smaps": newSmaps(t, msrc), - "stat": newTaskStat(t, msrc, showSubtasks, pidns), + "stat": newTaskStat(t, msrc, showSubtasks, p.pidns), "statm": newStatm(t, msrc), - "status": newStatus(t, msrc, pidns), + "status": newStatus(t, msrc, p.pidns), "uid_map": newUIDMap(t, msrc), } if showSubtasks { - contents["task"] = newSubtasks(t, msrc, pidns) + contents["task"] = p.newSubtasks(t, msrc) + } + if len(p.cgroupControllers) > 0 { + contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) } // TODO(b/31916171): Set EUID/EGID based on dumpability. 
d := &taskDir{ - Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), - t: t, - pidns: pidns, + Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), + t: t, } return newProcInode(d, msrc, fs.SpecialDirectory, t) } @@ -108,17 +110,17 @@ func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace type subtasks struct { ramfs.Dir - t *kernel.Task - pidns *kernel.PIDNamespace + t *kernel.Task + p *proc } var _ fs.InodeOperations = (*subtasks)(nil) -func newSubtasks(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { +func (p *proc) newSubtasks(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { s := &subtasks{ - Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)), - t: t, - pidns: pidns, + Dir: *ramfs.NewDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)), + t: t, + p: p, } return newProcInode(s, msrc, fs.SpecialDirectory, t) } @@ -137,7 +139,7 @@ func (s *subtasks) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.Unstab // GetFile implements fs.InodeOperations.GetFile. 
func (s *subtasks) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fs.NewFile(ctx, dirent, flags, &subtasksFile{t: s.t, pidns: s.pidns}), nil + return fs.NewFile(ctx, dirent, flags, &subtasksFile{t: s.t, pidns: s.p.pidns}), nil } // +stateify savable @@ -212,7 +214,7 @@ func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dir return nil, syserror.ENOENT } - task := s.pidns.TaskWithID(kernel.ThreadID(tid)) + task := s.p.pidns.TaskWithID(kernel.ThreadID(tid)) if task == nil { return nil, syserror.ENOENT } @@ -220,7 +222,7 @@ func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dir return nil, syserror.ENOENT } - td := newTaskDir(task, dir.MountSource, s.pidns, false) + td := s.p.newTaskDir(task, dir.MountSource, false) return fs.NewDirent(td, p), nil } -- cgit v1.2.3 From 2105158d4bee4fb36658dba32eb8104cf4e96467 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 15 May 2019 17:20:14 -0700 Subject: gofer: don't call hostfile.Close if hostFile is nil PiperOrigin-RevId: 248437159 Change-Id: Ife71f6ca032fca59ec97a82961000ed0af257101 --- pkg/sentry/fs/gofer/path.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index aa3d3aaa6..875de8b5f 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -121,13 +121,17 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string qids, unopened, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { newFile.close(ctx) - hostFile.Close() + if hostFile != nil { + hostFile.Close() + } return nil, err } if len(qids) != 1 { log.Warningf("WalkGetAttr(%s) succeeded, but returned %d QIDs (%v), wanted 1", name, len(qids), qids) newFile.close(ctx) - hostFile.Close() + if hostFile != nil { + hostFile.Close() + } unopened.close(ctx) return nil, syserror.EIO } -- 
cgit v1.2.3 From 04105781ad558662e1e48bad17197df244ff7841 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 17 May 2019 13:04:44 -0700 Subject: Fix gofer rename ctime and cleanup stat_times test There is a lot of redundancy that we can simplify in the stat_times test. This will make it easier to add new tests. However, the simplification reveals that cached uattrs on goferfs don't properly update ctime on rename. PiperOrigin-RevId: 248773425 Change-Id: I52662728e1e9920981555881f9a85f9ce04041cf --- pkg/sentry/fs/fsutil/inode.go | 4 +- pkg/sentry/fs/fsutil/inode_cached.go | 8 + pkg/sentry/fs/gofer/path.go | 7 +- pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_operations.go | 5 +- pkg/sentry/fs/inode_overlay.go | 2 +- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tmpfs/tmpfs.go | 8 +- test/syscalls/linux/stat_times.cc | 319 ++++++++++++++++++++--------------- 12 files changed, 213 insertions(+), 150 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 151be1d0d..5e1bfeb58 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -341,7 +341,7 @@ func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) err } // Rename implements fs.FileOperations.Rename. -func (InodeNotDirectory) Rename(context.Context, *fs.Inode, string, *fs.Inode, string, bool) error { +func (InodeNotDirectory) Rename(context.Context, *fs.Inode, *fs.Inode, string, *fs.Inode, string, bool) error { return syserror.EINVAL } @@ -381,7 +381,7 @@ func (InodeNoopTruncate) Truncate(context.Context, *fs.Inode, int64) error { type InodeNotRenameable struct{} // Rename implements fs.InodeOperations.Rename. 
-func (InodeNotRenameable) Rename(context.Context, *fs.Inode, string, *fs.Inode, string, bool) error { +func (InodeNotRenameable) Rename(context.Context, *fs.Inode, *fs.Inode, string, *fs.Inode, string, bool) error { return syserror.EINVAL } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 03cad37f3..bc0b8c744 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -451,6 +451,14 @@ func (c *CachingInodeOperations) touchModificationTimeLocked(now time.Time) { c.dirtyAttr.StatusChangeTime = true } +// TouchStatusChangeTime updates the cached status change time in-place to the +// current time. +func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) { + c.attrMu.Lock() + c.touchStatusChangeTimeLocked(ktime.NowFromContext(ctx)) + c.attrMu.Unlock() +} + // touchStatusChangeTimeLocked updates the cached status change time // in-place to the current time. // diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 875de8b5f..babfa4560 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -344,7 +344,7 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na } // Rename renames this node. -func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { +func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { if len(newName) > maxFilenameLen { return syserror.ENAMETOOLONG } @@ -389,6 +389,11 @@ func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldNa newParentInodeOperations.markDirectoryDirty() } } + + // Rename always updates ctime. 
+ if i.session().cachePolicy.cacheUAttrs(inode) { + i.cachingInodeOps.TouchStatusChangeTime(ctx) + } return nil } diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index d36ac9a87..ebf2154bc 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -301,7 +301,7 @@ func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, na } // Rename implements fs.InodeOperations.Rename. -func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { +func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { op, ok := oldParent.InodeOperations.(*inodeOperations) if !ok { return syscall.EXDEV diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 22f316daf..aef1a1cb9 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -216,7 +216,7 @@ func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, if i.overlay != nil { return overlayRename(ctx, i.overlay, oldParent, renamed, newParent, newName, replacement) } - return i.InodeOperations.Rename(ctx, oldParent.Inode, renamed.name, newParent.Inode, newName, replacement) + return i.InodeOperations.Rename(ctx, renamed.Inode, oldParent.Inode, renamed.name, newParent.Inode, newName, replacement) } // Bind calls i.InodeOperations.Bind with i as the directory. diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index abafe4791..3211f1817 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -131,14 +131,15 @@ type InodeOperations interface { RemoveDirectory(ctx context.Context, dir *Inode, name string) error // Rename atomically renames oldName under oldParent to newName under - // newParent where oldParent and newParent are directories. 
+ // newParent where oldParent and newParent are directories. inode is + // the Inode of this InodeOperations. // // If replacement is true, then newName already exists and this call // will replace it with oldName. // // Implementations are responsible for rejecting renames that replace // non-empty directories. - Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string, replacement bool) error + Rename(ctx context.Context, inode *Inode, oldParent *Inode, oldName string, newParent *Inode, newName string, replacement bool) error // Bind binds a new socket under dir at the given name. // diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index ead487097..ea574224f 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -389,7 +389,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return err } oldName := renamed.name - if err := o.upper.InodeOperations.Rename(ctx, oldParent.Inode.overlay.upper, oldName, newParent.Inode.overlay.upper, newName, replacement); err != nil { + if err := o.upper.InodeOperations.Rename(ctx, renamed.Inode.overlay.upper, oldParent.Inode.overlay.upper, oldName, newParent.Inode.overlay.upper, newName, replacement); err != nil { return err } if renamed.Inode.overlay.lowerExists { diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index a71144b2c..064943c5b 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -132,7 +132,7 @@ func (n *MockInodeOperations) CreateDirectory(context.Context, *Inode, string, F } // Rename implements fs.InodeOperations.Rename. 
-func (n *MockInodeOperations) Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string, replacement bool) error { +func (n *MockInodeOperations) Rename(ctx context.Context, inode *Inode, oldParent *Inode, oldName string, newParent *Inode, newName string, replacement bool) error { n.renameCalled = true return nil } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index eb98b59cc..c97ad26f5 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -401,7 +401,7 @@ func (d *Dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags } // Rename implements fs.InodeOperations.Rename. -func (*Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { +func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName, replacement) } diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index c90062a22..3fe659543 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -149,7 +149,7 @@ func (f *fileInodeOperations) Mappable(*fs.Inode) memmap.Mappable { } // Rename implements fs.InodeOperations.Rename. 
-func (*fileInodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { +func (*fileInodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { return rename(ctx, oldParent, oldName, newParent, newName, replacement) } diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 6ad5c5adb..263d10cfe 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -238,7 +238,7 @@ func (d *Dir) newCreateOps() *ramfs.CreateOps { } // Rename implements fs.InodeOperations.Rename. -func (d *Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { +func (d *Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { return rename(ctx, oldParent, oldName, newParent, newName, replacement) } @@ -271,7 +271,7 @@ func NewSymlink(ctx context.Context, target string, owner fs.FileOwner, msrc *fs } // Rename implements fs.InodeOperations.Rename. -func (s *Symlink) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { +func (s *Symlink) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { return rename(ctx, oldParent, oldName, newParent, newName, replacement) } @@ -301,7 +301,7 @@ func NewSocket(ctx context.Context, socket transport.BoundEndpoint, owner fs.Fil } // Rename implements fs.InodeOperations.Rename. 
-func (s *Socket) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { +func (s *Socket) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { return rename(ctx, oldParent, oldName, newParent, newName, replacement) } @@ -338,7 +338,7 @@ func NewFifo(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, } // Rename implements fs.InodeOperations.Rename. -func (f *Fifo) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { +func (f *Fifo) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { return rename(ctx, oldParent, oldName, newParent, newName, replacement) } diff --git a/test/syscalls/linux/stat_times.cc b/test/syscalls/linux/stat_times.cc index 9b53739a0..195c87ca5 100644 --- a/test/syscalls/linux/stat_times.cc +++ b/test/syscalls/linux/stat_times.cc @@ -33,23 +33,88 @@ namespace { using ::testing::IsEmpty; using ::testing::Not; -class StatTimesTest : public ::testing::Test { - protected: - std::tuple GetTime(const TempPath& file) { - struct stat statbuf = {}; - EXPECT_THAT(stat(file.path().c_str(), &statbuf), SyscallSucceeds()); - - const auto atime = absl::TimeFromTimespec(statbuf.st_atim); - const auto mtime = absl::TimeFromTimespec(statbuf.st_mtim); - const auto ctime = absl::TimeFromTimespec(statbuf.st_ctim); - return std::make_tuple(atime, mtime, ctime); - } +std::tuple GetTime(const TempPath& file) { + struct stat statbuf = {}; + EXPECT_THAT(stat(file.path().c_str(), &statbuf), SyscallSucceeds()); + + const auto atime = absl::TimeFromTimespec(statbuf.st_atim); + const auto mtime = absl::TimeFromTimespec(statbuf.st_mtim); + const auto ctime = absl::TimeFromTimespec(statbuf.st_ctim); + return std::make_tuple(atime, 
mtime, ctime); +} + +enum class AtimeEffect { + Unchanged, + Changed, +}; + +enum class MtimeEffect { + Unchanged, + Changed, }; -TEST_F(StatTimesTest, FileCreationTimes) { +enum class CtimeEffect { + Unchanged, + Changed, +}; + +// Tests that fn modifies the atime/mtime/ctime of path as specified. +void CheckTimes(const TempPath& path, std::function fn, + AtimeEffect atime_effect, MtimeEffect mtime_effect, + CtimeEffect ctime_effect) { + absl::Time atime, mtime, ctime; + std::tie(atime, mtime, ctime) = GetTime(path); + + // FIXME(b/132819225): gVisor filesystem timestamps inconsistently use the + // internal or host clock, which may diverge slightly. Allow some slack on + // times to account for the difference. + // + // Here we sleep for 1s so that initial creation of path doesn't fall within + // the before slack window. + absl::SleepFor(absl::Seconds(1)); + + const absl::Time before = absl::Now() - absl::Seconds(1); + + // Perform the op. + fn(); + + const absl::Time after = absl::Now() + absl::Seconds(1); + + absl::Time atime2, mtime2, ctime2; + std::tie(atime2, mtime2, ctime2) = GetTime(path); + + if (atime_effect == AtimeEffect::Changed) { + EXPECT_LE(before, atime2); + EXPECT_GE(after, atime2); + EXPECT_GT(atime2, atime); + } else { + EXPECT_EQ(atime2, atime); + } + + if (mtime_effect == MtimeEffect::Changed) { + EXPECT_LE(before, mtime2); + EXPECT_GE(after, mtime2); + EXPECT_GT(mtime2, mtime); + } else { + EXPECT_EQ(mtime2, mtime); + } + + if (ctime_effect == CtimeEffect::Changed) { + EXPECT_LE(before, ctime2); + EXPECT_GE(after, ctime2); + EXPECT_GT(ctime2, ctime); + } else { + EXPECT_EQ(ctime2, ctime); + } +} + +// File creation time is reflected in atime, mtime, and ctime. +TEST(StatTimesTest, FileCreation) { const DisableSave ds; // Timing-related test. // Get a time for when the file is created. + // + // FIXME(b/132819225): See above. 
const absl::Time before = absl::Now() - absl::Seconds(1); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const absl::Time after = absl::Now() + absl::Seconds(1); @@ -65,153 +130,137 @@ TEST_F(StatTimesTest, FileCreationTimes) { EXPECT_GE(after, ctime); } -TEST_F(StatTimesTest, FileCtimeChanges) { - auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - - MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. - - absl::Time atime, mtime, ctime; - std::tie(atime, mtime, ctime) = GetTime(file); - - absl::SleepFor(absl::Seconds(1)); - - // Chmod should only change ctime. - EXPECT_THAT(chmod(file.path().c_str(), 0666), SyscallSucceeds()); +// Calling chmod on a file changes ctime. +TEST(StatTimesTest, FileChmod) { + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - absl::Time atime2, mtime2, ctime2; - std::tie(atime2, mtime2, ctime2) = GetTime(file); - EXPECT_EQ(atime2, atime); - EXPECT_EQ(mtime2, mtime); - EXPECT_GT(ctime2, ctime); + auto fn = [&] { + EXPECT_THAT(chmod(file.path().c_str(), 0666), SyscallSucceeds()); + }; + CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Unchanged, + CtimeEffect::Changed); +} - absl::SleepFor(absl::Seconds(1)); +// Renaming a file changes ctime. +TEST(StatTimesTest, FileRename) { + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); - // Rename should only change ctime. 
- const auto newpath = NewTempAbsPath(); - EXPECT_THAT(rename(file.path().c_str(), newpath.c_str()), SyscallSucceeds()); - file.reset(newpath); + const std::string newpath = NewTempAbsPath(); - std::tie(atime, mtime, ctime) = GetTime(file); - EXPECT_EQ(atime, atime2); - EXPECT_EQ(mtime, mtime2); - EXPECT_GT(ctime, ctime2); + auto fn = [&] { + ASSERT_THAT(rename(file.release().c_str(), newpath.c_str()), + SyscallSucceeds()); + file.reset(newpath); + }; + CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Unchanged, + CtimeEffect::Changed); +} - absl::SleepFor(absl::Seconds(1)); +// Renaming a file changes ctime, even with an open FD. +// +// NOTE(b/132732387): This is a regression test for fs/gofer failing to update +// cached ctime. +TEST(StatTimesTest, FileRenameOpenFD) { + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Holding an FD shouldn't affect behavior. + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY)); + + const std::string newpath = NewTempAbsPath(); + + // FIXME(b/132814682): Restore fails with an uncached gofer and an open FD + // across rename. + // + // N.B. The logic here looks backwards because it isn't possible to + // conditionally disable save, only conditionally re-enable it. + DisableSave ds; + if (!getenv("GVISOR_GOFER_UNCACHED")) { + ds.reset(); + } - // Utimes should only change ctime and the time that we ask to change (atime - // to now in this case). 
- const absl::Time before = absl::Now() - absl::Seconds(1); - const struct timespec ts[2] = {{0, UTIME_NOW}, {0, UTIME_OMIT}}; - ASSERT_THAT(utimensat(AT_FDCWD, file.path().c_str(), ts, 0), - SyscallSucceeds()); - const absl::Time after = absl::Now() + absl::Seconds(1); + auto fn = [&] { + ASSERT_THAT(rename(file.release().c_str(), newpath.c_str()), + SyscallSucceeds()); + file.reset(newpath); + }; + CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Unchanged, + CtimeEffect::Changed); +} - std::tie(atime2, mtime2, ctime2) = GetTime(file); - EXPECT_LE(before, atime2); - EXPECT_GE(after, atime2); - EXPECT_EQ(mtime2, mtime); - EXPECT_GT(ctime2, ctime); +// Calling utimes on a file changes ctime and the time that we ask to change +// (atime to now in this case). +TEST(StatTimesTest, FileUtimes) { + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + auto fn = [&] { + const struct timespec ts[2] = {{0, UTIME_NOW}, {0, UTIME_OMIT}}; + ASSERT_THAT(utimensat(AT_FDCWD, file.path().c_str(), ts, 0), + SyscallSucceeds()); + }; + CheckTimes(file, fn, AtimeEffect::Changed, MtimeEffect::Unchanged, + CtimeEffect::Changed); } -TEST_F(StatTimesTest, FileMtimeChanges) { - const auto file = ASSERT_NO_ERRNO_AND_VALUE( +// Truncating a file changes mtime and ctime. +TEST(StatTimesTest, FileTruncate) { + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "yaaass", 0666)); - absl::Time atime, mtime, ctime; - std::tie(atime, mtime, ctime) = GetTime(file); - - absl::SleepFor(absl::Seconds(1)); - - // Truncate should only change mtime and ctime. 
- EXPECT_THAT(truncate(file.path().c_str(), 0), SyscallSucceeds()); - - absl::Time atime2, mtime2, ctime2; - std::tie(atime2, mtime2, ctime2) = GetTime(file); - EXPECT_EQ(atime2, atime); - EXPECT_GT(mtime2, mtime); - EXPECT_GT(ctime2, ctime); + auto fn = [&] { + EXPECT_THAT(truncate(file.path().c_str(), 0), SyscallSucceeds()); + }; + CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Changed, + CtimeEffect::Changed); +} - absl::SleepFor(absl::Seconds(1)); +// Writing a file changes mtime and ctime. +TEST(StatTimesTest, FileWrite) { + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "yaaass", 0666)); - // Write should only change mtime and ctime. - const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0)); - const std::string contents = "all the single dollars"; - EXPECT_THAT(write(fd.get(), contents.data(), contents.size()), - SyscallSucceeds()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0)); - std::tie(atime, mtime, ctime) = GetTime(file); - EXPECT_EQ(atime, atime2); - EXPECT_GT(mtime, mtime2); - EXPECT_GT(ctime, ctime2); + auto fn = [&] { + const std::string contents = "all the single dollars"; + EXPECT_THAT(WriteFd(fd.get(), contents.data(), contents.size()), + SyscallSucceeds()); + }; + CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Changed, + CtimeEffect::Changed); } -TEST_F(StatTimesTest, FileAtimeChanges) { +// Reading a file changes atime. +TEST(StatTimesTest, FileRead) { const std::string contents = "bills bills bills"; - const auto file = ASSERT_NO_ERRNO_AND_VALUE( + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), contents, 0666)); - MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. 
- - absl::Time atime, mtime, ctime; - std::tie(atime, mtime, ctime) = GetTime(file); - - absl::SleepFor(absl::Seconds(1)); - - const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY, 0)); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY, 0)); - // Read should only change atime. - char buf[20]; - const absl::Time before = absl::Now() - absl::Seconds(1); - int read_result; - ASSERT_THAT(read_result = read(fd.get(), buf, sizeof(buf)), - SyscallSucceeds()); - const absl::Time after = absl::Now() + absl::Seconds(1); - - EXPECT_EQ(std::string(buf, read_result), contents); - - absl::Time atime2, mtime2, ctime2; - std::tie(atime2, mtime2, ctime2) = GetTime(file); - - EXPECT_LE(before, atime2); - EXPECT_GE(after, atime2); - EXPECT_GT(atime2, atime); - EXPECT_EQ(mtime2, mtime); - EXPECT_EQ(ctime2, ctime); + auto fn = [&] { + char buf[20]; + ASSERT_THAT(ReadFd(fd.get(), buf, sizeof(buf)), + SyscallSucceedsWithValue(contents.size())); + }; + CheckTimes(file, fn, AtimeEffect::Changed, MtimeEffect::Unchanged, + CtimeEffect::Unchanged); } -TEST_F(StatTimesTest, DirAtimeChanges) { - const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - const auto file = +// Listing files in a directory changes atime. +TEST(StatTimesTest, DirList) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); - MaybeSave(); // FIXME(b/69865927): ctime is inconsistent. - - absl::Time atime, mtime, ctime; - std::tie(atime, mtime, ctime) = GetTime(dir); - - absl::SleepFor(absl::Seconds(1)); - - const absl::Time before = absl::Now() - absl::Seconds(1); - - // NOTE(b/37756234): Keep an fd open. This ensures that the inode backing the - // directory won't be destroyed before the final GetTime to avoid writing out - // timestamps and causing side effects. 
- const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0)); - - // Listing the directory contents should only change atime. - auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(dir.path(), false)); - EXPECT_THAT(contents, Not(IsEmpty())); - - const absl::Time after = absl::Now() + absl::Seconds(1); - - absl::Time atime2, mtime2, ctime2; - std::tie(atime2, mtime2, ctime2) = GetTime(dir); - - EXPECT_LE(before, atime2); - EXPECT_GE(after, atime2); - EXPECT_GT(atime2, atime); - EXPECT_EQ(mtime2, mtime); - EXPECT_EQ(ctime2, ctime); + auto fn = [&] { + const auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(dir.path(), false)); + EXPECT_THAT(contents, Not(IsEmpty())); + }; + CheckTimes(dir, fn, AtimeEffect::Changed, MtimeEffect::Unchanged, + CtimeEffect::Unchanged); } } // namespace -- cgit v1.2.3 From 4a842836e560322bb3944b59ff43b9d60cc0f867 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Fri, 17 May 2019 13:46:18 -0700 Subject: Return EPERM for mknod This more directly matches what Linux does with unsupported nodes. PiperOrigin-RevId: 248780425 Change-Id: I17f3dd0b244f6dc4eb00e2e42344851b8367fbec --- pkg/sentry/fs/gofer/path.go | 4 ++-- pkg/sentry/fs/host/inode.go | 2 +- runsc/fsgofer/fsgofer.go | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index babfa4560..148e2f038 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -281,9 +281,9 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, } // CreateFifo implements fs.InodeOperations.CreateFifo. Gofer nodes do not support the -// creation of fifos and always returns EOPNOTSUPP. +// creation of fifos and always returns EPERM. func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syscall.EOPNOTSUPP + return syscall.EPERM } // Remove implements InodeOperations.Remove. 
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index ebf2154bc..7a230e426 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -287,7 +287,7 @@ func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, st // CreateFifo implements fs.InodeOperations.CreateFifo. func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syserror.EOPNOTSUPP + return syserror.EPERM } // Remove implements fs.InodeOperations.Remove. diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index b185015b6..2cf50290a 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -860,7 +860,10 @@ func (l *localFile) Link(target p9.File, newName string) error { // // Not implemented. func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { - return p9.QID{}, syscall.ENOSYS + // From mknod(2) man page: + // "EPERM: [...] if the filesystem containing pathname does not support + // the type of node requested." + return p9.QID{}, syscall.EPERM } // UnlinkAt implements p9.File. -- cgit v1.2.3 From 6588427451c605ee00c8b1a9b6cba06724627ccb Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 20 May 2019 13:34:06 -0700 Subject: Fix incorrect tmpfs timestamp updates * Creation of files, directories (and other fs objects) in a directory should always update ctime. * Same for removal. * atime should not be updated on lookup, only readdir. I've also renamed some misleading functions that update mtime and ctime. 
PiperOrigin-RevId: 249115063 Change-Id: I30fa275fa7db96d01aa759ed64628c18bb3a7dc7 --- pkg/sentry/fs/fsutil/inode.go | 10 +++++++++ pkg/sentry/fs/fsutil/inode_cached.go | 28 ++++++++++++------------ pkg/sentry/fs/gofer/path.go | 16 ++++++++------ pkg/sentry/fs/ramfs/dir.go | 41 ++++++++++++++++++------------------ test/syscalls/linux/stat_times.cc | 34 ++++++++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 42 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 5e1bfeb58..a22b6ce9c 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -192,6 +192,16 @@ func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { i.mu.Unlock() } +// NotifyModificationAndStatusChange updates the modification and status change +// times. +func (i *InodeSimpleAttributes) NotifyModificationAndStatusChange(ctx context.Context) { + i.mu.Lock() + now := ktime.NowFromContext(ctx) + i.unstable.ModificationTime = now + i.unstable.StatusChangeTime = now + i.mu.Unlock() +} + // InodeSimpleExtendedAttributes implements // fs.InodeOperations.{Get,Set,List}xattr. // diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index bc0b8c744..7bee2eb5f 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -299,7 +299,7 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, } oldSize := c.attr.Size c.attr.Size = size - c.touchModificationTimeLocked(now) + c.touchModificationAndStatusChangeTimeLocked(now) // We drop c.dataMu here so that we can lock c.mapsMu and invalidate // mappings below. This allows concurrent calls to Read/Translate/etc. 
@@ -360,7 +360,7 @@ func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length in } c.attr.Size = newSize - c.touchModificationTimeLocked(now) + c.touchModificationAndStatusChangeTimeLocked(now) return nil } @@ -394,19 +394,19 @@ func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) return c.backingFile.Sync(ctx) } -// IncLinks increases the link count and updates cached access time. +// IncLinks increases the link count and updates cached modification time. func (c *CachingInodeOperations) IncLinks(ctx context.Context) { c.attrMu.Lock() c.attr.Links++ - c.touchModificationTimeLocked(ktime.NowFromContext(ctx)) + c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx)) c.attrMu.Unlock() } -// DecLinks decreases the link count and updates cached access time. +// DecLinks decreases the link count and updates cached modification time. func (c *CachingInodeOperations) DecLinks(ctx context.Context) { c.attrMu.Lock() c.attr.Links-- - c.touchModificationTimeLocked(ktime.NowFromContext(ctx)) + c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx)) c.attrMu.Unlock() } @@ -432,19 +432,19 @@ func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) { c.dirtyAttr.AccessTime = true } -// TouchModificationTime updates the cached modification and status change time -// in-place to the current time. -func (c *CachingInodeOperations) TouchModificationTime(ctx context.Context) { +// TouchModificationAndStatusChangeTime updates the cached modification and +// status change times in-place to the current time. +func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx context.Context) { c.attrMu.Lock() - c.touchModificationTimeLocked(ktime.NowFromContext(ctx)) + c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx)) c.attrMu.Unlock() } -// touchModificationTimeLocked updates the cached modification and status -// change time in-place to the current time. 
+// touchModificationAndStatusChangeTimeLocked updates the cached modification +// and status change times in-place to the current time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchModificationTimeLocked(now time.Time) { +func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) { c.attr.ModificationTime = now c.dirtyAttr.ModificationTime = true c.attr.StatusChangeTime = now @@ -554,7 +554,7 @@ func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequen c.attrMu.Lock() // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). - c.touchModificationTimeLocked(ktime.NowFromContext(ctx)) + c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx)) n, err := src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) c.attrMu.Unlock() return n, err diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 148e2f038..6ed50a77f 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -113,7 +113,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string return nil, err } - i.touchModificationTime(ctx, dir) + i.touchModificationAndStatusChangeTime(ctx, dir) // Get an unopened p9.File for the file we created so that it can be cloned // and re-opened multiple times after creation, while also getting its @@ -167,7 +167,7 @@ func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname if _, err := i.fileState.file.symlink(ctx, oldname, newname, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err } - i.touchModificationTime(ctx, dir) + i.touchModificationAndStatusChangeTime(ctx, dir) return nil } @@ -189,7 +189,7 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, t // Increase link count. 
targetOpts.cachingInodeOps.IncLinks(ctx) } - i.touchModificationTime(ctx, inode) + i.touchModificationAndStatusChangeTime(ctx, inode) return nil } @@ -205,6 +205,8 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s } if i.session().cachePolicy.cacheUAttrs(dir) { // Increase link count. + // + // N.B. This will update the modification time. i.cachingInodeOps.IncLinks(ctx) } if i.session().cachePolicy.cacheReaddir() { @@ -246,7 +248,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, // We're not going to use this file. hostFile.Close() - i.touchModificationTime(ctx, dir) + i.touchModificationAndStatusChangeTime(ctx, dir) // Get the attributes of the file to create inode key. qid, mask, attr, err := getattr(ctx, newFile) @@ -317,7 +319,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string if removeSocket { i.session().endpoints.remove(key) } - i.touchModificationTime(ctx, dir) + i.touchModificationAndStatusChangeTime(ctx, dir) return nil } @@ -397,9 +399,9 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent return nil } -func (i *inodeOperations) touchModificationTime(ctx context.Context, inode *fs.Inode) { +func (i *inodeOperations) touchModificationAndStatusChangeTime(ctx context.Context, inode *fs.Inode) { if i.session().cachePolicy.cacheUAttrs(inode) { - i.cachingInodeOps.TouchModificationTime(ctx) + i.cachingInodeOps.TouchModificationAndStatusChangeTime(ctx) } if i.session().cachePolicy.cacheReaddir() { // Invalidate readdir cache. diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index c97ad26f5..cd6e03d66 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -112,7 +112,7 @@ func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwn } // addChildLocked add the child inode, inheriting its reference. 
-func (d *Dir) addChildLocked(name string, inode *fs.Inode) { +func (d *Dir) addChildLocked(ctx context.Context, name string, inode *fs.Inode) { d.children[name] = inode d.dentryMap.Add(name, fs.DentAttr{ Type: inode.StableAttr.Type, @@ -123,18 +123,25 @@ func (d *Dir) addChildLocked(name string, inode *fs.Inode) { // corresponding to '..' from the subdirectory. if fs.IsDir(inode.StableAttr) { d.AddLink() + // ctime updated below. } // Given we're now adding this inode to the directory we must also - // increase its link count. Similarly we decremented it in removeChildLocked. + // increase its link count. Similarly we decrement it in removeChildLocked. + // + // Changing link count updates ctime. inode.AddLink() + inode.InodeOperations.NotifyStatusChange(ctx) + + // We've change the directory. This always updates our mtime and ctime. + d.NotifyModificationAndStatusChange(ctx) } // AddChild adds a child to this dir. func (d *Dir) AddChild(ctx context.Context, name string, inode *fs.Inode) { d.mu.Lock() defer d.mu.Unlock() - d.addChildLocked(name, inode) + d.addChildLocked(ctx, name, inode) } // FindChild returns (child, true) if the directory contains name. @@ -179,14 +186,18 @@ func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, er // link count which was the child's ".." directory entry. if fs.IsDir(inode.StableAttr) { d.DropLink() + // ctime changed below. } - // Update ctime. - inode.InodeOperations.NotifyStatusChange(ctx) - // Given we're now removing this inode to the directory we must also // decrease its link count. Similarly it is increased in addChildLocked. + // + // Changing link count updates ctime. inode.DropLink() + inode.InodeOperations.NotifyStatusChange(ctx) + + // We've change the directory. This always updates our mtime and ctime. 
+ d.NotifyModificationAndStatusChange(ctx) return inode, nil } @@ -263,8 +274,6 @@ func (d *Dir) Lookup(ctx context.Context, _ *fs.Inode, p string) (*fs.Dirent, er // walkLocked must be called with d.mu held. func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { - d.NotifyAccess(ctx) - // Lookup a child node. if inode, ok := d.children[p]; ok { return inode, nil @@ -290,8 +299,7 @@ func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, make return nil, err } - d.addChildLocked(name, inode) - d.NotifyModification(ctx) + d.addChildLocked(ctx, name, inode) return inode, nil } @@ -342,11 +350,7 @@ func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inod target.IncRef() // The link count will be incremented in addChildLocked. - d.addChildLocked(name, target) - d.NotifyModification(ctx) - - // Update ctime. - target.InodeOperations.NotifyStatusChange(ctx) + d.addChildLocked(ctx, name, target) return nil } @@ -359,8 +363,6 @@ func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, p _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { return d.NewDir(ctx, dir, perms) }) - // TODO(nlacasse): Support updating status times, as those should be - // updated by links. return err } @@ -526,10 +528,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n // Do the swap. n := op.children[oldName] op.removeChildLocked(ctx, oldName) - np.addChildLocked(newName, n) - - // Update ctime. - n.InodeOperations.NotifyStatusChange(ctx) + np.addChildLocked(ctx, newName, n) return nil } diff --git a/test/syscalls/linux/stat_times.cc b/test/syscalls/linux/stat_times.cc index 195c87ca5..68c0bef09 100644 --- a/test/syscalls/linux/stat_times.cc +++ b/test/syscalls/linux/stat_times.cc @@ -263,6 +263,40 @@ TEST(StatTimesTest, DirList) { CtimeEffect::Unchanged); } +// Creating a file in a directory changes mtime and ctime. 
+TEST(StatTimesTest, DirCreateFile) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + TempPath file; + auto fn = [&] { + file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + }; + CheckTimes(dir, fn, AtimeEffect::Unchanged, MtimeEffect::Changed, + CtimeEffect::Changed); +} + +// Creating a directory in a directory changes mtime and ctime. +TEST(StatTimesTest, DirCreateDir) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + TempPath dir2; + auto fn = [&] { + dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path())); + }; + CheckTimes(dir, fn, AtimeEffect::Unchanged, MtimeEffect::Changed, + CtimeEffect::Changed); +} + +// Removing a file from a directory changes mtime and ctime. +TEST(StatTimesTest, DirRemoveFile) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + auto fn = [&] { file.reset(); }; + CheckTimes(dir, fn, AtimeEffect::Unchanged, MtimeEffect::Changed, + CtimeEffect::Changed); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 80cc2c78e52389015459114b1689cd3265726679 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 20 May 2019 16:52:03 -0700 Subject: Forward named pipe creation to the gofer The backing 9p server must allow named pipe creation, which the runsc fsgofer currently does not. There are small changes to the overlay here. GetFile may block when opening a named pipe, which can cause a deadlock: 1. open(O_RDONLY) -> copyMu.Lock() -> GetFile() 2. open(O_WRONLY) -> copyMu.Lock() -> Deadlock A named pipe usable for writing must already be on the upper filesystem, but we are still taking copyMu for write when checking for upper. That can be changed to a read lock to fix the common case. 
However, a named pipe on the lower filesystem would still deadlock in open(O_WRONLY) when it tries to actually perform copy up (which would simply return EINVAL). Move the copy up type check before taking copyMu for write to avoid this. p9 must be modified, as it was incorrectly removing the file mode when sending messages on the wire. PiperOrigin-RevId: 249154033 Change-Id: Id6637130e567b03758130eb6c7cdbc976384b7d6 --- pkg/p9/client_file.go | 14 +++++++------- pkg/p9/file.go | 2 +- pkg/p9/handlers.go | 2 +- pkg/p9/local_server/local_server.go | 2 +- pkg/p9/messages.go | 10 +++++----- pkg/p9/messages_test.go | 24 ++++++++++++------------ pkg/sentry/fs/copy_up.go | 18 ++++++++++++++---- pkg/sentry/fs/gofer/path.go | 20 ++++++++++++++++---- pkg/sentry/fs/inode_operations.go | 4 +++- 9 files changed, 60 insertions(+), 36 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 471c3a80b..258080f67 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -533,18 +533,18 @@ func (c *clientFile) Link(target File, newname string) error { } // Mknod implements File.Mknod. -func (c *clientFile) Mknod(name string, permissions FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) { +func (c *clientFile) Mknod(name string, mode FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) { if atomic.LoadUint32(&c.closed) != 0 { return QID{}, syscall.EBADF } msg := Tmknod{ - Directory: c.fid, - Name: name, - Permissions: permissions, - Major: major, - Minor: minor, - GID: NoGID, + Directory: c.fid, + Name: name, + Mode: mode, + Major: major, + Minor: minor, + GID: NoGID, } if versionSupportsTucreation(c.client.version) { diff --git a/pkg/p9/file.go b/pkg/p9/file.go index 89e814d50..a456e8b3d 100644 --- a/pkg/p9/file.go +++ b/pkg/p9/file.go @@ -170,7 +170,7 @@ type File interface { // Mknod makes a new device node. // // On the server, Mknod has a write concurrency guarantee. 
- Mknod(name string, permissions FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) + Mknod(name string, mode FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) // Rename renames the file. // diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go index 533ead98a..f32368763 100644 --- a/pkg/p9/handlers.go +++ b/pkg/p9/handlers.go @@ -768,7 +768,7 @@ func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) { } // Do the mknod. - qid, err = ref.file.Mknod(t.Name, t.Permissions, t.Major, t.Minor, uid, t.GID) + qid, err = ref.file.Mknod(t.Name, t.Mode, t.Major, t.Minor, uid, t.GID) return err }); err != nil { return nil, err diff --git a/pkg/p9/local_server/local_server.go b/pkg/p9/local_server/local_server.go index d49d94550..9546b3de5 100644 --- a/pkg/p9/local_server/local_server.go +++ b/pkg/p9/local_server/local_server.go @@ -252,7 +252,7 @@ func (l *local) Link(target p9.File, newname string) error { // Mknod implements p9.File.Mknod. // // Not implemented. -func (l *local) Mknod(name string, permissions p9.FileMode, major uint32, minor uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { +func (l *local) Mknod(name string, mode p9.FileMode, major uint32, minor uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { return p9.QID{}, syscall.ENOSYS } diff --git a/pkg/p9/messages.go b/pkg/p9/messages.go index 703753c31..75d6bc832 100644 --- a/pkg/p9/messages.go +++ b/pkg/p9/messages.go @@ -1163,8 +1163,8 @@ type Tmknod struct { // Name is the device name. Name string - // Permissions are the device permissions. - Permissions FileMode + // Mode is the device mode and permissions. + Mode FileMode // Major is the device major number. 
Major uint32 @@ -1180,7 +1180,7 @@ type Tmknod struct { func (t *Tmknod) Decode(b *buffer) { t.Directory = b.ReadFID() t.Name = b.ReadString() - t.Permissions = b.ReadPermissions() + t.Mode = b.ReadFileMode() t.Major = b.Read32() t.Minor = b.Read32() t.GID = b.ReadGID() @@ -1190,7 +1190,7 @@ func (t *Tmknod) Decode(b *buffer) { func (t *Tmknod) Encode(b *buffer) { b.WriteFID(t.Directory) b.WriteString(t.Name) - b.WritePermissions(t.Permissions) + b.WriteFileMode(t.Mode) b.Write32(t.Major) b.Write32(t.Minor) b.WriteGID(t.GID) @@ -1203,7 +1203,7 @@ func (*Tmknod) Type() MsgType { // String implements fmt.Stringer. func (t *Tmknod) String() string { - return fmt.Sprintf("Tmknod{DirectoryFID: %d, Name: %s, Permissions: 0o%o, Major: %d, Minor: %d, GID: %d}", t.Directory, t.Name, t.Permissions, t.Major, t.Minor, t.GID) + return fmt.Sprintf("Tmknod{DirectoryFID: %d, Name: %s, Mode: 0o%o, Major: %d, Minor: %d, GID: %d}", t.Directory, t.Name, t.Mode, t.Major, t.Minor, t.GID) } // Rmknod is a mknod response. 
diff --git a/pkg/p9/messages_test.go b/pkg/p9/messages_test.go index 513b30e8b..6ba6a1654 100644 --- a/pkg/p9/messages_test.go +++ b/pkg/p9/messages_test.go @@ -142,12 +142,12 @@ func TestEncodeDecode(t *testing.T) { QID: QID{Type: 1}, }, &Tmknod{ - Directory: 1, - Name: "a", - Permissions: 2, - Major: 3, - Minor: 4, - GID: 5, + Directory: 1, + Name: "a", + Mode: 2, + Major: 3, + Minor: 4, + GID: 5, }, &Rmknod{ QID: QID{Type: 1}, @@ -349,12 +349,12 @@ func TestEncodeDecode(t *testing.T) { }, &Tumknod{ Tmknod: Tmknod{ - Directory: 1, - Name: "a", - Permissions: 2, - Major: 3, - Minor: 4, - GID: 5, + Directory: 1, + Name: "a", + Mode: 2, + Major: 3, + Minor: 4, + GID: 5, }, UID: 6, }, diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ee2d3d115..41265704c 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -113,13 +113,13 @@ func copyUpLockedForRename(ctx context.Context, d *Dirent) error { // Did we race with another copy up or does there // already exist something in the upper filesystem // for d? - d.Inode.overlay.copyMu.Lock() + d.Inode.overlay.copyMu.RLock() if d.Inode.overlay.upper != nil { - d.Inode.overlay.copyMu.Unlock() + d.Inode.overlay.copyMu.RUnlock() // Done, d is in the upper filesystem. return nil } - d.Inode.overlay.copyMu.Unlock() + d.Inode.overlay.copyMu.RUnlock() // Find the next component to copy up. We will work our way // down to the last component of d and finally copy it. @@ -155,6 +155,14 @@ func findNextCopyUp(ctx context.Context, d *Dirent) *Dirent { } func doCopyUp(ctx context.Context, d *Dirent) error { + // Fail fast on Inode types we won't be able to copy up anyways. These + // Inodes may block in GetFile while holding copyMu for reading. If we + // then try to take copyMu for writing here, we'd deadlock. + t := d.Inode.overlay.lower.StableAttr.Type + if t != RegularFile && t != Directory && t != Symlink { + return syserror.EINVAL + } + // Wait to get exclusive access to the upper Inode. 
d.Inode.overlay.copyMu.Lock() defer d.Inode.overlay.copyMu.Unlock() @@ -177,6 +185,8 @@ func doCopyUp(ctx context.Context, d *Dirent) error { // - parent.Inode.overlay.upper must be non-nil. // - next.Inode.overlay.copyMu must be locked writable. // - next.Inode.overlay.lower must be non-nil. +// - next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory, +// or Symlink. // - upper filesystem must support setting file ownership and timestamps. func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { // Extract the attributes of the file we wish to copy. @@ -239,7 +249,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { childUpperInode = childUpper.Inode default: - return syserror.EINVAL + panic(fmt.Sprintf("copy up of invalid type %v on %+v", next.Inode.StableAttr.Type, next)) } // Bring file attributes up to date. This does not include size, which will be diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 6ed50a77f..092f8b586 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -282,10 +282,22 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, return childDir, nil } -// CreateFifo implements fs.InodeOperations.CreateFifo. Gofer nodes do not support the -// creation of fifos and always returns EPERM. -func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { - return syscall.EPERM +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + if len(name) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + + owner := fs.FileOwnerFromContext(ctx) + mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe + + // N.B. FIFOs use major/minor numbers 0. 
+ if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + return err + } + + i.touchModificationAndStatusChangeTime(ctx, dir) + return nil } // Remove implements InodeOperations.Remove. diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 3211f1817..2ed89d482 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -161,7 +161,9 @@ type InodeOperations interface { BoundEndpoint(inode *Inode, path string) transport.BoundEndpoint // GetFile returns a new open File backed by a Dirent and FileFlags. - // It may block as long as it is done with ctx. + // + // Special Inode types may block using ctx.Sleeper. RegularFiles, + // Directories, and Symlinks must not block (see doCopyUp). // // The returned File will uniquely back an application fd. GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File, error) -- cgit v1.2.3 From adeb99709bda40e62363b229464b8ae3a90e237b Mon Sep 17 00:00:00 2001 From: Neel Natu Date: Tue, 21 May 2019 12:18:17 -0700 Subject: Remove unused struct member. Remove unused struct member. PiperOrigin-RevId: 249300446 Change-Id: Ifb16538f684bc3200342462c3da927eb564bf52d --- pkg/sentry/fs/sys/devices.go | 3 --- 1 file changed, 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index bacc93af8..54f35c6a0 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -40,9 +40,6 @@ type cpunum struct { fsutil.InodeSimpleAttributes fsutil.InodeStaticFileGetter - - // k is the system kernel. - k *kernel.Kernel } var _ fs.InodeOperations = (*cpunum)(nil) -- cgit v1.2.3 From 9cdae51feca5cee9faa198161b92a0aeece52d6c Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 21 May 2019 15:17:05 -0700 Subject: Add basic plumbing for splice and stub implementation. This does not actually implement an efficient splice or sendfile. 
Rather, it adds a generic plumbing to the file internals so that this can be added. All file implementations use the stub fileutil.NoSplice implementation, which causes sendfile and splice to fall back to an internal copy. A basic splice system call interface is added, along with a test. PiperOrigin-RevId: 249335960 Change-Id: Ic5568be2af0a505c19e7aec66d5af2480ab0939b --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/splice.go | 23 ++ pkg/sentry/fs/BUILD | 4 +- pkg/sentry/fs/ashmem/area.go | 3 +- pkg/sentry/fs/binder/binder.go | 3 +- pkg/sentry/fs/dev/full.go | 3 +- pkg/sentry/fs/dev/null.go | 4 +- pkg/sentry/fs/dev/random.go | 1 + pkg/sentry/fs/fdpipe/pipe.go | 1 + pkg/sentry/fs/file.go | 139 +++++++---- pkg/sentry/fs/file_operations.go | 47 ++++ pkg/sentry/fs/file_overlay.go | 79 ++++--- pkg/sentry/fs/file_test.go | 24 -- pkg/sentry/fs/filetest/filetest.go | 1 + pkg/sentry/fs/fsutil/file.go | 16 ++ pkg/sentry/fs/fsutil/inode.go | 3 +- pkg/sentry/fs/gofer/file.go | 5 +- pkg/sentry/fs/host/file.go | 1 + pkg/sentry/fs/inotify.go | 10 + pkg/sentry/fs/proc/exec_args.go | 3 +- pkg/sentry/fs/proc/rpcinet_proc.go | 3 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 1 + pkg/sentry/fs/proc/sys.go | 3 +- pkg/sentry/fs/proc/sys_net.go | 6 +- pkg/sentry/fs/proc/task.go | 10 +- pkg/sentry/fs/proc/uid_gid_map.go | 1 + pkg/sentry/fs/proc/uptime.go | 5 +- pkg/sentry/fs/ramfs/socket.go | 7 +- pkg/sentry/fs/ramfs/symlink.go | 7 +- pkg/sentry/fs/splice.go | 187 +++++++++++++++ pkg/sentry/fs/timerfd/timerfd.go | 5 +- pkg/sentry/fs/tmpfs/file_regular.go | 3 +- pkg/sentry/fs/tty/dir.go | 3 +- pkg/sentry/fs/tty/master.go | 3 +- pkg/sentry/fs/tty/slave.go | 3 +- pkg/sentry/kernel/epoll/epoll.go | 3 +- pkg/sentry/kernel/eventfd/eventfd.go | 5 +- pkg/sentry/kernel/pipe/reader_writer.go | 3 +- pkg/sentry/loader/vdso.go | 3 +- pkg/sentry/socket/epsocket/epsocket.go | 3 +- pkg/sentry/socket/hostinet/socket.go | 3 +- pkg/sentry/socket/netlink/socket.go | 3 +- pkg/sentry/socket/rpcinet/socket.go | 3 +- 
pkg/sentry/socket/unix/unix.go | 3 +- pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/linux64.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 98 -------- pkg/sentry/syscalls/linux/sys_splice.go | 293 +++++++++++++++++++++++ test/syscalls/BUILD | 2 + test/syscalls/linux/BUILD | 16 ++ test/syscalls/linux/splice.cc | 404 ++++++++++++++++++++++++++++++++ 51 files changed, 1221 insertions(+), 242 deletions(-) create mode 100644 pkg/abi/linux/splice.go delete mode 100644 pkg/sentry/fs/file_test.go create mode 100644 pkg/sentry/fs/splice.go create mode 100644 pkg/sentry/syscalls/linux/sys_splice.go create mode 100644 test/syscalls/linux/splice.cc (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index fdf193873..96e8d4641 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -45,6 +45,7 @@ go_library( "shm.go", "signal.go", "socket.go", + "splice.go", "tcp.go", "time.go", "timer.go", diff --git a/pkg/abi/linux/splice.go b/pkg/abi/linux/splice.go new file mode 100644 index 000000000..650eb87e8 --- /dev/null +++ b/pkg/abi/linux/splice.go @@ -0,0 +1,23 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Constants for splice(2), sendfile(2) and tee(2). 
+const ( + SPLICE_F_MOVE = 1 << iota + SPLICE_F_NONBLOCK + SPLICE_F_MORE + SPLICE_F_GIFT +) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 1fd9e30f6..142a00840 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -40,6 +40,7 @@ go_library( "restore.go", "save.go", "seek.go", + "splice.go", "sync.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs", @@ -51,6 +52,7 @@ go_library( "//pkg/metric", "//pkg/p9", "//pkg/refs", + "//pkg/secio", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/device", @@ -66,7 +68,6 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/syserror", - "//pkg/tcpip", "//pkg/waiter", ], ) @@ -122,7 +123,6 @@ go_test( srcs = [ "dirent_cache_test.go", "dirent_refs_test.go", - "file_test.go", "mount_test.go", "path_test.go", ], diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go index b53746519..b4b0cc08b 100644 --- a/pkg/sentry/fs/ashmem/area.go +++ b/pkg/sentry/fs/ashmem/area.go @@ -42,11 +42,12 @@ const ( // // +stateify savable type Area struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` ad *Device diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go index a992253e6..c78f1fc40 100644 --- a/pkg/sentry/fs/binder/binder.go +++ b/pkg/sentry/fs/binder/binder.go @@ -86,10 +86,11 @@ func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) // // +stateify savable type Proc struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` bd *Device task *kernel.Task diff --git a/pkg/sentry/fs/dev/full.go 
b/pkg/sentry/fs/dev/full.go index 17d68b5c4..8f6c6da2d 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -60,6 +60,7 @@ func (f *fullDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.Fi // +stateify savable type fullFileOperations struct { + waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` @@ -68,8 +69,8 @@ type fullFileOperations struct { fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` readZeros `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` } var _ fs.FileOperations = (*fullFileOperations)(nil) diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index ee13183c8..3f1accef8 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -64,6 +64,7 @@ type nullFileOperations struct { fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRead `state:"nosave"` @@ -104,14 +105,15 @@ func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F type zeroFileOperations struct { fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNoopWrite `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` - readZeros `state:"nosave"` waiter.AlwaysReady `state:"nosave"` + readZeros `state:"nosave"` } var _ fs.FileOperations = (*zeroFileOperations)(nil) diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index b0a412382..e5a01a906 100644 
--- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -61,6 +61,7 @@ type randomFileOperations struct { fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 95e66ea8d..4ef7ea08a 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -43,6 +43,7 @@ type pipeOperations struct { fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` waiter.Queue `state:"nosave"` diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 62b35dabc..8f1baca23 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -21,7 +21,6 @@ import ( "time" "gvisor.googlesource.com/gvisor/pkg/amutex" - "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -35,8 +34,13 @@ import ( ) var ( - // RecordWaitTime controls writing metrics for filesystem reads. Enabling this comes at a small - // CPU cost due to performing two monotonic clock reads per read call. + // RecordWaitTime controls writing metrics for filesystem reads. + // Enabling this comes at a small CPU cost due to performing two + // monotonic clock reads per read call. + // + // Note that this is only performed in the direct read path, and may + // not be consistently applied for other forms of reads, such as + // splice. 
RecordWaitTime = false reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") @@ -306,14 +310,28 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error return 0, syserror.ErrInterrupted } - offset, err := f.checkWriteLocked(ctx, &src, f.offset) - if err != nil { + // Handle append mode. + if f.Flags().Append { + if err := f.offsetForAppend(ctx, &f.offset); err != nil { + f.mu.Unlock() + return 0, err + } + } + + // Enforce file limits. + limit, ok := f.checkLimit(ctx, f.offset) + switch { + case ok && limit == 0: f.mu.Unlock() - return 0, err + return 0, syserror.ErrExceedsFileSizeLimit + case ok: + src = src.TakeFirst64(limit) } - n, err := f.FileOperations.Write(ctx, f, src, offset) + + // We must hold the lock during the write. + n, err := f.FileOperations.Write(ctx, f, src, f.offset) if n >= 0 { - atomic.StoreInt64(&f.offset, offset+n) + atomic.StoreInt64(&f.offset, f.offset+n) } f.mu.Unlock() return n, err @@ -325,51 +343,67 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error // // Otherwise same as Writev. func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - if !f.mu.Lock(ctx) { - return 0, syserror.ErrInterrupted + // "POSIX requires that opening a file with the O_APPEND flag should + // have no effect on the location at which pwrite() writes data. + // However, on Linux, if a file is opened with O_APPEND, pwrite() + // appends data to the end of the file, regardless of the value of + // offset." + if f.Flags().Append { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer f.mu.Unlock() + if err := f.offsetForAppend(ctx, &offset); err != nil { + f.mu.Unlock() + return 0, err + } } - offset, err := f.checkWriteLocked(ctx, &src, offset) - if err != nil { - f.mu.Unlock() - return 0, err + // Enforce file limits. 
+ limit, ok := f.checkLimit(ctx, offset) + switch { + case ok && limit == 0: + return 0, syserror.ErrExceedsFileSizeLimit + case ok: + src = src.TakeFirst64(limit) } - n, err := f.FileOperations.Write(ctx, f, src, offset) - f.mu.Unlock() - return n, err + + return f.FileOperations.Write(ctx, f, src, offset) } -// checkWriteLocked returns the offset to write at or an error if the write -// would not succeed. May update src to fit a write operation into a file -// size limit. -func (f *File) checkWriteLocked(ctx context.Context, src *usermem.IOSequence, offset int64) (int64, error) { - // Handle append only files. Note that this is still racy for network - // filesystems. - if f.Flags().Append { - uattr, err := f.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - // This is an odd error, most likely it is evidence - // that something is terribly wrong with the filesystem. - // Return a generic EIO error. - log.Warningf("Failed to check write of inode %#v: %v", f.Dirent.Inode.StableAttr, err) - return offset, syserror.EIO - } - offset = uattr.Size +// offsetForAppend sets the given offset to the end of the file. +// +// Precondition: the underlying file mutex should be held. +func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { + uattr, err := f.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + // This is an odd error, we treat it as evidence that + // something is terribly wrong with the filesystem. + return syserror.EIO } - // Is this a regular file? + // Update the offset. + *offset = uattr.Size + + return nil +} + +// checkLimit checks the offset that the write will be performed at. The +// returned boolean indicates that the write must be limited. The returned +// integer indicates the new maximum write length. +func (f *File) checkLimit(ctx context.Context, offset int64) (int64, bool) { if IsRegular(f.Dirent.Inode.StableAttr) { // Enforce size limits. 
fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur if fileSizeLimit <= math.MaxInt64 { if offset >= int64(fileSizeLimit) { - return offset, syserror.ErrExceedsFileSizeLimit + return 0, true } - *src = src.TakeFirst64(int64(fileSizeLimit) - offset) + return int64(fileSizeLimit) - offset, true } } - return offset, nil + return 0, false } // Fsync calls f.FileOperations.Fsync with f as the File. @@ -466,8 +500,13 @@ func (f *File) Async(newAsync func() FileAsync) FileAsync { return f.async } -// FileReader implements io.Reader and io.ReaderAt. -type FileReader struct { +// lockedReader implements io.Reader and io.ReaderAt. +// +// Note this reads the underlying file using the file operations directly. It +// is the responsibility of the caller to ensure that locks are appropriately +// held and offsets updated if required. This should be used only by internal +// functions that perform these operations and checks at other times. +type lockedReader struct { // Ctx is the context for the file reader. Ctx context.Context @@ -476,19 +515,21 @@ type FileReader struct { } // Read implements io.Reader.Read. -func (r *FileReader) Read(buf []byte) (int, error) { - n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf)) +func (r *lockedReader) Read(buf []byte) (int, error) { + n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.File.offset) return int(n), err } // ReadAt implements io.Reader.ReadAt. -func (r *FileReader) ReadAt(buf []byte, offset int64) (int, error) { - n, err := r.File.Preadv(r.Ctx, usermem.BytesIOSequence(buf), offset) +func (r *lockedReader) ReadAt(buf []byte, offset int64) (int, error) { + n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), offset) return int(n), err } -// FileWriter implements io.Writer and io.WriterAt. -type FileWriter struct { +// lockedWriter implements io.Writer and io.WriterAt. +// +// The same constraints as lockedReader apply; see above. 
+type lockedWriter struct { // Ctx is the context for the file writer. Ctx context.Context @@ -497,13 +538,13 @@ type FileWriter struct { } // Write implements io.Writer.Write. -func (w *FileWriter) Write(buf []byte) (int, error) { - n, err := w.File.Writev(w.Ctx, usermem.BytesIOSequence(buf)) +func (w *lockedWriter) Write(buf []byte) (int, error) { + n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), w.File.offset) return int(n), err } // WriteAt implements io.Writer.WriteAt. -func (w *FileWriter) WriteAt(buf []byte, offset int64) (int, error) { - n, err := w.File.Pwritev(w.Ctx, usermem.BytesIOSequence(buf), offset) +func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) { + n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), offset) return int(n), err } diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index ab0acb6eb..0f2dfa273 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -22,6 +22,38 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) +// SpliceOpts define how a splice works. +type SpliceOpts struct { + // Length is the length of the splice operation. + Length int64 + + // SrcOffset indicates whether the existing source file offset should + // be used. If this is true, then the Start value below is used. + // + // When passed to FileOperations object, this should always be true as + // the offset will be provided by a layer above, unless the object in + // question is a pipe or socket. This value can be relied upon for such + // an indicator. + SrcOffset bool + + // SrcStart is the start of the source file. This is used only if + // SrcOffset is false. + SrcStart int64 + + // Dup indicates that the contents should not be consumed from the + // source (e.g. in the case of a socket or a pipe), but duplicated. + Dup bool + + // DstOffset indicates that the destination file offset should be used. 
+ // + // See SrcOffset for additional information. + DstOffset bool + + // DstStart is the start of the destination file. This is used only if + // DstOffset is false. + DstStart int64 +} + // FileOperations are operations on a File that diverge per file system. // // Operations that take a *File may use only the following interfaces: @@ -67,6 +99,15 @@ type FileOperations interface { // Read must not be called if !FileFlags.Read. Read(ctx context.Context, file *File, dst usermem.IOSequence, offset int64) (int64, error) + // WriteTo is a variant of read that takes another file as a + // destination. For a splice (copy or move from one file to another), + // first a WriteTo on the source is attempted, followed by a ReadFrom + // on the destination, following by a buffered copy with standard Read + // and Write operations. + // + // The same preconditions as Read apply. + WriteTo(ctx context.Context, file *File, dst *File, opts SpliceOpts) (int64, error) + // Write writes src to file at offset and returns the number of bytes // written which must be greater than or equal to 0. Like Read, file // systems that do not support writing at an offset (i.e. pipefs, sockfs) @@ -81,6 +122,12 @@ type FileOperations interface { // Write must not be called if !FileFlags.Write. Write(ctx context.Context, file *File, src usermem.IOSequence, offset int64) (int64, error) + // ReadFrom is a variant of write that takes a another file as a + // source. See WriteTo for details regarding how this is called. + // + // The same preconditions as Write apply; FileFlags.Write must be set. + ReadFrom(ctx context.Context, file *File, src *File, opts SpliceOpts) (int64, error) + // Fsync writes buffered modifications of file and/or flushes in-flight // operations to backing storage based on syncType. The range to sync is // [start, end]. 
The end is inclusive so that the last byte of a maximally diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 948ce9c6f..273de1e14 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -17,7 +17,6 @@ package fs import ( "sync" - "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" @@ -222,31 +221,50 @@ func (f *overlayFileOperations) IterateDir(ctx context.Context, dirCtx *DirCtx, return offset + n, err } -// Read implements FileOperations.Read. -func (f *overlayFileOperations) Read(ctx context.Context, file *File, dst usermem.IOSequence, offset int64) (int64, error) { - o := file.Dirent.Inode.overlay +// onTop performs the given operation on the top-most available layer. +func (f *overlayFileOperations) onTop(ctx context.Context, file *File, fn func(*File, FileOperations) error) error { + file.Dirent.Inode.overlay.copyMu.RLock() + defer file.Dirent.Inode.overlay.copyMu.RUnlock() - o.copyMu.RLock() - defer o.copyMu.RUnlock() + // Only lower layer is available. + if file.Dirent.Inode.overlay.upper == nil { + return fn(f.lower, f.lower.FileOperations) + } - if o.upper != nil { - // We may need to acquire an open file handle to read from if - // copy up has occurred. Otherwise we risk reading from the - // wrong source. - f.upperMu.Lock() - if f.upper == nil { - var err error - f.upper, err = overlayFile(ctx, o.upper, file.Flags()) - if err != nil { - f.upperMu.Unlock() - log.Warningf("failed to acquire handle with flags %v: %v", file.Flags(), err) - return 0, syserror.EIO - } + f.upperMu.Lock() + if f.upper == nil { + upper, err := overlayFile(ctx, file.Dirent.Inode.overlay.upper, file.Flags()) + if err != nil { + // Something very wrong; return a generic filesystem + // error to avoid propagating internals. 
+ f.upperMu.Unlock() + return syserror.EIO } - f.upperMu.Unlock() - return f.upper.FileOperations.Read(ctx, f.upper, dst, offset) + + // Save upper file. + f.upper = upper } - return f.lower.FileOperations.Read(ctx, f.lower, dst, offset) + f.upperMu.Unlock() + + return fn(f.upper, f.upper.FileOperations) +} + +// Read implements FileOperations.Read. +func (f *overlayFileOperations) Read(ctx context.Context, file *File, dst usermem.IOSequence, offset int64) (n int64, err error) { + err = f.onTop(ctx, file, func(file *File, ops FileOperations) error { + n, err = ops.Read(ctx, file, dst, offset) + return err // Will overwrite itself. + }) + return +} + +// WriteTo implements FileOperations.WriteTo. +func (f *overlayFileOperations) WriteTo(ctx context.Context, file *File, dst *File, opts SpliceOpts) (n int64, err error) { + err = f.onTop(ctx, file, func(file *File, ops FileOperations) error { + n, err = ops.WriteTo(ctx, file, dst, opts) + return err // Will overwrite itself. + }) + return } // Write implements FileOperations.Write. @@ -257,15 +275,20 @@ func (f *overlayFileOperations) Write(ctx context.Context, file *File, src userm return f.upper.FileOperations.Write(ctx, f.upper, src, offset) } +// ReadFrom implements FileOperations.ReadFrom. +func (f *overlayFileOperations) ReadFrom(ctx context.Context, file *File, src *File, opts SpliceOpts) (n int64, err error) { + // See above; f.upper must be non-nil. + return f.upper.FileOperations.ReadFrom(ctx, f.upper, src, opts) +} + // Fsync implements FileOperations.Fsync. -func (f *overlayFileOperations) Fsync(ctx context.Context, file *File, start, end int64, syncType SyncType) error { - var err error +func (f *overlayFileOperations) Fsync(ctx context.Context, file *File, start, end int64, syncType SyncType) (err error) { f.upperMu.Lock() if f.upper != nil { err = f.upper.FileOperations.Fsync(ctx, f.upper, start, end, syncType) } f.upperMu.Unlock() - if f.lower != nil { + if err == nil && f.lower != nil { // N.B. 
Fsync on the lower filesystem can cause writes of file // attributes (i.e. access time) despite the fact that we must // treat the lower filesystem as read-only. @@ -277,15 +300,14 @@ func (f *overlayFileOperations) Fsync(ctx context.Context, file *File, start, en } // Flush implements FileOperations.Flush. -func (f *overlayFileOperations) Flush(ctx context.Context, file *File) error { +func (f *overlayFileOperations) Flush(ctx context.Context, file *File) (err error) { // Flush whatever handles we have. - var err error f.upperMu.Lock() if f.upper != nil { err = f.upper.FileOperations.Flush(ctx, f.upper) } f.upperMu.Unlock() - if f.lower != nil { + if err == nil && f.lower != nil { err = f.lower.FileOperations.Flush(ctx, f.lower) } return err @@ -329,6 +351,7 @@ func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opt if !o.isMappableLocked() { return syserror.ENODEV } + // FIXME(jamieliu): This is a copy/paste of fsutil.GenericConfigureMMap, // which we can't use because the overlay implementation is in package fs, // so depending on fs/fsutil would create a circular dependency. Move diff --git a/pkg/sentry/fs/file_test.go b/pkg/sentry/fs/file_test.go deleted file mode 100644 index d867a0257..000000000 --- a/pkg/sentry/fs/file_test.go +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fs - -import "io" - -var ( - _ = io.Reader(&FileReader{}) - _ = io.ReaderAt(&FileReader{}) - _ = io.Writer(&FileWriter{}) - _ = io.WriterAt(&FileWriter{}) -) diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index f6b827800..c0b1b088d 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -38,6 +38,7 @@ type TestFileOperations struct { fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` waiter.AlwaysReady `state:"nosave"` } diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index e355d8594..9381963d0 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -223,6 +223,20 @@ func (FileNoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallAr return 0, syserror.ENOTTY } +// FileNoSplice implements fs.FileOperations.ReadFrom and +// fs.FileOperations.WriteTo for files that don't support splice. +type FileNoSplice struct{} + +// WriteTo implements fs.FileOperations.WriteTo. +func (FileNoSplice) WriteTo(context.Context, *fs.File, *fs.File, fs.SpliceOpts) (int64, error) { + return 0, syserror.ENOSYS +} + +// ReadFrom implements fs.FileOperations.ReadFrom. +func (FileNoSplice) ReadFrom(context.Context, *fs.File, *fs.File, fs.SpliceOpts) (int64, error) { + return 0, syserror.ENOSYS +} + // DirFileOperations implements most of fs.FileOperations for directories, // except for Readdir and UnstableAttr which the embedding type must implement. 
type DirFileOperations struct { @@ -233,6 +247,7 @@ type DirFileOperations struct { FileNoopFlush FileNoopFsync FileNoopRelease + FileNoSplice } // Read implements fs.FileOperations.Read @@ -303,6 +318,7 @@ type NoReadWriteFile struct { FileNoWrite `state:"nosave"` FileNotDirReaddir `state:"nosave"` FileUseInodeUnstableAttr `state:"nosave"` + FileNoSplice `state:"nosave"` } var _ fs.FileOperations = (*NoReadWriteFile)(nil) diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index a22b6ce9c..925887335 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -250,16 +250,17 @@ func (i *InodeSimpleExtendedAttributes) Listxattr(_ *fs.Inode) (map[string]struc // // +stateify savable type staticFile struct { - waiter.AlwaysReady `state:"nosave"` FileGenericSeek `state:"nosave"` FileNoIoctl `state:"nosave"` FileNoMMap `state:"nosave"` + FileNoSplice `state:"nosave"` FileNoopFsync `state:"nosave"` FileNoopFlush `state:"nosave"` FileNoopRelease `state:"nosave"` FileNoopWrite `state:"nosave"` FileNotDirReaddir `state:"nosave"` FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` FileStaticContentReader } diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index bc2be546e..fb4f50113 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -46,8 +46,9 @@ var ( // // +stateify savable type fileOperations struct { - fsutil.FileNoIoctl `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` // inodeOperations is the inodeOperations backing the file.
It is protected // by a reference held by File.Dirent.Inode which is stable until diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 82e2ae3b9..ad0a3ec85 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -37,6 +37,7 @@ import ( // +stateify savable type fileOperations struct { fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 2652582c3..7dfd31020 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -171,11 +171,21 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i return writeLen, nil } +// WriteTo implements FileOperations.WriteTo. +func (*Inotify) WriteTo(context.Context, *File, *File, SpliceOpts) (int64, error) { + return 0, syserror.ENOSYS +} + // Fsync implements FileOperations.Fsync. func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error { return syserror.EINVAL } +// ReadFrom implements FileOperations.ReadFrom. +func (*Inotify) ReadFrom(context.Context, *File, *File, SpliceOpts) (int64, error) { + return 0, syserror.ENOSYS +} + // Flush implements FileOperations.Flush. func (*Inotify) Flush(context.Context, *File) error { return nil diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index d49dad685..cb28f6bc3 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -77,16 +77,17 @@ func (i *execArgInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.
// +stateify savable type execArgFile struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` // arg is the type of exec argument this file contains. arg execArgType diff --git a/pkg/sentry/fs/proc/rpcinet_proc.go b/pkg/sentry/fs/proc/rpcinet_proc.go index db53686f6..e36c0bfa6 100644 --- a/pkg/sentry/fs/proc/rpcinet_proc.go +++ b/pkg/sentry/fs/proc/rpcinet_proc.go @@ -60,15 +60,16 @@ func (i *rpcInetInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs. // rpcInetFile implements fs.FileOperations as RPCs. type rpcInetFile struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` inode *rpcInetInode } diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 6b0ae9e60..8364d86ed 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -187,6 +187,7 @@ type seqFileOperations struct { fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/proc/sys.go 
b/pkg/sentry/fs/proc/sys.go index b889ed625..59846af4f 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -134,7 +134,6 @@ var _ fs.InodeOperations = (*hostname)(nil) // +stateify savable type hostnameFile struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoSeek `state:"nosave"` @@ -143,7 +142,9 @@ type hostnameFile struct { fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoWrite `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` } // Read implements fs.FileOperations.Read. diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index e49794a48..dbf1a987c 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -85,15 +85,16 @@ func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // +stateify savable type tcpMemFile struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` tcpMemInode *tcpMemInode } @@ -198,15 +199,16 @@ func (s *tcpSack) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileF // +stateify savable type tcpSackFile struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNotDirReaddir 
`state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` tcpSack *tcpSack diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 66d76d194..494b195cd 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -672,16 +672,17 @@ func (c *comm) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlag // +stateify savable type commFile struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` t *kernel.Task } @@ -728,16 +729,17 @@ func (a *auxvec) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFl // +stateify savable type auxvecFile struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` t *kernel.Task } diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 5df3cee13..a14b1b45f 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -85,6 +85,7 @@ type idMapFileOperations struct { fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + 
fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index 1ddf9fafa..35c3851e1 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -54,16 +54,17 @@ func (u *uptime) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFl // +stateify savable type uptimeFile struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` startTime ktime.Time } diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index a7cb1bb86..7d8bca70e 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -70,13 +70,14 @@ func (s *Socket) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFl type socketFileOperations struct { fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoRead `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoRead `state:"nosave"` - fsutil.FileNoSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` waiter.AlwaysReady `state:"nosave"` } diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index 
dd2585b02..21c246169 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -91,13 +91,14 @@ func (s *Symlink) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileF type symlinkFileOperations struct { fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoRead `state:"nosave"` + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNoRead `state:"nosave"` - fsutil.FileNoSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoWrite `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` waiter.AlwaysReady `state:"nosave"` } diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go new file mode 100644 index 000000000..65937f44d --- /dev/null +++ b/pkg/sentry/fs/splice.go @@ -0,0 +1,187 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "io" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Splice moves data to this file, directly from another. +// +// Offsets are updated only if DstOffset and SrcOffset are set. 
+func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64, error) { + // Verify basic file flag permissions. + if !dst.Flags().Write || !src.Flags().Read { + return 0, syserror.EBADF + } + + // Check whether or not the objects being spliced are stream-oriented + // (i.e. pipes or sockets). If yes, we elide checks and offset locks. + srcPipe := IsPipe(src.Dirent.Inode.StableAttr) || IsSocket(src.Dirent.Inode.StableAttr) + dstPipe := IsPipe(dst.Dirent.Inode.StableAttr) || IsSocket(dst.Dirent.Inode.StableAttr) + + if !dstPipe && !opts.DstOffset && !srcPipe && !opts.SrcOffset { + switch { + case dst.UniqueID < src.UniqueID: + // Acquire dst first. + if !dst.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer dst.mu.Unlock() + if !src.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer src.mu.Unlock() + case dst.UniqueID > src.UniqueID: + // Acquire src first. + if !src.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer src.mu.Unlock() + if !dst.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer dst.mu.Unlock() + case dst.UniqueID == src.UniqueID: + // Acquire only one lock; it's the same file. This is a + // bit of an edge case, but presumably it's possible. + if !dst.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer dst.mu.Unlock() + } + // Use both offsets (locked). + opts.DstStart = dst.offset + opts.SrcStart = src.offset + } else if !dstPipe && !opts.DstOffset { + // Acquire only dst. + if !dst.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer dst.mu.Unlock() + opts.DstStart = dst.offset // Safe: locked. + } else if !srcPipe && !opts.SrcOffset { + // Acquire only src. + if !src.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer src.mu.Unlock() + opts.SrcStart = src.offset // Safe: locked. + } + + // Check append-only mode and the limit. + if !dstPipe { + if dst.Flags().Append { + if opts.DstOffset { + // We need to acquire the lock.
+ if !dst.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer dst.mu.Unlock() + } + // Figure out the appropriate offset to use. + if err := dst.offsetForAppend(ctx, &opts.DstStart); err != nil { + return 0, err + } + } + + // Enforce file limits. + limit, ok := dst.checkLimit(ctx, opts.DstStart) + switch { + case ok && limit == 0: + return 0, syserror.ErrExceedsFileSizeLimit + case ok && limit < opts.Length: + opts.Length = limit // Cap the write. + } + } + + // Attempt to do a WriteTo; this is likely the most efficient. + // + // The underlying implementation may be able to donate buffers. + newOpts := SpliceOpts{ + Length: opts.Length, + SrcStart: opts.SrcStart, + SrcOffset: !srcPipe, + Dup: opts.Dup, + DstStart: opts.DstStart, + DstOffset: !dstPipe, + } + n, err := src.FileOperations.WriteTo(ctx, src, dst, newOpts) + if n == 0 && err != nil { + // Attempt as a ReadFrom. If a WriteTo fails, a ReadFrom may also + // be more efficient than a copy if buffers are cached or readily + // available. (It's unlikely that they can actually be donated.) + n, err = dst.FileOperations.ReadFrom(ctx, dst, src, newOpts) + } + if n == 0 && err != nil { + // If we've failed up to here, and at least one of the sources + // is a pipe or socket, then we can't properly support dup. + // Return an error indicating that this operation is not + // supported. + if (srcPipe || dstPipe) && newOpts.Dup { + return 0, syserror.EINVAL + } + + // We failed to splice the files. But that's fine; we just fall + // back to a slow path in this case. This copies without doing + // any mode changes, so should still be more efficient. + var ( + r io.Reader + w io.Writer + ) + fw := &lockedWriter{ + Ctx: ctx, + File: dst, + } + if newOpts.DstOffset { + // Use the provided offset. + w = secio.NewOffsetWriter(fw, newOpts.DstStart) + } else { + // Writes will proceed with no offset.
+ w = fw + } + fr := &lockedReader{ + Ctx: ctx, + File: src, + } + if newOpts.SrcOffset { + // Limit to the given offset and length. + r = io.NewSectionReader(fr, opts.SrcStart, opts.Length) + } else { + // Limit just to the given length. + r = &io.LimitedReader{fr, opts.Length} + } + + // Copy between the two. + n, err = io.Copy(w, r) + } + + // Update offsets, if required. + if n > 0 { + if !dstPipe && !opts.DstOffset { + atomic.StoreInt64(&dst.offset, dst.offset+n) + } + if !srcPipe && !opts.SrcOffset { + atomic.StoreInt64(&src.offset, src.offset+n) + } + } + + return n, err +} diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 749961f51..bce5f091d 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -36,9 +36,10 @@ type TimerOperations struct { fsutil.FileZeroSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` events waiter.Queue `state:"zerovalue"` diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 1ef256511..d1c163879 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -28,14 +28,15 @@ import ( // // +stateify savable type regularFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoopFsync `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` // iops is the InodeOperations of a regular 
tmpfs file. It is // guaranteed to be the same as file.Dirent.Inode.InodeOperations, diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 8dc40e1f2..2603354c4 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -286,14 +286,15 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { // // +stateify savable type dirFileOperations struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileGenericSeek `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` // di is the inode operations. di *dirInodeOperations diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 45e167e5f..afdf44cd1 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -98,8 +98,9 @@ type masterFileOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` // d is the containing dir. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 0ae57a02c..2abf32e57 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -87,8 +87,9 @@ type slaveFileOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` // si is the inode operations. 
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 2399ae6f2..bbacba1f4 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -102,8 +102,9 @@ type EventPoll struct { fsutil.FileNotDirReaddir `state:"zerovalue"` fsutil.FileNoFsync `state:"zerovalue"` fsutil.FileNoopFlush `state:"zerovalue"` - fsutil.FileNoMMap `state:"zerovalue"` fsutil.FileNoIoctl `state:"zerovalue"` + fsutil.FileNoMMap `state:"zerovalue"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` // Wait queue is used to notify interested parties when the event poll diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 5d3139eef..2f900be38 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -42,9 +42,10 @@ type EventOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` // Mutex that protects accesses to the fields of this event. 
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index ddcc5e09a..59899be49 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -38,8 +38,9 @@ type ReaderWriter struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` *Pipe } diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 207d8ed3d..4e73527cf 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -52,15 +52,16 @@ func (f *fileContext) Value(key interface{}) interface{} { // byteReader implements fs.FileOperations for reading from a []byte source. type byteReader struct { - waiter.AlwaysReady `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoIoctl `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoopRelease `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FilePipeSeek `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` data []byte } diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 520d82f68..31a449cf2 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -212,9 +212,10 @@ type commonEndpoint interface { type SocketOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` 
socket.SendReceiveTimeout *waiter.Queue diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 71884d3db..41f9693bb 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -49,8 +49,9 @@ type socketOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index dc688eb00..afd06ca33 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -68,8 +68,9 @@ type Socket struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index c028ed4dd..55e0b6665 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -48,8 +48,9 @@ type socketOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 26788ec31..931056d51 100644 --- a/pkg/sentry/socket/unix/unix.go +++ 
b/pkg/sentry/socket/unix/unix.go @@ -48,8 +48,9 @@ type SocketOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` refs.AtomicRefCount socket.SendReceiveTimeout diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 6e2843b36..f76989ae2 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -34,6 +34,7 @@ go_library( "sys_shm.go", "sys_signal.go", "sys_socket.go", + "sys_splice.go", "sys_stat.go", "sys_sync.go", "sys_sysinfo.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 9a460ebdf..3e4d312af 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -407,7 +407,7 @@ var AMD64 = &kernel.SyscallTable{ 273: syscalls.Error(syscall.ENOSYS), // @Syscall(GetRobustList, note:Obsolete) 274: syscalls.Error(syscall.ENOSYS), - // 275: @Syscall(Splice), TODO(b/29354098) + 275: Splice, // 276: @Syscall(Tee), TODO(b/29354098) 277: SyncFileRange, // 278: @Syscall(Vmsplice), TODO(b/29354098) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 1764bb4b6..8a80cd430 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -15,7 +15,6 @@ package linux import ( - "io" "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -2025,103 +2024,6 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, nil } -// Sendfile implements linux system call sendfile(2). 
-func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - outFD := kdefs.FD(args[0].Int()) - inFD := kdefs.FD(args[1].Int()) - offsetAddr := args[2].Pointer() - count := int64(args[3].SizeT()) - - // Don't send a negative number of bytes. - if count < 0 { - return 0, nil, syserror.EINVAL - } - - if count > int64(kernel.MAX_RW_COUNT) { - count = int64(kernel.MAX_RW_COUNT) - } - - // Get files. - outFile := t.FDMap().GetFile(outFD) - if outFile == nil { - return 0, nil, syserror.EBADF - } - defer outFile.DecRef() - - inFile := t.FDMap().GetFile(inFD) - if inFile == nil { - return 0, nil, syserror.EBADF - } - defer inFile.DecRef() - - // Verify that the outfile is writable. - outFlags := outFile.Flags() - if !outFlags.Write { - return 0, nil, syserror.EBADF - } - - // Verify that the outfile Append flag is not set. - if outFlags.Append { - return 0, nil, syserror.EINVAL - } - - // Verify that we have a regular infile. - // http://elixir.free-electrons.com/linux/latest/source/fs/splice.c#L933 - if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) { - return 0, nil, syserror.EINVAL - } - - // Verify that the infile is readable. - if !inFile.Flags().Read { - return 0, nil, syserror.EBADF - } - - // Setup for sending data. - var n int64 - var err error - w := &fs.FileWriter{t, outFile} - hasOffset := offsetAddr != 0 - // If we have a provided offset. - if hasOffset { - // Verify that when offset address is not null, infile must be seekable - if !inFile.Flags().Pread { - return 0, nil, syserror.ESPIPE - } - // Copy in the offset. - var offset int64 - if _, err := t.CopyIn(offsetAddr, &offset); err != nil { - return 0, nil, err - } - if offset < 0 { - return 0, nil, syserror.EINVAL - } - // Send data using Preadv. - r := io.NewSectionReader(&fs.FileReader{t, inFile}, offset, count) - n, err = io.Copy(w, r) - // Copy out the new offset. 
- if _, err := t.CopyOut(offsetAddr, n+offset); err != nil { - return 0, nil, err - } - // If we don't have a provided offset. - } else { - // Send data using readv. - inOff := inFile.Offset() - r := &io.LimitedReader{R: &fs.FileReader{t, inFile}, N: count} - n, err = io.Copy(w, r) - inOff += n - if inFile.Offset() != inOff { - // Adjust file position in case more bytes were read than written. - if _, err := inFile.Seek(t, fs.SeekSet, inOff); err != nil { - return 0, nil, syserror.EIO - } - } - } - - // We can only pass a single file to handleIOError, so pick inFile - // arbitrarily. - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile) -} - const ( memfdPrefix = "/memfd:" memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go new file mode 100644 index 000000000..37303606f --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -0,0 +1,293 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// doSplice implements a blocking splice operation. +func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) { + var ( + total int64 + n int64 + err error + ch chan struct{} + inW bool + outW bool + ) + for opts.Length > 0 { + n, err = fs.Splice(t, outFile, inFile, opts) + opts.Length -= n + total += n + if err != syserror.ErrWouldBlock { + break + } else if err == syserror.ErrWouldBlock && nonBlocking { + break + } + + // Are we a registered waiter? + if ch == nil { + ch = make(chan struct{}, 1) + } + if !inW && inFile.Readiness(EventMaskRead) == 0 && !inFile.Flags().NonBlocking { + w, _ := waiter.NewChannelEntry(ch) + inFile.EventRegister(&w, EventMaskRead) + defer inFile.EventUnregister(&w) + inW = true // Registered. + } else if !outW && outFile.Readiness(EventMaskWrite) == 0 && !outFile.Flags().NonBlocking { + w, _ := waiter.NewChannelEntry(ch) + outFile.EventRegister(&w, EventMaskWrite) + defer outFile.EventUnregister(&w) + outW = true // Registered. + } + + // Was anything registered? If no, everything is non-blocking. + if !inW && !outW { + break + } + + // Block until there's data. + if err = t.Block(ch); err != nil { + break + } + } + + return total, err +} + +// Sendfile implements linux system call sendfile(2). +func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + outFD := kdefs.FD(args[0].Int()) + inFD := kdefs.FD(args[1].Int()) + offsetAddr := args[2].Pointer() + count := int64(args[3].SizeT()) + + // Don't send a negative number of bytes. 
+ if count < 0 { + return 0, nil, syserror.EINVAL + } + + // Get files. + outFile := t.FDMap().GetFile(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + inFile := t.FDMap().GetFile(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + + // Verify that the outfile Append flag is not set. Note that fs.Splice + // itself validates that the output file is writable. + if outFile.Flags().Append { + return 0, nil, syserror.EBADF + } + + // Verify that we have a regular infile. This is a requirement; the + // same check appears in Linux (fs/splice.c:splice_direct_to_actor). + if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EINVAL + } + + var ( + n int64 + err error + ) + if offsetAddr != 0 { + // Verify that when offset address is not null, infile must be + // seekable. The fs.Splice routine itself validates basic read. + if !inFile.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Copy in the offset. + var offset int64 + if _, err := t.CopyIn(offsetAddr, &offset); err != nil { + return 0, nil, err + } + + // The offset must be valid. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Do the splice. + n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ + Length: count, + SrcOffset: true, + SrcStart: offset, + }, false) + + // Copy out the new offset. + if _, err := t.CopyOut(offsetAddr, n+offset); err != nil { + return 0, nil, err + } + } else { + // Send data using splice. + n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ + Length: count, + }, false) + } + + // We can only pass a single file to handleIOError, so pick inFile + // arbitrarily. This is used only for debugging purposes. + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile) +} + +// Splice implements splice(2). 
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := kdefs.FD(args[0].Int()) + inOffset := args[1].Pointer() + outFD := kdefs.FD(args[2].Int()) + outOffset := args[3].Pointer() + count := int64(args[4].SizeT()) + flags := args[5].Int() + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Only non-blocking is meaningful. Note that unlike in Linux, this + // flag is applied consistently. We will have either fully blocking or + // non-blocking behavior below, regardless of the underlying files + // being spliced to. It's unclear if this is a bug or not yet. + nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 + + // Get files. + outFile := t.FDMap().GetFile(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + inFile := t.FDMap().GetFile(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + + // Construct our options. + // + // Note that exactly one of the underlying buffers must be a pipe. We + // don't actually have this constraint internally, but we enforce it + // for the semantics of the call. + opts := fs.SpliceOpts{ + Length: count, + } + switch { + case fs.IsPipe(inFile.Dirent.Inode.StableAttr) && !fs.IsPipe(outFile.Dirent.Inode.StableAttr): + if inOffset != 0 { + return 0, nil, syserror.ESPIPE + } + if outOffset != 0 { + var offset int64 + if _, err := t.CopyIn(outOffset, &offset); err != nil { + return 0, nil, err + } + // Use the destination offset. 
+ opts.DstOffset = true + opts.DstStart = offset + } + case !fs.IsPipe(inFile.Dirent.Inode.StableAttr) && fs.IsPipe(outFile.Dirent.Inode.StableAttr): + if outOffset != 0 { + return 0, nil, syserror.ESPIPE + } + if inOffset != 0 { + var offset int64 + if _, err := t.CopyIn(inOffset, &offset); err != nil { + return 0, nil, err + } + // Use the source offset. + opts.SrcOffset = true + opts.SrcStart = offset + } + case fs.IsPipe(inFile.Dirent.Inode.StableAttr) && fs.IsPipe(outFile.Dirent.Inode.StableAttr): + if inOffset != 0 || outOffset != 0 { + return 0, nil, syserror.ESPIPE + } + default: + return 0, nil, syserror.EINVAL + } + + // We may not refer to the same pipe; otherwise it's a continuous loop. + if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID { + return 0, nil, syserror.EINVAL + } + + // Splice data. + n, err := doSplice(t, outFile, inFile, opts, nonBlocking) + + // See above; inFile is chosen arbitrarily here. + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile) +} + +// Tee implements tee(2). +func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := kdefs.FD(args[0].Int()) + outFD := kdefs.FD(args[1].Int()) + count := int64(args[2].SizeT()) + flags := args[3].Int() + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Only non-blocking is meaningful. + nonBlocking := (flags & linux.SPLICE_F_NONBLOCK) != 0 + + // Get files. + outFile := t.FDMap().GetFile(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + inFile := t.FDMap().GetFile(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + + // All files must be pipes.
+ if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EINVAL + } + + // We may not refer to the same pipe; see above. + if inFile.Dirent.Inode.StableAttr.InodeID == outFile.Dirent.Inode.StableAttr.InodeID { + return 0, nil, syserror.EINVAL + } + + // Splice data. + n, err := doSplice(t, outFile, inFile, fs.SpliceOpts{ + Length: count, + Dup: true, + }, nonBlocking) + + // See above; inFile is chosen arbitrarily here. + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "tee", inFile) +} diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 79be06494..b531d7629 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -277,6 +277,8 @@ syscall_test(test = "//test/syscalls/linux:sendfile_socket_test") syscall_test(test = "//test/syscalls/linux:sendfile_test") +syscall_test(test = "//test/syscalls/linux:splice_test") + syscall_test(test = "//test/syscalls/linux:sigaction_test") # TODO(b/119826902): Enable once the test passes in runsc. diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index ee40be569..d4e49bb3a 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1747,6 +1747,22 @@ cc_binary( ], ) +cc_binary( + name = "splice_test", + testonly = 1, + srcs = ["splice.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "sigaction_test", testonly = 1, diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc new file mode 100644 index 000000000..1875f4533 --- /dev/null +++ b/test/syscalls/linux/splice.cc @@ -0,0 +1,404 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/strings/string_view.h" +#include "test/util/file_descriptor.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +TEST(SpliceTest, TwoRegularFiles) { + // Create temp files. + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + + // Open the input file as read only. + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY)); + + // Open the output file as write only. + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY)); + + // Verify that it is rejected as expected; regardless of offsets. + loff_t in_offset = 0; + loff_t out_offset = 0; + EXPECT_THAT(splice(inf.get(), &in_offset, outf.get(), &out_offset, 1, 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(splice(inf.get(), nullptr, outf.get(), &out_offset, 1, 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(splice(inf.get(), &in_offset, outf.get(), nullptr, 1, 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(splice(inf.get(), nullptr, outf.get(), nullptr, 1, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(SpliceTest, SamePipe) { + // Create a new pipe. 
+ int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Fill the pipe. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Attempt to splice to itself. + EXPECT_THAT(splice(rfd.get(), nullptr, wfd.get(), nullptr, kPageSize, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(TeeTest, SamePipe) { + SKIP_IF(IsRunningOnGvisor()); + + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Fill the pipe. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Attempt to tee to itself. + EXPECT_THAT(tee(rfd.get(), wfd.get(), kPageSize, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(TeeTest, RegularFile) { + SKIP_IF(IsRunningOnGvisor()); + + // Open some file. + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR)); + + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Attempt to tee from the file. + EXPECT_THAT(tee(inf.get(), wfd.get(), kPageSize, 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(tee(rfd.get(), inf.get(), kPageSize, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(SpliceTest, PipeOffsets) { + // Create two new pipes. 
+ int first[2], second[2]; + ASSERT_THAT(pipe(first), SyscallSucceeds()); + const FileDescriptor rfd1(first[0]); + const FileDescriptor wfd1(first[1]); + ASSERT_THAT(pipe(second), SyscallSucceeds()); + const FileDescriptor rfd2(second[0]); + const FileDescriptor wfd2(second[1]); + + // All pipe offsets should be rejected. + loff_t in_offset = 0; + loff_t out_offset = 0; + EXPECT_THAT(splice(rfd1.get(), &in_offset, wfd2.get(), &out_offset, 1, 0), + SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(splice(rfd1.get(), nullptr, wfd2.get(), &out_offset, 1, 0), + SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(splice(rfd1.get(), &in_offset, wfd2.get(), nullptr, 1, 0), + SyscallFailsWithErrno(ESPIPE)); +} + +TEST(SpliceTest, ToPipe) { + // Open the input file. + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR)); + + // Fill with some random data. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(inf.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + ASSERT_THAT(lseek(inf.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Splice to the pipe. + EXPECT_THAT(splice(inf.get(), nullptr, wfd.get(), nullptr, kPageSize, 0), + SyscallSucceedsWithValue(kPageSize)); + + // Contents should be equal. + std::vector rbuf(kPageSize); + ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0); +} + +TEST(SpliceTest, ToPipeOffset) { + // Open the input file. + const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor inf = + ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR)); + + // Fill with some random data. 
+ std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(inf.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Splice to the pipe. + loff_t in_offset = kPageSize / 2; + EXPECT_THAT( + splice(inf.get(), &in_offset, wfd.get(), nullptr, kPageSize / 2, 0), + SyscallSucceedsWithValue(kPageSize / 2)); + + // Contents should be equal to only the second part. + std::vector rbuf(kPageSize / 2); + ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kPageSize / 2)); + EXPECT_EQ(memcmp(rbuf.data(), buf.data() + (kPageSize / 2), rbuf.size()), 0); +} + +TEST(SpliceTest, FromPipe) { + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Fill with some random data. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Open the input file. + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR)); + + // Splice to the output file. + EXPECT_THAT(splice(rfd.get(), nullptr, outf.get(), nullptr, kPageSize, 0), + SyscallSucceedsWithValue(kPageSize)); + + // The offset of the output should be equal to kPageSize. We assert that and + // reset to zero so that we can read the contents and ensure they match. + EXPECT_THAT(lseek(outf.get(), 0, SEEK_CUR), + SyscallSucceedsWithValue(kPageSize)); + ASSERT_THAT(lseek(outf.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0)); + + // Contents should be equal. 
+ std::vector rbuf(kPageSize); + ASSERT_THAT(read(outf.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0); +} + +TEST(SpliceTest, FromPipeOffset) { + // Create a new pipe. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor rfd(fds[0]); + const FileDescriptor wfd(fds[1]); + + // Fill with some random data. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Open the input file. + const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor outf = + ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR)); + + // Splice to the output file. + loff_t out_offset = kPageSize / 2; + EXPECT_THAT(splice(rfd.get(), nullptr, outf.get(), &out_offset, kPageSize, 0), + SyscallSucceedsWithValue(kPageSize)); + + // Content should reflect the splice. We write to a specific offset in the + // file, so the internals should now be allocated sparsely. + std::vector rbuf(kPageSize); + ASSERT_THAT(read(outf.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kPageSize)); + std::vector zbuf(kPageSize / 2); + memset(zbuf.data(), 0, zbuf.size()); + EXPECT_EQ(memcmp(rbuf.data(), zbuf.data(), zbuf.size()), 0); + EXPECT_EQ(memcmp(rbuf.data() + kPageSize / 2, buf.data(), kPageSize / 2), 0); +} + +TEST(SpliceTest, TwoPipes) { + // Create two new pipes. + int first[2], second[2]; + ASSERT_THAT(pipe(first), SyscallSucceeds()); + const FileDescriptor rfd1(first[0]); + const FileDescriptor wfd1(first[1]); + ASSERT_THAT(pipe(second), SyscallSucceeds()); + const FileDescriptor rfd2(second[0]); + const FileDescriptor wfd2(second[1]); + + // Fill with some random data. 
+ std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(wfd1.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Splice to the second pipe, using two operations. + EXPECT_THAT( + splice(rfd1.get(), nullptr, wfd2.get(), nullptr, kPageSize / 2, 0), + SyscallSucceedsWithValue(kPageSize / 2)); + EXPECT_THAT( + splice(rfd1.get(), nullptr, wfd2.get(), nullptr, kPageSize / 2, 0), + SyscallSucceedsWithValue(kPageSize / 2)); + + // Content should reflect the splice. + std::vector rbuf(kPageSize); + ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0); +} + +TEST(SpliceTest, Blocking) { + // Create two new pipes. + int first[2], second[2]; + ASSERT_THAT(pipe(first), SyscallSucceeds()); + const FileDescriptor rfd1(first[0]); + const FileDescriptor wfd1(first[1]); + ASSERT_THAT(pipe(second), SyscallSucceeds()); + const FileDescriptor rfd2(second[0]); + const FileDescriptor wfd2(second[1]); + + // This thread writes to the main pipe. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ScopedThread t([&]() { + ASSERT_THAT(write(wfd1.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + }); + + // Attempt a splice immediately; it should block. + EXPECT_THAT(splice(rfd1.get(), nullptr, wfd2.get(), nullptr, kPageSize, 0), + SyscallSucceedsWithValue(kPageSize)); + + // Thread should be joinable. + t.Join(); + + // Content should reflect the splice. + std::vector rbuf(kPageSize); + ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0); +} + +TEST(TeeTest, Blocking) { + SKIP_IF(IsRunningOnGvisor()); + + // Create two new pipes. 
+ int first[2], second[2]; + ASSERT_THAT(pipe(first), SyscallSucceeds()); + const FileDescriptor rfd1(first[0]); + const FileDescriptor wfd1(first[1]); + ASSERT_THAT(pipe(second), SyscallSucceeds()); + const FileDescriptor rfd2(second[0]); + const FileDescriptor wfd2(second[1]); + + // This thread writes to the main pipe. + std::vector buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ScopedThread t([&]() { + ASSERT_THAT(write(wfd1.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + }); + + // Attempt a tee immediately; it should block. + EXPECT_THAT(tee(rfd1.get(), wfd2.get(), kPageSize, 0), + SyscallSucceedsWithValue(kPageSize)); + + // Thread should be joinable. + t.Join(); + + // Content should reflect the splice, in both pipes. + std::vector rbuf(kPageSize); + ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0); + ASSERT_THAT(read(rfd1.get(), rbuf.data(), rbuf.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0); +} + +TEST(SpliceTest, NonBlocking) { + // Create two new pipes. + int first[2], second[2]; + ASSERT_THAT(pipe(first), SyscallSucceeds()); + const FileDescriptor rfd1(first[0]); + const FileDescriptor wfd1(first[1]); + ASSERT_THAT(pipe(second), SyscallSucceeds()); + const FileDescriptor rfd2(second[0]); + const FileDescriptor wfd2(second[1]); + + // Splice with no data to back it. + EXPECT_THAT(splice(rfd1.get(), nullptr, wfd2.get(), nullptr, kPageSize, + SPLICE_F_NONBLOCK), + SyscallFailsWithErrno(EAGAIN)); +} + +TEST(TeeTest, NonBlocking) { + SKIP_IF(IsRunningOnGvisor()); + + // Create two new pipes. 
+ int first[2], second[2]; + ASSERT_THAT(pipe(first), SyscallSucceeds()); + const FileDescriptor rfd1(first[0]); + const FileDescriptor wfd1(first[1]); + ASSERT_THAT(pipe(second), SyscallSucceeds()); + const FileDescriptor rfd2(second[0]); + const FileDescriptor wfd2(second[1]); + + // Splice with no data to back it. + EXPECT_THAT(tee(rfd1.get(), wfd2.get(), kPageSize, SPLICE_F_NONBLOCK), + SyscallFailsWithErrno(EAGAIN)); +} + +} // namespace + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From c8857f72696c1097a427b75f4340969e20cc0e95 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 21 May 2019 17:04:58 -0700 Subject: Fix inconsistencies in ELF anonymous mappings * A segment with filesz == 0, memsz > 0 should be an anonymous only mapping. We were failing to load such an ELF. * Anonymous pages are always mapped RW, regardless of the segment protections. PiperOrigin-RevId: 249355239 Change-Id: I251e5c0ce8848cf8420c3aadf337b0d77b1ad991 --- pkg/sentry/loader/elf.go | 118 +++++++++++++++++--------------- test/syscalls/linux/exec_binary.cc | 135 +++++++++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 56 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 97e32c8ba..900236531 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -223,13 +223,8 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { // mapSegment maps a phdr into the Task. offset is the offset to apply to // phdr.Vaddr. func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error { - // Alignment of vaddr and offset must match. We'll need to map on the - // page boundary. + // We must make a page-aligned mapping. 
adjust := usermem.Addr(phdr.Vaddr).PageOffset() - if adjust != usermem.Addr(phdr.Off).PageOffset() { - ctx.Infof("Alignment of vaddr %#x != off %#x", phdr.Vaddr, phdr.Off) - return syserror.ENOEXEC - } addr, ok := offset.AddLength(phdr.Vaddr) if !ok { @@ -239,17 +234,11 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf. } addr -= usermem.Addr(adjust) - fileOffset := phdr.Off - adjust fileSize := phdr.Filesz + adjust if fileSize < phdr.Filesz { ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust) return syserror.ENOEXEC } - memSize := phdr.Memsz + adjust - if memSize < phdr.Memsz { - ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust) - return syserror.ENOEXEC - } ms, ok := usermem.Addr(fileSize).RoundUp() if !ok { ctx.Infof("fileSize %#x too large", fileSize) @@ -257,51 +246,64 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf. } mapSize := uint64(ms) - prot := progFlagsAsPerms(phdr.Flags) - mopts := memmap.MMapOpts{ - Length: mapSize, - Offset: fileOffset, - Addr: addr, - Fixed: true, - // Linux will happily allow conflicting segments to map over - // one another. - Unmap: true, - Private: true, - Perms: prot, - MaxPerms: usermem.AnyAccess, - } - defer func() { - if mopts.MappingIdentity != nil { - mopts.MappingIdentity.DecRef() - } - }() - if err := f.ConfigureMMap(ctx, &mopts); err != nil { - ctx.Infof("File is not memory-mappable: %v", err) - return err - } - if _, err := m.MMap(ctx, mopts); err != nil { - ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err) - return err - } - - // We need to clear the end of the last page that exceeds fileSize so - // we don't map part of the file beyond fileSize. - // - // Note that Linux *does not* clear the portion of the first page - // before phdr.Off. 
- if mapSize > fileSize { - zeroAddr, ok := addr.AddLength(fileSize) - if !ok { - panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize)) + if mapSize > 0 { + // This must result in a page-aligned offset. i.e., the original + // phdr.Off must have the same alignment as phdr.Vaddr. If that is not + // true, MMap will reject the mapping. + fileOffset := phdr.Off - adjust + + prot := progFlagsAsPerms(phdr.Flags) + mopts := memmap.MMapOpts{ + Length: mapSize, + Offset: fileOffset, + Addr: addr, + Fixed: true, + // Linux will happily allow conflicting segments to map over + // one another. + Unmap: true, + Private: true, + Perms: prot, + MaxPerms: usermem.AnyAccess, } - zeroSize := int64(mapSize - fileSize) - if zeroSize < 0 { - panic(fmt.Sprintf("zeroSize too big? %#x", uint64(zeroSize))) + defer func() { + if mopts.MappingIdentity != nil { + mopts.MappingIdentity.DecRef() + } + }() + if err := f.ConfigureMMap(ctx, &mopts); err != nil { + ctx.Infof("File is not memory-mappable: %v", err) + return err } - if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil { - ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+usermem.Addr(zeroSize), err) + if _, err := m.MMap(ctx, mopts); err != nil { + ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err) return err } + + // We need to clear the end of the last page that exceeds fileSize so + // we don't map part of the file beyond fileSize. + // + // Note that Linux *does not* clear the portion of the first page + // before phdr.Off. + if mapSize > fileSize { + zeroAddr, ok := addr.AddLength(fileSize) + if !ok { + panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize)) + } + zeroSize := int64(mapSize - fileSize) + if zeroSize < 0 { + panic(fmt.Sprintf("zeroSize too big? 
%#x", uint64(zeroSize))) + } + if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil { + ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+usermem.Addr(zeroSize), err) + return err + } + } + } + + memSize := phdr.Memsz + adjust + if memSize < phdr.Memsz { + ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust) + return syserror.ENOEXEC } // Allocate more anonymous pages if necessary. @@ -321,9 +323,13 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf. Addr: anonAddr, // Fixed without Unmap will fail the mmap if something is // already at addr. - Fixed: true, - Private: true, - Perms: progFlagsAsPerms(phdr.Flags), + Fixed: true, + Private: true, + // N.B. Linux uses vm_brk to map these pages, ignoring + // the segment protections, instead always mapping RW. + // These pages are not included in the final brk + // region. + Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, }); err != nil { ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err) diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc index bdd6eb10b..91b55015c 100644 --- a/test/syscalls/linux/exec_binary.cc +++ b/test/syscalls/linux/exec_binary.cc @@ -401,6 +401,141 @@ TEST(ElfTest, DataSegment) { }))); } +// Additonal pages beyond filesz are always RW. +// +// N.B. Linux uses set_brk -> vm_brk to additional pages beyond filesz (even +// though start_brk itself will always be beyond memsz). As a result, the +// segment permissions don't apply; the mapping is always RW. +TEST(ElfTest, ExtraMemPages) { + ElfBinary<64> elf = StandardElf(); + + // Create a standard ELF, but extend to 1.5 pages. The second page will be the + // beginning of a multi-page data + bss segment. + elf.data.resize(kPageSize + kPageSize / 2); + + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_LOAD; + // RWX segment. 
The extra anon page will be RW anyways. + // + // N.B. Linux uses clear_user to clear the end of the file-mapped page, which + // respects the mapping protections. Thus if we map this RO with memsz > + // (unaligned) filesz, then execve will fail with EFAULT. See padzero(elf_bss) + // in fs/binfmt_elf.c:load_elf_binary. + // + // N.N.B.B. The above only applies to the last segment. For earlier segments, + // the clear_user error is ignored. + phdr.p_flags = PF_R | PF_W | PF_X; + phdr.p_offset = kPageSize; + phdr.p_vaddr = 0x41000; + phdr.p_filesz = kPageSize / 2; + // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a + // bit less than 2 pages so this mapping doesn't extend beyond 0x43000. + phdr.p_memsz = 2 * kPageSize - kPageSize / 2; + elf.phdrs.push_back(phdr); + + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + EXPECT_THAT(child, + ContainsMappings(std::vector({ + // text page. + {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0, + file.path().c_str()}, + // data + bss page from file. + {0x41000, 0x42000, true, true, true, true, kPageSize, 0, 0, 0, + file.path().c_str()}, + // extra page from anon. + {0x42000, 0x43000, true, true, false, true, 0, 0, 0, 0, ""}, + }))); +} + +// An aligned segment with filesz == 0, memsz > 0 is anon-only. +TEST(ElfTest, AnonOnlySegment) { + ElfBinary<64> elf = StandardElf(); + + decltype(elf)::ElfPhdr phdr = {}; + phdr.p_type = PT_LOAD; + // RO segment. The extra anon page will be RW anyways. 
+ phdr.p_flags = PF_R; + phdr.p_offset = 0; + phdr.p_vaddr = 0x41000; + phdr.p_filesz = 0; + phdr.p_memsz = kPageSize - 0xe8; + elf.phdrs.push_back(phdr); + + elf.UpdateOffsets(); + + // UpdateOffsets adjusts p_vaddr and p_offset by the header size, but we need + // a page-aligned p_vaddr to get a truly anon-only page. + elf.phdrs[2].p_vaddr = 0x41000; + // N.B. p_offset is now unaligned, but Linux doesn't care since this is + // anon-only. + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + ASSERT_EQ(execve_errno, 0); + + ASSERT_NO_ERRNO(WaitStopped(child)); + + EXPECT_THAT(child, + ContainsMappings(std::vector({ + // text page. + {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0, + file.path().c_str()}, + // anon page. + {0x41000, 0x42000, true, true, false, true, 0, 0, 0, 0, ""}, + }))); +} + +// p_offset must have the same alignment as p_vaddr. +TEST(ElfTest, UnalignedOffset) { + ElfBinary<64> elf = StandardElf(); + + // Unaligned offset. + elf.phdrs[1].p_offset += 1; + + elf.UpdateOffsets(); + + TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf)); + + pid_t child; + int execve_errno; + auto cleanup = ASSERT_NO_ERRNO_AND_VALUE( + ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno)); + + // execve(2) return EINVAL, but behavior varies between Linux and gVisor. + // + // On Linux, the new mm is committed before attempting to map into it. By the + // time we hit EINVAL in the segment mmap, the old mm is gone. Linux returns + // to an empty mm, which immediately segfaults. + // + // OTOH, gVisor maps into the new mm before committing it. Thus when it hits + // failure, the caller is still intact to receive the error. 
+ if (IsRunningOnGvisor()) { + ASSERT_EQ(execve_errno, EINVAL); + } else { + ASSERT_EQ(execve_errno, 0); + + int status; + ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), + SyscallSucceedsWithValue(child)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) << status; + } +} + // Linux will allow PT_LOAD segments to overlap. TEST(ElfTest, DirectlyOverlappingSegments) { // NOTE(b/37289926): see PIEOutOfOrderSegments. -- cgit v1.2.3 From ae1bb08871758844fa23fc7255ffeb0392f9dee6 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 21 May 2019 20:11:26 -0700 Subject: Clean up pipe internals and add fcntl support Pipe internals are made more efficient by avoiding garbage collection. A pool is now used that can be shared by all pipes, and buffers are chained via an intrusive list. The documentation for pipe structures and methods is also simplified and clarified. The pipe tests are now parameterized, so that they are run on all different variants (named pipes, small buffers, default buffers). The pipe buffer sizes are exposed by fcntl, which is now supported by this change. A size change test has been added to the suite. These new tests uncovered a bug regarding the semantics of open named pipes with O_NONBLOCK, which is also fixed by this CL. This fix also addresses the lack of the O_LARGEFILE flag for named pipes. 
PiperOrigin-RevId: 249375888 Change-Id: I48e61e9c868aedb0cadda2dff33f09a560dee773 --- pkg/abi/linux/fcntl.go | 4 +- pkg/sentry/kernel/pipe/BUILD | 8 +- pkg/sentry/kernel/pipe/buffer.go | 90 ++++ pkg/sentry/kernel/pipe/buffer_test.go | 32 ++ pkg/sentry/kernel/pipe/buffers.go | 48 -- pkg/sentry/kernel/pipe/node.go | 7 +- pkg/sentry/kernel/pipe/pipe.go | 315 ++++++++----- pkg/sentry/kernel/pipe/pipe_test.go | 7 +- pkg/sentry/kernel/pipe/reader.go | 3 + pkg/sentry/kernel/pipe/reader_writer.go | 6 +- pkg/sentry/kernel/pipe/writer.go | 3 + pkg/sentry/syscalls/linux/sys_file.go | 14 + test/syscalls/BUILD | 6 +- test/syscalls/linux/BUILD | 2 + test/syscalls/linux/pipe.cc | 761 +++++++++++++++++++------------- 15 files changed, 828 insertions(+), 478 deletions(-) create mode 100644 pkg/sentry/kernel/pipe/buffer.go create mode 100644 pkg/sentry/kernel/pipe/buffer_test.go delete mode 100644 pkg/sentry/kernel/pipe/buffers.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go index cc8f2702d..b30350193 100644 --- a/pkg/abi/linux/fcntl.go +++ b/pkg/abi/linux/fcntl.go @@ -17,7 +17,6 @@ package linux // Comands from linux/fcntl.h. const ( F_DUPFD = 0 - F_DUPFD_CLOEXEC = 1030 F_GETFD = 1 F_GETFL = 3 F_GETOWN = 9 @@ -26,6 +25,9 @@ const ( F_SETLK = 6 F_SETLKW = 7 F_SETOWN = 8 + F_DUPFD_CLOEXEC = 1024 + 6 + F_SETPIPE_SZ = 1024 + 7 + F_GETPIPE_SZ = 1024 + 8 ) // Flags for fcntl. 
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 6b23117d9..b07d15a2a 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -10,16 +10,16 @@ go_template_instance( prefix = "buffer", template = "//pkg/ilist:generic_list", types = { - "Element": "*Buffer", - "Linker": "*Buffer", + "Element": "*buffer", + "Linker": "*buffer", }, ) go_library( name = "pipe", srcs = [ + "buffer.go", "buffer_list.go", - "buffers.go", "device.go", "node.go", "pipe.go", @@ -37,6 +37,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/safemem", "//pkg/sentry/usermem", "//pkg/syserror", "//pkg/waiter", @@ -47,6 +48,7 @@ go_test( name = "pipe_test", size = "small", srcs = [ + "buffer_test.go", "node_test.go", "pipe_test.go", ], diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go new file mode 100644 index 000000000..4360dc44f --- /dev/null +++ b/pkg/sentry/kernel/pipe/buffer.go @@ -0,0 +1,90 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" +) + +// buffer encapsulates a queueable byte buffer. +// +// Note that the total size is slightly less than two pages. This +// is done intentionally to ensure that the buffer object aligns +// with runtime internals. We have no hard size or alignment +// requirements. 
This two page size will effectively minimize +// internal fragmentation, but still have a large enough chunk +// to limit excessive segmentation. +// +// +stateify savable +type buffer struct { + data [8144]byte + read int + write int + bufferEntry +} + +// Reset resets internal data. +// +// This must be called before use. +func (b *buffer) Reset() { + b.read = 0 + b.write = 0 +} + +// Empty indicates the buffer is empty. +// +// This indicates there is no data left to read. +func (b *buffer) Empty() bool { + return b.read == b.write +} + +// Full indicates the buffer is full. +// +// This indicates there is no capacity left to write. +func (b *buffer) Full() bool { + return b.write == len(b.data) +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:])) + n, err := safemem.CopySeq(dst, srcs) + b.write += int(n) + return n, err +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write])) + n, err := safemem.CopySeq(dsts, src) + b.read += int(n) + return n, err +} + +// bufferPool is a pool for buffers. +var bufferPool = sync.Pool{ + New: func() interface{} { + return new(buffer) + }, +} + +// newBuffer grabs a new buffer from the pool. +func newBuffer() *buffer { + b := bufferPool.Get().(*buffer) + b.Reset() + return b +} diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/pipe/buffer_test.go new file mode 100644 index 000000000..4b7dbc43f --- /dev/null +++ b/pkg/sentry/kernel/pipe/buffer_test.go @@ -0,0 +1,32 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "testing" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestBufferSize(t *testing.T) { + bufferSize := unsafe.Sizeof(buffer{}) + if bufferSize < usermem.PageSize { + t.Errorf("buffer is less than a page") + } + if bufferSize > (2 * usermem.PageSize) { + t.Errorf("buffer is greater than two pages") + } +} diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go deleted file mode 100644 index ba53fd482..000000000 --- a/pkg/sentry/kernel/pipe/buffers.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pipe - -// Buffer encapsulates a queueable byte buffer that can -// easily be truncated. It is designed only for use with pipes. -// -// +stateify savable -type Buffer struct { - bufferEntry - data []byte -} - -// newBuffer initializes a Buffer. -func newBuffer(buf []byte) *Buffer { - return &Buffer{data: buf} -} - -// bytes returns the bytes contained in the buffer. 
-func (b *Buffer) bytes() []byte { - return b.data -} - -// size returns the number of bytes contained in the buffer. -func (b *Buffer) size() int { - return len(b.data) -} - -// truncate removes the first n bytes from the buffer. -func (b *Buffer) truncate(n int) int { - if n > len(b.data) { - panic("Trying to truncate past end of array.") - } - b.data = b.data[n:] - return len(b.data) -} diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 7c3739360..926c4c623 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -67,7 +67,6 @@ func NewInodeOperations(ctx context.Context, perms fs.FilePermissions, p *Pipe) InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), perms, linux.PIPEFS_MAGIC), p: p, } - } // GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking @@ -87,7 +86,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi switch { case flags.Read && !flags.Write: // O_RDONLY. - r := i.p.ROpen(ctx) + r := i.p.Open(ctx, flags) i.newHandleLocked(&i.rWakeup) if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { @@ -103,7 +102,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi return r, nil case flags.Write && !flags.Read: // O_WRONLY. - w := i.p.WOpen(ctx) + w := i.p.Open(ctx, flags) i.newHandleLocked(&i.wWakeup) if i.p.isNamed && !i.p.HasReaders() { @@ -123,7 +122,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi case flags.Read && flags.Write: // O_RDWR. // Pipes opened for read-write always succeeds without blocking. 
- rw := i.p.RWOpen(ctx) + rw := i.p.Open(ctx, flags) i.newHandleLocked(&i.rWakeup) i.newHandleLocked(&i.wWakeup) return rw, nil diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index bd7649d2f..b65204492 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -12,11 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package pipe provides an in-memory implementation of a unidirectional -// pipe. -// -// The goal of this pipe is to emulate the pipe syscall in all of its -// edge cases and guarantees of atomic IO. +// Package pipe provides a pipe implementation. package pipe import ( @@ -32,8 +28,29 @@ import ( "gvisor.googlesource.com/gvisor/pkg/waiter" ) -// DefaultPipeSize is the system-wide default size of a pipe in bytes. -const DefaultPipeSize = 65536 +const ( + // MinimumPipeSize is a hard limit of the minimum size of a pipe. + MinimumPipeSize = 64 << 10 + + // DefaultPipeSize is the system-wide default size of a pipe in bytes. + DefaultPipeSize = MinimumPipeSize + + // MaximumPipeSize is a hard limit on the maximum size of a pipe. + MaximumPipeSize = 8 << 20 +) + +// Sizer is an interface for setting and getting the size of a pipe. +// +// It is implemented by Pipe and, through embedding, all other types. +type Sizer interface { + // PipeSize returns the pipe capacity in bytes. + PipeSize() int64 + + // SetPipeSize sets the new pipe capacity in bytes. + // + // The new size is returned (which may be capped). + SetPipeSize(int64) (int64, error) +} // Pipe is an encapsulation of a platform-independent pipe. // It manages a buffered byte queue shared between a reader/writer @@ -43,49 +60,76 @@ const DefaultPipeSize = 65536 type Pipe struct { waiter.Queue `state:"nosave"` - // Whether this is a named or anonymous pipe. + // isNamed indicates whether this is a named pipe. + // + // This value is immutable. 
isNamed bool + // atomicIOBytes is the maximum number of bytes that the pipe will + // guarantee atomic reads or writes atomically. + // + // This value is immutable. + atomicIOBytes int64 + // The dirent backing this pipe. Shared by all readers and writers. + // + // This value is immutable. Dirent *fs.Dirent - // The buffered byte queue. - data bufferList + // The number of active readers for this pipe. + // + // Access atomically. + readers int32 - // Max size of the pipe in bytes. When this max has been reached, - // writers will get EWOULDBLOCK. - max int + // The number of active writes for this pipe. + // + // Access atomically. + writers int32 - // Current size of the pipe in bytes. - size int + // mu protects all pipe internal state below. + mu sync.Mutex `state:"nosave"` - // Max number of bytes the pipe can guarantee to read or write - // atomically. - atomicIOBytes int + // data is the buffer queue of pipe contents. + // + // This is protected by mu. + data bufferList - // The number of active readers for this pipe. Load/store atomically. - readers int32 + // max is the maximum size of the pipe in bytes. When this max has been + // reached, writers will get EWOULDBLOCK. + // + // This is protected by mu. + max int64 - // The number of active writes for this pipe. Load/store atomically. - writers int32 + // size is the current size of the pipe in bytes. + // + // This is protected by mu. + size int64 - // This flag indicates if this pipe ever had a writer. Note that this does - // not necessarily indicate there is *currently* a writer, just that there - // has been a writer at some point since the pipe was created. + // hadWriter indicates if this pipe ever had a writer. Note that this + // does not necessarily indicate there is *currently* a writer, just + // that there has been a writer at some point since the pipe was + // created. // - // Protected by mu. + // This is protected by mu. hadWriter bool - - // Lock protecting all pipe internal state. 
- mu sync.Mutex `state:"nosave"` } -// NewPipe initializes and returns a pipe. A pipe created by this function is -// persistent, and will remain valid even without any open fds to it. Named -// pipes for mknod(2) are created via this function. Note that the -// implementation of blocking semantics for opening the read and write ends of a -// named pipe are left to filesystems. -func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *Pipe { +// NewPipe initializes and returns a pipe. +// +// N.B. The size and atomicIOBytes will be bounded. +func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { + if sizeBytes < MinimumPipeSize { + sizeBytes = MinimumPipeSize + } + if sizeBytes > MaximumPipeSize { + sizeBytes = MaximumPipeSize + } + if atomicIOBytes <= 0 { + atomicIOBytes = 1 + } + if atomicIOBytes > sizeBytes { + atomicIOBytes = sizeBytes + } p := &Pipe{ isNamed: isNamed, max: sizeBytes, @@ -110,48 +154,45 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *P return p } -// NewConnectedPipe initializes a pipe and returns a pair of objects (which -// implement kio.File) representing the read and write ends of the pipe. A pipe -// created by this function becomes invalid as soon as either the read or write -// end is closed, and errors on subsequent operations on either end. Pipes -// for pipe(2) and pipe2(2) are generally created this way. -func NewConnectedPipe(ctx context.Context, sizeBytes int, atomicIOBytes int) (*fs.File, *fs.File) { +// NewConnectedPipe initializes a pipe and returns a pair of objects +// representing the read and write ends of the pipe. +func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) - return p.ROpen(ctx), p.WOpen(ctx) -} - -// ROpen opens the pipe for reading. 
-func (p *Pipe) ROpen(ctx context.Context) *fs.File { - p.rOpen() - return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Read: true}, &Reader{ - ReaderWriter: ReaderWriter{Pipe: p}, - }) -} - -// WOpen opens the pipe for writing. -func (p *Pipe) WOpen(ctx context.Context) *fs.File { - p.wOpen() - return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Write: true}, &Writer{ - ReaderWriter: ReaderWriter{Pipe: p}, - }) + return p.Open(ctx, fs.FileFlags{Read: true}), p.Open(ctx, fs.FileFlags{Write: true}) } -// RWOpen opens the pipe for both reading and writing. -func (p *Pipe) RWOpen(ctx context.Context) *fs.File { - p.rOpen() - p.wOpen() - return fs.NewFile(ctx, p.Dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{ - Pipe: p, - }) +// Open opens the pipe and returns a new file. +// +// Precondition: at least one of flags.Read or flags.Write must be set. +func (p *Pipe) Open(ctx context.Context, flags fs.FileFlags) *fs.File { + switch { + case flags.Read && flags.Write: + p.rOpen() + p.wOpen() + return fs.NewFile(ctx, p.Dirent, flags, &ReaderWriter{ + Pipe: p, + }) + case flags.Read: + p.rOpen() + return fs.NewFile(ctx, p.Dirent, flags, &Reader{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) + case flags.Write: + p.wOpen() + return fs.NewFile(ctx, p.Dirent, flags, &Writer{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) + default: + // Precondition violated. + panic("invalid pipe flags") + } } // read reads data from the pipe into dst and returns the number of bytes // read, or returns ErrWouldBlock if the pipe is empty. +// +// Precondition: this pipe must have readers. func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) { - if !p.HasReaders() { - return 0, syscall.EBADF - } - // Don't block for a zero-length read even if the pipe is empty. 
if dst.NumBytes() == 0 { return 0, nil @@ -159,8 +200,8 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) p.mu.Lock() defer p.mu.Unlock() - // If there is nothing to read at the moment but there is a writer, tell the - // caller to block. + + // Is the pipe empty? if p.size == 0 { if !p.HasWriters() { // There are no writers, return EOF. @@ -168,64 +209,94 @@ func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) } return 0, syserror.ErrWouldBlock } - var n int64 - for buffer := p.data.Front(); buffer != nil; buffer = p.data.Front() { - n0, err := dst.CopyOut(ctx, buffer.bytes()) - n += int64(n0) - p.size -= n0 - if buffer.truncate(n0) == 0 { - p.data.Remove(buffer) + + // Limit how much we consume. + if dst.NumBytes() > p.size { + dst = dst.TakeFirst64(p.size) + } + + done := int64(0) + for dst.NumBytes() > 0 { + // Pop the first buffer. + first := p.data.Front() + if first == nil { + break } - dst = dst.DropFirst(n0) - if dst.NumBytes() == 0 || err != nil { - return n, err + + // Copy user data. + n, err := dst.CopyOutFrom(ctx, first) + done += int64(n) + p.size -= n + dst = dst.DropFirst64(n) + + // Empty buffer? + if first.Empty() { + // Push to the free list. + p.data.Remove(first) + bufferPool.Put(first) + } + + // Handle errors. + if err != nil { + return done, err } } - return n, nil + + return done, nil } // write writes data from sv into the pipe and returns the number of bytes // written. If no bytes are written because the pipe is full (or has less than // atomicIOBytes free capacity), write returns ErrWouldBlock. +// +// Precondition: this pipe must have writers. func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) { p.mu.Lock() defer p.mu.Unlock() - if !p.HasWriters() { - return 0, syscall.EBADF - } + // Can't write to a pipe with no readers. 
if !p.HasReaders() { return 0, syscall.EPIPE } // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be - // atomic, but requires no atomicity for writes larger than this. However, - // Linux appears to provide stronger semantics than this in practice: - // unmerged writes are done one PAGE_SIZE buffer at a time, so for larger - // writes, the writing of each PIPE_BUF-sized chunk is atomic. We implement - // this by writing at most atomicIOBytes at a time if we can't service the - // write in its entirety. - canWrite := src.NumBytes() - if canWrite > int64(p.max-p.size) { - if p.max-p.size >= p.atomicIOBytes { - canWrite = int64(p.atomicIOBytes) - } else { + // atomic, but requires no atomicity for writes larger than this. + wanted := src.NumBytes() + if avail := p.max - p.size; wanted > avail { + if wanted <= p.atomicIOBytes { return 0, syserror.ErrWouldBlock } + // Limit to the available capacity. + src = src.TakeFirst64(avail) } - // Copy data from user memory into a pipe-owned buffer. - buf := make([]byte, canWrite) - n, err := src.CopyIn(ctx, buf) - if n > 0 { - p.data.PushBack(newBuffer(buf[:n])) + done := int64(0) + for src.NumBytes() > 0 { + // Need a new buffer? + last := p.data.Back() + if last == nil || last.Full() { + // Add a new buffer to the data list. + last = newBuffer() + p.data.PushBack(last) + } + + // Copy user data. + n, err := src.CopyInTo(ctx, last) + done += int64(n) p.size += n + src = src.DropFirst64(n) + + // Handle errors. + if err != nil { + return done, err + } } - if int64(n) < src.NumBytes() && err == nil { + if wanted > done { // Partial write due to full pipe. - err = syserror.ErrWouldBlock + return done, syserror.ErrWouldBlock } - return int64(n), err + + return done, nil } // rOpen signals a new reader of the pipe. @@ -267,6 +338,9 @@ func (p *Pipe) HasWriters() bool { return atomic.LoadInt32(&p.writers) > 0 } +// rReadinessLocked calculates the read readiness. +// +// Precondition: mu must be held. 
func (p *Pipe) rReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) if p.HasReaders() && p.data.Front() != nil { @@ -290,6 +364,9 @@ func (p *Pipe) rReadiness() waiter.EventMask { return p.rReadinessLocked() } +// wReadinessLocked calculates the write readiness. +// +// Precondition: mu must be held. func (p *Pipe) wReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) if p.HasWriters() && p.size < p.max { @@ -317,8 +394,36 @@ func (p *Pipe) rwReadiness() waiter.EventMask { return p.rReadinessLocked() | p.wReadinessLocked() } -func (p *Pipe) queuedSize() int { +// queued returns the amount of queued data. +func (p *Pipe) queued() int64 { p.mu.Lock() defer p.mu.Unlock() return p.size } + +// PipeSize implements PipeSizer.PipeSize. +func (p *Pipe) PipeSize() int64 { + p.mu.Lock() + defer p.mu.Unlock() + return p.max +} + +// SetPipeSize implements PipeSize.SetPipeSize. +func (p *Pipe) SetPipeSize(size int64) (int64, error) { + if size < 0 { + return 0, syserror.EINVAL + } + if size < MinimumPipeSize { + size = MinimumPipeSize // Per spec. 
+ } + if size > MaximumPipeSize { + return 0, syserror.EPERM + } + p.mu.Lock() + defer p.mu.Unlock() + if size < p.size { + return 0, syserror.EBUSY + } + p.max = size + return size, nil +} diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index de340c40c..298c6587b 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -58,15 +58,16 @@ func TestPipeReadBlock(t *testing.T) { func TestPipeWriteBlock(t *testing.T) { const atomicIOBytes = 2 + const capacity = MinimumPipeSize ctx := contexttest.Context(t) - r, w := NewConnectedPipe(ctx, 10, atomicIOBytes) + r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) defer r.DecRef() defer w.DecRef() - msg := []byte("here's some bytes") + msg := make([]byte, capacity+1) n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) - if wantN, wantErr := int64(atomicIOBytes), syserror.ErrWouldBlock; n != wantN || err != wantErr { + if wantN, wantErr := int64(capacity), syserror.ErrWouldBlock; n != wantN || err != wantErr { t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr) } } diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 48fab45d1..656be824d 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -27,8 +27,11 @@ type Reader struct { } // Release implements fs.FileOperations.Release. +// +// This overrides ReaderWriter.Release. func (r *Reader) Release() { r.Pipe.rClose() + // Wake up writers. r.Pipe.Notify(waiter.EventOut) } diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index 59899be49..e560b9be9 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -15,7 +15,6 @@ package pipe import ( - "fmt" "math" "syscall" @@ -49,6 +48,7 @@ type ReaderWriter struct { func (rw *ReaderWriter) Release() { rw.Pipe.rClose() rw.Pipe.wClose() + // Wake up readers and writers. 
rw.Pipe.Notify(waiter.EventIn | waiter.EventOut) } @@ -81,9 +81,9 @@ func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.Sysc // Switch on ioctl request. switch int(args[1].Int()) { case linux.FIONREAD: - v := rw.queuedSize() + v := rw.queued() if v > math.MaxInt32 { - panic(fmt.Sprintf("Impossibly large pipe queued size: %d", v)) + v = math.MaxInt32 // Silently truncate. } // Copy result to user-space. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index 0f29fbc43..8d5b68541 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -27,8 +27,11 @@ type Writer struct { } // Release implements fs.FileOperations.Release. +// +// This overrides ReaderWriter.Release. func (w *Writer) Release() { w.Pipe.wClose() + // Wake up readers. w.Pipe.Notify(waiter.EventHUp) } diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 8a80cd430..19f579930 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -27,6 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/fasync" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -943,6 +944,19 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint()) return 0, nil, err + case linux.F_GETPIPE_SZ: + sz, ok := file.FileOperations.(pipe.Sizer) + if !ok { + return 0, nil, syserror.EINVAL + } + return uintptr(sz.PipeSize()), nil, nil + case linux.F_SETPIPE_SZ: + sz, ok := file.FileOperations.(pipe.Sizer) + if 
!ok { + return 0, nil, syserror.EINVAL + } + n, err := sz.SetPipeSize(int64(args[2].Int())) + return uintptr(n), nil, err default: // Everything else is not yet supported. return 0, nil, syserror.EINVAL diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index b531d7629..0d6b6ccc7 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -191,7 +191,11 @@ syscall_test(test = "//test/syscalls/linux:partial_bad_buffer_test") syscall_test(test = "//test/syscalls/linux:pause_test") -syscall_test(test = "//test/syscalls/linux:pipe_test") +syscall_test( + size = "large", + shard_count = 5, + test = "//test/syscalls/linux:pipe_test", +) syscall_test(test = "//test/syscalls/linux:poll_test") diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index d4e49bb3a..4e239617b 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1237,6 +1237,8 @@ cc_binary( linkstatic = 1, deps = [ "//test/util:file_descriptor", + "//test/util:posix_error", + "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", "//test/util:thread_util", diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index 8698295b3..bce351e08 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -26,6 +26,8 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -34,449 +36,588 @@ namespace testing { namespace { -// Buffer size of a pipe. -// -// TODO(b/35762278): Get this from F_GETPIPE_SZ. -constexpr int kPipeSize = 65536; +// Used as a non-zero sentinel value, below. +constexpr int kTestValue = 0x12345678; + +// Used for synchronization in race tests. +const absl::Duration syncDelay = absl::Seconds(2); + +struct PipeCreator { + std::string name_; + + // void (fds, is_blocking, is_namedpipe). 
+ std::function create_; +}; + +class PipeTest : public ::testing::TestWithParam { + protected: + FileDescriptor rfd; + FileDescriptor wfd; -class PipeTest : public ::testing::Test { public: static void SetUpTestCase() { // Tests intentionally generate SIGPIPE. TEST_PCHECK(signal(SIGPIPE, SIG_IGN) != SIG_ERR); } + // Initializes rfd and wfd as a blocking pipe. + // + // The return value indicates success: the test should be skipped otherwise. + bool CreateBlocking() { return create(true); } + + // Initializes rfd and wfd as a non-blocking pipe. + // + // The return value is per CreateBlocking. + bool CreateNonBlocking() { return create(false); } + + // Returns true iff the pipe represents a named pipe. + bool IsNamedPipe() { return namedpipe_; } + + int Size() { + int s1 = fcntl(rfd.get(), F_GETPIPE_SZ); + int s2 = fcntl(wfd.get(), F_GETPIPE_SZ); + EXPECT_GT(s1, 0); + EXPECT_GT(s2, 0); + EXPECT_EQ(s1, s2); + return s1; + } + static void TearDownTestCase() { TEST_PCHECK(signal(SIGPIPE, SIG_DFL) != SIG_ERR); } + + private: + bool namedpipe_ = false; + + bool create(bool wants_blocking) { + // Generate the pipe. + int fds[2] = {-1, -1}; + bool is_blocking = false; + GetParam().create_(fds, &is_blocking, &namedpipe_); + if (fds[0] < 0 || fds[1] < 0) { + return false; + } + + // Save descriptors. + rfd.reset(fds[0]); + wfd.reset(fds[1]); + + // Adjust blocking, if needed. + if (!is_blocking && wants_blocking) { + // Clear the blocking flag. + EXPECT_THAT(fcntl(fds[0], F_SETFL, 0), SyscallSucceeds()); + EXPECT_THAT(fcntl(fds[1], F_SETFL, 0), SyscallSucceeds()); + } else if (is_blocking && !wants_blocking) { + // Set the descriptors to blocking. + EXPECT_THAT(fcntl(fds[0], F_SETFL, O_NONBLOCK), SyscallSucceeds()); + EXPECT_THAT(fcntl(fds[1], F_SETFL, O_NONBLOCK), SyscallSucceeds()); + } + + return true; + } }; -TEST_F(PipeTest, Basic) { - // fds[0] is read end, fds[1] is write end. 
- int fds[2]; - int i = 0x12345678; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); +TEST_P(PipeTest, Inode) { + SKIP_IF(!CreateBlocking()); // Ensure that the inode number is the same for each end. struct stat rst; - ASSERT_THAT(fstat(fds[0], &rst), SyscallSucceeds()); + ASSERT_THAT(fstat(rfd.get(), &rst), SyscallSucceeds()); struct stat wst; - ASSERT_THAT(fstat(fds[1], &wst), SyscallSucceeds()); + ASSERT_THAT(fstat(wfd.get(), &wst), SyscallSucceeds()); EXPECT_EQ(rst.st_ino, wst.st_ino); - - ASSERT_THAT(write(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); - ASSERT_THAT(read(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); - - ASSERT_THAT(write(fds[1], &i, sizeof(i)), - SyscallSucceedsWithValue(sizeof(i))); - int j; - ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); - EXPECT_EQ(i, j); - - ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceeds()); - ASSERT_THAT(fcntl(fds[1], F_GETFL), SyscallSucceedsWithValue(O_WRONLY)); - - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); } -TEST_F(PipeTest, BasicCloExec) { - // fds[0] is read end, fds[1] is write end. - int fds[2]; - int i = 0x12345678; - ASSERT_THAT(pipe2(fds, O_CLOEXEC), SyscallSucceeds()); +TEST_P(PipeTest, Permissions) { + SKIP_IF(!CreateBlocking()); - ASSERT_THAT(write(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); - ASSERT_THAT(read(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); + // Attempt bad operations. + int buf = kTestValue; + ASSERT_THAT(write(rfd.get(), &buf, sizeof(buf)), + SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(read(wfd.get(), &buf, sizeof(buf)), SyscallFailsWithErrno(EBADF)); +} - ASSERT_THAT(write(fds[1], &i, sizeof(i)), - SyscallSucceedsWithValue(sizeof(i))); - int j; - ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); - EXPECT_EQ(i, j); +TEST_P(PipeTest, Flags) { + SKIP_IF(!CreateBlocking()); + + if (IsNamedPipe()) { + // May be stubbed to zero; define locally. 
+ constexpr int kLargefile = 0100000; + EXPECT_THAT(fcntl(rfd.get(), F_GETFL), + SyscallSucceedsWithValue(kLargefile | O_RDONLY)); + EXPECT_THAT(fcntl(wfd.get(), F_GETFL), + SyscallSucceedsWithValue(kLargefile | O_WRONLY)); + } else { + EXPECT_THAT(fcntl(rfd.get(), F_GETFL), SyscallSucceedsWithValue(O_RDONLY)); + EXPECT_THAT(fcntl(wfd.get(), F_GETFL), SyscallSucceedsWithValue(O_WRONLY)); + } +} - ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceeds()); - ASSERT_THAT(fcntl(fds[1], F_GETFL), SyscallSucceeds()); +TEST_P(PipeTest, Write) { + SKIP_IF(!CreateBlocking()); - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); + int wbuf = kTestValue; + int rbuf = ~kTestValue; + ASSERT_THAT(write(wfd.get(), &wbuf, sizeof(wbuf)), + SyscallSucceedsWithValue(sizeof(wbuf))); + ASSERT_THAT(read(rfd.get(), &rbuf, sizeof(rbuf)), + SyscallSucceedsWithValue(sizeof(rbuf))); + EXPECT_EQ(wbuf, rbuf); } -TEST_F(PipeTest, BasicNoBlock) { - // fds[0] is read end, fds[1] is write end. 
- int fds[2]; - int i = 0x12345678; - ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds()); - - ASSERT_THAT(write(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); - ASSERT_THAT(read(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); - - ASSERT_THAT(read(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EWOULDBLOCK)); - ASSERT_THAT(write(fds[1], &i, sizeof(i)), - SyscallSucceedsWithValue(sizeof(i))); - int j; - ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); - EXPECT_EQ(i, j); - ASSERT_THAT(read(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EWOULDBLOCK)); - - ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceedsWithValue(O_NONBLOCK)); - ASSERT_THAT(fcntl(fds[1], F_GETFL), - SyscallSucceedsWithValue(O_NONBLOCK | O_WRONLY)); - - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +TEST_P(PipeTest, NonBlocking) { + SKIP_IF(!CreateNonBlocking()); + + int wbuf = kTestValue; + int rbuf = ~kTestValue; + EXPECT_THAT(read(rfd.get(), &rbuf, sizeof(rbuf)), + SyscallFailsWithErrno(EWOULDBLOCK)); + ASSERT_THAT(write(wfd.get(), &wbuf, sizeof(wbuf)), + SyscallSucceedsWithValue(sizeof(wbuf))); + + ASSERT_THAT(read(rfd.get(), &rbuf, sizeof(rbuf)), + SyscallSucceedsWithValue(sizeof(rbuf))); + EXPECT_EQ(wbuf, rbuf); + EXPECT_THAT(read(rfd.get(), &rbuf, sizeof(rbuf)), + SyscallFailsWithErrno(EWOULDBLOCK)); } -TEST_F(PipeTest, BasicBothOptions) { - // fds[0] is read end, fds[1] is write end. 
+TEST(Pipe2Test, CloExec) { int fds[2]; - int i = 0x12345678; - ASSERT_THAT(pipe2(fds, O_NONBLOCK | O_CLOEXEC), SyscallSucceeds()); - - ASSERT_THAT(write(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); - ASSERT_THAT(read(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EBADF)); - - ASSERT_THAT(read(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EWOULDBLOCK)); - ASSERT_THAT(write(fds[1], &i, sizeof(i)), - SyscallSucceedsWithValue(sizeof(i))); - int j; - ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); - EXPECT_EQ(i, j); - ASSERT_THAT(read(fds[0], &i, sizeof(i)), SyscallFailsWithErrno(EWOULDBLOCK)); - - ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceedsWithValue(O_NONBLOCK)); - ASSERT_THAT(fcntl(fds[1], F_GETFL), - SyscallSucceedsWithValue(O_NONBLOCK | O_WRONLY)); - - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); + ASSERT_THAT(pipe2(fds, O_CLOEXEC), SyscallSucceeds()); + EXPECT_THAT(fcntl(fds[0], F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC)); + EXPECT_THAT(fcntl(fds[1], F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC)); + EXPECT_THAT(close(fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds[1]), SyscallSucceeds()); } -TEST_F(PipeTest, BasicBadOptions) { +TEST(Pipe2Test, BadOptions) { int fds[2]; - ASSERT_THAT(pipe2(fds, 0xDEAD), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(pipe2(fds, 0xDEAD), SyscallFailsWithErrno(EINVAL)); } -TEST_F(PipeTest, Seek) { - // fds[0] is read end, fds[1] is write end. 
- int fds[2]; - int i = 0x12345678; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - - ASSERT_THAT(lseek(fds[0], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[0], 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[0], 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); - - ASSERT_THAT(lseek(fds[0], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[0], 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - - ASSERT_THAT(write(fds[1], &i, sizeof(i)), - SyscallSucceedsWithValue(sizeof(i))); - int j; - - ASSERT_THAT(lseek(fds[0], 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[0], 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); - - ASSERT_THAT(lseek(fds[0], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[0], 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - ASSERT_THAT(lseek(fds[1], 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); - - ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); - EXPECT_EQ(i, j); - - ASSERT_THAT(fcntl(fds[0], F_GETFL), SyscallSucceeds()); - ASSERT_THAT(fcntl(fds[1], F_GETFL), SyscallSucceedsWithValue(O_WRONLY)); - - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); +TEST_P(PipeTest, Seek) { + SKIP_IF(!CreateBlocking()); + + for (int i = 0; i < 4; i++) { + // Attempt absolute seeks. 
+ EXPECT_THAT(lseek(rfd.get(), 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(rfd.get(), 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(wfd.get(), 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(wfd.get(), 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE)); + + // Attempt relative seeks. + EXPECT_THAT(lseek(rfd.get(), 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(rfd.get(), 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(wfd.get(), 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(wfd.get(), 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + + // Attempt end-of-file seeks. + EXPECT_THAT(lseek(rfd.get(), 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(rfd.get(), -4, SEEK_END), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(wfd.get(), 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(lseek(wfd.get(), -4, SEEK_END), SyscallFailsWithErrno(ESPIPE)); + + // Add some more data to the pipe. + int buf = kTestValue; + ASSERT_THAT(write(wfd.get(), &buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + } } -TEST_F(PipeTest, AbsoluteOffsetSyscallsFail) { - // Syscalls for IO at absolute offsets fail because pipes are not seekable. 
- int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - - std::vector buf(4096); - struct iovec iov; +TEST_P(PipeTest, OffsetCalls) { + SKIP_IF(!CreateBlocking()); - EXPECT_THAT(pread(fds[1], buf.data(), buf.size(), 0), + int buf; + EXPECT_THAT(pread(wfd.get(), &buf, sizeof(buf), 0), SyscallFailsWithErrno(ESPIPE)); - EXPECT_THAT(pwrite(fds[0], buf.data(), buf.size(), 0), + EXPECT_THAT(pwrite(rfd.get(), &buf, sizeof(buf), 0), SyscallFailsWithErrno(ESPIPE)); - EXPECT_THAT(preadv(fds[1], &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); - EXPECT_THAT(pwritev(fds[0], &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); - EXPECT_THAT(close(fds[0]), SyscallSucceeds()); - EXPECT_THAT(close(fds[1]), SyscallSucceeds()); + struct iovec iov; + EXPECT_THAT(preadv(wfd.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); + EXPECT_THAT(pwritev(rfd.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); } -TEST_F(PipeTest, WriterSideCloses) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - int rfd = fds[0]; - int i = 123; - ScopedThread t([rfd]() { - int j; - ASSERT_THAT(read(rfd, &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); +TEST_P(PipeTest, WriterSideCloses) { + SKIP_IF(!CreateBlocking()); + + ScopedThread t([this]() { + int buf = ~kTestValue; + ASSERT_THAT(read(rfd.get(), &buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + EXPECT_EQ(buf, kTestValue); // This will return when the close() completes. - ASSERT_THAT(read(rfd, &j, sizeof(j)), SyscallSucceeds()); + ASSERT_THAT(read(rfd.get(), &buf, sizeof(buf)), SyscallSucceeds()); // This will return straight away. - ASSERT_THAT(read(rfd, &j, sizeof(j)), SyscallSucceeds()); + ASSERT_THAT(read(rfd.get(), &buf, sizeof(buf)), + SyscallSucceedsWithValue(0)); }); + // Sleep a bit so the thread can block. - absl::SleepFor(absl::Seconds(1.0)); - ASSERT_THAT(write(fds[1], &i, sizeof(i)), - SyscallSucceedsWithValue(sizeof(i))); + absl::SleepFor(syncDelay); + + // Write to unblock. 
+ int buf = kTestValue; + ASSERT_THAT(write(wfd.get(), &buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + // Sleep a bit so the thread can block again. - absl::SleepFor(absl::Seconds(3.0)); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); - t.Join(); + absl::SleepFor(syncDelay); - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + // Allow the thread to complete. + ASSERT_THAT(close(wfd.release()), SyscallSucceeds()); + t.Join(); } -TEST_F(PipeTest, WriterSideClosesReadDataFirst) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - int i = 123; - ASSERT_THAT(write(fds[1], &i, sizeof(i)), - SyscallSucceedsWithValue(sizeof(i))); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); - int j; - ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceedsWithValue(sizeof(j))); - ASSERT_EQ(j, i); - ASSERT_THAT(read(fds[0], &j, sizeof(j)), SyscallSucceeds()); - - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); +TEST_P(PipeTest, WriterSideClosesReadDataFirst) { + SKIP_IF(!CreateBlocking()); + + int wbuf = kTestValue; + ASSERT_THAT(write(wfd.get(), &wbuf, sizeof(wbuf)), + SyscallSucceedsWithValue(sizeof(wbuf))); + ASSERT_THAT(close(wfd.release()), SyscallSucceeds()); + + int rbuf; + ASSERT_THAT(read(rfd.get(), &rbuf, sizeof(rbuf)), + SyscallSucceedsWithValue(sizeof(rbuf))); + EXPECT_EQ(wbuf, rbuf); + EXPECT_THAT(read(rfd.get(), &rbuf, sizeof(rbuf)), + SyscallSucceedsWithValue(0)); } -TEST_F(PipeTest, ReaderSideCloses) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - int i = 123; - ASSERT_THAT(write(fds[1], &i, sizeof(i)), SyscallFailsWithErrno(EPIPE)); +TEST_P(PipeTest, ReaderSideCloses) { + SKIP_IF(!CreateBlocking()); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); + ASSERT_THAT(close(rfd.release()), SyscallSucceeds()); + int buf = kTestValue; + EXPECT_THAT(write(wfd.get(), &buf, sizeof(buf)), + SyscallFailsWithErrno(EPIPE)); } -TEST_F(PipeTest, CloseTwice) { - int fds[2]; - ASSERT_THAT(pipe(fds), 
SyscallSucceeds()); - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); - ASSERT_THAT(close(fds[0]), SyscallFailsWithErrno(EBADF)); - ASSERT_THAT(close(fds[1]), SyscallFailsWithErrno(EBADF)); - - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - ASSERT_THAT(close(fds[0]), SyscallFailsWithErrno(EBADF)); - ASSERT_THAT(close(fds[1]), SyscallFailsWithErrno(EBADF)); +TEST_P(PipeTest, CloseTwice) { + SKIP_IF(!CreateBlocking()); + + int _rfd = rfd.release(); + int _wfd = wfd.release(); + ASSERT_THAT(close(_rfd), SyscallSucceeds()); + ASSERT_THAT(close(_wfd), SyscallSucceeds()); + EXPECT_THAT(close(_rfd), SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(close(_wfd), SyscallFailsWithErrno(EBADF)); } // Blocking write returns EPIPE when read end is closed if nothing has been // written. -TEST_F(PipeTest, BlockWriteClosed) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - int wfd = fds[1]; +TEST_P(PipeTest, BlockWriteClosed) { + SKIP_IF(!CreateBlocking()); absl::Notification notify; - ScopedThread t([wfd, ¬ify]() { - std::vector buf(kPipeSize); + ScopedThread t([this, ¬ify]() { + std::vector buf(Size()); // Exactly fill the pipe buffer. - ASSERT_THAT(WriteFd(wfd, buf.data(), buf.size()), + ASSERT_THAT(WriteFd(wfd.get(), buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); notify.Notify(); // Attempt to write one more byte. Blocks. // N.B. Don't use WriteFd, we don't want a retry. 
- ASSERT_THAT(write(wfd, buf.data(), 1), SyscallFailsWithErrno(EPIPE)); + EXPECT_THAT(write(wfd.get(), buf.data(), 1), SyscallFailsWithErrno(EPIPE)); }); notify.WaitForNotification(); - absl::SleepFor(absl::Seconds(1.0)); - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - + ASSERT_THAT(close(rfd.release()), SyscallSucceeds()); t.Join(); - - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); } // Blocking write returns EPIPE when read end is closed even if something has // been written. -// -// FIXME(b/35924046): Pipe writes blocking early allows S/R to interrupt the -// write(2) call before the buffer is full. Then the next call will will return -// non-zero instead of EPIPE. -TEST_F(PipeTest, BlockPartialWriteClosed_NoRandomSave) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - int wfd = fds[1]; +TEST_P(PipeTest, BlockPartialWriteClosed) { + SKIP_IF(!CreateBlocking()); - ScopedThread t([wfd]() { - std::vector buf(2 * kPipeSize); + ScopedThread t([this]() { + std::vector buf(2 * Size()); // Write more than fits in the buffer. Blocks then returns partial write // when the other end is closed. The next call returns EPIPE. - if (IsRunningOnGvisor()) { - // FIXME(b/35924046): Pipe writes block early on gVisor, resulting in a - // shorter than expected partial write. - ASSERT_THAT(write(wfd, buf.data(), buf.size()), - SyscallSucceedsWithValue(::testing::Gt(0))); - } else { - ASSERT_THAT(write(wfd, buf.data(), buf.size()), - SyscallSucceedsWithValue(kPipeSize)); - } - ASSERT_THAT(write(wfd, buf.data(), buf.size()), + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(Size())); + EXPECT_THAT(write(wfd.get(), buf.data(), buf.size()), SyscallFailsWithErrno(EPIPE)); }); // Leave time for write to become blocked. - absl::SleepFor(absl::Seconds(1.0)); - - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); + absl::SleepFor(syncDelay); + // Unblock the above. 
+ ASSERT_THAT(close(rfd.release()), SyscallSucceeds()); t.Join(); - - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); } -TEST_F(PipeTest, ReadFromClosedFd_NoRandomSave) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - int rfd = fds[0]; +TEST_P(PipeTest, ReadFromClosedFd_NoRandomSave) { + SKIP_IF(!CreateBlocking()); + absl::Notification notify; - ScopedThread t([rfd, ¬ify]() { - int f; + ScopedThread t([this, ¬ify]() { notify.Notify(); - ASSERT_THAT(read(rfd, &f, sizeof(f)), SyscallSucceedsWithValue(sizeof(f))); - ASSERT_EQ(123, f); + int buf; + ASSERT_THAT(read(rfd.get(), &buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + ASSERT_EQ(kTestValue, buf); }); notify.WaitForNotification(); + // Make sure that the thread gets to read(). - absl::SleepFor(absl::Seconds(5.0)); + absl::SleepFor(syncDelay); + { // We cannot save/restore here as the read end of pipe is closed but there // is ongoing read() above. We will not be able to restart the read() // successfully in restore run since the read fd is closed. const DisableSave ds; - ASSERT_THAT(close(fds[0]), SyscallSucceeds()); - int i = 123; - ASSERT_THAT(write(fds[1], &i, sizeof(i)), - SyscallSucceedsWithValue(sizeof(i))); + ASSERT_THAT(close(rfd.release()), SyscallSucceeds()); + int buf = kTestValue; + ASSERT_THAT(write(wfd.get(), &buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); t.Join(); } - ASSERT_THAT(close(fds[1]), SyscallSucceeds()); } -TEST_F(PipeTest, FionRead) { - // fds[0] is read end, fds[1] is write end. 
- int fds[2]; - int data[2] = {0x12345678, 0x9101112}; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); +TEST_P(PipeTest, FionRead) { + SKIP_IF(!CreateBlocking()); - int n = -1; - EXPECT_THAT(ioctl(fds[0], FIONREAD, &n), SyscallSucceedsWithValue(0)); + int n; + ASSERT_THAT(ioctl(rfd.get(), FIONREAD, &n), SyscallSucceedsWithValue(0)); EXPECT_EQ(n, 0); - n = -1; - EXPECT_THAT(ioctl(fds[1], FIONREAD, &n), SyscallSucceedsWithValue(0)); + ASSERT_THAT(ioctl(wfd.get(), FIONREAD, &n), SyscallSucceedsWithValue(0)); EXPECT_EQ(n, 0); - EXPECT_THAT(write(fds[1], data, sizeof(data)), - SyscallSucceedsWithValue(sizeof(data))); + std::vector buf(Size()); + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); - n = -1; - EXPECT_THAT(ioctl(fds[0], FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, sizeof(data)); - n = -1; - EXPECT_THAT(ioctl(fds[1], FIONREAD, &n), SyscallSucceedsWithValue(0)); - EXPECT_EQ(n, sizeof(data)); + EXPECT_THAT(ioctl(rfd.get(), FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, buf.size()); + EXPECT_THAT(ioctl(wfd.get(), FIONREAD, &n), SyscallSucceedsWithValue(0)); + EXPECT_EQ(n, buf.size()); } // Test that opening an empty anonymous pipe RDONLY via /proc/self/fd/N does not // block waiting for a writer. -TEST_F(PipeTest, OpenViaProcSelfFD) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - FileDescriptor rfd(fds[0]); - FileDescriptor wfd(fds[1]); +TEST_P(PipeTest, OpenViaProcSelfFD) { + SKIP_IF(!CreateBlocking()); + SKIP_IF(IsNamedPipe()); // Close the write end of the pipe. - wfd.release(); + ASSERT_THAT(close(wfd.release()), SyscallSucceeds()); // Open other side via /proc/self/fd. It should not block. 
FileDescriptor proc_self_fd = ASSERT_NO_ERRNO_AND_VALUE( - Open(absl::StrCat("/proc/self/fd/", fds[0]), O_RDONLY)); + Open(absl::StrCat("/proc/self/fd/", rfd.get()), O_RDONLY)); } // Test that opening and reading from an anonymous pipe (with existing writes) // RDONLY via /proc/self/fd/N returns the existing data. -TEST_F(PipeTest, OpenViaProcSelfFDWithWrites) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - FileDescriptor rfd(fds[0]); - FileDescriptor wfd(fds[1]); +TEST_P(PipeTest, OpenViaProcSelfFDWithWrites) { + SKIP_IF(!CreateBlocking()); + SKIP_IF(IsNamedPipe()); // Write to the pipe and then close the write fd. - char data = 'x'; - ASSERT_THAT(write(fds[1], &data, 1), SyscallSucceedsWithValue(1)); - wfd.release(); + int wbuf = kTestValue; + ASSERT_THAT(write(wfd.get(), &wbuf, sizeof(wbuf)), + SyscallSucceedsWithValue(sizeof(wbuf))); + ASSERT_THAT(close(wfd.release()), SyscallSucceeds()); // Open read side via /proc/self/fd, and read from it. FileDescriptor proc_self_fd = ASSERT_NO_ERRNO_AND_VALUE( - Open(absl::StrCat("/proc/self/fd/", fds[0]), O_RDONLY)); - char got; - ASSERT_THAT(read(proc_self_fd.get(), &got, 1), SyscallSucceedsWithValue(1)); + Open(absl::StrCat("/proc/self/fd/", rfd.get()), O_RDONLY)); + int rbuf; + ASSERT_THAT(read(proc_self_fd.get(), &rbuf, sizeof(rbuf)), + SyscallSucceedsWithValue(sizeof(rbuf))); + EXPECT_EQ(wbuf, rbuf); +} + +// Test that accesses of /proc//fd correctly decrement the refcount. +TEST_P(PipeTest, ProcFDReleasesFile) { + SKIP_IF(!CreateBlocking()); - // We should get what we sent. - EXPECT_EQ(got, data); + // Stat the pipe FD, which shouldn't alter the refcount. + struct stat wst; + ASSERT_THAT(lstat(absl::StrCat("/proc/self/fd/", wfd.get()).c_str(), &wst), + SyscallSucceeds()); + + // Close the write end and ensure that read indicates EOF. 
+ wfd.reset(); + char buf; + ASSERT_THAT(read(rfd.get(), &buf, 1), SyscallSucceedsWithValue(0)); } -TEST_F(PipeTest, LargeFile) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - FileDescriptor rfd(fds[0]); - FileDescriptor wfd(fds[1]); +// Same for /proc//fdinfo. +TEST_P(PipeTest, ProcFDInfoReleasesFile) { + SKIP_IF(!CreateBlocking()); + + // Stat the pipe FD, which shouldn't alter the refcount. + struct stat wst; + ASSERT_THAT( + lstat(absl::StrCat("/proc/self/fdinfo/", wfd.get()).c_str(), &wst), + SyscallSucceeds()); + + // Close the write end and ensure that read indicates EOF. + wfd.reset(); + char buf; + ASSERT_THAT(read(rfd.get(), &buf, 1), SyscallSucceedsWithValue(0)); +} + +TEST_P(PipeTest, SizeChange) { + SKIP_IF(!CreateBlocking()); + + // Set the minimum possible size. + ASSERT_THAT(fcntl(rfd.get(), F_SETPIPE_SZ, 0), SyscallSucceeds()); + int min = Size(); + EXPECT_GT(min, 0); // Should be rounded up. + + // Set from the read end. + ASSERT_THAT(fcntl(rfd.get(), F_SETPIPE_SZ, min + 1), SyscallSucceeds()); + int med = Size(); + EXPECT_GT(med, min); // Should have grown, may be rounded. + + // Set from the write end. + ASSERT_THAT(fcntl(wfd.get(), F_SETPIPE_SZ, med + 1), SyscallSucceeds()); + int max = Size(); + EXPECT_GT(max, med); // Ditto. +} - int rflags; - EXPECT_THAT(rflags = fcntl(rfd.get(), F_GETFL), SyscallSucceeds()); +TEST_P(PipeTest, SizeChangeMax) { + SKIP_IF(!CreateBlocking()); - // The kernel did *not* set O_LARGEFILE. - EXPECT_EQ(rflags, 0); + // Assert there's some maximum. + EXPECT_THAT(fcntl(rfd.get(), F_SETPIPE_SZ, 0x7fffffffffffffff), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fcntl(wfd.get(), F_SETPIPE_SZ, 0x7fffffffffffffff), + SyscallFailsWithErrno(EINVAL)); } -// Test that accesses of /proc//fd/ and /proc//fdinfo/ -// correctly decrement the refcount of that file descriptor. 
-TEST_F(PipeTest, ProcFDReleasesFile) { - std::vector paths = {"/proc/self/fd/", "/proc/self/fdinfo/"}; - for (const std::string& path : paths) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - FileDescriptor rfd(fds[0]); - FileDescriptor wfd(fds[1]); - - // Stat the pipe FD, which shouldn't alter the refcount of the write end of - // the pipe. - struct stat wst; - ASSERT_THAT(lstat(absl::StrCat(path.c_str(), wfd.get()).c_str(), &wst), - SyscallSucceeds()); - // Close the write end of the pipe and ensure that read indicates EOF. - wfd.reset(); - char buf; - ASSERT_THAT(read(rfd.get(), &buf, 1), SyscallSucceedsWithValue(0)); +TEST_P(PipeTest, SizeChangeFull) { + SKIP_IF(!CreateBlocking()); + + // Ensure that we adjust to a large enough size to avoid rounding when we + // perform the size decrease. If rounding occurs, we may not actually + // adjust the size and the call below will return success. It was found via + // experimentation that this granularity avoids the rounding for Linux. + constexpr int kDelta = 64 * 1024; + ASSERT_THAT(fcntl(wfd.get(), F_SETPIPE_SZ, Size() + kDelta), + SyscallSucceeds()); + + // Fill the buffer and try to change down. + std::vector buf(Size()); + ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + EXPECT_THAT(fcntl(wfd.get(), F_SETPIPE_SZ, Size() - kDelta), + SyscallFailsWithErrno(EBUSY)); +} + +TEST_P(PipeTest, Streaming) { + SKIP_IF(!CreateBlocking()); + + // We make too many calls to go through full save cycles. + DisableSave ds; + + absl::Notification notify; + ScopedThread t([this, ¬ify]() { + // Don't start until it's full. 
+ notify.WaitForNotification(); + for (int i = 0; i < 2 * Size(); i++) { + int rbuf; + ASSERT_THAT(read(rfd.get(), &rbuf, sizeof(rbuf)), + SyscallSucceedsWithValue(sizeof(rbuf))); + EXPECT_EQ(rbuf, i); + } + }); + for (int i = 0; i < 2 * Size(); i++) { + int wbuf = i; + ASSERT_THAT(write(wfd.get(), &wbuf, sizeof(wbuf)), + SyscallSucceedsWithValue(sizeof(wbuf))); + // Did that write just fill up the buffer? Wake up the reader. Once only. + if ((i * sizeof(wbuf)) < Size() && ((i + 1) * sizeof(wbuf)) >= Size()) { + notify.Notify(); + } } } +std::string PipeCreatorName(::testing::TestParamInfo info) { + return info.param.name_; // Use the name specified. +} + +INSTANTIATE_TEST_SUITE_P( + Pipes, PipeTest, + ::testing::Values( + PipeCreator{ + "pipe", + [](int fds[2], bool* is_blocking, bool* is_namedpipe) { + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + *is_blocking = true; + *is_namedpipe = false; + }, + }, + PipeCreator{ + "pipe2blocking", + [](int fds[2], bool* is_blocking, bool* is_namedpipe) { + ASSERT_THAT(pipe2(fds, 0), SyscallSucceeds()); + *is_blocking = true; + *is_namedpipe = false; + }, + }, + PipeCreator{ + "pipe2nonblocking", + [](int fds[2], bool* is_blocking, bool* is_namedpipe) { + ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds()); + *is_blocking = false; + *is_namedpipe = false; + }, + }, + PipeCreator{ + "smallbuffer", + [](int fds[2], bool* is_blocking, bool* is_namedpipe) { + // Set to the minimum available size (will round up). + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + ASSERT_THAT(fcntl(fds[0], F_SETPIPE_SZ, 0), SyscallSucceeds()); + *is_blocking = true; + *is_namedpipe = false; + }, + }, + PipeCreator{ + "namednonblocking", + [](int fds[2], bool* is_blocking, bool* is_namedpipe) { + // Create a new file-based pipe (non-blocking). 
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + SKIP_IF(mkfifo(file.path().c_str(), 0644) != 0); + fds[0] = open(file.path().c_str(), O_NONBLOCK | O_RDONLY); + fds[1] = open(file.path().c_str(), O_NONBLOCK | O_WRONLY); + MaybeSave(); + *is_blocking = false; + *is_namedpipe = true; + }, + }, + PipeCreator{ + "namedblocking", + [](int fds[2], bool* is_blocking, bool* is_namedpipe) { + // Create a new file-based pipe (blocking). + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + SKIP_IF(mkfifo(file.path().c_str(), 0644) != 0); + ScopedThread t([&file, &fds]() { + fds[1] = open(file.path().c_str(), O_WRONLY); + }); + fds[0] = open(file.path().c_str(), O_RDONLY); + t.Join(); + MaybeSave(); + *is_blocking = true; + *is_namedpipe = true; + }, + }), + PipeCreatorName); + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 69eac1198f3dae9a41ddf1903e9dda7972ed5d77 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 22 May 2019 11:14:29 -0700 Subject: Move wait constants to abi/linux package Updates #214 PiperOrigin-RevId: 249483756 Change-Id: I0d3cf4112bed75a863d5eb08c2063fbc506cd875 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/wait.go | 36 ++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/sys_thread.go | 42 +++++++++++++-------------------- 3 files changed, 54 insertions(+), 25 deletions(-) create mode 100644 pkg/abi/linux/wait.go (limited to 'pkg/sentry') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 96e8d4641..fbd0e4674 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -52,6 +52,7 @@ go_library( "tty.go", "uio.go", "utsname.go", + "wait.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/abi/linux", visibility = ["//visibility:public"], diff --git a/pkg/abi/linux/wait.go b/pkg/abi/linux/wait.go new file mode 100644 index 000000000..4bdc280d1 --- 
/dev/null +++ b/pkg/abi/linux/wait.go @@ -0,0 +1,36 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Options for waitpid(2), wait4(2), and/or waitid(2), from +// include/uapi/linux/wait.h. +const ( + WNOHANG = 0x00000001 + WUNTRACED = 0x00000002 + WSTOPPED = WUNTRACED + WEXITED = 0x00000004 + WCONTINUED = 0x00000008 + WNOWAIT = 0x01000000 + WNOTHREAD = 0x20000000 + WALL = 0x40000000 + WCLONE = 0x80000000 +) + +// ID types for waitid(2), from include/uapi/linux/wait.h. +const ( + P_ALL = 0x0 + P_PID = 0x1 + P_PGID = 0x2 +) diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 23c2f7035..cc441460c 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -42,14 +42,6 @@ const ( exitSignalMask = 0xff ) -// Possible values for the idtype argument to waitid(2), defined in Linux's -// include/uapi/linux/wait.h. -const ( - _P_ALL = 0 - _P_PID = 1 - _P_PGID = 2 -) - // Getppid implements linux syscall getppid(2). func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { parent := t.Parent() @@ -191,7 +183,7 @@ func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // wait4 waits for the given child process to exit. 
func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { - if options&^(syscall.WNOHANG|syscall.WUNTRACED|syscall.WCONTINUED|syscall.WALL|syscall.WCLONE) != 0 { + if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WALL|linux.WCLONE) != 0 { return 0, syscall.EINVAL } wopts := kernel.WaitOptions{ @@ -215,24 +207,24 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage wopts.SpecificTID = kernel.ThreadID(pid) } - switch options & (syscall.WCLONE | syscall.WALL) { + switch options & (linux.WCLONE | linux.WALL) { case 0: wopts.NonCloneTasks = true - case syscall.WCLONE: + case linux.WCLONE: wopts.CloneTasks = true - case syscall.WALL: + case linux.WALL: wopts.NonCloneTasks = true wopts.CloneTasks = true default: return 0, syscall.EINVAL } - if options&syscall.WUNTRACED != 0 { + if options&linux.WUNTRACED != 0 { wopts.Events |= kernel.EventChildGroupStop } - if options&syscall.WCONTINUED != 0 { + if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue } - if options&syscall.WNOHANG == 0 { + if options&linux.WNOHANG == 0 { wopts.BlockInterruptErr = kernel.ERESTARTSYS } @@ -286,36 +278,36 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal options := int(args[3].Uint()) rusageAddr := args[4].Pointer() - if options&^(syscall.WNOHANG|syscall.WEXITED|syscall.WSTOPPED|syscall.WCONTINUED|syscall.WNOWAIT) != 0 { + if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT) != 0 { return 0, nil, syscall.EINVAL } - if options&(syscall.WEXITED|syscall.WSTOPPED|syscall.WCONTINUED) == 0 { + if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { return 0, nil, syscall.EINVAL } wopts := kernel.WaitOptions{ NonCloneTasks: true, Events: kernel.EventTraceeStop, - ConsumeEvent: options&syscall.WNOWAIT == 0, + ConsumeEvent: options&linux.WNOWAIT == 0, } switch idtype { - case _P_ALL: - case 
_P_PID: + case linux.P_ALL: + case linux.P_PID: wopts.SpecificTID = kernel.ThreadID(id) - case _P_PGID: + case linux.P_PGID: wopts.SpecificPGID = kernel.ProcessGroupID(id) default: return 0, nil, syscall.EINVAL } - if options&syscall.WEXITED != 0 { + if options&linux.WEXITED != 0 { wopts.Events |= kernel.EventExit } - if options&syscall.WSTOPPED != 0 { + if options&linux.WSTOPPED != 0 { wopts.Events |= kernel.EventChildGroupStop } - if options&syscall.WCONTINUED != 0 { + if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue } - if options&syscall.WNOHANG == 0 { + if options&linux.WNOHANG == 0 { wopts.BlockInterruptErr = kernel.ERESTARTSYS } -- cgit v1.2.3 From c1cdf18e7bd21a9785462914ca8aa2056c81369a Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 22 May 2019 13:44:07 -0700 Subject: UDP and TCP raw socket support. PiperOrigin-RevId: 249511348 Change-Id: I34539092cc85032d9473ff4dd308fc29dc9bfd6b --- pkg/sentry/socket/epsocket/provider.go | 4 + pkg/tcpip/stack/nic.go | 8 + pkg/tcpip/stack/transport_demuxer.go | 36 ++- pkg/tcpip/transport/raw/endpoint.go | 2 +- pkg/tcpip/transport/tcp/BUILD | 1 + pkg/tcpip/transport/tcp/protocol.go | 3 +- pkg/tcpip/transport/udp/BUILD | 2 +- pkg/tcpip/transport/udp/endpoint.go | 2 + pkg/tcpip/transport/udp/protocol.go | 7 +- test/syscalls/linux/BUILD | 18 ++ test/syscalls/linux/raw_socket_icmp.cc | 453 +++++++++++++++++++++++++++++ test/syscalls/linux/raw_socket_ipv4.cc | 501 +++++++++------------------------ 12 files changed, 653 insertions(+), 384 deletions(-) create mode 100644 test/syscalls/linux/raw_socket_icmp.cc (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index 5a89a63fb..fb1815c2d 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -70,6 +70,10 @@ func getTransportProtocol(ctx context.Context, stype transport.SockType, protoco switch protocol { case syscall.IPPROTO_ICMP: 
return header.ICMPv4ProtocolNumber, nil + case syscall.IPPROTO_UDP: + return header.UDPProtocolNumber, nil + case syscall.IPPROTO_TCP: + return header.TCPProtocolNumber, nil } } return 0, syserr.ErrInvalidArgument diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index a4117d98e..50d35de88 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -593,6 +593,14 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN } transProto := state.proto + + // Raw socket packets are delivered based solely on the transport + // protocol number. We do not inspect the payload to ensure it's + // validly formed. + if !n.demux.deliverRawPacket(r, protocol, netHeader, vv) { + n.stack.demux.deliverRawPacket(r, protocol, netHeader, vv) + } + if len(vv.First()) < transProto.MinimumPacketSize() { n.stack.stats.MalformedRcvdPackets.Increment() return diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index 807c3ba5e..605bfadeb 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -286,20 +286,10 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto destEps = append(destEps, ep) } - // As in net/ipv4/ip_input.c:ip_local_deliver, attempt to deliver via - // raw endpoint first. If there are multipe raw endpoints, they all - // receive the packet. - foundRaw := false - for _, rawEP := range eps.rawEndpoints { - // Each endpoint gets its own copy of the packet for the sake - // of save/restore. - rawEP.HandlePacket(r, buffer.NewViewFromBytes(netHeader), vv.ToView().ToVectorisedView()) - foundRaw = true - } eps.mu.RUnlock() // Fail if we didn't find at least one matching transport endpoint. - if len(destEps) == 0 && !foundRaw { + if len(destEps) == 0 { // UDP packet could not be delivered to an unknown destination port. 
if protocol == header.UDPProtocolNumber { r.Stats().UDP.UnknownPortErrors.Increment() @@ -315,6 +305,30 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto return true } +// deliverRawPacket attempts to deliver the given packet and returns whether it +// was delivered successfully. +func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) bool { + eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}] + if !ok { + return false + } + + // As in net/ipv4/ip_input.c:ip_local_deliver, attempt to deliver via + // raw endpoint first. If there are multiple raw endpoints, they all + // receive the packet. + foundRaw := false + eps.mu.RLock() + for _, rawEP := range eps.rawEndpoints { + // Each endpoint gets its own copy of the packet for the sake + // of save/restore. + rawEP.HandlePacket(r, buffer.NewViewFromBytes(netHeader), vv.ToView().ToVectorisedView()) + foundRaw = true + } + eps.mu.RUnlock() + + return foundRaw +} + // deliverControlPacket attempts to deliver the given control packet. Returns // true if it found an endpoint, false otherwise. 
func (d *transportDemuxer) deliverControlPacket(net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView, id TransportEndpointID) bool { diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 1a16a3607..e7b383ad5 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -280,7 +280,7 @@ func (ep *endpoint) finishWrite(payload tcpip.Payload, route *stack.Route) (uint switch ep.netProto { case header.IPv4ProtocolNumber: hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength())) - if err := route.WritePacket(nil /* gso */, hdr, buffer.View(payloadBytes).ToVectorisedView(), header.ICMPv4ProtocolNumber, route.DefaultTTL()); err != nil { + if err := route.WritePacket(nil /* gso */, hdr, buffer.View(payloadBytes).ToVectorisedView(), ep.transProto, route.DefaultTTL()); err != nil { return 0, nil, err } diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index d44d63e95..e31b03f7d 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -49,6 +49,7 @@ go_library( "//pkg/tcpip/header", "//pkg/tcpip/seqnum", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/raw", "//pkg/tmutex", "//pkg/waiter", "@com_github_google_btree//:go_default_library", diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index b86473891..d31a1edcb 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/raw" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -104,7 +105,7 @@ func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolN // NewRawEndpoint creates a new raw TCP endpoint. 
Raw TCP sockets are currently // unsupported. It implements stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return nil, tcpip.ErrUnknownProtocol + return raw.NewEndpoint(stack, netProto, header.TCPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid tcp packet size. diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 361132a25..b9520d6e0 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -28,12 +28,12 @@ go_library( imports = ["gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ - "//pkg/log", "//pkg/sleep", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/raw", "//pkg/waiter", ], ) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 0ed0902b0..d9ca097c9 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -51,6 +51,8 @@ const ( // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. // +// It implements tcpip.Endpoint. 
+// // +stateify savable type endpoint struct { // The following fields are initialized at creation time and do not diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 8b47cce17..3d31dfbf1 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -25,6 +25,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/raw" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -48,10 +49,10 @@ func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolN return newEndpoint(stack, netProto, waiterQueue), nil } -// NewRawEndpoint creates a new raw UDP endpoint. Raw UDP sockets are currently -// unsupported. It implements stack.TransportProtocol.NewRawEndpoint. +// NewRawEndpoint creates a new raw UDP endpoint. It implements +// stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return nil, tcpip.ErrUnknownProtocol + return raw.NewEndpoint(stack, netProto, header.UDPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid udp packet size. 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 4e239617b..e8caf31fc 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1545,6 +1545,24 @@ cc_binary( linkstatic = 1, deps = [ ":socket_test_util", + ":unix_domain_socket_test_util", + "//test/util:capability_util", + "//test/util:file_descriptor", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/base:core_headers", + "@com_google_googletest//:gtest", + ], +) + +cc_binary( + name = "raw_socket_icmp_test", + testonly = 1, + srcs = ["raw_socket_icmp.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + ":unix_domain_socket_test_util", "//test/util:capability_util", "//test/util:file_descriptor", "//test/util:test_main", diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc new file mode 100644 index 000000000..24d9dc79a --- /dev/null +++ b/test/syscalls/linux/raw_socket_icmp.cc @@ -0,0 +1,453 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" +#include "test/util/capability_util.h" +#include "test/util/file_descriptor.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +// Compute the internet checksum of the ICMP header (assuming no payload). +static uint16_t Checksum(struct icmphdr* icmp) { + uint32_t total = 0; + uint16_t* num = reinterpret_cast(icmp); + + // This is just the ICMP header, so there's an even number of bytes. + static_assert( + sizeof(*icmp) % sizeof(*num) == 0, + "sizeof(struct icmphdr) is not an integer multiple of sizeof(uint16_t)"); + for (unsigned int i = 0; i < sizeof(*icmp); i += sizeof(*num)) { + total += *num; + num++; + } + + // Combine the upper and lower 16 bits. This happens twice in case the first + // combination causes a carry. + unsigned short upper = total >> 16; + unsigned short lower = total & 0xffff; + total = upper + lower; + upper = total >> 16; + lower = total & 0xffff; + total = upper + lower; + + return ~total; +} + +// The size of an empty ICMP packet and IP header together. +constexpr size_t kEmptyICMPSize = 28; + +// ICMP raw sockets get their own special tests because Linux automatically +// responds to ICMP echo requests, and thus a single echo request sent via +// loopback leads to 2 received ICMP packets. + +class RawSocketICMPTest : public ::testing::Test { + protected: + // Creates a socket to be used in tests. + void SetUp() override; + + // Closes the socket created by SetUp(). + void TearDown() override; + + // Checks that both an ICMP echo request and reply are received. Calls should + // be wrapped in ASSERT_NO_FATAL_FAILURE. + void ExpectICMPSuccess(const struct icmphdr& icmp); + + // Sends icmp via s_. 
+ void SendEmptyICMP(const struct icmphdr& icmp); + + // Sends icmp via s_ to the given address. + void SendEmptyICMPTo(int sock, const struct sockaddr_in& addr, + const struct icmphdr& icmp); + + // Reads from s_ into recv_buf. + void ReceiveICMP(char* recv_buf, size_t recv_buf_len, size_t expected_size, + struct sockaddr_in* src); + + // Reads from sock into recv_buf. + void ReceiveICMPFrom(char* recv_buf, size_t recv_buf_len, + size_t expected_size, struct sockaddr_in* src, int sock); + + // The socket used for both reading and writing. + int s_; + + // The loopback address. + struct sockaddr_in addr_; +}; + +void RawSocketICMPTest::SetUp() { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds()); + + addr_ = {}; + + // "On raw sockets sin_port is set to the IP protocol." - ip(7). + addr_.sin_port = IPPROTO_IP; + addr_.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr_.sin_family = AF_INET; +} + +void RawSocketICMPTest::TearDown() { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + EXPECT_THAT(close(s_), SyscallSucceeds()); +} + +// We'll only read an echo in this case, as the kernel won't respond to the +// malformed ICMP checksum. +TEST_F(RawSocketICMPTest, SendAndReceiveBadChecksum) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 0; + icmp.un.echo.sequence = 2012; + icmp.un.echo.id = 2014; + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + + // Verify that we get the echo, then that there's nothing else to read. 
+ char recv_buf[kEmptyICMPSize]; + struct sockaddr_in src; + ASSERT_NO_FATAL_FAILURE( + ReceiveICMP(recv_buf, sizeof(recv_buf), sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); + // The packet should be identical to what we sent. + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)), 0); + + // And there should be nothing left to read. + EXPECT_THAT(RetryEINTR(recv)(s_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT), + SyscallFailsWithErrno(EAGAIN)); +} +// +// Send and receive an ICMP packet. +TEST_F(RawSocketICMPTest, SendAndReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. + // None of that should matter for raw sockets - the kernel should still give + // us the packet. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 0; + icmp.un.echo.sequence = 2012; + icmp.un.echo.id = 2014; + icmp.checksum = Checksum(&icmp); + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + + ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); +} + +// We should be able to create multiple raw sockets for the same protocol and +// receive the same packet on both. +TEST_F(RawSocketICMPTest, MultipleSocketReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + FileDescriptor s2 = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_ICMP)); + + // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. + // None of that should matter for raw sockets - the kernel should still give + // us the packet. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 0; + icmp.un.echo.sequence = 2016; + icmp.un.echo.id = 2018; + icmp.checksum = Checksum(&icmp); + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + + // Both sockets will receive the echo request and reply in indeterminate + // order, so we'll need to read 2 packets from each. 
+ + // Receive on socket 1. + constexpr int kBufSize = kEmptyICMPSize; + std::vector recv_buf1(2); + struct sockaddr_in src; + for (int i = 0; i < 2; i++) { + ASSERT_NO_FATAL_FAILURE(ReceiveICMP(recv_buf1[i], + ABSL_ARRAYSIZE(recv_buf1[i]), + sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); + } + + // Receive on socket 2. + std::vector recv_buf2(2); + for (int i = 0; i < 2; i++) { + ASSERT_NO_FATAL_FAILURE( + ReceiveICMPFrom(recv_buf2[i], ABSL_ARRAYSIZE(recv_buf2[i]), + sizeof(struct icmphdr), &src, s2.get())); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); + } + + // Ensure both sockets receive identical packets. + int types[] = {ICMP_ECHO, ICMP_ECHOREPLY}; + for (int type : types) { + auto match_type = [=](char buf[kBufSize]) { + struct icmphdr* icmp = + reinterpret_cast(buf + sizeof(struct iphdr)); + return icmp->type == type; + }; + const char* icmp1 = + *std::find_if(recv_buf1.begin(), recv_buf1.end(), match_type); + const char* icmp2 = + *std::find_if(recv_buf2.begin(), recv_buf2.end(), match_type); + ASSERT_NE(icmp1, *recv_buf1.end()); + ASSERT_NE(icmp2, *recv_buf2.end()); + EXPECT_EQ(memcmp(icmp1 + sizeof(struct iphdr), icmp2 + sizeof(struct iphdr), + sizeof(icmp)), + 0); + } +} + +// A raw ICMP socket and ping socket should both receive the ICMP packets +// intended for the ping socket. +TEST_F(RawSocketICMPTest, RawAndPingSockets) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + FileDescriptor ping_sock = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP)); + + // Ping sockets take care of the ICMP ID and checksum. 
+ struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.un.echo.sequence = *static_cast(&icmp.un.echo.sequence); + ASSERT_THAT(RetryEINTR(sendto)(ping_sock.get(), &icmp, sizeof(icmp), 0, + reinterpret_cast(&addr_), + sizeof(addr_)), + SyscallSucceedsWithValue(sizeof(icmp))); + + // Receive on socket 1, which receives the echo request and reply in + // indeterminate order. + constexpr int kBufSize = kEmptyICMPSize; + std::vector recv_buf1(2); + struct sockaddr_in src; + for (int i = 0; i < 2; i++) { + ASSERT_NO_FATAL_FAILURE( + ReceiveICMP(recv_buf1[i], kBufSize, sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); + } + + // Receive on socket 2. Ping sockets only get the echo reply, not the initial + // echo. + char ping_recv_buf[kBufSize]; + ASSERT_THAT(RetryEINTR(recv)(ping_sock.get(), ping_recv_buf, kBufSize, 0), + SyscallSucceedsWithValue(sizeof(struct icmphdr))); + + // Ensure both sockets receive identical echo reply packets. + auto match_type_raw = [=](char buf[kBufSize]) { + struct icmphdr* icmp = + reinterpret_cast(buf + sizeof(struct iphdr)); + return icmp->type == ICMP_ECHOREPLY; + }; + char* raw_reply = + *std::find_if(recv_buf1.begin(), recv_buf1.end(), match_type_raw); + ASSERT_NE(raw_reply, *recv_buf1.end()); + EXPECT_EQ( + memcmp(raw_reply + sizeof(struct iphdr), ping_recv_buf, sizeof(icmp)), 0); +} + +// Test that connect() sends packets to the right place. +TEST_F(RawSocketICMPTest, SendAndReceiveViaConnect) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + connect(s_, reinterpret_cast(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. + // None of that should matter for raw sockets - the kernel should still give + // us the packet. 
+ struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 0; + icmp.un.echo.sequence = 2003; + icmp.un.echo.id = 2004; + icmp.checksum = Checksum(&icmp); + ASSERT_THAT(send(s_, &icmp, sizeof(icmp), 0), + SyscallSucceedsWithValue(sizeof(icmp))); + + ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); +} + +// Bind to localhost, then send and receive packets. +TEST_F(RawSocketICMPTest, BindSendAndReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + bind(s_, reinterpret_cast(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. + struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 0; + icmp.un.echo.sequence = 2004; + icmp.un.echo.id = 2007; + icmp.checksum = Checksum(&icmp); + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + + ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); +} + +// Bind and connect to localhost and send/receive packets. +TEST_F(RawSocketICMPTest, BindConnectSendAndReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + bind(s_, reinterpret_cast(&addr_), sizeof(addr_)), + SyscallSucceeds()); + ASSERT_THAT( + connect(s_, reinterpret_cast(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, + // and ID. None of that should matter for raw sockets - the kernel should + // still give us the packet. 
+ struct icmphdr icmp; + icmp.type = ICMP_ECHO; + icmp.code = 0; + icmp.checksum = 0; + icmp.un.echo.sequence = 2010; + icmp.un.echo.id = 7; + icmp.checksum = Checksum(&icmp); + ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + + ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); +} + +void RawSocketICMPTest::ExpectICMPSuccess(const struct icmphdr& icmp) { + // We're going to receive both the echo request and reply, but the order is + // indeterminate. + char recv_buf[kEmptyICMPSize]; + struct sockaddr_in src; + bool received_request = false; + bool received_reply = false; + + for (int i = 0; i < 2; i++) { + // Receive the packet. + ASSERT_NO_FATAL_FAILURE(ReceiveICMP(recv_buf, ABSL_ARRAYSIZE(recv_buf), + sizeof(struct icmphdr), &src)); + EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); + struct icmphdr* recvd_icmp = + reinterpret_cast(recv_buf + sizeof(struct iphdr)); + switch (recvd_icmp->type) { + case ICMP_ECHO: + EXPECT_FALSE(received_request); + received_request = true; + // The packet should be identical to what we sent. + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)), + 0); + break; + + case ICMP_ECHOREPLY: + EXPECT_FALSE(received_reply); + received_reply = true; + // Most fields should be the same. + EXPECT_EQ(recvd_icmp->code, icmp.code); + EXPECT_EQ(recvd_icmp->un.echo.sequence, icmp.un.echo.sequence); + EXPECT_EQ(recvd_icmp->un.echo.id, icmp.un.echo.id); + // A couple are different. + EXPECT_EQ(recvd_icmp->type, ICMP_ECHOREPLY); + // The checksum is computed in such a way that it is guaranteed to have + // changed. 
+ EXPECT_NE(recvd_icmp->checksum, icmp.checksum); + break; + } + } + + ASSERT_TRUE(received_request); + ASSERT_TRUE(received_reply); +} + +void RawSocketICMPTest::SendEmptyICMP(const struct icmphdr& icmp) { + ASSERT_NO_FATAL_FAILURE(SendEmptyICMPTo(s_, addr_, icmp)); +} + +void RawSocketICMPTest::SendEmptyICMPTo(int sock, + const struct sockaddr_in& addr, + const struct icmphdr& icmp) { + // It's safe to use const_cast here because sendmsg won't modify the iovec or + // address. + struct iovec iov = {}; + iov.iov_base = static_cast(const_cast(&icmp)); + iov.iov_len = sizeof(icmp); + struct msghdr msg = {}; + msg.msg_name = static_cast(const_cast(&addr)); + msg.msg_namelen = sizeof(addr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + ASSERT_THAT(sendmsg(sock, &msg, 0), SyscallSucceedsWithValue(sizeof(icmp))); +} + +void RawSocketICMPTest::ReceiveICMP(char* recv_buf, size_t recv_buf_len, + size_t expected_size, + struct sockaddr_in* src) { + ASSERT_NO_FATAL_FAILURE( + ReceiveICMPFrom(recv_buf, recv_buf_len, expected_size, src, s_)); +} + +void RawSocketICMPTest::ReceiveICMPFrom(char* recv_buf, size_t recv_buf_len, + size_t expected_size, + struct sockaddr_in* src, int sock) { + struct iovec iov = {}; + iov.iov_base = recv_buf; + iov.iov_len = recv_buf_len; + struct msghdr msg = {}; + msg.msg_name = src; + msg.msg_namelen = sizeof(*src); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + // We should receive the ICMP packet plus 20 bytes of IP header. 
+ ASSERT_THAT(recvmsg(sock, &msg, 0), + SyscallSucceedsWithValue(expected_size + sizeof(struct iphdr))); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/raw_socket_ipv4.cc b/test/syscalls/linux/raw_socket_ipv4.cc index 7c7779f3e..352037c88 100644 --- a/test/syscalls/linux/raw_socket_ipv4.cc +++ b/test/syscalls/linux/raw_socket_ipv4.cc @@ -25,6 +25,7 @@ #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" +#include "test/syscalls/linux/unix_domain_socket_test_util.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" #include "test/util/test_util.h" @@ -37,8 +38,8 @@ namespace testing { namespace { -// Fixture for tests parameterized by address family (currently only AF_INET). -class RawSocketTest : public ::testing::Test { +// Fixture for tests parameterized by protocol. +class RawSocketTest : public ::testing::TestWithParam { protected: // Creates a socket to be used in tests. void SetUp() override; @@ -46,23 +47,17 @@ class RawSocketTest : public ::testing::Test { // Closes the socket created by SetUp(). void TearDown() override; - // Checks that both an ICMP echo request and reply are received. Calls should - // be wrapped in ASSERT_NO_FATAL_FAILURE. - void ExpectICMPSuccess(const struct icmphdr& icmp); + // Sends buf via s_. + void SendBuf(const char* buf, int buf_len); - void SendEmptyICMP(const struct icmphdr& icmp); + // Sends buf to the provided address via the provided socket. + void SendBufTo(int sock, const struct sockaddr_in& addr, const char* buf, + int buf_len); - void SendEmptyICMPTo(int sock, struct sockaddr_in* addr, - const struct icmphdr& icmp); + // Reads from s_ into recv_buf. 
+ void ReceiveBuf(char* recv_buf, size_t recv_buf_len); - void ReceiveICMP(char* recv_buf, size_t recv_buf_len, size_t expected_size, - struct sockaddr_in* src); - - void ReceiveICMPFrom(char* recv_buf, size_t recv_buf_len, - size_t expected_size, struct sockaddr_in* src, int sock); - - // Compute the internet checksum of the ICMP header (assuming no payload). - unsigned short Checksum(struct icmphdr* icmp); + int Protocol() { return GetParam(); } // The socket used for both reading and writing. int s_; @@ -74,7 +69,7 @@ class RawSocketTest : public ::testing::Test { void RawSocketTest::SetUp() { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); - ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds()); + ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, Protocol()), SyscallSucceeds()); addr_ = {}; @@ -92,177 +87,17 @@ void RawSocketTest::TearDown() { // We should be able to create multiple raw sockets for the same protocol. // BasicRawSocket::Setup creates the first one, so we only have to create one // more here. -TEST_F(RawSocketTest, MultipleCreation) { +TEST_P(RawSocketTest, MultipleCreation) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); int s2; - ASSERT_THAT(s2 = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds()); + ASSERT_THAT(s2 = socket(AF_INET, SOCK_RAW, Protocol()), SyscallSucceeds()); ASSERT_THAT(close(s2), SyscallSucceeds()); } -// We'll only read an echo in this case, as the kernel won't respond to the -// malformed ICMP checksum. -TEST_F(RawSocketTest, SendAndReceiveBadChecksum) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); - - // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence, - // and ID. None of that should matter for raw sockets - the kernel should - // still give us the packet. 
- struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.checksum = 0; - icmp.un.echo.sequence = 2012; - icmp.un.echo.id = 2014; - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); - - // Veryify that we get the echo, then that there's nothing else to read. - char recv_buf[sizeof(icmp) + sizeof(struct iphdr)]; - struct sockaddr_in src; - ASSERT_NO_FATAL_FAILURE( - ReceiveICMP(recv_buf, sizeof(recv_buf), sizeof(struct icmphdr), &src)); - EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); - // The packet should be identical to what we sent. - EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)), 0); - - // And there should be nothing left to read. - EXPECT_THAT(RetryEINTR(recv)(s_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT), - SyscallFailsWithErrno(EAGAIN)); -} - -// Send and receive an ICMP packet. -TEST_F(RawSocketTest, SendAndReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); - - // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. - // None of that should matter for raw sockets - the kernel should still give - // us the packet. - struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.checksum = 0; - icmp.un.echo.sequence = 2012; - icmp.un.echo.id = 2014; - icmp.checksum = Checksum(&icmp); - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); - - ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); -} - -// We should be able to create multiple raw sockets for the same protocol and -// receive the same packet on both. -TEST_F(RawSocketTest, MultipleSocketReceive) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); - - FileDescriptor s2 = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_ICMP)); - - // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. - // None of that should matter for raw sockets - the kernel should still give - // us the packet. 
- struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.checksum = 0; - icmp.un.echo.sequence = 2016; - icmp.un.echo.id = 2018; - icmp.checksum = Checksum(&icmp); - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); - - // Both sockets will receive the echo request and reply in indeterminate - // order, so we'll need to read 2 packets from each. - - // Receive on socket 1. - constexpr int kBufSize = sizeof(icmp) + sizeof(struct iphdr); - std::array recv_buf1; - struct sockaddr_in src; - for (int i = 0; i < 2; i++) { - ASSERT_NO_FATAL_FAILURE(ReceiveICMP(recv_buf1[i], - ABSL_ARRAYSIZE(recv_buf1[i]), - sizeof(struct icmphdr), &src)); - EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); - } - - // Receive on socket 2. - std::array recv_buf2; - for (int i = 0; i < 2; i++) { - ASSERT_NO_FATAL_FAILURE( - ReceiveICMPFrom(recv_buf2[i], ABSL_ARRAYSIZE(recv_buf2[i]), - sizeof(struct icmphdr), &src, s2.get())); - EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); - } - - // Ensure both sockets receive identical packets. - int types[] = {ICMP_ECHO, ICMP_ECHOREPLY}; - for (int type : types) { - auto match_type = [=](char buf[kBufSize]) { - struct icmphdr* icmp = - reinterpret_cast(buf + sizeof(struct iphdr)); - return icmp->type == type; - }; - const char* icmp1 = - *std::find_if(recv_buf1.begin(), recv_buf1.end(), match_type); - const char* icmp2 = - *std::find_if(recv_buf2.begin(), recv_buf2.end(), match_type); - ASSERT_NE(icmp1, *recv_buf1.end()); - ASSERT_NE(icmp2, *recv_buf2.end()); - EXPECT_EQ(memcmp(icmp1 + sizeof(struct iphdr), icmp2 + sizeof(struct iphdr), - sizeof(icmp)), - 0); - } -} - -// A raw ICMP socket and ping socket should both receive the ICMP packets -// indended for the ping socket. 
-TEST_F(RawSocketTest, RawAndPingSockets) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); - - FileDescriptor ping_sock = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP)); - - // Ping sockets take care of the ICMP ID and checksum. - struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.un.echo.sequence = *static_cast(&icmp.un.echo.sequence); - ASSERT_THAT(RetryEINTR(sendto)(ping_sock.get(), &icmp, sizeof(icmp), 0, - reinterpret_cast(&addr_), - sizeof(addr_)), - SyscallSucceedsWithValue(sizeof(icmp))); - - // Receive on socket 1, which receives the echo request and reply in - // indeterminate order. - constexpr int kBufSize = sizeof(icmp) + sizeof(struct iphdr); - std::array recv_buf1; - struct sockaddr_in src; - for (int i = 0; i < 2; i++) { - ASSERT_NO_FATAL_FAILURE( - ReceiveICMP(recv_buf1[i], kBufSize, sizeof(struct icmphdr), &src)); - EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); - } - - // Receive on socket 2. Ping sockets only get the echo reply, not the initial - // echo. - char ping_recv_buf[kBufSize]; - ASSERT_THAT(RetryEINTR(recv)(ping_sock.get(), ping_recv_buf, kBufSize, 0), - SyscallSucceedsWithValue(sizeof(struct icmphdr))); - - // Ensure both sockets receive identical echo reply packets. - auto match_type_raw = [=](char buf[kBufSize]) { - struct icmphdr* icmp = - reinterpret_cast(buf + sizeof(struct iphdr)); - return icmp->type == ICMP_ECHOREPLY; - }; - char* raw_reply = - *std::find_if(recv_buf1.begin(), recv_buf1.end(), match_type_raw); - ASSERT_NE(raw_reply, *recv_buf1.end()); - EXPECT_EQ( - memcmp(raw_reply + sizeof(struct iphdr), ping_recv_buf, sizeof(icmp)), 0); -} - // Test that shutting down an unconnected socket fails. 
-TEST_F(RawSocketTest, FailShutdownWithoutConnect) { +TEST_P(RawSocketTest, FailShutdownWithoutConnect) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN)); @@ -270,7 +105,7 @@ TEST_F(RawSocketTest, FailShutdownWithoutConnect) { } // Shutdown is a no-op for raw sockets (and datagram sockets in general). -TEST_F(RawSocketTest, ShutdownWriteNoop) { +TEST_P(RawSocketTest, ShutdownWriteNoop) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT( @@ -278,13 +113,14 @@ TEST_F(RawSocketTest, ShutdownWriteNoop) { SyscallSucceeds()); ASSERT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds()); + // Arbitrary. constexpr char kBuf[] = "noop"; ASSERT_THAT(RetryEINTR(write)(s_, kBuf, sizeof(kBuf)), SyscallSucceedsWithValue(sizeof(kBuf))); } // Shutdown is a no-op for raw sockets (and datagram sockets in general). -TEST_F(RawSocketTest, ShutdownReadNoop) { +TEST_P(RawSocketTest, ShutdownReadNoop) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT( @@ -292,29 +128,24 @@ TEST_F(RawSocketTest, ShutdownReadNoop) { SyscallSucceeds()); ASSERT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds()); - struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.checksum = 0; - icmp.un.echo.sequence = 2012; - icmp.un.echo.id = 2014; - icmp.checksum = Checksum(&icmp); - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); - - char c[sizeof(icmp) + sizeof(struct iphdr)]; - ASSERT_THAT(read(s_, &c, sizeof(c)), - SyscallSucceedsWithValue(sizeof(icmp) + sizeof(struct iphdr))); + // Arbitrary. + constexpr char kBuf[] = "gdg"; + ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf))); + + constexpr size_t kReadSize = sizeof(kBuf) + sizeof(struct iphdr); + char c[kReadSize]; + ASSERT_THAT(read(s_, &c, sizeof(c)), SyscallSucceedsWithValue(kReadSize)); } // Test that listen() fails. 
-TEST_F(RawSocketTest, FailListen) { +TEST_P(RawSocketTest, FailListen) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT(listen(s_, 1), SyscallFailsWithErrno(ENOTSUP)); } // Test that accept() fails. -TEST_F(RawSocketTest, FailAccept) { +TEST_P(RawSocketTest, FailAccept) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); struct sockaddr saddr; @@ -323,7 +154,7 @@ TEST_F(RawSocketTest, FailAccept) { } // Test that getpeername() returns nothing before connect(). -TEST_F(RawSocketTest, FailGetPeerNameBeforeConnect) { +TEST_P(RawSocketTest, FailGetPeerNameBeforeConnect) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); struct sockaddr saddr; @@ -333,7 +164,7 @@ TEST_F(RawSocketTest, FailGetPeerNameBeforeConnect) { } // Test that getpeername() returns something after connect(). -TEST_F(RawSocketTest, GetPeerName) { +TEST_P(RawSocketTest, GetPeerName) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT( @@ -347,7 +178,7 @@ TEST_F(RawSocketTest, GetPeerName) { } // Test that the socket is writable immediately. -TEST_F(RawSocketTest, PollWritableImmediately) { +TEST_P(RawSocketTest, PollWritableImmediately) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); struct pollfd pfd = {}; @@ -357,7 +188,7 @@ TEST_F(RawSocketTest, PollWritableImmediately) { } // Test that the socket isn't readable before receiving anything. -TEST_F(RawSocketTest, PollNotReadableInitially) { +TEST_P(RawSocketTest, PollNotReadableInitially) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); // Try to receive data with MSG_DONTWAIT, which returns immediately if there's @@ -368,12 +199,13 @@ TEST_F(RawSocketTest, PollNotReadableInitially) { } // Test that the socket becomes readable once something is written to it. 
-TEST_F(RawSocketTest, PollTriggeredOnWrite) { +TEST_P(RawSocketTest, PollTriggeredOnWrite) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); // Write something so that there's data to be read. - struct icmphdr icmp = {}; - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); + // Arbitrary. + constexpr char kBuf[] = "JP5"; + ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf))); struct pollfd pfd = {}; pfd.fd = s_; @@ -382,7 +214,7 @@ TEST_F(RawSocketTest, PollTriggeredOnWrite) { } // Test that we can connect() to a valid IP (loopback). -TEST_F(RawSocketTest, ConnectToLoopback) { +TEST_P(RawSocketTest, ConnectToLoopback) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT( @@ -390,50 +222,18 @@ TEST_F(RawSocketTest, ConnectToLoopback) { SyscallSucceeds()); } -// Test that connect() sends packets to the right place. -TEST_F(RawSocketTest, SendAndReceiveViaConnect) { - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); - - ASSERT_THAT( - connect(s_, reinterpret_cast(&addr_), sizeof(addr_)), - SyscallSucceeds()); - - // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. - // None of that should matter for raw sockets - the kernel should still give - // us the packet. - struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.checksum = 0; - icmp.un.echo.sequence = 2003; - icmp.un.echo.id = 2004; - icmp.checksum = Checksum(&icmp); - ASSERT_THAT(send(s_, &icmp, sizeof(icmp), 0), - SyscallSucceedsWithValue(sizeof(icmp))); - - ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); -} - // Test that calling send() without connect() fails. -TEST_F(RawSocketTest, SendWithoutConnectFails) { +TEST_P(RawSocketTest, SendWithoutConnectFails) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); - // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. - // None of that should matter for raw sockets - the kernel should still give - // us the packet. 
- struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.checksum = 0; - icmp.un.echo.sequence = 2017; - icmp.un.echo.id = 2019; - icmp.checksum = Checksum(&icmp); - ASSERT_THAT(send(s_, &icmp, sizeof(icmp), 0), + // Arbitrary. + constexpr char kBuf[] = "Endgame was good"; + ASSERT_THAT(send(s_, kBuf, sizeof(kBuf), 0), SyscallFailsWithErrno(EDESTADDRREQ)); } // Bind to localhost. -TEST_F(RawSocketTest, BindToLocalhost) { +TEST_P(RawSocketTest, BindToLocalhost) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT( @@ -442,7 +242,7 @@ TEST_F(RawSocketTest, BindToLocalhost) { } // Bind to a different address. -TEST_F(RawSocketTest, BindToInvalid) { +TEST_P(RawSocketTest, BindToInvalid) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); struct sockaddr_in bind_addr = {}; @@ -453,31 +253,86 @@ TEST_F(RawSocketTest, BindToInvalid) { SyscallFailsWithErrno(EADDRNOTAVAIL)); } +// Send and receive an packet. +TEST_P(RawSocketTest, SendAndReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + // Arbitrary. + constexpr char kBuf[] = "TB12"; + ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf))); + + // Receive the packet and make sure it's identical. + char recv_buf[sizeof(kBuf) + sizeof(struct iphdr)]; + ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf, sizeof(recv_buf))); + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), kBuf, sizeof(kBuf)), 0); +} + +// We should be able to create multiple raw sockets for the same protocol and +// receive the same packet on both. +TEST_P(RawSocketTest, MultipleSocketReceive) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int s2; + ASSERT_THAT(s2 = socket(AF_INET, SOCK_RAW, Protocol()), SyscallSucceeds()); + + // Arbitrary. + constexpr char kBuf[] = "TB10"; + ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf))); + + // Receive it on socket 1. 
+ char recv_buf1[sizeof(kBuf) + sizeof(struct iphdr)]; + ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf1, sizeof(recv_buf1))); + + // Receive it on socket 2. + char recv_buf2[sizeof(kBuf) + sizeof(struct iphdr)]; + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s2, recv_buf2, sizeof(recv_buf2))); + + EXPECT_EQ(memcmp(recv_buf1 + sizeof(struct iphdr), + recv_buf2 + sizeof(struct iphdr), sizeof(kBuf)), + 0); + + ASSERT_THAT(close(s2), SyscallSucceeds()); +} + +// Test that connect sends packets to the right place. +TEST_P(RawSocketTest, SendAndReceiveViaConnect) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + ASSERT_THAT( + connect(s_, reinterpret_cast(&addr_), sizeof(addr_)), + SyscallSucceeds()); + + // Arbitrary. + constexpr char kBuf[] = "JH4"; + ASSERT_THAT(send(s_, kBuf, sizeof(kBuf), 0), + SyscallSucceedsWithValue(sizeof(kBuf))); + + // Receive the packet and make sure it's identical. + char recv_buf[sizeof(kBuf) + sizeof(struct iphdr)]; + ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf, sizeof(recv_buf))); + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), kBuf, sizeof(kBuf)), 0); +} + // Bind to localhost, then send and receive packets. -TEST_F(RawSocketTest, BindSendAndReceive) { +TEST_P(RawSocketTest, BindSendAndReceive) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT( bind(s_, reinterpret_cast(&addr_), sizeof(addr_)), SyscallSucceeds()); - // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID. - // None of that should matter for raw sockets - the kernel should still give - // us the packet. - struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.checksum = 0; - icmp.un.echo.sequence = 2004; - icmp.un.echo.id = 2007; - icmp.checksum = Checksum(&icmp); - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); - - ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); + // Arbitrary. 
+ constexpr char kBuf[] = "DR16"; + ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf))); + + // Receive the packet and make sure it's identical. + char recv_buf[sizeof(kBuf) + sizeof(struct iphdr)]; + ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf, sizeof(recv_buf))); + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), kBuf, sizeof(kBuf)), 0); } // Bind and connect to localhost and send/receive packets. -TEST_F(RawSocketTest, BindConnectSendAndReceive) { +TEST_P(RawSocketTest, BindConnectSendAndReceive) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); ASSERT_THAT( @@ -487,132 +342,44 @@ TEST_F(RawSocketTest, BindConnectSendAndReceive) { connect(s_, reinterpret_cast(&addr_), sizeof(addr_)), SyscallSucceeds()); - // Prepare and send an ICMP packet. Use arbitrary junk for sequence - // and ID. None of that should matter for raw sockets - the kernel should - // still give us the packet. - struct icmphdr icmp; - icmp.type = ICMP_ECHO; - icmp.code = 0; - icmp.checksum = 0; - icmp.un.echo.sequence = 2010; - icmp.un.echo.id = 7; - icmp.checksum = Checksum(&icmp); - ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp)); - - ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); -} + // Arbitrary. + constexpr char kBuf[] = "DG88"; + ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf))); -void RawSocketTest::ExpectICMPSuccess(const struct icmphdr& icmp) { - // We're going to receive both the echo request and reply, but the order is - // indeterminate. - char recv_buf[sizeof(icmp) + sizeof(struct iphdr)]; - struct sockaddr_in src; - bool received_request = false; - bool received_reply = false; - - for (int i = 0; i < 2; i++) { - // Receive the packet. 
- ASSERT_NO_FATAL_FAILURE(ReceiveICMP(recv_buf, ABSL_ARRAYSIZE(recv_buf), - sizeof(struct icmphdr), &src)); - EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0); - struct icmphdr* recvd_icmp = - reinterpret_cast(recv_buf + sizeof(struct iphdr)); - switch (recvd_icmp->type) { - case ICMP_ECHO: - EXPECT_FALSE(received_request); - received_request = true; - // The packet should be identical to what we sent. - EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)), - 0); - break; - - case ICMP_ECHOREPLY: - EXPECT_FALSE(received_reply); - received_reply = true; - // Most fields should be the same. - EXPECT_EQ(recvd_icmp->code, icmp.code); - EXPECT_EQ(recvd_icmp->un.echo.sequence, icmp.un.echo.sequence); - EXPECT_EQ(recvd_icmp->un.echo.id, icmp.un.echo.id); - // A couple are different. - EXPECT_EQ(recvd_icmp->type, ICMP_ECHOREPLY); - // The checksum is computed in such a way that it is guaranteed to have - // changed. - EXPECT_NE(recvd_icmp->checksum, icmp.checksum); - break; - } - } - - ASSERT_TRUE(received_request); - ASSERT_TRUE(received_reply); + // Receive the packet and make sure it's identical. + char recv_buf[sizeof(kBuf) + sizeof(struct iphdr)]; + ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf, sizeof(recv_buf))); + EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), kBuf, sizeof(kBuf)), 0); } -void RawSocketTest::SendEmptyICMP(const struct icmphdr& icmp) { - ASSERT_NO_FATAL_FAILURE(SendEmptyICMPTo(s_, &addr_, icmp)); +void RawSocketTest::SendBuf(const char* buf, int buf_len) { + ASSERT_NO_FATAL_FAILURE(SendBufTo(s_, addr_, buf, buf_len)); } -void RawSocketTest::SendEmptyICMPTo(int sock, struct sockaddr_in* addr, - const struct icmphdr& icmp) { - // It's safe to use const_cast here because sendmsg won't modify the iovec. +void RawSocketTest::SendBufTo(int sock, const struct sockaddr_in& addr, + const char* buf, int buf_len) { + // It's safe to use const_cast here because sendmsg won't modify the iovec or + // address. 
struct iovec iov = {}; - iov.iov_base = static_cast(const_cast(&icmp)); - iov.iov_len = sizeof(icmp); + iov.iov_base = static_cast(const_cast(buf)); + iov.iov_len = static_cast(buf_len); struct msghdr msg = {}; - msg.msg_name = addr; - msg.msg_namelen = sizeof(*addr); + msg.msg_name = static_cast(const_cast(&addr)); + msg.msg_namelen = sizeof(addr); msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; - ASSERT_THAT(sendmsg(sock, &msg, 0), SyscallSucceedsWithValue(sizeof(icmp))); -} - -unsigned short RawSocketTest::Checksum(struct icmphdr* icmp) { - unsigned int total = 0; - unsigned short* num = reinterpret_cast(icmp); - - // This is just the ICMP header, so there's an even number of bytes. - for (unsigned int i = 0; i < sizeof(*icmp); i += sizeof(*num)) { - total += *num; - num++; - } - - // Combine the upper and lower 16 bits. This happens twice in case the first - // combination causes a carry. - unsigned short upper = total >> 16; - unsigned short lower = total & 0xffff; - total = upper + lower; - upper = total >> 16; - lower = total & 0xffff; - total = upper + lower; - - return ~total; + ASSERT_THAT(sendmsg(sock, &msg, 0), SyscallSucceedsWithValue(buf_len)); } -void RawSocketTest::ReceiveICMP(char* recv_buf, size_t recv_buf_len, - size_t expected_size, struct sockaddr_in* src) { - ASSERT_NO_FATAL_FAILURE( - ReceiveICMPFrom(recv_buf, recv_buf_len, expected_size, src, s_)); +void RawSocketTest::ReceiveBuf(char* recv_buf, size_t recv_buf_len) { + ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, recv_buf_len)); } -void RawSocketTest::ReceiveICMPFrom(char* recv_buf, size_t recv_buf_len, - size_t expected_size, - struct sockaddr_in* src, int sock) { - struct iovec iov = {}; - iov.iov_base = recv_buf; - iov.iov_len = recv_buf_len; - struct msghdr msg = {}; - msg.msg_name = src; - msg.msg_namelen = sizeof(*src); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = NULL; - msg.msg_controllen = 0; - 
msg.msg_flags = 0; - // We should receive the ICMP packet plus 20 bytes of IP header. - ASSERT_THAT(recvmsg(sock, &msg, 0), - SyscallSucceedsWithValue(expected_size + sizeof(struct iphdr))); -} +INSTANTIATE_TEST_SUITE_P(AllInetTests, RawSocketTest, + ::testing::Values(IPPROTO_TCP, IPPROTO_UDP)); } // namespace -- cgit v1.2.3 From 711290a7f6c434ddbfe401e46002afd30df26aa5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 22 May 2019 15:53:13 -0700 Subject: Add support for wait(WNOTHREAD) PiperOrigin-RevId: 249537694 Change-Id: Iaa4bca73a2d8341e03064d59a2eb490afc3f80da --- pkg/sentry/kernel/task_exit.go | 164 ++++++++++++++++------------ pkg/sentry/syscalls/linux/sys_thread.go | 10 +- test/syscalls/linux/BUILD | 2 + test/syscalls/linux/wait.cc | 185 ++++++++++++++++++++++---------- 4 files changed, 236 insertions(+), 125 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 6e9701b01..2e1e46582 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -782,6 +782,10 @@ type WaitOptions struct { // for. CloneTasks bool + // If SiblingChildren is true, events from children tasks of any task + // in the thread group of the waiter are eligible to be waited for. + SiblingChildren bool + // Events is a bitwise combination of the events defined above that specify // what events are of interest to the call to Wait. Events waiter.EventMask @@ -869,87 +873,109 @@ func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() - // Without the (unimplemented) __WNOTHREAD flag, a task can wait on the - // children and tracees of any task in the same thread group. 
- for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { - for child := range parent.children { - if !opts.matchesTask(child, parent.tg.pidns) { - continue - } - // Non-leaders don't notify parents on exit and aren't eligible to - // be waited on. - if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { - anyWaitableTasks = true - if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { - return wr, nil - } - } - // Check for group stops and continues. Tasks that have passed - // TaskExitInitiated can no longer participate in group stops. - if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { - continue - } - if child.exitState >= TaskExitInitiated { - continue - } - // If the waiter is in the same thread group as the task's - // tracer, do not report its group stops; they will be reported - // as ptrace stops instead. This also skips checking for group - // continues, but they'll be checked for when scanning tracees - // below. (Per kernel/exit.c:wait_consider_task(): "If a - // ptracer wants to distinguish the two events for its own - // children, it should create a separate process which takes - // the role of real parent.") - if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { - continue + if opts.SiblingChildren { + // We can wait on the children and tracees of any task in the + // same thread group. + for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { + wr, any := t.waitParentLocked(opts, parent) + if wr != nil { + return wr, nil } + anyWaitableTasks = anyWaitableTasks || any + } + } else { + // We can only wait on this task. + var wr *WaitResult + wr, anyWaitableTasks = t.waitParentLocked(opts, t) + if wr != nil { + return wr, nil + } + } + + if anyWaitableTasks { + return nil, ErrNoWaitableEvent + } + return nil, syserror.ECHILD +} + +// Preconditions: The TaskSet mutex must be locked for writing. 
+func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) { + anyWaitableTasks := false + + for child := range parent.children { + if !opts.matchesTask(child, parent.tg.pidns) { + continue + } + // Non-leaders don't notify parents on exit and aren't eligible to + // be waited on. + if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { anyWaitableTasks = true - if opts.Events&EventChildGroupStop != 0 { - if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { - return wr, nil - } - } - if opts.Events&EventGroupContinue != 0 { - if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { - return wr, nil - } + if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { + return wr, anyWaitableTasks } } - for tracee := range parent.ptraceTracees { - if !opts.matchesTask(tracee, parent.tg.pidns) { - continue - } - // Non-leaders do notify tracers on exit. - if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { - anyWaitableTasks = true - if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { - return wr, nil - } - } - if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { - continue + // Check for group stops and continues. Tasks that have passed + // TaskExitInitiated can no longer participate in group stops. + if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { + continue + } + if child.exitState >= TaskExitInitiated { + continue + } + // If the waiter is in the same thread group as the task's + // tracer, do not report its group stops; they will be reported + // as ptrace stops instead. This also skips checking for group + // continues, but they'll be checked for when scanning tracees + // below. 
(Per kernel/exit.c:wait_consider_task(): "If a + // ptracer wants to distinguish the two events for its own + // children, it should create a separate process which takes + // the role of real parent.") + if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { + continue + } + anyWaitableTasks = true + if opts.Events&EventChildGroupStop != 0 { + if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { + return wr, anyWaitableTasks } - if tracee.exitState >= TaskExitInitiated { - continue + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { + return wr, anyWaitableTasks } + } + } + for tracee := range parent.ptraceTracees { + if !opts.matchesTask(tracee, parent.tg.pidns) { + continue + } + // Non-leaders do notify tracers on exit. + if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { anyWaitableTasks = true - if opts.Events&EventTraceeStop != 0 { - if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { - return wr, nil - } + if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { + return wr, anyWaitableTasks } - if opts.Events&EventGroupContinue != 0 { - if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { - return wr, nil - } + } + if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { + continue + } + if tracee.exitState >= TaskExitInitiated { + continue + } + anyWaitableTasks = true + if opts.Events&EventTraceeStop != 0 { + if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks } } } - if anyWaitableTasks { - return nil, ErrNoWaitableEvent - } - return nil, syserror.ECHILD + return nil, anyWaitableTasks } // Preconditions: The TaskSet mutex must be locked for writing. 
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index cc441460c..14fa7ef92 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -183,7 +183,7 @@ func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // wait4 waits for the given child process to exit. func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { - if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WALL|linux.WCLONE) != 0 { + if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { return 0, syscall.EINVAL } wopts := kernel.WaitOptions{ @@ -227,6 +227,9 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage if options&linux.WNOHANG == 0 { wopts.BlockInterruptErr = kernel.ERESTARTSYS } + if options&linux.WNOTHREAD == 0 { + wopts.SiblingChildren = true + } wr, err := t.Wait(&wopts) if err != nil { @@ -278,7 +281,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal options := int(args[3].Uint()) rusageAddr := args[4].Pointer() - if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT) != 0 { + if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD) != 0 { return 0, nil, syscall.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { @@ -310,6 +313,9 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if options&linux.WNOHANG == 0 { wopts.BlockInterruptErr = kernel.ERESTARTSYS } + if options&linux.WNOTHREAD == 0 { + wopts.SiblingChildren = true + } wr, err := t.Wait(&wopts) if err != nil { diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index e8caf31fc..014679ec5 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3179,7 +3179,9 @@ 
cc_binary( "//test/util:signal_util", "//test/util:test_main", "//test/util:test_util", + "//test/util:thread_util", "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", "@com_google_googletest//:gtest", ], diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc index 50d0725a7..da3b97828 100644 --- a/test/syscalls/linux/wait.cc +++ b/test/syscalls/linux/wait.cc @@ -21,11 +21,13 @@ #include #include +#include #include #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/strings/str_cat.h" +#include "absl/synchronization/mutex.h" #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/util/cleanup.h" @@ -34,6 +36,7 @@ #include "test/util/posix_error.h" #include "test/util/signal_util.h" #include "test/util/test_util.h" +#include "test/util/thread_util.h" using ::testing::UnorderedElementsAre; @@ -42,10 +45,8 @@ using ::testing::UnorderedElementsAre; // // NOTE(b/22640830,b/27680907,b/29049891): Some functionality is not tested as // it is not currently supported by gVisor: -// * UID in waitid(2) siginfo. // * Process groups. // * Core dump status (WCOREDUMP). -// * Linux only option __WNOTHREAD. // // Tests for waiting on stopped/continued children are in sigstop.cc. @@ -357,13 +358,22 @@ INSTANTIATE_TEST_SUITE_P( return static_cast(si.si_pid); })); -// Fixture for tests parameterized by a function that takes the PID of a -// specific child to wait for, waits for it to exit, and checks that it exits -// with the given code. +// Fixture for tests parameterized by a (sysno, function) tuple. The function +// takes the PID of a specific child to wait for, waits for it to exit, and +// checks that it exits with the given code. 
class WaitSpecificChildTest - : public ::testing::TestWithParam> { + : public ::testing::TestWithParam< + std::tuple>> { protected: - PosixError WaitFor(pid_t pid, int code) { return GetParam()(pid, code); } + int Sysno() { return std::get<0>(GetParam()); } + + PosixError WaitForWithOptions(pid_t pid, int options, int code) { + return std::get<1>(GetParam())(pid, options, code); + } + + PosixError WaitFor(pid_t pid, int code) { + return std::get<1>(GetParam())(pid, 0, code); + } }; // Wait for specific child to exit. @@ -432,6 +442,75 @@ TEST_P(WaitSpecificChildTest, AfterExit) { EXPECT_NO_ERRNO(WaitFor(child, 0)); } +// Wait for child of sibling thread. +TEST_P(WaitSpecificChildTest, SiblingChildren) { + absl::Mutex mu; + pid_t child; + bool ready = false; + bool stop = false; + + ScopedThread t([&] { + absl::MutexLock ml(&mu); + EXPECT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + ready = true; + mu.Await(absl::Condition(&stop)); + }); + + // N.B. This must be declared after ScopedThread, so it is destructed first, + // thus waking the thread. + absl::MutexLock ml(&mu); + mu.Await(absl::Condition(&ready)); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); + + // Keep the sibling alive until after we've waited so the child isn't + // reparented. + stop = true; +} + +// Waiting for child of sibling thread not allowed with WNOTHREAD. +TEST_P(WaitSpecificChildTest, SiblingChildrenWNOTHREAD) { + // Linux added WNOTHREAD support to waitid(2) in + // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to + // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7. + // + // Skip the test if it isn't supported yet. 
+ if (Sysno() == SYS_waitid) { + int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WNOTHREAD); + SKIP_IF(ret < 0 && errno == EINVAL); + } + + absl::Mutex mu; + pid_t child; + bool ready = false; + bool stop = false; + + ScopedThread t([&] { + absl::MutexLock ml(&mu); + EXPECT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + ready = true; + mu.Await(absl::Condition(&stop)); + + // This thread can wait on child. + EXPECT_NO_ERRNO(WaitForWithOptions(child, __WNOTHREAD, 0)); + }); + + // N.B. This must be declared after ScopedThread, so it is destructed first, + // thus waking the thread. + absl::MutexLock ml(&mu); + mu.Await(absl::Condition(&ready)); + + // This thread can't wait on child. + EXPECT_THAT( + WaitForWithOptions(child, __WNOTHREAD, 0), + PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), + ::testing::StrEq("wait4")))); + + // Keep the sibling alive until after we've waited so the child isn't + // reparented. + stop = true; +} + // Wait for specific child to exit. // A non-CLONE_THREAD child which sends SIGCHLD upon exit behaves much like // a forked process. 
@@ -551,55 +630,53 @@ TEST_P(WaitSpecificChildTest, AfterChildExecve) { EXPECT_NO_ERRNO(WaitFor(child, 0)); } -INSTANTIATE_TEST_SUITE_P( - Waiters, WaitSpecificChildTest, - ::testing::Values( - [](pid_t pid, int code) -> PosixError { - int status; - auto const rv = Wait4(pid, &status, 0, nullptr); - MaybeSave(); - if (rv < 0) { - return PosixError(errno, "wait4"); - } else if (rv != pid) { - return PosixError(EINVAL, absl::StrCat("unexpected pid: got ", rv, - ", wanted ", pid)); - } - if (!WIFEXITED(status) || WEXITSTATUS(status) != code) { - return PosixError( - EINVAL, absl::StrCat("unexpected wait status: got ", status, - ", wanted ", code)); - } - return NoError(); - }, - [](pid_t pid, int code) -> PosixError { - siginfo_t si; - auto const rv = Waitid(P_PID, pid, &si, WEXITED); - MaybeSave(); - if (rv < 0) { - return PosixError(errno, "waitid"); - } - if (si.si_pid != pid) { - return PosixError(EINVAL, - absl::StrCat("unexpected pid: got ", si.si_pid, +PosixError CheckWait4(pid_t pid, int options, int code) { + int status; + auto const rv = Wait4(pid, &status, options, nullptr); + MaybeSave(); + if (rv < 0) { + return PosixError(errno, "wait4"); + } else if (rv != pid) { + return PosixError( + EINVAL, absl::StrCat("unexpected pid: got ", rv, ", wanted ", pid)); + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != code) { + return PosixError(EINVAL, absl::StrCat("unexpected wait status: got ", + status, ", wanted ", code)); + } + return NoError(); +}; + +PosixError CheckWaitid(pid_t pid, int options, int code) { + siginfo_t si; + auto const rv = Waitid(P_PID, pid, &si, options | WEXITED); + MaybeSave(); + if (rv < 0) { + return PosixError(errno, "waitid"); + } + if (si.si_pid != pid) { + return PosixError(EINVAL, absl::StrCat("unexpected pid: got ", si.si_pid, ", wanted ", pid)); - } - if (si.si_signo != SIGCHLD) { - return PosixError( - EINVAL, absl::StrCat("unexpected signo: got ", si.si_signo, - ", wanted ", SIGCHLD)); - } - if (si.si_status != code) { - 
return PosixError( - EINVAL, absl::StrCat("unexpected status: got ", si.si_status, - ", wanted ", code)); - } - if (si.si_code != CLD_EXITED) { - return PosixError(EINVAL, - absl::StrCat("unexpected code: got ", si.si_code, + } + if (si.si_signo != SIGCHLD) { + return PosixError(EINVAL, absl::StrCat("unexpected signo: got ", + si.si_signo, ", wanted ", SIGCHLD)); + } + if (si.si_status != code) { + return PosixError(EINVAL, absl::StrCat("unexpected status: got ", + si.si_status, ", wanted ", code)); + } + if (si.si_code != CLD_EXITED) { + return PosixError(EINVAL, absl::StrCat("unexpected code: got ", si.si_code, ", wanted ", CLD_EXITED)); - } - return NoError(); - })); + } + return NoError(); +} + +INSTANTIATE_TEST_SUITE_P( + Waiters, WaitSpecificChildTest, + ::testing::Values(std::make_tuple(SYS_wait4, CheckWait4), + std::make_tuple(SYS_waitid, CheckWaitid))); // WIFEXITED, WIFSIGNALED, WTERMSIG indicate signal exit. TEST(WaitTest, SignalExit) { -- cgit v1.2.3 From 21915eb58b875809b60c0a43e53a97ea0560c299 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 22 May 2019 16:59:21 -0700 Subject: Remove obsolete TODO. There no obvious reason to require that BlockSize and StatFS are MountSource operations. Today they are in INodeOperations, and they can be moved elsewhere in the future as part of a normal refactor process. PiperOrigin-RevId: 249549982 Change-Id: Ib832e02faeaf8253674475df4e385bcc53d780f3 --- pkg/sentry/fs/mount.go | 4 ---- 1 file changed, 4 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 9740f1fc6..63fcf4380 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -42,10 +42,6 @@ type DirentOperations interface { // MountSourceOperations contains filesystem specific operations. type MountSourceOperations interface { - // TODO(b/67778729): Add: - // BlockSize() int64 - // FS() Filesystem - // DirentOperations provide optional extra management of Dirents. 
DirentOperations -- cgit v1.2.3 From f65dfec09650768626a9af916b0487afa557a930 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 22 May 2019 18:10:54 -0700 Subject: Add WCLONE / WALL support to waitid The previous commit adds WNOTHREAD support to waitid, so we may as well complete the upstream change. Linux added WCLONE, WALL, WNOTHREAD support to waitid(2) in 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7. PiperOrigin-RevId: 249560587 Change-Id: Iff177b0848a3f7bae6cb5592e44500c5a942fbeb --- pkg/sentry/syscalls/linux/sys_thread.go | 67 ++++++++--------- test/syscalls/linux/wait.cc | 124 +++++++++++++++++++++----------- 2 files changed, 117 insertions(+), 74 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 14fa7ef92..26f7e8ead 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -181,6 +181,32 @@ func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return clone(t, syscall.CLONE_VM|syscall.CLONE_VFORK|int(syscall.SIGCHLD), 0, 0, 0, 0) } +// parseCommonWaitOptions applies the options common to wait4 and waitid to +// wopts. +func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { + switch options & (linux.WCLONE | linux.WALL) { + case 0: + wopts.NonCloneTasks = true + case linux.WCLONE: + wopts.CloneTasks = true + case linux.WALL: + wopts.NonCloneTasks = true + wopts.CloneTasks = true + default: + return syscall.EINVAL + } + if options&linux.WCONTINUED != 0 { + wopts.Events |= kernel.EventGroupContinue + } + if options&linux.WNOHANG == 0 { + wopts.BlockInterruptErr = kernel.ERESTARTSYS + } + if options&linux.WNOTHREAD == 0 { + wopts.SiblingChildren = true + } + return nil +} + // wait4 waits for the given child process to exit. 
func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { @@ -207,29 +233,12 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage wopts.SpecificTID = kernel.ThreadID(pid) } - switch options & (linux.WCLONE | linux.WALL) { - case 0: - wopts.NonCloneTasks = true - case linux.WCLONE: - wopts.CloneTasks = true - case linux.WALL: - wopts.NonCloneTasks = true - wopts.CloneTasks = true - default: - return 0, syscall.EINVAL + if err := parseCommonWaitOptions(&wopts, options); err != nil { + return 0, err } if options&linux.WUNTRACED != 0 { wopts.Events |= kernel.EventChildGroupStop } - if options&linux.WCONTINUED != 0 { - wopts.Events |= kernel.EventGroupContinue - } - if options&linux.WNOHANG == 0 { - wopts.BlockInterruptErr = kernel.ERESTARTSYS - } - if options&linux.WNOTHREAD == 0 { - wopts.SiblingChildren = true - } wr, err := t.Wait(&wopts) if err != nil { @@ -281,16 +290,15 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal options := int(args[3].Uint()) rusageAddr := args[4].Pointer() - if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD) != 0 { + if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { return 0, nil, syscall.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { return 0, nil, syscall.EINVAL } wopts := kernel.WaitOptions{ - NonCloneTasks: true, - Events: kernel.EventTraceeStop, - ConsumeEvent: options&linux.WNOWAIT == 0, + Events: kernel.EventTraceeStop, + ConsumeEvent: options&linux.WNOWAIT == 0, } switch idtype { case linux.P_ALL: @@ -301,21 +309,16 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal default: return 0, nil, syscall.EINVAL } 
+ + if err := parseCommonWaitOptions(&wopts, options); err != nil { + return 0, nil, err + } if options&linux.WEXITED != 0 { wopts.Events |= kernel.EventExit } if options&linux.WSTOPPED != 0 { wopts.Events |= kernel.EventChildGroupStop } - if options&linux.WCONTINUED != 0 { - wopts.Events |= kernel.EventGroupContinue - } - if options&linux.WNOHANG == 0 { - wopts.BlockInterruptErr = kernel.ERESTARTSYS - } - if options&linux.WNOTHREAD == 0 { - wopts.SiblingChildren = true - } wr, err := t.Wait(&wopts) if err != nil { diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc index da3b97828..f413ee6ae 100644 --- a/test/syscalls/linux/wait.cc +++ b/test/syscalls/linux/wait.cc @@ -233,18 +233,14 @@ TEST_P(WaitAnyChildTest, ForkAndClone) { // Return immediately if no child has exited. TEST_P(WaitAnyChildTest, WaitWNOHANG) { - EXPECT_THAT( - WaitAnyWithOptions(0, WNOHANG), - PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), - ::testing::StrEq("wait4")))); + EXPECT_THAT(WaitAnyWithOptions(0, WNOHANG), + PosixErrorIs(ECHILD, ::testing::_)); } // Bad options passed TEST_P(WaitAnyChildTest, BadOption) { - EXPECT_THAT( - WaitAnyWithOptions(0, 123456), - PosixErrorIs(EINVAL, ::testing::AnyOf(::testing::StrEq("waitid"), - ::testing::StrEq("wait4")))); + EXPECT_THAT(WaitAnyWithOptions(0, 123456), + PosixErrorIs(EINVAL, ::testing::_)); } TEST_P(WaitAnyChildTest, WaitedChildRusage) { @@ -295,9 +291,7 @@ TEST_P(WaitAnyChildTest, IgnoredChildRusage) { pid_t child; ASSERT_THAT(child = ForkSpinAndExit(0, absl::ToInt64Seconds(kSpin)), SyscallSucceeds()); - ASSERT_THAT(WaitAny(0), PosixErrorIs(ECHILD, ::testing::AnyOf( - ::testing::StrEq("waitid"), - ::testing::StrEq("wait4")))); + ASSERT_THAT(WaitAny(0), PosixErrorIs(ECHILD, ::testing::_)); const absl::Duration end = absl::Nanoseconds(clock_gettime_nsecs(CLOCK_MONOTONIC)); EXPECT_GE(end - start, kSpin - kSpinGrace); @@ -501,10 +495,8 @@ TEST_P(WaitSpecificChildTest, SiblingChildrenWNOTHREAD) { 
mu.Await(absl::Condition(&ready)); // This thread can't wait on child. - EXPECT_THAT( - WaitForWithOptions(child, __WNOTHREAD, 0), - PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), - ::testing::StrEq("wait4")))); + EXPECT_THAT(WaitForWithOptions(child, __WNOTHREAD, 0), + PosixErrorIs(ECHILD, ::testing::_)); // Keep the sibling alive until after we've waited so the child isn't // reparented. @@ -538,10 +530,7 @@ TEST_P(WaitSpecificChildTest, CloneNoSIGCHLD) { int child; ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds()); - EXPECT_THAT( - WaitFor(child, 0), - PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), - ::testing::StrEq("wait4")))); + EXPECT_THAT(WaitFor(child, 0), PosixErrorIs(ECHILD, ::testing::_)); } // Waiting after the child has already exited returns immediately. @@ -571,10 +560,7 @@ TEST_P(WaitSpecificChildTest, CloneThread) { ASSERT_THAT(child = CloneAndExit(15, stack, CLONE_THREAD), SyscallSucceeds()); auto start = absl::Now(); - EXPECT_THAT( - WaitFor(child, 0), - PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), - ::testing::StrEq("wait4")))); + EXPECT_THAT(WaitFor(child, 0), PosixErrorIs(ECHILD, ::testing::_)); // Ensure wait4 didn't block. EXPECT_LE(absl::Now() - start, absl::Seconds(10)); @@ -584,12 +570,81 @@ TEST_P(WaitSpecificChildTest, CloneThread) { absl::SleepFor(absl::Seconds(5)); } +// A child that does not send a SIGCHLD on exit may be waited on with +// the __WCLONE flag. +TEST_P(WaitSpecificChildTest, CloneWCLONE) { + // Linux added WCLONE support to waitid(2) in + // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to + // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7. + // + // Skip the test if it isn't supported yet. 
+ if (Sysno() == SYS_waitid) { + int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WCLONE); + SKIP_IF(ret < 0 && errno == EINVAL); + } + + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = + Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + int child; + ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitForWithOptions(child, __WCLONE, 0)); +} + +// A forked child cannot be waited on with WCLONE. +TEST_P(WaitSpecificChildTest, ForkWCLONE) { + // Linux added WCLONE support to waitid(2) in + // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to + // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7. + // + // Skip the test if it isn't supported yet. + if (Sysno() == SYS_waitid) { + int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WCLONE); + SKIP_IF(ret < 0 && errno == EINVAL); + } + + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + + EXPECT_THAT(WaitForWithOptions(child, WNOHANG | __WCLONE, 0), + PosixErrorIs(ECHILD, ::testing::_)); + + EXPECT_NO_ERRNO(WaitFor(child, 0)); +} + +// Any type of child can be waited on with WALL. +TEST_P(WaitSpecificChildTest, WALL) { + // Linux added WALL support to waitid(2) in + // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to + // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7. + // + // Skip the test if it isn't supported yet. 
+ if (Sysno() == SYS_waitid) { + int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WALL); + SKIP_IF(ret < 0 && errno == EINVAL); + } + + pid_t child; + ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitForWithOptions(child, __WALL, 0)); + + uintptr_t stack; + ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); + auto free = + Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); + + ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds()); + + EXPECT_NO_ERRNO(WaitForWithOptions(child, __WALL, 0)); +} + // Return ECHILD for bad child. TEST_P(WaitSpecificChildTest, BadChild) { - EXPECT_THAT( - WaitFor(42, 0), - PosixErrorIs(ECHILD, ::testing::AnyOf(::testing::StrEq("waitid"), - ::testing::StrEq("wait4")))); + EXPECT_THAT(WaitFor(42, 0), PosixErrorIs(ECHILD, ::testing::_)); } // Wait for a child process that only exits after calling execve(2) from a @@ -694,21 +749,6 @@ TEST(WaitTest, SignalExit) { EXPECT_EQ(SIGKILL, WTERMSIG(status)); } -// A child that does not send a SIGCHLD on exit may be waited on with -// the __WCLONE flag. -TEST(WaitTest, CloneWCLONE) { - uintptr_t stack; - ASSERT_THAT(stack = AllocStack(), SyscallSucceeds()); - auto free = - Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); }); - - int child; - ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds()); - - EXPECT_THAT(Wait4(child, nullptr, __WCLONE, nullptr), - SyscallSucceedsWithValue(child)); -} - // waitid requires at least one option. TEST(WaitTest, WaitidOptions) { EXPECT_THAT(Waitid(P_ALL, 0, nullptr, 0), SyscallFailsWithErrno(EINVAL)); -- cgit v1.2.3 From 79738d3958a027bcf449cf1bd608f3adec42b72c Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 22 May 2019 18:18:01 -0700 Subject: Log unhandled faults only at DEBUG level. 
PiperOrigin-RevId: 249561399 Change-Id: Ic73c68c8538bdca53068f38f82b7260939addac2 --- pkg/sentry/kernel/task_run.go | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 4549b437e..a79101a18 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -26,7 +26,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" - "gvisor.googlesource.com/gvisor/pkg/syserror" ) // A taskRunState is a reified state in the task state machine. See README.md @@ -267,13 +266,8 @@ func (*runApp) execute(t *Task) taskRunState { } } - // The JVM will trigger these errors constantly, so don't - // spam logs with this error. - if err == syserror.EFAULT || err == syserror.EPERM { - t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) - } else { - t.Warningf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) - } + // Faults are common, log only at debug level. + t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) t.DebugDumpState() // Continue to signal handling. -- cgit v1.2.3 From 9006304dfecf3670ad03c9629f9a4ac3273c386a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 23 May 2019 04:15:18 -0700 Subject: Initial support for bind mounts Separate MountSource from Mount. This is needed to allow mounts to be shared by multiple containers within the same pod. 
PiperOrigin-RevId: 249617810 Change-Id: Id2944feb7e4194951f355cbe6d4944ae3c02e468 --- pkg/sentry/fs/mock.go | 1 - pkg/sentry/fs/mount.go | 73 +----------- pkg/sentry/fs/mount_test.go | 167 ++++++++++++++++++---------- pkg/sentry/fs/mounts.go | 258 +++++++++++++++++++++++++++++-------------- pkg/sentry/fs/proc/mounts.go | 48 ++++---- runsc/boot/fs.go | 16 +-- 6 files changed, 317 insertions(+), 246 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 064943c5b..ff04e9b22 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -62,7 +62,6 @@ func NewMockMountSource(cache *DirentCache) *MountSource { return &MountSource{ MountSourceOperations: &MockMountSourceOps{keep: keep}, fscache: cache, - children: make(map[*MountSource]struct{}), } } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 63fcf4380..41e0d285b 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -17,7 +17,6 @@ package fs import ( "bytes" "fmt" - "sync" "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/refs" @@ -89,15 +88,7 @@ func (i InodeMappings) String() string { // one mount source. Each file object may only be represented using one inode // object in a sentry instance. // -// This is an amalgamation of structs super_block, vfsmount, and mount, while -// MountSourceOperations is akin to struct super_operations. -// -// Hence, mount source also contains common mounted file system state, such as -// mount flags, the root Dirent, and children mounts. For now, this -// amalgamation implies that a mount source cannot be shared by multiple mounts -// (e.g. cannot be mounted at different locations). -// -// TODO(b/63601033): Move mount-specific information out of MountSource. +// TODO(b/63601033): Move Flags out of MountSource to Mount. // // +stateify savable type MountSource struct { @@ -128,22 +119,6 @@ type MountSource struct { // // direntRefs must be atomically changed. 
direntRefs uint64 - - // mu protects the fields below, which are set by the MountNamespace - // during MountSource/Unmount. - mu sync.Mutex `state:"nosave"` - - // id is a unique id for this mount. - id uint64 - - // root is the root Dirent of this mount. - root *Dirent - - // parent is the parent MountSource, or nil if this MountSource is the root. - parent *MountSource - - // children are the child MountSources of this MountSource. - children map[*MountSource]struct{} } // DefaultDirentCacheSize is the number of Dirents that the VFS can hold an @@ -162,53 +137,7 @@ func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags Mou Flags: flags, FilesystemType: fsType, fscache: NewDirentCache(DefaultDirentCacheSize), - children: make(map[*MountSource]struct{}), - } -} - -// Parent returns the parent mount, or nil if this mount is the root. -func (msrc *MountSource) Parent() *MountSource { - msrc.mu.Lock() - defer msrc.mu.Unlock() - return msrc.parent -} - -// ID returns the ID of this mount. -func (msrc *MountSource) ID() uint64 { - msrc.mu.Lock() - defer msrc.mu.Unlock() - return msrc.id -} - -// Children returns the (immediate) children of this MountSource. -func (msrc *MountSource) Children() []*MountSource { - msrc.mu.Lock() - defer msrc.mu.Unlock() - - ms := make([]*MountSource, 0, len(msrc.children)) - for c := range msrc.children { - ms = append(ms, c) } - return ms -} - -// Submounts returns all mounts that are descendants of this mount. -func (msrc *MountSource) Submounts() []*MountSource { - var ms []*MountSource - for _, c := range msrc.Children() { - ms = append(ms, c) - ms = append(ms, c.Submounts()...) - } - return ms -} - -// Root returns the root dirent of this mount. Callers must call DecRef on the -// returned dirent. -func (msrc *MountSource) Root() *Dirent { - msrc.mu.Lock() - defer msrc.mu.Unlock() - msrc.root.IncRef() - return msrc.root } // DirentRefs returns the current mount direntRefs. 
diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 9f7fbeff2..2e2716643 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -32,6 +32,27 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool { return false } +func mountPathsAre(root *Dirent, got []*Mount, want ...string) error { + gotPaths := make(map[string]struct{}, len(got)) + gotStr := make([]string, len(got)) + for i, g := range got { + groot := g.Root() + name, _ := groot.FullName(root) + groot.DecRef() + gotStr[i] = name + gotPaths[name] = struct{}{} + } + if len(got) != len(want) { + return fmt.Errorf("mount paths are different, got: %q, want: %q", gotStr, want) + } + for _, w := range want { + if _, ok := gotPaths[w]; !ok { + return fmt.Errorf("no mount with path %q found", w) + } + } + return nil +} + // TestMountSourceOnlyCachedOnce tests that a Dirent that is mounted over only ends // up in a single Dirent Cache. NOTE(b/63848693): Having a dirent in multiple // caches causes major consistency issues. @@ -91,8 +112,7 @@ func TestMountSourceOnlyCachedOnce(t *testing.T) { } } -// Test that mounts have proper parent/child relationships. -func TestMountSourceParentChildRelationship(t *testing.T) { +func TestAllMountsUnder(t *testing.T) { ctx := contexttest.Context(t) rootCache := NewDirentCache(100) @@ -122,101 +142,130 @@ func TestMountSourceParentChildRelationship(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", p, err) } + submountInode := NewMockInode(ctx, NewMockMountSource(nil), StableAttr{ Type: Directory, }) if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } + d.DecRef() } - // mm root should contain all submounts (and does not include the root - // mount). 
- allMountSources := rootDirent.Inode.MountSource.Submounts() - if err := mountPathsAre(rootDirent, allMountSources, paths...); err != nil { + // mm root should contain all submounts (and does not include the root mount). + rootMnt := mm.FindMount(rootDirent) + submounts := mm.AllMountsUnder(rootMnt) + allPaths := append(paths, "/") + if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { t.Error(err) } // Each mount should have a unique ID. foundIDs := make(map[uint64]struct{}) - for _, m := range allMountSources { - id := m.ID() - if _, ok := foundIDs[id]; ok { - t.Errorf("got multiple mounts with id %d", id) + for _, m := range submounts { + if _, ok := foundIDs[m.ID]; ok { + t.Errorf("got multiple mounts with id %d", m.ID) } - foundIDs[id] = struct{}{} + foundIDs[m.ID] = struct{}{} } // Root mount should have no parent. - rootMountSource := mm.root.Inode.MountSource - if p := rootMountSource.Parent(); p != nil { + if p := rootMnt.ParentID; p != invalidMountID { t.Errorf("root.Parent got %v wanted nil", p) } - // Root mount should have 2 children: foo and waldo. - rootChildren := rootMountSource.Children() - if err := mountPathsAre(rootDirent, rootChildren, "/foo", "/waldo"); err != nil { - t.Error(err) - } - // All root mount children should have root as parent. - for _, c := range rootChildren { - if p := c.Parent(); p != rootMountSource { - t.Errorf("root mount child got parent %+v, wanted root mount", p) - } - } - - // "foo" mount should have two children: /foo/bar, and /foo/qux. + // Check that "foo" mount has 3 children. 
maxTraversals = 0 d, err := mm.FindLink(ctx, rootDirent, nil, "/foo", &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) } - fooMountSource := d.Inode.MountSource - fooMountSourceChildren := fooMountSource.Children() - if err := mountPathsAre(rootDirent, fooMountSourceChildren, "/foo/bar", "/foo/qux"); err != nil { - t.Error(err) - } - // Each child should have fooMountSource as parent. - for _, c := range fooMountSourceChildren { - if p := c.Parent(); p != fooMountSource { - t.Errorf("foo mount child got parent %+v, wanted foo mount", p) - } - } - // Submounts of foo are /foo/bar, /foo/qux, and /foo/bar/baz. - if err := mountPathsAre(rootDirent, fooMountSource.Submounts(), "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { + defer d.DecRef() + submounts = mm.AllMountsUnder(mm.FindMount(d)) + if err := mountPathsAre(rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { t.Error(err) } - // "waldo" mount should have no submounts or children. + // "waldo" mount should have no children. 
maxTraversals = 0 waldo, err := mm.FindLink(ctx, rootDirent, nil, "/waldo", &maxTraversals) if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) } - waldoMountSource := waldo.Inode.MountSource - if got := len(waldoMountSource.Children()); got != 0 { - t.Errorf("waldo got %d children, wanted 0", got) - } - if got := len(waldoMountSource.Submounts()); got != 0 { - t.Errorf("waldo got %d children, wanted 0", got) + defer waldo.DecRef() + submounts = mm.AllMountsUnder(mm.FindMount(waldo)) + if err := mountPathsAre(rootDirent, submounts, "/waldo"); err != nil { + t.Error(err) } } -func mountPathsAre(root *Dirent, got []*MountSource, want ...string) error { - if len(got) != len(want) { - return fmt.Errorf("mount paths have different lengths: got %d want %d", len(got), len(want)) +func TestUnmount(t *testing.T) { + ctx := contexttest.Context(t) + + rootCache := NewDirentCache(100) + rootInode := NewMockInode(ctx, NewMockMountSource(rootCache), StableAttr{ + Type: Directory, + }) + mm, err := NewMountNamespace(ctx, rootInode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) } - gotPaths := make(map[string]struct{}, len(got)) - for _, g := range got { - groot := g.Root() - n, _ := groot.FullName(root) - groot.DecRef() - gotPaths[n] = struct{}{} + rootDirent := mm.Root() + defer rootDirent.DecRef() + + // Add mounts at the following paths: + paths := []string{ + "/foo", + "/foo/bar", + "/foo/bar/goo", + "/foo/bar/goo/abc", + "/foo/abc", + "/foo/def", + "/waldo", + "/wally", } - for _, w := range want { - if _, ok := gotPaths[w]; !ok { - return fmt.Errorf("no mount with path %q found", w) + + var maxTraversals uint + for _, p := range paths { + maxTraversals = 0 + d, err := mm.FindLink(ctx, rootDirent, nil, p, &maxTraversals) + if err != nil { + t.Fatalf("could not find path %q in mount manager: %v", p, err) + } + + submountInode := NewMockInode(ctx, NewMockMountSource(nil), StableAttr{ + Type: Directory, + }) + if err := 
mm.Mount(ctx, d, submountInode); err != nil { + t.Fatalf("could not mount at %q: %v", p, err) + } + d.DecRef() + } + + allPaths := make([]string, len(paths)+1) + allPaths[0] = "/" + copy(allPaths[1:], paths) + + rootMnt := mm.FindMount(rootDirent) + for i := len(paths) - 1; i >= 0; i-- { + maxTraversals = 0 + p := paths[i] + d, err := mm.FindLink(ctx, rootDirent, nil, p, &maxTraversals) + if err != nil { + t.Fatalf("could not find path %q in mount manager: %v", p, err) + } + + if err := mm.Unmount(ctx, d, false); err != nil { + t.Fatalf("could not unmount at %q: %v", p, err) + } + d.DecRef() + + // Remove the path that has been unmounted and the check that the remaining + // mounts are still there. + allPaths = allPaths[:len(allPaths)-1] + submounts := mm.AllMountsUnder(rootMnt) + if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + t.Error(err) } } - return nil } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 01eb4607e..a5c52d7ba 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -16,6 +16,7 @@ package fs import ( "fmt" + "math" "path" "strings" "sync" @@ -35,6 +36,94 @@ import ( // sane. const DefaultTraversalLimit = 10 +const invalidMountID = math.MaxUint64 + +// Mount represents a mount in the file system. It holds the root dirent for the +// mount. It also points back to the dirent or mount where it was mounted over, +// so that it can be restored when unmounted. The chained mount can be either: +// - Mount: when it's mounted on top of another mount point. +// - Dirent: when it's mounted on top of a dirent. In this case the mount is +// called an "undo" mount and only 'root' is set. All other fields are +// either invalid or nil. +// +// +stateify savable +type Mount struct { + // ID is a unique id for this mount. It may be invalidMountID if this is + // used to cache a dirent that was mounted over. + ID uint64 + + // ParentID is the parent's mount unique id. 
It may be invalidMountID if this + // is the root mount or if this is used to cache a dirent that was mounted + // over. + ParentID uint64 + + // root is the root Dirent of this mount. A reference on this Dirent must be + // held through the lifetime of the Mount which contains it. + root *Dirent + + // previous is the existing dirent or mount that this object was mounted over. + // It's nil for the root mount and for the last entry in the chain (always an + // "undo" mount). + previous *Mount +} + +// newMount creates a new mount, taking a reference on 'root'. Caller must +// release the reference when it's done with the mount. +func newMount(id, pid uint64, root *Dirent) *Mount { + root.IncRef() + return &Mount{ + ID: id, + ParentID: pid, + root: root, + } +} + +// newRootMount creates a new root mount (no parent), taking a reference on +// 'root'. Caller must release the reference when it's done with the mount. +func newRootMount(id uint64, root *Dirent) *Mount { + root.IncRef() + return &Mount{ + ID: id, + ParentID: invalidMountID, + root: root, + } +} + +// newUndoMount creates a new undo mount, taking a reference on 'd'. Caller must +// release the reference when it's done with the mount. +func newUndoMount(d *Dirent) *Mount { + d.IncRef() + return &Mount{ + ID: invalidMountID, + ParentID: invalidMountID, + root: d, + } +} + +// Root returns the root dirent of this mount. Callers must call DecRef on the +// returned dirent. +func (m *Mount) Root() *Dirent { + m.root.IncRef() + return m.root +} + +// IsRoot returns true if the mount has no parent. +func (m *Mount) IsRoot() bool { + return !m.IsUndo() && m.ParentID == invalidMountID +} + +// IsUndo returns true if 'm' is an undo mount that should be used to restore +// the original dirent during unmount only and it's not a valid mount. 
+func (m *Mount) IsUndo() bool { + if m.ID == invalidMountID { + if m.ParentID != invalidMountID { + panic(fmt.Sprintf("Undo mount with valid parentID: %+v", m)) + } + return true + } + return false +} + // MountNamespace defines a collection of mounts. // // +stateify savable @@ -55,13 +144,16 @@ type MountNamespace struct { // mu protects mounts and mountID counter. mu sync.Mutex `state:"nosave"` - // mounts is a map of the last mounted Dirent -> stack of old Dirents - // that were mounted over, with the oldest mounted Dirent first and - // more recent mounted Dirents at the end of the slice. - // - // A reference to all Dirents in mounts (keys and values) must be held - // to ensure the Dirents are recoverable when unmounting. - mounts map[*Dirent][]*Dirent + // mounts is a map of mounted Dirent -> Mount object. There are three + // possible cases: + // - Dirent is mounted over a mount point: the stored Mount object will be + // the Mount for that mount point. + // - Dirent is mounted over a regular (non-mount point) Dirent: the stored + // Mount object will be an "undo" mount containing the mounted-over + // Dirent. + // - Dirent is the root mount: the stored Mount object will be a root mount + // containing the Dirent itself. + mounts map[*Dirent]*Mount // mountID is the next mount id to assign. mountID uint64 @@ -72,18 +164,18 @@ type MountNamespace struct { func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) { creds := auth.CredentialsFromContext(ctx) - root.MountSource.mu.Lock() - defer root.MountSource.mu.Unlock() - - // Set the root dirent and id on the root mount. + // Set the root dirent and id on the root mount. The reference returned from + // NewDirent will be donated to the MountNamespace constructed below. 
d := NewDirent(root, "/") - root.MountSource.root = d - root.MountSource.id = 1 + + mnts := map[*Dirent]*Mount{ + d: newRootMount(1, d), + } return &MountNamespace{ userns: creds.UserNamespace, root: d, - mounts: make(map[*Dirent][]*Dirent), + mounts: mnts, mountID: 2, }, nil } @@ -110,10 +202,9 @@ func (mns *MountNamespace) FlushMountSourceRefs() { func (mns *MountNamespace) flushMountSourceRefsLocked() { // Flush mounts' MountSource references. - for current, stack := range mns.mounts { - current.Inode.MountSource.FlushDirentRefs() - for _, prev := range stack { - prev.Inode.MountSource.FlushDirentRefs() + for _, mp := range mns.mounts { + for ; mp != nil; mp = mp.previous { + mp.root.Inode.MountSource.FlushDirentRefs() } } @@ -136,12 +227,11 @@ func (mns *MountNamespace) destroy() { mns.flushMountSourceRefsLocked() // Teardown mounts. - for current, mp := range mns.mounts { + for _, mp := range mns.mounts { // Drop the mount reference on all mounted dirents. - for _, d := range mp { - d.DecRef() + for ; mp != nil; mp = mp.previous { + mp.root.DecRef() } - current.DecRef() } mns.mounts = nil @@ -208,46 +298,34 @@ func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error } // Mount mounts a `inode` over the subtree at `node`. -func (mns *MountNamespace) Mount(ctx context.Context, node *Dirent, inode *Inode) error { - return mns.withMountLocked(node, func() error { - // replacement already has one reference taken; this is the mount - // reference. - replacement, err := node.mount(ctx, inode) +func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode *Inode) error { + return mns.withMountLocked(mountPoint, func() error { + replacement, err := mountPoint.mount(ctx, inode) if err != nil { return err } - - // Set child/parent dirent relationship. 
- parentMountSource := node.Inode.MountSource - childMountSource := inode.MountSource - parentMountSource.mu.Lock() - defer parentMountSource.mu.Unlock() - childMountSource.mu.Lock() - defer childMountSource.mu.Unlock() - - parentMountSource.children[childMountSource] = struct{}{} - childMountSource.parent = parentMountSource + defer replacement.DecRef() // Set the mount's root dirent and id. - childMountSource.root = replacement - childMountSource.id = mns.mountID + parentMnt := mns.findMountLocked(mountPoint) + childMnt := newMount(mns.mountID, parentMnt.ID, replacement) mns.mountID++ - // Drop node from its dirent cache. - node.dropExtendedReference() + // Drop mountPoint from its dirent cache. + mountPoint.dropExtendedReference() - // If node is already a mount point, push node on the stack so it can + // If mountPoint is already a mount, push mountPoint on the stack so it can // be recovered on unmount. - if stack, ok := mns.mounts[node]; ok { - mns.mounts[replacement] = append(stack, node) - delete(mns.mounts, node) + if prev := mns.mounts[mountPoint]; prev != nil { + childMnt.previous = prev + mns.mounts[replacement] = childMnt + delete(mns.mounts, mountPoint) return nil } // Was not already mounted, just add another mount point. - // Take a reference on node so it can be recovered on unmount. - node.IncRef() - mns.mounts[replacement] = []*Dirent{node} + childMnt.previous = newUndoMount(mountPoint) + mns.mounts[replacement] = childMnt return nil }) } @@ -268,13 +346,13 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly // This takes locks to prevent further walks to Dirents in this mount // under the assumption that `node` is the root of the mount. return mns.withMountLocked(node, func() error { - origs, ok := mns.mounts[node] + orig, ok := mns.mounts[node] if !ok { // node is not a mount point. 
return syserror.EINVAL } - if len(origs) == 0 { + if orig.previous == nil { panic("cannot unmount initial dirent") } @@ -298,44 +376,62 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly } } - // Lock the parent MountSource first, if it exists. We are - // holding mns.Lock, so the parent can not change out - // from under us. - parent := m.Parent() - if parent != nil { - parent.mu.Lock() - defer parent.mu.Unlock() + prev := orig.previous + if err := node.unmount(ctx, prev.root); err != nil { + return err } - // Lock the mount that is being unmounted. - m.mu.Lock() - defer m.mu.Unlock() - - if m.parent != nil { - // Sanity check. - if _, ok := m.parent.children[m]; !ok { - panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent)) + if prev.previous == nil { + if !prev.IsUndo() { + panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev)) } - delete(m.parent.children, m) + // Drop mount reference taken at the end of MountNamespace.Mount. + prev.root.DecRef() + } else { + mns.mounts[prev.root] = prev } + delete(mns.mounts, node) - original := origs[len(origs)-1] - if err := node.unmount(ctx, original); err != nil { - return err - } + return nil + }) +} + +// FindMount returns the mount that 'd' belongs to. It walks the dirent back +// until a mount is found. It may return nil if no mount was found. +func (mns *MountNamespace) FindMount(d *Dirent) *Mount { + mns.mu.Lock() + defer mns.mu.Unlock() + renameMu.Lock() + defer renameMu.Unlock() - switch { - case len(origs) > 1: - mns.mounts[original] = origs[:len(origs)-1] - case len(origs) == 1: - // Drop mount reference taken at the end of - // MountNamespace.Mount. 
- original.DecRef() + return mns.findMountLocked(d) +} + +func (mns *MountNamespace) findMountLocked(d *Dirent) *Mount { + for { + if mnt := mns.mounts[d]; mnt != nil { + return mnt + } + if d.parent == nil { + return nil } + d = d.parent + } +} - delete(mns.mounts, node) - return nil - }) +// AllMountsUnder returns a slice of all mounts under the parent, including +// itself. +func (mns *MountNamespace) AllMountsUnder(parent *Mount) []*Mount { + mns.mu.Lock() + defer mns.mu.Unlock() + + var rv []*Mount + for _, mp := range mns.mounts { + if !mp.IsUndo() && mp.root.descendantOf(parent.root) { + rv = append(rv, mp) + } + } + return rv } // FindLink returns an Dirent from a given node, which may be a symlink. diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index b5e01301f..1f7817947 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -27,7 +27,7 @@ import ( // forEachMountSource runs f for the process root mount and each mount that is a // descendant of the root. -func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { +func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { var fsctx *kernel.FSContext t.WithMuLocked(func(t *kernel.Task) { fsctx = t.FSContext() @@ -46,16 +46,14 @@ func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { } defer rootDir.DecRef() - if rootDir.Inode == nil { - panic(fmt.Sprintf("root dirent has nil inode: %+v", rootDir)) - } - if rootDir.Inode.MountSource == nil { - panic(fmt.Sprintf("root dirent has nil mount: %+v", rootDir)) + mnt := t.MountNamespace().FindMount(rootDir) + if mnt == nil { + // Has it just been unmounted? 
+ return } - - ms := append(rootDir.Inode.MountSource.Submounts(), rootDir.Inode.MountSource) + ms := t.MountNamespace().AllMountsUnder(mnt) sort.Slice(ms, func(i, j int) bool { - return ms[i].ID() < ms[j].ID() + return ms[i].ID < ms[j].ID }) for _, m := range ms { mroot := m.Root() @@ -89,26 +87,27 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se } var buf bytes.Buffer - forEachMountSource(mif.t, func(mountPath string, m *fs.MountSource) { + forEachMount(mif.t, func(mountPath string, m *fs.Mount) { // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) // (1) MountSource ID. - fmt.Fprintf(&buf, "%d ", m.ID()) + fmt.Fprintf(&buf, "%d ", m.ID) // (2) Parent ID (or this ID if there is no parent). - pID := m.ID() - if p := m.Parent(); p != nil { - pID = p.ID() + pID := m.ID + if !m.IsRoot() && !m.IsUndo() { + pID = m.ParentID } fmt.Fprintf(&buf, "%d ", pID) // (3) Major:Minor device ID. We don't have a superblock, so we // just use the root inode device number. mroot := m.Root() + defer mroot.DecRef() + sa := mroot.Inode.StableAttr - mroot.DecRef() fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) // (4) Root: the pathname of the directory in the filesystem @@ -122,14 +121,15 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se fmt.Fprintf(&buf, "%s ", mountPath) // (6) Mount options. + flags := mroot.Inode.MountSource.Flags opts := "rw" - if m.Flags.ReadOnly { + if flags.ReadOnly { opts = "ro" } - if m.Flags.NoAtime { + if flags.NoAtime { opts += ",noatime" } - if m.Flags.NoExec { + if flags.NoExec { opts += ",noexec" } fmt.Fprintf(&buf, "%s ", opts) @@ -139,7 +139,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se fmt.Fprintf(&buf, "- ") // (9) Filesystem type. 
- fmt.Fprintf(&buf, "%s ", m.FilesystemType) + fmt.Fprintf(&buf, "%s ", mroot.Inode.MountSource.FilesystemType) // (10) Mount source: filesystem-specific information or "none". fmt.Fprintf(&buf, "none ") @@ -171,7 +171,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan } var buf bytes.Buffer - forEachMountSource(mf.t, func(mountPath string, m *fs.MountSource) { + forEachMount(mf.t, func(mountPath string, m *fs.Mount) { // Format: // // @@ -182,11 +182,15 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan // Only ro/rw option is supported for now. // // The "needs dump"and fsck flags are always 0, which is allowed. + root := m.Root() + defer root.DecRef() + + flags := root.Inode.MountSource.Flags opts := "rw" - if m.Flags.ReadOnly { + if flags.ReadOnly { opts = "ro" } - fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, m.FilesystemType, opts, 0, 0) + fmt.Fprintf(&buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0) }) return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0 diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 1611dda2c..bc05b3491 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -685,27 +685,21 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error // Iterate through all submounts and unmount them. We unmount lazily by // setting detach=true, so we can unmount in any order. - for _, m := range containerRootDirent.Inode.MountSource.Submounts() { + mnt := mns.FindMount(containerRootDirent) + for _, m := range mns.AllMountsUnder(mnt) { root := m.Root() defer root.DecRef() // Do a best-effort unmount by flushing the refs and unmount // with "detach only = true". Unmount returns EINVAL when the mount point // doesn't exist, i.e. it has already been unmounted. 
- log.Debugf("Unmounting container submount %q", root.BaseName()) - m.FlushDirentRefs() + log.Debugf("Unmounting container mount %q", root.BaseName()) + root.Inode.MountSource.FlushDirentRefs() if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL { - return fmt.Errorf("unmounting container submount %q: %v", root.BaseName(), err) + return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err) } } - // Unmount the container root itself. - log.Debugf("Unmounting container root %q", containerRoot) - containerRootDirent.Inode.MountSource.FlushDirentRefs() - if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { - return fmt.Errorf("unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) - } - // Get a reference to the parent directory and remove the root // container directory. maxTraversals = 0 -- cgit v1.2.3 From 6240abb205f9e5cdbad1c864dbed345d92f04b09 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Thu, 23 May 2019 16:54:38 -0700 Subject: Added boilerplate code for ext4 fs. Initialized BUILD with license Mount is still unimplemented and is not meant to be part of this CL. Rest of the fs interface is implemented. 
Referenced the Linux kernel appropriately when needed PiperOrigin-RevId: 249741997 Change-Id: Id1e4c7c9e68b3f6946da39896fc6a0c3dcd7f98c --- pkg/sentry/fs/ext4/BUILD | 14 +++++++++++ pkg/sentry/fs/ext4/fs.go | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 pkg/sentry/fs/ext4/BUILD create mode 100644 pkg/sentry/fs/ext4/fs.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/ext4/BUILD b/pkg/sentry/fs/ext4/BUILD new file mode 100644 index 000000000..9df9084c3 --- /dev/null +++ b/pkg/sentry/fs/ext4/BUILD @@ -0,0 +1,14 @@ +package(licenses = ["notice"]) + +load("//tools/go_stateify:defs.bzl", "go_library") + +go_library( + name = "ext4", + srcs = ["fs.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ext4", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/fs", + ], +) diff --git a/pkg/sentry/fs/ext4/fs.go b/pkg/sentry/fs/ext4/fs.go new file mode 100644 index 000000000..de5f0ef63 --- /dev/null +++ b/pkg/sentry/fs/ext4/fs.go @@ -0,0 +1,61 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ext4 implements the ext4 filesystem. +package ext4 + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// filesystem implements fs.Filesystem for ext4. 
+// +// +stateify savable +type filesystem struct{} + +func init() { + fs.RegisterFilesystem(&filesystem{}) +} + +// FilesystemName is the name under which the filesystem is registered. +// Name matches fs/ext4/super.c:ext4_fs_type.name. +const FilesystemName = "ext4" + +// Name is the name of the file system. +func (*filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount prohibits users from using mount(2) with this file system. +func (*filesystem) AllowUserMount() bool { + return false +} + +// AllowUserList prohibits this filesystem to be listed in /proc/filesystems. +func (*filesystem) AllowUserList() bool { + return false +} + +// Flags returns properties of the filesystem. +// +// In Linux, ext4 returns FS_REQUIRES_DEV. See fs/ext4/super.c +func (*filesystem) Flags() fs.FilesystemFlags { + return fs.FilesystemRequiresDev +} + +// Mount returns the root inode of the ext4 fs. +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string, cgroupsInt interface{}) (*fs.Inode, error) { + panic("unimplemented") +} -- cgit v1.2.3 From a949133c4b22a87c79310b2d825f2899028d6088 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 23 May 2019 23:20:11 -0700 Subject: gvisor: interrupt the sendfile system call if a task has been interrupted sendfile can be called for a big range and it can require significant amount of time to process it, so we need to handle task interrupts in this system call. 
PiperOrigin-RevId: 249781023 Change-Id: Ifc2ec505d74c06f5ee76f93b8d30d518ec2d4015 --- pkg/amutex/amutex.go | 6 ++++++ pkg/amutex/amutex_test.go | 4 ++++ pkg/sentry/fs/file.go | 6 ++++++ pkg/sentry/kernel/pipe/node_test.go | 4 ++++ pkg/sentry/kernel/task_block.go | 5 +++++ 5 files changed, 25 insertions(+) (limited to 'pkg/sentry') diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index 85e819304..4f7759b87 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -33,6 +33,9 @@ type Sleeper interface { // SleepFinish is called by AbortableMutex.Lock() once a contended mutex // is acquired or the wait is aborted. SleepFinish(success bool) + + // Interrupted returns true if the wait is aborted. + Interrupted() bool } // NoopSleeper is a stateless no-op implementation of Sleeper for anonymous @@ -47,6 +50,9 @@ func (NoopSleeper) SleepStart() <-chan struct{} { // SleepFinish implements Sleeper.SleepFinish. func (NoopSleeper) SleepFinish(success bool) {} +// Interrupted implements Sleeper.Interrupted. +func (NoopSleeper) Interrupted() bool { return false } + // AbortableMutex is an abortable mutex. It allows Lock() to be aborted while it // waits to acquire the mutex. type AbortableMutex struct { diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go index 6a0af006e..211bdda4b 100644 --- a/pkg/amutex/amutex_test.go +++ b/pkg/amutex/amutex_test.go @@ -31,6 +31,10 @@ func (s *sleeper) SleepStart() <-chan struct{} { func (*sleeper) SleepFinish(bool) { } +func (s *sleeper) Interrupted() bool { + return len(s.ch) != 0 +} + func TestMutualExclusion(t *testing.T) { var m AbortableMutex m.Init() diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 8f1baca23..8c1307235 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -516,12 +516,18 @@ type lockedReader struct { // Read implements io.Reader.Read. 
func (r *lockedReader) Read(buf []byte) (int, error) { + if r.Ctx.Interrupted() { + return 0, syserror.ErrInterrupted + } n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), r.File.offset) return int(n), err } // ReadAt implements io.Reader.ReadAt. func (r *lockedReader) ReadAt(buf []byte, offset int64) (int, error) { + if r.Ctx.Interrupted() { + return 0, syserror.ErrInterrupted + } n, err := r.File.FileOperations.Read(r.Ctx, r.File, usermem.BytesIOSequence(buf), offset) return int(n), err } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index 7ddecdad8..31d9b0443 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -48,6 +48,10 @@ func (s *sleeper) Cancel() { s.ch <- struct{}{} } +func (s *sleeper) Interrupted() bool { + return len(s.ch) != 0 +} + type openResult struct { *fs.File error diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go index 30a7f6b1e..1c76c4d84 100644 --- a/pkg/sentry/kernel/task_block.go +++ b/pkg/sentry/kernel/task_block.go @@ -158,6 +158,11 @@ func (t *Task) SleepFinish(success bool) { t.Activate() } +// Interrupted implements amutex.Sleeper.Interrupted +func (t *Task) Interrupted() bool { + return len(t.interruptChan) != 0 +} + // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart. func (t *Task) UninterruptibleSleepStart(deactivate bool) { if deactivate { -- cgit v1.2.3 From ed5793808e9d97789c9494d86c9fa4ed62df46bb Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 24 May 2019 16:16:54 -0700 Subject: Remove obsolete TODO. We don't need to model internal interfaces after the system call interfaces (which are objectively worse and simply use a flag to distinguish between two logically different operations). 
PiperOrigin-RevId: 249916814 Change-Id: I45d02e0ec0be66b782a685b1f305ea027694cab9 --- pkg/sentry/fs/inode_operations.go | 3 --- 1 file changed, 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 2ed89d482..ea089dfae 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -117,9 +117,6 @@ type InodeOperations interface { // Remove removes the given named non-directory under dir. // // The caller must ensure that this operation is permitted. - // - // TODO(b/67778723): merge Remove and RemoveDirectory, Remove - // just needs a type flag. Remove(ctx context.Context, dir *Inode, name string) error // RemoveDirectory removes the given named directory under dir. -- cgit v1.2.3 From 2165b77774eaa40bb7d870fddea733cd899006b9 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Fri, 24 May 2019 17:10:43 -0700 Subject: Remove obsolete bug. The original bug is no longer relevant, and the FIXME here contains lots of obsolete information. PiperOrigin-RevId: 249924036 --- pkg/sentry/fs/dentry.go | 3 --- 1 file changed, 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go index c0fc74723..7a2d4b180 100644 --- a/pkg/sentry/fs/dentry.go +++ b/pkg/sentry/fs/dentry.go @@ -83,9 +83,6 @@ type DirCtx struct { attrs map[string]DentAttr // DirCursor is the directory cursor. - // TODO(b/67778717): Once Handles are removed this can just live in the - // respective FileOperations implementations and not need to get - // plumbed everywhere. DirCursor *string } -- cgit v1.2.3 From 507a15dce974d0cff18253ba50af29d6579bacc5 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Tue, 28 May 2019 18:02:07 -0700 Subject: Always wait on tracee children After bf959931ddb88c4e4366e96dd22e68fa0db9527c ("wait/ptrace: assume __WALL if the child is traced") (Linux 4.7), tracees are always eligible for waiting, regardless of type. 
PiperOrigin-RevId: 250399527 --- pkg/sentry/kernel/task_exit.go | 10 ++++++--- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/wait.cc | 46 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 2e1e46582..158e665d3 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -803,13 +803,17 @@ type WaitOptions struct { } // Preconditions: The TaskSet mutex must be locked (for reading or writing). -func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace) bool { +func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool { if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] { return false } if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] { return false } + // Tracees are always eligible. + if tracee { + return true + } if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD { return o.NonCloneTasks } @@ -903,7 +907,7 @@ func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, b anyWaitableTasks := false for child := range parent.children { - if !opts.matchesTask(child, parent.tg.pidns) { + if !opts.matchesTask(child, parent.tg.pidns, false) { continue } // Non-leaders don't notify parents on exit and aren't eligible to @@ -946,7 +950,7 @@ func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, b } } for tracee := range parent.ptraceTracees { - if !opts.matchesTask(tracee, parent.tg.pidns) { + if !opts.matchesTask(tracee, parent.tg.pidns, true) { continue } // Non-leaders do notify tracers on exit. 
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 750f3a1e2..ec57ec129 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3179,6 +3179,7 @@ cc_binary( linkstatic = 1, deps = [ "//test/util:cleanup", + "//test/util:file_descriptor", "//test/util:logging", "//test/util:multiprocess_util", "//test/util:posix_error", diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc index aa27194cb..944149d5e 100644 --- a/test/syscalls/linux/wait.cc +++ b/test/syscalls/linux/wait.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/util/cleanup.h" +#include "test/util/file_descriptor.h" #include "test/util/logging.h" #include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" @@ -861,6 +863,50 @@ TEST(WaitTest, WaitidRusage) { EXPECT_GE(RusageCpuTime(rusage), kSpin); } +// After bf959931ddb88c4e4366e96dd22e68fa0db9527c ("wait/ptrace: assume __WALL +// if the child is traced") (Linux 4.7), tracees are always eligible for +// waiting, regardless of type. +TEST(WaitTest, TraceeWALL) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + FileDescriptor rfd(fds[0]); + FileDescriptor wfd(fds[1]); + + pid_t child = fork(); + if (child == 0) { + // Child. + rfd.reset(); + + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) == 0); + + // Notify parent that we're now a tracee. + wfd.reset(); + + _exit(0); + } + ASSERT_THAT(child, SyscallSucceeds()); + + wfd.reset(); + + // Wait for child to become tracee. + char c; + EXPECT_THAT(ReadFd(rfd.get(), &c, sizeof(c)), SyscallSucceedsWithValue(0)); + + // We can wait on the fork child with WCLONE, as it is a tracee. 
+ int status; + if (IsRunningOnGvisor()) { + ASSERT_THAT(Wait4(child, &status, __WCLONE, nullptr), + SyscallSucceedsWithValue(child)); + + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status; + } else { + // On older versions of Linux, we may get ECHILD. + ASSERT_THAT(Wait4(child, &status, __WCLONE, nullptr), + ::testing::AnyOf(SyscallSucceedsWithValue(child), + SyscallFailsWithErrno(ECHILD))); + } +} + } // namespace } // namespace testing -- cgit v1.2.3 From 4b9cb381572e0f61f2a6c2259094548172900e0d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 28 May 2019 22:28:01 -0700 Subject: gvisor: socket() returns EPROTONOSUPPORT if protocol is not supported PiperOrigin-RevId: 250426407 --- pkg/sentry/socket/epsocket/provider.go | 2 +- pkg/sentry/socket/unix/unix.go | 8 +++--- test/syscalls/BUILD | 2 ++ test/syscalls/linux/BUILD | 13 +++++++++ test/syscalls/linux/socket.cc | 48 ++++++++++++++++++++++++++++++++++ test/syscalls/linux/socket_unix.cc | 5 ++-- 6 files changed, 70 insertions(+), 8 deletions(-) create mode 100644 test/syscalls/linux/socket.cc (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index fb1815c2d..ec930d8d5 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -76,7 +76,7 @@ func getTransportProtocol(ctx context.Context, stype transport.SockType, protoco return header.TCPProtocolNumber, nil } } - return 0, syserr.ErrInvalidArgument + return 0, syserr.ErrProtocolNotSupported } // Socket creates a new socket object for the AF_INET or AF_INET6 family. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 931056d51..1414be0c6 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -598,8 +598,8 @@ type provider struct{} // Socket returns a new unix domain socket. 
func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { // Check arguments. - if protocol != 0 { - return nil, syserr.ErrInvalidArgument + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, syserr.ErrProtocolNotSupported } // Create the endpoint and socket. @@ -624,8 +624,8 @@ func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) // Pair creates a new pair of AF_UNIX connected sockets. func (*provider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Check arguments. - if protocol != 0 { - return nil, nil, syserr.ErrInvalidArgument + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, nil, syserr.ErrProtocolNotSupported } var isPacket bool diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 0d6b6ccc7..c53742d14 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -35,6 +35,8 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:brk_test") +syscall_test(test = "//test/syscalls/linux:socket_test") + syscall_test(test = "//test/syscalls/linux:chdir_test") syscall_test(test = "//test/syscalls/linux:chmod_test") diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index ec57ec129..8465e5ad0 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -312,6 +312,19 @@ cc_binary( ], ) +cc_binary( + name = "socket_test", + testonly = 1, + srcs = ["socket.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) + cc_binary( name = "brk_test", testonly = 1, diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc new file mode 100644 index 000000000..0404190a0 --- /dev/null +++ b/test/syscalls/linux/socket.cc @@ -0,0 +1,48 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +TEST(SocketTest, UnixSocketPairProtocol) { + int socks[2]; + ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, socks), + SyscallSucceeds()); + close(socks[0]); + close(socks[1]); +} + +TEST(SocketTest, Protocol) { + struct { + int domain, type, protocol; + } tests[] = { + {AF_UNIX, SOCK_STREAM, PF_UNIX}, {AF_UNIX, SOCK_SEQPACKET, PF_UNIX}, + {AF_UNIX, SOCK_DGRAM, PF_UNIX}, {AF_INET, SOCK_DGRAM, IPPROTO_UDP}, + {AF_INET, SOCK_STREAM, IPPROTO_TCP}, + }; + for (int i = 0; i < ABSL_ARRAYSIZE(tests); i++) { + ASSERT_NO_ERRNO_AND_VALUE( + Socket(tests[i].domain, tests[i].type, tests[i].protocol)); + } +} + +} // namespace testing +} // namespace gvisor diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index 09a1c1c6e..95cf8d2a3 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -1567,15 +1567,14 @@ TEST_P(UnixSocketPairTest, TIOCOUTQSucceeds) { } TEST_P(UnixSocketPairTest, NetdeviceIoctlsSucceed) { - FileDescriptor sock = - ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_DGRAM, 0)); + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); // Prepare the request. 
struct ifreq ifr; snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); // Check that the ioctl either succeeds or fails with ENODEV. - int err = ioctl(sock.get(), SIOCGIFINDEX, &ifr); + int err = ioctl(sockets->first_fd(), SIOCGIFINDEX, &ifr); if (err < 0) { ASSERT_EQ(errno, ENODEV); } -- cgit v1.2.3 From 035a8fa38ed21da2e06db22d3dfd6122610fb856 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Wed, 29 May 2019 11:30:59 -0700 Subject: Add support for collecting execution trace to runsc. Updates #220 PiperOrigin-RevId: 250532302 --- pkg/sentry/control/pprof.go | 44 ++++++++++++++++++++++++++++++++++++ runsc/boot/controller.go | 2 ++ runsc/cmd/debug.go | 55 ++++++++++++++++++++++++++++++++++----------- runsc/sandbox/sandbox.go | 35 +++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 13 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 94ed149f2..d63916600 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -18,6 +18,7 @@ import ( "errors" "runtime" "runtime/pprof" + "runtime/trace" "sync" "gvisor.googlesource.com/gvisor/pkg/fd" @@ -52,6 +53,9 @@ type Profile struct { // cpuFile is the current CPU profile output file. cpuFile *fd.FD + + // traceFile is the current execution trace output file. + traceFile *fd.FD } // StartCPUProfile is an RPC stub which starts recording the CPU profile in a @@ -122,3 +126,43 @@ func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error { } return nil } + +// StartTrace is an RPC stub which starts collection of an execution trace. +func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + + output, err := fd.NewFromFile(o.FilePayload.Files[0]) + if err != nil { + return err + } + + p.mu.Lock() + defer p.mu.Unlock() + + // Returns an error if profiling is already started. 
+ if err := trace.Start(output); err != nil { + output.Close() + return err + } + + p.traceFile = output + return nil +} + +// StopTrace is an RPC stub which stops collection of an ongoing execution +// trace and flushes the trace data. It takes no argument. +func (p *Profile) StopTrace(_, _ *struct{}) error { + p.mu.Lock() + defer p.mu.Unlock() + + if p.traceFile == nil { + return errors.New("Execution tracing not start") + } + + trace.Stop() + p.traceFile.Close() + p.traceFile = nil + return nil +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index f09c1bd85..72ab9ef86 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -101,6 +101,8 @@ const ( StartCPUProfile = "Profile.StartCPUProfile" StopCPUProfile = "Profile.StopCPUProfile" HeapProfile = "Profile.HeapProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" ) // ControlSocketAddr generates an abstract unix socket name for the given ID. diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 000f694c7..27eb51172 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -35,6 +35,7 @@ type Debug struct { profileHeap string profileCPU string profileDelay int + trace string } // Name implements subcommands.Command. 
@@ -59,6 +60,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile") + f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") } @@ -122,35 +124,62 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof(" *** Stack dump ***\n%s", stacks) } - if d.profileCPU != "" { - f, err := os.Create(d.profileCPU) + if d.profileHeap != "" { + f, err := os.Create(d.profileHeap) if err != nil { Fatalf(err.Error()) } defer f.Close() - if err := c.Sandbox.StartCPUProfile(f); err != nil { + if err := c.Sandbox.HeapProfile(f); err != nil { Fatalf(err.Error()) } - log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) - time.Sleep(time.Duration(d.profileDelay) * time.Second) + log.Infof("Heap profile written to %q", d.profileHeap) + } - if err := c.Sandbox.StopCPUProfile(); err != nil { + delay := false + if d.profileCPU != "" { + delay = true + f, err := os.Create(d.profileCPU) + if err != nil { Fatalf(err.Error()) } - log.Infof("CPU profile written to %q", d.profileCPU) + defer func() { + f.Close() + if err := c.Sandbox.StopCPUProfile(); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile written to %q", d.profileCPU) + }() + if err := c.Sandbox.StartCPUProfile(f); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) } - if d.profileHeap != "" { - f, err := os.Create(d.profileHeap) + if d.trace != "" { + delay = true + f, err := os.Create(d.trace) if err != nil { Fatalf(err.Error()) } - defer f.Close() - - if err := c.Sandbox.HeapProfile(f); err != nil { + defer func() 
{ + f.Close() + if err := c.Sandbox.StopTrace(); err != nil { + Fatalf(err.Error()) + } + log.Infof("Trace written to %q", d.trace) + }() + if err := c.Sandbox.StartTrace(f); err != nil { Fatalf(err.Error()) } - log.Infof("Heap profile written to %q", d.profileHeap) + log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace) + } + + if delay { + time.Sleep(time.Duration(d.profileDelay) * time.Second) + + } + return subcommands.ExitSuccess } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index bc69a9d61..47a66afb2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -883,6 +883,41 @@ func (s *Sandbox) StopCPUProfile() error { return nil } +// StartTrace starts trace writing to the given file. +func (s *Sandbox) StartTrace(f *os.File) error { + log.Debugf("Trace start %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.StartTrace, &opts, nil); err != nil { + return fmt.Errorf("starting sandbox %q trace: %v", s.ID, err) + } + return nil +} + +// StopTrace stops a previously started trace. +func (s *Sandbox) StopTrace() error { + log.Debugf("Trace stop %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.StopTrace, nil, nil); err != nil { + return fmt.Errorf("stopping sandbox %q trace: %v", s.ID, err) + } + return nil +} + // DestroyContainer destroys the given container. If it is the root container, // then the entire sandbox is destroyed. func (s *Sandbox) DestroyContainer(cid string) error { -- cgit v1.2.3 From b18df9bed6af3ff9b526c9ebdcde33dffeac161e Mon Sep 17 00:00:00 2001 From: "chris.zn" Date: Wed, 29 May 2019 16:48:19 -0700 Subject: Add VmData field to /proc/{pid}/status VmData is the size of private data segments. It has the same meaning as in Linux.
Change-Id: Iebf1ae85940a810524a6cde9c2e767d4233ddb2a PiperOrigin-RevId: 250593739 --- pkg/sentry/fs/proc/task.go | 4 +++- pkg/sentry/mm/lifecycle.go | 1 + pkg/sentry/mm/mm.go | 6 +++++ pkg/sentry/mm/mm_test.go | 54 +++++++++++++++++++++++++++++++++++++++++++++ pkg/sentry/mm/syscalls.go | 27 ++++++++++++++++++++--- pkg/sentry/mm/vma.go | 19 ++++++++++++++-- test/syscalls/linux/proc.cc | 15 ++++++++++++- 7 files changed, 119 insertions(+), 7 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 494b195cd..77e03d349 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -578,7 +578,7 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( } fmt.Fprintf(&buf, "TracerPid:\t%d\n", tpid) var fds int - var vss, rss uint64 + var vss, rss, data uint64 s.t.WithMuLocked(func(t *kernel.Task) { if fdm := t.FDMap(); fdm != nil { fds = fdm.Size() @@ -586,11 +586,13 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() } }) fmt.Fprintf(&buf, "FDSize:\t%d\n", fds) fmt.Fprintf(&buf, "VmSize:\t%d kB\n", vss>>10) fmt.Fprintf(&buf, "VmRSS:\t%d kB\n", rss>>10) + fmt.Fprintf(&buf, "VmData:\t%d kB\n", data>>10) fmt.Fprintf(&buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) creds := s.t.Credentials() fmt.Fprintf(&buf, "CapInh:\t%016x\n", creds.InheritableCaps) diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index e6aa6f9ef..7a65a62a2 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -69,6 +69,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { users: 1, brk: mm.brk, usageAS: mm.usageAS, + dataAS: mm.dataAS, // "The child does not inherit its parent's memory locks (mlock(2), // mlockall(2))." - fork(2). 
So lockedAS is 0 and defMLockMode is // MLockNone, both of which are zero values. vma.mlockMode is reset diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index d25aa5136..eb6defa2b 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -111,6 +111,12 @@ type MemoryManager struct { // lockedAS is protected by mappingMu. lockedAS uint64 + // dataAS is the size of private data segments, like mm_struct->data_vm. + // It means the vma which is private, writable, not stack. + // + // dataAS is protected by mappingMu. + dataAS uint64 + // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or // defMLockMode is greater. // diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index f4917419f..7209c73ce 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -68,6 +68,60 @@ func TestUsageASUpdates(t *testing.T) { } } +func (mm *MemoryManager) realDataAS() uint64 { + var sz uint64 + for seg := mm.vmas.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + vma := seg.Value() + if vma.isPrivateDataLocked() { + sz += uint64(seg.Range().Length()) + } + } + return sz +} + +func TestDataASUpdates(t *testing.T) { + ctx := contexttest.Context(t) + mm := testMemoryManager(ctx) + defer mm.DecUsers(ctx) + + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: 3 * usermem.PageSize, + Private: true, + Perms: usermem.Write, + MaxPerms: usermem.AnyAccess, + }) + if err != nil { + t.Fatalf("MMap got err %v want nil", err) + } + if mm.dataAS == 0 { + t.Fatalf("dataAS is 0, wanted not 0") + } + realDataAS := mm.realDataAS() + if mm.dataAS != realDataAS { + t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS) + } + + mm.MUnmap(ctx, addr, usermem.PageSize) + realDataAS = mm.realDataAS() + if mm.dataAS != realDataAS { + t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS) + } + + mm.MProtect(addr+usermem.PageSize, usermem.PageSize, usermem.Read, 
false) + realDataAS = mm.realDataAS() + if mm.dataAS != realDataAS { + t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS) + } + + mm.MRemap(ctx, addr+2*usermem.PageSize, usermem.PageSize, 2*usermem.PageSize, MRemapOpts{ + Move: MRemapMayMove, + }) + realDataAS = mm.realDataAS() + if mm.dataAS != realDataAS { + t.Fatalf("dataAS believes %v bytes are mapped; %v bytes are actually mapped", mm.dataAS, realDataAS) + } +} + func TestBrkDataLimitUpdates(t *testing.T) { limitSet := limits.NewLimitSet() limitSet.Set(limits.Data, limits.Limit{}, true /* privileged */) // zero RLIMIT_DATA diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 70c9aa7f6..0368c6794 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -527,6 +527,9 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi } vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS += uint64(newAR.Length()) + if vma.isPrivateDataLocked() { + mm.dataAS += uint64(newAR.Length()) + } if vma.mlockMode != memmap.MLockNone { mm.lockedAS += uint64(newAR.Length()) if vma.mlockMode == memmap.MLockEager { @@ -556,6 +559,9 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi mm.vmas.Remove(vseg) vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if vma.isPrivateDataLocked() { + mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + } if vma.mlockMode != memmap.MLockNone { mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) } @@ -643,8 +649,16 @@ func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms us // Update vma permissions. 
vma := vseg.ValuePtr() + vmaLength := vseg.Range().Length() + if vma.isPrivateDataLocked() { + mm.dataAS -= uint64(vmaLength) + } + vma.realPerms = realPerms vma.effectivePerms = effectivePerms + if vma.isPrivateDataLocked() { + mm.dataAS += uint64(vmaLength) + } // Propagate vma permission changes to pmas. for pseg.Ok() && pseg.Start() < vseg.End() { @@ -1150,7 +1164,7 @@ func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr usermem.Add func (mm *MemoryManager) VirtualMemorySize() uint64 { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() - return uint64(mm.usageAS) + return mm.usageAS } // VirtualMemorySizeRange returns the combined length in bytes of all mappings @@ -1165,12 +1179,19 @@ func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 { func (mm *MemoryManager) ResidentSetSize() uint64 { mm.activeMu.RLock() defer mm.activeMu.RUnlock() - return uint64(mm.curRSS) + return mm.curRSS } // MaxResidentSetSize returns the value advertised as mm's max RSS in bytes. func (mm *MemoryManager) MaxResidentSetSize() uint64 { mm.activeMu.RLock() defer mm.activeMu.RUnlock() - return uint64(mm.maxRSS) + return mm.maxRSS +} + +// VirtualDataSize returns the size of private data segments in mm. +func (mm *MemoryManager) VirtualDataSize() uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return mm.dataAS } diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index ad901344b..02203f79f 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -98,7 +98,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp } // Finally insert the vma. 
- vseg := mm.vmas.Insert(vgap, ar, vma{ + v := vma{ mappable: opts.Mappable, off: opts.Offset, realPerms: opts.Perms, @@ -109,8 +109,13 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, - }) + } + + vseg := mm.vmas.Insert(vgap, ar, v) mm.usageAS += opts.Length + if v.isPrivateDataLocked() { + mm.dataAS += opts.Length + } if opts.MLockMode != memmap.MLockNone { mm.lockedAS += opts.Length } @@ -374,6 +379,9 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) + if vma.isPrivateDataLocked() { + mm.dataAS -= uint64(vmaAR.Length()) + } if vma.mlockMode != memmap.MLockNone { mm.lockedAS -= uint64(vmaAR.Length()) } @@ -396,6 +404,13 @@ func (vma *vma) canWriteMappableLocked() bool { return !vma.private && vma.maxPerms.Write } +// isPrivateDataLocked identifies the data segments - private, writable, not stack. +// +// Preconditions: mm.mappingMu must be locked. +func (vma *vma) isPrivateDataLocked() bool { + return vma.realPerms.Write && vma.private && !vma.growsDown +} + // vmaSetFunctions implements segment.Functions for vmaSet. type vmaSetFunctions struct{} diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index e2e8a4ff1..ede6fb860 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1180,7 +1180,7 @@ bool IsDigits(absl::string_view s) { return std::all_of(s.begin(), s.end(), absl::ascii_isdigit); } -TEST(ProcPidStatTest, VSSRSS) { +TEST(ProcPidStatTest, VmStats) { std::string status_str = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/status")); ASSERT_FALSE(status_str.empty()); @@ -1211,6 +1211,19 @@ bool IsDigits(absl::string_view s) { EXPECT_TRUE(IsDigits(rss_str.substr(0, rss_str.length() - 3))) << rss_str; // ... which is not 0.
EXPECT_NE('0', rss_str[0]); + + const auto data_it = status.find("VmData"); + ASSERT_NE(data_it, status.end()); + + absl::string_view data_str(data_it->second); + + // Room for the " kB" suffix plus at least one digit. + ASSERT_GT(data_str.length(), 3); + EXPECT_TRUE(absl::EndsWith(data_str, " kB")); + // Everything else is part of a number. + EXPECT_TRUE(IsDigits(data_str.substr(0, data_str.length() - 3))) << data_str; + // ... which is not 0. + EXPECT_NE('0', data_str[0]); } // Parse an array of NUL-terminated char* arrays, returning a vector of strings. -- cgit v1.2.3 From 8d25cd0b40694d1911724816d72b34d0717878d6 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 29 May 2019 17:46:50 -0700 Subject: Update procid for Go 1.13 Upstream Go has no changes here. PiperOrigin-RevId: 250602731 --- pkg/sentry/platform/procid/procid_amd64.s | 2 +- pkg/sentry/platform/procid/procid_arm64.s | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s index 272c9fc14..30ec8e6e2 100644 --- a/pkg/sentry/platform/procid/procid_amd64.s +++ b/pkg/sentry/platform/procid/procid_amd64.s @@ -14,7 +14,7 @@ // +build amd64 // +build go1.8 -// +build !go1.13 +// +build !go1.14 #include "textflag.h" diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/sentry/platform/procid/procid_arm64.s index 7a1684a18..e340d9f98 100644 --- a/pkg/sentry/platform/procid/procid_arm64.s +++ b/pkg/sentry/platform/procid/procid_arm64.s @@ -14,7 +14,7 @@ // +build arm64 // +build go1.8 -// +build !go1.13 +// +build !go1.14 #include "textflag.h" -- cgit v1.2.3 From ae26b2c425d53aa8720238183d1c156a45904311 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Thu, 30 May 2019 10:47:11 -0700 Subject: Fixes to TCP listen behavior. Netstack listen loop can get stuck if cookies are in-use and the app is slow to accept incoming connections. 
Further we continue to complete handshake for a connection even if the backlog is full. This creates a problem when a lot of connections come in rapidly and we end up with lots of completed connections just hanging around to be delivered. These fixes change netstack behaviour to mirror what linux does as described in the following article http://veithen.io/2014/01/01/how-tcp-backlog-works-in-linux.html Now when cookies are not in-use Netstack will silently drop the ACK to a SYN-ACK and not complete the handshake if the backlog is full. This will result in the connection staying in a half-complete state. Eventually the sender will retransmit the ACK and if backlog has space we will transition to a connected state and deliver the endpoint. Similarly when cookies are in use we do not try and create an endpoint unless there is space in the accept queue to accept the newly created endpoint. If there is no space then we again silently drop the ACK as we can just recreate it when the ACK is retransmitted by the peer. We also now use the backlog to cap the size of the SYN-RCVD queue for a given endpoint. So at any time there can be N connections in the backlog and N in a SYN-RCVD state if the application is not accepting connections. Any new SYNs will be dropped. This CL also fixes another small bug where we mark a new endpoint which has not completed handshake as connected. We should wait till handshake successfully completes before marking it connected.
Updates #236 PiperOrigin-RevId: 250717817 --- pkg/sentry/socket/epsocket/epsocket.go | 35 +- pkg/tcpip/tcpip.go | 19 + pkg/tcpip/transport/tcp/accept.go | 161 +++++-- pkg/tcpip/transport/tcp/connect.go | 27 +- pkg/tcpip/transport/tcp/endpoint.go | 6 +- pkg/tcpip/transport/tcp/forwarder.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 499 +++++++++++++++++++-- pkg/tcpip/transport/tcp/testing/context/context.go | 6 + 8 files changed, 644 insertions(+), 111 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index 31a449cf2..de4b963da 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -140,21 +140,26 @@ var Metrics = tcpip.Stats{ OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), }, TCP: tcpip.TCPStats{ - ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), - PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), - FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), - ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), - InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), - SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), - ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), - ResetsReceived: 
mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), - Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), - FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), - SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), - SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), - FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), - Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), - ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), + ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), + PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), + ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), + ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), + ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), + ListenOverflowSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."), + ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie 
was received."), + FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), + ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), + InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), + SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), + ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), + ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), + Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), + FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), + SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), + SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), + FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), + Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), + ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index c8164c0f0..f9886c6e4 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -760,6 +760,25 @@ type 
TCPStats struct { // successfully via Listen. PassiveConnectionOpenings *StatCounter + + // ListenOverflowSynDrop is the number of times the listen queue overflowed + // and a SYN was dropped. + ListenOverflowSynDrop *StatCounter + + // ListenOverflowAckDrop is the number of times the final ACK + // in the handshake was dropped due to overflow. + ListenOverflowAckDrop *StatCounter + + // ListenOverflowSynCookieSent is the number of times a SYN cookie was sent. + ListenOverflowSynCookieSent *StatCounter + + // ListenOverflowSynCookieRcvd is the number of times a valid SYN + // cookie was received. + ListenOverflowSynCookieRcvd *StatCounter + + // ListenOverflowInvalidSynCookieRcvd is the number of times an invalid SYN cookie + // was received. + ListenOverflowInvalidSynCookieRcvd *StatCounter + // FailedConnectionAttempts is the number of calls to Connect or Listen // (active and passive openings, respectively) that end in an error. FailedConnectionAttempts *StatCounter diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index e506d7133..d4b860975 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -19,6 +19,7 @@ import ( "encoding/binary" "hash" "io" + "log" "sync" "time" @@ -87,9 +88,10 @@ var synRcvdCount struct { // and must not be accessed or have its methods called concurrently as they // may mutate the stored objects. type listenContext struct { - stack *stack.Stack - rcvWnd seqnum.Size - nonce [2][sha1.BlockSize]byte + stack *stack.Stack + rcvWnd seqnum.Size + nonce [2][sha1.BlockSize]byte + listenEP *endpoint hasherMu sync.Mutex hasher hash.Hash @@ -107,15 +109,16 @@ func timeStamp() uint32 { // threshold, and fails otherwise.
func incSynRcvdCount() bool { synRcvdCount.Lock() - defer synRcvdCount.Unlock() if synRcvdCount.value >= SynRcvdCountThreshold { + synRcvdCount.Unlock() return false } synRcvdCount.pending.Add(1) synRcvdCount.value++ + synRcvdCount.Unlock() return true } @@ -124,20 +127,21 @@ func incSynRcvdCount() bool { // succeeded. func decSynRcvdCount() { synRcvdCount.Lock() - defer synRcvdCount.Unlock() synRcvdCount.value-- synRcvdCount.pending.Done() + synRcvdCount.Unlock() } // newListenContext creates a new listen context. -func newListenContext(stack *stack.Stack, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { +func newListenContext(stack *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { l := &listenContext{ stack: stack, rcvWnd: rcvWnd, hasher: sha1.New(), v6only: v6only, netProto: netProto, + listenEP: listenEP, } rand.Read(l.nonce[0][:]) @@ -195,9 +199,9 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true } -// createConnectedEndpoint creates a new connected endpoint, with the connection -// parameters given by the arguments. -func (l *listenContext) createConnectedEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions) (*endpoint, *tcpip.Error) { +// createConnectingEndpoint creates a new endpoint in a connecting state, with +// the connection parameters given by the arguments. +func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions) (*endpoint, *tcpip.Error) { // Create a new endpoint. netProto := l.netProto if netProto == 0 { @@ -223,7 +227,7 @@ func (l *listenContext) createConnectedEndpoint(s *segment, iss seqnum.Value, ir } n.isRegistered = true - n.state = stateConnected + n.state = stateConnecting // Create sender and receiver. 
// @@ -241,7 +245,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head // Create new endpoint. irs := s.sequenceNumber cookie := l.createCookie(s.id, irs, encodeMSS(opts.MSS)) - ep, err := l.createConnectedEndpoint(s, cookie, irs, opts) + ep, err := l.createConnectingEndpoint(s, cookie, irs, opts) if err != nil { return nil, err } @@ -249,12 +253,15 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head // Perform the 3-way handshake. h := newHandshake(ep, l.rcvWnd) - h.resetToSynRcvd(cookie, irs, opts) + h.resetToSynRcvd(cookie, irs, opts, l.listenEP) if err := h.execute(); err != nil { + ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() ep.Close() return nil, err } + ep.state = stateConnected + // Update the receive window scaling. We can't do it before the // handshake because it's possible that the peer doesn't support window // scaling. @@ -268,13 +275,14 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head // instead. func (e *endpoint) deliverAccepted(n *endpoint) { e.mu.RLock() - if e.state == stateListen { + state := e.state + e.mu.RUnlock() + if state == stateListen { e.acceptedChan <- n e.waiterQueue.Notify(waiter.EventIn) } else { n.Close() } - e.mu.RUnlock() } // handleSynSegment is called in its own goroutine once the listening endpoint @@ -285,16 +293,36 @@ func (e *endpoint) deliverAccepted(n *endpoint) { // cookies to accept connections. 
func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) { defer decSynRcvdCount() + defer e.decSynRcvdCount() defer s.decRef() n, err := ctx.createEndpointAndPerformHandshake(s, opts) if err != nil { + e.stack.Stats().TCP.FailedConnectionAttempts.Increment() return } e.deliverAccepted(n) } +func (e *endpoint) incSynRcvdCount() bool { + e.mu.Lock() + log.Printf("l: %d, c: %d, e.synRcvdCount: %d", len(e.acceptedChan), cap(e.acceptedChan), e.synRcvdCount) + if l, c := len(e.acceptedChan), cap(e.acceptedChan); l == c && e.synRcvdCount >= c { + e.mu.Unlock() + return false + } + e.synRcvdCount++ + e.mu.Unlock() + return true +} + +func (e *endpoint) decSynRcvdCount() { + e.mu.Lock() + e.synRcvdCount-- + e.mu.Unlock() +} + // handleListenSegment is called when a listening endpoint receives a segment // and needs to handle it. func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { @@ -302,9 +330,20 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { case header.TCPFlagSyn: opts := parseSynSegmentOptions(s) if incSynRcvdCount() { - s.incRef() - go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier. + // Drop the SYN if the listen endpoint's accept queue is + // overflowing. + if e.incSynRcvdCount() { + log.Printf("processing syn packet") + s.incRef() + go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier. + return + } + log.Printf("dropping syn packet") + e.stack.Stats().TCP.ListenOverflowSynDrop.Increment() + e.stack.Stats().DroppedPackets.Increment() + return } else { + // TODO(bhaskerh): Increment syncookie sent stat. cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) // Send SYN with window scaling because we currently // dont't encode this information in the cookie. 
@@ -318,36 +357,72 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { TSEcr: opts.TSVal, } sendSynTCP(&s.route, s.id, header.TCPFlagSyn|header.TCPFlagAck, cookie, s.sequenceNumber+1, ctx.rcvWnd, synOpts) + e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment() } case header.TCPFlagAck: - if data, ok := ctx.isCookieValid(s.id, s.ackNumber-1, s.sequenceNumber-1); ok && int(data) < len(mssTable) { - // Create newly accepted endpoint and deliver it. - rcvdSynOptions := &header.TCPSynOptions{ - MSS: mssTable[data], - // Disable Window scaling as original SYN is - // lost. - WS: -1, - } - // When syn cookies are in use we enable timestamp only - // if the ack specifies the timestamp option assuming - // that the other end did in fact negotiate the - // timestamp option in the original SYN. - if s.parsedOptions.TS { - rcvdSynOptions.TS = true - rcvdSynOptions.TSVal = s.parsedOptions.TSVal - rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr - } - n, err := ctx.createConnectedEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions) - if err == nil { - // clear the tsOffset for the newly created - // endpoint as the Timestamp was already - // randomly offset when the original SYN-ACK was - // sent above. - n.tsOffset = 0 - e.deliverAccepted(n) - } + if len(e.acceptedChan) == cap(e.acceptedChan) { + // Silently drop the ack as the application can't accept + // the connection at this point. The ack will be + // retransmitted by the sender anyway and we can + // complete the connection at the time of retransmit if + // the backlog has space. + e.stack.Stats().TCP.ListenOverflowAckDrop.Increment() + e.stack.Stats().DroppedPackets.Increment() + return + } + + // Validate the cookie. 
+ data, ok := ctx.isCookieValid(s.id, s.ackNumber-1, s.sequenceNumber-1) + if !ok || int(data) >= len(mssTable) { + e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment() + e.stack.Stats().DroppedPackets.Increment() + return } + e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment() + // Create newly accepted endpoint and deliver it. + rcvdSynOptions := &header.TCPSynOptions{ + MSS: mssTable[data], + // Disable Window scaling as original SYN is + // lost. + WS: -1, + } + + // When syn cookies are in use we enable timestamp only + // if the ack specifies the timestamp option assuming + // that the other end did in fact negotiate the + // timestamp option in the original SYN. + if s.parsedOptions.TS { + rcvdSynOptions.TS = true + rcvdSynOptions.TSVal = s.parsedOptions.TSVal + rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr + } + + n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions) + if err != nil { + e.stack.Stats().TCP.FailedConnectionAttempts.Increment() + return + } + + // clear the tsOffset for the newly created + // endpoint as the Timestamp was already + // randomly offset when the original SYN-ACK was + // sent above. + n.tsOffset = 0 + + // Switch state to connected. + n.state = stateConnected + + // Do the delivery in a separate goroutine so + // that we don't block the listen loop in case + // the application is slow to accept or stops + // accepting. + // + // NOTE: This won't result in an unbounded + // number of goroutines as we do check before + // entering here that there was at least some + // space available in the backlog. 
+ go e.deliverAccepted(n) } } @@ -377,7 +452,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error { v6only := e.v6only e.mu.Unlock() - ctx := newListenContext(e.stack, rcvWnd, v6only, e.netProto) + ctx := newListenContext(e.stack, e, rcvWnd, v6only, e.netProto) s := sleep.Sleeper{} s.AddWaker(&e.notificationWaker, wakerForNotification) diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 3b927d82e..2aed6f286 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -60,11 +60,12 @@ const ( // handshake holds the state used during a TCP 3-way handshake. type handshake struct { - ep *endpoint - state handshakeState - active bool - flags uint8 - ackNum seqnum.Value + ep *endpoint + listenEP *endpoint // only non nil when doing passive connects. + state handshakeState + active bool + flags uint8 + ackNum seqnum.Value // iss is the initial send sequence number, as defined in RFC 793. iss seqnum.Value @@ -141,7 +142,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 { // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD // state. -func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) { +func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, listenEP *endpoint) { h.active = false h.state = handshakeSynRcvd h.flags = header.TCPFlagSyn | header.TCPFlagAck @@ -149,6 +150,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea h.ackNum = irs + 1 h.mss = opts.MSS h.sndWndScale = opts.WS + h.listenEP = listenEP } // checkAck checks if the ACK number, if present, of a segment received during @@ -279,7 +281,18 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error { // We have previously received (and acknowledged) the peer's SYN. If the // peer acknowledges our SYN, the handshake is completed. 
if s.flagIsSet(header.TCPFlagAck) { - + // listenContext is also used by a tcp.Forwarder and in that + // context we do not have a listening endpoint to check the + // backlog. So skip this check if listenEP is nil. + if h.listenEP != nil && len(h.listenEP.acceptedChan) == cap(h.listenEP.acceptedChan) { + // If there is no space in the accept queue to accept + // this endpoint then silently drop this ACK. The peer + // will anyway resend the ack and we can complete the + // connection the next time it's retransmitted. + h.ep.stack.Stats().TCP.ListenOverflowAckDrop.Increment() + h.ep.stack.Stats().DroppedPackets.Increment() + return nil + } // If the timestamp option is negotiated and the segment does // not carry a timestamp option then the segment must be dropped // as per https://tools.ietf.org/html/rfc7323#section-3.2. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 00962a63e..b66610ee2 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -198,6 +198,10 @@ type endpoint struct { // and dropped when it is. segmentQueue segmentQueue `state:"wait"` + // synRcvdCount is the number of connections for this endpoint that are + // in SYN-RCVD state. + synRcvdCount int + // The following fields are used to manage the send buffer. When // segments are ready to be sent, they are added to sndQueue and the // protocol goroutine is signaled via sndWaker. @@ -1302,7 +1306,6 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) { } e.workerRunning = true - e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() go e.protocolListenLoop( // S/R-SAFE: drained on save. seqnum.Size(e.receiveBufferAvailable())) @@ -1339,6 +1342,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { // Start the protocol goroutine. 
wq := &waiter.Queue{} n.startAcceptedLoop(wq) + e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() return n, wq, nil } diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index e088e24cb..c30b45c2c 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -53,7 +53,7 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward maxInFlight: maxInFlight, handler: handler, inFlight: make(map[stack.TransportEndpointID]struct{}), - listen: newListenContext(s, seqnum.Size(rcvWnd), true, 0), + listen: newListenContext(s, nil /* listenEP */, seqnum.Size(rcvWnd), true, 0), } } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index e341bb4aa..fe037602b 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -124,50 +124,6 @@ func TestActiveFailedConnectionAttemptIncrement(t *testing.T) { } } -func TestPassiveConnectionAttemptIncrement(t *testing.T) { - c := context.New(t, defaultMTU) - defer c.Cleanup() - - stats := c.Stack().Stats() - want := stats.TCP.PassiveConnectionOpenings.Value() + 1 - ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) - if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) - } - - if err := ep.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) - } - if err := ep.Listen(1); err != nil { - t.Fatalf("Listen failed: %v", err) - } - - if got := stats.TCP.PassiveConnectionOpenings.Value(); got != want { - t.Errorf("got stats.TCP.PassiveConnectionOpenings.Value() = %v, want = %v", got, want) - } -} - -func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { - c := context.New(t, defaultMTU) - defer c.Cleanup() - - stats := c.Stack().Stats() - ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) - if err != nil { - t.Fatalf("NewEndpoint 
failed: %v", err) - } - c.EP = ep - want := stats.TCP.FailedConnectionAttempts.Value() + 1 - - if err := ep.Listen(1); err != tcpip.ErrInvalidEndpointState { - t.Errorf("got ep.Listen(1) = %v, want = %v", err, tcpip.ErrInvalidEndpointState) - } - - if got := stats.TCP.FailedConnectionAttempts.Value(); got != want { - t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %v, want = %v", got, want) - } -} - func TestTCPSegmentsSentIncrement(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -3900,3 +3856,458 @@ func TestKeepalive(t *testing.T) { t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrConnectionReset) } } + +func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) { + // Send a SYN request. + irs = seqnum.Value(789) + c.SendPacket(nil, &context.Headers{ + SrcPort: srcPort, + DstPort: context.StackPort, + Flags: header.TCPFlagSyn, + SeqNum: irs, + RcvWnd: 30000, + }) + + // Receive the SYN-ACK reply. + b := c.GetPacket() + tcp := header.TCP(header.IPv4(b).Payload()) + iss = seqnum.Value(tcp.SequenceNumber()) + tcpCheckers := []checker.TransportChecker{ + checker.SrcPort(context.StackPort), + checker.DstPort(srcPort), + checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn), + checker.AckNum(uint32(irs) + 1), + } + + if synCookieInUse { + // When cookies are in use window scaling is disabled. + tcpCheckers = append(tcpCheckers, checker.TCPSynOptions(header.TCPSynOptions{ + WS: -1, + MSS: c.MSSWithoutOptions(), + })) + } + + checker.IPv4(t, b, checker.TCP(tcpCheckers...)) + + // Send ACK. + c.SendPacket(nil, &context.Headers{ + SrcPort: srcPort, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + RcvWnd: 30000, + }) + return irs, iss +} + +// TestListenBacklogFull tests that netstack does not complete handshakes if the +// listen backlog for the endpoint is full. 
+func TestListenBacklogFull(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + // Create TCP endpoint. + var err *tcpip.Error + c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %v", err) + } + + // Bind to wildcard. + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatalf("Bind failed: %v", err) + } + + // Test acceptance. + // Start listening. + listenBacklog := 2 + if err := c.EP.Listen(listenBacklog); err != nil { + t.Fatalf("Listen failed: %v", err) + } + + for i := 0; i < listenBacklog; i++ { + executeHandshake(t, c, context.TestPort+uint16(i), false /*synCookieInUse */) + } + + time.Sleep(50 * time.Millisecond) + + // Now execute one more handshake. This should not be completed and + // delivered on an Accept() call as the backlog is full at this point. + irs, iss := executeHandshake(t, c, context.TestPort+uint16(listenBacklog), false /* synCookieInUse */) + + time.Sleep(50 * time.Millisecond) + // Try to accept the connection. + we, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&we, waiter.EventIn) + defer c.WQ.EventUnregister(&we) + + for i := 0; i < listenBacklog; i++ { + _, _, err = c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + _, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + } + + // Now verify that there are no more connections that can be accepted. + _, _, err = c.EP.Accept() + if err != tcpip.ErrWouldBlock { + select { + case <-ch: + t.Fatalf("unexpected endpoint delivered on Accept: %+v", c.EP) + case <-time.After(1 * time.Second): + } + } + + // Now craft the ACK again and verify that the connection is now ready + // to be accepted. 
+ c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort + uint16(listenBacklog), + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + RcvWnd: 30000, + }) + + newEP, _, err := c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + newEP, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + // Now verify that the TCP socket is usable and in a connected state. + data := "Don't panic" + newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{}) + b := c.GetPacket() + tcp := header.TCP(header.IPv4(b).Payload()) + if string(tcp.Payload()) != data { + t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data) + } +} + +func TestListenBacklogFullSynCookieInUse(t *testing.T) { + saved := tcp.SynRcvdCountThreshold + defer func() { + tcp.SynRcvdCountThreshold = saved + }() + tcp.SynRcvdCountThreshold = 1 + + c := context.New(t, defaultMTU) + defer c.Cleanup() + + // Create TCP endpoint. + var err *tcpip.Error + c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %v", err) + } + + // Bind to wildcard. + if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { + t.Fatalf("Bind failed: %v", err) + } + + // Test acceptance. + // Start listening. + listenBacklog := 1 + portOffset := uint16(0) + if err := c.EP.Listen(listenBacklog); err != nil { + t.Fatalf("Listen failed: %v", err) + } + + executeHandshake(t, c, context.TestPort+portOffset, false) + portOffset++ + // Wait for this to be delivered to the accept queue. 
+ time.Sleep(50 * time.Millisecond) + + nonCookieIRS, nonCookieISS := executeHandshake(t, c, context.TestPort+portOffset, false) + + // Since the backlog is full at this point this connection will not + // transition out of handshake and ignore the ACK. + // + // At this point there should be 1 completed connection in the backlog + // and one incomplete one pending for a final ACK and hence not ready to be + // delivered to the endpoint. + // + // Now execute one more handshake. This should not be completed and + // delivered on an Accept() call as the backlog is full at this point + // and there is already 1 pending endpoint. + // + // This one should use a SYN cookie as the synRcvdCount is equal to the + // SynRcvdCountThreshold. + time.Sleep(50 * time.Millisecond) + portOffset++ + irs, iss := executeHandshake(t, c, context.TestPort+portOffset, true) + + time.Sleep(50 * time.Millisecond) + + // Verify that there is only one acceptable connection at this point. + we, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&we, waiter.EventIn) + defer c.WQ.EventUnregister(&we) + + _, _, err = c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + _, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + + // Now verify that there are no more connections that can be accepted. 
+ _, _, err = c.EP.Accept() + if err != tcpip.ErrWouldBlock { + select { + case <-ch: + t.Fatalf("unexpected endpoint delivered on Accept: %+v", c.EP) + case <-time.After(1 * time.Second): + } + } + + // Now send an ACK for the half completed connection + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort + portOffset - 1, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: nonCookieIRS + 1, + AckNum: nonCookieISS + 1, + RcvWnd: 30000, + }) + + // Verify that the connection is now delivered to the backlog. + _, _, err = c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + _, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + + // Finally send an ACK for the connection that used a cookie and verify that + // it's also completed and delivered. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort + portOffset, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs, + AckNum: iss, + RcvWnd: 30000, + }) + + time.Sleep(50 * time.Millisecond) + newEP, _, err := c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + newEP, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + + // Now verify that the TCP socket is usable and in a connected state. 
+ data := "Don't panic" + newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{}) + b := c.GetPacket() + tcp := header.TCP(header.IPv4(b).Payload()) + if string(tcp.Payload()) != data { + t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data) + } +} + +func TestPassiveConnectionAttemptIncrement(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %v", err) + } + c.EP = ep + if err := ep.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil { + t.Fatalf("Bind failed: %v", err) + } + if err := c.EP.Listen(1); err != nil { + t.Fatalf("Listen failed: %v", err) + } + + stats := c.Stack().Stats() + want := stats.TCP.PassiveConnectionOpenings.Value() + 1 + + srcPort := uint16(context.TestPort) + executeHandshake(t, c, srcPort+1, false) + + we, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&we, waiter.EventIn) + defer c.WQ.EventUnregister(&we) + + // Verify that there is only one acceptable connection at this point. + _, _, err = c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. 
+ select { + case <-ch: + _, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + + if got := stats.TCP.PassiveConnectionOpenings.Value(); got != want { + t.Errorf("got stats.TCP.PassiveConnectionOpenings.Value() = %v, want = %v", got, want) + } +} + +func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + stats := c.Stack().Stats() + ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %v", err) + } + c.EP = ep + if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil { + t.Fatalf("Bind failed: %v", err) + } + if err := c.EP.Listen(1); err != nil { + t.Fatalf("Listen failed: %v", err) + } + + srcPort := uint16(context.TestPort) + // Now attempt 3 handshakes, the first two will fill up the accept and the SYN-RCVD + // queue for the endpoint. + executeHandshake(t, c, srcPort, false) + + // Give time for the final ACK to be processed as otherwise the next handshake could + // get accepted before the previous one based on goroutine scheduling. + time.Sleep(50 * time.Millisecond) + irs, iss := executeHandshake(t, c, srcPort+1, false) + + // Wait for a short while for the accepted connection to be delivered to + // the channel before trying to send the 3rd SYN. + time.Sleep(40 * time.Millisecond) + + want := stats.TCP.ListenOverflowSynDrop.Value() + 1 + + // Now we will send one more SYN and this one should get dropped + // Send a SYN request. 
+ c.SendPacket(nil, &context.Headers{ + SrcPort: srcPort + 2, + DstPort: context.StackPort, + Flags: header.TCPFlagSyn, + SeqNum: seqnum.Value(789), + RcvWnd: 30000, + }) + + time.Sleep(50 * time.Millisecond) + if got := stats.TCP.ListenOverflowSynDrop.Value(); got != want { + t.Errorf("got stats.TCP.ListenOverflowSynDrop.Value() = %v, want = %v", got, want) + } + + we, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&we, waiter.EventIn) + defer c.WQ.EventUnregister(&we) + + // Now check that there is one acceptable connections. + _, _, err = c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + _, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + + // Now complete the next connection in SYN-RCVD state as it should + // have dropped the final ACK to the handshake due to accept queue + // being full. + c.SendPacket(nil, &context.Headers{ + SrcPort: srcPort + 1, + DstPort: context.StackPort, + Flags: header.TCPFlagAck, + SeqNum: irs + 1, + AckNum: iss + 1, + RcvWnd: 30000, + }) + + // Now check that there is one more acceptable connections. + _, _, err = c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. + select { + case <-ch: + _, _, err = c.EP.Accept() + if err != nil { + t.Fatalf("Accept failed: %v", err) + } + + case <-time.After(1 * time.Second): + t.Fatalf("Timed out waiting for accept") + } + } + + // Try and accept a 3rd one this should fail. + _, _, err = c.EP.Accept() + if err == tcpip.ErrWouldBlock { + // Wait for connection to be established. 
+ select { + case <-ch: + ep, _, err = c.EP.Accept() + if err == nil { + t.Fatalf("Accept succeeded when it should have failed got: %+v", ep) + } + + case <-time.After(1 * time.Second): + } + } +} diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index e08eb6533..6e12413c6 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -989,3 +989,9 @@ func (c *Context) SACKEnabled() bool { func (c *Context) SetGSOEnabled(enable bool) { c.linkEP.GSO = enable } + +// MSSWithoutOptions returns the value for the MSS used by the stack when no +// options are in use. +func (c *Context) MSSWithoutOptions() uint16 { + return uint16(c.linkEP.MTU() - header.IPv4MinimumSize - header.TCPMinimumSize) +} -- cgit v1.2.3 From 38de91b028639ef5f4a4c8874b3ee23503fd2f3a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 30 May 2019 12:01:41 -0700 Subject: Add build guard to files using go:linkname Funcion signatures are not validated during compilation. Since they are not exported, they can change at any time. The guard ensures that they are verified at least on every version upgrade. 
PiperOrigin-RevId: 250733742 --- pkg/sentry/platform/kvm/bluepill_unsafe.go | 5 +++++ pkg/sentry/platform/kvm/machine_unsafe.go | 5 +++++ pkg/sentry/platform/ptrace/subprocess_unsafe.go | 5 +++++ pkg/sleep/sleep_unsafe.go | 4 +++- pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go | 5 +++-- pkg/tcpip/time_unsafe.go | 4 +++- third_party/gvsync/BUILD | 4 ++++ .../gvsync/downgradable_rwmutex_1_12_unsafe.go | 21 +++++++++++++++++++++ .../gvsync/downgradable_rwmutex_1_13_unsafe.go | 16 ++++++++++++++++ third_party/gvsync/downgradable_rwmutex_unsafe.go | 20 +++++++++++--------- third_party/gvsync/memmove_unsafe.go | 13 +++++++++---- 11 files changed, 85 insertions(+), 17 deletions(-) create mode 100644 third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go create mode 100644 third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 4184939e5..7e8e9f42a 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -12,6 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + package kvm import ( diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index 452d88d7f..1d3c6d2d6 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -12,6 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. 
+ package kvm import ( diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index 17736b05b..b80a3604d 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -12,6 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + package ptrace import ( diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go index 0526f52de..8f5e60a25 100644 --- a/pkg/sleep/sleep_unsafe.go +++ b/pkg/sleep/sleep_unsafe.go @@ -13,7 +13,9 @@ // limitations under the License. // +build go1.11 -// +build !go1.13 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. // Package sleep allows goroutines to efficiently sleep on multiple sources of // notifications (wakers). It offers O(1) complexity, which is different from diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go index 0b51982c6..c87268610 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64_unsafe.go @@ -13,9 +13,10 @@ // limitations under the License. // +build linux,amd64 -// +build !go1.13 +// +build go1.12 +// +build !go1.14 -// This must be validated with Go 1.13 and future releases. +// Check go:linkname function signatures when updating Go version. package rawfile diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 1a307483b..a52262e87 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -13,7 +13,9 @@ // limitations under the License. // +build go1.9 -// +build !go1.13 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. 
package tcpip diff --git a/third_party/gvsync/BUILD b/third_party/gvsync/BUILD index 4764eaa83..04a1fbeba 100644 --- a/third_party/gvsync/BUILD +++ b/third_party/gvsync/BUILD @@ -5,6 +5,8 @@ package( licenses = ["notice"], ) +exports_files(["LICENSE"]) + load("//tools/go_generics:defs.bzl", "go_template") go_template( @@ -29,6 +31,8 @@ go_template( go_library( name = "gvsync", srcs = [ + "downgradable_rwmutex_1_12_unsafe.go", + "downgradable_rwmutex_1_13_unsafe.go", "downgradable_rwmutex_unsafe.go", "gvsync.go", "memmove_unsafe.go", diff --git a/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go new file mode 100644 index 000000000..855b2a2b1 --- /dev/null +++ b/third_party/gvsync/downgradable_rwmutex_1_12_unsafe.go @@ -0,0 +1,21 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.12 +// +build !go1.13 + +// TODO(b/133868570): Delete once Go 1.12 is no longer supported. + +package gvsync + +import _ "unsafe" + +//go:linkname runtimeSemrelease112 sync.runtime_Semrelease +func runtimeSemrelease112(s *uint32, handoff bool) + +func runtimeSemrelease(s *uint32, handoff bool, skipframes int) { + // 'skipframes' is only available starting from 1.13. + runtimeSemrelease112(s, handoff) +} diff --git a/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go b/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go new file mode 100644 index 000000000..8baec5458 --- /dev/null +++ b/third_party/gvsync/downgradable_rwmutex_1_13_unsafe.go @@ -0,0 +1,16 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build go1.13 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + +package gvsync + +import _ "unsafe" + +//go:linkname runtimeSemrelease sync.runtime_Semrelease +func runtimeSemrelease(s *uint32, handoff bool, skipframes int) diff --git a/third_party/gvsync/downgradable_rwmutex_unsafe.go b/third_party/gvsync/downgradable_rwmutex_unsafe.go index 4d43eb765..069939033 100644 --- a/third_party/gvsync/downgradable_rwmutex_unsafe.go +++ b/third_party/gvsync/downgradable_rwmutex_unsafe.go @@ -3,6 +3,11 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + // This is mostly copied from the standard library's sync/rwmutex.go. // // Happens-before relationships indicated to the race detector: @@ -19,6 +24,9 @@ import ( "unsafe" ) +//go:linkname runtimeSemacquire sync.runtime_Semacquire +func runtimeSemacquire(s *uint32) + // DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock // method. type DowngradableRWMutex struct { @@ -62,7 +70,7 @@ func (rw *DowngradableRWMutex) RUnlock() { // A writer is pending. if atomic.AddInt32(&rw.readerWait, -1) == 0 { // The last reader unblocks the writer. - runtimeSemrelease(&rw.writerSem, false) + runtimeSemrelease(&rw.writerSem, false, 0) } } if RaceEnabled { @@ -103,7 +111,7 @@ func (rw *DowngradableRWMutex) Unlock() { } // Unblock blocked readers, if any. for i := 0; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false) + runtimeSemrelease(&rw.readerSem, false, 0) } // Allow other writers to proceed. rw.w.Unlock() @@ -126,7 +134,7 @@ func (rw *DowngradableRWMutex) DowngradeLock() { // Unblock blocked readers, if any. Note that this loop starts as 1 since r // includes this goroutine. 
for i := 1; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false) + runtimeSemrelease(&rw.readerSem, false, 0) } // Allow other writers to proceed to rw.w.Lock(). Note that they will still // block on rw.writerSem since at least this reader exists, such that @@ -136,9 +144,3 @@ func (rw *DowngradableRWMutex) DowngradeLock() { RaceEnable() } } - -//go:linkname runtimeSemacquire sync.runtime_Semacquire -func runtimeSemacquire(s *uint32) - -//go:linkname runtimeSemrelease sync.runtime_Semrelease -func runtimeSemrelease(s *uint32, handoff bool) diff --git a/third_party/gvsync/memmove_unsafe.go b/third_party/gvsync/memmove_unsafe.go index 4c8aa9ab6..84b69f215 100644 --- a/third_party/gvsync/memmove_unsafe.go +++ b/third_party/gvsync/memmove_unsafe.go @@ -3,12 +3,21 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + package gvsync import ( "unsafe" ) +//go:linkname memmove runtime.memmove +//go:noescape +func memmove(to, from unsafe.Pointer, n uintptr) + // Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't // define it because go_generics can't update the go:linkname annotation. // Furthermore, go:linkname silently doesn't work if the local name is exported @@ -17,7 +26,3 @@ import ( func Memmove(to, from unsafe.Pointer, n uintptr) { memmove(to, from, n) } - -//go:linkname memmove runtime.memmove -//go:noescape -func memmove(to, from unsafe.Pointer, n uintptr) -- cgit v1.2.3 From 6f73d79c32594cb85cc00b1eaf72bf4c1def2a79 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 30 May 2019 17:19:00 -0700 Subject: Simplify overlayBoundEndpoint. There is no reason to do the recursion manually, since Inode.BoundEndpoint will do it for us. 
PiperOrigin-RevId: 250794903 --- pkg/sentry/fs/inode_overlay.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index ea574224f..cdffe173b 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -433,12 +433,7 @@ func overlayBoundEndpoint(o *overlayEntry, path string) transport.BoundEndpoint return o.upper.InodeOperations.BoundEndpoint(o.upper, path) } - // If the lower is itself an overlay, recurse. - if o.lower.overlay != nil { - return overlayBoundEndpoint(o.lower.overlay, path) - } - // Lower is not an overlay. Call BoundEndpoint directly. - return o.lower.InodeOperations.BoundEndpoint(o.lower, path) + return o.lower.BoundEndpoint(path) } func overlayGetFile(ctx context.Context, o *overlayEntry, d *Dirent, flags FileFlags) (*File, error) { -- cgit v1.2.3